You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT_4x4_sse2.S 79 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Symbolic names for the general-purpose registers used by this kernel.
   *
   *   M, N, K  - size arguments (m, n, k). Under WINDOWS_ABI the prologue
   *              copies ARG1/ARG2/ARG3 into them; on the other path they are
   *              used exactly as they arrive in these registers.
   *   A, B, C  - matrix pointers. Under WINDOWS_ABI the prologue loads them
   *              from OLD_A/OLD_B/OLD_C; otherwise they are used as they
   *              arrive.
   *   LDC      - loaded from OLD_LDC and then scaled by SIZE in the prologue.
   *   I        - set to M >> 2 just before the .L91 loop.
   *   J        - not referenced in the portion of the file reviewed here.
   *   AO       - working pointer into the A data (set from A or AORIG).
   *   BO       - working pointer into BUFFER.
   *   CO1      - output pointer, initialised from C.
   *   CO2      - not referenced in the portion of the file reviewed here.
   *
   * %rbp and %r12-%r15 (CO2, J, AO, BO, CO1) are stored to the stack by the
   * prologue before they are overwritten.
   */
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define J %r12
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  52. #define CO2 %rbp
  /*
   * Entry-frame layout.
   *
   * STACKSIZE is the number of bytes the prologue subtracts from %rsp for its
   * register save area:
   *   - 64 on the non-Windows path: %rbx, %rbp, %r12-%r15 are stored at
   *     offsets 0..40.
   *   - 256 under WINDOWS_ABI: additionally %rdi/%rsi at 48/56 and
   *     %xmm6-%xmm15 at 64..208.
   *
   * OLD_* address the incoming stack-passed arguments relative to %rsp after
   * that subtraction. They are only valid until the prologue re-points %rsp
   * at the local work area (the previous %rsp is kept in %rbx).
   *
   * NOTE(review): the 8-byte bias (40 under WINDOWS_ABI) presumably skips the
   * return address (plus the 32-byte shadow space on Windows). This assumes
   * the PROLOGUE macro does not itself move %rsp - confirm against common.h.
   */
  53. #ifndef WINDOWS_ABI
  54. #define STACKSIZE 64
  55. #define OLD_LDC 8 + STACKSIZE(%rsp)
  56. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  57. #else
  58. #define STACKSIZE 256
  59. #define OLD_A 40 + STACKSIZE(%rsp)
  60. #define OLD_B 48 + STACKSIZE(%rsp)
  61. #define OLD_C 56 + STACKSIZE(%rsp)
  62. #define OLD_LDC 64 + STACKSIZE(%rsp)
  63. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  64. #endif
  /*
   * Local work area, addressed from %rsp AFTER the prologue has moved and
   * realigned it (subq $128 + LOCAL_BUFFER_SIZE, then andq $-4096). These
   * offsets therefore do not alias the register save area, which lives at the
   * old %rsp kept in %rbx.
   *
   *   ALPHA   - not referenced in the portion of the file reviewed here.
   *   OFFSET  - copy of the incoming offset argument (stored from %xmm4).
   *   KK      - initialised from the same value, then updated per variant in
   *             the visible code: negated for RN, N - OFFSET for RT,
   *             OFFSET + M for LN, OFFSET for LT.
   *   KKK     - not referenced in the portion of the file reviewed here.
   *   AORIG   - saved A position; written and read on the LN/RT paths to
   *             recompute AO.
   *   BORIG   - saved B position; written and read on the LN/RT paths.
   *   BUFFER  - start of the scratch buffer into which B is copied with each
   *             element duplicated; it begins after the 128 bytes reserved
   *             for the slots above.
   */
  65. #define ALPHA 0(%rsp)
  66. #define OFFSET 16(%rsp)
  67. #define KK 24(%rsp)
  68. #define KKK 32(%rsp)
  69. #define AORIG 40(%rsp)
  70. #define BORIG 48(%rsp)
  71. #define BUFFER 128(%rsp)
  /*
   * Prefetch configuration.
   *
   * OPTERON / BARCELONA / SHANGHAI: PREFETCH and PREFETCHW map to the
   * prefetch / prefetchw mnemonics; PREFETCHSIZE is (8 * 4 + 4) = 36, or
   * (8 * 2 + 4) = 20 when ALLOC_HUGETLB is defined.
   * GENERIC: both PREFETCH and PREFETCHW map to prefetcht0, PREFETCHSIZE = 36.
   * PREFETCHSIZE is multiplied by SIZE wherever it is used in an operand.
   *
   * NOTE(review): when none of these target macros is defined, nothing in
   * this block defines PREFETCH / PREFETCHW / PREFETCHSIZE - presumably
   * common.h or the build supplies them; verify.
   *
   * OPTERON additionally rewrites every subsequent `movsd` mnemonic in this
   * file to `movlpd`. Every `movsd` in the portion reviewed here has a memory
   * operand on one side.
   */
  72. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
  73. #define PREFETCH prefetch
  74. #define PREFETCHW prefetchw
  75. #define PREFETCHNTA prefetchnta
  76. #ifndef ALLOC_HUGETLB
  77. #define PREFETCHSIZE (8 * 4 + 4)
  78. #else
  79. #define PREFETCHSIZE (8 * 2 + 4)
  80. #endif
  81. #endif
  82. #ifdef GENERIC
  83. #define PREFETCH prefetcht0
  84. #define PREFETCHW prefetcht0
  85. #define PREFETCHNTA prefetchnta
  86. #define PREFETCHSIZE (8 * 4 + 4)
  87. #endif
  88. #ifdef OPTERON
  89. #define movsd movlpd
  90. #endif
  /*
   * KERNEL1(xx): packed-double multiply-accumulate step with %xmm8 as the
   * common multiplier, accumulating into %xmm0-%xmm3.
   *   %xmm0 += %xmm8 * %xmm9
   *   %xmm1 += %xmm8 * %xmm11
   *   %xmm2 += %xmm8 * %xmm13
   *   %xmm3 += %xmm8 * [BO + (6 + 2*xx) * SIZE]
   * Products are formed in place, so the operand registers are consumed and
   * then reloaded for a later step:
   *   %xmm9  <- [BO + (0 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (2 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (4 + 2*xx) * SIZE]
   *   %xmm8  <- [AO + (8 + xx) * SIZE]
   * Also issues PREFETCH at [AO + (PREFETCHSIZE + 0 + xx) * SIZE].
   * xx is scaled by 2 * SIZE for BO and by 1 * SIZE for AO.
   * The movapd loads and the mulpd memory operand need 16-byte alignment.
   * Not invoked in the portion of the file reviewed here.
   */
  91. #define KERNEL1(xx) \
  92. mulpd %xmm8, %xmm9 ;\
  93. addpd %xmm9, %xmm0 ;\
  94. movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  95. mulpd %xmm8, %xmm11 ;\
  96. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
  97. addpd %xmm11, %xmm1 ;\
  98. movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  99. mulpd %xmm8, %xmm13 ;\
  100. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
  101. addpd %xmm13, %xmm2 ;\
  102. movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  103. addpd %xmm8, %xmm3 ;\
  104. movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
  /*
   * KERNEL2(xx): packed-double multiply-accumulate step with %xmm10 as the
   * common multiplier, accumulating into %xmm4-%xmm7.
   *   %xmm4 += %xmm10 * %xmm9
   *   %xmm5 += %xmm10 * %xmm11
   *   %xmm6 += %xmm10 * %xmm13
   *   %xmm7 += %xmm10 * [BO + (6 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm9  <- [BO + (16 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (10 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (12 + 2*xx) * SIZE]
   *   %xmm10 <- [AO + (10 + xx) * SIZE]
   * No prefetch is issued by this step.
   * Not invoked in the portion of the file reviewed here.
   */
  105. #define KERNEL2(xx) \
  106. mulpd %xmm10, %xmm9 ;\
  107. addpd %xmm9, %xmm4 ;\
  108. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  109. mulpd %xmm10, %xmm11 ;\
  110. addpd %xmm11, %xmm5 ;\
  111. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  112. mulpd %xmm10, %xmm13 ;\
  113. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
  114. addpd %xmm13, %xmm6 ;\
  115. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  116. addpd %xmm10, %xmm7 ;\
  117. movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
  /*
   * KERNEL3(xx): packed-double multiply-accumulate step with %xmm12 as the
   * common multiplier, accumulating into %xmm0-%xmm3. Uses %xmm15 where
   * KERNEL1 uses %xmm9.
   *   %xmm0 += %xmm12 * %xmm15
   *   %xmm1 += %xmm12 * %xmm11
   *   %xmm2 += %xmm12 * %xmm13
   *   %xmm3 += %xmm12 * [BO + (14 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm15 <- [BO + (8 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (10 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (12 + 2*xx) * SIZE]
   *   %xmm12 <- [AO + (12 + xx) * SIZE]
   * No prefetch is issued by this step.
   * Not invoked in the portion of the file reviewed here.
   */
  118. #define KERNEL3(xx) \
  119. mulpd %xmm12, %xmm15 ;\
  120. addpd %xmm15, %xmm0 ;\
  121. movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  122. mulpd %xmm12, %xmm11 ;\
  123. addpd %xmm11, %xmm1 ;\
  124. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  125. mulpd %xmm12, %xmm13 ;\
  126. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
  127. addpd %xmm13, %xmm2 ;\
  128. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  129. addpd %xmm12, %xmm3 ;\
  130. movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
  /*
   * KERNEL4(xx): packed-double multiply-accumulate step with %xmm14 as the
   * common multiplier, accumulating into %xmm4-%xmm7.
   *   %xmm4 += %xmm14 * %xmm15
   *   %xmm5 += %xmm14 * %xmm11
   *   %xmm6 += %xmm14 * %xmm13
   *   %xmm7 += %xmm14 * [BO + (14 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm15 <- [BO + (24 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (18 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (20 + 2*xx) * SIZE]
   *   %xmm14 <- [AO + (14 + xx) * SIZE]
   * No prefetch is issued by this step.
   * Not invoked in the portion of the file reviewed here.
   */
  131. #define KERNEL4(xx) \
  132. mulpd %xmm14, %xmm15 ;\
  133. addpd %xmm15, %xmm4 ;\
  134. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  135. mulpd %xmm14, %xmm11 ;\
  136. addpd %xmm11, %xmm5 ;\
  137. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  138. mulpd %xmm14, %xmm13 ;\
  139. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
  140. addpd %xmm13, %xmm6 ;\
  141. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  142. addpd %xmm14, %xmm7 ;\
  143. movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
  /*
   * KERNEL5(xx): same shape as KERNEL1 (multiplier %xmm8, accumulators
   * %xmm0-%xmm3) with displacements advanced for the second half of the
   * unrolled sequence.
   *   %xmm0 += %xmm8 * %xmm9
   *   %xmm1 += %xmm8 * %xmm11
   *   %xmm2 += %xmm8 * %xmm13
   *   %xmm3 += %xmm8 * [BO + (22 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm9  <- [BO + (16 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (18 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (20 + 2*xx) * SIZE]
   *   %xmm8  <- [AO + (16 + xx) * SIZE]
   * Also issues PREFETCH at [AO + (PREFETCHSIZE + 8 + xx) * SIZE].
   * Not invoked in the portion of the file reviewed here.
   */
  144. #define KERNEL5(xx) \
  145. mulpd %xmm8, %xmm9 ;\
  146. addpd %xmm9, %xmm0 ;\
  147. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  148. mulpd %xmm8, %xmm11 ;\
  149. PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
  150. addpd %xmm11, %xmm1 ;\
  151. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  152. mulpd %xmm8, %xmm13 ;\
  153. mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
  154. addpd %xmm13, %xmm2 ;\
  155. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  156. addpd %xmm8, %xmm3 ;\
  157. movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8
  /*
   * KERNEL6(xx): same shape as KERNEL2 (multiplier %xmm10, accumulators
   * %xmm4-%xmm7) with displacements advanced.
   *   %xmm4 += %xmm10 * %xmm9
   *   %xmm5 += %xmm10 * %xmm11
   *   %xmm6 += %xmm10 * %xmm13
   *   %xmm7 += %xmm10 * [BO + (22 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm9  <- [BO + (32 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (26 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (28 + 2*xx) * SIZE]
   *   %xmm10 <- [AO + (18 + xx) * SIZE]
   * No prefetch is issued by this step.
   * Not invoked in the portion of the file reviewed here.
   */
  158. #define KERNEL6(xx) \
  159. mulpd %xmm10, %xmm9 ;\
  160. addpd %xmm9, %xmm4 ;\
  161. movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
  162. mulpd %xmm10, %xmm11 ;\
  163. addpd %xmm11, %xmm5 ;\
  164. movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  165. mulpd %xmm10, %xmm13 ;\
  166. mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
  167. addpd %xmm13, %xmm6 ;\
  168. movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  169. addpd %xmm10, %xmm7 ;\
  170. movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10
  /*
   * KERNEL7(xx): same shape as KERNEL3 (multiplier %xmm12, accumulators
   * %xmm0-%xmm3) with displacements advanced.
   *   %xmm0 += %xmm12 * %xmm15
   *   %xmm1 += %xmm12 * %xmm11
   *   %xmm2 += %xmm12 * %xmm13
   *   %xmm3 += %xmm12 * [BO + (30 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm15 <- [BO + (24 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (26 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (28 + 2*xx) * SIZE]
   *   %xmm12 <- [AO + (20 + xx) * SIZE]
   * No prefetch is issued by this step.
   * Not invoked in the portion of the file reviewed here.
   */
  171. #define KERNEL7(xx) \
  172. mulpd %xmm12, %xmm15 ;\
  173. addpd %xmm15, %xmm0 ;\
  174. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  175. mulpd %xmm12, %xmm11 ;\
  176. addpd %xmm11, %xmm1 ;\
  177. movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  178. mulpd %xmm12, %xmm13 ;\
  179. mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
  180. addpd %xmm13, %xmm2 ;\
  181. movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  182. addpd %xmm12, %xmm3 ;\
  183. movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12
  /*
   * KERNEL8(xx): same shape as KERNEL4 (multiplier %xmm14, accumulators
   * %xmm4-%xmm7) with displacements advanced.
   *   %xmm4 += %xmm14 * %xmm15
   *   %xmm5 += %xmm14 * %xmm11
   *   %xmm6 += %xmm14 * %xmm13
   *   %xmm7 += %xmm14 * [BO + (30 + 2*xx) * SIZE]
   * Operand registers are consumed in place and reloaded:
   *   %xmm15 <- [BO + (40 + 2*xx) * SIZE]
   *   %xmm11 <- [BO + (34 + 2*xx) * SIZE]
   *   %xmm13 <- [BO + (36 + 2*xx) * SIZE]
   *   %xmm14 <- [AO + (22 + xx) * SIZE]
   * No prefetch is issued by this step.
   * Not invoked in the portion of the file reviewed here.
   */
  184. #define KERNEL8(xx) \
  185. mulpd %xmm14, %xmm15 ;\
  186. addpd %xmm15, %xmm4 ;\
  187. movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
  188. mulpd %xmm14, %xmm11 ;\
  189. addpd %xmm11, %xmm5 ;\
  190. movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
  191. mulpd %xmm14, %xmm13 ;\
  192. mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
  193. addpd %xmm13, %xmm6 ;\
  194. movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
  195. addpd %xmm14, %xmm7 ;\
  196. movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14
  197. PROLOGUE
  198. PROFCODE
  199. subq $STACKSIZE, %rsp
  200. movq %rbx, 0(%rsp)
  201. movq %rbp, 8(%rsp)
  202. movq %r12, 16(%rsp)
  203. movq %r13, 24(%rsp)
  204. movq %r14, 32(%rsp)
  205. movq %r15, 40(%rsp)
  206. #ifdef WINDOWS_ABI
  207. movq %rdi, 48(%rsp)
  208. movq %rsi, 56(%rsp)
  209. movups %xmm6, 64(%rsp)
  210. movups %xmm7, 80(%rsp)
  211. movups %xmm8, 96(%rsp)
  212. movups %xmm9, 112(%rsp)
  213. movups %xmm10, 128(%rsp)
  214. movups %xmm11, 144(%rsp)
  215. movups %xmm12, 160(%rsp)
  216. movups %xmm13, 176(%rsp)
  217. movups %xmm14, 192(%rsp)
  218. movups %xmm15, 208(%rsp)
  219. movq ARG1, M
  220. movq ARG2, N
  221. movq ARG3, K
  222. movq OLD_A, A
  223. movq OLD_B, B
  224. movq OLD_C, C
  225. movq OLD_LDC, LDC
  226. movsd OLD_OFFSET, %xmm4
  227. movaps %xmm3, %xmm0
  228. #else
  229. movq OLD_LDC, LDC
  230. movsd OLD_OFFSET, %xmm4
  231. #endif
  232. movq %rsp, %rbx # save old stack
  233. subq $128 + LOCAL_BUFFER_SIZE, %rsp
  234. andq $-4096, %rsp # align stack
  235. STACK_TOUCHING
  236. movsd %xmm4, OFFSET
  237. movsd %xmm4, KK
  238. leaq (, LDC, SIZE), LDC
  239. #ifdef LN
  240. leaq (, M, SIZE), %rax
  241. addq %rax, C
  242. imulq K, %rax
  243. addq %rax, A
  244. #endif
  245. #ifdef RT
  246. leaq (, N, SIZE), %rax
  247. imulq K, %rax
  248. addq %rax, B
  249. movq N, %rax
  250. imulq LDC, %rax
  251. addq %rax, C
  252. #endif
  253. #ifdef RN
  254. negq KK
  255. #endif
  256. #ifdef RT
  257. movq N, %rax
  258. subq OFFSET, %rax
  259. movq %rax, KK
  260. #endif
  261. testq $1, N
  262. je .L40
  263. ALIGN_4
  264. .L81:
  265. /* Copying to Sub Buffer */
  266. #ifdef LN
  267. movq OFFSET, %rax
  268. addq M, %rax
  269. movq %rax, KK
  270. #endif
  271. leaq BUFFER, BO
  272. #ifdef RT
  273. movq K, %rax
  274. salq $0 + BASE_SHIFT, %rax
  275. subq %rax, B
  276. #endif
  277. #if defined(LN) || defined(RT)
  278. movq KK, %rax
  279. movq B, BORIG
  280. leaq (, %rax, SIZE), %rax
  281. leaq (B, %rax, 1), B
  282. leaq (BO, %rax, 2), BO
  283. #endif
  284. #ifdef LT
  285. movq OFFSET, %rax
  286. movq %rax, KK
  287. #endif
  288. #if defined(LT) || defined(RN)
  289. movq KK, %rax
  290. #else
  291. movq K, %rax
  292. subq KK, %rax
  293. #endif
  294. sarq $3, %rax
  295. jle .L83
  296. ALIGN_4
  297. .L82:
  298. PREFETCH 56 * SIZE(B)
  299. movsd 0 * SIZE(B), %xmm0
  300. movsd 1 * SIZE(B), %xmm1
  301. movsd 2 * SIZE(B), %xmm2
  302. movsd 3 * SIZE(B), %xmm3
  303. movsd 4 * SIZE(B), %xmm4
  304. movsd 5 * SIZE(B), %xmm5
  305. movsd 6 * SIZE(B), %xmm6
  306. movsd 7 * SIZE(B), %xmm7
  307. addq $ 8 * SIZE, B
  308. addq $16 * SIZE, BO
  309. movsd %xmm0, -16 * SIZE(BO)
  310. movsd %xmm0, -15 * SIZE(BO)
  311. movsd %xmm1, -14 * SIZE(BO)
  312. movsd %xmm1, -13 * SIZE(BO)
  313. movsd %xmm2, -12 * SIZE(BO)
  314. movsd %xmm2, -11 * SIZE(BO)
  315. movsd %xmm3, -10 * SIZE(BO)
  316. movsd %xmm3, -9 * SIZE(BO)
  317. movsd %xmm4, -8 * SIZE(BO)
  318. movsd %xmm4, -7 * SIZE(BO)
  319. movsd %xmm5, -6 * SIZE(BO)
  320. movsd %xmm5, -5 * SIZE(BO)
  321. movsd %xmm6, -4 * SIZE(BO)
  322. movsd %xmm6, -3 * SIZE(BO)
  323. movsd %xmm7, -2 * SIZE(BO)
  324. movsd %xmm7, -1 * SIZE(BO)
  325. decq %rax
  326. jne .L82
  327. ALIGN_4
  328. .L83:
  329. #if defined(LT) || defined(RN)
  330. movq KK, %rax
  331. #else
  332. movq K, %rax
  333. subq KK, %rax
  334. #endif
  335. andq $7, %rax
  336. BRANCH
  337. jle .L90
  338. ALIGN_4
  339. .L84:
  340. movsd 0 * SIZE(B), %xmm0
  341. movsd %xmm0, 0 * SIZE(BO)
  342. movsd %xmm0, 1 * SIZE(BO)
  343. addq $1 * SIZE, B
  344. addq $2 * SIZE, BO
  345. decq %rax
  346. jne .L84
  347. ALIGN_4
  348. .L90:
  349. #if defined(LT) || defined(RN)
  350. movq A, AO
  351. #else
  352. movq A, AORIG
  353. #endif
  354. #ifdef RT
  355. subq LDC, C
  356. #endif
  357. movq C, CO1 # coffset1 = c
  358. #ifndef RT
  359. addq LDC, C
  360. #endif
  361. movq M, I
  362. sarq $2, I # i = (m >> 2)
  363. jle .L100
  364. ALIGN_4
  365. .L91:
  366. #ifdef LN
  367. movq K, %rax
  368. salq $2 + BASE_SHIFT, %rax
  369. subq %rax, AORIG
  370. #endif
  371. #if defined(LN) || defined(RT)
  372. movq KK, %rax
  373. movq AORIG, AO
  374. leaq (, %rax, SIZE), %rax
  375. leaq (AO, %rax, 4), AO
  376. #endif
  377. leaq BUFFER, BO
  378. #if defined(LN) || defined(RT)
  379. movq KK, %rax
  380. salq $0 + BASE_SHIFT, %rax
  381. leaq (BO, %rax, 2), BO
  382. #endif
  383. movapd 0 * SIZE(AO), %xmm8
  384. pxor %xmm0, %xmm0
  385. movapd 0 * SIZE(BO), %xmm9
  386. pxor %xmm1, %xmm1
  387. movapd 8 * SIZE(AO), %xmm10
  388. pxor %xmm2, %xmm2
  389. movapd 8 * SIZE(BO), %xmm11
  390. pxor %xmm3, %xmm3
  391. movapd 16 * SIZE(AO), %xmm12
  392. movapd 24 * SIZE(AO), %xmm14
  393. PREFETCHW 4 * SIZE(CO1)
  394. #if defined(LT) || defined(RN)
  395. movq KK, %rax
  396. #else
  397. movq K, %rax
  398. subq KK, %rax
  399. #endif
  400. sarq $3, %rax
  401. je .L95
  402. ALIGN_4
  403. .L92:
  404. mulpd %xmm9, %xmm8
  405. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  406. mulpd 2 * SIZE(AO), %xmm9
  407. addpd %xmm8, %xmm0
  408. movapd 4 * SIZE(AO), %xmm8
  409. addpd %xmm9, %xmm1
  410. movapd 2 * SIZE(BO), %xmm9
  411. mulpd %xmm9, %xmm8
  412. mulpd 6 * SIZE(AO), %xmm9
  413. addpd %xmm8, %xmm2
  414. movapd 32 * SIZE(AO), %xmm8
  415. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  416. addpd %xmm9, %xmm3
  417. movapd 4 * SIZE(BO), %xmm9
  418. mulpd %xmm9, %xmm10
  419. mulpd 10 * SIZE(AO), %xmm9
  420. addpd %xmm10, %xmm0
  421. movapd 12 * SIZE(AO), %xmm10
  422. addpd %xmm9, %xmm1
  423. movapd 6 * SIZE(BO), %xmm9
  424. mulpd %xmm9, %xmm10
  425. mulpd 14 * SIZE(AO), %xmm9
  426. addpd %xmm10, %xmm2
  427. movapd 40 * SIZE(AO), %xmm10
  428. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  429. addpd %xmm9, %xmm3
  430. movapd 16 * SIZE(BO), %xmm9
  431. mulpd %xmm11, %xmm12
  432. mulpd 18 * SIZE(AO), %xmm11
  433. addpd %xmm12, %xmm0
  434. movapd 20 * SIZE(AO), %xmm12
  435. addpd %xmm11, %xmm1
  436. movapd 10 * SIZE(BO), %xmm11
  437. mulpd %xmm11, %xmm12
  438. mulpd 22 * SIZE(AO), %xmm11
  439. addpd %xmm12, %xmm2
  440. movapd 48 * SIZE(AO), %xmm12
  441. PREFETCH (PREFETCHSIZE + 24) * SIZE(AO)
  442. addpd %xmm11, %xmm3
  443. movapd 12 * SIZE(BO), %xmm11
  444. mulpd %xmm11, %xmm14
  445. mulpd 26 * SIZE(AO), %xmm11
  446. addpd %xmm14, %xmm0
  447. movapd 28 * SIZE(AO), %xmm14
  448. addpd %xmm11, %xmm1
  449. movapd 14 * SIZE(BO), %xmm11
  450. mulpd %xmm11, %xmm14
  451. mulpd 30 * SIZE(AO), %xmm11
  452. addpd %xmm14, %xmm2
  453. movapd 56 * SIZE(AO), %xmm14
  454. addpd %xmm11, %xmm3
  455. movapd 24 * SIZE(BO), %xmm11
  456. addq $32 * SIZE, AO
  457. addq $16 * SIZE, BO
  458. decq %rax
  459. jne .L92
  460. ALIGN_4
  461. .L95:
  462. #if defined(LT) || defined(RN)
  463. movq KK, %rax
  464. #else
  465. movq K, %rax
  466. subq KK, %rax
  467. #endif
  468. andq $7, %rax # if (k & 1)
  469. BRANCH
  470. je .L99
  471. ALIGN_4
  472. .L96:
  473. mulpd %xmm9, %xmm8
  474. mulpd 2 * SIZE(AO), %xmm9
  475. addpd %xmm8, %xmm0
  476. movapd 4 * SIZE(AO), %xmm8
  477. addpd %xmm9, %xmm1
  478. movapd 2 * SIZE(BO), %xmm9
  479. addq $4 * SIZE, AO # aoffset += 4
  480. addq $2 * SIZE, BO # boffset1 += 8
  481. decq %rax
  482. jg .L96
  483. ALIGN_4
  /* .L99: finish one tile of four values on CO1.
     - fold the secondary accumulators xmm2/xmm3 into xmm0/xmm1;
     - for LN/RT, recompute AO, B and BO from AORIG, BORIG and BUFFER;
     - load the four right-hand-side values (from B for LN/LT, from AO
       otherwise) and subtract the accumulated products;
     - apply the variant-specific substitution;
     - store the result to CO1 and back into the packed data;
     - update pointers and KK, then loop to .L91 while I > 0.
     NOTE(review): diagonal entries are multiplied, never divided, so they
     are presumably stored pre-inverted by the packing code - confirm. */
  484. .L99:
  485. addpd %xmm2, %xmm0
  486. addpd %xmm3, %xmm1
  487. #if defined(LN) || defined(RT)
  /* rax = KK - 4 (LN) or KK - 1 (RT), scaled by SIZE below. */
  488. movq KK, %rax
  489. #ifdef LN
  490. subq $4, %rax
  491. #else
  492. subq $1, %rax
  493. #endif
  494. movq AORIG, AO
  495. movq BORIG, B
  496. leaq BUFFER, BO
  497. leaq (, %rax, SIZE), %rax
  498. leaq (AO, %rax, 4), AO
  499. leaq (B, %rax, 1), B
  500. leaq (BO, %rax, 2), BO
  501. #endif
  502. #if defined(LN) || defined(LT)
  503. movapd 0 * SIZE(B), %xmm2
  504. movapd 2 * SIZE(B), %xmm3
  505. subpd %xmm0, %xmm2
  506. subpd %xmm1, %xmm3
  507. #else
  508. movapd 0 * SIZE(AO), %xmm2
  509. movapd 2 * SIZE(AO), %xmm3
  510. subpd %xmm0, %xmm2
  511. subpd %xmm1, %xmm3
  512. #endif
  513. #ifdef LN
  /* LN: split the four values into scalars (xmm2, xmm0, xmm3, xmm1 hold
     elements 0..3) and substitute from element 3 down to element 0 using
     AO entries 15, 14..12, 10, 9..8, 5, 4 and 0; repack at the end. */
  514. movapd %xmm2, %xmm0
  515. unpckhpd %xmm0, %xmm0
  516. movapd %xmm3, %xmm1
  517. unpckhpd %xmm1, %xmm1
  518. movsd 15 * SIZE(AO), %xmm4
  519. mulsd %xmm4, %xmm1
  520. movsd 14 * SIZE(AO), %xmm5
  521. mulsd %xmm1, %xmm5
  522. subsd %xmm5, %xmm3
  523. movsd 13 * SIZE(AO), %xmm6
  524. mulsd %xmm1, %xmm6
  525. subsd %xmm6, %xmm0
  526. movsd 12 * SIZE(AO), %xmm7
  527. mulsd %xmm1, %xmm7
  528. subsd %xmm7, %xmm2
  529. movsd 10 * SIZE(AO), %xmm4
  530. mulsd %xmm4, %xmm3
  531. movsd 9 * SIZE(AO), %xmm5
  532. mulsd %xmm3, %xmm5
  533. subsd %xmm5, %xmm0
  534. movsd 8 * SIZE(AO), %xmm6
  535. mulsd %xmm3, %xmm6
  536. subsd %xmm6, %xmm2
  537. movsd 5 * SIZE(AO), %xmm4
  538. mulsd %xmm4, %xmm0
  539. movsd 4 * SIZE(AO), %xmm5
  540. mulsd %xmm0, %xmm5
  541. subsd %xmm5, %xmm2
  542. movsd 0 * SIZE(AO), %xmm4
  543. mulsd %xmm4, %xmm2
  544. unpcklpd %xmm0, %xmm2
  545. unpcklpd %xmm1, %xmm3
  546. #endif
  547. #ifdef LT
  /* LT: same scalar split, but substitute from element 0 up to element 3
     using AO entries 0, 1..3, 5, 6..7, 10, 11 and 15; repack at the end. */
  548. movapd %xmm2, %xmm0
  549. unpckhpd %xmm0, %xmm0
  550. movapd %xmm3, %xmm1
  551. unpckhpd %xmm1, %xmm1
  552. movsd 0 * SIZE(AO), %xmm4
  553. mulsd %xmm4, %xmm2
  554. movsd 1 * SIZE(AO), %xmm5
  555. mulsd %xmm2, %xmm5
  556. subsd %xmm5, %xmm0
  557. movsd 2 * SIZE(AO), %xmm6
  558. mulsd %xmm2, %xmm6
  559. subsd %xmm6, %xmm3
  560. movsd 3 * SIZE(AO), %xmm7
  561. mulsd %xmm2, %xmm7
  562. subsd %xmm7, %xmm1
  563. movsd 5 * SIZE(AO), %xmm4
  564. mulsd %xmm4, %xmm0
  565. movsd 6 * SIZE(AO), %xmm5
  566. mulsd %xmm0, %xmm5
  567. subsd %xmm5, %xmm3
  568. movsd 7 * SIZE(AO), %xmm6
  569. mulsd %xmm0, %xmm6
  570. subsd %xmm6, %xmm1
  571. movsd 10 * SIZE(AO), %xmm4
  572. mulsd %xmm4, %xmm3
  573. movsd 11 * SIZE(AO), %xmm5
  574. mulsd %xmm3, %xmm5
  575. subsd %xmm5, %xmm1
  576. movsd 15 * SIZE(AO), %xmm4
  577. mulsd %xmm4, %xmm1
  578. unpcklpd %xmm0, %xmm2
  579. unpcklpd %xmm1, %xmm3
  580. #endif
  581. #ifdef RN
  /* RN: scale all four values by the single entry at B (broadcast into
     both halves of xmm0). */
  582. movlpd 0 * SIZE(B), %xmm0
  583. movhpd 0 * SIZE(B), %xmm0
  584. mulpd %xmm0, %xmm2
  585. mulpd %xmm0, %xmm3
  586. #endif
  587. #ifdef RT
  /* RT: identical scaling to the RN case above. */
  588. movlpd 0 * SIZE(B), %xmm0
  589. movhpd 0 * SIZE(B), %xmm0
  590. mulpd %xmm0, %xmm2
  591. mulpd %xmm0, %xmm3
  592. #endif
  593. #ifdef LN
  /* LN moves CO1 back before the store; the other variants advance it
     after the store. */
  594. subq $4 * SIZE, CO1
  595. #endif
  /* Both preprocessor branches below emit the same four stores. */
  596. #if defined(LN) || defined(LT)
  597. movsd %xmm2, 0 * SIZE(CO1)
  598. movhpd %xmm2, 1 * SIZE(CO1)
  599. movsd %xmm3, 2 * SIZE(CO1)
  600. movhpd %xmm3, 3 * SIZE(CO1)
  601. #else
  602. movsd %xmm2, 0 * SIZE(CO1)
  603. movhpd %xmm2, 1 * SIZE(CO1)
  604. movsd %xmm3, 2 * SIZE(CO1)
  605. movhpd %xmm3, 3 * SIZE(CO1)
  606. #endif
  /* Write the solved values back: LN/LT update B and the BO copy, where
     every value is stored twice; RN/RT update AO. */
  607. #if defined(LN) || defined(LT)
  608. movapd %xmm2, 0 * SIZE(B)
  609. movapd %xmm3, 2 * SIZE(B)
  610. movlpd %xmm2, 0 * SIZE(BO)
  611. movlpd %xmm2, 1 * SIZE(BO)
  612. movhpd %xmm2, 2 * SIZE(BO)
  613. movhpd %xmm2, 3 * SIZE(BO)
  614. movlpd %xmm3, 4 * SIZE(BO)
  615. movlpd %xmm3, 5 * SIZE(BO)
  616. movhpd %xmm3, 6 * SIZE(BO)
  617. movhpd %xmm3, 7 * SIZE(BO)
  618. #else
  619. movapd %xmm2, 0 * SIZE(AO)
  620. movapd %xmm3, 2 * SIZE(AO)
  621. #endif
  622. #ifndef LN
  623. addq $4 * SIZE, CO1
  624. #endif
  625. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) * 4 elements of AO. */
  626. movq K, %rax
  627. subq KK, %rax
  628. leaq (,%rax, SIZE), %rax
  629. leaq (AO, %rax, 4), AO
  630. #ifdef LT
  631. addq $4 * SIZE, B
  632. #endif
  633. #endif
  634. #ifdef LN
  635. subq $4, KK
  636. movq BORIG, B
  637. #endif
  638. #ifdef LT
  639. addq $4, KK
  640. #endif
  641. #ifdef RT
  /* RT: restore B and advance AORIG by K << (2 + BASE_SHIFT) bytes. */
  642. movq K, %rax
  643. movq BORIG, B
  644. salq $2 + BASE_SHIFT, %rax
  645. addq %rax, AORIG
  646. #endif
  647. decq I # i --
  648. jg .L91
  649. ALIGN_4
  /* .L100: handle a two-value tile only when bit 1 of M is set;
     otherwise continue at .L110. */
  650. .L100:
  651. testq $2, M
  652. je .L110
  653. ALIGN_4
  /* .L101: set up the two-value tile.  LN first moves AORIG back by
     K << (1 + BASE_SHIFT) bytes; LN/RT then start AO at AORIG + KK * 2
     elements and BO at BUFFER + KK * 2 elements.  xmm0..xmm3 are cleared
     as accumulators and xmm8..xmm11 are preloaded from AO/BO. */
  654. .L101:
  655. #ifdef LN
  656. movq K, %rax
  657. salq $1 + BASE_SHIFT, %rax
  658. subq %rax, AORIG
  659. #endif
  660. #if defined(LN) || defined(RT)
  661. movq KK, %rax
  662. movq AORIG, AO
  663. leaq (, %rax, SIZE), %rax
  664. leaq (AO, %rax, 2), AO
  665. #endif
  666. leaq BUFFER, BO
  667. #if defined(LN) || defined(RT)
  668. movq KK, %rax
  669. salq $0 + BASE_SHIFT, %rax
  670. leaq (BO, %rax, 2), BO
  671. #endif
  672. movapd 0 * SIZE(AO), %xmm8
  673. pxor %xmm0, %xmm0
  674. movapd 0 * SIZE(BO), %xmm9
  675. pxor %xmm1, %xmm1
  676. movapd 8 * SIZE(AO), %xmm10
  677. pxor %xmm2, %xmm2
  678. movapd 8 * SIZE(BO), %xmm11
  679. pxor %xmm3, %xmm3
  /* rax = number of passes through the unrolled loop (count >> 3);
     .L105 is taken directly when that is zero. */
  680. #if defined(LT) || defined(RN)
  681. movq KK, %rax
  682. #else
  683. movq K, %rax
  684. subq KK, %rax
  685. #endif
  686. sarq $3, %rax
  687. je .L105
  688. ALIGN_4
  /* .L102: unrolled multiply-accumulate loop for the two-value tile.
     Each pass consumes 16 elements of AO and 16 of BO as eight packed
     products, spread over the accumulators xmm0..xmm3.  xmm8/xmm9 and
     xmm10/xmm11 carry the operands preloaded for the first and second
     half of the pass and are reloaded for the next pass (offsets 16 and
     24) before AO and BO are advanced. */
  689. .L102:
  690. mulpd %xmm8, %xmm9
  691. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  692. movapd 2 * SIZE(AO), %xmm8
  693. mulpd 2 * SIZE(BO), %xmm8
  694. addpd %xmm9, %xmm0
  695. movapd 16 * SIZE(BO), %xmm9
  696. addpd %xmm8, %xmm1
  697. movapd 4 * SIZE(AO), %xmm8
  698. mulpd 4 * SIZE(BO), %xmm8
  699. addpd %xmm8, %xmm2
  700. movapd 6 * SIZE(AO), %xmm8
  701. mulpd 6 * SIZE(BO), %xmm8
  702. addpd %xmm8, %xmm3
  703. movapd 16 * SIZE(AO), %xmm8
  /* NOTE(review): this prefetch uses the same offset as the one at the
     top of the loop, while the .L52 and .L62 loops step theirs by 8.
     Confirm whether (PREFETCHSIZE + 8) was intended here. */
  704. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  705. mulpd %xmm10, %xmm11
  706. movapd 10 * SIZE(AO), %xmm10
  707. mulpd 10 * SIZE(BO), %xmm10
  708. addpd %xmm11, %xmm0
  709. movapd 24 * SIZE(BO), %xmm11
  710. addpd %xmm10, %xmm1
  711. movapd 12 * SIZE(AO), %xmm10
  712. mulpd 12 * SIZE(BO), %xmm10
  713. addpd %xmm10, %xmm2
  714. movapd 14 * SIZE(AO), %xmm10
  715. mulpd 14 * SIZE(BO), %xmm10
  716. addpd %xmm10, %xmm3
  717. movapd 24 * SIZE(AO), %xmm10
  718. addq $16 * SIZE, AO
  719. addq $16 * SIZE, BO
  720. decq %rax
  721. jne .L102
  722. ALIGN_4
  /* .L105: leftover iterations for the two-value tile.  rax is the low
     three bits of the same count that was shifted right by 3 before the
     unrolled loop; .L109 is taken directly when it is zero. */
  723. .L105:
  724. #if defined(LT) || defined(RN)
  725. movq KK, %rax
  726. #else
  727. movq K, %rax
  728. subq KK, %rax
  729. #endif
  730. andq $7, %rax # rax = count & 7 (leftover iterations)
  731. BRANCH
  732. je .L109
  733. ALIGN_4
  /* .L106: one leftover step: xmm0 += xmm8 * xmm9, then reload both
     operands from the next two elements of AO and BO. */
  734. .L106:
  735. mulpd %xmm8, %xmm9
  736. addpd %xmm9, %xmm0
  737. movapd 2 * SIZE(AO), %xmm8
  738. movapd 2 * SIZE(BO), %xmm9
  739. addq $2 * SIZE, AO # aoffset += 2
  740. addq $2 * SIZE, BO # boffset += 2
  741. decq %rax
  742. jg .L106
  743. ALIGN_4
  /* .L109: finish one tile of two values on CO1.
     - sum the four accumulators xmm0..xmm3 into xmm0;
     - for LN/RT, recompute AO, B and BO from AORIG, BORIG and BUFFER;
     - load the right-hand side (B for LN/LT, AO otherwise) and subtract;
     - apply the variant-specific substitution;
     - store to CO1, write back into the packed data, update pointers and KK.
     NOTE(review): diagonal entries are multiplied, never divided, so they
     are presumably stored pre-inverted by the packing code - confirm. */
  744. .L109:
  745. addpd %xmm1, %xmm0
  746. addpd %xmm3, %xmm2
  747. addpd %xmm2, %xmm0
  748. #if defined(LN) || defined(RT)
  /* rax = KK - 2 (LN) or KK - 1 (RT), scaled by SIZE below. */
  749. movq KK, %rax
  750. #ifdef LN
  751. subq $2, %rax
  752. #else
  753. subq $1, %rax
  754. #endif
  755. movq AORIG, AO
  756. movq BORIG, B
  757. leaq BUFFER, BO
  758. leaq (, %rax, SIZE), %rax
  759. leaq (AO, %rax, 2), AO
  760. leaq (B, %rax, 1), B
  761. leaq (BO, %rax, 2), BO
  762. #endif
  763. #if defined(LN) || defined(LT)
  764. movapd 0 * SIZE(B), %xmm2
  765. subpd %xmm0, %xmm2
  766. #else
  767. movapd 0 * SIZE(AO), %xmm2
  768. subpd %xmm0, %xmm2
  769. #endif
  770. #ifdef LN
  /* LN: solve the high element first (AO entry 3), eliminate it from
     the low element (entry 2), then scale the low element (entry 0). */
  771. movapd %xmm2, %xmm0
  772. unpckhpd %xmm0, %xmm0
  773. movsd 3 * SIZE(AO), %xmm4
  774. mulsd %xmm4, %xmm0
  775. movsd 2 * SIZE(AO), %xmm5
  776. mulsd %xmm0, %xmm5
  777. subsd %xmm5, %xmm2
  778. movsd 0 * SIZE(AO), %xmm4
  779. mulsd %xmm4, %xmm2
  780. unpcklpd %xmm0, %xmm2
  781. #endif
  782. #ifdef LT
  /* LT: solve the low element first (AO entry 0), eliminate it from
     the high element (entry 1), then scale the high element (entry 3). */
  783. movapd %xmm2, %xmm0
  784. unpckhpd %xmm0, %xmm0
  785. movsd 0 * SIZE(AO), %xmm4
  786. mulsd %xmm4, %xmm2
  787. movsd 1 * SIZE(AO), %xmm5
  788. mulsd %xmm2, %xmm5
  789. subsd %xmm5, %xmm0
  790. movsd 3 * SIZE(AO), %xmm4
  791. mulsd %xmm4, %xmm0
  792. unpcklpd %xmm0, %xmm2
  793. #endif
  794. #ifdef RN
  /* RN: scale both values by the single entry at B. */
  795. movlpd 0 * SIZE(B), %xmm0
  796. movhpd 0 * SIZE(B), %xmm0
  797. mulpd %xmm0, %xmm2
  798. #endif
  799. #ifdef RT
  /* RT: identical scaling to the RN case above. */
  800. movlpd 0 * SIZE(B), %xmm0
  801. movhpd 0 * SIZE(B), %xmm0
  802. mulpd %xmm0, %xmm2
  803. #endif
  804. #ifdef LN
  805. subq $2 * SIZE, CO1
  806. #endif
  /* Both preprocessor branches below emit the same two stores. */
  807. #if defined(LN) || defined(LT)
  808. movsd %xmm2, 0 * SIZE(CO1)
  809. movhpd %xmm2, 1 * SIZE(CO1)
  810. #else
  811. movsd %xmm2, 0 * SIZE(CO1)
  812. movhpd %xmm2, 1 * SIZE(CO1)
  813. #endif
  /* Write back: LN/LT update B and the BO copy (each value twice);
     RN/RT update AO. */
  814. #if defined(LN) || defined(LT)
  815. movapd %xmm2, 0 * SIZE(B)
  816. movlpd %xmm2, 0 * SIZE(BO)
  817. movlpd %xmm2, 1 * SIZE(BO)
  818. movhpd %xmm2, 2 * SIZE(BO)
  819. movhpd %xmm2, 3 * SIZE(BO)
  820. #else
  821. movapd %xmm2, 0 * SIZE(AO)
  822. #endif
  823. #ifndef LN
  824. addq $2 * SIZE, CO1
  825. #endif
  826. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) * 2 elements of AO. */
  827. movq K, %rax
  828. subq KK, %rax
  829. leaq (,%rax, SIZE), %rax
  830. leaq (AO, %rax, 2), AO
  831. #ifdef LT
  832. addq $2 * SIZE, B
  833. #endif
  834. #endif
  835. #ifdef LN
  836. subq $2, KK
  837. movq BORIG, B
  838. #endif
  839. #ifdef LT
  840. addq $2, KK
  841. #endif
  842. #ifdef RT
  843. movq K, %rax
  844. movq BORIG, B
  845. salq $1 + BASE_SHIFT, %rax
  846. addq %rax, AORIG
  847. #endif
  848. ALIGN_4
  /* .L110: handle a single-value tile only when bit 0 of M is set;
     otherwise continue at .L119. */
  849. .L110:
  850. testq $1, M
  851. je .L119
  852. ALIGN_4
  /* .L111: set up the single-value tile.  LN first moves AORIG back by
     K << BASE_SHIFT bytes; LN/RT then start AO at AORIG + KK elements and
     BO at BUFFER + KK * 2 elements.  xmm0..xmm3 are cleared as scalar
     accumulators and xmm8..xmm11 are preloaded from AO/BO. */
  853. .L111:
  854. #ifdef LN
  855. movq K, %rax
  856. salq $0 + BASE_SHIFT, %rax
  857. subq %rax, AORIG
  858. #endif
  859. #if defined(LN) || defined(RT)
  860. movq KK, %rax
  861. movq AORIG, AO
  862. leaq (, %rax, SIZE), %rax
  863. leaq (AO, %rax, 1), AO
  864. #endif
  865. leaq BUFFER, BO
  866. #if defined(LN) || defined(RT)
  867. movq KK, %rax
  868. salq $0 + BASE_SHIFT, %rax
  869. leaq (BO, %rax, 2), BO
  870. #endif
  871. movsd 0 * SIZE(AO), %xmm8
  872. pxor %xmm0, %xmm0
  873. movsd 0 * SIZE(BO), %xmm9
  874. pxor %xmm1, %xmm1
  875. movsd 4 * SIZE(AO), %xmm10
  876. pxor %xmm2, %xmm2
  877. movsd 8 * SIZE(BO), %xmm11
  878. pxor %xmm3, %xmm3
  /* rax = number of passes through the unrolled loop (count >> 3);
     .L115 is taken directly when that is zero. */
  879. #if defined(LT) || defined(RN)
  880. movq KK, %rax
  881. #else
  882. movq K, %rax
  883. subq KK, %rax
  884. #endif
  885. sarq $3, %rax
  886. je .L115
  887. ALIGN_4
  /* .L112: unrolled scalar multiply-accumulate loop for the single-value
     tile.  Each pass forms eight scalar products from AO elements 0..7
     and BO elements 0, 2, ..., 14 (BO is read with a stride of two),
     spread over xmm0..xmm3.  The operands for the next pass are reloaded
     (AO offsets 8 and 12, BO offsets 16 and 24) before AO advances by 8
     elements and BO by 16. */
  888. .L112:
  889. mulsd %xmm8, %xmm9
  890. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  891. movsd 1 * SIZE(AO), %xmm8
  892. addsd %xmm9, %xmm0
  893. movsd 16 * SIZE(BO), %xmm9
  894. mulsd 2 * SIZE(BO), %xmm8
  895. addsd %xmm8, %xmm1
  896. movsd 2 * SIZE(AO), %xmm8
  897. mulsd 4 * SIZE(BO), %xmm8
  898. addsd %xmm8, %xmm2
  899. movsd 3 * SIZE(AO), %xmm8
  900. mulsd 6 * SIZE(BO), %xmm8
  901. addsd %xmm8, %xmm3
  902. movsd 8 * SIZE(AO), %xmm8
  903. mulsd %xmm10, %xmm11
  904. movsd 5 * SIZE(AO), %xmm10
  905. addsd %xmm11, %xmm0
  906. movsd 24 * SIZE(BO), %xmm11
  907. mulsd 10 * SIZE(BO), %xmm10
  908. addsd %xmm10, %xmm1
  909. movsd 6 * SIZE(AO), %xmm10
  910. mulsd 12 * SIZE(BO), %xmm10
  911. addsd %xmm10, %xmm2
  912. movsd 7 * SIZE(AO), %xmm10
  913. mulsd 14 * SIZE(BO), %xmm10
  914. addsd %xmm10, %xmm3
  915. movsd 12 * SIZE(AO), %xmm10
  916. addq $ 8 * SIZE, AO
  917. addq $16 * SIZE, BO
  918. decq %rax
  919. jne .L112
  920. ALIGN_4
  /* .L115: leftover iterations for the single-value tile.  rax is the low
     three bits of the same count that was shifted right by 3 before the
     unrolled loop; .L118 is taken directly when it is zero. */
  921. .L115:
  922. #if defined(LT) || defined(RN)
  923. movq KK, %rax
  924. #else
  925. movq K, %rax
  926. subq KK, %rax
  927. #endif
  928. andq $7, %rax # rax = count & 7 (leftover iterations)
  929. BRANCH
  930. je .L118
  931. ALIGN_4
  /* .L116: one leftover step: xmm0 += xmm8 * xmm9 (scalar), then reload
     the operands from the next element of AO and the element two ahead
     in BO. */
  932. .L116:
  933. mulsd %xmm8, %xmm9
  934. movsd 1 * SIZE(AO), %xmm8
  935. addsd %xmm9, %xmm0
  936. movsd 2 * SIZE(BO), %xmm9
  937. addq $1 * SIZE, AO # aoffset += 1
  938. addq $2 * SIZE, BO # boffset += 2
  939. decq %rax
  940. jg .L116
  941. ALIGN_4
  /* .L118: finish the single-value tile.
     - sum the four scalar accumulators into xmm0;
     - for LN/RT, recompute AO, B and BO from AORIG, BORIG and BUFFER;
     - load the right-hand side (B for LN/LT, AO otherwise) and subtract;
     - multiply by one entry (AO for LN/LT, B for RN/RT);
     - store to CO1, write back into the packed data, update pointers and KK.
     NOTE(review): the entry is multiplied, not divided, so it is
     presumably stored pre-inverted by the packing code - confirm. */
  942. .L118:
  943. addsd %xmm2, %xmm0
  944. addsd %xmm3, %xmm1
  945. addsd %xmm1, %xmm0
  946. #if defined(LN) || defined(RT)
  947. movq KK, %rax
  /* Both branches subtract 1; the #ifdef is kept only for symmetry with
     the larger tiles. */
  948. #ifdef LN
  949. subq $1, %rax
  950. #else
  951. subq $1, %rax
  952. #endif
  953. movq AORIG, AO
  954. movq BORIG, B
  955. leaq BUFFER, BO
  956. leaq (, %rax, SIZE), %rax
  957. leaq (AO, %rax, 1), AO
  958. leaq (B, %rax, 1), B
  959. leaq (BO, %rax, 2), BO
  960. #endif
  961. #if defined(LN) || defined(LT)
  962. movsd 0 * SIZE(B), %xmm2
  963. subsd %xmm0, %xmm2
  964. #else
  965. movsd 0 * SIZE(AO), %xmm2
  966. subsd %xmm0, %xmm2
  967. #endif
  968. #ifdef LN
  969. movsd 0 * SIZE(AO), %xmm4
  970. mulsd %xmm4, %xmm2
  971. #endif
  972. #ifdef LT
  973. movsd 0 * SIZE(AO), %xmm4
  974. mulsd %xmm4, %xmm2
  975. #endif
  976. #ifdef RN
  977. movsd 0 * SIZE(B), %xmm0
  978. mulsd %xmm0, %xmm2
  979. #endif
  980. #ifdef RT
  981. movsd 0 * SIZE(B), %xmm0
  982. mulsd %xmm0, %xmm2
  983. #endif
  984. #ifdef LN
  985. subq $1 * SIZE, CO1
  986. #endif
  /* Both preprocessor branches below emit the same store. */
  987. #if defined(LN) || defined(LT)
  988. movsd %xmm2, 0 * SIZE(CO1)
  989. #else
  990. movsd %xmm2, 0 * SIZE(CO1)
  991. #endif
  /* Write back: LN/LT update B and the BO copy (value stored twice);
     RN/RT update AO. */
  992. #if defined(LN) || defined(LT)
  993. movsd %xmm2, 0 * SIZE(B)
  994. movlpd %xmm2, 0 * SIZE(BO)
  995. movlpd %xmm2, 1 * SIZE(BO)
  996. #else
  997. movsd %xmm2, 0 * SIZE(AO)
  998. #endif
  999. #ifndef LN
  1000. addq $1 * SIZE, CO1
  1001. #endif
  1002. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) elements of AO. */
  1003. movq K, %rax
  1004. subq KK, %rax
  1005. leaq (,%rax, SIZE), %rax
  1006. leaq (AO, %rax, 1), AO
  1007. #ifdef LT
  1008. addq $1 * SIZE, B
  1009. #endif
  1010. #endif
  1011. #ifdef LN
  1012. subq $1, KK
  1013. movq BORIG, B
  1014. #endif
  1015. #ifdef LT
  1016. addq $1, KK
  1017. #endif
  1018. #ifdef RT
  1019. movq K, %rax
  1020. movq BORIG, B
  1021. salq $0 + BASE_SHIFT, %rax
  1022. addq %rax, AORIG
  1023. #endif
  1024. ALIGN_4
  /* .L119: end-of-column bookkeeping for the single-column pass.
     LN advances B by K elements; LT/RN advance B by (K - KK) elements;
     RN increments KK and RT decrements it. */
  1025. .L119:
  1026. #ifdef LN
  1027. leaq (, K, SIZE), %rax
  1028. leaq (B, %rax, 1), B
  1029. #endif
  1030. #if defined(LT) || defined(RN)
  1031. movq K, %rax
  1032. subq KK, %rax
  1033. leaq (,%rax, SIZE), %rax
  1034. leaq (B, %rax, 1), B
  1035. #endif
  1036. #ifdef RN
  1037. addq $1, KK
  1038. #endif
  1039. #ifdef RT
  1040. subq $1, KK
  1041. #endif
  1042. ALIGN_4
  /* .L40: run the two-column pass only when bit 1 of N is set;
     otherwise continue at .L80. */
  1043. .L40:
  1044. testq $2, N
  1045. je .L80
  1046. ALIGN_4
  /* .L41: prepare to copy two columns of B into BUFFER.
     LN sets KK = OFFSET + M and LT sets KK = OFFSET.  RT first moves B
     back by K << (1 + BASE_SHIFT) bytes.  LN/RT save B in BORIG, then
     skip KK * 2 elements of B and KK * 4 elements of BO.  rax ends up as
     the copy count (KK for LT/RN, K - KK otherwise) divided by 4. */
  1047. .L41:
  1048. /* Copying to Sub Buffer */
  1049. #ifdef LN
  1050. movq OFFSET, %rax
  1051. addq M, %rax
  1052. movq %rax, KK
  1053. #endif
  1054. leaq BUFFER, BO
  1055. #ifdef RT
  1056. movq K, %rax
  1057. salq $1 + BASE_SHIFT, %rax
  1058. subq %rax, B
  1059. #endif
  1060. #if defined(LN) || defined(RT)
  1061. movq KK, %rax
  1062. movq B, BORIG
  1063. leaq (, %rax, SIZE), %rax
  1064. leaq (B, %rax, 2), B
  1065. leaq (BO, %rax, 4), BO
  1066. #endif
  1067. #ifdef LT
  1068. movq OFFSET, %rax
  1069. movq %rax, KK
  1070. #endif
  1071. #if defined(LT) || defined(RN)
  1072. movq KK, %rax
  1073. #else
  1074. movq K, %rax
  1075. subq KK, %rax
  1076. #endif
  1077. sarq $2, %rax
  1078. jle .L43
  1079. ALIGN_4
  /* .L42: copy loop, four k steps per pass.  Eight consecutive elements
     are read from B and each one is written twice in a row to BO, so a
     later packed load from BO yields that element in both halves.
     B advances by 8 elements and BO by 16; the stores use negative
     offsets because BO is advanced before them. */
  1080. .L42:
  1081. PREFETCH 56 * SIZE(B)
  1082. movsd 0 * SIZE(B), %xmm0
  1083. movsd 1 * SIZE(B), %xmm1
  1084. movsd 2 * SIZE(B), %xmm2
  1085. movsd 3 * SIZE(B), %xmm3
  1086. movsd 4 * SIZE(B), %xmm4
  1087. movsd 5 * SIZE(B), %xmm5
  1088. movsd 6 * SIZE(B), %xmm6
  1089. movsd 7 * SIZE(B), %xmm7
  1090. addq $ 8 * SIZE, B
  1091. addq $16 * SIZE, BO
  1092. movsd %xmm0, -16 * SIZE(BO)
  1093. movsd %xmm0, -15 * SIZE(BO)
  1094. movsd %xmm1, -14 * SIZE(BO)
  1095. movsd %xmm1, -13 * SIZE(BO)
  1096. movsd %xmm2, -12 * SIZE(BO)
  1097. movsd %xmm2, -11 * SIZE(BO)
  1098. movsd %xmm3, -10 * SIZE(BO)
  1099. movsd %xmm3, -9 * SIZE(BO)
  1100. movsd %xmm4, -8 * SIZE(BO)
  1101. movsd %xmm4, -7 * SIZE(BO)
  1102. movsd %xmm5, -6 * SIZE(BO)
  1103. movsd %xmm5, -5 * SIZE(BO)
  1104. movsd %xmm6, -4 * SIZE(BO)
  1105. movsd %xmm6, -3 * SIZE(BO)
  1106. movsd %xmm7, -2 * SIZE(BO)
  1107. movsd %xmm7, -1 * SIZE(BO)
  1108. decq %rax
  1109. jne .L42
  1110. ALIGN_4
  /* .L43: leftover copy steps.  rax is the low two bits of the same count
     that was shifted right by 2 before .L42; .L50 is taken directly when
     it is zero. */
  1111. .L43:
  1112. #if defined(LT) || defined(RN)
  1113. movq KK, %rax
  1114. #else
  1115. movq K, %rax
  1116. subq KK, %rax
  1117. #endif
  1118. andq $3, %rax
  1119. BRANCH
  1120. jle .L50
  1121. ALIGN_4
  /* .L44: copy one k step: two elements of B, each written twice to BO. */
  1122. .L44:
  1123. movsd 0 * SIZE(B), %xmm0
  1124. movsd 1 * SIZE(B), %xmm1
  1125. movsd %xmm0, 0 * SIZE(BO)
  1126. movsd %xmm0, 1 * SIZE(BO)
  1127. movsd %xmm1, 2 * SIZE(BO)
  1128. movsd %xmm1, 3 * SIZE(BO)
  1129. addq $2 * SIZE, B
  1130. addq $4 * SIZE, BO
  1131. decq %rax
  1132. jne .L44
  1133. ALIGN_4
  /* .L50: set up pointers for the two-column pass.
     AO (LT/RN) or AORIG (LN/RT) is reset to A.  RT moves C back by two
     LDC strides before taking the column pointers; the other variants
     advance C by two LDC strides afterwards.  CO1 and CO2 address the two
     output columns.  I = M >> 2 is the number of four-row tiles; .L60 is
     taken directly when there are none. */
  1134. .L50:
  1135. #if defined(LT) || defined(RN)
  1136. movq A, AO
  1137. #else
  1138. movq A, AORIG
  1139. #endif
  1140. #ifdef RT
  1141. leaq (, LDC, 2), %rax
  1142. subq %rax, C
  1143. #endif
  1144. movq C, CO1 # coffset1 = c
  1145. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  1146. #ifndef RT
  1147. leaq (C, LDC, 2), C
  1148. #endif
  1149. movq M, I
  1150. sarq $2, I # i = (m >> 2)
  1151. jle .L60
  1152. ALIGN_4
  /* .L51: set up one 4-row by 2-column tile.  LN first moves AORIG back
     by K << (2 + BASE_SHIFT) bytes; LN/RT then start AO at AORIG + KK * 4
     elements and BO at BUFFER + KK * 4 elements.  xmm0, xmm1, xmm4 and
     xmm5 are cleared as accumulators, xmm8..xmm15 are preloaded from
     AO/BO, and the output locations on CO1/CO2 are prefetched. */
  1153. .L51:
  1154. #ifdef LN
  1155. movq K, %rax
  1156. salq $2 + BASE_SHIFT, %rax
  1157. subq %rax, AORIG
  1158. #endif
  1159. #if defined(LN) || defined(RT)
  1160. movq KK, %rax
  1161. movq AORIG, AO
  1162. leaq (, %rax, SIZE), %rax
  1163. leaq (AO, %rax, 4), AO
  1164. #endif
  1165. leaq BUFFER, BO
  1166. #if defined(LN) || defined(RT)
  1167. movq KK, %rax
  1168. salq $1 + BASE_SHIFT, %rax
  1169. leaq (BO, %rax, 2), BO
  1170. #endif
  1171. movapd 0 * SIZE(AO), %xmm8
  1172. pxor %xmm0, %xmm0
  1173. movapd 0 * SIZE(BO), %xmm9
  1174. pxor %xmm1, %xmm1
  1175. movapd 8 * SIZE(AO), %xmm10
  1176. pxor %xmm4, %xmm4
  1177. movapd 8 * SIZE(BO), %xmm11
  1178. pxor %xmm5, %xmm5
  1179. movapd 16 * SIZE(AO), %xmm12
  1180. movapd 16 * SIZE(BO), %xmm13
  1181. movapd 24 * SIZE(AO), %xmm14
  1182. movapd 24 * SIZE(BO), %xmm15
  1183. PREFETCHW 4 * SIZE(CO1)
  1184. PREFETCHW 4 * SIZE(CO2)
  /* rax = number of passes through the unrolled loop (count >> 3);
     .L55 is taken directly when that is zero. */
  1185. #if defined(LT) || defined(RN)
  1186. movq KK, %rax
  1187. #else
  1188. movq K, %rax
  1189. subq KK, %rax
  1190. #endif
  1191. sarq $3, %rax
  1192. je .L55
  1193. ALIGN_4
  /* .L52: unrolled multiply-accumulate loop for the 4x2 tile.
     Each pass consumes 32 elements of AO and 32 of BO in four groups of
     two k steps; the groups use the register pairs xmm8/xmm9,
     xmm10/xmm11, xmm12/xmm13 and xmm14/xmm15.
     For one k step (4 elements of AO, 4 of BO):
       xmm0 += AO[0..1] * BO[0..1]    xmm1 += AO[0..1] * BO[2..3]
       xmm4 += AO[2..3] * BO[0..1]    xmm5 += AO[2..3] * BO[2..3]
     Each group ends by reloading its pair from 32 elements further on,
     which is where the next pass will find it once AO and BO have been
     advanced. */
  1194. .L52:
  1195. mulpd %xmm8, %xmm9
  1196. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1197. mulpd 2 * SIZE(BO), %xmm8
  1198. addpd %xmm9, %xmm0
  1199. movapd 0 * SIZE(BO), %xmm9
  1200. addpd %xmm8, %xmm1
  1201. movapd 2 * SIZE(AO), %xmm8
  1202. mulpd %xmm8, %xmm9
  1203. mulpd 2 * SIZE(BO), %xmm8
  1204. addpd %xmm9, %xmm4
  1205. movapd 4 * SIZE(BO), %xmm9
  1206. addpd %xmm8, %xmm5
  1207. movapd 4 * SIZE(AO), %xmm8
  1208. mulpd %xmm8, %xmm9
  1209. mulpd 6 * SIZE(BO), %xmm8
  1210. addpd %xmm9, %xmm0
  1211. movapd 4 * SIZE(BO), %xmm9
  1212. addpd %xmm8, %xmm1
  1213. movapd 6 * SIZE(AO), %xmm8
  1214. mulpd %xmm8, %xmm9
  1215. mulpd 6 * SIZE(BO), %xmm8
  1216. addpd %xmm9, %xmm4
  1217. movapd 32 * SIZE(BO), %xmm9
  1218. addpd %xmm8, %xmm5
  1219. movapd 32 * SIZE(AO), %xmm8
  /* Second group: k steps 2 and 3, register pair xmm10/xmm11. */
  1220. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  1221. mulpd %xmm10, %xmm11
  1222. mulpd 10 * SIZE(BO), %xmm10
  1223. addpd %xmm11, %xmm0
  1224. movapd 8 * SIZE(BO), %xmm11
  1225. addpd %xmm10, %xmm1
  1226. movapd 10 * SIZE(AO), %xmm10
  1227. mulpd %xmm10, %xmm11
  1228. mulpd 10 * SIZE(BO), %xmm10
  1229. addpd %xmm11, %xmm4
  1230. movapd 12 * SIZE(BO), %xmm11
  1231. addpd %xmm10, %xmm5
  1232. movapd 12 * SIZE(AO), %xmm10
  1233. mulpd %xmm10, %xmm11
  1234. mulpd 14 * SIZE(BO), %xmm10
  1235. addpd %xmm11, %xmm0
  1236. movapd 12 * SIZE(BO), %xmm11
  1237. addpd %xmm10, %xmm1
  1238. movapd 14 * SIZE(AO), %xmm10
  1239. mulpd %xmm10, %xmm11
  1240. mulpd 14 * SIZE(BO), %xmm10
  1241. addpd %xmm11, %xmm4
  1242. movapd 40 * SIZE(BO), %xmm11
  1243. addpd %xmm10, %xmm5
  1244. movapd 40 * SIZE(AO), %xmm10
  /* Third group: k steps 4 and 5, register pair xmm12/xmm13. */
  1245. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  1246. mulpd %xmm12, %xmm13
  1247. mulpd 18 * SIZE(BO), %xmm12
  1248. addpd %xmm13, %xmm0
  1249. movapd 16 * SIZE(BO), %xmm13
  1250. addpd %xmm12, %xmm1
  1251. movapd 18 * SIZE(AO), %xmm12
  1252. mulpd %xmm12, %xmm13
  1253. mulpd 18 * SIZE(BO), %xmm12
  1254. addpd %xmm13, %xmm4
  1255. movapd 20 * SIZE(BO), %xmm13
  1256. addpd %xmm12, %xmm5
  1257. movapd 20 * SIZE(AO), %xmm12
  1258. mulpd %xmm12, %xmm13
  1259. mulpd 22 * SIZE(BO), %xmm12
  1260. addpd %xmm13, %xmm0
  1261. movapd 20 * SIZE(BO), %xmm13
  1262. addpd %xmm12, %xmm1
  1263. movapd 22 * SIZE(AO), %xmm12
  1264. mulpd %xmm12, %xmm13
  1265. mulpd 22 * SIZE(BO), %xmm12
  1266. addpd %xmm13, %xmm4
  1267. movapd 48 * SIZE(BO), %xmm13
  1268. addpd %xmm12, %xmm5
  1269. movapd 48 * SIZE(AO), %xmm12
  /* Fourth group: k steps 6 and 7, register pair xmm14/xmm15. */
  1270. PREFETCH (PREFETCHSIZE + 24) * SIZE(AO)
  1271. mulpd %xmm14, %xmm15
  1272. mulpd 26 * SIZE(BO), %xmm14
  1273. addpd %xmm15, %xmm0
  1274. movapd 24 * SIZE(BO), %xmm15
  1275. addpd %xmm14, %xmm1
  1276. movapd 26 * SIZE(AO), %xmm14
  1277. mulpd %xmm14, %xmm15
  1278. mulpd 26 * SIZE(BO), %xmm14
  1279. addpd %xmm15, %xmm4
  1280. movapd 28 * SIZE(BO), %xmm15
  1281. addpd %xmm14, %xmm5
  1282. movapd 28 * SIZE(AO), %xmm14
  1283. mulpd %xmm14, %xmm15
  1284. mulpd 30 * SIZE(BO), %xmm14
  1285. addpd %xmm15, %xmm0
  1286. movapd 28 * SIZE(BO), %xmm15
  1287. addpd %xmm14, %xmm1
  1288. movapd 30 * SIZE(AO), %xmm14
  1289. mulpd %xmm14, %xmm15
  1290. mulpd 30 * SIZE(BO), %xmm14
  1291. addpd %xmm15, %xmm4
  1292. movapd 56 * SIZE(BO), %xmm15
  1293. addpd %xmm14, %xmm5
  1294. movapd 56 * SIZE(AO), %xmm14
  1295. addq $32 * SIZE, AO
  1296. addq $32 * SIZE, BO
  1297. decq %rax
  1298. jne .L52
  1299. ALIGN_4
  /* .L55: leftover iterations for the 4x2 tile.  rax is the low three
     bits of the same count that was shifted right by 3 before the
     unrolled loop; .L59 is taken directly when it is zero. */
  1300. .L55:
  1301. #if defined(LT) || defined(RN)
  1302. movq KK, %rax
  1303. #else
  1304. movq K, %rax
  1305. subq KK, %rax
  1306. #endif
  1307. andq $7, %rax # rax = count & 7 (leftover iterations)
  1308. BRANCH
  1309. je .L59
  1310. ALIGN_4
  /* .L56: one leftover k step.  xmm8 enters holding AO[0..1]; xmm9 is
     reloaded from BO at the top, so its previous contents are not used.
       xmm0 += AO[0..1] * BO[0..1]    xmm1 += AO[0..1] * BO[2..3]
       xmm4 += AO[2..3] * BO[0..1]    xmm5 += AO[2..3] * BO[2..3]
     xmm8 is then reloaded from AO[4..5] for the next step. */
  1311. .L56:
  1312. movapd 0 * SIZE(BO), %xmm9
  1313. mulpd %xmm8, %xmm9
  1314. addpd %xmm9, %xmm0
  1315. mulpd 2 * SIZE(BO), %xmm8
  1316. addpd %xmm8, %xmm1
  1317. movapd 2 * SIZE(AO), %xmm8
  1318. movapd 0 * SIZE(BO), %xmm9
  1319. mulpd %xmm8, %xmm9
  1320. addpd %xmm9, %xmm4
  1321. mulpd 2 * SIZE(BO), %xmm8
  1322. addpd %xmm8, %xmm5
  1323. movapd 4 * SIZE(AO), %xmm8
  1324. addq $4 * SIZE, AO # aoffset += 4
  1325. addq $4 * SIZE, BO # boffset += 4
  1326. decq %rax
  1327. jg .L56
  1328. ALIGN_4
  /* .L59: finish one 4-row by 2-column tile.
     - for LN/RT, recompute AO, B and BO from AORIG, BORIG and BUFFER;
     - LN/LT: interleave the accumulators so that each of xmm1, xmm5,
       xmm9, xmm13 holds one row (low half for CO1, high half for CO2),
       then subtract from the values loaded from B;
     - RN/RT: subtract the accumulators from the values loaded from AO
       (xmm8/xmm9 are stored to CO1, xmm10/xmm11 to CO2);
     - apply the variant-specific substitution;
     - store to CO1/CO2, write back into the packed data, update
       pointers and KK, then loop to .L51 while I > 0.
     NOTE(review): diagonal entries are multiplied, never divided, so they
     are presumably stored pre-inverted by the packing code - confirm. */
  1329. .L59:
  1330. #if defined(LN) || defined(RT)
  /* rax = KK - 4 (LN) or KK - 2 (RT), scaled by SIZE below. */
  1331. movq KK, %rax
  1332. #ifdef LN
  1333. subq $4, %rax
  1334. #else
  1335. subq $2, %rax
  1336. #endif
  1337. movq AORIG, AO
  1338. movq BORIG, B
  1339. leaq BUFFER, BO
  1340. leaq (, %rax, SIZE), %rax
  1341. leaq (AO, %rax, 4), AO
  1342. leaq (B, %rax, 2), B
  1343. leaq (BO, %rax, 4), BO
  1344. #endif
  1345. #if defined(LN) || defined(LT)
  1346. movapd %xmm0, %xmm8
  1347. unpcklpd %xmm1, %xmm0
  1348. unpckhpd %xmm1, %xmm8
  1349. movapd %xmm4, %xmm12
  1350. unpcklpd %xmm5, %xmm4
  1351. unpckhpd %xmm5, %xmm12
  1352. movapd 0 * SIZE(B), %xmm1
  1353. movapd 2 * SIZE(B), %xmm5
  1354. movapd 4 * SIZE(B), %xmm9
  1355. movapd 6 * SIZE(B), %xmm13
  1356. subpd %xmm0, %xmm1
  1357. subpd %xmm8, %xmm5
  1358. subpd %xmm4, %xmm9
  1359. subpd %xmm12, %xmm13
  1360. #else
  1361. movapd 0 * SIZE(AO), %xmm8
  1362. movapd 2 * SIZE(AO), %xmm9
  1363. movapd 4 * SIZE(AO), %xmm10
  1364. movapd 6 * SIZE(AO), %xmm11
  1365. subpd %xmm0, %xmm8
  1366. subpd %xmm4, %xmm9
  1367. subpd %xmm1, %xmm10
  1368. subpd %xmm5, %xmm11
  1369. #endif
  1370. #ifdef LN
  /* LN: substitute from row 3 (xmm13) down to row 0 (xmm1).  Each AO
     entry is broadcast to both halves so both columns are handled at
     once.  Entries used: 15, 14..12, 10, 9..8, 5, 4, 0. */
  1371. movlpd 15 * SIZE(AO), %xmm0
  1372. movhpd 15 * SIZE(AO), %xmm0
  1373. mulpd %xmm0, %xmm13
  1374. movlpd 14 * SIZE(AO), %xmm2
  1375. movhpd 14 * SIZE(AO), %xmm2
  1376. mulpd %xmm13, %xmm2
  1377. subpd %xmm2, %xmm9
  1378. movlpd 13 * SIZE(AO), %xmm4
  1379. movhpd 13 * SIZE(AO), %xmm4
  1380. mulpd %xmm13, %xmm4
  1381. subpd %xmm4, %xmm5
  1382. movlpd 12 * SIZE(AO), %xmm6
  1383. movhpd 12 * SIZE(AO), %xmm6
  1384. mulpd %xmm13, %xmm6
  1385. subpd %xmm6, %xmm1
  1386. movlpd 10 * SIZE(AO), %xmm0
  1387. movhpd 10 * SIZE(AO), %xmm0
  1388. mulpd %xmm0, %xmm9
  1389. movlpd 9 * SIZE(AO), %xmm2
  1390. movhpd 9 * SIZE(AO), %xmm2
  1391. mulpd %xmm9, %xmm2
  1392. subpd %xmm2, %xmm5
  1393. movlpd 8 * SIZE(AO), %xmm4
  1394. movhpd 8 * SIZE(AO), %xmm4
  1395. mulpd %xmm9, %xmm4
  1396. subpd %xmm4, %xmm1
  1397. movlpd 5 * SIZE(AO), %xmm0
  1398. movhpd 5 * SIZE(AO), %xmm0
  1399. mulpd %xmm0, %xmm5
  1400. movlpd 4 * SIZE(AO), %xmm2
  1401. movhpd 4 * SIZE(AO), %xmm2
  1402. mulpd %xmm5, %xmm2
  1403. subpd %xmm2, %xmm1
  1404. movlpd 0 * SIZE(AO), %xmm0
  1405. movhpd 0 * SIZE(AO), %xmm0
  1406. mulpd %xmm0, %xmm1
  1407. #endif
  1408. #ifdef LT
  /* LT: substitute from row 0 (xmm1) up to row 3 (xmm13).
     Entries used: 0, 1..3, 5, 6..7, 10, 11, 15. */
  1409. movlpd 0 * SIZE(AO), %xmm0
  1410. movhpd 0 * SIZE(AO), %xmm0
  1411. mulpd %xmm0, %xmm1
  1412. movlpd 1 * SIZE(AO), %xmm2
  1413. movhpd 1 * SIZE(AO), %xmm2
  1414. mulpd %xmm1, %xmm2
  1415. subpd %xmm2, %xmm5
  1416. movlpd 2 * SIZE(AO), %xmm4
  1417. movhpd 2 * SIZE(AO), %xmm4
  1418. mulpd %xmm1, %xmm4
  1419. subpd %xmm4, %xmm9
  1420. movlpd 3 * SIZE(AO), %xmm6
  1421. movhpd 3 * SIZE(AO), %xmm6
  1422. mulpd %xmm1, %xmm6
  1423. subpd %xmm6, %xmm13
  1424. movlpd 5 * SIZE(AO), %xmm0
  1425. movhpd 5 * SIZE(AO), %xmm0
  1426. mulpd %xmm0, %xmm5
  1427. movlpd 6 * SIZE(AO), %xmm2
  1428. movhpd 6 * SIZE(AO), %xmm2
  1429. mulpd %xmm5, %xmm2
  1430. subpd %xmm2, %xmm9
  1431. movlpd 7 * SIZE(AO), %xmm4
  1432. movhpd 7 * SIZE(AO), %xmm4
  1433. mulpd %xmm5, %xmm4
  1434. subpd %xmm4, %xmm13
  1435. movlpd 10 * SIZE(AO), %xmm0
  1436. movhpd 10 * SIZE(AO), %xmm0
  1437. mulpd %xmm0, %xmm9
  1438. movlpd 11 * SIZE(AO), %xmm2
  1439. movhpd 11 * SIZE(AO), %xmm2
  1440. mulpd %xmm9, %xmm2
  1441. subpd %xmm2, %xmm13
  1442. movlpd 15 * SIZE(AO), %xmm0
  1443. movhpd 15 * SIZE(AO), %xmm0
  1444. mulpd %xmm0, %xmm13
  1445. #endif
  1446. #ifdef RN
  /* RN: scale the first column (xmm8/xmm9) by B[0], eliminate it from
     the second column (xmm10/xmm11) using B[1], then scale the second
     column by B[3]. */
  1447. movlpd 0 * SIZE(B), %xmm0
  1448. movhpd 0 * SIZE(B), %xmm0
  1449. mulpd %xmm0, %xmm8
  1450. mulpd %xmm0, %xmm9
  1451. movlpd 1 * SIZE(B), %xmm1
  1452. movhpd 1 * SIZE(B), %xmm1
  1453. mulpd %xmm8, %xmm1
  1454. subpd %xmm1, %xmm10
  1455. movlpd 1 * SIZE(B), %xmm1
  1456. movhpd 1 * SIZE(B), %xmm1
  1457. mulpd %xmm9, %xmm1
  1458. subpd %xmm1, %xmm11
  1459. movlpd 3 * SIZE(B), %xmm0
  1460. movhpd 3 * SIZE(B), %xmm0
  1461. mulpd %xmm0, %xmm10
  1462. mulpd %xmm0, %xmm11
  1463. #endif
  1464. #ifdef RT
  /* RT: scale the second column (xmm10/xmm11) by B[3], eliminate it
     from the first column (xmm8/xmm9) using B[2], then scale the first
     column by B[0]. */
  1465. movlpd 3 * SIZE(B), %xmm0
  1466. movhpd 3 * SIZE(B), %xmm0
  1467. mulpd %xmm0, %xmm10
  1468. mulpd %xmm0, %xmm11
  1469. movlpd 2 * SIZE(B), %xmm1
  1470. movhpd 2 * SIZE(B), %xmm1
  1471. mulpd %xmm10, %xmm1
  1472. subpd %xmm1, %xmm8
  1473. movlpd 2 * SIZE(B), %xmm1
  1474. movhpd 2 * SIZE(B), %xmm1
  1475. mulpd %xmm11, %xmm1
  1476. subpd %xmm1, %xmm9
  1477. movlpd 0 * SIZE(B), %xmm0
  1478. movhpd 0 * SIZE(B), %xmm0
  1479. mulpd %xmm0, %xmm8
  1480. mulpd %xmm0, %xmm9
  1481. #endif
  1482. #ifdef LN
  1483. subq $4 * SIZE, CO1
  1484. subq $4 * SIZE, CO2
  1485. #endif
  /* LN/LT registers hold one row each (low half to CO1, high half to
     CO2); RN/RT registers hold two rows of a single column. */
  1486. #if defined(LN) || defined(LT)
  1487. movsd %xmm1, 0 * SIZE(CO1)
  1488. movsd %xmm5, 1 * SIZE(CO1)
  1489. movsd %xmm9, 2 * SIZE(CO1)
  1490. movsd %xmm13, 3 * SIZE(CO1)
  1491. movhpd %xmm1, 0 * SIZE(CO2)
  1492. movhpd %xmm5, 1 * SIZE(CO2)
  1493. movhpd %xmm9, 2 * SIZE(CO2)
  1494. movhpd %xmm13, 3 * SIZE(CO2)
  1495. #else
  1496. movsd %xmm8, 0 * SIZE(CO1)
  1497. movhpd %xmm8, 1 * SIZE(CO1)
  1498. movsd %xmm9, 2 * SIZE(CO1)
  1499. movhpd %xmm9, 3 * SIZE(CO1)
  1500. movsd %xmm10, 0 * SIZE(CO2)
  1501. movhpd %xmm10, 1 * SIZE(CO2)
  1502. movsd %xmm11, 2 * SIZE(CO2)
  1503. movhpd %xmm11, 3 * SIZE(CO2)
  1504. #endif
  /* Write back: LN/LT update B and the BO copy (each value twice);
     RN/RT update AO. */
  1505. #if defined(LN) || defined(LT)
  1506. movapd %xmm1, 0 * SIZE(B)
  1507. movapd %xmm5, 2 * SIZE(B)
  1508. movapd %xmm9, 4 * SIZE(B)
  1509. movapd %xmm13, 6 * SIZE(B)
  1510. movlpd %xmm1, 0 * SIZE(BO)
  1511. movlpd %xmm1, 1 * SIZE(BO)
  1512. movhpd %xmm1, 2 * SIZE(BO)
  1513. movhpd %xmm1, 3 * SIZE(BO)
  1514. movlpd %xmm5, 4 * SIZE(BO)
  1515. movlpd %xmm5, 5 * SIZE(BO)
  1516. movhpd %xmm5, 6 * SIZE(BO)
  1517. movhpd %xmm5, 7 * SIZE(BO)
  1518. movlpd %xmm9, 8 * SIZE(BO)
  1519. movlpd %xmm9, 9 * SIZE(BO)
  1520. movhpd %xmm9, 10 * SIZE(BO)
  1521. movhpd %xmm9, 11 * SIZE(BO)
  1522. movlpd %xmm13, 12 * SIZE(BO)
  1523. movlpd %xmm13, 13 * SIZE(BO)
  1524. movhpd %xmm13, 14 * SIZE(BO)
  1525. movhpd %xmm13, 15 * SIZE(BO)
  1526. #else
  1527. movapd %xmm8, 0 * SIZE(AO)
  1528. movapd %xmm9, 2 * SIZE(AO)
  1529. movapd %xmm10, 4 * SIZE(AO)
  1530. movapd %xmm11, 6 * SIZE(AO)
  1531. #endif
  1532. #ifndef LN
  1533. addq $4 * SIZE, CO1
  1534. addq $4 * SIZE, CO2
  1535. #endif
  1536. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) * 4 elements of AO. */
  1537. movq K, %rax
  1538. subq KK, %rax
  1539. leaq (,%rax, SIZE), %rax
  1540. leaq (AO, %rax, 4), AO
  1541. #ifdef LT
  1542. addq $8 * SIZE, B
  1543. #endif
  1544. #endif
  1545. #ifdef LN
  1546. subq $4, KK
  1547. movq BORIG, B
  1548. #endif
  1549. #ifdef LT
  1550. addq $4, KK
  1551. #endif
  1552. #ifdef RT
  1553. movq K, %rax
  1554. movq BORIG, B
  1555. salq $2 + BASE_SHIFT, %rax
  1556. addq %rax, AORIG
  1557. #endif
  1558. decq I # i --
  1559. jg .L51
  1560. ALIGN_4
  /* .L60: handle a 2-row by 2-column tile only when bit 1 of M is set;
     otherwise continue at .L70. */
  1561. .L60:
  1562. testq $2, M
  1563. je .L70
  1564. ALIGN_4
  /* .L61: set up the 2x2 tile.  LN first moves AORIG back by
     K << (1 + BASE_SHIFT) bytes; LN/RT then start AO at AORIG + KK * 2
     elements and BO at BUFFER + KK * 4 elements.  xmm0..xmm3 are cleared
     as accumulators; xmm8/xmm10 are preloaded from AO and
     xmm9/xmm11/xmm13/xmm15 from BO. */
  1565. .L61:
  1566. #ifdef LN
  1567. movq K, %rax
  1568. salq $1 + BASE_SHIFT, %rax
  1569. subq %rax, AORIG
  1570. #endif
  1571. #if defined(LN) || defined(RT)
  1572. movq KK, %rax
  1573. movq AORIG, AO
  1574. leaq (, %rax, SIZE), %rax
  1575. leaq (AO, %rax, 2), AO
  1576. #endif
  1577. leaq BUFFER, BO
  1578. #if defined(LN) || defined(RT)
  1579. movq KK, %rax
  1580. salq $1 + BASE_SHIFT, %rax
  1581. leaq (BO, %rax, 2), BO
  1582. #endif
  1583. movapd 0 * SIZE(AO), %xmm8
  1584. pxor %xmm0, %xmm0
  1585. movapd 0 * SIZE(BO), %xmm9
  1586. pxor %xmm1, %xmm1
  1587. movapd 8 * SIZE(AO), %xmm10
  1588. pxor %xmm2, %xmm2
  1589. movapd 8 * SIZE(BO), %xmm11
  1590. pxor %xmm3, %xmm3
  1591. movapd 16 * SIZE(BO), %xmm13
  1592. movapd 24 * SIZE(BO), %xmm15
  /* rax = number of passes through the unrolled loop (count >> 3);
     .L65 is taken directly when that is zero. */
  1593. #if defined(LT) || defined(RN)
  1594. movq KK, %rax
  1595. #else
  1596. movq K, %rax
  1597. subq KK, %rax
  1598. #endif
  1599. sarq $3, %rax
  1600. je .L65
  1601. ALIGN_4
  /* .L62: unrolled multiply-accumulate loop for the 2x2 tile.
     Each pass consumes 16 elements of AO and 32 of BO (eight k steps of
     2 AO elements and 4 BO elements).  For one k step:
       acc_a += AO[0..1] * BO[0..1]    acc_b += AO[0..1] * BO[2..3]
     Even k steps accumulate into xmm0/xmm1, odd k steps into xmm2/xmm3.
     xmm8 carries AO for the first four steps and xmm10 for the last
     four; xmm9, xmm11, xmm13 and xmm15 carry preloaded BO operands and
     are reloaded from 32 elements further on for the next pass. */
  1602. .L62:
  1603. mulpd %xmm8, %xmm9
  1604. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1605. mulpd 2 * SIZE(BO), %xmm8
  1606. addpd %xmm9, %xmm0
  1607. movapd 4 * SIZE(BO), %xmm9
  1608. addpd %xmm8, %xmm1
  1609. movapd 2 * SIZE(AO), %xmm8
  1610. mulpd %xmm8, %xmm9
  1611. mulpd 6 * SIZE(BO), %xmm8
  1612. addpd %xmm9, %xmm2
  1613. movapd 32 * SIZE(BO), %xmm9
  1614. addpd %xmm8, %xmm3
  1615. movapd 4 * SIZE(AO), %xmm8
  1616. mulpd %xmm8, %xmm11
  1617. mulpd 10 * SIZE(BO), %xmm8
  1618. addpd %xmm11, %xmm0
  1619. movapd 12 * SIZE(BO), %xmm11
  1620. addpd %xmm8, %xmm1
  1621. movapd 6 * SIZE(AO), %xmm8
  1622. mulpd %xmm8, %xmm11
  1623. mulpd 14 * SIZE(BO), %xmm8
  1624. addpd %xmm11, %xmm2
  1625. movapd 40 * SIZE(BO), %xmm11
  1626. addpd %xmm8, %xmm3
  1627. movapd 16 * SIZE(AO), %xmm8
  /* Second half of the pass: k steps 4..7, AO operand in xmm10. */
  1628. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  1629. mulpd %xmm10, %xmm13
  1630. mulpd 18 * SIZE(BO), %xmm10
  1631. addpd %xmm13, %xmm0
  1632. movapd 20 * SIZE(BO), %xmm13
  1633. addpd %xmm10, %xmm1
  1634. movapd 10 * SIZE(AO), %xmm10
  1635. mulpd %xmm10, %xmm13
  1636. mulpd 22 * SIZE(BO), %xmm10
  1637. addpd %xmm13, %xmm2
  1638. movapd 48 * SIZE(BO), %xmm13
  1639. addpd %xmm10, %xmm3
  1640. movapd 12 * SIZE(AO), %xmm10
  1641. mulpd %xmm10, %xmm15
  1642. mulpd 26 * SIZE(BO), %xmm10
  1643. addpd %xmm15, %xmm0
  1644. movapd 28 * SIZE(BO), %xmm15
  1645. addpd %xmm10, %xmm1
  1646. movapd 14 * SIZE(AO), %xmm10
  1647. mulpd %xmm10, %xmm15
  1648. mulpd 30 * SIZE(BO), %xmm10
  1649. addpd %xmm15, %xmm2
  1650. movapd 56 * SIZE(BO), %xmm15
  1651. addpd %xmm10, %xmm3
  1652. movapd 24 * SIZE(AO), %xmm10
  1653. addq $16 * SIZE, AO
  1654. addq $32 * SIZE, BO
  1655. decq %rax
  1656. jne .L62
  1657. ALIGN_4
  /* .L65: leftover iterations for the 2x2 tile.  rax is the low three
     bits of the same count that was shifted right by 3 before the
     unrolled loop; .L69 is taken directly when it is zero. */
  1658. .L65:
  1659. #if defined(LT) || defined(RN)
  1660. movq KK, %rax
  1661. #else
  1662. movq K, %rax
  1663. subq KK, %rax
  1664. #endif
  1665. andq $7, %rax # rax = count & 7 (leftover iterations)
  1666. BRANCH
  1667. je .L69
  1668. ALIGN_4
  /* .L66: one leftover k step:
       xmm0 += AO[0..1] * BO[0..1]    xmm1 += AO[0..1] * BO[2..3]
     then reload xmm9 from BO[4..5] and xmm8 from AO[2..3] for the next
     step. */
  1669. .L66:
  1670. mulpd %xmm8, %xmm9
  1671. mulpd 2 * SIZE(BO), %xmm8
  1672. addpd %xmm9, %xmm0
  1673. movapd 4 * SIZE(BO), %xmm9
  1674. addpd %xmm8, %xmm1
  1675. movapd 2 * SIZE(AO), %xmm8
  1676. addq $2 * SIZE, AO # aoffset += 2
  1677. addq $4 * SIZE, BO # boffset += 4
  1678. decq %rax
  1679. jg .L66
  1680. ALIGN_4
  /* 2x2 tile epilogue: combine the accumulators, subtract them from the stored
     right-hand side, run the 2x2 substitution, then store the result to C and
     back into the packed buffers. */
  1681. .L69:
  1682. addpd %xmm2, %xmm0
  1683. addpd %xmm3, %xmm1
  /* LN/RT: rax = KK - 2, then AO = AORIG + rax*2*SIZE, B = BORIG + rax*2*SIZE,
     BO = BUFFER + rax*4*SIZE. Both preprocessor branches subtract the same 2. */
  1684. #if defined(LN) || defined(RT)
  1685. movq KK, %rax
  1686. #ifdef LN
  1687. subq $2, %rax
  1688. #else
  1689. subq $2, %rax
  1690. #endif
  1691. movq AORIG, AO
  1692. movq BORIG, B
  1693. leaq BUFFER, BO
  1694. leaq (, %rax, SIZE), %rax
  1695. leaq (AO, %rax, 2), AO
  1696. leaq (B, %rax, 2), B
  1697. leaq (BO, %rax, 4), BO
  1698. #endif
  /* LN/LT: interleave the lanes of xmm0/xmm1 (low lanes -> xmm0, high lanes -> xmm8)
     and form xmm1 = B[0..1] - xmm0, xmm5 = B[2..3] - xmm8.
     RN/RT: xmm8 = AO[0..1] - xmm0, xmm10 = AO[2..3] - xmm1. */
  1699. #if defined(LN) || defined(LT)
  1700. movapd %xmm0, %xmm8
  1701. unpcklpd %xmm1, %xmm0
  1702. unpckhpd %xmm1, %xmm8
  1703. movapd 0 * SIZE(B), %xmm1
  1704. movapd 2 * SIZE(B), %xmm5
  1705. subpd %xmm0, %xmm1
  1706. subpd %xmm8, %xmm5
  1707. #else
  1708. movapd 0 * SIZE(AO), %xmm8
  1709. movapd 2 * SIZE(AO), %xmm10
  1710. subpd %xmm0, %xmm8
  1711. subpd %xmm1, %xmm10
  1712. #endif
  /* In the four variants below each movlpd/movhpd pair loads the same scalar into
     both lanes of a register (a broadcast).
     NOTE(review): the entries at offsets 0 and 3 are multiplied, not divided -
     presumably they are stored pre-inverted by the packing code; confirm. */
  /* LN: xmm5 *= AO[3]; xmm1 -= AO[2] * xmm5; xmm1 *= AO[0]. */
  1713. #ifdef LN
  1714. movlpd 3 * SIZE(AO), %xmm0
  1715. movhpd 3 * SIZE(AO), %xmm0
  1716. mulpd %xmm0, %xmm5
  1717. movlpd 2 * SIZE(AO), %xmm2
  1718. movhpd 2 * SIZE(AO), %xmm2
  1719. mulpd %xmm5, %xmm2
  1720. subpd %xmm2, %xmm1
  1721. movlpd 0 * SIZE(AO), %xmm0
  1722. movhpd 0 * SIZE(AO), %xmm0
  1723. mulpd %xmm0, %xmm1
  1724. #endif
  /* LT: xmm1 *= AO[0]; xmm5 -= AO[1] * xmm1; xmm5 *= AO[3]. */
  1725. #ifdef LT
  1726. movlpd 0 * SIZE(AO), %xmm0
  1727. movhpd 0 * SIZE(AO), %xmm0
  1728. mulpd %xmm0, %xmm1
  1729. movlpd 1 * SIZE(AO), %xmm2
  1730. movhpd 1 * SIZE(AO), %xmm2
  1731. mulpd %xmm1, %xmm2
  1732. subpd %xmm2, %xmm5
  1733. movlpd 3 * SIZE(AO), %xmm0
  1734. movhpd 3 * SIZE(AO), %xmm0
  1735. mulpd %xmm0, %xmm5
  1736. #endif
  /* RN: xmm8 *= B[0]; xmm10 -= B[1] * xmm8; xmm10 *= B[3]. */
  1737. #ifdef RN
  1738. movlpd 0 * SIZE(B), %xmm0
  1739. movhpd 0 * SIZE(B), %xmm0
  1740. mulpd %xmm0, %xmm8
  1741. movlpd 1 * SIZE(B), %xmm1
  1742. movhpd 1 * SIZE(B), %xmm1
  1743. mulpd %xmm8, %xmm1
  1744. subpd %xmm1, %xmm10
  1745. movlpd 3 * SIZE(B), %xmm0
  1746. movhpd 3 * SIZE(B), %xmm0
  1747. mulpd %xmm0, %xmm10
  1748. #endif
  /* RT: xmm10 *= B[3]; xmm8 -= B[2] * xmm10; xmm8 *= B[0]. */
  1749. #ifdef RT
  1750. movlpd 3 * SIZE(B), %xmm0
  1751. movhpd 3 * SIZE(B), %xmm0
  1752. mulpd %xmm0, %xmm10
  1753. movlpd 2 * SIZE(B), %xmm1
  1754. movhpd 2 * SIZE(B), %xmm1
  1755. mulpd %xmm10, %xmm1
  1756. subpd %xmm1, %xmm8
  1757. movlpd 0 * SIZE(B), %xmm0
  1758. movhpd 0 * SIZE(B), %xmm0
  1759. mulpd %xmm0, %xmm8
  1760. #endif
  /* LN moves CO1/CO2 back by 2 elements before storing; the other variants
     advance them after the store (see the #ifndef LN below). */
  1761. #ifdef LN
  1762. subq $2 * SIZE, CO1
  1763. subq $2 * SIZE, CO2
  1764. #endif
  /* Store to C. LN/LT: low lanes of xmm1/xmm5 go to CO1, high lanes to CO2.
     RN/RT: xmm8 goes to CO1, xmm10 to CO2. */
  1765. #if defined(LN) || defined(LT)
  1766. movsd %xmm1, 0 * SIZE(CO1)
  1767. movsd %xmm5, 1 * SIZE(CO1)
  1768. movhpd %xmm1, 0 * SIZE(CO2)
  1769. movhpd %xmm5, 1 * SIZE(CO2)
  1770. #else
  1771. movsd %xmm8, 0 * SIZE(CO1)
  1772. movhpd %xmm8, 1 * SIZE(CO1)
  1773. movsd %xmm10, 0 * SIZE(CO2)
  1774. movhpd %xmm10, 1 * SIZE(CO2)
  1775. #endif
  /* Write the solved values back. LN/LT: to B[0..3], and to BO[0..7] with every
     scalar stored twice in adjacent slots. RN/RT: to AO[0..3]. */
  1776. #if defined(LN) || defined(LT)
  1777. movapd %xmm1, 0 * SIZE(B)
  1778. movapd %xmm5, 2 * SIZE(B)
  1779. movlpd %xmm1, 0 * SIZE(BO)
  1780. movlpd %xmm1, 1 * SIZE(BO)
  1781. movhpd %xmm1, 2 * SIZE(BO)
  1782. movhpd %xmm1, 3 * SIZE(BO)
  1783. movlpd %xmm5, 4 * SIZE(BO)
  1784. movlpd %xmm5, 5 * SIZE(BO)
  1785. movhpd %xmm5, 6 * SIZE(BO)
  1786. movhpd %xmm5, 7 * SIZE(BO)
  1787. #else
  1788. movapd %xmm8, 0 * SIZE(AO)
  1789. movapd %xmm10, 2 * SIZE(AO)
  1790. #endif
  1791. #ifndef LN
  1792. addq $2 * SIZE, CO1
  1793. addq $2 * SIZE, CO2
  1794. #endif
  /* LT/RN: AO += (K - KK) * 2 elements; LT additionally advances B by 4 elements. */
  1795. #if defined(LT) || defined(RN)
  1796. movq K, %rax
  1797. subq KK, %rax
  1798. leaq (,%rax, SIZE), %rax
  1799. leaq (AO, %rax, 2), AO
  1800. #ifdef LT
  1801. addq $4 * SIZE, B
  1802. #endif
  1803. #endif
  /* KK bookkeeping: LN steps KK down by 2 and resets B to BORIG; LT steps KK up by 2. */
  1804. #ifdef LN
  1805. subq $2, KK
  1806. movq BORIG, B
  1807. #endif
  1808. #ifdef LT
  1809. addq $2, KK
  1810. #endif
  /* RT: B = BORIG and AORIG += K << (1 + BASE_SHIFT). */
  1811. #ifdef RT
  1812. movq K, %rax
  1813. movq BORIG, B
  1814. salq $1 + BASE_SHIFT, %rax
  1815. addq %rax, AORIG
  1816. #endif
  1817. ALIGN_4
  /* Single leftover row of the 2-column panel: taken only when bit 0 of M is set. */
  1818. .L70:
  1819. testq $1, M
  1820. je .L79
  1821. ALIGN_4
  /* Set up AO/BO, zero xmm0-xmm3 and preload the operands for the scalar loop. */
  1822. .L71:
  /* LN: AORIG -= K << BASE_SHIFT. */
  1823. #ifdef LN
  1824. movq K, %rax
  1825. salq $0 + BASE_SHIFT, %rax
  1826. subq %rax, AORIG
  1827. #endif
  /* LN/RT: AO = AORIG + KK * SIZE. */
  1828. #if defined(LN) || defined(RT)
  1829. movq KK, %rax
  1830. movq AORIG, AO
  1831. leaq (, %rax, SIZE), %rax
  1832. leaq (AO, %rax, 1), AO
  1833. #endif
  1834. leaq BUFFER, BO
  /* LN/RT: BO = BUFFER + 2 * (KK << (1 + BASE_SHIFT)). */
  1835. #if defined(LN) || defined(RT)
  1836. movq KK, %rax
  1837. salq $1 + BASE_SHIFT, %rax
  1838. leaq (BO, %rax, 2), BO
  1839. #endif
  /* Preloads interleaved with accumulator clears: xmm8/xmm10 from AO[0]/AO[4],
     xmm9/xmm11/xmm13/xmm15 from BO[0]/BO[8]/BO[16]/BO[24]. */
  1840. movsd 0 * SIZE(AO), %xmm8
  1841. pxor %xmm0, %xmm0
  1842. movsd 0 * SIZE(BO), %xmm9
  1843. pxor %xmm1, %xmm1
  1844. movsd 4 * SIZE(AO), %xmm10
  1845. pxor %xmm2, %xmm2
  1846. movsd 8 * SIZE(BO), %xmm11
  1847. pxor %xmm3, %xmm3
  1848. movsd 16 * SIZE(BO), %xmm13
  1849. movsd 24 * SIZE(BO), %xmm15
  /* rax = count >> 3, count being KK (LT/RN) or K - KK (LN/RT); skip the
     unrolled loop when that is zero. */
  1850. #if defined(LT) || defined(RN)
  1851. movq KK, %rax
  1852. #else
  1853. movq K, %rax
  1854. subq KK, %rax
  1855. #endif
  1856. sarq $3, %rax
  1857. je .L75
  1858. ALIGN_4
  /* Unrolled scalar loop: each pass consumes AO[0..7] and BO[0..31].
     Step j multiplies AO[j] by BO[4j] and BO[4j+2]; even steps accumulate into
     xmm0/xmm1, odd steps into xmm2/xmm3. Loads at BO offsets 32/40/48/56 and
     AO offsets 8/12 fetch operands for the next pass, before AO/BO are advanced. */
  1859. .L72:
  1860. mulsd %xmm8, %xmm9
  1861. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1862. mulsd 2 * SIZE(BO), %xmm8
  1863. addsd %xmm9, %xmm0
  1864. movsd 4 * SIZE(BO), %xmm9
  1865. addsd %xmm8, %xmm1
  1866. movsd 1 * SIZE(AO), %xmm8
  1867. mulsd %xmm8, %xmm9
  1868. mulsd 6 * SIZE(BO), %xmm8
  1869. addsd %xmm9, %xmm2
  1870. movsd 32 * SIZE(BO), %xmm9
  1871. addsd %xmm8, %xmm3
  1872. movsd 2 * SIZE(AO), %xmm8
  1873. mulsd %xmm8, %xmm11
  1874. mulsd 10 * SIZE(BO), %xmm8
  1875. addsd %xmm11, %xmm0
  1876. movsd 12 * SIZE(BO), %xmm11
  1877. addsd %xmm8, %xmm1
  1878. movsd 3 * SIZE(AO), %xmm8
  1879. mulsd %xmm8, %xmm11
  1880. mulsd 14 * SIZE(BO), %xmm8
  1881. addsd %xmm11, %xmm2
  1882. movsd 40 * SIZE(BO), %xmm11
  1883. addsd %xmm8, %xmm3
  1884. movsd 8 * SIZE(AO), %xmm8
  /* Steps 4-7 use xmm10, which already holds AO[4]. */
  1885. mulsd %xmm10, %xmm13
  1886. mulsd 18 * SIZE(BO), %xmm10
  1887. addsd %xmm13, %xmm0
  1888. movsd 20 * SIZE(BO), %xmm13
  1889. addsd %xmm10, %xmm1
  1890. movsd 5 * SIZE(AO), %xmm10
  1891. mulsd %xmm10, %xmm13
  1892. mulsd 22 * SIZE(BO), %xmm10
  1893. addsd %xmm13, %xmm2
  1894. movsd 48 * SIZE(BO), %xmm13
  1895. addsd %xmm10, %xmm3
  1896. movsd 6 * SIZE(AO), %xmm10
  1897. mulsd %xmm10, %xmm15
  1898. mulsd 26 * SIZE(BO), %xmm10
  1899. addsd %xmm15, %xmm0
  1900. movsd 28 * SIZE(BO), %xmm15
  1901. addsd %xmm10, %xmm1
  1902. movsd 7 * SIZE(AO), %xmm10
  1903. mulsd %xmm10, %xmm15
  1904. mulsd 30 * SIZE(BO), %xmm10
  1905. addsd %xmm15, %xmm2
  1906. movsd 56 * SIZE(BO), %xmm15
  1907. addsd %xmm10, %xmm3
  1908. movsd 12 * SIZE(AO), %xmm10
  1909. addq $ 8 * SIZE, AO
  1910. addq $32 * SIZE, BO
  1911. decq %rax
  1912. jne .L72
  1913. ALIGN_4
  /* K remainder for the single-row tile: run the (count & 7) steps not covered by .L72. */
  1914. .L75:
  1915. #if defined(LT) || defined(RN)
  1916. movq KK, %rax
  1917. #else
  1918. movq K, %rax
  1919. subq KK, %rax
  1920. #endif
  1921. andq $7, %rax # rax = count & 7, count being KK (LT/RN) or K - KK (LN/RT)
  1922. BRANCH
  1923. je .L78
  1924. ALIGN_4
  /* One step per pass: xmm0 += xmm8 * xmm9 and xmm1 += xmm8 * BO[2];
     xmm8/xmm9 are then reloaded from AO[1]/BO[4] for the next pass. */
  1925. .L76:
  1926. mulsd %xmm8, %xmm9
  1927. mulsd 2 * SIZE(BO), %xmm8
  1928. addsd %xmm9, %xmm0
  1929. addsd %xmm8, %xmm1
  1930. movsd 1 * SIZE(AO), %xmm8
  1931. movsd 4 * SIZE(BO), %xmm9
  1932. addq $1 * SIZE, AO # AO += 1 element
  1933. addq $4 * SIZE, BO # BO += 4 elements
  1934. decq %rax
  1935. jg .L76
  1936. ALIGN_4
  /* 1x2 tile epilogue: combine the accumulators, subtract them from the stored
     right-hand side, substitute, then store to C and back into the packed buffers. */
  1937. .L78:
  1938. addsd %xmm2, %xmm0
  1939. addsd %xmm3, %xmm1
  /* LN/RT: rax = KK - 1 (LN) or KK - 2 (RT); then AO = AORIG + rax*SIZE,
     B = BORIG + rax*2*SIZE, BO = BUFFER + rax*4*SIZE. */
  1940. #if defined(LN) || defined(RT)
  1941. movq KK, %rax
  1942. #ifdef LN
  1943. subq $1, %rax
  1944. #else
  1945. subq $2, %rax
  1946. #endif
  1947. movq AORIG, AO
  1948. movq BORIG, B
  1949. leaq BUFFER, BO
  1950. leaq (, %rax, SIZE), %rax
  1951. leaq (AO, %rax, 1), AO
  1952. leaq (B, %rax, 2), B
  1953. leaq (BO, %rax, 4), BO
  1954. #endif
  /* Right-hand side comes from B (LN/LT) or AO (RN/RT); xmm4/xmm5 = rhs - accumulators. */
  1955. #if defined(LN) || defined(LT)
  1956. movsd 0 * SIZE(B), %xmm4
  1957. movsd 1 * SIZE(B), %xmm5
  1958. #else
  1959. movsd 0 * SIZE(AO), %xmm4
  1960. movsd 1 * SIZE(AO), %xmm5
  1961. #endif
  1962. subsd %xmm0, %xmm4
  1963. subsd %xmm1, %xmm5
  /* LN and LT are identical here: both values are scaled by AO[0].
     NOTE(review): scaling is a multiply - presumably the entry is stored
     pre-inverted by the packing code; confirm. */
  1964. #ifdef LN
  1965. movsd 0 * SIZE(AO), %xmm0
  1966. mulsd %xmm0, %xmm4
  1967. mulsd %xmm0, %xmm5
  1968. #endif
  1969. #ifdef LT
  1970. movsd 0 * SIZE(AO), %xmm0
  1971. mulsd %xmm0, %xmm4
  1972. mulsd %xmm0, %xmm5
  1973. #endif
  /* RN: xmm4 *= B[0]; xmm5 -= B[1] * xmm4; xmm5 *= B[3]. */
  1974. #ifdef RN
  1975. mulsd 0 * SIZE(B), %xmm4
  1976. movsd 1 * SIZE(B), %xmm1
  1977. mulsd %xmm4, %xmm1
  1978. subsd %xmm1, %xmm5
  1979. mulsd 3 * SIZE(B), %xmm5
  1980. #endif
  /* RT: xmm5 *= B[3]; xmm4 -= B[2] * xmm5; xmm4 *= B[0]. */
  1981. #ifdef RT
  1982. mulsd 3 * SIZE(B), %xmm5
  1983. movlpd 2 * SIZE(B), %xmm1
  1984. mulsd %xmm5, %xmm1
  1985. subsd %xmm1, %xmm4
  1986. mulsd 0 * SIZE(B), %xmm4
  1987. #endif
  /* LN moves CO1/CO2 back by one element before the store; other variants advance after it. */
  1988. #ifdef LN
  1989. subq $1 * SIZE, CO1
  1990. subq $1 * SIZE, CO2
  1991. #endif
  1992. movsd %xmm4, 0 * SIZE(CO1)
  1993. movsd %xmm5, 0 * SIZE(CO2)
  /* Write back: LN/LT to B[0..1] and, each value stored twice, to BO[0..3]; RN/RT to AO[0..1]. */
  1994. #if defined(LN) || defined(LT)
  1995. movsd %xmm4, 0 * SIZE(B)
  1996. movsd %xmm5, 1 * SIZE(B)
  1997. movsd %xmm4, 0 * SIZE(BO)
  1998. movsd %xmm4, 1 * SIZE(BO)
  1999. movsd %xmm5, 2 * SIZE(BO)
  2000. movsd %xmm5, 3 * SIZE(BO)
  2001. #else
  2002. movsd %xmm4, 0 * SIZE(AO)
  2003. movsd %xmm5, 1 * SIZE(AO)
  2004. #endif
  2005. #ifndef LN
  2006. addq $1 * SIZE, CO1
  2007. addq $1 * SIZE, CO2
  2008. #endif
  /* LT/RN: AO += (K - KK) elements; LT additionally advances B by 2 elements. */
  2009. #if defined(LT) || defined(RN)
  2010. movq K, %rax
  2011. subq KK, %rax
  2012. leaq (,%rax, SIZE), %rax
  2013. leaq (AO, %rax, 1), AO
  2014. #ifdef LT
  2015. addq $2 * SIZE, B
  2016. #endif
  2017. #endif
  /* KK bookkeeping: LN steps KK down by 1 and resets B to BORIG; LT steps KK up by 1. */
  2018. #ifdef LN
  2019. subq $1, KK
  2020. movq BORIG, B
  2021. #endif
  2022. #ifdef LT
  2023. addq $1, KK
  2024. #endif
  /* RT: B = BORIG and AORIG += K << BASE_SHIFT. */
  2025. #ifdef RT
  2026. movq K, %rax
  2027. movq BORIG, B
  2028. salq $0 + BASE_SHIFT, %rax
  2029. addq %rax, AORIG
  2030. #endif
  2031. ALIGN_4
  /* End of the 2-column panel: advance B past it and step KK by 2 for RN/RT. */
  2032. .L79:
  /* LN: B += K * 2 elements. */
  2033. #ifdef LN
  2034. leaq (, K, SIZE), %rax
  2035. leaq (B, %rax, 2), B
  2036. #endif
  /* LT/RN: B += (K - KK) * 2 elements. */
  2037. #if defined(LT) || defined(RN)
  2038. movq K, %rax
  2039. subq KK, %rax
  2040. leaq (,%rax, SIZE), %rax
  2041. leaq (B, %rax, 2), B
  2042. #endif
  2043. #ifdef RN
  2044. addq $2, KK
  2045. #endif
  2046. #ifdef RT
  2047. subq $2, KK
  2048. #endif
  2049. ALIGN_4
  /* 4-column panels: J = N >> 2; jump to .L999 when there is no full group of four. */
  2050. .L80:
  2051. movq N, J
  2052. sarq $2, J # j = (n >> 2)
  2053. jle .L999
  2054. .L01:
  2055. /* Copying to Sub Buffer */
  /* LN: KK = OFFSET + M. */
  2056. #ifdef LN
  2057. movq OFFSET, %rax
  2058. addq M, %rax
  2059. movq %rax, KK
  2060. #endif
  2061. leaq BUFFER, BO
  /* RT: move B back by K << (2 + BASE_SHIFT). */
  2062. #ifdef RT
  2063. movq K, %rax
  2064. salq $2 + BASE_SHIFT, %rax
  2065. subq %rax, B
  2066. #endif
  /* LN/RT: save B in BORIG, then B += KK * 4 elements and BO += KK * 8 elements. */
  2067. #if defined(LN) || defined(RT)
  2068. movq KK, %rax
  2069. movq B, BORIG
  2070. leaq (, %rax, SIZE), %rax
  2071. leaq (B, %rax, 4), B
  2072. leaq (BO, %rax, 8), BO
  2073. #endif
  /* LT: KK = OFFSET. */
  2074. #ifdef LT
  2075. movq OFFSET, %rax
  2076. movq %rax, KK
  2077. #endif
  /* Copy-loop trip count: rax = 2 * (count >> 2), count being KK (LT/RN) or
     K - KK (LN/RT). Each .L02 pass moves 8 source elements, so two passes
     cover four groups of 4. */
  2078. #if defined(LT) || defined(RN)
  2079. movq KK, %rax
  2080. #else
  2081. movq K, %rax
  2082. subq KK, %rax
  2083. #endif
  2084. sarq $2, %rax
  2085. jle .L03
  2086. addq %rax, %rax
  2087. ALIGN_4
  /* Copy loop: read 8 consecutive elements from B and write each one twice, in
     adjacent slots, to BO (16 elements out). Both pointers are advanced before
     the stores, which is why the store offsets are negative. */
  2088. .L02:
  2089. PREFETCHNTA 40 * SIZE(B)
  2090. movsd 0 * SIZE(B), %xmm0
  2091. movsd 1 * SIZE(B), %xmm1
  2092. movsd 2 * SIZE(B), %xmm2
  2093. movsd 3 * SIZE(B), %xmm3
  2094. movsd 4 * SIZE(B), %xmm4
  2095. movsd 5 * SIZE(B), %xmm5
  2096. movsd 6 * SIZE(B), %xmm6
  2097. movsd 7 * SIZE(B), %xmm7
  2098. addq $16 * SIZE, BO
  2099. addq $ 8 * SIZE, B
  2100. movsd %xmm0, -16 * SIZE(BO)
  2101. movsd %xmm0, -15 * SIZE(BO)
  2102. movsd %xmm1, -14 * SIZE(BO)
  2103. movsd %xmm1, -13 * SIZE(BO)
  2104. movsd %xmm2, -12 * SIZE(BO)
  2105. movsd %xmm2, -11 * SIZE(BO)
  2106. movsd %xmm3, -10 * SIZE(BO)
  2107. movsd %xmm3, -9 * SIZE(BO)
  2108. movsd %xmm4, -8 * SIZE(BO)
  2109. movsd %xmm4, -7 * SIZE(BO)
  2110. movsd %xmm5, -6 * SIZE(BO)
  2111. movsd %xmm5, -5 * SIZE(BO)
  2112. movsd %xmm6, -4 * SIZE(BO)
  2113. movsd %xmm6, -3 * SIZE(BO)
  2114. movsd %xmm7, -2 * SIZE(BO)
  2115. movsd %xmm7, -1 * SIZE(BO)
  2116. decq %rax
  2117. jne .L02
  2118. ALIGN_4
  /* Copy remainder: (count & 3) groups of 4 elements are still to be duplicated into BO. */
  2119. .L03:
  2120. #if defined(LT) || defined(RN)
  2121. movq KK, %rax
  2122. #else
  2123. movq K, %rax
  2124. subq KK, %rax
  2125. #endif
  2126. andq $3, %rax
  2127. BRANCH
  2128. jle .L10
  2129. ALIGN_4
  /* Per pass: read B[0..3], store each value twice into BO[0..7], then B += 4, BO += 8. */
  2130. .L04:
  2131. movsd 0 * SIZE(B), %xmm0
  2132. movsd 1 * SIZE(B), %xmm1
  2133. movsd 2 * SIZE(B), %xmm2
  2134. movsd 3 * SIZE(B), %xmm3
  2135. movsd %xmm0, 0 * SIZE(BO)
  2136. movsd %xmm0, 1 * SIZE(BO)
  2137. movsd %xmm1, 2 * SIZE(BO)
  2138. movsd %xmm1, 3 * SIZE(BO)
  2139. movsd %xmm2, 4 * SIZE(BO)
  2140. movsd %xmm2, 5 * SIZE(BO)
  2141. movsd %xmm3, 6 * SIZE(BO)
  2142. movsd %xmm3, 7 * SIZE(BO)
  2143. addq $4 * SIZE, B
  2144. addq $8 * SIZE, BO
  2145. decq %rax
  2146. jne .L04
  2147. ALIGN_4
  /* Per-panel pointer setup: A goes to AO (LT/RN) or AORIG (LN/RT), CO1/CO2 are
     derived from C, and I = M >> 2 counts the 4-row tiles. */
  2148. .L10:
  2149. #if defined(LT) || defined(RN)
  2150. movq A, AO
  2151. #else
  2152. movq A, AORIG
  2153. #endif
  /* RT moves C back by 4 * LDC before use; the other variants advance C by
     4 * LDC after CO1/CO2 have been taken. */
  2154. #ifdef RT
  2155. leaq (, LDC, 4), %rax
  2156. subq %rax, C
  2157. #endif
  2158. movq C, CO1 # coffset1 = c
  2159. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  2160. #ifndef RT
  2161. leaq (C, LDC, 4), C
  2162. #endif
  2163. movq M, I
  2164. sarq $2, I # i = (m >> 2)
  2165. jle .L20
  2166. ALIGN_4
  /* 4x4 tile setup: position AO/BO, preload operands, clear xmm0-xmm7 and
     compute the trip value for the unrolled KERNEL loop. */
  2167. .L11:
  /* LN: AORIG -= K << (2 + BASE_SHIFT). */
  2168. #ifdef LN
  2169. movq K, %rax
  2170. salq $2 + BASE_SHIFT, %rax
  2171. subq %rax, AORIG
  2172. #endif
  /* LN/RT: AO = AORIG + KK * 4 * SIZE. */
  2173. #if defined(LN) || defined(RT)
  2174. movq KK, %rax
  2175. movq AORIG, AO
  2176. leaq (, %rax, SIZE), %rax
  2177. leaq (AO, %rax, 4), AO
  2178. #endif
  2179. leaq BUFFER, BO
  /* LN/RT: BO = BUFFER + 2 * (KK << (2 + BASE_SHIFT)). */
  2180. #if defined(LN) || defined(RT)
  2181. movq KK, %rax
  2182. salq $2 + BASE_SHIFT, %rax
  2183. leaq (BO, %rax, 2), BO
  2184. #endif
  /* Preload BO[0..1], BO[2..3], BO[4..5], BO[8..9] and AO[0..7]; the pxor
     instructions clear the eight accumulators; PREFETCHW touches the four C
     rows addressed via CO1, CO2 and their +2*LDC counterparts. */
  2185. movapd 0 * SIZE(BO), %xmm9
  2186. movapd 2 * SIZE(BO), %xmm11
  2187. movapd 4 * SIZE(BO), %xmm13
  2188. movapd 8 * SIZE(BO), %xmm15
  2189. movapd 0 * SIZE(AO), %xmm8
  2190. pxor %xmm0, %xmm0
  2191. movapd 2 * SIZE(AO), %xmm10
  2192. pxor %xmm1, %xmm1
  2193. movapd 4 * SIZE(AO), %xmm12
  2194. pxor %xmm2, %xmm2
  2195. movapd 6 * SIZE(AO), %xmm14
  2196. pxor %xmm3, %xmm3
  2197. PREFETCHW 4 * SIZE(CO1)
  2198. pxor %xmm4, %xmm4
  2199. PREFETCHW 4 * SIZE(CO2)
  2200. pxor %xmm5, %xmm5
  2201. PREFETCHW 4 * SIZE(CO1, LDC, 2)
  2202. pxor %xmm6, %xmm6
  2203. PREFETCHW 4 * SIZE(CO2, LDC, 2)
  2204. pxor %xmm7, %xmm7
  /* rax = (count & -8) << 4, count being KK (LT/RN) or K - KK (LN/RT);
     zero means fewer than 8 steps, so go straight to the remainder at .L15. */
  2205. #if defined(LT) || defined(RN)
  2206. movq KK, %rax
  2207. #else
  2208. movq K, %rax
  2209. subq KK, %rax
  2210. #endif
  2211. andq $-8, %rax
  2212. salq $4, %rax
  2213. je .L15
  /* Unrolled main loop built from the KERNEL1..KERNEL8 macros (defined outside
     this chunk; their argument is presumably an address offset - TODO confirm).
     rax arrives as (count & -8) << 4. After every two macro groups rax is
     compared against the amount consumed so far (64*2, 64*4, 64*6) and the loop
     exits early to .L12 when nothing more remains. A full pass runs 8 groups,
     advances AO by 16*8 and BO by 32*8 elements, and subtracts 64*8 from rax. */
  2214. .L1X:
  2215. KERNEL1(16 * 0)
  2216. KERNEL2(16 * 0)
  2217. KERNEL3(16 * 0)
  2218. KERNEL4(16 * 0)
  2219. KERNEL5(16 * 0)
  2220. KERNEL6(16 * 0)
  2221. KERNEL7(16 * 0)
  2222. KERNEL8(16 * 0)
  2223. KERNEL1(16 * 1)
  2224. KERNEL2(16 * 1)
  2225. KERNEL3(16 * 1)
  2226. KERNEL4(16 * 1)
  2227. KERNEL5(16 * 1)
  2228. KERNEL6(16 * 1)
  2229. KERNEL7(16 * 1)
  2230. KERNEL8(16 * 1)
  2231. cmpq $64 * 2, %rax
  2232. jle .L12
  2233. KERNEL1(16 * 2)
  2234. KERNEL2(16 * 2)
  2235. KERNEL3(16 * 2)
  2236. KERNEL4(16 * 2)
  2237. KERNEL5(16 * 2)
  2238. KERNEL6(16 * 2)
  2239. KERNEL7(16 * 2)
  2240. KERNEL8(16 * 2)
  2241. KERNEL1(16 * 3)
  2242. KERNEL2(16 * 3)
  2243. KERNEL3(16 * 3)
  2244. KERNEL4(16 * 3)
  2245. KERNEL5(16 * 3)
  2246. KERNEL6(16 * 3)
  2247. KERNEL7(16 * 3)
  2248. KERNEL8(16 * 3)
  2249. cmpq $64 * 4, %rax
  2250. jle .L12
  2251. KERNEL1(16 * 4)
  2252. KERNEL2(16 * 4)
  2253. KERNEL3(16 * 4)
  2254. KERNEL4(16 * 4)
  2255. KERNEL5(16 * 4)
  2256. KERNEL6(16 * 4)
  2257. KERNEL7(16 * 4)
  2258. KERNEL8(16 * 4)
  2259. KERNEL1(16 * 5)
  2260. KERNEL2(16 * 5)
  2261. KERNEL3(16 * 5)
  2262. KERNEL4(16 * 5)
  2263. KERNEL5(16 * 5)
  2264. KERNEL6(16 * 5)
  2265. KERNEL7(16 * 5)
  2266. KERNEL8(16 * 5)
  2267. cmpq $64 * 6, %rax
  2268. jle .L12
  2269. KERNEL1(16 * 6)
  2270. KERNEL2(16 * 6)
  2271. KERNEL3(16 * 6)
  2272. KERNEL4(16 * 6)
  2273. KERNEL5(16 * 6)
  2274. KERNEL6(16 * 6)
  2275. KERNEL7(16 * 6)
  2276. KERNEL8(16 * 6)
  2277. KERNEL1(16 * 7)
  2278. KERNEL2(16 * 7)
  2279. KERNEL3(16 * 7)
  2280. KERNEL4(16 * 7)
  2281. KERNEL5(16 * 7)
  2282. KERNEL6(16 * 7)
  2283. KERNEL7(16 * 7)
  2284. KERNEL8(16 * 7)
  2285. addq $16 * 8 * SIZE, AO
  2286. addq $32 * 8 * SIZE, BO
  2287. subq $64 * 8, %rax
  2288. jg .L1X
  /* Account for whatever rax still holds (non-zero only on an early exit):
     AO += 2 * rax and BO += 4 * rax, both in bytes. */
  2289. .L12:
  2290. leaq (AO, %rax, 2), AO # * 16
  2291. leaq (BO, %rax, 4), BO # * 64
  2292. ALIGN_4
  /* K remainder for the 4x4 tile: run the (count & 7) steps left after the KERNEL loop. */
  2293. .L15:
  2294. #if defined(LT) || defined(RN)
  2295. movq KK, %rax
  2296. #else
  2297. movq K, %rax
  2298. subq KK, %rax
  2299. #endif
  2300. andq $7, %rax # rax = count & 7, count being KK (LT/RN) or K - KK (LN/RT)
  2301. BRANCH
  2302. je .L19
  2303. ALIGN_4
  /* One step per pass. With xmm8 = AO[0..1] and xmm10 = AO[2..3]:
       xmm0..xmm3 += xmm8  * BO[0..1], BO[2..3], BO[4..5], BO[6..7]
       xmm4..xmm7 += xmm10 * BO[0..1], BO[2..3], BO[4..5], BO[6..7]
     Loads from AO[4..5], AO[6..7] and BO[8..9] fetch operands for the next pass. */
  2304. .L16:
  2305. mulpd %xmm8, %xmm9
  2306. addpd %xmm9, %xmm0
  2307. movapd 2 * SIZE(BO), %xmm9
  2308. mulpd %xmm8, %xmm9
  2309. addpd %xmm9, %xmm1
  2310. movapd 4 * SIZE(BO), %xmm9
  2311. mulpd %xmm8, %xmm9
  2312. mulpd 6 * SIZE(BO), %xmm8
  2313. addpd %xmm9, %xmm2
  2314. movapd 0 * SIZE(BO), %xmm9
  2315. addpd %xmm8, %xmm3
  2316. movapd 4 * SIZE(AO), %xmm8
  2317. mulpd %xmm10, %xmm9
  2318. addpd %xmm9, %xmm4
  2319. movapd 2 * SIZE(BO), %xmm9
  2320. mulpd %xmm10, %xmm9
  2321. addpd %xmm9, %xmm5
  2322. movapd 4 * SIZE(BO), %xmm9
  2323. mulpd %xmm10, %xmm9
  2324. mulpd 6 * SIZE(BO), %xmm10
  2325. addpd %xmm9, %xmm6
  2326. movapd 8 * SIZE(BO), %xmm9
  2327. addpd %xmm10, %xmm7
  2328. movapd 6 * SIZE(AO), %xmm10
  2329. addq $4 * SIZE, AO # aoffset += 4
  2330. addq $8 * SIZE, BO # boffset1 += 8
  2331. decq %rax
  2332. jg .L16
  2333. ALIGN_4
  /* 4x4 tile epilogue, part 1: reposition the pointers (LN/RT only) and form
     right-hand side minus accumulated product. */
  2334. .L19:
  /* LN/RT: rax = KK - 4 (both preprocessor branches subtract 4); then
     AO = AORIG + rax*4*SIZE, B = BORIG + rax*4*SIZE, BO = BUFFER + rax*8*SIZE. */
  2335. #if defined(LN) || defined(RT)
  2336. movq KK, %rax
  2337. #ifdef LN
  2338. subq $4, %rax
  2339. #else
  2340. subq $4, %rax
  2341. #endif
  2342. movq AORIG, AO
  2343. movq BORIG, B
  2344. leaq BUFFER, BO
  2345. leaq (, %rax, SIZE), %rax
  2346. leaq (AO, %rax, 4), AO
  2347. leaq (B, %rax, 4), B
  2348. leaq (BO, %rax, 8), BO
  2349. #endif
  /* LN/LT: interleave each accumulator pair (low lanes stay in the even register,
     high lanes go to xmm8/xmm10/xmm12/xmm14), then subtract from B[0..15]:
       xmm1  = B[0..1]   - xmm0     xmm3  = B[2..3]   - xmm2
       xmm5  = B[4..5]   - xmm8     xmm7  = B[6..7]   - xmm10
       xmm9  = B[8..9]   - xmm4     xmm11 = B[10..11] - xmm6
       xmm13 = B[12..13] - xmm12    xmm15 = B[14..15] - xmm14 */
  2350. #if defined(LN) || defined(LT)
  2351. movapd %xmm0, %xmm8
  2352. unpcklpd %xmm1, %xmm0
  2353. unpckhpd %xmm1, %xmm8
  2354. movapd %xmm2, %xmm10
  2355. unpcklpd %xmm3, %xmm2
  2356. unpckhpd %xmm3, %xmm10
  2357. movapd %xmm4, %xmm12
  2358. unpcklpd %xmm5, %xmm4
  2359. unpckhpd %xmm5, %xmm12
  2360. movapd %xmm6, %xmm14
  2361. unpcklpd %xmm7, %xmm6
  2362. unpckhpd %xmm7, %xmm14
  2363. movapd 0 * SIZE(B), %xmm1
  2364. movapd 2 * SIZE(B), %xmm3
  2365. movapd 4 * SIZE(B), %xmm5
  2366. movapd 6 * SIZE(B), %xmm7
  2367. movapd 8 * SIZE(B), %xmm9
  2368. movapd 10 * SIZE(B), %xmm11
  2369. movapd 12 * SIZE(B), %xmm13
  2370. movapd 14 * SIZE(B), %xmm15
  2371. subpd %xmm0, %xmm1
  2372. subpd %xmm2, %xmm3
  2373. subpd %xmm8, %xmm5
  2374. subpd %xmm10, %xmm7
  2375. subpd %xmm4, %xmm9
  2376. subpd %xmm6, %xmm11
  2377. subpd %xmm12, %xmm13
  2378. subpd %xmm14, %xmm15
  /* RN/RT: xmm8..xmm15 = AO[0..15] minus accumulators taken in the order
     xmm0, xmm4, xmm1, xmm5, xmm2, xmm6, xmm3, xmm7. */
  2379. #else
  2380. movapd 0 * SIZE(AO), %xmm8
  2381. movapd 2 * SIZE(AO), %xmm9
  2382. movapd 4 * SIZE(AO), %xmm10
  2383. movapd 6 * SIZE(AO), %xmm11
  2384. movapd 8 * SIZE(AO), %xmm12
  2385. movapd 10 * SIZE(AO), %xmm13
  2386. movapd 12 * SIZE(AO), %xmm14
  2387. movapd 14 * SIZE(AO), %xmm15
  2388. subpd %xmm0, %xmm8
  2389. subpd %xmm4, %xmm9
  2390. subpd %xmm1, %xmm10
  2391. subpd %xmm5, %xmm11
  2392. subpd %xmm2, %xmm12
  2393. subpd %xmm6, %xmm13
  2394. subpd %xmm3, %xmm14
  2395. subpd %xmm7, %xmm15
  2396. #endif
  /* LN substitution on the 4x4 tile, using entries of the 4x4 block at AO.
     Register pairs (xmm1,xmm3), (xmm5,xmm7), (xmm9,xmm11), (xmm13,xmm15) are
     processed from the last pair to the first. Every movlpd/movhpd pair loads
     one scalar into both lanes.
     NOTE(review): entries 15, 10, 5 and 0 are applied by multiplication -
     presumably stored pre-inverted by the packing code; confirm. */
  2397. #ifdef LN
  /* Scale (xmm13,xmm15) by AO[15], then eliminate with AO[14], AO[13], AO[12]. */
  2398. movlpd 15 * SIZE(AO), %xmm0
  2399. movhpd 15 * SIZE(AO), %xmm0
  2400. mulpd %xmm0, %xmm13
  2401. mulpd %xmm0, %xmm15
  2402. movlpd 14 * SIZE(AO), %xmm2
  2403. movhpd 14 * SIZE(AO), %xmm2
  2404. mulpd %xmm13, %xmm2
  2405. subpd %xmm2, %xmm9
  2406. movlpd 14 * SIZE(AO), %xmm2
  2407. movhpd 14 * SIZE(AO), %xmm2
  2408. mulpd %xmm15, %xmm2
  2409. subpd %xmm2, %xmm11
  2410. movlpd 13 * SIZE(AO), %xmm4
  2411. movhpd 13 * SIZE(AO), %xmm4
  2412. mulpd %xmm13, %xmm4
  2413. subpd %xmm4, %xmm5
  2414. movlpd 13 * SIZE(AO), %xmm4
  2415. movhpd 13 * SIZE(AO), %xmm4
  2416. mulpd %xmm15, %xmm4
  2417. subpd %xmm4, %xmm7
  2418. movlpd 12 * SIZE(AO), %xmm6
  2419. movhpd 12 * SIZE(AO), %xmm6
  2420. mulpd %xmm13, %xmm6
  2421. subpd %xmm6, %xmm1
  2422. movlpd 12 * SIZE(AO), %xmm6
  2423. movhpd 12 * SIZE(AO), %xmm6
  2424. mulpd %xmm15, %xmm6
  2425. subpd %xmm6, %xmm3
  /* Scale (xmm9,xmm11) by AO[10], then eliminate with AO[9], AO[8]. */
  2426. movlpd 10 * SIZE(AO), %xmm0
  2427. movhpd 10 * SIZE(AO), %xmm0
  2428. mulpd %xmm0, %xmm9
  2429. mulpd %xmm0, %xmm11
  2430. movlpd 9 * SIZE(AO), %xmm2
  2431. movhpd 9 * SIZE(AO), %xmm2
  2432. mulpd %xmm9, %xmm2
  2433. subpd %xmm2, %xmm5
  2434. movlpd 9 * SIZE(AO), %xmm2
  2435. movhpd 9 * SIZE(AO), %xmm2
  2436. mulpd %xmm11, %xmm2
  2437. subpd %xmm2, %xmm7
  2438. movlpd 8 * SIZE(AO), %xmm4
  2439. movhpd 8 * SIZE(AO), %xmm4
  2440. mulpd %xmm9, %xmm4
  2441. subpd %xmm4, %xmm1
  2442. movlpd 8 * SIZE(AO), %xmm4
  2443. movhpd 8 * SIZE(AO), %xmm4
  2444. mulpd %xmm11, %xmm4
  2445. subpd %xmm4, %xmm3
  /* Scale (xmm5,xmm7) by AO[5], then eliminate with AO[4]. */
  2446. movlpd 5 * SIZE(AO), %xmm0
  2447. movhpd 5 * SIZE(AO), %xmm0
  2448. mulpd %xmm0, %xmm5
  2449. mulpd %xmm0, %xmm7
  2450. movlpd 4 * SIZE(AO), %xmm2
  2451. movhpd 4 * SIZE(AO), %xmm2
  2452. mulpd %xmm5, %xmm2
  2453. subpd %xmm2, %xmm1
  2454. movlpd 4 * SIZE(AO), %xmm2
  2455. movhpd 4 * SIZE(AO), %xmm2
  2456. mulpd %xmm7, %xmm2
  2457. subpd %xmm2, %xmm3
  /* Scale (xmm1,xmm3) by AO[0]. */
  2458. movlpd 0 * SIZE(AO), %xmm0
  2459. movhpd 0 * SIZE(AO), %xmm0
  2460. mulpd %xmm0, %xmm1
  2461. mulpd %xmm0, %xmm3
  2462. #endif
  /* LT substitution on the 4x4 tile, using entries of the 4x4 block at AO.
     Register pairs (xmm1,xmm3), (xmm5,xmm7), (xmm9,xmm11), (xmm13,xmm15) are
     processed from the first pair to the last. Every movlpd/movhpd pair loads
     one scalar into both lanes.
     NOTE(review): entries 0, 5, 10 and 15 are applied by multiplication -
     presumably stored pre-inverted by the packing code; confirm. */
  2463. #ifdef LT
  /* Scale (xmm1,xmm3) by AO[0], then eliminate with AO[1], AO[2], AO[3]. */
  2464. movlpd 0 * SIZE(AO), %xmm0
  2465. movhpd 0 * SIZE(AO), %xmm0
  2466. mulpd %xmm0, %xmm1
  2467. mulpd %xmm0, %xmm3
  2468. movlpd 1 * SIZE(AO), %xmm2
  2469. movhpd 1 * SIZE(AO), %xmm2
  2470. mulpd %xmm1, %xmm2
  2471. subpd %xmm2, %xmm5
  2472. movlpd 1 * SIZE(AO), %xmm2
  2473. movhpd 1 * SIZE(AO), %xmm2
  2474. mulpd %xmm3, %xmm2
  2475. subpd %xmm2, %xmm7
  2476. movlpd 2 * SIZE(AO), %xmm4
  2477. movhpd 2 * SIZE(AO), %xmm4
  2478. mulpd %xmm1, %xmm4
  2479. subpd %xmm4, %xmm9
  2480. movlpd 2 * SIZE(AO), %xmm4
  2481. movhpd 2 * SIZE(AO), %xmm4
  2482. mulpd %xmm3, %xmm4
  2483. subpd %xmm4, %xmm11
  2484. movlpd 3 * SIZE(AO), %xmm6
  2485. movhpd 3 * SIZE(AO), %xmm6
  2486. mulpd %xmm1, %xmm6
  2487. subpd %xmm6, %xmm13
  2488. movlpd 3 * SIZE(AO), %xmm6
  2489. movhpd 3 * SIZE(AO), %xmm6
  2490. mulpd %xmm3, %xmm6
  2491. subpd %xmm6, %xmm15
  /* Scale (xmm5,xmm7) by AO[5], then eliminate with AO[6], AO[7]. */
  2492. movlpd 5 * SIZE(AO), %xmm0
  2493. movhpd 5 * SIZE(AO), %xmm0
  2494. mulpd %xmm0, %xmm5
  2495. mulpd %xmm0, %xmm7
  2496. movlpd 6 * SIZE(AO), %xmm2
  2497. movhpd 6 * SIZE(AO), %xmm2
  2498. mulpd %xmm5, %xmm2
  2499. subpd %xmm2, %xmm9
  2500. movlpd 6 * SIZE(AO), %xmm2
  2501. movhpd 6 * SIZE(AO), %xmm2
  2502. mulpd %xmm7, %xmm2
  2503. subpd %xmm2, %xmm11
  2504. movlpd 7 * SIZE(AO), %xmm4
  2505. movhpd 7 * SIZE(AO), %xmm4
  2506. mulpd %xmm5, %xmm4
  2507. subpd %xmm4, %xmm13
  2508. movlpd 7 * SIZE(AO), %xmm4
  2509. movhpd 7 * SIZE(AO), %xmm4
  2510. mulpd %xmm7, %xmm4
  2511. subpd %xmm4, %xmm15
  /* Scale (xmm9,xmm11) by AO[10], then eliminate with AO[11]. */
  2512. movlpd 10 * SIZE(AO), %xmm0
  2513. movhpd 10 * SIZE(AO), %xmm0
  2514. mulpd %xmm0, %xmm9
  2515. mulpd %xmm0, %xmm11
  2516. movlpd 11 * SIZE(AO), %xmm2
  2517. movhpd 11 * SIZE(AO), %xmm2
  2518. mulpd %xmm9, %xmm2
  2519. subpd %xmm2, %xmm13
  2520. movlpd 11 * SIZE(AO), %xmm2
  2521. movhpd 11 * SIZE(AO), %xmm2
  2522. mulpd %xmm11, %xmm2
  2523. subpd %xmm2, %xmm15
  /* Scale (xmm13,xmm15) by AO[15]. */
  2524. movlpd 15 * SIZE(AO), %xmm0
  2525. movhpd 15 * SIZE(AO), %xmm0
  2526. mulpd %xmm0, %xmm13
  2527. mulpd %xmm0, %xmm15
  2528. #endif
  /* RN substitution on the 4x4 tile, using entries of the 4x4 block at B.
     Register pairs (xmm8,xmm9), (xmm10,xmm11), (xmm12,xmm13), (xmm14,xmm15) are
     processed from the first pair to the last. Every movlpd/movhpd pair loads
     one scalar into both lanes.
     NOTE(review): entries 0, 5, 10 and 15 are applied by multiplication -
     presumably stored pre-inverted by the packing code; confirm. */
  2529. #ifdef RN
  /* Scale (xmm8,xmm9) by B[0], then eliminate with B[1], B[2], B[3]. */
  2530. movlpd 0 * SIZE(B), %xmm0
  2531. movhpd 0 * SIZE(B), %xmm0
  2532. mulpd %xmm0, %xmm8
  2533. mulpd %xmm0, %xmm9
  2534. movlpd 1 * SIZE(B), %xmm1
  2535. movhpd 1 * SIZE(B), %xmm1
  2536. mulpd %xmm8, %xmm1
  2537. subpd %xmm1, %xmm10
  2538. movlpd 1 * SIZE(B), %xmm1
  2539. movhpd 1 * SIZE(B), %xmm1
  2540. mulpd %xmm9, %xmm1
  2541. subpd %xmm1, %xmm11
  2542. movlpd 2 * SIZE(B), %xmm2
  2543. movhpd 2 * SIZE(B), %xmm2
  2544. mulpd %xmm8, %xmm2
  2545. subpd %xmm2, %xmm12
  2546. movlpd 2 * SIZE(B), %xmm2
  2547. movhpd 2 * SIZE(B), %xmm2
  2548. mulpd %xmm9, %xmm2
  2549. subpd %xmm2, %xmm13
  2550. movlpd 3 * SIZE(B), %xmm3
  2551. movhpd 3 * SIZE(B), %xmm3
  2552. mulpd %xmm8, %xmm3
  2553. subpd %xmm3, %xmm14
  2554. movlpd 3 * SIZE(B), %xmm3
  2555. movhpd 3 * SIZE(B), %xmm3
  2556. mulpd %xmm9, %xmm3
  2557. subpd %xmm3, %xmm15
  /* Scale (xmm10,xmm11) by B[5], then eliminate with B[6], B[7]. */
  2558. movlpd 5 * SIZE(B), %xmm0
  2559. movhpd 5 * SIZE(B), %xmm0
  2560. mulpd %xmm0, %xmm10
  2561. mulpd %xmm0, %xmm11
  2562. movlpd 6 * SIZE(B), %xmm1
  2563. movhpd 6 * SIZE(B), %xmm1
  2564. mulpd %xmm10, %xmm1
  2565. subpd %xmm1, %xmm12
  2566. movlpd 6 * SIZE(B), %xmm1
  2567. movhpd 6 * SIZE(B), %xmm1
  2568. mulpd %xmm11, %xmm1
  2569. subpd %xmm1, %xmm13
  2570. movlpd 7 * SIZE(B), %xmm2
  2571. movhpd 7 * SIZE(B), %xmm2
  2572. mulpd %xmm10, %xmm2
  2573. subpd %xmm2, %xmm14
  2574. movlpd 7 * SIZE(B), %xmm2
  2575. movhpd 7 * SIZE(B), %xmm2
  2576. mulpd %xmm11, %xmm2
  2577. subpd %xmm2, %xmm15
  /* Scale (xmm12,xmm13) by B[10], then eliminate with B[11]. */
  2578. movlpd 10 * SIZE(B), %xmm0
  2579. movhpd 10 * SIZE(B), %xmm0
  2580. mulpd %xmm0, %xmm12
  2581. mulpd %xmm0, %xmm13
  2582. movlpd 11 * SIZE(B), %xmm1
  2583. movhpd 11 * SIZE(B), %xmm1
  2584. mulpd %xmm12, %xmm1
  2585. subpd %xmm1, %xmm14
  2586. movlpd 11 * SIZE(B), %xmm1
  2587. movhpd 11 * SIZE(B), %xmm1
  2588. mulpd %xmm13, %xmm1
  2589. subpd %xmm1, %xmm15
  /* Scale (xmm14,xmm15) by B[15]. */
  2590. movlpd 15 * SIZE(B), %xmm0
  2591. movhpd 15 * SIZE(B), %xmm0
  2592. mulpd %xmm0, %xmm14
  2593. mulpd %xmm0, %xmm15
  2594. #endif
  /* RT substitution on the 4x4 tile, using entries of the 4x4 block at B.
     Register pairs (xmm8,xmm9), (xmm10,xmm11), (xmm12,xmm13), (xmm14,xmm15) are
     processed from the last pair to the first. Every movlpd/movhpd pair loads
     one scalar into both lanes.
     NOTE(review): entries 15, 10, 5 and 0 are applied by multiplication -
     presumably stored pre-inverted by the packing code; confirm. */
  2595. #ifdef RT
  /* Scale (xmm14,xmm15) by B[15], then eliminate with B[14], B[13], B[12]. */
  2596. movlpd 15 * SIZE(B), %xmm0
  2597. movhpd 15 * SIZE(B), %xmm0
  2598. mulpd %xmm0, %xmm14
  2599. mulpd %xmm0, %xmm15
  2600. movlpd 14 * SIZE(B), %xmm1
  2601. movhpd 14 * SIZE(B), %xmm1
  2602. mulpd %xmm14, %xmm1
  2603. subpd %xmm1, %xmm12
  2604. movlpd 14 * SIZE(B), %xmm1
  2605. movhpd 14 * SIZE(B), %xmm1
  2606. mulpd %xmm15, %xmm1
  2607. subpd %xmm1, %xmm13
  2608. movlpd 13 * SIZE(B), %xmm2
  2609. movhpd 13 * SIZE(B), %xmm2
  2610. mulpd %xmm14, %xmm2
  2611. subpd %xmm2, %xmm10
  2612. movlpd 13 * SIZE(B), %xmm2
  2613. movhpd 13 * SIZE(B), %xmm2
  2614. mulpd %xmm15, %xmm2
  2615. subpd %xmm2, %xmm11
  2616. movlpd 12 * SIZE(B), %xmm3
  2617. movhpd 12 * SIZE(B), %xmm3
  2618. mulpd %xmm14, %xmm3
  2619. subpd %xmm3, %xmm8
  2620. movlpd 12 * SIZE(B), %xmm3
  2621. movhpd 12 * SIZE(B), %xmm3
  2622. mulpd %xmm15, %xmm3
  2623. subpd %xmm3, %xmm9
  /* Scale (xmm12,xmm13) by B[10], then eliminate with B[9], B[8]. */
  2624. movlpd 10 * SIZE(B), %xmm0
  2625. movhpd 10 * SIZE(B), %xmm0
  2626. mulpd %xmm0, %xmm12
  2627. mulpd %xmm0, %xmm13
  2628. movlpd 9 * SIZE(B), %xmm1
  2629. movhpd 9 * SIZE(B), %xmm1
  2630. mulpd %xmm12, %xmm1
  2631. subpd %xmm1, %xmm10
  2632. movlpd 9 * SIZE(B), %xmm1
  2633. movhpd 9 * SIZE(B), %xmm1
  2634. mulpd %xmm13, %xmm1
  2635. subpd %xmm1, %xmm11
  2636. movlpd 8 * SIZE(B), %xmm2
  2637. movhpd 8 * SIZE(B), %xmm2
  2638. mulpd %xmm12, %xmm2
  2639. subpd %xmm2, %xmm8
  2640. movlpd 8 * SIZE(B), %xmm2
  2641. movhpd 8 * SIZE(B), %xmm2
  2642. mulpd %xmm13, %xmm2
  2643. subpd %xmm2, %xmm9
  /* Scale (xmm10,xmm11) by B[5], then eliminate with B[4]. */
  2644. movlpd 5 * SIZE(B), %xmm0
  2645. movhpd 5 * SIZE(B), %xmm0
  2646. mulpd %xmm0, %xmm10
  2647. mulpd %xmm0, %xmm11
  2648. movlpd 4 * SIZE(B), %xmm1
  2649. movhpd 4 * SIZE(B), %xmm1
  2650. mulpd %xmm10, %xmm1
  2651. subpd %xmm1, %xmm8
  2652. movlpd 4 * SIZE(B), %xmm1
  2653. movhpd 4 * SIZE(B), %xmm1
  2654. mulpd %xmm11, %xmm1
  2655. subpd %xmm1, %xmm9
  /* Scale (xmm8,xmm9) by B[0]. */
  2656. movlpd 0 * SIZE(B), %xmm0
  2657. movhpd 0 * SIZE(B), %xmm0
  2658. mulpd %xmm0, %xmm8
  2659. mulpd %xmm0, %xmm9
  2660. #endif
  /* 4x4 tile epilogue, final part: store the solved tile to C, write it back
     into the packed buffers, update the pointers and KK, and loop over I. */
  /* LN moves CO1/CO2 back by 4 elements before the store; other variants advance after it. */
  2661. #ifdef LN
  2662. subq $4 * SIZE, CO1
  2663. subq $4 * SIZE, CO2
  2664. #endif
  /* Store to the four C rows at CO1, CO2, CO1 + 2*LDC and CO2 + 2*LDC.
     LN/LT: low lanes of xmm1/5/9/13 -> CO1, high lanes -> CO2; low lanes of
     xmm3/7/11/15 -> CO1 + 2*LDC, high lanes -> CO2 + 2*LDC.
     RN/RT: each row receives one register pair in order, xmm8/9 .. xmm14/15. */
  2665. #if defined(LN) || defined(LT)
  2666. movsd %xmm1, 0 * SIZE(CO1)
  2667. movsd %xmm5, 1 * SIZE(CO1)
  2668. movsd %xmm9, 2 * SIZE(CO1)
  2669. movsd %xmm13, 3 * SIZE(CO1)
  2670. movhpd %xmm1, 0 * SIZE(CO2)
  2671. movhpd %xmm5, 1 * SIZE(CO2)
  2672. movhpd %xmm9, 2 * SIZE(CO2)
  2673. movhpd %xmm13, 3 * SIZE(CO2)
  2674. movsd %xmm3, 0 * SIZE(CO1, LDC, 2)
  2675. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  2676. movsd %xmm11, 2 * SIZE(CO1, LDC, 2)
  2677. movsd %xmm15, 3 * SIZE(CO1, LDC, 2)
  2678. movhpd %xmm3, 0 * SIZE(CO2, LDC, 2)
  2679. movhpd %xmm7, 1 * SIZE(CO2, LDC, 2)
  2680. movhpd %xmm11, 2 * SIZE(CO2, LDC, 2)
  2681. movhpd %xmm15, 3 * SIZE(CO2, LDC, 2)
  2682. #else
  2683. movsd %xmm8, 0 * SIZE(CO1)
  2684. movhpd %xmm8, 1 * SIZE(CO1)
  2685. movsd %xmm9, 2 * SIZE(CO1)
  2686. movhpd %xmm9, 3 * SIZE(CO1)
  2687. movsd %xmm10, 0 * SIZE(CO2)
  2688. movhpd %xmm10, 1 * SIZE(CO2)
  2689. movsd %xmm11, 2 * SIZE(CO2)
  2690. movhpd %xmm11, 3 * SIZE(CO2)
  2691. movsd %xmm12, 0 * SIZE(CO1, LDC, 2)
  2692. movhpd %xmm12, 1 * SIZE(CO1, LDC, 2)
  2693. movsd %xmm13, 2 * SIZE(CO1, LDC, 2)
  2694. movhpd %xmm13, 3 * SIZE(CO1, LDC, 2)
  2695. movsd %xmm14, 0 * SIZE(CO2, LDC, 2)
  2696. movhpd %xmm14, 1 * SIZE(CO2, LDC, 2)
  2697. movsd %xmm15, 2 * SIZE(CO2, LDC, 2)
  2698. movhpd %xmm15, 3 * SIZE(CO2, LDC, 2)
  2699. #endif
  /* Write back. LN/LT: the eight registers go to B[0..15], and to BO[0..31] with
     every scalar stored twice in adjacent slots. RN/RT: xmm8..xmm15 go to AO[0..15]. */
  2700. #if defined(LN) || defined(LT)
  2701. movapd %xmm1, 0 * SIZE(B)
  2702. movapd %xmm3, 2 * SIZE(B)
  2703. movapd %xmm5, 4 * SIZE(B)
  2704. movapd %xmm7, 6 * SIZE(B)
  2705. movapd %xmm9, 8 * SIZE(B)
  2706. movapd %xmm11, 10 * SIZE(B)
  2707. movapd %xmm13, 12 * SIZE(B)
  2708. movapd %xmm15, 14 * SIZE(B)
  2709. movlpd %xmm1, 0 * SIZE(BO)
  2710. movlpd %xmm1, 1 * SIZE(BO)
  2711. movhpd %xmm1, 2 * SIZE(BO)
  2712. movhpd %xmm1, 3 * SIZE(BO)
  2713. movlpd %xmm3, 4 * SIZE(BO)
  2714. movlpd %xmm3, 5 * SIZE(BO)
  2715. movhpd %xmm3, 6 * SIZE(BO)
  2716. movhpd %xmm3, 7 * SIZE(BO)
  2717. movlpd %xmm5, 8 * SIZE(BO)
  2718. movlpd %xmm5, 9 * SIZE(BO)
  2719. movhpd %xmm5, 10 * SIZE(BO)
  2720. movhpd %xmm5, 11 * SIZE(BO)
  2721. movlpd %xmm7, 12 * SIZE(BO)
  2722. movlpd %xmm7, 13 * SIZE(BO)
  2723. movhpd %xmm7, 14 * SIZE(BO)
  2724. movhpd %xmm7, 15 * SIZE(BO)
  2725. movlpd %xmm9, 16 * SIZE(BO)
  2726. movlpd %xmm9, 17 * SIZE(BO)
  2727. movhpd %xmm9, 18 * SIZE(BO)
  2728. movhpd %xmm9, 19 * SIZE(BO)
  2729. movlpd %xmm11, 20 * SIZE(BO)
  2730. movlpd %xmm11, 21 * SIZE(BO)
  2731. movhpd %xmm11, 22 * SIZE(BO)
  2732. movhpd %xmm11, 23 * SIZE(BO)
  2733. movlpd %xmm13, 24 * SIZE(BO)
  2734. movlpd %xmm13, 25 * SIZE(BO)
  2735. movhpd %xmm13, 26 * SIZE(BO)
  2736. movhpd %xmm13, 27 * SIZE(BO)
  2737. movlpd %xmm15, 28 * SIZE(BO)
  2738. movlpd %xmm15, 29 * SIZE(BO)
  2739. movhpd %xmm15, 30 * SIZE(BO)
  2740. movhpd %xmm15, 31 * SIZE(BO)
  2741. #else
  2742. movapd %xmm8, 0 * SIZE(AO)
  2743. movapd %xmm9, 2 * SIZE(AO)
  2744. movapd %xmm10, 4 * SIZE(AO)
  2745. movapd %xmm11, 6 * SIZE(AO)
  2746. movapd %xmm12, 8 * SIZE(AO)
  2747. movapd %xmm13, 10 * SIZE(AO)
  2748. movapd %xmm14, 12 * SIZE(AO)
  2749. movapd %xmm15, 14 * SIZE(AO)
  2750. #endif
  2751. #ifndef LN
  2752. addq $4 * SIZE, CO1
  2753. addq $4 * SIZE, CO2
  2754. #endif
  /* LT/RN: AO += (K - KK) * 4 elements; LT additionally advances B by 16 elements. */
  2755. #if defined(LT) || defined(RN)
  2756. movq K, %rax
  2757. subq KK, %rax
  2758. leaq (,%rax, SIZE), %rax
  2759. leaq (AO, %rax, 4), AO
  2760. #ifdef LT
  2761. addq $16 * SIZE, B
  2762. #endif
  2763. #endif
  /* KK bookkeeping: LN steps KK down by 4 and resets B to BORIG; LT steps KK up by 4. */
  2764. #ifdef LN
  2765. subq $4, KK
  2766. movq BORIG, B
  2767. #endif
  2768. #ifdef LT
  2769. addq $4, KK
  2770. #endif
  /* RT: B = BORIG and AORIG += K << (2 + BASE_SHIFT). */
  2771. #ifdef RT
  2772. movq K, %rax
  2773. movq BORIG, B
  2774. salq $2 + BASE_SHIFT, %rax
  2775. addq %rax, AORIG
  2776. #endif
  2777. decq I # i --
  2778. jg .L11
  2779. ALIGN_4
  /* Leftover rows of the 4-column panel: nothing to do when M & 3 == 0 (go to
     .L39); if bit 1 of M is clear skip the 2-row tile and go to .L30. */
  2780. .L20:
  2781. testq $3, M
  2782. je .L39
  2783. testq $2, M
  2784. je .L30
  2785. ALIGN_4
  /* 2-row tile of the 4-column panel: position AO/BO, preload operands, clear
     xmm0-xmm3 and compute the trip count for the unrolled loop at .L22. */
  2786. .L21:
  /* LN: AORIG -= K << (1 + BASE_SHIFT). */
  2787. #ifdef LN
  2788. movq K, %rax
  2789. salq $1 + BASE_SHIFT, %rax
  2790. subq %rax, AORIG
  2791. #endif
  /* LN/RT: AO = AORIG + KK * 2 * SIZE. */
  2792. #if defined(LN) || defined(RT)
  2793. movq KK, %rax
  2794. movq AORIG, AO
  2795. leaq (, %rax, SIZE), %rax
  2796. leaq (AO, %rax, 2), AO
  2797. #endif
  2798. leaq BUFFER, BO
  /* LN/RT: BO = BUFFER + 2 * (KK << (2 + BASE_SHIFT)). */
  2799. #if defined(LN) || defined(RT)
  2800. movq KK, %rax
  2801. salq $2 + BASE_SHIFT, %rax
  2802. leaq (BO, %rax, 2), BO
  2803. #endif
  /* Preloads interleaved with accumulator clears: xmm8/xmm10 from AO[0..1]/AO[8..9],
     xmm9/xmm11/xmm13/xmm15 from BO[0..1]/BO[8..9]/BO[16..17]/BO[24..25]. */
  2804. movapd 0 * SIZE(AO), %xmm8
  2805. pxor %xmm0, %xmm0
  2806. movapd 0 * SIZE(BO), %xmm9
  2807. pxor %xmm1, %xmm1
  2808. movapd 8 * SIZE(AO), %xmm10
  2809. pxor %xmm2, %xmm2
  2810. movapd 8 * SIZE(BO), %xmm11
  2811. pxor %xmm3, %xmm3
  2812. movapd 16 * SIZE(BO), %xmm13
  2813. movapd 24 * SIZE(BO), %xmm15
  /* rax = count >> 3, count being KK (LT/RN) or K - KK (LN/RT); skip the
     unrolled loop when that is zero. */
  2814. #if defined(LT) || defined(RN)
  2815. movq KK, %rax
  2816. #else
  2817. movq K, %rax
  2818. subq KK, %rax
  2819. #endif
  2820. sarq $3, %rax
  2821. je .L25
  2822. ALIGN_4
  2823. .L22:
  /*
   * Main accumulation loop for the 2-row tile, unrolled over eight k steps
   * per pass; %rax is the number of passes left.
   *
   * Every k step performs (element-wise, 2-wide):
   *     xmm0 += a * BO[8k+0..1]     xmm1 += a * BO[8k+2..3]
   *     xmm2 += a * BO[8k+4..5]     xmm3 += a * BO[8k+6..7]
   * where a is the 2-wide vector AO[2k..2k+1] (offsets in units of SIZE).
   *
   * The loop is software pipelined: on entry xmm8 / xmm10 already hold
   * AO[0..1] / AO[8..9] and xmm9 / xmm11 / xmm13 / xmm15 already hold
   * BO[0..1] / BO[8..9] / BO[16..17] / BO[24..25].  Each step refills the
   * register it has just consumed with the operand needed four steps later,
   * so the loads at offsets 64..88 of BO and 16 / 24 of AO are the operands
   * of the next pass (they are also executed on the final pass).
   */
  /* k step 0: a = xmm8 (AO 0..1), b = BO 0..7; xmm9 <- BO 32 for step 4 */
  2824. mulpd %xmm8, %xmm9
  2825. addpd %xmm9, %xmm0
  2826. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2827. movapd 2 * SIZE(BO), %xmm9
  2828. mulpd %xmm8, %xmm9
  2829. addpd %xmm9, %xmm1
  2830. movapd 4 * SIZE(BO), %xmm9
  2831. mulpd %xmm8, %xmm9
  2832. mulpd 6 * SIZE(BO), %xmm8
  2833. addpd %xmm9, %xmm2
  2834. movapd 32 * SIZE(BO), %xmm9
  2835. addpd %xmm8, %xmm3
  2836. movapd 2 * SIZE(AO), %xmm8
  /* k step 1: a = xmm8 (AO 2..3), b = BO 8..15; xmm11 <- BO 40 for step 5 */
  2837. mulpd %xmm8, %xmm11
  2838. addpd %xmm11, %xmm0
  2839. movapd 10 * SIZE(BO), %xmm11
  2840. mulpd %xmm8, %xmm11
  2841. addpd %xmm11, %xmm1
  2842. movapd 12 * SIZE(BO), %xmm11
  2843. mulpd %xmm8, %xmm11
  2844. mulpd 14 * SIZE(BO), %xmm8
  2845. addpd %xmm11, %xmm2
  2846. movapd 40 * SIZE(BO), %xmm11
  2847. addpd %xmm8, %xmm3
  2848. movapd 4 * SIZE(AO), %xmm8
  /* k step 2: a = xmm8 (AO 4..5), b = BO 16..23; xmm13 <- BO 48 for step 6 */
  2849. mulpd %xmm8, %xmm13
  2850. addpd %xmm13, %xmm0
  2851. movapd 18 * SIZE(BO), %xmm13
  2852. mulpd %xmm8, %xmm13
  2853. addpd %xmm13, %xmm1
  2854. movapd 20 * SIZE(BO), %xmm13
  2855. mulpd %xmm8, %xmm13
  2856. mulpd 22 * SIZE(BO), %xmm8
  2857. addpd %xmm13, %xmm2
  2858. movapd 48 * SIZE(BO), %xmm13
  2859. addpd %xmm8, %xmm3
  2860. movapd 6 * SIZE(AO), %xmm8
  /* k step 3: a = xmm8 (AO 6..7), b = BO 24..31; xmm15 <- BO 56 for step 7,
     xmm8 <- AO 16 (first A vector of the next pass) */
  2861. mulpd %xmm8, %xmm15
  2862. addpd %xmm15, %xmm0
  2863. movapd 26 * SIZE(BO), %xmm15
  2864. mulpd %xmm8, %xmm15
  2865. addpd %xmm15, %xmm1
  2866. movapd 28 * SIZE(BO), %xmm15
  2867. mulpd %xmm8, %xmm15
  2868. mulpd 30 * SIZE(BO), %xmm8
  2869. addpd %xmm15, %xmm2
  2870. movapd 56 * SIZE(BO), %xmm15
  2871. addpd %xmm8, %xmm3
  2872. movapd 16 * SIZE(AO), %xmm8
  2873. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  /* k step 4: a = xmm10 (AO 8..9), b = BO 32..39; xmm9 <- BO 64 (next pass) */
  2874. mulpd %xmm10, %xmm9
  2875. addpd %xmm9, %xmm0
  2876. movapd 34 * SIZE(BO), %xmm9
  2877. mulpd %xmm10, %xmm9
  2878. addpd %xmm9, %xmm1
  2879. movapd 36 * SIZE(BO), %xmm9
  2880. mulpd %xmm10, %xmm9
  2881. mulpd 38 * SIZE(BO), %xmm10
  2882. addpd %xmm9, %xmm2
  2883. movapd 64 * SIZE(BO), %xmm9
  2884. addpd %xmm10, %xmm3
  2885. movapd 10 * SIZE(AO), %xmm10
  /* k step 5: a = xmm10 (AO 10..11), b = BO 40..47; xmm11 <- BO 72 (next pass) */
  2886. mulpd %xmm10, %xmm11
  2887. addpd %xmm11, %xmm0
  2888. movapd 42 * SIZE(BO), %xmm11
  2889. mulpd %xmm10, %xmm11
  2890. addpd %xmm11, %xmm1
  2891. movapd 44 * SIZE(BO), %xmm11
  2892. mulpd %xmm10, %xmm11
  2893. mulpd 46 * SIZE(BO), %xmm10
  2894. addpd %xmm11, %xmm2
  2895. movapd 72 * SIZE(BO), %xmm11
  2896. addpd %xmm10, %xmm3
  2897. movapd 12 * SIZE(AO), %xmm10
  /* k step 6: a = xmm10 (AO 12..13), b = BO 48..55; xmm13 <- BO 80 (next pass) */
  2898. mulpd %xmm10, %xmm13
  2899. addpd %xmm13, %xmm0
  2900. movapd 50 * SIZE(BO), %xmm13
  2901. mulpd %xmm10, %xmm13
  2902. addpd %xmm13, %xmm1
  2903. movapd 52 * SIZE(BO), %xmm13
  2904. mulpd %xmm10, %xmm13
  2905. mulpd 54 * SIZE(BO), %xmm10
  2906. addpd %xmm13, %xmm2
  2907. movapd 80 * SIZE(BO), %xmm13
  2908. addpd %xmm10, %xmm3
  2909. movapd 14 * SIZE(AO), %xmm10
  /* k step 7: a = xmm10 (AO 14..15), b = BO 56..63; xmm15 <- BO 88 and
     xmm10 <- AO 24 (both are operands of the next pass) */
  2910. mulpd %xmm10, %xmm15
  2911. addpd %xmm15, %xmm0
  2912. movapd 58 * SIZE(BO), %xmm15
  2913. mulpd %xmm10, %xmm15
  2914. addpd %xmm15, %xmm1
  2915. movapd 60 * SIZE(BO), %xmm15
  2916. mulpd %xmm10, %xmm15
  2917. mulpd 62 * SIZE(BO), %xmm10
  2918. addpd %xmm15, %xmm2
  2919. movapd 88 * SIZE(BO), %xmm15
  2920. addpd %xmm10, %xmm3
  2921. movapd 24 * SIZE(AO), %xmm10
  /* Consumed 8 k steps: 16 elements of AO and 64 elements of BO. */
  2922. addq $16 * SIZE, AO
  2923. addq $64 * SIZE, BO
  2924. decq %rax
  2925. jne .L22
  2926. ALIGN_4
  2927. .L25:
  /*
   * 2-row tile, leftover k steps.  Recompute the k count (KK for LT/RN,
   * K - KK otherwise) and keep its low three bits: the part not handled by
   * the 8x-unrolled loop.  With nothing left over, go straight to .L29.
   */
  2928. #if defined(LT) || defined(RN)
  2929. movq KK, %rax
  2930. #else
  2931. movq K, %rax
  2932. subq KK, %rax
  2933. #endif
  2934. andq $7, %rax # leftover k steps = count & 7
  2935. BRANCH
  2936. je .L29
  2937. ALIGN_4
  2938. .L26:
  /*
   * 2-row tile, one k step per pass (%rax = steps left).
   * xmm8 holds the current 2-wide A vector and xmm9 holds BO[0..1]:
   *     xmm0 += xmm8 * BO[0..1]     xmm1 += xmm8 * BO[2..3]
   *     xmm2 += xmm8 * BO[4..5]     xmm3 += xmm8 * BO[6..7]
   * xmm9 and xmm8 are then reloaded from BO 8 / AO 2, i.e. the operands of
   * the following step (loaded on the final pass as well).
   */
  2939. mulpd %xmm8, %xmm9
  2940. addpd %xmm9, %xmm0
  2941. movapd 2 * SIZE(BO), %xmm9
  2942. mulpd %xmm8, %xmm9
  2943. addpd %xmm9, %xmm1
  2944. movapd 4 * SIZE(BO), %xmm9
  2945. mulpd %xmm8, %xmm9
  2946. mulpd 6 * SIZE(BO), %xmm8
  2947. addpd %xmm9, %xmm2
  2948. movapd 8 * SIZE(BO), %xmm9
  2949. addpd %xmm8, %xmm3
  2950. movapd 2 * SIZE(AO), %xmm8
  2951. addq $2 * SIZE, AO # AO += 2 elements (one k step of the 2-row tile)
  2952. addq $8 * SIZE, BO # BO += 8 elements (one k step of the expanded B buffer)
  2953. decq %rax
  2954. jg .L26
  2955. ALIGN_4
  2956. .L29:
  /*
   * 2-row tile: apply the accumulated update (xmm0..xmm3), run the small
   * triangular solve selected by LN / LT / RN / RT, store the result to C,
   * write it back into the packed operand, and advance the pointers.
   */
  /* LN/RT: re-derive the pointers from their origins.  With
     off = KK - 2 (LN) or KK - 4 (RT):
       AO = AORIG + off * 2 * SIZE,  B = BORIG + off * 4 * SIZE,
       BO = BUFFER + off * 8 * SIZE. */
  2957. #if defined(LN) || defined(RT)
  2958. movq KK, %rax
  2959. #ifdef LN
  2960. subq $2, %rax
  2961. #else
  2962. subq $4, %rax
  2963. #endif
  2964. movq AORIG, AO
  2965. movq BORIG, B
  2966. leaq BUFFER, BO
  2967. leaq (, %rax, SIZE), %rax
  2968. leaq (AO, %rax, 2), AO
  2969. leaq (B, %rax, 4), B
  2970. leaq (BO, %rax, 8), BO
  2971. #endif
  /* LN/LT: regroup the accumulators so that the low halves and the high
     halves are gathered separately:
       xmm0 = {xmm0.lo, xmm1.lo}   xmm8  = {xmm0.hi, xmm1.hi}
       xmm2 = {xmm2.lo, xmm3.lo}   xmm10 = {xmm2.hi, xmm3.hi}
     then form  xmm1/xmm3/xmm5/xmm7 = B[0..7] - accumulators. */
  2972. #if defined(LN) || defined(LT)
  2973. movapd %xmm0, %xmm8
  2974. unpcklpd %xmm1, %xmm0
  2975. unpckhpd %xmm1, %xmm8
  2976. movapd %xmm2, %xmm10
  2977. unpcklpd %xmm3, %xmm2
  2978. unpckhpd %xmm3, %xmm10
  2979. movapd 0 * SIZE(B), %xmm1
  2980. movapd 2 * SIZE(B), %xmm3
  2981. movapd 4 * SIZE(B), %xmm5
  2982. movapd 6 * SIZE(B), %xmm7
  2983. subpd %xmm0, %xmm1
  2984. subpd %xmm2, %xmm3
  2985. subpd %xmm8, %xmm5
  2986. subpd %xmm10, %xmm7
  2987. #else
  /* RN/RT: xmm8/xmm10/xmm12/xmm14 = AO[0..7] - accumulators (no regrouping). */
  2988. movapd 0 * SIZE(AO), %xmm8
  2989. movapd 2 * SIZE(AO), %xmm10
  2990. movapd 4 * SIZE(AO), %xmm12
  2991. movapd 6 * SIZE(AO), %xmm14
  2992. subpd %xmm0, %xmm8
  2993. subpd %xmm1, %xmm10
  2994. subpd %xmm2, %xmm12
  2995. subpd %xmm3, %xmm14
  2996. #endif
  /* In the solves below every movlpd/movhpd pair reads the same address,
     which fills both halves of the register with one scalar.
     NOTE(review): the entries used as pivots are multiplied in, not divided
     by - presumably the packing step stores them already inverted; confirm
     against the packing routine. */
  /* LN: scale xmm5/xmm7 by AO[3], eliminate with AO[2] from xmm1/xmm3,
     then scale xmm1/xmm3 by AO[0]. */
  2997. #ifdef LN
  2998. movlpd 3 * SIZE(AO), %xmm0
  2999. movhpd 3 * SIZE(AO), %xmm0
  3000. mulpd %xmm0, %xmm5
  3001. mulpd %xmm0, %xmm7
  3002. movlpd 2 * SIZE(AO), %xmm2
  3003. movhpd 2 * SIZE(AO), %xmm2
  3004. mulpd %xmm5, %xmm2
  3005. subpd %xmm2, %xmm1
  3006. movlpd 2 * SIZE(AO), %xmm2
  3007. movhpd 2 * SIZE(AO), %xmm2
  3008. mulpd %xmm7, %xmm2
  3009. subpd %xmm2, %xmm3
  3010. movlpd 0 * SIZE(AO), %xmm0
  3011. movhpd 0 * SIZE(AO), %xmm0
  3012. mulpd %xmm0, %xmm1
  3013. mulpd %xmm0, %xmm3
  3014. #endif
  /* LT: scale xmm1/xmm3 by AO[0], eliminate with AO[1] from xmm5/xmm7,
     then scale xmm5/xmm7 by AO[3]. */
  3015. #ifdef LT
  3016. movlpd 0 * SIZE(AO), %xmm0
  3017. movhpd 0 * SIZE(AO), %xmm0
  3018. mulpd %xmm0, %xmm1
  3019. mulpd %xmm0, %xmm3
  3020. movlpd 1 * SIZE(AO), %xmm2
  3021. movhpd 1 * SIZE(AO), %xmm2
  3022. mulpd %xmm1, %xmm2
  3023. subpd %xmm2, %xmm5
  3024. movlpd 1 * SIZE(AO), %xmm2
  3025. movhpd 1 * SIZE(AO), %xmm2
  3026. mulpd %xmm3, %xmm2
  3027. subpd %xmm2, %xmm7
  3028. movlpd 3 * SIZE(AO), %xmm0
  3029. movhpd 3 * SIZE(AO), %xmm0
  3030. mulpd %xmm0, %xmm5
  3031. mulpd %xmm0, %xmm7
  3032. #endif
  /* RN: substitution in the order xmm8 -> xmm10 -> xmm12 -> xmm14.
     B[0], B[5], B[10], B[15] scale the four results; B[1..3], B[6..7] and
     B[11] are the multipliers used to eliminate earlier results from later
     ones. */
  3033. #ifdef RN
  3034. movlpd 0 * SIZE(B), %xmm0
  3035. movhpd 0 * SIZE(B), %xmm0
  3036. mulpd %xmm0, %xmm8
  3037. movlpd 1 * SIZE(B), %xmm1
  3038. movhpd 1 * SIZE(B), %xmm1
  3039. mulpd %xmm8, %xmm1
  3040. subpd %xmm1, %xmm10
  3041. movlpd 2 * SIZE(B), %xmm2
  3042. movhpd 2 * SIZE(B), %xmm2
  3043. mulpd %xmm8, %xmm2
  3044. subpd %xmm2, %xmm12
  3045. movlpd 3 * SIZE(B), %xmm3
  3046. movhpd 3 * SIZE(B), %xmm3
  3047. mulpd %xmm8, %xmm3
  3048. subpd %xmm3, %xmm14
  3049. movlpd 5 * SIZE(B), %xmm0
  3050. movhpd 5 * SIZE(B), %xmm0
  3051. mulpd %xmm0, %xmm10
  3052. movlpd 6 * SIZE(B), %xmm1
  3053. movhpd 6 * SIZE(B), %xmm1
  3054. mulpd %xmm10, %xmm1
  3055. subpd %xmm1, %xmm12
  3056. movlpd 7 * SIZE(B), %xmm2
  3057. movhpd 7 * SIZE(B), %xmm2
  3058. mulpd %xmm10, %xmm2
  3059. subpd %xmm2, %xmm14
  3060. movlpd 10 * SIZE(B), %xmm0
  3061. movhpd 10 * SIZE(B), %xmm0
  3062. mulpd %xmm0, %xmm12
  3063. movlpd 11 * SIZE(B), %xmm1
  3064. movhpd 11 * SIZE(B), %xmm1
  3065. mulpd %xmm12, %xmm1
  3066. subpd %xmm1, %xmm14
  3067. movlpd 15 * SIZE(B), %xmm0
  3068. movhpd 15 * SIZE(B), %xmm0
  3069. mulpd %xmm0, %xmm14
  3070. #endif
  /* RT: the same substitution run in the opposite order,
     xmm14 -> xmm12 -> xmm10 -> xmm8, scaling by B[15], B[10], B[5], B[0] and
     eliminating with B[14..12], B[9..8] and B[4]. */
  3071. #ifdef RT
  3072. movlpd 15 * SIZE(B), %xmm0
  3073. movhpd 15 * SIZE(B), %xmm0
  3074. mulpd %xmm0, %xmm14
  3075. movlpd 14 * SIZE(B), %xmm1
  3076. movhpd 14 * SIZE(B), %xmm1
  3077. mulpd %xmm14, %xmm1
  3078. subpd %xmm1, %xmm12
  3079. movlpd 13 * SIZE(B), %xmm2
  3080. movhpd 13 * SIZE(B), %xmm2
  3081. mulpd %xmm14, %xmm2
  3082. subpd %xmm2, %xmm10
  3083. movlpd 12 * SIZE(B), %xmm3
  3084. movhpd 12 * SIZE(B), %xmm3
  3085. mulpd %xmm14, %xmm3
  3086. subpd %xmm3, %xmm8
  3087. movlpd 10 * SIZE(B), %xmm0
  3088. movhpd 10 * SIZE(B), %xmm0
  3089. mulpd %xmm0, %xmm12
  3090. movlpd 9 * SIZE(B), %xmm1
  3091. movhpd 9 * SIZE(B), %xmm1
  3092. mulpd %xmm12, %xmm1
  3093. subpd %xmm1, %xmm10
  3094. movlpd 8 * SIZE(B), %xmm2
  3095. movhpd 8 * SIZE(B), %xmm2
  3096. mulpd %xmm12, %xmm2
  3097. subpd %xmm2, %xmm8
  3098. movlpd 5 * SIZE(B), %xmm0
  3099. movhpd 5 * SIZE(B), %xmm0
  3100. mulpd %xmm0, %xmm10
  3101. movlpd 4 * SIZE(B), %xmm1
  3102. movhpd 4 * SIZE(B), %xmm1
  3103. mulpd %xmm10, %xmm1
  3104. subpd %xmm1, %xmm8
  3105. movlpd 0 * SIZE(B), %xmm0
  3106. movhpd 0 * SIZE(B), %xmm0
  3107. mulpd %xmm0, %xmm8
  3108. #endif
  /* LN: step CO1 / CO2 back by two elements before storing. */
  3109. #ifdef LN
  3110. subq $2 * SIZE, CO1
  3111. subq $2 * SIZE, CO2
  3112. #endif
  /* Store the 2x4 result to C at CO1, CO2, CO1 + 2*LDC and CO2 + 2*LDC.
     LN/LT: the low halves of a register go to the CO1-based address and the
     high halves to the CO2-based one.  RN/RT: each register holds the two
     consecutive elements of one of the four destinations. */
  3113. #if defined(LN) || defined(LT)
  3114. movsd %xmm1, 0 * SIZE(CO1)
  3115. movsd %xmm5, 1 * SIZE(CO1)
  3116. movhpd %xmm1, 0 * SIZE(CO2)
  3117. movhpd %xmm5, 1 * SIZE(CO2)
  3118. movsd %xmm3, 0 * SIZE(CO1, LDC, 2)
  3119. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  3120. movhpd %xmm3, 0 * SIZE(CO2, LDC, 2)
  3121. movhpd %xmm7, 1 * SIZE(CO2, LDC, 2)
  3122. #else
  3123. movsd %xmm8, 0 * SIZE(CO1)
  3124. movhpd %xmm8, 1 * SIZE(CO1)
  3125. movsd %xmm10, 0 * SIZE(CO2)
  3126. movhpd %xmm10, 1 * SIZE(CO2)
  3127. movsd %xmm12, 0 * SIZE(CO1, LDC, 2)
  3128. movhpd %xmm12, 1 * SIZE(CO1, LDC, 2)
  3129. movsd %xmm14, 0 * SIZE(CO2, LDC, 2)
  3130. movhpd %xmm14, 1 * SIZE(CO2, LDC, 2)
  3131. #endif
  /* Write the solved values back into the packed operand.
     LN/LT: into B[0..7], and into BO[0..15] with every scalar stored twice
     in adjacent slots (the duplicated layout the accumulation loops read).
     RN/RT: into AO[0..7]. */
  3132. #if defined(LN) || defined(LT)
  3133. movapd %xmm1, 0 * SIZE(B)
  3134. movapd %xmm3, 2 * SIZE(B)
  3135. movapd %xmm5, 4 * SIZE(B)
  3136. movapd %xmm7, 6 * SIZE(B)
  3137. movlpd %xmm1, 0 * SIZE(BO)
  3138. movlpd %xmm1, 1 * SIZE(BO)
  3139. movhpd %xmm1, 2 * SIZE(BO)
  3140. movhpd %xmm1, 3 * SIZE(BO)
  3141. movlpd %xmm3, 4 * SIZE(BO)
  3142. movlpd %xmm3, 5 * SIZE(BO)
  3143. movhpd %xmm3, 6 * SIZE(BO)
  3144. movhpd %xmm3, 7 * SIZE(BO)
  3145. movlpd %xmm5, 8 * SIZE(BO)
  3146. movlpd %xmm5, 9 * SIZE(BO)
  3147. movhpd %xmm5, 10 * SIZE(BO)
  3148. movhpd %xmm5, 11 * SIZE(BO)
  3149. movlpd %xmm7, 12 * SIZE(BO)
  3150. movlpd %xmm7, 13 * SIZE(BO)
  3151. movhpd %xmm7, 14 * SIZE(BO)
  3152. movhpd %xmm7, 15 * SIZE(BO)
  3153. #else
  3154. movapd %xmm8, 0 * SIZE(AO)
  3155. movapd %xmm10, 2 * SIZE(AO)
  3156. movapd %xmm12, 4 * SIZE(AO)
  3157. movapd %xmm14, 6 * SIZE(AO)
  3158. #endif
  /* All variants except LN: advance CO1 / CO2 past the two stored elements. */
  3159. #ifndef LN
  3160. addq $2 * SIZE, CO1
  3161. addq $2 * SIZE, CO2
  3162. #endif
  /* LT/RN: AO += (K - KK) * 2 * SIZE; LT additionally steps B by 8 elements. */
  3163. #if defined(LT) || defined(RN)
  3164. movq K, %rax
  3165. subq KK, %rax
  3166. leaq (,%rax, SIZE), %rax
  3167. leaq (AO, %rax, 2), AO
  3168. #ifdef LT
  3169. addq $8 * SIZE, B
  3170. #endif
  3171. #endif
  /* LN: KK -= 2 and B is reset to BORIG. */
  3172. #ifdef LN
  3173. subq $2, KK
  3174. movq BORIG, B
  3175. #endif
  /* LT: KK += 2. */
  3176. #ifdef LT
  3177. addq $2, KK
  3178. #endif
  /* RT: B is reset to BORIG and AORIG += K << (1 + BASE_SHIFT)
     (presumably K * 2 * SIZE bytes - assumes BASE_SHIFT = log2(SIZE); TODO confirm). */
  3179. #ifdef RT
  3180. movq K, %rax
  3181. movq BORIG, B
  3182. salq $1 + BASE_SHIFT, %rax
  3183. addq %rax, AORIG
  3184. #endif
  3185. ALIGN_4
  3186. .L30:
  /* Handle a single leftover row only when bit 0 of M is set;
     otherwise skip to the end-of-panel bookkeeping at .L39. */
  3187. testq $1, M
  3188. je .L39
  3189. ALIGN_4
  3190. .L31:
  /*
   * Setup for the 1-row tile: position AO / BO, clear the four accumulators
   * xmm0..xmm3, preload the first operands and compute the pass count of the
   * 8x-unrolled loop.
   */
  /* LN: AORIG -= K << BASE_SHIFT (one row of K elements - assumes
     BASE_SHIFT = log2(SIZE); TODO confirm). */
  3191. #ifdef LN
  3192. movq K, %rax
  3193. salq $0 + BASE_SHIFT, %rax
  3194. subq %rax, AORIG
  3195. #endif
  /* LN/RT: AO = AORIG + KK * SIZE. */
  3196. #if defined(LN) || defined(RT)
  3197. movq KK, %rax
  3198. movq AORIG, AO
  3199. leaq (, %rax, SIZE), %rax
  3200. leaq (AO, %rax, 1), AO
  3201. #endif
  /* BO starts at BUFFER; LN/RT skip ahead by (KK << (2 + BASE_SHIFT)) * 2 bytes. */
  3202. leaq BUFFER, BO
  3203. #if defined(LN) || defined(RT)
  3204. movq KK, %rax
  3205. salq $2 + BASE_SHIFT, %rax
  3206. leaq (BO, %rax, 2), BO
  3207. #endif
  /* Preload xmm8 = AO[0] and xmm9/xmm11/xmm13/xmm15 = BO[0]/[8]/[16]/[24];
     the pxor instructions zero the accumulators.
     NOTE(review): xmm10 is loaded from AO[8] here but is not read by any of
     the 1-row code visible below (.L32, .L36, .L38). */
  3208. movsd 0 * SIZE(AO), %xmm8
  3209. pxor %xmm0, %xmm0
  3210. movsd 0 * SIZE(BO), %xmm9
  3211. pxor %xmm1, %xmm1
  3212. movsd 8 * SIZE(AO), %xmm10
  3213. pxor %xmm2, %xmm2
  3214. movsd 8 * SIZE(BO), %xmm11
  3215. pxor %xmm3, %xmm3
  3216. movsd 16 * SIZE(BO), %xmm13
  3217. movsd 24 * SIZE(BO), %xmm15
  /* k count: KK for LT/RN, K - KK otherwise. */
  3218. #if defined(LT) || defined(RN)
  3219. movq KK, %rax
  3220. #else
  3221. movq K, %rax
  3222. subq KK, %rax
  3223. #endif
  /* Passes of the 8x-unrolled loop = count >> 3; none -> remainder at .L35. */
  3224. sarq $3, %rax
  3225. je .L35
  3226. ALIGN_4
  3227. .L32:
  /*
   * Main accumulation loop for the 1-row tile, unrolled over eight k steps
   * per pass; %rax is the number of passes left.  Scalar counterpart of the
   * packed 2-row loop: for every k step, with a = AO[k],
   *     xmm0 += a * BO[8k+0]     xmm1 += a * BO[8k+2]
   *     xmm2 += a * BO[8k+4]     xmm3 += a * BO[8k+6]
   * (only the even BO slots are read; offsets in units of SIZE).
   *
   * On entry xmm8 holds AO[0] and xmm9 / xmm11 / xmm13 / xmm15 hold
   * BO[0] / BO[8] / BO[16] / BO[24].  Each step refills the B register it
   * consumed with the value needed four steps later and reloads xmm8 with
   * the next A element, so the loads of BO 64..88 and AO 8 are the operands
   * of the next pass (they are also executed on the final pass).
   */
  /* k step 0: a = AO 0, b = BO 0/2/4/6; xmm9 <- BO 32 for step 4 */
  3228. mulsd %xmm8, %xmm9
  3229. addsd %xmm9, %xmm0
  3230. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  3231. movsd 2 * SIZE(BO), %xmm9
  3232. mulsd %xmm8, %xmm9
  3233. addsd %xmm9, %xmm1
  3234. movsd 4 * SIZE(BO), %xmm9
  3235. mulsd %xmm8, %xmm9
  3236. mulsd 6 * SIZE(BO), %xmm8
  3237. addsd %xmm9, %xmm2
  3238. movsd 32 * SIZE(BO), %xmm9
  3239. addsd %xmm8, %xmm3
  3240. movsd 1 * SIZE(AO), %xmm8
  /* k step 1: a = AO 1, b = BO 8/10/12/14; xmm11 <- BO 40 for step 5 */
  3241. mulsd %xmm8, %xmm11
  3242. addsd %xmm11, %xmm0
  3243. movsd 10 * SIZE(BO), %xmm11
  3244. mulsd %xmm8, %xmm11
  3245. addsd %xmm11, %xmm1
  3246. movsd 12 * SIZE(BO), %xmm11
  3247. mulsd %xmm8, %xmm11
  3248. mulsd 14 * SIZE(BO), %xmm8
  3249. addsd %xmm11, %xmm2
  3250. movsd 40 * SIZE(BO), %xmm11
  3251. addsd %xmm8, %xmm3
  3252. movsd 2 * SIZE(AO), %xmm8
  /* k step 2: a = AO 2, b = BO 16/18/20/22; xmm13 <- BO 48 for step 6 */
  3253. mulsd %xmm8, %xmm13
  3254. addsd %xmm13, %xmm0
  3255. movsd 18 * SIZE(BO), %xmm13
  3256. mulsd %xmm8, %xmm13
  3257. addsd %xmm13, %xmm1
  3258. movsd 20 * SIZE(BO), %xmm13
  3259. mulsd %xmm8, %xmm13
  3260. mulsd 22 * SIZE(BO), %xmm8
  3261. addsd %xmm13, %xmm2
  3262. movsd 48 * SIZE(BO), %xmm13
  3263. addsd %xmm8, %xmm3
  3264. movsd 3 * SIZE(AO), %xmm8
  /* k step 3: a = AO 3, b = BO 24/26/28/30; xmm15 <- BO 56 for step 7 */
  3265. mulsd %xmm8, %xmm15
  3266. addsd %xmm15, %xmm0
  3267. movsd 26 * SIZE(BO), %xmm15
  3268. mulsd %xmm8, %xmm15
  3269. addsd %xmm15, %xmm1
  3270. movsd 28 * SIZE(BO), %xmm15
  3271. mulsd %xmm8, %xmm15
  3272. mulsd 30 * SIZE(BO), %xmm8
  3273. addsd %xmm15, %xmm2
  3274. movsd 56 * SIZE(BO), %xmm15
  3275. addsd %xmm8, %xmm3
  3276. movsd 4 * SIZE(AO), %xmm8
  /* k step 4: a = AO 4, b = BO 32/34/36/38; xmm9 <- BO 64 (next pass) */
  3277. mulsd %xmm8, %xmm9
  3278. addsd %xmm9, %xmm0
  3279. movsd 34 * SIZE(BO), %xmm9
  3280. mulsd %xmm8, %xmm9
  3281. addsd %xmm9, %xmm1
  3282. movsd 36 * SIZE(BO), %xmm9
  3283. mulsd %xmm8, %xmm9
  3284. mulsd 38 * SIZE(BO), %xmm8
  3285. addsd %xmm9, %xmm2
  3286. movsd 64 * SIZE(BO), %xmm9
  3287. addsd %xmm8, %xmm3
  3288. movsd 5 * SIZE(AO), %xmm8
  /* k step 5: a = AO 5, b = BO 40/42/44/46; xmm11 <- BO 72 (next pass) */
  3289. mulsd %xmm8, %xmm11
  3290. addsd %xmm11, %xmm0
  3291. movsd 42 * SIZE(BO), %xmm11
  3292. mulsd %xmm8, %xmm11
  3293. addsd %xmm11, %xmm1
  3294. movsd 44 * SIZE(BO), %xmm11
  3295. mulsd %xmm8, %xmm11
  3296. mulsd 46 * SIZE(BO), %xmm8
  3297. addsd %xmm11, %xmm2
  3298. movsd 72 * SIZE(BO), %xmm11
  3299. addsd %xmm8, %xmm3
  3300. movsd 6 * SIZE(AO), %xmm8
  /* k step 6: a = AO 6, b = BO 48/50/52/54; xmm13 <- BO 80 (next pass) */
  3301. mulsd %xmm8, %xmm13
  3302. addsd %xmm13, %xmm0
  3303. movsd 50 * SIZE(BO), %xmm13
  3304. mulsd %xmm8, %xmm13
  3305. addsd %xmm13, %xmm1
  3306. movsd 52 * SIZE(BO), %xmm13
  3307. mulsd %xmm8, %xmm13
  3308. mulsd 54 * SIZE(BO), %xmm8
  3309. addsd %xmm13, %xmm2
  3310. movsd 80 * SIZE(BO), %xmm13
  3311. addsd %xmm8, %xmm3
  3312. movsd 7 * SIZE(AO), %xmm8
  /* k step 7: a = AO 7, b = BO 56/58/60/62; xmm15 <- BO 88 and
     xmm8 <- AO 8 (both are operands of the next pass) */
  3313. mulsd %xmm8, %xmm15
  3314. addsd %xmm15, %xmm0
  3315. movsd 58 * SIZE(BO), %xmm15
  3316. mulsd %xmm8, %xmm15
  3317. addsd %xmm15, %xmm1
  3318. movsd 60 * SIZE(BO), %xmm15
  3319. mulsd %xmm8, %xmm15
  3320. mulsd 62 * SIZE(BO), %xmm8
  3321. addsd %xmm15, %xmm2
  3322. movsd 88 * SIZE(BO), %xmm15
  3323. addsd %xmm8, %xmm3
  3324. movsd 8 * SIZE(AO), %xmm8
  /* Consumed 8 k steps: 8 elements of AO and 64 elements of BO. */
  3325. addq $ 8 * SIZE, AO
  3326. addq $64 * SIZE, BO
  3327. decq %rax
  3328. jne .L32
  3329. ALIGN_4
  3330. .L35:
  /*
   * 1-row tile, leftover k steps.  Recompute the k count (KK for LT/RN,
   * K - KK otherwise) and keep its low three bits: the part not handled by
   * the 8x-unrolled loop.  With nothing left over, go straight to .L38.
   */
  3331. #if defined(LT) || defined(RN)
  3332. movq KK, %rax
  3333. #else
  3334. movq K, %rax
  3335. subq KK, %rax
  3336. #endif
  3337. andq $7, %rax # leftover k steps = count & 7
  3338. BRANCH
  3339. je .L38
  3340. ALIGN_4
  3341. .L36:
  /*
   * 1-row tile, one k step per pass (%rax = steps left).
   * xmm8 holds the current A element and xmm9 holds BO[0]:
   *     xmm0 += xmm8 * BO[0]     xmm1 += xmm8 * BO[2]
   *     xmm2 += xmm8 * BO[4]     xmm3 += xmm8 * BO[6]
   * xmm9 and xmm8 are then reloaded from BO 8 / AO 1, i.e. the operands of
   * the following step (loaded on the final pass as well).
   */
  3342. mulsd %xmm8, %xmm9
  3343. addsd %xmm9, %xmm0
  3344. movsd 2 * SIZE(BO), %xmm9
  3345. mulsd %xmm8, %xmm9
  3346. addsd %xmm9, %xmm1
  3347. movsd 4 * SIZE(BO), %xmm9
  3348. mulsd %xmm8, %xmm9
  3349. mulsd 6 * SIZE(BO), %xmm8
  3350. addsd %xmm9, %xmm2
  3351. movsd 8 * SIZE(BO), %xmm9
  3352. addsd %xmm8, %xmm3
  3353. movsd 1 * SIZE(AO), %xmm8
  3354. addq $1 * SIZE, AO # AO += 1 element (one k step of the 1-row tile)
  3355. addq $8 * SIZE, BO # BO += 8 elements (one k step of the expanded B buffer)
  3356. decq %rax
  3357. jg .L36
  3358. ALIGN_4
  3359. .L38:
  /*
   * 1-row tile: apply the accumulated update (xmm0..xmm3), run the scalar
   * triangular solve selected by LN / LT / RN / RT, store the four results
   * to C, write them back into the packed operand and advance the pointers.
   */
  /* LN/RT: re-derive the pointers from their origins.  With
     off = KK - 1 (LN) or KK - 4 (RT):
       AO = AORIG + off * SIZE,  B = BORIG + off * 4 * SIZE,
       BO = BUFFER + off * 8 * SIZE. */
  3360. #if defined(LN) || defined(RT)
  3361. movq KK, %rax
  3362. #ifdef LN
  3363. subq $1, %rax
  3364. #else
  3365. subq $4, %rax
  3366. #endif
  3367. movq AORIG, AO
  3368. movq BORIG, B
  3369. leaq BUFFER, BO
  3370. leaq (, %rax, SIZE), %rax
  3371. leaq (AO, %rax, 1), AO
  3372. leaq (B, %rax, 4), B
  3373. leaq (BO, %rax, 8), BO
  3374. #endif
  /* Right-hand side: four scalars from B (LN/LT) or from AO (RN/RT). */
  3375. #if defined(LN) || defined(LT)
  3376. movsd 0 * SIZE(B), %xmm4
  3377. movsd 1 * SIZE(B), %xmm5
  3378. movsd 2 * SIZE(B), %xmm6
  3379. movsd 3 * SIZE(B), %xmm7
  3380. #else
  3381. movsd 0 * SIZE(AO), %xmm4
  3382. movsd 1 * SIZE(AO), %xmm5
  3383. movsd 2 * SIZE(AO), %xmm6
  3384. movsd 3 * SIZE(AO), %xmm7
  3385. #endif
  /* xmm4..xmm7 -= accumulated products. */
  3386. subsd %xmm0, %xmm4
  3387. subsd %xmm1, %xmm5
  3388. subsd %xmm2, %xmm6
  3389. subsd %xmm3, %xmm7
  /* LN and LT are identical here: all four values are scaled by AO[0].
     NOTE(review): the pivot is multiplied in, not divided by - presumably it
     is stored already inverted by the packing step; confirm. */
  3390. #ifdef LN
  3391. movsd 0 * SIZE(AO), %xmm0
  3392. mulsd %xmm0, %xmm4
  3393. mulsd %xmm0, %xmm5
  3394. mulsd %xmm0, %xmm6
  3395. mulsd %xmm0, %xmm7
  3396. #endif
  3397. #ifdef LT
  3398. movsd 0 * SIZE(AO), %xmm0
  3399. mulsd %xmm0, %xmm4
  3400. mulsd %xmm0, %xmm5
  3401. mulsd %xmm0, %xmm6
  3402. mulsd %xmm0, %xmm7
  3403. #endif
  /* RN: substitution in the order xmm4 -> xmm5 -> xmm6 -> xmm7.
     B[0], B[5], B[10], B[15] scale the results; B[1..3], B[6..7] and B[11]
     are the multipliers used to eliminate earlier results from later ones. */
  3404. #ifdef RN
  3405. mulsd 0 * SIZE(B), %xmm4
  3406. movlpd 1 * SIZE(B), %xmm1
  3407. mulsd %xmm4, %xmm1
  3408. subsd %xmm1, %xmm5
  3409. movlpd 2 * SIZE(B), %xmm2
  3410. mulsd %xmm4, %xmm2
  3411. subsd %xmm2, %xmm6
  3412. movlpd 3 * SIZE(B), %xmm3
  3413. mulsd %xmm4, %xmm3
  3414. subsd %xmm3, %xmm7
  3415. mulsd 5 * SIZE(B), %xmm5
  3416. movlpd 6 * SIZE(B), %xmm1
  3417. mulsd %xmm5, %xmm1
  3418. subsd %xmm1, %xmm6
  3419. movlpd 7 * SIZE(B), %xmm2
  3420. mulsd %xmm5, %xmm2
  3421. subsd %xmm2, %xmm7
  3422. mulsd 10 * SIZE(B), %xmm6
  3423. movlpd 11 * SIZE(B), %xmm1
  3424. mulsd %xmm6, %xmm1
  3425. subsd %xmm1, %xmm7
  3426. mulsd 15 * SIZE(B), %xmm7
  3427. #endif
  /* RT: the same substitution in the opposite order,
     xmm7 -> xmm6 -> xmm5 -> xmm4, scaling by B[15], B[10], B[5], B[0] and
     eliminating with B[14..12], B[9..8] and B[4]. */
  3428. #ifdef RT
  3429. mulsd 15 * SIZE(B), %xmm7
  3430. movlpd 14 * SIZE(B), %xmm1
  3431. mulsd %xmm7, %xmm1
  3432. subsd %xmm1, %xmm6
  3433. movlpd 13 * SIZE(B), %xmm2
  3434. mulsd %xmm7, %xmm2
  3435. subsd %xmm2, %xmm5
  3436. movlpd 12 * SIZE(B), %xmm3
  3437. mulsd %xmm7, %xmm3
  3438. subsd %xmm3, %xmm4
  3439. mulsd 10 * SIZE(B), %xmm6
  3440. movlpd 9 * SIZE(B), %xmm1
  3441. mulsd %xmm6, %xmm1
  3442. subsd %xmm1, %xmm5
  3443. movlpd 8 * SIZE(B), %xmm2
  3444. mulsd %xmm6, %xmm2
  3445. subsd %xmm2, %xmm4
  3446. mulsd 5 * SIZE(B), %xmm5
  3447. movlpd 4 * SIZE(B), %xmm1
  3448. mulsd %xmm5, %xmm1
  3449. subsd %xmm1, %xmm4
  3450. mulsd 0 * SIZE(B), %xmm4
  3451. #endif
  /* LN: step CO1 / CO2 back by one element before storing. */
  3452. #ifdef LN
  3453. subq $1 * SIZE, CO1
  3454. subq $1 * SIZE, CO2
  3455. #endif
  /* Store one element to each of CO1, CO2, CO1 + 2*LDC and CO2 + 2*LDC. */
  3456. movsd %xmm4, 0 * SIZE(CO1)
  3457. movsd %xmm5, 0 * SIZE(CO2)
  3458. movsd %xmm6, 0 * SIZE(CO1, LDC, 2)
  3459. movsd %xmm7, 0 * SIZE(CO2, LDC, 2)
  /* Write the solved values back into the packed operand.
     LN/LT: into B[0..3], and into BO[0..7] with every scalar stored twice in
     adjacent slots.  RN/RT: into AO[0..3]. */
  3460. #if defined(LN) || defined(LT)
  3461. movsd %xmm4, 0 * SIZE(B)
  3462. movsd %xmm5, 1 * SIZE(B)
  3463. movsd %xmm6, 2 * SIZE(B)
  3464. movsd %xmm7, 3 * SIZE(B)
  3465. movsd %xmm4, 0 * SIZE(BO)
  3466. movsd %xmm4, 1 * SIZE(BO)
  3467. movsd %xmm5, 2 * SIZE(BO)
  3468. movsd %xmm5, 3 * SIZE(BO)
  3469. movsd %xmm6, 4 * SIZE(BO)
  3470. movsd %xmm6, 5 * SIZE(BO)
  3471. movsd %xmm7, 6 * SIZE(BO)
  3472. movsd %xmm7, 7 * SIZE(BO)
  3473. #else
  3474. movsd %xmm4, 0 * SIZE(AO)
  3475. movsd %xmm5, 1 * SIZE(AO)
  3476. movsd %xmm6, 2 * SIZE(AO)
  3477. movsd %xmm7, 3 * SIZE(AO)
  3478. #endif
  /* All variants except LN: advance CO1 / CO2 past the stored element. */
  3479. #ifndef LN
  3480. addq $1 * SIZE, CO1
  3481. addq $1 * SIZE, CO2
  3482. #endif
  /* LT/RN: AO += (K - KK) * SIZE; LT additionally steps B by 4 elements. */
  3483. #if defined(LT) || defined(RN)
  3484. movq K, %rax
  3485. subq KK, %rax
  3486. leaq (,%rax, SIZE), %rax
  3487. leaq (AO, %rax, 1), AO
  3488. #ifdef LT
  3489. addq $4 * SIZE, B
  3490. #endif
  3491. #endif
  /* LN: KK -= 1 and B is reset to BORIG. */
  3492. #ifdef LN
  3493. subq $1, KK
  3494. movq BORIG, B
  3495. #endif
  /* LT: KK += 1. */
  3496. #ifdef LT
  3497. addq $1, KK
  3498. #endif
  /* RT: B is reset to BORIG and AORIG += K << BASE_SHIFT
     (presumably K * SIZE bytes - assumes BASE_SHIFT = log2(SIZE); TODO confirm). */
  3499. #ifdef RT
  3500. movq K, %rax
  3501. movq BORIG, B
  3502. salq $0 + BASE_SHIFT, %rax
  3503. addq %rax, AORIG
  3504. #endif
  3505. ALIGN_4
  3506. .L39:
  /*
   * End of one pass over J: move B and KK on, then loop back to .L01 while
   * the decremented J is still positive.
   */
  /* LN: B += K * 4 * SIZE. */
  3507. #ifdef LN
  3508. leaq (, K, SIZE), %rax
  3509. leaq (B, %rax, 4), B
  3510. #endif
  /* LT/RN: B += (K - KK) * 4 * SIZE. */
  3511. #if defined(LT) || defined(RN)
  3512. movq K, %rax
  3513. subq KK, %rax
  3514. leaq (,%rax, SIZE), %rax
  3515. leaq (B, %rax, 4), B
  3516. #endif
  /* KK moves by 4 per pass: forward for RN, backward for RT. */
  3517. #ifdef RN
  3518. addq $4, KK
  3519. #endif
  3520. #ifdef RT
  3521. subq $4, KK
  3522. #endif
  3523. decq J # j --
  3524. jg .L01
  3525. ALIGN_4
  3526. .L999:
  /*
   * Common exit.  %rbx holds the stack pointer value to return to (the
   * routine evidently runs with a different %rsp); restore it first, then
   * reload the saved registers from the area it points at.
   * NOTE(review): the matching saves are presumably in the prologue, which
   * is outside this view - confirm the offsets against it.
   */
  3527. movq %rbx, %rsp
  /* General-purpose registers saved at offsets 0..40. */
  3528. movq 0(%rsp), %rbx
  3529. movq 8(%rsp), %rbp
  3530. movq 16(%rsp), %r12
  3531. movq 24(%rsp), %r13
  3532. movq 32(%rsp), %r14
  3533. movq 40(%rsp), %r15
  /* WINDOWS_ABI builds additionally restore %rdi, %rsi and %xmm6..%xmm15
     (movups: no alignment requirement on the save slots). */
  3534. #ifdef WINDOWS_ABI
  3535. movq 48(%rsp), %rdi
  3536. movq 56(%rsp), %rsi
  3537. movups 64(%rsp), %xmm6
  3538. movups 80(%rsp), %xmm7
  3539. movups 96(%rsp), %xmm8
  3540. movups 112(%rsp), %xmm9
  3541. movups 128(%rsp), %xmm10
  3542. movups 144(%rsp), %xmm11
  3543. movups 160(%rsp), %xmm12
  3544. movups 176(%rsp), %xmm13
  3545. movups 192(%rsp), %xmm14
  3546. movups 208(%rsp), %xmm15
  3547. #endif
  /* Release the STACKSIZE-byte save area and return to the caller. */
  3548. addq $STACKSIZE, %rsp
  3549. ret
  3550. EPILOGUE