
cgemv_t.S (80 kB)

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
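/* Annotation (not part of the original source): this file appears to be the */
/* OpenBLAS SSE kernel for the transposed complex single-precision GEMV,     */
/* i.e. roughly y := alpha * A^T * x + y on interleaved (re, im) float       */
/* pairs; under CONJ/XCONJ the cross-term signs flip.  A minimal C sketch of */
/* the computation, assuming column-major A with leading dimension lda and   */
/* unit strides (the assembly additionally handles INCX/INCY strides,        */
/* conjugation and alignment cases):                                         */
/*                                                                           */
/*   for (int j = 0; j < n; j++) {                                           */
/*     float sr = 0.0f, si = 0.0f;                                           */
/*     for (int i = 0; i < m; i++) {                                         */
/*       float ar = a[2*(i + j*lda)], ai = a[2*(i + j*lda) + 1];             */
/*       float xr = x[2*i],           xi = x[2*i + 1];                       */
/*       sr += ar*xr - ai*xi;                                                */
/*       si += ar*xi + ai*xr;                                                */
/*     }                                                                     */
/*     y[2*j]     += alpha_r*sr - alpha_i*si;                                */
/*     y[2*j + 1] += alpha_r*si + alpha_i*sr;                                */
/*   }                                                                       */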
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "l2param.h"
  41. #if GEMV_UNROLL < 2
  42. #undef GEMV_UNROLL
  43. #define GEMV_UNROLL 2
  44. #endif
  45. #ifndef WINDOWS_ABI
  46. #define STACKSIZE 128
  47. #define OLD_INCX 8 + STACKSIZE(%rsp)
  48. #define OLD_Y 16 + STACKSIZE(%rsp)
  49. #define OLD_INCY 24 + STACKSIZE(%rsp)
  50. #define OLD_BUFFER 32 + STACKSIZE(%rsp)
  51. #define ALPHA 48(%rsp)
  52. #define MMM 64(%rsp)
  53. #define NN 72(%rsp)
  54. #define AA 80(%rsp)
  55. #define LDAX 88(%rsp)
  56. #define ALPHAR 96(%rsp)
  57. #define ALPHAI 104(%rsp)
  58. #define M %rdi
  59. #define N %rsi
  60. #define A %rcx
  61. #define LDA %r8
  62. #define X %r9
  63. #define INCX %rdx
  64. #define Y %rbp
  65. #define INCY %r10
  66. #else
  67. #define STACKSIZE 288
  68. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  69. #define OLD_A 48 + STACKSIZE(%rsp)
  70. #define OLD_LDA 56 + STACKSIZE(%rsp)
  71. #define OLD_X 64 + STACKSIZE(%rsp)
  72. #define OLD_INCX 72 + STACKSIZE(%rsp)
  73. #define OLD_Y 80 + STACKSIZE(%rsp)
  74. #define OLD_INCY 88 + STACKSIZE(%rsp)
  75. #define OLD_BUFFER 96 + STACKSIZE(%rsp)
  76. #define ALPHA 224(%rsp)
  77. #define MMM 232(%rsp)
  78. #define NN 240(%rsp)
  79. #define AA 248(%rsp)
  80. #define LDAX 256(%rsp)
  81. #define ALPHAR 264(%rsp)
  82. #define ALPHAI 272(%rsp)
  83. #define M %rcx
  84. #define N %rdx
  85. #define A %r8
  86. #define LDA %r9
  87. #define X %rdi
  88. #define INCX %rsi
  89. #define Y %rbp
  90. #define INCY %r10
  91. #endif
  92. #define I %rax
  93. #define A1 %r11
  94. #define A2 %r12
  95. #define X1 %rbx
  96. #define Y1 %r13
  97. #define BUFFER %r14
  98. #ifdef ALIGNED_ACCESS
  99. #define MM %r15
  100. #else
  101. #define MM M
  102. #endif
  103. #undef SUBPS
  104. #ifndef CONJ
  105. #define SUBPS addps
  106. #else
  107. #define SUBPS subps
  108. #endif
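/* Annotation: SUBPS above is a macro, not an instruction; it expands to     */
/* addps in the non-conjugated case and to subps when CONJ is defined, so    */
/* the single accumulation sequence below handles both a*x and conj(a)*x     */
/* style cross terms without duplicating the loop bodies.                    */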
  109. PROLOGUE
  110. PROFCODE
  111. subq $STACKSIZE, %rsp
  112. movq %rbx, 0(%rsp)
  113. movq %rbp, 8(%rsp)
  114. movq %r12, 16(%rsp)
  115. movq %r13, 24(%rsp)
  116. movq %r14, 32(%rsp)
  117. movq %r15, 40(%rsp)
  118. #ifdef WINDOWS_ABI
  119. movq %rdi, 48(%rsp)
  120. movq %rsi, 56(%rsp)
  121. movups %xmm6, 64(%rsp)
  122. movups %xmm7, 80(%rsp)
  123. movups %xmm8, 96(%rsp)
  124. movups %xmm9, 112(%rsp)
  125. movups %xmm10, 128(%rsp)
  126. movups %xmm11, 144(%rsp)
  127. movups %xmm12, 160(%rsp)
  128. movups %xmm13, 176(%rsp)
  129. movups %xmm14, 192(%rsp)
  130. movups %xmm15, 208(%rsp)
  131. movq OLD_A, A
  132. movq OLD_LDA, LDA
  133. movq OLD_X, X
  134. movaps %xmm3, %xmm0
  135. movss OLD_ALPHA_I, %xmm1
  136. #endif
  137. movq A, AA
  138. movq N, NN
  139. movq M, MMM
  140. movq LDA, LDAX
  141. movss %xmm0,ALPHAR
  142. movss %xmm1,ALPHAI
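/* Annotation: the .L0t loop below strip-mines the row dimension.  Each pass */
/* sets M to a 2^20-row block (salq $20), subtracts it from the saved total  */
/* in MMM and reruns the whole kernel on that block, presumably so the       */
/* gathered copy of X in BUFFER stays bounded; the shorter final block is    */
/* handled by the jge/jle pair just before .L00t.                            */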
  143. .L0t:
  144. xorq I,I
  145. addq $1,I
  146. salq $20,I
  147. subq I,MMM
  148. movq I,M
  149. movss ALPHAR,%xmm0
  150. movss ALPHAI,%xmm1
  151. jge .L00t
  152. movq MMM,M
  153. addq I,M
  154. jle .L999x
  155. .L00t:
  156. movq AA, A
  157. movq NN, N
  158. movq LDAX, LDA
  159. movq OLD_INCX, INCX
  160. movq OLD_Y, Y
  161. movq OLD_INCY, INCY
  162. movq OLD_BUFFER, BUFFER
  163. salq $ZBASE_SHIFT, LDA
  164. salq $ZBASE_SHIFT, INCX
  165. salq $ZBASE_SHIFT, INCY
  166. unpcklps %xmm1, %xmm0
  167. movlps %xmm0, ALPHA
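/* Annotation: the salq instructions above turn element counts into byte     */
/* offsets; for single-precision complex each element is two floats, so      */
/* ZBASE_SHIFT is presumably 3 (8 bytes per element).  unpcklps/movlps then  */
/* store the packed (alpha_r, alpha_i) pair at ALPHA on the stack, where it  */
/* is reloaded (movddup, or movsd plus pshufd) when the accumulators are     */
/* scaled at the end of each column group.                                   */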
  168. testq M, M
  169. jle .L999
  170. testq N, N
  171. jle .L999
  172. subq $-32 * SIZE, A
  173. movq BUFFER, X1
  174. #ifdef ALIGNED_ACCESS
  175. movq M, MM
  176. movq A, %rax
  177. andq $4 * SIZE - 1, %rax
  178. cmpq $2 * SIZE, %rax
  179. jl .L0X
  180. movsd (X), %xmm0
  181. addq INCX, X
  182. movlps %xmm0, 2 * SIZE(X1)
  183. addq $2 * SIZE, BUFFER
  184. addq $4 * SIZE, X1
  185. decq MM
  186. .L0X:
  187. #endif
  188. movq MM, I
  189. sarq $3, I
  190. jle .L05
  191. ALIGN_4
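/* Annotation: .L02 gathers X (stride INCX) into the contiguous, aligned     */
/* BUFFER, two complex elements per xmm store and eight per iteration; .L06  */
/* copies the remaining 0-7 elements one at a time.  The dot-product loops   */
/* can then read X1 with plain movaps instead of strided loads.              */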
  192. .L02:
  193. movsd (X), %xmm0
  194. addq INCX, X
  195. movhps (X), %xmm0
  196. addq INCX, X
  197. movsd (X), %xmm1
  198. addq INCX, X
  199. movhps (X), %xmm1
  200. addq INCX, X
  201. movsd (X), %xmm2
  202. addq INCX, X
  203. movhps (X), %xmm2
  204. addq INCX, X
  205. movsd (X), %xmm3
  206. addq INCX, X
  207. movhps (X), %xmm3
  208. addq INCX, X
  209. movaps %xmm0, 0 * SIZE(X1)
  210. movaps %xmm1, 4 * SIZE(X1)
  211. movaps %xmm2, 8 * SIZE(X1)
  212. movaps %xmm3, 12 * SIZE(X1)
  213. addq $16 * SIZE, X1
  214. decq I
  215. jg .L02
  216. ALIGN_4
  217. .L05:
  218. movq MM, I
  219. andq $7, I
  220. jle .L10
  221. ALIGN_2
  222. .L06:
  223. movsd (X), %xmm0
  224. addq INCX, X
  225. movlps %xmm0, 0 * SIZE(X1)
  226. addq $2 * SIZE, X1
  227. decq I
  228. jg .L06
  229. ALIGN_4
  230. .L10:
  231. movq Y, Y1
  232. #ifdef ALIGNED_ACCESS
  233. testq $SIZE, A
  234. jne .L200
  235. testq $2 * SIZE, LDA
  236. jne .L100
  237. #endif
  238. #if GEMV_UNROLL >= 4
  239. cmpq $4, N
  240. jl .L20
  241. ALIGN_3
  242. .L11:
  243. subq $4, N
  244. leaq 32 * SIZE(BUFFER), X1
  245. movq A, A1
  246. leaq (A1, LDA, 2), A2
  247. leaq (A1, LDA, 4), A
  248. xorps %xmm0, %xmm0
  249. xorps %xmm1, %xmm1
  250. xorps %xmm2, %xmm2
  251. xorps %xmm3, %xmm3
  252. xorps %xmm4, %xmm4
  253. xorps %xmm5, %xmm5
  254. xorps %xmm6, %xmm6
  255. xorps %xmm7, %xmm7
  256. #ifdef ALIGNED_ACCESS
  257. cmpq M, MM
  258. je .L1X
  259. #ifdef movsd
  260. xorps %xmm8, %xmm8
  261. #endif
  262. movsd -32 * SIZE(A1), %xmm8
  263. #ifdef movsd
  264. xorps %xmm9, %xmm9
  265. #endif
  266. movsd -32 * SIZE(A1, LDA), %xmm9
  267. #ifdef movsd
  268. xorps %xmm10, %xmm10
  269. #endif
  270. movsd -32 * SIZE(A2), %xmm10
  271. #ifdef movsd
  272. xorps %xmm11, %xmm11
  273. #endif
  274. movsd -32 * SIZE(A2, LDA), %xmm11
  275. #ifdef movsd
  276. xorps %xmm12, %xmm12
  277. #endif
  278. movsd -32 * SIZE(X1), %xmm12
  279. pshufd $0xb1, %xmm8, %xmm14
  280. mulps %xmm12, %xmm8
  281. addps %xmm8, %xmm0
  282. pshufd $0xb1, %xmm9, %xmm15
  283. mulps %xmm12, %xmm9
  284. addps %xmm9, %xmm2
  285. mulps %xmm12, %xmm14
  286. SUBPS %xmm14, %xmm1
  287. mulps %xmm12, %xmm15
  288. SUBPS %xmm15, %xmm3
  289. pshufd $0xb1, %xmm10, %xmm14
  290. mulps %xmm12, %xmm10
  291. addps %xmm10, %xmm4
  292. pshufd $0xb1, %xmm11, %xmm15
  293. mulps %xmm12, %xmm11
  294. addps %xmm11, %xmm6
  295. mulps %xmm12, %xmm14
  296. SUBPS %xmm14, %xmm5
  297. mulps %xmm12, %xmm15
  298. SUBPS %xmm15, %xmm7
  299. addq $2 * SIZE, A1
  300. addq $2 * SIZE, A2
  301. addq $2 * SIZE, X1
  302. ALIGN_3
  303. .L1X:
  304. #endif
  305. movaps -32 * SIZE(X1), %xmm12
  306. movaps -28 * SIZE(X1), %xmm13
  307. #ifdef PREFETCHW
  308. PREFETCHW 7 * SIZE(Y1)
  309. #endif
  310. movq MM, I
  311. sarq $3, I
  312. jle .L15
  313. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  314. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9)
  315. MOVUPS_A1(-32 * SIZE, A2, %xmm10)
  316. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11)
  317. decq I
  318. jle .L14
  319. ALIGN_3
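/* Annotation: the loop body below is the usual SSE complex dot-product      */
/* pattern.  pshufd $0xb1 builds a copy of each matrix vector with (re, im)  */
/* swapped inside every pair; multiplying the original by the packed x       */
/* yields the ar*xr / ai*xi terms (accumulated in xmm0/2/4/6, one pair of    */
/* registers per column) while the swapped copy yields the ai*xr / ar*xi     */
/* cross terms (accumulated via SUBPS in xmm1/3/5/7).  The eight             */
/* accumulators are reduced to complex results after the loop, around .L19.  */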
  320. .L13:
  321. #ifdef PREFETCH
  322. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  323. #endif
  324. pshufd $0xb1, %xmm8, %xmm14
  325. mulps %xmm12, %xmm8
  326. addps %xmm8, %xmm0
  327. MOVUPS_A1(-28 * SIZE, A1, %xmm8)
  328. pshufd $0xb1, %xmm9, %xmm15
  329. mulps %xmm12, %xmm9
  330. addps %xmm9, %xmm2
  331. MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9)
  332. mulps %xmm12, %xmm14
  333. SUBPS %xmm14, %xmm1
  334. mulps %xmm12, %xmm15
  335. SUBPS %xmm15, %xmm3
  336. pshufd $0xb1, %xmm10, %xmm14
  337. mulps %xmm12, %xmm10
  338. addps %xmm10, %xmm4
  339. MOVUPS_A1(-28 * SIZE, A2, %xmm10)
  340. pshufd $0xb1, %xmm11, %xmm15
  341. mulps %xmm12, %xmm11
  342. addps %xmm11, %xmm6
  343. MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11)
  344. mulps %xmm12, %xmm14
  345. SUBPS %xmm14, %xmm5
  346. mulps %xmm12, %xmm15
  347. movaps -24 * SIZE(X1), %xmm12
  348. SUBPS %xmm15, %xmm7
  349. #ifdef PREFETCH
  350. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  351. #endif
  352. pshufd $0xb1, %xmm8, %xmm14
  353. mulps %xmm13, %xmm8
  354. addps %xmm8, %xmm0
  355. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  356. pshufd $0xb1, %xmm9, %xmm15
  357. mulps %xmm13, %xmm9
  358. addps %xmm9, %xmm2
  359. MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9)
  360. mulps %xmm13, %xmm14
  361. SUBPS %xmm14, %xmm1
  362. mulps %xmm13, %xmm15
  363. SUBPS %xmm15, %xmm3
  364. pshufd $0xb1, %xmm10, %xmm14
  365. mulps %xmm13, %xmm10
  366. addps %xmm10, %xmm4
  367. MOVUPS_A1(-24 * SIZE, A2, %xmm10)
  368. pshufd $0xb1, %xmm11, %xmm15
  369. mulps %xmm13, %xmm11
  370. addps %xmm11, %xmm6
  371. MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11)
  372. mulps %xmm13, %xmm14
  373. SUBPS %xmm14, %xmm5
  374. mulps %xmm13, %xmm15
  375. movaps -20 * SIZE(X1), %xmm13
  376. SUBPS %xmm15, %xmm7
  377. #ifdef PREFETCH
  378. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  379. #endif
  380. pshufd $0xb1, %xmm8, %xmm14
  381. mulps %xmm12, %xmm8
  382. addps %xmm8, %xmm0
  383. MOVUPS_A1(-20 * SIZE, A1, %xmm8)
  384. pshufd $0xb1, %xmm9, %xmm15
  385. mulps %xmm12, %xmm9
  386. addps %xmm9, %xmm2
  387. MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9)
  388. mulps %xmm12, %xmm14
  389. SUBPS %xmm14, %xmm1
  390. mulps %xmm12, %xmm15
  391. SUBPS %xmm15, %xmm3
  392. pshufd $0xb1, %xmm10, %xmm14
  393. mulps %xmm12, %xmm10
  394. addps %xmm10, %xmm4
  395. MOVUPS_A1(-20 * SIZE, A2, %xmm10)
  396. pshufd $0xb1, %xmm11, %xmm15
  397. mulps %xmm12, %xmm11
  398. addps %xmm11, %xmm6
  399. MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11)
  400. mulps %xmm12, %xmm14
  401. SUBPS %xmm14, %xmm5
  402. mulps %xmm12, %xmm15
  403. movaps -16 * SIZE(X1), %xmm12
  404. SUBPS %xmm15, %xmm7
  405. #ifdef PREFETCH
  406. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  407. #endif
  408. pshufd $0xb1, %xmm8, %xmm14
  409. mulps %xmm13, %xmm8
  410. addps %xmm8, %xmm0
  411. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  412. pshufd $0xb1, %xmm9, %xmm15
  413. mulps %xmm13, %xmm9
  414. addps %xmm9, %xmm2
  415. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  416. mulps %xmm13, %xmm14
  417. SUBPS %xmm14, %xmm1
  418. mulps %xmm13, %xmm15
  419. SUBPS %xmm15, %xmm3
  420. #ifdef PREFETCHW
  421. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1)
  422. #endif
  423. pshufd $0xb1, %xmm10, %xmm14
  424. mulps %xmm13, %xmm10
  425. addps %xmm10, %xmm4
  426. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  427. pshufd $0xb1, %xmm11, %xmm15
  428. mulps %xmm13, %xmm11
  429. addps %xmm11, %xmm6
  430. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)
  431. mulps %xmm13, %xmm14
  432. SUBPS %xmm14, %xmm5
  433. mulps %xmm13, %xmm15
  434. movaps -12 * SIZE(X1), %xmm13
  435. SUBPS %xmm15, %xmm7
  436. subq $-16 * SIZE, A1
  437. subq $-16 * SIZE, A2
  438. subq $-16 * SIZE, X1
  439. subq $1, I
  440. BRANCH
  441. jg .L13
  442. ALIGN_3
  443. .L14:
  444. pshufd $0xb1, %xmm8, %xmm14
  445. mulps %xmm12, %xmm8
  446. addps %xmm8, %xmm0
  447. MOVUPS_A1(-28 * SIZE, A1, %xmm8)
  448. pshufd $0xb1, %xmm9, %xmm15
  449. mulps %xmm12, %xmm9
  450. addps %xmm9, %xmm2
  451. MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9)
  452. mulps %xmm12, %xmm14
  453. SUBPS %xmm14, %xmm1
  454. mulps %xmm12, %xmm15
  455. SUBPS %xmm15, %xmm3
  456. pshufd $0xb1, %xmm10, %xmm14
  457. mulps %xmm12, %xmm10
  458. addps %xmm10, %xmm4
  459. MOVUPS_A1(-28 * SIZE, A2, %xmm10)
  460. pshufd $0xb1, %xmm11, %xmm15
  461. mulps %xmm12, %xmm11
  462. addps %xmm11, %xmm6
  463. MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11)
  464. mulps %xmm12, %xmm14
  465. SUBPS %xmm14, %xmm5
  466. mulps %xmm12, %xmm15
  467. movaps -24 * SIZE(X1), %xmm12
  468. SUBPS %xmm15, %xmm7
  469. pshufd $0xb1, %xmm8, %xmm14
  470. mulps %xmm13, %xmm8
  471. addps %xmm8, %xmm0
  472. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  473. pshufd $0xb1, %xmm9, %xmm15
  474. mulps %xmm13, %xmm9
  475. addps %xmm9, %xmm2
  476. MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9)
  477. mulps %xmm13, %xmm14
  478. SUBPS %xmm14, %xmm1
  479. mulps %xmm13, %xmm15
  480. SUBPS %xmm15, %xmm3
  481. pshufd $0xb1, %xmm10, %xmm14
  482. mulps %xmm13, %xmm10
  483. addps %xmm10, %xmm4
  484. MOVUPS_A1(-24 * SIZE, A2, %xmm10)
  485. pshufd $0xb1, %xmm11, %xmm15
  486. mulps %xmm13, %xmm11
  487. addps %xmm11, %xmm6
  488. MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11)
  489. mulps %xmm13, %xmm14
  490. SUBPS %xmm14, %xmm5
  491. mulps %xmm13, %xmm15
  492. movaps -20 * SIZE(X1), %xmm13
  493. SUBPS %xmm15, %xmm7
  494. pshufd $0xb1, %xmm8, %xmm14
  495. mulps %xmm12, %xmm8
  496. addps %xmm8, %xmm0
  497. MOVUPS_A1(-20 * SIZE, A1, %xmm8)
  498. pshufd $0xb1, %xmm9, %xmm15
  499. mulps %xmm12, %xmm9
  500. addps %xmm9, %xmm2
  501. MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9)
  502. mulps %xmm12, %xmm14
  503. SUBPS %xmm14, %xmm1
  504. mulps %xmm12, %xmm15
  505. SUBPS %xmm15, %xmm3
  506. pshufd $0xb1, %xmm10, %xmm14
  507. mulps %xmm12, %xmm10
  508. addps %xmm10, %xmm4
  509. MOVUPS_A1(-20 * SIZE, A2, %xmm10)
  510. pshufd $0xb1, %xmm11, %xmm15
  511. mulps %xmm12, %xmm11
  512. addps %xmm11, %xmm6
  513. MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11)
  514. mulps %xmm12, %xmm14
  515. SUBPS %xmm14, %xmm5
  516. mulps %xmm12, %xmm15
  517. movaps -16 * SIZE(X1), %xmm12
  518. SUBPS %xmm15, %xmm7
  519. pshufd $0xb1, %xmm8, %xmm14
  520. mulps %xmm13, %xmm8
  521. addps %xmm8, %xmm0
  522. pshufd $0xb1, %xmm9, %xmm15
  523. mulps %xmm13, %xmm9
  524. addps %xmm9, %xmm2
  525. mulps %xmm13, %xmm14
  526. SUBPS %xmm14, %xmm1
  527. mulps %xmm13, %xmm15
  528. SUBPS %xmm15, %xmm3
  529. pshufd $0xb1, %xmm10, %xmm14
  530. mulps %xmm13, %xmm10
  531. addps %xmm10, %xmm4
  532. pshufd $0xb1, %xmm11, %xmm15
  533. mulps %xmm13, %xmm11
  534. addps %xmm11, %xmm6
  535. mulps %xmm13, %xmm14
  536. SUBPS %xmm14, %xmm5
  537. mulps %xmm13, %xmm15
  538. movaps -12 * SIZE(X1), %xmm13
  539. SUBPS %xmm15, %xmm7
  540. subq $-16 * SIZE, A1
  541. subq $-16 * SIZE, A2
  542. subq $-16 * SIZE, X1
  543. ALIGN_3
  544. .L15:
  545. testq $4, MM
  546. je .L17
  547. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  548. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9)
  549. MOVUPS_A1(-32 * SIZE, A2, %xmm10)
  550. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11)
  551. pshufd $0xb1, %xmm8, %xmm14
  552. mulps %xmm12, %xmm8
  553. addps %xmm8, %xmm0
  554. MOVUPS_A1(-28 * SIZE, A1, %xmm8)
  555. pshufd $0xb1, %xmm9, %xmm15
  556. mulps %xmm12, %xmm9
  557. addps %xmm9, %xmm2
  558. MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9)
  559. mulps %xmm12, %xmm14
  560. SUBPS %xmm14, %xmm1
  561. mulps %xmm12, %xmm15
  562. SUBPS %xmm15, %xmm3
  563. pshufd $0xb1, %xmm10, %xmm14
  564. mulps %xmm12, %xmm10
  565. addps %xmm10, %xmm4
  566. MOVUPS_A1(-28 * SIZE, A2, %xmm10)
  567. pshufd $0xb1, %xmm11, %xmm15
  568. mulps %xmm12, %xmm11
  569. addps %xmm11, %xmm6
  570. MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11)
  571. mulps %xmm12, %xmm14
  572. SUBPS %xmm14, %xmm5
  573. mulps %xmm12, %xmm15
  574. movaps -24 * SIZE(X1), %xmm12
  575. SUBPS %xmm15, %xmm7
  576. pshufd $0xb1, %xmm8, %xmm14
  577. mulps %xmm13, %xmm8
  578. addps %xmm8, %xmm0
  579. pshufd $0xb1, %xmm9, %xmm15
  580. mulps %xmm13, %xmm9
  581. addps %xmm9, %xmm2
  582. mulps %xmm13, %xmm14
  583. SUBPS %xmm14, %xmm1
  584. mulps %xmm13, %xmm15
  585. SUBPS %xmm15, %xmm3
  586. pshufd $0xb1, %xmm10, %xmm14
  587. mulps %xmm13, %xmm10
  588. addps %xmm10, %xmm4
  589. pshufd $0xb1, %xmm11, %xmm15
  590. mulps %xmm13, %xmm11
  591. addps %xmm11, %xmm6
  592. mulps %xmm13, %xmm14
  593. SUBPS %xmm14, %xmm5
  594. mulps %xmm13, %xmm15
  595. movaps -20 * SIZE(X1), %xmm13
  596. SUBPS %xmm15, %xmm7
  597. addq $8 * SIZE, A1
  598. addq $8 * SIZE, A2
  599. addq $8 * SIZE, X1
  600. ALIGN_3
  601. .L17:
  602. testq $2, MM
  603. je .L18
  604. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  605. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9)
  606. MOVUPS_A1(-32 * SIZE, A2, %xmm10)
  607. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11)
  608. pshufd $0xb1, %xmm8, %xmm14
  609. mulps %xmm12, %xmm8
  610. addps %xmm8, %xmm0
  611. pshufd $0xb1, %xmm9, %xmm15
  612. mulps %xmm12, %xmm9
  613. addps %xmm9, %xmm2
  614. mulps %xmm12, %xmm14
  615. SUBPS %xmm14, %xmm1
  616. mulps %xmm12, %xmm15
  617. SUBPS %xmm15, %xmm3
  618. pshufd $0xb1, %xmm10, %xmm14
  619. mulps %xmm12, %xmm10
  620. addps %xmm10, %xmm4
  621. pshufd $0xb1, %xmm11, %xmm15
  622. mulps %xmm12, %xmm11
  623. addps %xmm11, %xmm6
  624. mulps %xmm12, %xmm14
  625. SUBPS %xmm14, %xmm5
  626. mulps %xmm12, %xmm15
  627. SUBPS %xmm15, %xmm7
  628. movaps %xmm13, %xmm12
  629. addq $4 * SIZE, A1
  630. addq $4 * SIZE, A2
  631. ALIGN_3
  632. .L18:
  633. testq $1, MM
  634. je .L19
  635. #ifdef movsd
  636. xorps %xmm8, %xmm8
  637. #endif
  638. movsd -32 * SIZE(A1), %xmm8
  639. #ifdef movsd
  640. xorps %xmm9, %xmm9
  641. #endif
  642. movsd -32 * SIZE(A1, LDA), %xmm9
  643. #ifdef movsd
  644. xorps %xmm10, %xmm10
  645. #endif
  646. movsd -32 * SIZE(A2), %xmm10
  647. #ifdef movsd
  648. xorps %xmm11, %xmm11
  649. #endif
  650. movsd -32 * SIZE(A2, LDA), %xmm11
  651. pshufd $0xb1, %xmm8, %xmm14
  652. mulps %xmm12, %xmm8
  653. addps %xmm8, %xmm0
  654. pshufd $0xb1, %xmm9, %xmm15
  655. mulps %xmm12, %xmm9
  656. addps %xmm9, %xmm2
  657. mulps %xmm12, %xmm14
  658. SUBPS %xmm14, %xmm1
  659. mulps %xmm12, %xmm15
  660. SUBPS %xmm15, %xmm3
  661. pshufd $0xb1, %xmm10, %xmm14
  662. mulps %xmm12, %xmm10
  663. addps %xmm10, %xmm4
  664. pshufd $0xb1, %xmm11, %xmm15
  665. mulps %xmm12, %xmm11
  666. addps %xmm11, %xmm6
  667. mulps %xmm12, %xmm14
  668. SUBPS %xmm14, %xmm5
  669. mulps %xmm12, %xmm15
  670. SUBPS %xmm15, %xmm7
  671. ALIGN_3
  672. .L19:
  673. pcmpeqb %xmm11, %xmm11
  674. psllq $63, %xmm11
  675. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  676. xorps %xmm11, %xmm0
  677. xorps %xmm11, %xmm2
  678. xorps %xmm11, %xmm4
  679. xorps %xmm11, %xmm6
  680. #else
  681. xorps %xmm11, %xmm1
  682. xorps %xmm11, %xmm3
  683. xorps %xmm11, %xmm5
  684. xorps %xmm11, %xmm7
  685. #endif
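/* Annotation: pcmpeqb/psllq $63 builds a mask with only the sign bit of the */
/* upper float of each 64-bit pair set; xoring it into either the even or    */
/* the odd accumulators adjusts the signs for the four CONJ/XCONJ            */
/* combinations.  The haddps path (or the unpcklps/unpckhps fallback) then   */
/* sums each accumulator horizontally and interleaves the four column        */
/* results into xmm0 and xmm4 before they are scaled by alpha.               */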
  686. #ifdef HAVE_SSE3
  687. haddps %xmm1, %xmm0
  688. haddps %xmm3, %xmm2
  689. haddps %xmm2, %xmm0
  690. haddps %xmm5, %xmm4
  691. haddps %xmm7, %xmm6
  692. haddps %xmm6, %xmm4
  693. #else
  694. movaps %xmm0, %xmm8
  695. unpcklps %xmm1, %xmm0
  696. unpckhps %xmm1, %xmm8
  697. movaps %xmm2, %xmm9
  698. unpcklps %xmm3, %xmm2
  699. unpckhps %xmm3, %xmm9
  700. movaps %xmm4, %xmm10
  701. unpcklps %xmm5, %xmm4
  702. unpckhps %xmm5, %xmm10
  703. movaps %xmm6, %xmm12
  704. unpcklps %xmm7, %xmm6
  705. unpckhps %xmm7, %xmm12
  706. addps %xmm8, %xmm0
  707. addps %xmm9, %xmm2
  708. addps %xmm10, %xmm4
  709. addps %xmm12, %xmm6
  710. movhlps %xmm0, %xmm1
  711. movhlps %xmm2, %xmm3
  712. movhlps %xmm4, %xmm5
  713. movhlps %xmm6, %xmm7
  714. addps %xmm1, %xmm0
  715. addps %xmm3, %xmm2
  716. addps %xmm5, %xmm4
  717. addps %xmm7, %xmm6
  718. movlhps %xmm2, %xmm0
  719. movlhps %xmm6, %xmm4
  720. #endif
  721. pshufd $0xb1, %xmm0, %xmm1
  722. pshufd $0xb1, %xmm4, %xmm5
  723. #ifdef HAVE_SSE3
  724. movddup ALPHA, %xmm15
  725. #else
  726. movsd ALPHA, %xmm15
  727. pshufd $0x44, %xmm15, %xmm15
  728. #endif
  729. mulps %xmm15, %xmm0
  730. mulps %xmm15, %xmm1
  731. mulps %xmm15, %xmm4
  732. mulps %xmm15, %xmm5
  733. xorps %xmm11, %xmm0
  734. xorps %xmm11, %xmm4
  735. #ifdef HAVE_SSE3
  736. haddps %xmm1, %xmm0
  737. haddps %xmm5, %xmm4
  738. #else
  739. movaps %xmm0, %xmm2
  740. shufps $0x88, %xmm1, %xmm0
  741. shufps $0xdd, %xmm1, %xmm2
  742. movaps %xmm4, %xmm6
  743. shufps $0x88, %xmm5, %xmm4
  744. shufps $0xdd, %xmm5, %xmm6
  745. addps %xmm2, %xmm0
  746. addps %xmm6, %xmm4
  747. #endif
  748. movsd (Y), %xmm2
  749. addq INCY, Y
  750. movhps (Y), %xmm2
  751. addq INCY, Y
  752. movsd (Y), %xmm6
  753. addq INCY, Y
  754. movhps (Y), %xmm6
  755. addq INCY, Y
  756. shufps $0xd8, %xmm0, %xmm0
  757. shufps $0xd8, %xmm4, %xmm4
  758. addps %xmm2, %xmm0
  759. addps %xmm6, %xmm4
  760. movlps %xmm0, (Y1)
  761. addq INCY, Y1
  762. movhps %xmm0, (Y1)
  763. addq INCY, Y1
  764. movlps %xmm4, (Y1)
  765. addq INCY, Y1
  766. movhps %xmm4, (Y1)
  767. addq INCY, Y1
  768. cmpq $4, N
  769. jge .L11
  770. ALIGN_3
  771. .L20:
  772. #endif
  773. cmpq $2, N
  774. jl .L30
  775. #if GEMV_UNROLL == 2
  776. ALIGN_3
  777. .L21:
  778. #endif
  779. subq $2, N
  780. leaq 32 * SIZE(BUFFER), X1
  781. movq A, A1
  782. leaq (A1, LDA), A2
  783. leaq (A1, LDA, 2), A
  784. xorps %xmm0, %xmm0
  785. xorps %xmm1, %xmm1
  786. xorps %xmm2, %xmm2
  787. xorps %xmm3, %xmm3
  788. #ifdef ALIGNED_ACCESS
  789. cmpq M, MM
  790. je .L2X
  791. #ifdef movsd
  792. xorps %xmm8, %xmm8
  793. #endif
  794. movsd -32 * SIZE(A1), %xmm8
  795. #ifdef movsd
  796. xorps %xmm9, %xmm9
  797. #endif
  798. movsd -32 * SIZE(A2), %xmm9
  799. #ifdef movsd
  800. xorps %xmm12, %xmm12
  801. #endif
  802. movsd -32 * SIZE(X1), %xmm12
  803. pshufd $0xb1, %xmm8, %xmm4
  804. mulps %xmm12, %xmm8
  805. addps %xmm8, %xmm0
  806. mulps %xmm12, %xmm4
  807. SUBPS %xmm4, %xmm1
  808. pshufd $0xb1, %xmm9, %xmm5
  809. mulps %xmm12, %xmm9
  810. addps %xmm9, %xmm2
  811. mulps %xmm12, %xmm5
  812. SUBPS %xmm5, %xmm3
  813. addq $2 * SIZE, A1
  814. addq $2 * SIZE, A2
  815. addq $2 * SIZE, X1
  816. ALIGN_3
  817. .L2X:
  818. #endif
  819. movaps -32 * SIZE(X1), %xmm12
  820. movaps -28 * SIZE(X1), %xmm13
  821. #if (GEMV_UNROLL == 2) && defined(PREFETCHW)
  822. PREFETCHW 3 * SIZE(Y1)
  823. #endif
  824. movq MM, I
  825. sarq $3, I
  826. jle .L25
  827. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  828. MOVUPS_A1(-32 * SIZE, A2, %xmm9)
  829. MOVUPS_A1(-28 * SIZE, A1, %xmm10)
  830. MOVUPS_A1(-28 * SIZE, A2, %xmm11)
  831. decq I
  832. jle .L24
  833. ALIGN_3
  834. .L23:
  835. #ifdef PREFETCH
  836. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  837. #endif
  838. pshufd $0xb1, %xmm8, %xmm4
  839. mulps %xmm12, %xmm8
  840. addps %xmm8, %xmm0
  841. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  842. mulps %xmm12, %xmm4
  843. SUBPS %xmm4, %xmm1
  844. pshufd $0xb1, %xmm9, %xmm5
  845. mulps %xmm12, %xmm9
  846. addps %xmm9, %xmm2
  847. MOVUPS_A1(-24 * SIZE, A2, %xmm9)
  848. mulps %xmm12, %xmm5
  849. SUBPS %xmm5, %xmm3
  850. movaps -24 * SIZE(X1), %xmm12
  851. pshufd $0xb1, %xmm10, %xmm6
  852. mulps %xmm13, %xmm10
  853. addps %xmm10, %xmm0
  854. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  855. mulps %xmm13, %xmm6
  856. SUBPS %xmm6, %xmm1
  857. pshufd $0xb1, %xmm11, %xmm7
  858. mulps %xmm13, %xmm11
  859. addps %xmm11, %xmm2
  860. MOVUPS_A1(-20 * SIZE, A2, %xmm11)
  861. mulps %xmm13, %xmm7
  862. SUBPS %xmm7, %xmm3
  863. movaps -20 * SIZE(X1), %xmm13
  864. #ifdef PREFETCH
  865. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  866. #endif
  867. pshufd $0xb1, %xmm8, %xmm4
  868. mulps %xmm12, %xmm8
  869. addps %xmm8, %xmm0
  870. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  871. mulps %xmm12, %xmm4
  872. SUBPS %xmm4, %xmm1
  873. pshufd $0xb1, %xmm9, %xmm5
  874. mulps %xmm12, %xmm9
  875. addps %xmm9, %xmm2
  876. MOVUPS_A1(-16 * SIZE, A2, %xmm9)
  877. mulps %xmm12, %xmm5
  878. SUBPS %xmm5, %xmm3
  879. movaps -16 * SIZE(X1), %xmm12
  880. #ifdef PREFETCHW
  881. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
  882. #endif
  883. pshufd $0xb1, %xmm10, %xmm6
  884. mulps %xmm13, %xmm10
  885. addps %xmm10, %xmm0
  886. MOVUPS_A1(-12 * SIZE, A1, %xmm10)
  887. mulps %xmm13, %xmm6
  888. SUBPS %xmm6, %xmm1
  889. pshufd $0xb1, %xmm11, %xmm7
  890. mulps %xmm13, %xmm11
  891. addps %xmm11, %xmm2
  892. MOVUPS_A1(-12 * SIZE, A2, %xmm11)
  893. mulps %xmm13, %xmm7
  894. SUBPS %xmm7, %xmm3
  895. movaps -12 * SIZE(X1), %xmm13
  896. subq $-16 * SIZE, A1
  897. subq $-16 * SIZE, A2
  898. subq $-16 * SIZE, X1
  899. subq $1, I
  900. BRANCH
  901. jg .L23
  902. ALIGN_3
  903. .L24:
  904. pshufd $0xb1, %xmm8, %xmm4
  905. mulps %xmm12, %xmm8
  906. addps %xmm8, %xmm0
  907. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  908. mulps %xmm12, %xmm4
  909. SUBPS %xmm4, %xmm1
  910. pshufd $0xb1, %xmm9, %xmm5
  911. mulps %xmm12, %xmm9
  912. addps %xmm9, %xmm2
  913. MOVUPS_A1(-24 * SIZE, A2, %xmm9)
  914. mulps %xmm12, %xmm5
  915. SUBPS %xmm5, %xmm3
  916. movaps -24 * SIZE(X1), %xmm12
  917. pshufd $0xb1, %xmm10, %xmm6
  918. mulps %xmm13, %xmm10
  919. addps %xmm10, %xmm0
  920. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  921. mulps %xmm13, %xmm6
  922. SUBPS %xmm6, %xmm1
  923. pshufd $0xb1, %xmm11, %xmm7
  924. mulps %xmm13, %xmm11
  925. addps %xmm11, %xmm2
  926. MOVUPS_A1(-20 * SIZE, A2, %xmm11)
  927. mulps %xmm13, %xmm7
  928. SUBPS %xmm7, %xmm3
  929. movaps -20 * SIZE(X1), %xmm13
  930. pshufd $0xb1, %xmm8, %xmm4
  931. mulps %xmm12, %xmm8
  932. addps %xmm8, %xmm0
  933. mulps %xmm12, %xmm4
  934. SUBPS %xmm4, %xmm1
  935. pshufd $0xb1, %xmm9, %xmm5
  936. mulps %xmm12, %xmm9
  937. addps %xmm9, %xmm2
  938. mulps %xmm12, %xmm5
  939. SUBPS %xmm5, %xmm3
  940. movaps -16 * SIZE(X1), %xmm12
  941. pshufd $0xb1, %xmm10, %xmm6
  942. mulps %xmm13, %xmm10
  943. addps %xmm10, %xmm0
  944. mulps %xmm13, %xmm6
  945. SUBPS %xmm6, %xmm1
  946. pshufd $0xb1, %xmm11, %xmm7
  947. mulps %xmm13, %xmm11
  948. addps %xmm11, %xmm2
  949. mulps %xmm13, %xmm7
  950. SUBPS %xmm7, %xmm3
  951. movaps -12 * SIZE(X1), %xmm13
  952. subq $-16 * SIZE, A1
  953. subq $-16 * SIZE, A2
  954. subq $-16 * SIZE, X1
  955. ALIGN_3
  956. .L25:
  957. testq $4, MM
  958. je .L27
  959. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  960. MOVUPS_A1(-32 * SIZE, A2, %xmm9)
  961. MOVUPS_A1(-28 * SIZE, A1, %xmm10)
  962. MOVUPS_A1(-28 * SIZE, A2, %xmm11)
  963. pshufd $0xb1, %xmm8, %xmm4
  964. mulps %xmm12, %xmm8
  965. addps %xmm8, %xmm0
  966. mulps %xmm12, %xmm4
  967. SUBPS %xmm4, %xmm1
  968. pshufd $0xb1, %xmm9, %xmm5
  969. mulps %xmm12, %xmm9
  970. addps %xmm9, %xmm2
  971. mulps %xmm12, %xmm5
  972. SUBPS %xmm5, %xmm3
  973. movaps -24 * SIZE(X1), %xmm12
  974. pshufd $0xb1, %xmm10, %xmm6
  975. mulps %xmm13, %xmm10
  976. addps %xmm10, %xmm0
  977. mulps %xmm13, %xmm6
  978. SUBPS %xmm6, %xmm1
  979. pshufd $0xb1, %xmm11, %xmm7
  980. mulps %xmm13, %xmm11
  981. addps %xmm11, %xmm2
  982. mulps %xmm13, %xmm7
  983. SUBPS %xmm7, %xmm3
  984. movaps -20 * SIZE(X1), %xmm13
  985. addq $8 * SIZE, A1
  986. addq $8 * SIZE, A2
  987. ALIGN_3
  988. .L27:
  989. testq $2, MM
  990. je .L28
  991. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  992. MOVUPS_A1(-32 * SIZE, A2, %xmm9)
  993. pshufd $0xb1, %xmm8, %xmm4
  994. mulps %xmm12, %xmm8
  995. addps %xmm8, %xmm0
  996. mulps %xmm12, %xmm4
  997. SUBPS %xmm4, %xmm1
  998. pshufd $0xb1, %xmm9, %xmm5
  999. mulps %xmm12, %xmm9
  1000. addps %xmm9, %xmm2
  1001. mulps %xmm12, %xmm5
  1002. SUBPS %xmm5, %xmm3
  1003. movaps %xmm13, %xmm12
  1004. addq $4 * SIZE, A1
  1005. addq $4 * SIZE, A2
  1006. ALIGN_3
  1007. .L28:
  1008. testq $1, MM
  1009. je .L29
  1010. #ifdef movsd
  1011. xorps %xmm8, %xmm8
  1012. #endif
  1013. movsd -32 * SIZE(A1), %xmm8
  1014. #ifdef movsd
  1015. xorps %xmm9, %xmm9
  1016. #endif
  1017. movsd -32 * SIZE(A2), %xmm9
  1018. pshufd $0xb1, %xmm8, %xmm4
  1019. mulps %xmm12, %xmm8
  1020. addps %xmm8, %xmm0
  1021. mulps %xmm12, %xmm4
  1022. SUBPS %xmm4, %xmm1
  1023. pshufd $0xb1, %xmm9, %xmm5
  1024. mulps %xmm12, %xmm9
  1025. addps %xmm9, %xmm2
  1026. mulps %xmm12, %xmm5
  1027. SUBPS %xmm5, %xmm3
  1028. ALIGN_3
  1029. .L29:
  1030. pcmpeqb %xmm5, %xmm5
  1031. psllq $63, %xmm5
  1032. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  1033. xorps %xmm5, %xmm0
  1034. xorps %xmm5, %xmm2
  1035. #else
  1036. xorps %xmm5, %xmm1
  1037. xorps %xmm5, %xmm3
  1038. #endif
  1039. #ifdef HAVE_SSE3
  1040. haddps %xmm1, %xmm0
  1041. haddps %xmm3, %xmm2
  1042. haddps %xmm2, %xmm0
  1043. #else
  1044. movaps %xmm0, %xmm8
  1045. unpcklps %xmm1, %xmm0
  1046. unpckhps %xmm1, %xmm8
  1047. movaps %xmm2, %xmm4
  1048. unpcklps %xmm3, %xmm2
  1049. unpckhps %xmm3, %xmm4
  1050. addps %xmm8, %xmm0
  1051. addps %xmm4, %xmm2
  1052. movhlps %xmm0, %xmm1
  1053. movhlps %xmm2, %xmm3
  1054. addps %xmm1, %xmm0
  1055. addps %xmm3, %xmm2
  1056. movlhps %xmm2, %xmm0
  1057. #endif
  1058. pshufd $0xb1, %xmm0, %xmm1
  1059. #ifdef HAVE_SSE3
  1060. movddup ALPHA, %xmm15
  1061. #else
  1062. movsd ALPHA, %xmm15
  1063. pshufd $0x44, %xmm15, %xmm15
  1064. #endif
  1065. mulps %xmm15, %xmm0
  1066. mulps %xmm15, %xmm1
  1067. xorps %xmm5, %xmm0
  1068. #ifdef HAVE_SSE3
  1069. haddps %xmm1, %xmm0
  1070. #else
  1071. movaps %xmm0, %xmm2
  1072. shufps $0x88, %xmm1, %xmm0
  1073. shufps $0xdd, %xmm1, %xmm2
  1074. addps %xmm2, %xmm0
  1075. #endif
  1076. movsd (Y), %xmm12
  1077. addq INCY, Y
  1078. movhps (Y), %xmm12
  1079. addq INCY, Y
  1080. shufps $0xd8, %xmm0, %xmm0
  1081. addps %xmm12, %xmm0
  1082. movlps %xmm0, (Y1)
  1083. addq INCY, Y1
  1084. movhps %xmm0, (Y1)
  1085. addq INCY, Y1
  1086. #if GEMV_UNROLL == 2
  1087. cmpq $2, N
  1088. jge .L21
  1089. #endif
  1090. ALIGN_3
  1091. .L30:
  1092. cmpq $1, N
  1093. jl .L999
  1094. leaq 32 * SIZE(BUFFER), X1
  1095. movq A, A1
  1096. xorps %xmm0, %xmm0
  1097. xorps %xmm1, %xmm1
  1098. #ifdef ALIGNED_ACCESS
  1099. cmpq M, MM
  1100. je .L3X
  1101. #ifdef movsd
  1102. xorps %xmm8, %xmm8
  1103. #endif
  1104. movsd -32 * SIZE(A1), %xmm8
  1105. #ifdef movsd
  1106. xorps %xmm12, %xmm12
  1107. #endif
  1108. movsd -32 * SIZE(X1), %xmm12
  1109. pshufd $0xb1, %xmm8, %xmm4
  1110. mulps %xmm12, %xmm8
  1111. addps %xmm8, %xmm0
  1112. mulps %xmm12, %xmm4
  1113. SUBPS %xmm4, %xmm1
  1114. addq $2 * SIZE, A1
  1115. addq $2 * SIZE, X1
  1116. ALIGN_3
  1117. .L3X:
  1118. #endif
  1119. movaps -32 * SIZE(X1), %xmm12
  1120. movaps -28 * SIZE(X1), %xmm13
  1121. movq MM, I
  1122. sarq $3, I
  1123. jle .L35
  1124. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  1125. MOVUPS_A1(-28 * SIZE, A1, %xmm10)
  1126. decq I
  1127. jle .L34
  1128. ALIGN_3
  1129. .L33:
  1130. #ifdef PREFETCH
  1131. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  1132. #endif
  1133. pshufd $0xb1, %xmm8, %xmm4
  1134. mulps %xmm12, %xmm8
  1135. addps %xmm8, %xmm0
  1136. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  1137. mulps %xmm12, %xmm4
  1138. SUBPS %xmm4, %xmm1
  1139. movaps -24 * SIZE(X1), %xmm12
  1140. pshufd $0xb1, %xmm10, %xmm6
  1141. mulps %xmm13, %xmm10
  1142. addps %xmm10, %xmm0
  1143. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  1144. mulps %xmm13, %xmm6
  1145. SUBPS %xmm6, %xmm1
  1146. movaps -20 * SIZE(X1), %xmm13
  1147. pshufd $0xb1, %xmm8, %xmm4
  1148. mulps %xmm12, %xmm8
  1149. addps %xmm8, %xmm0
  1150. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1151. mulps %xmm12, %xmm4
  1152. SUBPS %xmm4, %xmm1
  1153. movaps -16 * SIZE(X1), %xmm12
  1154. #ifdef PREFETCHW
  1155. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
  1156. #endif
  1157. pshufd $0xb1, %xmm10, %xmm6
  1158. mulps %xmm13, %xmm10
  1159. addps %xmm10, %xmm0
  1160. MOVUPS_A1(-12 * SIZE, A1, %xmm10)
  1161. mulps %xmm13, %xmm6
  1162. SUBPS %xmm6, %xmm1
  1163. movaps -12 * SIZE(X1), %xmm13
  1164. subq $-16 * SIZE, A1
  1165. subq $-16 * SIZE, X1
  1166. subq $1, I
  1167. BRANCH
  1168. jg .L33
  1169. ALIGN_3
  1170. .L34:
  1171. pshufd $0xb1, %xmm8, %xmm4
  1172. mulps %xmm12, %xmm8
  1173. addps %xmm8, %xmm0
  1174. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  1175. mulps %xmm12, %xmm4
  1176. SUBPS %xmm4, %xmm1
  1177. movaps -24 * SIZE(X1), %xmm12
  1178. pshufd $0xb1, %xmm10, %xmm6
  1179. mulps %xmm13, %xmm10
  1180. addps %xmm10, %xmm0
  1181. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  1182. mulps %xmm13, %xmm6
  1183. SUBPS %xmm6, %xmm1
  1184. movaps -20 * SIZE(X1), %xmm13
  1185. pshufd $0xb1, %xmm8, %xmm4
  1186. mulps %xmm12, %xmm8
  1187. addps %xmm8, %xmm0
  1188. mulps %xmm12, %xmm4
  1189. SUBPS %xmm4, %xmm1
  1190. movaps -16 * SIZE(X1), %xmm12
  1191. pshufd $0xb1, %xmm10, %xmm6
  1192. mulps %xmm13, %xmm10
  1193. addps %xmm10, %xmm0
  1194. mulps %xmm13, %xmm6
  1195. SUBPS %xmm6, %xmm1
  1196. movaps -12 * SIZE(X1), %xmm13
  1197. subq $-16 * SIZE, A1
  1198. subq $-16 * SIZE, X1
  1199. ALIGN_3
  1200. .L35:
  1201. testq $4, MM
  1202. je .L37
  1203. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  1204. MOVUPS_A1(-28 * SIZE, A1, %xmm10)
  1205. pshufd $0xb1, %xmm8, %xmm4
  1206. mulps %xmm12, %xmm8
  1207. addps %xmm8, %xmm0
  1208. mulps %xmm12, %xmm4
  1209. SUBPS %xmm4, %xmm1
  1210. movaps -24 * SIZE(X1), %xmm12
  1211. pshufd $0xb1, %xmm10, %xmm6
  1212. mulps %xmm13, %xmm10
  1213. addps %xmm10, %xmm0
  1214. mulps %xmm13, %xmm6
  1215. SUBPS %xmm6, %xmm1
  1216. movaps -20 * SIZE(X1), %xmm13
  1217. addq $8 * SIZE, A1
  1218. ALIGN_3
  1219. .L37:
  1220. testq $2, MM
  1221. je .L38
  1222. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  1223. pshufd $0xb1, %xmm8, %xmm4
  1224. mulps %xmm12, %xmm8
  1225. addps %xmm8, %xmm0
  1226. mulps %xmm12, %xmm4
  1227. SUBPS %xmm4, %xmm1
  1228. movaps %xmm13, %xmm12
  1229. addq $4 * SIZE, A1
  1230. ALIGN_3
  1231. .L38:
  1232. testq $1, MM
  1233. je .L39
  1234. #ifdef movsd
  1235. xorps %xmm8, %xmm8
  1236. #endif
  1237. movsd -32 * SIZE(A1), %xmm8
  1238. pshufd $0xb1, %xmm8, %xmm4
  1239. mulps %xmm12, %xmm8
  1240. addps %xmm8, %xmm0
  1241. mulps %xmm12, %xmm4
  1242. SUBPS %xmm4, %xmm1
  1243. ALIGN_3
  1244. .L39:
  1245. pcmpeqb %xmm5, %xmm5
  1246. psllq $63, %xmm5
  1247. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  1248. xorps %xmm5, %xmm0
  1249. #else
  1250. xorps %xmm5, %xmm1
  1251. #endif
  1252. #ifdef HAVE_SSE3
  1253. haddps %xmm1, %xmm0
  1254. haddps %xmm0, %xmm0
  1255. #else
  1256. movaps %xmm0, %xmm8
  1257. unpcklps %xmm1, %xmm0
  1258. unpckhps %xmm1, %xmm8
  1259. addps %xmm8, %xmm0
  1260. movhlps %xmm0, %xmm1
  1261. addps %xmm1, %xmm0
  1262. #endif
  1263. pshufd $0xb1, %xmm0, %xmm1
  1264. #ifdef HAVE_SSE3
  1265. movddup ALPHA, %xmm15
  1266. #else
  1267. movsd ALPHA, %xmm15
  1268. pshufd $0x44, %xmm15, %xmm15
  1269. #endif
  1270. mulps %xmm15, %xmm0
  1271. mulps %xmm15, %xmm1
  1272. xorps %xmm5, %xmm0
  1273. #ifdef HAVE_SSE3
  1274. haddps %xmm1, %xmm0
  1275. #else
  1276. movaps %xmm0, %xmm2
  1277. shufps $0x88, %xmm1, %xmm0
  1278. shufps $0xdd, %xmm1, %xmm2
  1279. addps %xmm2, %xmm0
  1280. #endif
  1281. movsd (Y), %xmm12
  1282. addq INCY, Y
  1283. shufps $0xd8, %xmm0, %xmm0
  1284. addps %xmm12, %xmm0
  1285. movlps %xmm0, (Y1)
  1286. addq INCY, Y1
  1287. #ifdef ALIGNED_ACCESS
  1288. jmp .L999
  1289. ALIGN_3
  1290. .L100:
  1291. #if GEMV_UNROLL >= 4
  1292. cmpq $4, N
  1293. jl .L110
  1294. ALIGN_3
  1295. .L101:
  1296. subq $4, N
  1297. leaq 32 * SIZE(BUFFER), X1
  1298. movq A, A1
  1299. leaq (A1, LDA, 2), A2
  1300. leaq (A1, LDA, 4), A
  1301. xorps %xmm0, %xmm0
  1302. xorps %xmm1, %xmm1
  1303. xorps %xmm2, %xmm2
  1304. xorps %xmm3, %xmm3
  1305. xorps %xmm4, %xmm4
  1306. xorps %xmm5, %xmm5
  1307. xorps %xmm6, %xmm6
  1308. xorps %xmm7, %xmm7
  1309. #ifdef ALIGNED_ACCESS
  1310. cmpq M, MM
  1311. je .L10X
  1312. #ifdef movsd
  1313. xorps %xmm8, %xmm8
  1314. #endif
  1315. movsd -32 * SIZE(A1), %xmm8
  1316. #ifdef movsd
  1317. xorps %xmm9, %xmm9
  1318. #endif
  1319. movsd -32 * SIZE(A1, LDA), %xmm9
  1320. #ifdef movsd
  1321. xorps %xmm10, %xmm10
  1322. #endif
  1323. movsd -32 * SIZE(A2), %xmm10
  1324. #ifdef movsd
  1325. xorps %xmm11, %xmm11
  1326. #endif
  1327. movsd -32 * SIZE(A2, LDA), %xmm11
  1328. #ifdef movsd
  1329. xorps %xmm12, %xmm12
  1330. #endif
  1331. movsd -32 * SIZE(X1), %xmm12
  1332. pshufd $0xb1, %xmm8, %xmm14
  1333. mulps %xmm12, %xmm8
  1334. addps %xmm8, %xmm0
  1335. pshufd $0xb1, %xmm9, %xmm15
  1336. mulps %xmm12, %xmm9
  1337. addps %xmm9, %xmm2
  1338. mulps %xmm12, %xmm14
  1339. SUBPS %xmm14, %xmm1
  1340. mulps %xmm12, %xmm15
  1341. SUBPS %xmm15, %xmm3
  1342. pshufd $0xb1, %xmm10, %xmm14
  1343. mulps %xmm12, %xmm10
  1344. addps %xmm10, %xmm4
  1345. pshufd $0xb1, %xmm11, %xmm15
  1346. mulps %xmm12, %xmm11
  1347. addps %xmm11, %xmm6
  1348. mulps %xmm12, %xmm14
  1349. SUBPS %xmm14, %xmm5
  1350. mulps %xmm12, %xmm15
  1351. SUBPS %xmm15, %xmm7
  1352. addq $2 * SIZE, A1
  1353. addq $2 * SIZE, A2
  1354. addq $2 * SIZE, X1
  1355. ALIGN_3
  1356. .L10X:
  1357. #endif
  1358. movaps -32 * SIZE(X1), %xmm12
  1359. movaps -28 * SIZE(X1), %xmm13
  1360. #ifdef PREFETCHW
  1361. PREFETCHW 7 * SIZE(Y1)
  1362. #endif
  1363. movq MM, I
  1364. sarq $3, I
  1365. jle .L105
  1366. movaps -32 * SIZE(A1), %xmm8
  1367. movsd -32 * SIZE(A1, LDA), %xmm9
  1368. movhps -30 * SIZE(A1, LDA), %xmm9
  1369. movaps -32 * SIZE(A2), %xmm10
  1370. movsd -32 * SIZE(A2, LDA), %xmm11
  1371. movhps -30 * SIZE(A2, LDA), %xmm11
  1372. decq I
  1373. jle .L104
  1374. ALIGN_3
  1375. .L103:
  1376. #ifdef PREFETCH
  1377. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  1378. #endif
  1379. pshufd $0xb1, %xmm8, %xmm14
  1380. mulps %xmm12, %xmm8
  1381. addps %xmm8, %xmm0
  1382. movaps -28 * SIZE(A1), %xmm8
  1383. pshufd $0xb1, %xmm9, %xmm15
  1384. mulps %xmm12, %xmm9
  1385. addps %xmm9, %xmm2
  1386. movsd -28 * SIZE(A1, LDA), %xmm9
  1387. movhps -26 * SIZE(A1, LDA), %xmm9
  1388. mulps %xmm12, %xmm14
  1389. SUBPS %xmm14, %xmm1
  1390. mulps %xmm12, %xmm15
  1391. SUBPS %xmm15, %xmm3
  1392. pshufd $0xb1, %xmm10, %xmm14
  1393. mulps %xmm12, %xmm10
  1394. addps %xmm10, %xmm4
  1395. movaps -28 * SIZE(A2), %xmm10
  1396. pshufd $0xb1, %xmm11, %xmm15
  1397. mulps %xmm12, %xmm11
  1398. addps %xmm11, %xmm6
  1399. movsd -28 * SIZE(A2, LDA), %xmm11
  1400. movhps -26 * SIZE(A2, LDA), %xmm11
  1401. mulps %xmm12, %xmm14
  1402. SUBPS %xmm14, %xmm5
  1403. mulps %xmm12, %xmm15
  1404. movaps -24 * SIZE(X1), %xmm12
  1405. SUBPS %xmm15, %xmm7
  1406. #ifdef PREFETCH
  1407. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  1408. #endif
  1409. pshufd $0xb1, %xmm8, %xmm14
  1410. mulps %xmm13, %xmm8
  1411. addps %xmm8, %xmm0
  1412. movaps -24 * SIZE(A1), %xmm8
  1413. pshufd $0xb1, %xmm9, %xmm15
  1414. mulps %xmm13, %xmm9
  1415. addps %xmm9, %xmm2
  1416. movsd -24 * SIZE(A1, LDA), %xmm9
  1417. movhps -22 * SIZE(A1, LDA), %xmm9
  1418. mulps %xmm13, %xmm14
  1419. SUBPS %xmm14, %xmm1
  1420. mulps %xmm13, %xmm15
  1421. SUBPS %xmm15, %xmm3
  1422. pshufd $0xb1, %xmm10, %xmm14
  1423. mulps %xmm13, %xmm10
  1424. addps %xmm10, %xmm4
  1425. movaps -24 * SIZE(A2), %xmm10
  1426. pshufd $0xb1, %xmm11, %xmm15
  1427. mulps %xmm13, %xmm11
  1428. addps %xmm11, %xmm6
  1429. movsd -24 * SIZE(A2, LDA), %xmm11
  1430. movhps -22 * SIZE(A2, LDA), %xmm11
  1431. mulps %xmm13, %xmm14
  1432. SUBPS %xmm14, %xmm5
  1433. mulps %xmm13, %xmm15
  1434. movaps -20 * SIZE(X1), %xmm13
  1435. SUBPS %xmm15, %xmm7
  1436. #ifdef PREFETCH
  1437. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  1438. #endif
  1439. pshufd $0xb1, %xmm8, %xmm14
  1440. mulps %xmm12, %xmm8
  1441. addps %xmm8, %xmm0
  1442. movaps -20 * SIZE(A1), %xmm8
  1443. pshufd $0xb1, %xmm9, %xmm15
  1444. mulps %xmm12, %xmm9
  1445. addps %xmm9, %xmm2
  1446. movsd -20 * SIZE(A1, LDA), %xmm9
  1447. movhps -18 * SIZE(A1, LDA), %xmm9
  1448. mulps %xmm12, %xmm14
  1449. SUBPS %xmm14, %xmm1
  1450. mulps %xmm12, %xmm15
  1451. SUBPS %xmm15, %xmm3
  1452. pshufd $0xb1, %xmm10, %xmm14
  1453. mulps %xmm12, %xmm10
  1454. addps %xmm10, %xmm4
  1455. movaps -20 * SIZE(A2), %xmm10
  1456. pshufd $0xb1, %xmm11, %xmm15
  1457. mulps %xmm12, %xmm11
  1458. addps %xmm11, %xmm6
  1459. movsd -20 * SIZE(A2, LDA), %xmm11
  1460. movhps -18 * SIZE(A2, LDA), %xmm11
  1461. mulps %xmm12, %xmm14
  1462. SUBPS %xmm14, %xmm5
  1463. mulps %xmm12, %xmm15
  1464. movaps -16 * SIZE(X1), %xmm12
  1465. SUBPS %xmm15, %xmm7
  1466. #ifdef PREFETCH
  1467. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  1468. #endif
  1469. pshufd $0xb1, %xmm8, %xmm14
  1470. mulps %xmm13, %xmm8
  1471. addps %xmm8, %xmm0
  1472. movaps -16 * SIZE(A1), %xmm8
  1473. pshufd $0xb1, %xmm9, %xmm15
  1474. mulps %xmm13, %xmm9
  1475. addps %xmm9, %xmm2
  1476. movsd -16 * SIZE(A1, LDA), %xmm9
  1477. movhps -14 * SIZE(A1, LDA), %xmm9
  1478. mulps %xmm13, %xmm14
  1479. SUBPS %xmm14, %xmm1
  1480. mulps %xmm13, %xmm15
  1481. SUBPS %xmm15, %xmm3
  1482. #ifdef PREFETCHW
  1483. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1)
  1484. #endif
  1485. pshufd $0xb1, %xmm10, %xmm14
  1486. mulps %xmm13, %xmm10
  1487. addps %xmm10, %xmm4
  1488. movaps -16 * SIZE(A2), %xmm10
  1489. pshufd $0xb1, %xmm11, %xmm15
  1490. mulps %xmm13, %xmm11
  1491. addps %xmm11, %xmm6
  1492. movsd -16 * SIZE(A2, LDA), %xmm11
  1493. movhps -14 * SIZE(A2, LDA), %xmm11
  1494. mulps %xmm13, %xmm14
  1495. SUBPS %xmm14, %xmm5
  1496. mulps %xmm13, %xmm15
  1497. movaps -12 * SIZE(X1), %xmm13
  1498. SUBPS %xmm15, %xmm7
  1499. subq $-16 * SIZE, A1
  1500. subq $-16 * SIZE, A2
  1501. subq $-16 * SIZE, X1
  1502. subq $1, I
  1503. BRANCH
  1504. jg .L103
  1505. ALIGN_3
  1506. .L104:
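/* Loop tail: repeat the .L103 arithmetic for the block already loaded,
   without prefetching or fetching past the end of this 8-element group. */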
  1507. pshufd $0xb1, %xmm8, %xmm14
  1508. mulps %xmm12, %xmm8
  1509. addps %xmm8, %xmm0
  1510. movaps -28 * SIZE(A1), %xmm8
  1511. pshufd $0xb1, %xmm9, %xmm15
  1512. mulps %xmm12, %xmm9
  1513. addps %xmm9, %xmm2
  1514. movsd -28 * SIZE(A1, LDA), %xmm9
  1515. movhps -26 * SIZE(A1, LDA), %xmm9
  1516. mulps %xmm12, %xmm14
  1517. SUBPS %xmm14, %xmm1
  1518. mulps %xmm12, %xmm15
  1519. SUBPS %xmm15, %xmm3
  1520. pshufd $0xb1, %xmm10, %xmm14
  1521. mulps %xmm12, %xmm10
  1522. addps %xmm10, %xmm4
  1523. movaps -28 * SIZE(A2), %xmm10
  1524. pshufd $0xb1, %xmm11, %xmm15
  1525. mulps %xmm12, %xmm11
  1526. addps %xmm11, %xmm6
  1527. movsd -28 * SIZE(A2, LDA), %xmm11
  1528. movhps -26 * SIZE(A2, LDA), %xmm11
  1529. mulps %xmm12, %xmm14
  1530. SUBPS %xmm14, %xmm5
  1531. mulps %xmm12, %xmm15
  1532. movaps -24 * SIZE(X1), %xmm12
  1533. SUBPS %xmm15, %xmm7
  1534. pshufd $0xb1, %xmm8, %xmm14
  1535. mulps %xmm13, %xmm8
  1536. addps %xmm8, %xmm0
  1537. movaps -24 * SIZE(A1), %xmm8
  1538. pshufd $0xb1, %xmm9, %xmm15
  1539. mulps %xmm13, %xmm9
  1540. addps %xmm9, %xmm2
  1541. movsd -24 * SIZE(A1, LDA), %xmm9
  1542. movhps -22 * SIZE(A1, LDA), %xmm9
  1543. mulps %xmm13, %xmm14
  1544. SUBPS %xmm14, %xmm1
  1545. mulps %xmm13, %xmm15
  1546. SUBPS %xmm15, %xmm3
  1547. pshufd $0xb1, %xmm10, %xmm14
  1548. mulps %xmm13, %xmm10
  1549. addps %xmm10, %xmm4
  1550. movaps -24 * SIZE(A2), %xmm10
  1551. pshufd $0xb1, %xmm11, %xmm15
  1552. mulps %xmm13, %xmm11
  1553. addps %xmm11, %xmm6
  1554. movsd -24 * SIZE(A2, LDA), %xmm11
  1555. movhps -22 * SIZE(A2, LDA), %xmm11
  1556. mulps %xmm13, %xmm14
  1557. SUBPS %xmm14, %xmm5
  1558. mulps %xmm13, %xmm15
  1559. movaps -20 * SIZE(X1), %xmm13
  1560. SUBPS %xmm15, %xmm7
  1561. pshufd $0xb1, %xmm8, %xmm14
  1562. mulps %xmm12, %xmm8
  1563. addps %xmm8, %xmm0
  1564. movaps -20 * SIZE(A1), %xmm8
  1565. pshufd $0xb1, %xmm9, %xmm15
  1566. mulps %xmm12, %xmm9
  1567. addps %xmm9, %xmm2
  1568. movsd -20 * SIZE(A1, LDA), %xmm9
  1569. movhps -18 * SIZE(A1, LDA), %xmm9
  1570. mulps %xmm12, %xmm14
  1571. SUBPS %xmm14, %xmm1
  1572. mulps %xmm12, %xmm15
  1573. SUBPS %xmm15, %xmm3
  1574. pshufd $0xb1, %xmm10, %xmm14
  1575. mulps %xmm12, %xmm10
  1576. addps %xmm10, %xmm4
  1577. movaps -20 * SIZE(A2), %xmm10
  1578. pshufd $0xb1, %xmm11, %xmm15
  1579. mulps %xmm12, %xmm11
  1580. addps %xmm11, %xmm6
  1581. movsd -20 * SIZE(A2, LDA), %xmm11
  1582. movhps -18 * SIZE(A2, LDA), %xmm11
  1583. mulps %xmm12, %xmm14
  1584. SUBPS %xmm14, %xmm5
  1585. mulps %xmm12, %xmm15
  1586. movaps -16 * SIZE(X1), %xmm12
  1587. SUBPS %xmm15, %xmm7
  1588. pshufd $0xb1, %xmm8, %xmm14
  1589. mulps %xmm13, %xmm8
  1590. addps %xmm8, %xmm0
  1591. pshufd $0xb1, %xmm9, %xmm15
  1592. mulps %xmm13, %xmm9
  1593. addps %xmm9, %xmm2
  1594. mulps %xmm13, %xmm14
  1595. SUBPS %xmm14, %xmm1
  1596. mulps %xmm13, %xmm15
  1597. SUBPS %xmm15, %xmm3
  1598. pshufd $0xb1, %xmm10, %xmm14
  1599. mulps %xmm13, %xmm10
  1600. addps %xmm10, %xmm4
  1601. pshufd $0xb1, %xmm11, %xmm15
  1602. mulps %xmm13, %xmm11
  1603. addps %xmm11, %xmm6
  1604. mulps %xmm13, %xmm14
  1605. SUBPS %xmm14, %xmm5
  1606. mulps %xmm13, %xmm15
  1607. movaps -12 * SIZE(X1), %xmm13
  1608. SUBPS %xmm15, %xmm7
  1609. subq $-16 * SIZE, A1
  1610. subq $-16 * SIZE, A2
  1611. subq $-16 * SIZE, X1
  1612. ALIGN_3
  1613. .L105:
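/* Remainder of MM modulo 8: a block of 4 (.L105), a block of 2 (.L107),
   and a final single complex element (.L108). */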
  1614. testq $4, MM
  1615. je .L107
  1616. movaps -32 * SIZE(A1), %xmm8
  1617. movsd -32 * SIZE(A1, LDA), %xmm9
  1618. movhps -30 * SIZE(A1, LDA), %xmm9
  1619. movaps -32 * SIZE(A2), %xmm10
  1620. movsd -32 * SIZE(A2, LDA), %xmm11
  1621. movhps -30 * SIZE(A2, LDA), %xmm11
  1622. pshufd $0xb1, %xmm8, %xmm14
  1623. mulps %xmm12, %xmm8
  1624. addps %xmm8, %xmm0
  1625. movaps -28 * SIZE(A1), %xmm8
  1626. pshufd $0xb1, %xmm9, %xmm15
  1627. mulps %xmm12, %xmm9
  1628. addps %xmm9, %xmm2
  1629. movsd -28 * SIZE(A1, LDA), %xmm9
  1630. movhps -26 * SIZE(A1, LDA), %xmm9
  1631. mulps %xmm12, %xmm14
  1632. SUBPS %xmm14, %xmm1
  1633. mulps %xmm12, %xmm15
  1634. SUBPS %xmm15, %xmm3
  1635. pshufd $0xb1, %xmm10, %xmm14
  1636. mulps %xmm12, %xmm10
  1637. addps %xmm10, %xmm4
  1638. movaps -28 * SIZE(A2), %xmm10
  1639. pshufd $0xb1, %xmm11, %xmm15
  1640. mulps %xmm12, %xmm11
  1641. addps %xmm11, %xmm6
  1642. movsd -28 * SIZE(A2, LDA), %xmm11
  1643. movhps -26 * SIZE(A2, LDA), %xmm11
  1644. mulps %xmm12, %xmm14
  1645. SUBPS %xmm14, %xmm5
  1646. mulps %xmm12, %xmm15
  1647. movaps -24 * SIZE(X1), %xmm12
  1648. SUBPS %xmm15, %xmm7
  1649. pshufd $0xb1, %xmm8, %xmm14
  1650. mulps %xmm13, %xmm8
  1651. addps %xmm8, %xmm0
  1652. pshufd $0xb1, %xmm9, %xmm15
  1653. mulps %xmm13, %xmm9
  1654. addps %xmm9, %xmm2
  1655. mulps %xmm13, %xmm14
  1656. SUBPS %xmm14, %xmm1
  1657. mulps %xmm13, %xmm15
  1658. SUBPS %xmm15, %xmm3
  1659. pshufd $0xb1, %xmm10, %xmm14
  1660. mulps %xmm13, %xmm10
  1661. addps %xmm10, %xmm4
  1662. pshufd $0xb1, %xmm11, %xmm15
  1663. mulps %xmm13, %xmm11
  1664. addps %xmm11, %xmm6
  1665. mulps %xmm13, %xmm14
  1666. SUBPS %xmm14, %xmm5
  1667. mulps %xmm13, %xmm15
  1668. movaps -20 * SIZE(X1), %xmm13
  1669. SUBPS %xmm15, %xmm7
  1670. addq $8 * SIZE, A1
  1671. addq $8 * SIZE, A2
  1672. addq $8 * SIZE, X1
  1673. ALIGN_3
  1674. .L107:
  1675. testq $2, MM
  1676. je .L108
  1677. movaps -32 * SIZE(A1), %xmm8
  1678. movsd -32 * SIZE(A1, LDA), %xmm9
  1679. movhps -30 * SIZE(A1, LDA), %xmm9
  1680. movaps -32 * SIZE(A2), %xmm10
  1681. movsd -32 * SIZE(A2, LDA), %xmm11
  1682. movhps -30 * SIZE(A2, LDA), %xmm11
  1683. pshufd $0xb1, %xmm8, %xmm14
  1684. mulps %xmm12, %xmm8
  1685. addps %xmm8, %xmm0
  1686. pshufd $0xb1, %xmm9, %xmm15
  1687. mulps %xmm12, %xmm9
  1688. addps %xmm9, %xmm2
  1689. mulps %xmm12, %xmm14
  1690. SUBPS %xmm14, %xmm1
  1691. mulps %xmm12, %xmm15
  1692. SUBPS %xmm15, %xmm3
  1693. pshufd $0xb1, %xmm10, %xmm14
  1694. mulps %xmm12, %xmm10
  1695. addps %xmm10, %xmm4
  1696. pshufd $0xb1, %xmm11, %xmm15
  1697. mulps %xmm12, %xmm11
  1698. addps %xmm11, %xmm6
  1699. mulps %xmm12, %xmm14
  1700. SUBPS %xmm14, %xmm5
  1701. mulps %xmm12, %xmm15
  1702. SUBPS %xmm15, %xmm7
  1703. movaps %xmm13, %xmm12
  1704. addq $4 * SIZE, A1
  1705. addq $4 * SIZE, A2
  1706. ALIGN_3
  1707. .L108:
  1708. testq $1, MM
  1709. je .L109
  1710. #ifdef movsd
  1711. xorps %xmm8, %xmm8
  1712. #endif
  1713. movsd -32 * SIZE(A1), %xmm8
  1714. #ifdef movsd
  1715. xorps %xmm9, %xmm9
  1716. #endif
  1717. movsd -32 * SIZE(A1, LDA), %xmm9
  1718. #ifdef movsd
  1719. xorps %xmm10, %xmm10
  1720. #endif
  1721. movsd -32 * SIZE(A2), %xmm10
  1722. #ifdef movsd
  1723. xorps %xmm11, %xmm11
  1724. #endif
  1725. movsd -32 * SIZE(A2, LDA), %xmm11
  1726. pshufd $0xb1, %xmm8, %xmm14
  1727. mulps %xmm12, %xmm8
  1728. addps %xmm8, %xmm0
  1729. pshufd $0xb1, %xmm9, %xmm15
  1730. mulps %xmm12, %xmm9
  1731. addps %xmm9, %xmm2
  1732. mulps %xmm12, %xmm14
  1733. SUBPS %xmm14, %xmm1
  1734. mulps %xmm12, %xmm15
  1735. SUBPS %xmm15, %xmm3
  1736. pshufd $0xb1, %xmm10, %xmm14
  1737. mulps %xmm12, %xmm10
  1738. addps %xmm10, %xmm4
  1739. pshufd $0xb1, %xmm11, %xmm15
  1740. mulps %xmm12, %xmm11
  1741. addps %xmm11, %xmm6
  1742. mulps %xmm12, %xmm14
  1743. SUBPS %xmm14, %xmm5
  1744. mulps %xmm12, %xmm15
  1745. SUBPS %xmm15, %xmm7
  1746. ALIGN_3
  1747. .L109:
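/* Reduction: xmm11 becomes a sign mask (0x80000000 in the odd lanes) that
   applies the CONJ/XCONJ convention; the partial products are then summed
   horizontally (haddps with SSE3, unpack/add otherwise), scaled by the
   complex ALPHA and added to four complex elements of Y (stride INCY). */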
  1748. pcmpeqb %xmm11, %xmm11
  1749. psllq $63, %xmm11
  1750. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  1751. xorps %xmm11, %xmm0
  1752. xorps %xmm11, %xmm2
  1753. xorps %xmm11, %xmm4
  1754. xorps %xmm11, %xmm6
  1755. #else
  1756. xorps %xmm11, %xmm1
  1757. xorps %xmm11, %xmm3
  1758. xorps %xmm11, %xmm5
  1759. xorps %xmm11, %xmm7
  1760. #endif
  1761. #ifdef HAVE_SSE3
  1762. haddps %xmm1, %xmm0
  1763. haddps %xmm3, %xmm2
  1764. haddps %xmm2, %xmm0
  1765. haddps %xmm5, %xmm4
  1766. haddps %xmm7, %xmm6
  1767. haddps %xmm6, %xmm4
  1768. #else
  1769. movaps %xmm0, %xmm8
  1770. unpcklps %xmm1, %xmm0
  1771. unpckhps %xmm1, %xmm8
  1772. movaps %xmm2, %xmm9
  1773. unpcklps %xmm3, %xmm2
  1774. unpckhps %xmm3, %xmm9
  1775. movaps %xmm4, %xmm10
  1776. unpcklps %xmm5, %xmm4
  1777. unpckhps %xmm5, %xmm10
  1778. movaps %xmm6, %xmm11
  1779. unpcklps %xmm7, %xmm6
  1780. unpckhps %xmm7, %xmm11
  1781. addps %xmm8, %xmm0
  1782. addps %xmm9, %xmm2
  1783. addps %xmm10, %xmm4
  1784. addps %xmm11, %xmm6
  1785. movhlps %xmm0, %xmm1
  1786. movhlps %xmm2, %xmm3
  1787. movhlps %xmm4, %xmm5
  1788. movhlps %xmm6, %xmm7
  1789. addps %xmm1, %xmm0
  1790. addps %xmm3, %xmm2
  1791. addps %xmm5, %xmm4
  1792. addps %xmm7, %xmm6
  1793. movlhps %xmm2, %xmm0
  1794. movlhps %xmm6, %xmm4
  1795. #endif
  1796. pshufd $0xb1, %xmm0, %xmm1
  1797. pshufd $0xb1, %xmm4, %xmm5
  1798. #ifdef HAVE_SSE3
  1799. movddup ALPHA, %xmm15
  1800. #else
  1801. movsd ALPHA, %xmm15
  1802. pshufd $0x44, %xmm15, %xmm15
  1803. #endif
  1804. mulps %xmm15, %xmm0
  1805. mulps %xmm15, %xmm1
  1806. mulps %xmm15, %xmm4
  1807. mulps %xmm15, %xmm5
  1808. xorps %xmm11, %xmm0
  1809. xorps %xmm11, %xmm4
  1810. #ifdef HAVE_SSE3
  1811. haddps %xmm1, %xmm0
  1812. haddps %xmm5, %xmm4
  1813. #else
  1814. movaps %xmm0, %xmm2
  1815. shufps $0x88, %xmm1, %xmm0
  1816. shufps $0xdd, %xmm1, %xmm2
  1817. movaps %xmm4, %xmm6
  1818. shufps $0x88, %xmm5, %xmm4
  1819. shufps $0xdd, %xmm5, %xmm6
  1820. addps %xmm2, %xmm0
  1821. addps %xmm6, %xmm4
  1822. #endif
  1823. movsd (Y), %xmm2
  1824. addq INCY, Y
  1825. movhps (Y), %xmm2
  1826. addq INCY, Y
  1827. movsd (Y), %xmm6
  1828. addq INCY, Y
  1829. movhps (Y), %xmm6
  1830. addq INCY, Y
  1831. shufps $0xd8, %xmm0, %xmm0
  1832. shufps $0xd8, %xmm4, %xmm4
  1833. addps %xmm2, %xmm0
  1834. addps %xmm6, %xmm4
  1835. movlps %xmm0, (Y1)
  1836. addq INCY, Y1
  1837. movhps %xmm0, (Y1)
  1838. addq INCY, Y1
  1839. movlps %xmm4, (Y1)
  1840. addq INCY, Y1
  1841. movhps %xmm4, (Y1)
  1842. addq INCY, Y1
  1843. cmpq $4, N
  1844. jge .L101
  1845. ALIGN_3
  1846. .L110:
  1847. #endif
  1848. cmpq $2, N
  1849. jl .L120
  1850. #if GEMV_UNROLL == 2
  1851. ALIGN_3
  1852. .L111:
  1853. #endif
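/* Two-column slice: A1 and A2 = A1 + LDA share the X buffer, with the
   accumulator pairs xmm0/xmm1 and xmm2/xmm3; A then advances by two
   columns. */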
  1854. subq $2, N
  1855. leaq 32 * SIZE(BUFFER), X1
  1856. movq A, A1
  1857. leaq (A1, LDA), A2
  1858. leaq (A1, LDA, 2), A
  1859. xorps %xmm0, %xmm0
  1860. xorps %xmm1, %xmm1
  1861. xorps %xmm2, %xmm2
  1862. xorps %xmm3, %xmm3
  1863. #ifdef ALIGNED_ACCESS
  1864. cmpq M, MM
  1865. je .L11X
  1866. #ifdef movsd
  1867. xorps %xmm8, %xmm8
  1868. #endif
  1869. movsd -32 * SIZE(A1), %xmm8
  1870. #ifdef movsd
  1871. xorps %xmm9, %xmm9
  1872. #endif
  1873. movsd -32 * SIZE(A2), %xmm9
  1874. #ifdef movsd
  1875. xorps %xmm12, %xmm12
  1876. #endif
  1877. movsd -32 * SIZE(X1), %xmm12
  1878. pshufd $0xb1, %xmm8, %xmm4
  1879. mulps %xmm12, %xmm8
  1880. addps %xmm8, %xmm0
  1881. mulps %xmm12, %xmm4
  1882. SUBPS %xmm4, %xmm1
  1883. pshufd $0xb1, %xmm9, %xmm5
  1884. mulps %xmm12, %xmm9
  1885. addps %xmm9, %xmm2
  1886. mulps %xmm12, %xmm5
  1887. SUBPS %xmm5, %xmm3
  1888. addq $2 * SIZE, A1
  1889. addq $2 * SIZE, A2
  1890. addq $2 * SIZE, X1
  1891. ALIGN_3
  1892. .L11X:
  1893. #endif
  1894. movaps -32 * SIZE(X1), %xmm12
  1895. movaps -28 * SIZE(X1), %xmm13
  1896. #if (GEMV_UNROLL == 2) && defined(PREFETCHW)
  1897. PREFETCHW 3 * SIZE(Y1)
  1898. #endif
  1899. movq MM, I
  1900. sarq $3, I
  1901. jle .L115
  1902. movaps -32 * SIZE(A1), %xmm8
  1903. movsd -32 * SIZE(A2), %xmm9
  1904. movhps -30 * SIZE(A2), %xmm9
  1905. movaps -28 * SIZE(A1), %xmm10
  1906. movsd -28 * SIZE(A2), %xmm11
  1907. movhps -26 * SIZE(A2), %xmm11
  1908. decq I
  1909. jle .L114
  1910. ALIGN_3
  1911. .L113:
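/* Two-column main loop, 8 complex elements per iteration.  A1 is read
   with aligned movaps; A2 uses movsd/movhps since its alignment is not
   guaranteed here. */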
  1912. #ifdef PREFETCH
  1913. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  1914. #endif
  1915. pshufd $0xb1, %xmm8, %xmm4
  1916. mulps %xmm12, %xmm8
  1917. addps %xmm8, %xmm0
  1918. movaps -24 * SIZE(A1), %xmm8
  1919. mulps %xmm12, %xmm4
  1920. SUBPS %xmm4, %xmm1
  1921. pshufd $0xb1, %xmm9, %xmm5
  1922. mulps %xmm12, %xmm9
  1923. addps %xmm9, %xmm2
  1924. movsd -24 * SIZE(A2), %xmm9
  1925. movhps -22 * SIZE(A2), %xmm9
  1926. mulps %xmm12, %xmm5
  1927. SUBPS %xmm5, %xmm3
  1928. movaps -24 * SIZE(X1), %xmm12
  1929. pshufd $0xb1, %xmm10, %xmm6
  1930. mulps %xmm13, %xmm10
  1931. addps %xmm10, %xmm0
  1932. movaps -20 * SIZE(A1), %xmm10
  1933. mulps %xmm13, %xmm6
  1934. SUBPS %xmm6, %xmm1
  1935. pshufd $0xb1, %xmm11, %xmm7
  1936. mulps %xmm13, %xmm11
  1937. addps %xmm11, %xmm2
  1938. movsd -20 * SIZE(A2), %xmm11
  1939. movhps -18 * SIZE(A2), %xmm11
  1940. mulps %xmm13, %xmm7
  1941. SUBPS %xmm7, %xmm3
  1942. movaps -20 * SIZE(X1), %xmm13
  1943. #ifdef PREFETCH
  1944. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  1945. #endif
  1946. pshufd $0xb1, %xmm8, %xmm4
  1947. mulps %xmm12, %xmm8
  1948. addps %xmm8, %xmm0
  1949. movaps -16 * SIZE(A1), %xmm8
  1950. mulps %xmm12, %xmm4
  1951. SUBPS %xmm4, %xmm1
  1952. pshufd $0xb1, %xmm9, %xmm5
  1953. mulps %xmm12, %xmm9
  1954. addps %xmm9, %xmm2
  1955. movsd -16 * SIZE(A2), %xmm9
  1956. movhps -14 * SIZE(A2), %xmm9
  1957. mulps %xmm12, %xmm5
  1958. SUBPS %xmm5, %xmm3
  1959. movaps -16 * SIZE(X1), %xmm12
  1960. #ifdef PREFETCHW
  1961. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
  1962. #endif
  1963. pshufd $0xb1, %xmm10, %xmm6
  1964. mulps %xmm13, %xmm10
  1965. addps %xmm10, %xmm0
  1966. movaps -12 * SIZE(A1), %xmm10
  1967. mulps %xmm13, %xmm6
  1968. SUBPS %xmm6, %xmm1
  1969. pshufd $0xb1, %xmm11, %xmm7
  1970. mulps %xmm13, %xmm11
  1971. addps %xmm11, %xmm2
  1972. movsd -12 * SIZE(A2), %xmm11
  1973. movhps -10 * SIZE(A2), %xmm11
  1974. mulps %xmm13, %xmm7
  1975. SUBPS %xmm7, %xmm3
  1976. movaps -12 * SIZE(X1), %xmm13
  1977. subq $-16 * SIZE, A1
  1978. subq $-16 * SIZE, A2
  1979. subq $-16 * SIZE, X1
  1980. subq $1, I
  1981. BRANCH
  1982. jg .L113
  1983. ALIGN_3
  1984. .L114:
  1985. pshufd $0xb1, %xmm8, %xmm4
  1986. mulps %xmm12, %xmm8
  1987. addps %xmm8, %xmm0
  1988. movaps -24 * SIZE(A1), %xmm8
  1989. mulps %xmm12, %xmm4
  1990. SUBPS %xmm4, %xmm1
  1991. pshufd $0xb1, %xmm9, %xmm5
  1992. mulps %xmm12, %xmm9
  1993. addps %xmm9, %xmm2
  1994. movsd -24 * SIZE(A2), %xmm9
  1995. movhps -22 * SIZE(A2), %xmm9
  1996. mulps %xmm12, %xmm5
  1997. SUBPS %xmm5, %xmm3
  1998. movaps -24 * SIZE(X1), %xmm12
  1999. pshufd $0xb1, %xmm10, %xmm6
  2000. mulps %xmm13, %xmm10
  2001. addps %xmm10, %xmm0
  2002. movaps -20 * SIZE(A1), %xmm10
  2003. mulps %xmm13, %xmm6
  2004. SUBPS %xmm6, %xmm1
  2005. pshufd $0xb1, %xmm11, %xmm7
  2006. mulps %xmm13, %xmm11
  2007. addps %xmm11, %xmm2
  2008. movsd -20 * SIZE(A2), %xmm11
  2009. movhps -18 * SIZE(A2), %xmm11
  2010. mulps %xmm13, %xmm7
  2011. SUBPS %xmm7, %xmm3
  2012. movaps -20 * SIZE(X1), %xmm13
  2013. pshufd $0xb1, %xmm8, %xmm4
  2014. mulps %xmm12, %xmm8
  2015. addps %xmm8, %xmm0
  2016. mulps %xmm12, %xmm4
  2017. SUBPS %xmm4, %xmm1
  2018. pshufd $0xb1, %xmm9, %xmm5
  2019. mulps %xmm12, %xmm9
  2020. addps %xmm9, %xmm2
  2021. mulps %xmm12, %xmm5
  2022. SUBPS %xmm5, %xmm3
  2023. movaps -16 * SIZE(X1), %xmm12
  2024. pshufd $0xb1, %xmm10, %xmm6
  2025. mulps %xmm13, %xmm10
  2026. addps %xmm10, %xmm0
  2027. mulps %xmm13, %xmm6
  2028. SUBPS %xmm6, %xmm1
  2029. pshufd $0xb1, %xmm11, %xmm7
  2030. mulps %xmm13, %xmm11
  2031. addps %xmm11, %xmm2
  2032. mulps %xmm13, %xmm7
  2033. SUBPS %xmm7, %xmm3
  2034. movaps -12 * SIZE(X1), %xmm13
  2035. subq $-16 * SIZE, A1
  2036. subq $-16 * SIZE, A2
  2037. subq $-16 * SIZE, X1
  2038. ALIGN_3
  2039. .L115:
  2040. testq $4, MM
  2041. je .L117
  2042. movaps -32 * SIZE(A1), %xmm8
  2043. movsd -32 * SIZE(A2), %xmm9
  2044. movhps -30 * SIZE(A2), %xmm9
  2045. movaps -28 * SIZE(A1), %xmm10
  2046. movsd -28 * SIZE(A2), %xmm11
  2047. movhps -26 * SIZE(A2), %xmm11
  2048. pshufd $0xb1, %xmm8, %xmm4
  2049. mulps %xmm12, %xmm8
  2050. addps %xmm8, %xmm0
  2051. mulps %xmm12, %xmm4
  2052. SUBPS %xmm4, %xmm1
  2053. pshufd $0xb1, %xmm9, %xmm5
  2054. mulps %xmm12, %xmm9
  2055. addps %xmm9, %xmm2
  2056. mulps %xmm12, %xmm5
  2057. SUBPS %xmm5, %xmm3
  2058. movaps -24 * SIZE(X1), %xmm12
  2059. pshufd $0xb1, %xmm10, %xmm6
  2060. mulps %xmm13, %xmm10
  2061. addps %xmm10, %xmm0
  2062. mulps %xmm13, %xmm6
  2063. SUBPS %xmm6, %xmm1
  2064. pshufd $0xb1, %xmm11, %xmm7
  2065. mulps %xmm13, %xmm11
  2066. addps %xmm11, %xmm2
  2067. mulps %xmm13, %xmm7
  2068. SUBPS %xmm7, %xmm3
  2069. movaps -20 * SIZE(X1), %xmm13
  2070. addq $8 * SIZE, A1
  2071. addq $8 * SIZE, A2
  2072. ALIGN_3
  2073. .L117:
  2074. testq $2, MM
  2075. je .L118
  2076. movaps -32 * SIZE(A1), %xmm8
  2077. movsd -32 * SIZE(A2), %xmm9
  2078. movhps -30 * SIZE(A2), %xmm9
  2079. pshufd $0xb1, %xmm8, %xmm4
  2080. mulps %xmm12, %xmm8
  2081. addps %xmm8, %xmm0
  2082. mulps %xmm12, %xmm4
  2083. SUBPS %xmm4, %xmm1
  2084. pshufd $0xb1, %xmm9, %xmm5
  2085. mulps %xmm12, %xmm9
  2086. addps %xmm9, %xmm2
  2087. mulps %xmm12, %xmm5
  2088. SUBPS %xmm5, %xmm3
  2089. movaps %xmm13, %xmm12
  2090. addq $4 * SIZE, A1
  2091. addq $4 * SIZE, A2
  2092. ALIGN_3
  2093. .L118:
  2094. testq $1, MM
  2095. je .L119
  2096. #ifdef movsd
  2097. xorps %xmm8, %xmm8
  2098. #endif
  2099. movsd -32 * SIZE(A1), %xmm8
  2100. #ifdef movsd
  2101. xorps %xmm9, %xmm9
  2102. #endif
  2103. movsd -32 * SIZE(A2), %xmm9
  2104. pshufd $0xb1, %xmm8, %xmm4
  2105. mulps %xmm12, %xmm8
  2106. addps %xmm8, %xmm0
  2107. mulps %xmm12, %xmm4
  2108. SUBPS %xmm4, %xmm1
  2109. pshufd $0xb1, %xmm9, %xmm5
  2110. mulps %xmm12, %xmm9
  2111. addps %xmm9, %xmm2
  2112. mulps %xmm12, %xmm5
  2113. SUBPS %xmm5, %xmm3
  2114. ALIGN_3
  2115. .L119:
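/* Same reduction scheme as .L109, for two columns: conjugation mask,
   horizontal sums, ALPHA scaling, then two complex updates of Y. */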
  2116. pcmpeqb %xmm5, %xmm5
  2117. psllq $63, %xmm5
  2118. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  2119. xorps %xmm5, %xmm0
  2120. xorps %xmm5, %xmm2
  2121. #else
  2122. xorps %xmm5, %xmm1
  2123. xorps %xmm5, %xmm3
  2124. #endif
  2125. #ifdef HAVE_SSE3
  2126. haddps %xmm1, %xmm0
  2127. haddps %xmm3, %xmm2
  2128. haddps %xmm2, %xmm0
  2129. #else
  2130. movaps %xmm0, %xmm8
  2131. unpcklps %xmm1, %xmm0
  2132. unpckhps %xmm1, %xmm8
  2133. movaps %xmm2, %xmm4
  2134. unpcklps %xmm3, %xmm2
  2135. unpckhps %xmm3, %xmm4
  2136. addps %xmm8, %xmm0
  2137. addps %xmm4, %xmm2
  2138. movhlps %xmm0, %xmm1
  2139. movhlps %xmm2, %xmm3
  2140. addps %xmm1, %xmm0
  2141. addps %xmm3, %xmm2
  2142. movlhps %xmm2, %xmm0
  2143. #endif
  2144. pshufd $0xb1, %xmm0, %xmm1
  2145. #ifdef HAVE_SSE3
  2146. movddup ALPHA, %xmm15
  2147. #else
  2148. movsd ALPHA, %xmm15
  2149. pshufd $0x44, %xmm15, %xmm15
  2150. #endif
  2151. mulps %xmm15, %xmm0
  2152. mulps %xmm15, %xmm1
  2153. xorps %xmm5, %xmm0
  2154. #ifdef HAVE_SSE3
  2155. haddps %xmm1, %xmm0
  2156. #else
  2157. movaps %xmm0, %xmm2
  2158. shufps $0x88, %xmm1, %xmm0
  2159. shufps $0xdd, %xmm1, %xmm2
  2160. addps %xmm2, %xmm0
  2161. #endif
  2162. movsd (Y), %xmm12
  2163. addq INCY, Y
  2164. movhps (Y), %xmm12
  2165. addq INCY, Y
  2166. shufps $0xd8, %xmm0, %xmm0
  2167. addps %xmm12, %xmm0
  2168. movlps %xmm0, (Y1)
  2169. addq INCY, Y1
  2170. movhps %xmm0, (Y1)
  2171. addq INCY, Y1
  2172. #if GEMV_UNROLL == 2
  2173. cmpq $2, N
  2174. jge .L111
  2175. #endif
  2176. ALIGN_3
  2177. .L120:
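/* Last remaining column (N == 1): only the xmm0/xmm1 accumulator pair is
   used. */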
  2178. cmpq $1, N
  2179. jl .L999
  2180. leaq 32 * SIZE(BUFFER), X1
  2181. movq A, A1
  2182. xorps %xmm0, %xmm0
  2183. xorps %xmm1, %xmm1
  2184. #ifdef ALIGNED_ACCESS
  2185. cmpq M, MM
  2186. je .L12X
  2187. #ifdef movsd
  2188. xorps %xmm8, %xmm8
  2189. #endif
  2190. movsd -32 * SIZE(A1), %xmm8
  2191. #ifdef movsd
  2192. xorps %xmm12, %xmm12
  2193. #endif
  2194. movsd -32 * SIZE(X1), %xmm12
  2195. pshufd $0xb1, %xmm8, %xmm4
  2196. mulps %xmm12, %xmm8
  2197. addps %xmm8, %xmm0
  2198. mulps %xmm12, %xmm4
  2199. SUBPS %xmm4, %xmm1
  2200. addq $2 * SIZE, A1
  2201. addq $2 * SIZE, X1
  2202. ALIGN_3
  2203. .L12X:
  2204. #endif
  2205. movaps -32 * SIZE(X1), %xmm12
  2206. movaps -28 * SIZE(X1), %xmm13
  2207. movq MM, I
  2208. sarq $3, I
  2209. jle .L125
  2210. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  2211. MOVUPS_A1(-28 * SIZE, A1, %xmm10)
  2212. decq I
  2213. jle .L124
  2214. ALIGN_3
  2215. .L123:
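/* Single-column main loop: 8 complex elements per iteration, A loaded
   through MOVUPS_A1 and X through aligned movaps. */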
  2216. #ifdef PREFETCH
  2217. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  2218. #endif
  2219. pshufd $0xb1, %xmm8, %xmm4
  2220. mulps %xmm12, %xmm8
  2221. addps %xmm8, %xmm0
  2222. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  2223. mulps %xmm12, %xmm4
  2224. SUBPS %xmm4, %xmm1
  2225. movaps -24 * SIZE(X1), %xmm12
  2226. pshufd $0xb1, %xmm10, %xmm6
  2227. mulps %xmm13, %xmm10
  2228. addps %xmm10, %xmm0
  2229. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  2230. mulps %xmm13, %xmm6
  2231. SUBPS %xmm6, %xmm1
  2232. movaps -20 * SIZE(X1), %xmm13
  2233. pshufd $0xb1, %xmm8, %xmm4
  2234. mulps %xmm12, %xmm8
  2235. addps %xmm8, %xmm0
  2236. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  2237. mulps %xmm12, %xmm4
  2238. SUBPS %xmm4, %xmm1
  2239. movaps -16 * SIZE(X1), %xmm12
  2240. #ifdef PREFETCHW
  2241. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
  2242. #endif
  2243. pshufd $0xb1, %xmm10, %xmm6
  2244. mulps %xmm13, %xmm10
  2245. addps %xmm10, %xmm0
  2246. MOVUPS_A1(-12 * SIZE, A1, %xmm10)
  2247. mulps %xmm13, %xmm6
  2248. SUBPS %xmm6, %xmm1
  2249. movaps -12 * SIZE(X1), %xmm13
  2250. subq $-16 * SIZE, A1
  2251. subq $-16 * SIZE, X1
  2252. subq $1, I
  2253. BRANCH
  2254. jg .L123
  2255. ALIGN_3
  2256. .L124:
  2257. pshufd $0xb1, %xmm8, %xmm4
  2258. mulps %xmm12, %xmm8
  2259. addps %xmm8, %xmm0
  2260. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  2261. mulps %xmm12, %xmm4
  2262. SUBPS %xmm4, %xmm1
  2263. movaps -24 * SIZE(X1), %xmm12
  2264. pshufd $0xb1, %xmm10, %xmm6
  2265. mulps %xmm13, %xmm10
  2266. addps %xmm10, %xmm0
  2267. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  2268. mulps %xmm13, %xmm6
  2269. SUBPS %xmm6, %xmm1
  2270. movaps -20 * SIZE(X1), %xmm13
  2271. pshufd $0xb1, %xmm8, %xmm4
  2272. mulps %xmm12, %xmm8
  2273. addps %xmm8, %xmm0
  2274. mulps %xmm12, %xmm4
  2275. SUBPS %xmm4, %xmm1
  2276. movaps -16 * SIZE(X1), %xmm12
  2277. pshufd $0xb1, %xmm10, %xmm6
  2278. mulps %xmm13, %xmm10
  2279. addps %xmm10, %xmm0
  2280. mulps %xmm13, %xmm6
  2281. SUBPS %xmm6, %xmm1
  2282. movaps -12 * SIZE(X1), %xmm13
  2283. subq $-16 * SIZE, A1
  2284. subq $-16 * SIZE, X1
  2285. ALIGN_3
  2286. .L125:
  2287. testq $4, MM
  2288. je .L127
  2289. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  2290. MOVUPS_A1(-28 * SIZE, A1, %xmm10)
  2291. pshufd $0xb1, %xmm8, %xmm4
  2292. mulps %xmm12, %xmm8
  2293. addps %xmm8, %xmm0
  2294. mulps %xmm12, %xmm4
  2295. SUBPS %xmm4, %xmm1
  2296. movaps -24 * SIZE(X1), %xmm12
  2297. pshufd $0xb1, %xmm10, %xmm6
  2298. mulps %xmm13, %xmm10
  2299. addps %xmm10, %xmm0
  2300. mulps %xmm13, %xmm6
  2301. SUBPS %xmm6, %xmm1
  2302. movaps -20 * SIZE(X1), %xmm13
  2303. addq $8 * SIZE, A1
  2304. ALIGN_3
  2305. .L127:
  2306. testq $2, MM
  2307. je .L128
  2308. MOVUPS_A1(-32 * SIZE, A1, %xmm8)
  2309. pshufd $0xb1, %xmm8, %xmm4
  2310. mulps %xmm12, %xmm8
  2311. addps %xmm8, %xmm0
  2312. mulps %xmm12, %xmm4
  2313. SUBPS %xmm4, %xmm1
  2314. movaps %xmm13, %xmm12
  2315. addq $4 * SIZE, A1
  2316. ALIGN_3
  2317. .L128:
  2318. testq $1, MM
  2319. je .L129
  2320. #ifdef movsd
  2321. xorps %xmm8, %xmm8
  2322. #endif
  2323. movsd -32 * SIZE(A1), %xmm8
  2324. pshufd $0xb1, %xmm8, %xmm4
  2325. mulps %xmm12, %xmm8
  2326. addps %xmm8, %xmm0
  2327. mulps %xmm12, %xmm4
  2328. SUBPS %xmm4, %xmm1
  2329. ALIGN_3
  2330. .L129:
  2331. pcmpeqb %xmm5, %xmm5
  2332. psllq $63, %xmm5
  2333. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  2334. xorps %xmm5, %xmm0
  2335. #else
  2336. xorps %xmm5, %xmm1
  2337. #endif
  2338. #ifdef HAVE_SSE3
  2339. haddps %xmm1, %xmm0
  2340. haddps %xmm0, %xmm0
  2341. #else
  2342. movaps %xmm0, %xmm8
  2343. unpcklps %xmm1, %xmm0
  2344. unpckhps %xmm1, %xmm8
  2345. addps %xmm8, %xmm0
  2346. movhlps %xmm0, %xmm1
  2347. addps %xmm1, %xmm0
  2348. #endif
  2349. pshufd $0xb1, %xmm0, %xmm1
  2350. #ifdef HAVE_SSE3
  2351. movddup ALPHA, %xmm15
  2352. #else
  2353. movsd ALPHA, %xmm15
  2354. pshufd $0x44, %xmm15, %xmm15
  2355. #endif
  2356. mulps %xmm15, %xmm0
  2357. mulps %xmm15, %xmm1
  2358. xorps %xmm5, %xmm0
  2359. #ifdef HAVE_SSE3
  2360. haddps %xmm1, %xmm0
  2361. #else
  2362. movaps %xmm0, %xmm2
  2363. shufps $0x88, %xmm1, %xmm0
  2364. shufps $0xdd, %xmm1, %xmm2
  2365. addps %xmm2, %xmm0
  2366. #endif
  2367. movsd (Y), %xmm12
  2368. addq INCY, Y
  2369. shufps $0xd8, %xmm0, %xmm0
  2370. addps %xmm12, %xmm0
  2371. movlps %xmm0, (Y1)
  2372. addq INCY, Y1
  2373. jmp .L999
  2374. ALIGN_3
  2375. .L200:
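/* Unaligned-column paths: here the column pointers sit one float past a
   16-byte boundary (hence the -33 * SIZE aligned loads below), so each
   unaligned vector is rebuilt from two aligned movaps with movss plus a
   shufps rotation.  When LDA carries an extra 2 * SIZE offset the two
   columns of a pair are misaligned differently and control goes to
   .L300. */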
  2376. testq $2 * SIZE, LDA
  2377. jne .L300
  2378. cmpq $2, N
  2379. jl .L210
  2380. ALIGN_3
  2381. .L201:
  2382. subq $2, N
  2383. leaq 32 * SIZE(BUFFER), X1
  2384. movq A, A1
  2385. leaq (A1, LDA), A2
  2386. leaq (A1, LDA, 2), A
  2387. xorps %xmm0, %xmm0
  2388. xorps %xmm1, %xmm1
  2389. xorps %xmm2, %xmm2
  2390. xorps %xmm3, %xmm3
  2391. #ifdef ALIGNED_ACCESS
  2392. cmpq M, MM
  2393. je .L20X
  2394. #ifdef movsd
  2395. xorps %xmm8, %xmm8
  2396. #endif
  2397. movsd -32 * SIZE(A1), %xmm8
  2398. #ifdef movsd
  2399. xorps %xmm9, %xmm9
  2400. #endif
  2401. movsd -32 * SIZE(A2), %xmm9
  2402. #ifdef movsd
  2403. xorps %xmm12, %xmm12
  2404. #endif
  2405. movsd -32 * SIZE(X1), %xmm12
  2406. pshufd $0xb1, %xmm8, %xmm4
  2407. mulps %xmm12, %xmm8
  2408. addps %xmm8, %xmm0
  2409. mulps %xmm12, %xmm4
  2410. SUBPS %xmm4, %xmm1
  2411. pshufd $0xb1, %xmm9, %xmm5
  2412. mulps %xmm12, %xmm9
  2413. addps %xmm9, %xmm2
  2414. mulps %xmm12, %xmm5
  2415. SUBPS %xmm5, %xmm3
  2416. addq $2 * SIZE, A1
  2417. addq $2 * SIZE, A2
  2418. addq $2 * SIZE, X1
  2419. ALIGN_3
  2420. .L20X:
  2421. #endif
  2422. movaps -33 * SIZE(A1), %xmm4
  2423. movaps -33 * SIZE(A2), %xmm5
  2424. movaps -32 * SIZE(X1), %xmm12
  2425. movaps -28 * SIZE(X1), %xmm13
  2426. #ifdef PREFETCHW
  2427. PREFETCHW 3 * SIZE(Y1)
  2428. #endif
  2429. movq MM, I
  2430. sarq $3, I
  2431. jle .L205
  2432. movaps -29 * SIZE(A1), %xmm6
  2433. movaps -29 * SIZE(A2), %xmm7
  2434. decq I
  2435. jle .L204
  2436. ALIGN_3
  2437. .L203:
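/* Shared-misalignment loop: movss injects the low float of the next
   aligned load and shufps $0x39 rotates the lanes, yielding the unaligned
   vector that starts one float into the previous aligned block. */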
  2438. #ifdef PREFETCH
  2439. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  2440. #endif
  2441. movss %xmm6, %xmm4
  2442. shufps $0x39, %xmm4, %xmm4
  2443. pshufd $0xb1, %xmm4, %xmm14
  2444. mulps %xmm12, %xmm4
  2445. addps %xmm4, %xmm0
  2446. movaps -25 * SIZE(A1), %xmm4
  2447. mulps %xmm12, %xmm14
  2448. SUBPS %xmm14, %xmm1
  2449. movss %xmm7, %xmm5
  2450. shufps $0x39, %xmm5, %xmm5
  2451. pshufd $0xb1, %xmm5, %xmm14
  2452. mulps %xmm12, %xmm5
  2453. addps %xmm5, %xmm2
  2454. movaps -25 * SIZE(A2), %xmm5
  2455. mulps %xmm12, %xmm14
  2456. movaps -24 * SIZE(X1), %xmm12
  2457. SUBPS %xmm14, %xmm3
  2458. movss %xmm4, %xmm6
  2459. shufps $0x39, %xmm6, %xmm6
  2460. pshufd $0xb1, %xmm6, %xmm14
  2461. mulps %xmm13, %xmm6
  2462. addps %xmm6, %xmm0
  2463. movaps -21 * SIZE(A1), %xmm6
  2464. mulps %xmm13, %xmm14
  2465. SUBPS %xmm14, %xmm1
  2466. movss %xmm5, %xmm7
  2467. shufps $0x39, %xmm7, %xmm7
  2468. pshufd $0xb1, %xmm7, %xmm14
  2469. mulps %xmm13, %xmm7
  2470. addps %xmm7, %xmm2
  2471. movaps -21 * SIZE(A2), %xmm7
  2472. mulps %xmm13, %xmm14
  2473. movaps -20 * SIZE(X1), %xmm13
  2474. SUBPS %xmm14, %xmm3
  2475. #ifdef PREFETCH
  2476. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  2477. #endif
  2478. movss %xmm6, %xmm4
  2479. shufps $0x39, %xmm4, %xmm4
  2480. pshufd $0xb1, %xmm4, %xmm14
  2481. mulps %xmm12, %xmm4
  2482. addps %xmm4, %xmm0
  2483. movaps -17 * SIZE(A1), %xmm4
  2484. mulps %xmm12, %xmm14
  2485. SUBPS %xmm14, %xmm1
  2486. movss %xmm7, %xmm5
  2487. shufps $0x39, %xmm5, %xmm5
  2488. pshufd $0xb1, %xmm5, %xmm14
  2489. mulps %xmm12, %xmm5
  2490. addps %xmm5, %xmm2
  2491. movaps -17 * SIZE(A2), %xmm5
  2492. mulps %xmm12, %xmm14
  2493. movaps -16 * SIZE(X1), %xmm12
  2494. SUBPS %xmm14, %xmm3
  2495. #ifdef PREFETCHW
  2496. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
  2497. #endif
  2498. movss %xmm4, %xmm6
  2499. shufps $0x39, %xmm6, %xmm6
  2500. pshufd $0xb1, %xmm6, %xmm14
  2501. mulps %xmm13, %xmm6
  2502. addps %xmm6, %xmm0
  2503. movaps -13 * SIZE(A1), %xmm6
  2504. mulps %xmm13, %xmm14
  2505. SUBPS %xmm14, %xmm1
  2506. movss %xmm5, %xmm7
  2507. shufps $0x39, %xmm7, %xmm7
  2508. pshufd $0xb1, %xmm7, %xmm14
  2509. mulps %xmm13, %xmm7
  2510. addps %xmm7, %xmm2
  2511. movaps -13 * SIZE(A2), %xmm7
  2512. mulps %xmm13, %xmm14
  2513. movaps -12 * SIZE(X1), %xmm13
  2514. SUBPS %xmm14, %xmm3
  2515. subq $-16 * SIZE, A1
  2516. subq $-16 * SIZE, A2
  2517. subq $-16 * SIZE, X1
  2518. subq $1, I
  2519. BRANCH
  2520. jg .L203
  2521. ALIGN_3
  2522. .L204:
  2523. movss %xmm6, %xmm4
  2524. shufps $0x39, %xmm4, %xmm4
  2525. pshufd $0xb1, %xmm4, %xmm14
  2526. mulps %xmm12, %xmm4
  2527. addps %xmm4, %xmm0
  2528. movaps -25 * SIZE(A1), %xmm4
  2529. mulps %xmm12, %xmm14
  2530. SUBPS %xmm14, %xmm1
  2531. movss %xmm7, %xmm5
  2532. shufps $0x39, %xmm5, %xmm5
  2533. pshufd $0xb1, %xmm5, %xmm14
  2534. mulps %xmm12, %xmm5
  2535. addps %xmm5, %xmm2
  2536. movaps -25 * SIZE(A2), %xmm5
  2537. mulps %xmm12, %xmm14
  2538. movaps -24 * SIZE(X1), %xmm12
  2539. SUBPS %xmm14, %xmm3
  2540. movss %xmm4, %xmm6
  2541. shufps $0x39, %xmm6, %xmm6
  2542. pshufd $0xb1, %xmm6, %xmm14
  2543. mulps %xmm13, %xmm6
  2544. addps %xmm6, %xmm0
  2545. movaps -21 * SIZE(A1), %xmm6
  2546. mulps %xmm13, %xmm14
  2547. SUBPS %xmm14, %xmm1
  2548. movss %xmm5, %xmm7
  2549. shufps $0x39, %xmm7, %xmm7
  2550. pshufd $0xb1, %xmm7, %xmm14
  2551. mulps %xmm13, %xmm7
  2552. addps %xmm7, %xmm2
  2553. movaps -21 * SIZE(A2), %xmm7
  2554. mulps %xmm13, %xmm14
  2555. movaps -20 * SIZE(X1), %xmm13
  2556. SUBPS %xmm14, %xmm3
  2557. movss %xmm6, %xmm4
  2558. shufps $0x39, %xmm4, %xmm4
  2559. pshufd $0xb1, %xmm4, %xmm14
  2560. mulps %xmm12, %xmm4
  2561. addps %xmm4, %xmm0
  2562. movaps -17 * SIZE(A1), %xmm4
  2563. mulps %xmm12, %xmm14
  2564. SUBPS %xmm14, %xmm1
  2565. movss %xmm7, %xmm5
  2566. shufps $0x39, %xmm5, %xmm5
  2567. pshufd $0xb1, %xmm5, %xmm14
  2568. mulps %xmm12, %xmm5
  2569. addps %xmm5, %xmm2
  2570. movaps -17 * SIZE(A2), %xmm5
  2571. mulps %xmm12, %xmm14
  2572. movaps -16 * SIZE(X1), %xmm12
  2573. SUBPS %xmm14, %xmm3
  2574. movss %xmm4, %xmm6
  2575. shufps $0x39, %xmm6, %xmm6
  2576. pshufd $0xb1, %xmm6, %xmm14
  2577. mulps %xmm13, %xmm6
  2578. addps %xmm6, %xmm0
  2579. mulps %xmm13, %xmm14
  2580. SUBPS %xmm14, %xmm1
  2581. movss %xmm5, %xmm7
  2582. shufps $0x39, %xmm7, %xmm7
  2583. pshufd $0xb1, %xmm7, %xmm14
  2584. mulps %xmm13, %xmm7
  2585. addps %xmm7, %xmm2
  2586. mulps %xmm13, %xmm14
  2587. movaps -12 * SIZE(X1), %xmm13
  2588. SUBPS %xmm14, %xmm3
  2589. subq $-16 * SIZE, A1
  2590. subq $-16 * SIZE, A2
  2591. subq $-16 * SIZE, X1
  2592. ALIGN_3
  2593. .L205:
  2594. testq $4, MM
  2595. je .L207
  2596. movaps -29 * SIZE(A1), %xmm6
  2597. movss %xmm6, %xmm4
  2598. shufps $0x39, %xmm4, %xmm4
  2599. pshufd $0xb1, %xmm4, %xmm14
  2600. mulps %xmm12, %xmm4
  2601. addps %xmm4, %xmm0
  2602. mulps %xmm12, %xmm14
  2603. SUBPS %xmm14, %xmm1
  2604. movaps -29 * SIZE(A2), %xmm7
  2605. movss %xmm7, %xmm5
  2606. shufps $0x39, %xmm5, %xmm5
  2607. pshufd $0xb1, %xmm5, %xmm14
  2608. mulps %xmm12, %xmm5
  2609. addps %xmm5, %xmm2
  2610. mulps %xmm12, %xmm14
  2611. SUBPS %xmm14, %xmm3
  2612. movaps -25 * SIZE(A1), %xmm8
  2613. movss %xmm8, %xmm6
  2614. shufps $0x39, %xmm6, %xmm6
  2615. pshufd $0xb1, %xmm6, %xmm14
  2616. mulps %xmm13, %xmm6
  2617. addps %xmm6, %xmm0
  2618. mulps %xmm13, %xmm14
  2619. SUBPS %xmm14, %xmm1
  2620. movaps -25 * SIZE(A2), %xmm9
  2621. movss %xmm9, %xmm7
  2622. shufps $0x39, %xmm7, %xmm7
  2623. pshufd $0xb1, %xmm7, %xmm14
  2624. mulps %xmm13, %xmm7
  2625. addps %xmm7, %xmm2
  2626. mulps %xmm13, %xmm14
  2627. SUBPS %xmm14, %xmm3
  2628. movaps %xmm8, %xmm4
  2629. movaps %xmm9, %xmm5
  2630. movaps -24 * SIZE(X1), %xmm12
  2631. movaps -20 * SIZE(X1), %xmm13
  2632. addq $8 * SIZE, A1
  2633. addq $8 * SIZE, A2
  2634. ALIGN_3
  2635. .L207:
  2636. testq $2, MM
  2637. je .L208
  2638. movaps -29 * SIZE(A1), %xmm6
  2639. movaps -29 * SIZE(A2), %xmm7
  2640. movss %xmm6, %xmm4
  2641. shufps $0x39, %xmm4, %xmm4
  2642. pshufd $0xb1, %xmm4, %xmm14
  2643. mulps %xmm12, %xmm4
  2644. addps %xmm4, %xmm0
  2645. mulps %xmm12, %xmm14
  2646. SUBPS %xmm14, %xmm1
  2647. movss %xmm7, %xmm5
  2648. shufps $0x39, %xmm5, %xmm5
  2649. pshufd $0xb1, %xmm5, %xmm14
  2650. mulps %xmm12, %xmm5
  2651. addps %xmm5, %xmm2
  2652. mulps %xmm12, %xmm14
  2653. SUBPS %xmm14, %xmm3
  2654. movaps %xmm6, %xmm4
  2655. movaps %xmm7, %xmm5
  2656. movaps %xmm13, %xmm12
  2657. addq $4 * SIZE, A1
  2658. addq $4 * SIZE, A2
  2659. ALIGN_3
  2660. .L208:
  2661. testq $1, MM
  2662. je .L209
  2663. #ifdef movsd
  2664. xorps %xmm8, %xmm8
  2665. #endif
  2666. movsd -32 * SIZE(A1), %xmm8
  2667. #ifdef movsd
  2668. xorps %xmm9, %xmm9
  2669. #endif
  2670. movsd -32 * SIZE(A2), %xmm9
  2671. pshufd $0xb1, %xmm8, %xmm4
  2672. mulps %xmm12, %xmm8
  2673. addps %xmm8, %xmm0
  2674. mulps %xmm12, %xmm4
  2675. SUBPS %xmm4, %xmm1
  2676. pshufd $0xb1, %xmm9, %xmm5
  2677. mulps %xmm12, %xmm9
  2678. addps %xmm9, %xmm2
  2679. mulps %xmm12, %xmm5
  2680. SUBPS %xmm5, %xmm3
  2681. ALIGN_3
  2682. .L209:
  2683. pcmpeqb %xmm5, %xmm5
  2684. psllq $63, %xmm5
  2685. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  2686. xorps %xmm5, %xmm0
  2687. xorps %xmm5, %xmm2
  2688. #else
  2689. xorps %xmm5, %xmm1
  2690. xorps %xmm5, %xmm3
  2691. #endif
  2692. #ifdef HAVE_SSE3
  2693. haddps %xmm1, %xmm0
  2694. haddps %xmm3, %xmm2
  2695. haddps %xmm2, %xmm0
  2696. #else
  2697. movaps %xmm0, %xmm8
  2698. unpcklps %xmm1, %xmm0
  2699. unpckhps %xmm1, %xmm8
  2700. movaps %xmm2, %xmm4
  2701. unpcklps %xmm3, %xmm2
  2702. unpckhps %xmm3, %xmm4
  2703. addps %xmm8, %xmm0
  2704. addps %xmm4, %xmm2
  2705. movhlps %xmm0, %xmm1
  2706. movhlps %xmm2, %xmm3
  2707. addps %xmm1, %xmm0
  2708. addps %xmm3, %xmm2
  2709. movlhps %xmm2, %xmm0
  2710. #endif
  2711. pshufd $0xb1, %xmm0, %xmm1
  2712. #ifdef HAVE_SSE3
  2713. movddup ALPHA, %xmm15
  2714. #else
  2715. movsd ALPHA, %xmm15
  2716. pshufd $0x44, %xmm15, %xmm15
  2717. #endif
  2718. mulps %xmm15, %xmm0
  2719. mulps %xmm15, %xmm1
  2720. xorps %xmm5, %xmm0
  2721. #ifdef HAVE_SSE3
  2722. haddps %xmm1, %xmm0
  2723. #else
  2724. movaps %xmm0, %xmm2
  2725. shufps $0x88, %xmm1, %xmm0
  2726. shufps $0xdd, %xmm1, %xmm2
  2727. addps %xmm2, %xmm0
  2728. #endif
  2729. movsd (Y), %xmm12
  2730. addq INCY, Y
  2731. movhps (Y), %xmm12
  2732. addq INCY, Y
  2733. shufps $0xd8, %xmm0, %xmm0
  2734. addps %xmm12, %xmm0
  2735. movlps %xmm0, (Y1)
  2736. addq INCY, Y1
  2737. movhps %xmm0, (Y1)
  2738. addq INCY, Y1
  2739. cmpq $2, N
  2740. jge .L201
  2741. ALIGN_3
  2742. .L210:
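/* Single remaining column, using the same movss/shufps $0x39 realignment
   as the two-column loop above. */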
  2743. cmpq $1, N
  2744. jl .L999
  2745. leaq 32 * SIZE(BUFFER), X1
  2746. movq A, A1
  2747. xorps %xmm0, %xmm0
  2748. xorps %xmm1, %xmm1
  2749. #ifdef ALIGNED_ACCESS
  2750. cmpq M, MM
  2751. je .L21X
  2752. #ifdef movsd
  2753. xorps %xmm8, %xmm8
  2754. #endif
  2755. movsd -32 * SIZE(A1), %xmm8
  2756. #ifdef movsd
  2757. xorps %xmm12, %xmm12
  2758. #endif
  2759. movsd -32 * SIZE(X1), %xmm12
  2760. pshufd $0xb1, %xmm8, %xmm4
  2761. mulps %xmm12, %xmm8
  2762. addps %xmm8, %xmm0
  2763. mulps %xmm12, %xmm4
  2764. SUBPS %xmm4, %xmm1
  2765. addq $2 * SIZE, A1
  2766. addq $2 * SIZE, X1
  2767. ALIGN_3
  2768. .L21X:
  2769. #endif
  2770. movaps -33 * SIZE(A1), %xmm4
  2771. movaps -32 * SIZE(X1), %xmm12
  2772. movaps -28 * SIZE(X1), %xmm13
  2773. movq MM, I
  2774. sarq $3, I
  2775. jle .L215
  2776. movaps -29 * SIZE(A1), %xmm5
  2777. movaps -25 * SIZE(A1), %xmm6
  2778. movaps -21 * SIZE(A1), %xmm7
  2779. decq I
  2780. jle .L214
  2781. ALIGN_3
  2782. .L213:
  2783. #ifdef PREFETCH
  2784. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  2785. #endif
  2786. movss %xmm5, %xmm4
  2787. shufps $0x39, %xmm4, %xmm4
  2788. pshufd $0xb1, %xmm4, %xmm14
  2789. mulps %xmm12, %xmm4
  2790. addps %xmm4, %xmm0
  2791. movaps -17 * SIZE(A1), %xmm4
  2792. mulps %xmm12, %xmm14
  2793. movaps -24 * SIZE(X1), %xmm12
  2794. SUBPS %xmm14, %xmm1
  2795. movss %xmm6, %xmm5
  2796. shufps $0x39, %xmm5, %xmm5
  2797. pshufd $0xb1, %xmm5, %xmm15
  2798. mulps %xmm13, %xmm5
  2799. addps %xmm5, %xmm0
  2800. movaps -13 * SIZE(A1), %xmm5
  2801. mulps %xmm13, %xmm15
  2802. movaps -20 * SIZE(X1), %xmm13
  2803. SUBPS %xmm15, %xmm1
  2804. #ifdef PREFETCHW
  2805. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
  2806. #endif
  2807. movss %xmm7, %xmm6
  2808. shufps $0x39, %xmm6, %xmm6
  2809. pshufd $0xb1, %xmm6, %xmm14
  2810. mulps %xmm12, %xmm6
  2811. addps %xmm6, %xmm0
  2812. movaps -9 * SIZE(A1), %xmm6
  2813. mulps %xmm12, %xmm14
  2814. movaps -16 * SIZE(X1), %xmm12
  2815. SUBPS %xmm14, %xmm1
  2816. movss %xmm4, %xmm7
  2817. shufps $0x39, %xmm7, %xmm7
  2818. pshufd $0xb1, %xmm7, %xmm15
  2819. mulps %xmm13, %xmm7
  2820. addps %xmm7, %xmm0
  2821. movaps -5 * SIZE(A1), %xmm7
  2822. mulps %xmm13, %xmm15
  2823. movaps -12 * SIZE(X1), %xmm13
  2824. SUBPS %xmm15, %xmm1
  2825. subq $-16 * SIZE, A1
  2826. subq $-16 * SIZE, X1
  2827. subq $1, I
  2828. BRANCH
  2829. jg .L213
  2830. ALIGN_3
  2831. .L214:
  2832. movss %xmm5, %xmm4
  2833. shufps $0x39, %xmm4, %xmm4
  2834. pshufd $0xb1, %xmm4, %xmm14
  2835. mulps %xmm12, %xmm4
  2836. addps %xmm4, %xmm0
  2837. movaps -17 * SIZE(A1), %xmm4
  2838. mulps %xmm12, %xmm14
  2839. movaps -24 * SIZE(X1), %xmm12
  2840. SUBPS %xmm14, %xmm1
  2841. movss %xmm6, %xmm5
  2842. shufps $0x39, %xmm5, %xmm5
  2843. pshufd $0xb1, %xmm5, %xmm15
  2844. mulps %xmm13, %xmm5
  2845. addps %xmm5, %xmm0
  2846. mulps %xmm13, %xmm15
  2847. movaps -20 * SIZE(X1), %xmm13
  2848. SUBPS %xmm15, %xmm1
  2849. movss %xmm7, %xmm6
  2850. shufps $0x39, %xmm6, %xmm6
  2851. pshufd $0xb1, %xmm6, %xmm14
  2852. mulps %xmm12, %xmm6
  2853. addps %xmm6, %xmm0
  2854. mulps %xmm12, %xmm14
  2855. movaps -16 * SIZE(X1), %xmm12
  2856. SUBPS %xmm14, %xmm1
  2857. movss %xmm4, %xmm7
  2858. shufps $0x39, %xmm7, %xmm7
  2859. pshufd $0xb1, %xmm7, %xmm15
  2860. mulps %xmm13, %xmm7
  2861. addps %xmm7, %xmm0
  2862. mulps %xmm13, %xmm15
  2863. movaps -12 * SIZE(X1), %xmm13
  2864. SUBPS %xmm15, %xmm1
  2865. subq $-16 * SIZE, A1
  2866. subq $-16 * SIZE, X1
  2867. ALIGN_3
  2868. .L215:
  2869. testq $4, MM
  2870. je .L217
  2871. movaps -29 * SIZE(A1), %xmm5
  2872. movaps -25 * SIZE(A1), %xmm6
  2873. movss %xmm5, %xmm4
  2874. shufps $0x39, %xmm4, %xmm4
  2875. pshufd $0xb1, %xmm4, %xmm14
  2876. mulps %xmm12, %xmm4
  2877. addps %xmm4, %xmm0
  2878. mulps %xmm12, %xmm14
  2879. SUBPS %xmm14, %xmm1
  2880. movss %xmm6, %xmm5
  2881. shufps $0x39, %xmm5, %xmm5
  2882. pshufd $0xb1, %xmm5, %xmm15
  2883. mulps %xmm13, %xmm5
  2884. addps %xmm5, %xmm0
  2885. mulps %xmm13, %xmm15
  2886. SUBPS %xmm15, %xmm1
  2887. movaps -24 * SIZE(X1), %xmm12
  2888. movaps -20 * SIZE(X1), %xmm13
  2889. movaps %xmm6, %xmm4
  2890. addq $8 * SIZE, A1
  2891. ALIGN_3
  2892. .L217:
  2893. testq $2, MM
  2894. je .L218
  2895. movaps -29 * SIZE(A1), %xmm5
  2896. movss %xmm5, %xmm4
  2897. shufps $0x39, %xmm4, %xmm4
  2898. pshufd $0xb1, %xmm4, %xmm14
  2899. mulps %xmm12, %xmm4
  2900. addps %xmm4, %xmm0
  2901. mulps %xmm12, %xmm14
  2902. SUBPS %xmm14, %xmm1
  2903. movaps %xmm13, %xmm12
  2904. addq $4 * SIZE, A1
  2905. ALIGN_3
  2906. .L218:
  2907. testq $1, MM
  2908. je .L219
  2909. #ifdef movsd
  2910. xorps %xmm8, %xmm8
  2911. #endif
  2912. movsd -32 * SIZE(A1), %xmm8
  2913. pshufd $0xb1, %xmm8, %xmm4
  2914. mulps %xmm12, %xmm8
  2915. addps %xmm8, %xmm0
  2916. mulps %xmm12, %xmm4
  2917. SUBPS %xmm4, %xmm1
  2918. ALIGN_3
  2919. .L219:
  2920. pcmpeqb %xmm5, %xmm5
  2921. psllq $63, %xmm5
  2922. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  2923. xorps %xmm5, %xmm0
  2924. #else
  2925. xorps %xmm5, %xmm1
  2926. #endif
  2927. #ifdef HAVE_SSE3
  2928. haddps %xmm1, %xmm0
  2929. haddps %xmm0, %xmm0
  2930. #else
  2931. movaps %xmm0, %xmm8
  2932. unpcklps %xmm1, %xmm0
  2933. unpckhps %xmm1, %xmm8
  2934. addps %xmm8, %xmm0
  2935. movhlps %xmm0, %xmm1
  2936. addps %xmm1, %xmm0
  2937. #endif
  2938. pshufd $0xb1, %xmm0, %xmm1
  2939. #ifdef HAVE_SSE3
  2940. movddup ALPHA, %xmm15
  2941. #else
  2942. movsd ALPHA, %xmm15
  2943. pshufd $0x44, %xmm15, %xmm15
  2944. #endif
  2945. mulps %xmm15, %xmm0
  2946. mulps %xmm15, %xmm1
  2947. xorps %xmm5, %xmm0
  2948. #ifdef HAVE_SSE3
  2949. haddps %xmm1, %xmm0
  2950. #else
  2951. movaps %xmm0, %xmm2
  2952. shufps $0x88, %xmm1, %xmm0
  2953. shufps $0xdd, %xmm1, %xmm2
  2954. addps %xmm2, %xmm0
  2955. #endif
  2956. movsd (Y), %xmm12
  2957. addq INCY, Y
  2958. shufps $0xd8, %xmm0, %xmm0
  2959. addps %xmm12, %xmm0
  2960. movlps %xmm0, (Y1)
  2961. addq INCY, Y1
  2962. jmp .L999
  2963. .L300:
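/* Mixed-misalignment pair: A1 is one float and A2 three floats past a
   16-byte boundary, so A1 vectors are rebuilt with shufps $0x39 and A2
   vectors with shufps $0x93. */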
  2964. cmpq $2, N
  2965. jl .L310
  2966. ALIGN_3
  2967. .L301:
  2968. subq $2, N
  2969. leaq 32 * SIZE(BUFFER), X1
  2970. movq A, A1
  2971. leaq (A1, LDA), A2
  2972. leaq (A1, LDA, 2), A
  2973. xorps %xmm0, %xmm0
  2974. xorps %xmm1, %xmm1
  2975. xorps %xmm2, %xmm2
  2976. xorps %xmm3, %xmm3
  2977. #ifdef ALIGNED_ACCESS
  2978. cmpq M, MM
  2979. je .L30X
  2980. #ifdef movsd
  2981. xorps %xmm8, %xmm8
  2982. #endif
  2983. movsd -32 * SIZE(A1), %xmm8
  2984. #ifdef movsd
  2985. xorps %xmm9, %xmm9
  2986. #endif
  2987. movsd -32 * SIZE(A2), %xmm9
  2988. #ifdef movsd
  2989. xorps %xmm12, %xmm12
  2990. #endif
  2991. movsd -32 * SIZE(X1), %xmm12
  2992. pshufd $0xb1, %xmm8, %xmm4
  2993. mulps %xmm12, %xmm8
  2994. addps %xmm8, %xmm0
  2995. mulps %xmm12, %xmm4
  2996. SUBPS %xmm4, %xmm1
  2997. pshufd $0xb1, %xmm9, %xmm5
  2998. mulps %xmm12, %xmm9
  2999. addps %xmm9, %xmm2
  3000. mulps %xmm12, %xmm5
  3001. SUBPS %xmm5, %xmm3
  3002. addq $2 * SIZE, A1
  3003. addq $2 * SIZE, A2
  3004. addq $2 * SIZE, X1
  3005. ALIGN_3
  3006. .L30X:
  3007. #endif
  3008. movaps -33 * SIZE(A1), %xmm4
  3009. movaps -35 * SIZE(A2), %xmm5
  3010. movaps -32 * SIZE(X1), %xmm12
  3011. movaps -28 * SIZE(X1), %xmm13
  3012. #ifdef PREFETCHW
  3013. PREFETCHW 3 * SIZE(Y1)
  3014. #endif
  3015. movq MM, I
  3016. sarq $3, I
  3017. jle .L305
  3018. movaps -29 * SIZE(A1), %xmm6
  3019. movaps -31 * SIZE(A2), %xmm7
  3020. decq I
  3021. jle .L304
  3022. ALIGN_3
  3023. .L303:
  3024. #ifdef PREFETCH
  3025. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  3026. #endif
  3027. movss %xmm6, %xmm4
  3028. shufps $0x39, %xmm4, %xmm4
  3029. pshufd $0xb1, %xmm4, %xmm14
  3030. mulps %xmm12, %xmm4
  3031. addps %xmm4, %xmm0
  3032. movaps -25 * SIZE(A1), %xmm4
  3033. mulps %xmm12, %xmm14
  3034. SUBPS %xmm14, %xmm1
  3035. movss %xmm7, %xmm5
  3036. shufps $0x93, %xmm7, %xmm5
  3037. pshufd $0xb1, %xmm5, %xmm14
  3038. mulps %xmm12, %xmm5
  3039. addps %xmm5, %xmm2
  3040. movaps -27 * SIZE(A2), %xmm5
  3041. mulps %xmm12, %xmm14
  3042. movaps -24 * SIZE(X1), %xmm12
  3043. SUBPS %xmm14, %xmm3
  3044. movss %xmm4, %xmm6
  3045. shufps $0x39, %xmm6, %xmm6
  3046. pshufd $0xb1, %xmm6, %xmm14
  3047. mulps %xmm13, %xmm6
  3048. addps %xmm6, %xmm0
  3049. movaps -21 * SIZE(A1), %xmm6
  3050. mulps %xmm13, %xmm14
  3051. SUBPS %xmm14, %xmm1
  3052. movss %xmm5, %xmm7
  3053. shufps $0x93, %xmm5, %xmm7
  3054. pshufd $0xb1, %xmm7, %xmm14
  3055. mulps %xmm13, %xmm7
  3056. addps %xmm7, %xmm2
  3057. movaps -23 * SIZE(A2), %xmm7
  3058. mulps %xmm13, %xmm14
  3059. movaps -20 * SIZE(X1), %xmm13
  3060. SUBPS %xmm14, %xmm3
  3061. #ifdef PREFETCH
  3062. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  3063. #endif
  3064. movss %xmm6, %xmm4
  3065. shufps $0x39, %xmm4, %xmm4
  3066. pshufd $0xb1, %xmm4, %xmm14
  3067. mulps %xmm12, %xmm4
  3068. addps %xmm4, %xmm0
  3069. movaps -17 * SIZE(A1), %xmm4
  3070. mulps %xmm12, %xmm14
  3071. SUBPS %xmm14, %xmm1
  3072. movss %xmm7, %xmm5
  3073. shufps $0x93, %xmm7, %xmm5
  3074. pshufd $0xb1, %xmm5, %xmm14
  3075. mulps %xmm12, %xmm5
  3076. addps %xmm5, %xmm2
  3077. movaps -19 * SIZE(A2), %xmm5
  3078. mulps %xmm12, %xmm14
  3079. movaps -16 * SIZE(X1), %xmm12
  3080. SUBPS %xmm14, %xmm3
  3081. #ifdef PREFETCHW
  3082. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
  3083. #endif
  3084. movss %xmm4, %xmm6
  3085. shufps $0x39, %xmm6, %xmm6
  3086. pshufd $0xb1, %xmm6, %xmm14
  3087. mulps %xmm13, %xmm6
  3088. addps %xmm6, %xmm0
  3089. movaps -13 * SIZE(A1), %xmm6
  3090. mulps %xmm13, %xmm14
  3091. SUBPS %xmm14, %xmm1
  3092. movss %xmm5, %xmm7
  3093. shufps $0x93, %xmm5, %xmm7
  3094. pshufd $0xb1, %xmm7, %xmm14
  3095. mulps %xmm13, %xmm7
  3096. addps %xmm7, %xmm2
  3097. movaps -15 * SIZE(A2), %xmm7
  3098. mulps %xmm13, %xmm14
  3099. movaps -12 * SIZE(X1), %xmm13
  3100. SUBPS %xmm14, %xmm3
  3101. subq $-16 * SIZE, A1
  3102. subq $-16 * SIZE, A2
  3103. subq $-16 * SIZE, X1
  3104. subq $1, I
  3105. BRANCH
  3106. jg .L303
  3107. ALIGN_3
  3108. .L304:
  3109. movss %xmm6, %xmm4
  3110. shufps $0x39, %xmm4, %xmm4
  3111. pshufd $0xb1, %xmm4, %xmm14
  3112. mulps %xmm12, %xmm4
  3113. addps %xmm4, %xmm0
  3114. movaps -25 * SIZE(A1), %xmm4
  3115. mulps %xmm12, %xmm14
  3116. SUBPS %xmm14, %xmm1
  3117. movss %xmm7, %xmm5
  3118. shufps $0x93, %xmm7, %xmm5
  3119. pshufd $0xb1, %xmm5, %xmm14
  3120. mulps %xmm12, %xmm5
  3121. addps %xmm5, %xmm2
  3122. movaps -27 * SIZE(A2), %xmm5
  3123. mulps %xmm12, %xmm14
  3124. movaps -24 * SIZE(X1), %xmm12
  3125. SUBPS %xmm14, %xmm3
  3126. movss %xmm4, %xmm6
  3127. shufps $0x39, %xmm6, %xmm6
  3128. pshufd $0xb1, %xmm6, %xmm14
  3129. mulps %xmm13, %xmm6
  3130. addps %xmm6, %xmm0
  3131. movaps -21 * SIZE(A1), %xmm6
  3132. mulps %xmm13, %xmm14
  3133. SUBPS %xmm14, %xmm1
  3134. movss %xmm5, %xmm7
  3135. shufps $0x93, %xmm5, %xmm7
  3136. pshufd $0xb1, %xmm7, %xmm14
  3137. mulps %xmm13, %xmm7
  3138. addps %xmm7, %xmm2
  3139. movaps -23 * SIZE(A2), %xmm7
  3140. mulps %xmm13, %xmm14
  3141. movaps -20 * SIZE(X1), %xmm13
  3142. SUBPS %xmm14, %xmm3
  3143. movss %xmm6, %xmm4
  3144. shufps $0x39, %xmm4, %xmm4
  3145. pshufd $0xb1, %xmm4, %xmm14
  3146. mulps %xmm12, %xmm4
  3147. addps %xmm4, %xmm0
  3148. movaps -17 * SIZE(A1), %xmm4
  3149. mulps %xmm12, %xmm14
  3150. SUBPS %xmm14, %xmm1
  3151. movss %xmm7, %xmm5
  3152. shufps $0x93, %xmm7, %xmm5
  3153. pshufd $0xb1, %xmm5, %xmm14
  3154. mulps %xmm12, %xmm5
  3155. addps %xmm5, %xmm2
  3156. movaps -19 * SIZE(A2), %xmm5
  3157. mulps %xmm12, %xmm14
  3158. movaps -16 * SIZE(X1), %xmm12
  3159. SUBPS %xmm14, %xmm3
  3160. movss %xmm4, %xmm6
  3161. shufps $0x39, %xmm6, %xmm6
  3162. pshufd $0xb1, %xmm6, %xmm14
  3163. mulps %xmm13, %xmm6
  3164. addps %xmm6, %xmm0
  3165. mulps %xmm13, %xmm14
  3166. SUBPS %xmm14, %xmm1
  3167. movss %xmm5, %xmm7
  3168. shufps $0x93, %xmm5, %xmm7
  3169. pshufd $0xb1, %xmm7, %xmm14
  3170. mulps %xmm13, %xmm7
  3171. addps %xmm7, %xmm2
  3172. mulps %xmm13, %xmm14
  3173. movaps -12 * SIZE(X1), %xmm13
  3174. SUBPS %xmm14, %xmm3
  3175. subq $-16 * SIZE, A1
  3176. subq $-16 * SIZE, A2
  3177. subq $-16 * SIZE, X1
  3178. ALIGN_3
  3179. .L305:
  3180. testq $4, MM
  3181. je .L307
  3182. movaps -29 * SIZE(A1), %xmm6
  3183. movss %xmm6, %xmm4
  3184. shufps $0x39, %xmm4, %xmm4
  3185. pshufd $0xb1, %xmm4, %xmm14
  3186. mulps %xmm12, %xmm4
  3187. addps %xmm4, %xmm0
  3188. mulps %xmm12, %xmm14
  3189. SUBPS %xmm14, %xmm1
  3190. movaps -31 * SIZE(A2), %xmm7
  3191. movss %xmm7, %xmm5
  3192. shufps $0x93, %xmm7, %xmm5
  3193. pshufd $0xb1, %xmm5, %xmm14
  3194. mulps %xmm12, %xmm5
  3195. addps %xmm5, %xmm2
  3196. mulps %xmm12, %xmm14
  3197. SUBPS %xmm14, %xmm3
  3198. movaps -25 * SIZE(A1), %xmm8
  3199. movss %xmm8, %xmm6
  3200. shufps $0x39, %xmm6, %xmm6
  3201. pshufd $0xb1, %xmm6, %xmm14
  3202. mulps %xmm13, %xmm6
  3203. addps %xmm6, %xmm0
  3204. mulps %xmm13, %xmm14
  3205. SUBPS %xmm14, %xmm1
  3206. movaps -27 * SIZE(A2), %xmm9
  3207. movss %xmm9, %xmm7
  3208. shufps $0x93, %xmm9, %xmm7
  3209. pshufd $0xb1, %xmm7, %xmm14
  3210. mulps %xmm13, %xmm7
  3211. addps %xmm7, %xmm2
  3212. mulps %xmm13, %xmm14
  3213. SUBPS %xmm14, %xmm3
  3214. movaps %xmm8, %xmm4
  3215. movaps %xmm9, %xmm5
  3216. movaps -24 * SIZE(X1), %xmm12
  3217. movaps -20 * SIZE(X1), %xmm13
  3218. addq $8 * SIZE, A1
  3219. addq $8 * SIZE, A2
  3220. ALIGN_3
  3221. .L307:
  3222. testq $2, MM
  3223. je .L308
  3224. movaps -29 * SIZE(A1), %xmm6
  3225. movaps -31 * SIZE(A2), %xmm7
  3226. movss %xmm6, %xmm4
  3227. shufps $0x39, %xmm4, %xmm4
  3228. pshufd $0xb1, %xmm4, %xmm14
  3229. mulps %xmm12, %xmm4
  3230. addps %xmm4, %xmm0
  3231. mulps %xmm12, %xmm14
  3232. SUBPS %xmm14, %xmm1
  3233. movss %xmm7, %xmm5
  3234. shufps $0x93, %xmm7, %xmm5
  3235. pshufd $0xb1, %xmm5, %xmm14
  3236. mulps %xmm12, %xmm5
  3237. addps %xmm5, %xmm2
  3238. mulps %xmm12, %xmm14
  3239. SUBPS %xmm14, %xmm3
  3240. movaps %xmm6, %xmm4
  3241. movaps %xmm7, %xmm5
  3242. movaps %xmm13, %xmm12
  3243. addq $4 * SIZE, A1
  3244. addq $4 * SIZE, A2
  3245. ALIGN_3
  3246. .L308:
  3247. testq $1, MM
  3248. je .L309
  3249. #ifdef movsd
  3250. xorps %xmm8, %xmm8
  3251. #endif
  3252. movsd -32 * SIZE(A1), %xmm8
  3253. #ifdef movsd
  3254. xorps %xmm9, %xmm9
  3255. #endif
  3256. movsd -32 * SIZE(A2), %xmm9
  3257. pshufd $0xb1, %xmm8, %xmm4
  3258. mulps %xmm12, %xmm8
  3259. addps %xmm8, %xmm0
  3260. mulps %xmm12, %xmm4
  3261. SUBPS %xmm4, %xmm1
  3262. pshufd $0xb1, %xmm9, %xmm5
  3263. mulps %xmm12, %xmm9
  3264. addps %xmm9, %xmm2
  3265. mulps %xmm12, %xmm5
  3266. SUBPS %xmm5, %xmm3
  3267. ALIGN_3
  3268. .L309:
  3269. pcmpeqb %xmm5, %xmm5
  3270. psllq $63, %xmm5
  3271. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  3272. xorps %xmm5, %xmm0
  3273. xorps %xmm5, %xmm2
  3274. #else
  3275. xorps %xmm5, %xmm1
  3276. xorps %xmm5, %xmm3
  3277. #endif
  3278. #ifdef HAVE_SSE3
  3279. haddps %xmm1, %xmm0
  3280. haddps %xmm3, %xmm2
  3281. haddps %xmm2, %xmm0
  3282. #else
  3283. movaps %xmm0, %xmm8
  3284. unpcklps %xmm1, %xmm0
  3285. unpckhps %xmm1, %xmm8
  3286. movaps %xmm2, %xmm4
  3287. unpcklps %xmm3, %xmm2
  3288. unpckhps %xmm3, %xmm4
  3289. addps %xmm8, %xmm0
  3290. addps %xmm4, %xmm2
  3291. movhlps %xmm0, %xmm1
  3292. movhlps %xmm2, %xmm3
  3293. addps %xmm1, %xmm0
  3294. addps %xmm3, %xmm2
  3295. movlhps %xmm2, %xmm0
  3296. #endif
  3297. pshufd $0xb1, %xmm0, %xmm1
  3298. #ifdef HAVE_SSE3
  3299. movddup ALPHA, %xmm15
  3300. #else
  3301. movsd ALPHA, %xmm15
  3302. pshufd $0x44, %xmm15, %xmm15
  3303. #endif
  3304. mulps %xmm15, %xmm0
  3305. mulps %xmm15, %xmm1
  3306. xorps %xmm5, %xmm0
  3307. #ifdef HAVE_SSE3
  3308. haddps %xmm1, %xmm0
  3309. #else
  3310. movaps %xmm0, %xmm2
  3311. shufps $0x88, %xmm1, %xmm0
  3312. shufps $0xdd, %xmm1, %xmm2
  3313. addps %xmm2, %xmm0
  3314. #endif
  3315. movsd (Y), %xmm12
  3316. addq INCY, Y
  3317. movhps (Y), %xmm12
  3318. addq INCY, Y
  3319. shufps $0xd8, %xmm0, %xmm0
  3320. addps %xmm12, %xmm0
  3321. movlps %xmm0, (Y1)
  3322. addq INCY, Y1
  3323. movhps %xmm0, (Y1)
  3324. addq INCY, Y1
  3325. cmpq $2, N
  3326. jge .L301
  3327. ALIGN_3
  3328. .L310:
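/* Final single column for this alignment case; the realignment loop is
   the same as .L213. */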
  3329. cmpq $1, N
  3330. jl .L999
  3331. leaq 32 * SIZE(BUFFER), X1
  3332. movq A, A1
  3333. xorps %xmm0, %xmm0
  3334. xorps %xmm1, %xmm1
  3335. #ifdef ALIGNED_ACCESS
  3336. cmpq M, MM
  3337. je .L31X
  3338. #ifdef movsd
  3339. xorps %xmm8, %xmm8
  3340. #endif
  3341. movsd -32 * SIZE(A1), %xmm8
  3342. #ifdef movsd
  3343. xorps %xmm12, %xmm12
  3344. #endif
  3345. movsd -32 * SIZE(X1), %xmm12
  3346. pshufd $0xb1, %xmm8, %xmm4
  3347. mulps %xmm12, %xmm8
  3348. addps %xmm8, %xmm0
  3349. mulps %xmm12, %xmm4
  3350. SUBPS %xmm4, %xmm1
  3351. addq $2 * SIZE, A1
  3352. addq $2 * SIZE, X1
  3353. ALIGN_3
  3354. .L31X:
  3355. #endif
  3356. movaps -33 * SIZE(A1), %xmm4
  3357. movaps -32 * SIZE(X1), %xmm12
  3358. movaps -28 * SIZE(X1), %xmm13
  3359. movq MM, I
  3360. sarq $3, I
  3361. jle .L315
  3362. movaps -29 * SIZE(A1), %xmm5
  3363. movaps -25 * SIZE(A1), %xmm6
  3364. movaps -21 * SIZE(A1), %xmm7
  3365. decq I
  3366. jle .L314
  3367. ALIGN_3
  3368. .L313:
  3369. #ifdef PREFETCH
  3370. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  3371. #endif
  3372. movss %xmm5, %xmm4
  3373. shufps $0x39, %xmm4, %xmm4
  3374. pshufd $0xb1, %xmm4, %xmm14
  3375. mulps %xmm12, %xmm4
  3376. addps %xmm4, %xmm0
  3377. movaps -17 * SIZE(A1), %xmm4
  3378. mulps %xmm12, %xmm14
  3379. movaps -24 * SIZE(X1), %xmm12
  3380. SUBPS %xmm14, %xmm1
  3381. movss %xmm6, %xmm5
  3382. shufps $0x39, %xmm5, %xmm5
  3383. pshufd $0xb1, %xmm5, %xmm15
  3384. mulps %xmm13, %xmm5
  3385. addps %xmm5, %xmm0
  3386. movaps -13 * SIZE(A1), %xmm5
  3387. mulps %xmm13, %xmm15
  3388. movaps -20 * SIZE(X1), %xmm13
  3389. SUBPS %xmm15, %xmm1
  3390. #ifdef PREFETCHW
  3391. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
  3392. #endif
  3393. movss %xmm7, %xmm6
  3394. shufps $0x39, %xmm6, %xmm6
  3395. pshufd $0xb1, %xmm6, %xmm14
  3396. mulps %xmm12, %xmm6
  3397. addps %xmm6, %xmm0
  3398. movaps -9 * SIZE(A1), %xmm6
  3399. mulps %xmm12, %xmm14
  3400. movaps -16 * SIZE(X1), %xmm12
  3401. SUBPS %xmm14, %xmm1
  3402. movss %xmm4, %xmm7
  3403. shufps $0x39, %xmm7, %xmm7
  3404. pshufd $0xb1, %xmm7, %xmm15
  3405. mulps %xmm13, %xmm7
  3406. addps %xmm7, %xmm0
  3407. movaps -5 * SIZE(A1), %xmm7
  3408. mulps %xmm13, %xmm15
  3409. movaps -12 * SIZE(X1), %xmm13
  3410. SUBPS %xmm15, %xmm1
  3411. subq $-16 * SIZE, A1
  3412. subq $-16 * SIZE, X1
  3413. subq $1, I
  3414. BRANCH
  3415. jg .L313
  3416. ALIGN_3
.L314:
	movss %xmm5, %xmm4
	shufps $0x39, %xmm4, %xmm4
	pshufd $0xb1, %xmm4, %xmm14
	mulps %xmm12, %xmm4
	addps %xmm4, %xmm0
	movaps -17 * SIZE(A1), %xmm4
	mulps %xmm12, %xmm14
	movaps -24 * SIZE(X1), %xmm12
	SUBPS %xmm14, %xmm1
	movss %xmm6, %xmm5
	shufps $0x39, %xmm5, %xmm5
	pshufd $0xb1, %xmm5, %xmm15
	mulps %xmm13, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm13, %xmm15
	movaps -20 * SIZE(X1), %xmm13
	SUBPS %xmm15, %xmm1
	movss %xmm7, %xmm6
	shufps $0x39, %xmm6, %xmm6
	pshufd $0xb1, %xmm6, %xmm14
	mulps %xmm12, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm12, %xmm14
	movaps -16 * SIZE(X1), %xmm12
	SUBPS %xmm14, %xmm1
	movss %xmm4, %xmm7
	shufps $0x39, %xmm7, %xmm7
	pshufd $0xb1, %xmm7, %xmm15
	mulps %xmm13, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm13, %xmm15
	movaps -12 * SIZE(X1), %xmm13
	SUBPS %xmm15, %xmm1
	subq $-16 * SIZE, A1
	subq $-16 * SIZE, X1
	ALIGN_3
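/* Remainder: four leftover complex elements. */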
.L315:
	testq $4, MM
	je .L317
	movaps -29 * SIZE(A1), %xmm5
	movaps -25 * SIZE(A1), %xmm6
	movss %xmm5, %xmm4
	shufps $0x39, %xmm4, %xmm4
	pshufd $0xb1, %xmm4, %xmm14
	mulps %xmm12, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm12, %xmm14
	SUBPS %xmm14, %xmm1
	movss %xmm6, %xmm5
	shufps $0x39, %xmm5, %xmm5
	pshufd $0xb1, %xmm5, %xmm15
	mulps %xmm13, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm13, %xmm15
	SUBPS %xmm15, %xmm1
	movaps -24 * SIZE(X1), %xmm12
	movaps -20 * SIZE(X1), %xmm13
	movaps %xmm6, %xmm4
	addq $8 * SIZE, A1
	ALIGN_3
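/* Remainder: two leftover complex elements. */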
.L317:
	testq $2, MM
	je .L318
	movaps -29 * SIZE(A1), %xmm5
	movss %xmm5, %xmm4
	shufps $0x39, %xmm4, %xmm4
	pshufd $0xb1, %xmm4, %xmm14
	mulps %xmm12, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm12, %xmm14
	SUBPS %xmm14, %xmm1
	movaps %xmm13, %xmm12
	addq $4 * SIZE, A1
	ALIGN_3
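/* Remainder: one leftover complex element, loaded with movsd. */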
.L318:
	testq $1, MM
	je .L319
#ifdef movsd
	xorps %xmm8, %xmm8
#endif
	movsd -32 * SIZE(A1), %xmm8
	pshufd $0xb1, %xmm8, %xmm4
	mulps %xmm12, %xmm8
	addps %xmm8, %xmm0
	mulps %xmm12, %xmm4
	SUBPS %xmm4, %xmm1
	ALIGN_3
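/* Reduce the two accumulators to a single complex value, apply the sign
   mask selected by CONJ/XCONJ, scale by ALPHA, and add the result into Y. */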
.L319:
	pcmpeqb %xmm5, %xmm5
	psllq $63, %xmm5
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
	xorps %xmm5, %xmm0
#else
	xorps %xmm5, %xmm1
#endif
#ifdef HAVE_SSE3
	haddps %xmm1, %xmm0
	haddps %xmm0, %xmm0
#else
	movaps %xmm0, %xmm8
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm8
	addps %xmm8, %xmm0
	movhlps %xmm0, %xmm1
	addps %xmm1, %xmm0
#endif
	pshufd $0xb1, %xmm0, %xmm1
#ifdef HAVE_SSE3
	movddup ALPHA, %xmm15
#else
	movsd ALPHA, %xmm15
	pshufd $0x44, %xmm15, %xmm15
#endif
	mulps %xmm15, %xmm0
	mulps %xmm15, %xmm1
	xorps %xmm5, %xmm0
#ifdef HAVE_SSE3
	haddps %xmm1, %xmm0
#else
	movaps %xmm0, %xmm2
	shufps $0x88, %xmm1, %xmm0
	shufps $0xdd, %xmm1, %xmm2
	addps %xmm2, %xmm0
#endif
	movsd (Y), %xmm12
	addq INCY, Y
	shufps $0xd8, %xmm0, %xmm0
	addps %xmm12, %xmm0
	movlps %xmm0, (Y1)
	addq INCY, Y1
#endif
	ALIGN_3
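/* Advance AA past the M complex elements of this block and continue with
   the next block at .L0t. */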
.L999:
	movq M, I
	salq $ZBASE_SHIFT, I
	addq I, AA
	jmp .L0t
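/* Epilogue: restore callee-saved registers (plus %rdi/%rsi and xmm6-xmm15
   under WINDOWS_ABI), release the stack frame, and return. */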
.L999x:
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif
	addq $STACKSIZE, %rsp
	ret
	EPILOGUE