You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_RT_4x4_sse3.S 76 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases for the kernel arguments. Without WINDOWS_ABI these  */
  /* are used directly as they arrive; with WINDOWS_ABI the prologue fills */
  /* M/N/K from ARG1..ARG3 and A/B/C from the OLD_A/OLD_B/OLD_C slots.     */
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  /* LDC is loaded from OLD_LDC in the prologue and then scaled by SIZE.   */
  46. #define LDC %r10
  /* Working registers. I is the row-block loop counter (m >> 2). AO/BO    */
  /* are the running pointers into A and B, CO1 the current pointer into   */
  /* C. KK is initialised from OLD_OFFSET. J and CO2 are not referenced in */
  /* the part of the file reviewed here -- presumably the column-loop      */
  /* counter and a second C pointer; verify against the rest of the file.  */
  47. #define I %r11
  48. #define J %r12
  49. #define AO %r13
  50. #define BO %r14
  51. #define CO1 %r15
  52. #define CO2 %rbx
  53. #define KK %rbp
  /* Stack frame layout. STACKSIZE bytes are subtracted from %rsp in the   */
  /* prologue, so the OLD_* operands address caller-provided stack         */
  /* arguments located above the new frame. OFFSET, KKK and AORIG are      */
  /* scratch slots inside the frame, placed after the slots the prologue   */
  /* uses to save registers (0..40 here; 0..208 plus xmm6-xmm15 under      */
  /* WINDOWS_ABI).                                                         */
  54. #ifndef WINDOWS_ABI
  55. #define STACKSIZE 128
  56. #define OLD_LDC 8 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  58. #define OFFSET 48(%rsp)
  59. #define KKK 56(%rsp)
  60. #define AORIG 64(%rsp)
  61. #else
  62. #define STACKSIZE 272
  63. #define OLD_A 40 + STACKSIZE(%rsp)
  64. #define OLD_B 48 + STACKSIZE(%rsp)
  65. #define OLD_C 56 + STACKSIZE(%rsp)
  66. #define OLD_LDC 64 + STACKSIZE(%rsp)
  67. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  68. #define OFFSET 224(%rsp)
  69. #define KKK 232(%rsp)
  70. #define AORIG 240(%rsp)
  71. #endif
  /* Prefetch tuning. PREFETCH is the instruction used; PREFETCHSIZE is    */
  /* the look-ahead distance, multiplied by SIZE where it is used on AO.   */
  /* PREFETCH_R is not referenced in the part of the file reviewed here.   */
  72. #define PREFETCH prefetcht1
  73. #define PREFETCHSIZE (16 * 12 + 3)
  74. #define PREFETCH_R (4 * 4 + 0)
  /* KERNEL1(address): multiply-accumulate step into xmm0..xmm3.           */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm8 and xmm9 are consumed as-is (the macro does not  */
  /* load them first). It accumulates xmm8 * xmm9 into xmm0, then          */
  /* xmm8 * dup(BO[1]), dup(BO[2]), dup(BO[3]) into xmm1, xmm2, xmm3, and  */
  /* issues a PREFETCH on AO at distance PREFETCHSIZE. On exit             */
  /* xmm8 = AO[2..3] and xmm9 = dup(BO[0]).                                */
  75. #define KERNEL1(address) \
  76. mulpd %xmm8, %xmm9 ;\
  77. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\
  78. addpd %xmm9, %xmm0;\
  79. movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  80. mulpd %xmm8, %xmm9;\
  81. addpd %xmm9, %xmm1;\
  82. movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  83. mulpd %xmm8, %xmm9;\
  84. addpd %xmm9, %xmm2;\
  85. movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  86. mulpd %xmm8, %xmm9;\
  87. movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  88. addpd %xmm9, %xmm3;\
  89. movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /* KERNEL2(address): multiply-accumulate step into xmm4..xmm7.           */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm8 and xmm9 are consumed as-is. It accumulates      */
  /* xmm8 * xmm9 into xmm4, then xmm8 * dup(BO[1]), dup(BO[2]), dup(BO[3]) */
  /* into xmm5, xmm6, xmm7. On exit xmm8 = AO[4..5] and xmm9 = dup(BO[4]). */
  /* Presumably invoked right after KERNEL1 with the same address -- that  */
  /* ordering is not visible in this part of the file.                     */
  90. #define KERNEL2(address) \
  91. mulpd %xmm8, %xmm9;\
  92. addpd %xmm9, %xmm4;\
  93. movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  94. mulpd %xmm8, %xmm9;\
  95. addpd %xmm9, %xmm5;\
  96. movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  97. mulpd %xmm8, %xmm9;\
  98. addpd %xmm9, %xmm6;\
  99. movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  100. mulpd %xmm8, %xmm9;\
  101. movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  102. addpd %xmm9, %xmm7;\
  103. movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /* KERNEL3(address): multiply-accumulate step into xmm0..xmm3.           */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm8 and xmm9 are consumed as-is. It accumulates      */
  /* xmm8 * xmm9 into xmm0, then xmm8 * dup(BO[5]), dup(BO[6]), dup(BO[7]) */
  /* into xmm1, xmm2, xmm3. On exit xmm8 = AO[6..7] and xmm9 = dup(BO[4]). */
  104. #define KERNEL3(address) \
  105. mulpd %xmm8, %xmm9;\
  106. addpd %xmm9, %xmm0;\
  107. movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  108. mulpd %xmm8, %xmm9;\
  109. addpd %xmm9, %xmm1;\
  110. movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  111. mulpd %xmm8, %xmm9;\
  112. addpd %xmm9, %xmm2;\
  113. movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  114. mulpd %xmm8, %xmm9;\
  115. movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  116. addpd %xmm9, %xmm3;\
  117. movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /* KERNEL4(address): multiply-accumulate step into xmm4..xmm7.           */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm8 and xmm9 are consumed as-is. It accumulates      */
  /* xmm8 * xmm9 into xmm4, then xmm8 * dup(BO[5]), dup(BO[6]), dup(BO[7]) */
  /* into xmm5, xmm6, xmm7. On exit xmm8 = AO[32..33] and                  */
  /* xmm9 = dup(BO[32]), i.e. the xmm8/xmm9 pair is reloaded from 32       */
  /* elements further on -- presumably for the next unrolled pass.         */
  118. #define KERNEL4(address) \
  119. mulpd %xmm8, %xmm9;\
  120. addpd %xmm9, %xmm4;\
  121. movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  122. mulpd %xmm8, %xmm9;\
  123. addpd %xmm9, %xmm5;\
  124. movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  125. mulpd %xmm8, %xmm9;\
  126. addpd %xmm9, %xmm6;\
  127. movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
  128. mulpd %xmm8, %xmm9;\
  129. movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
  130. addpd %xmm9, %xmm7;\
  131. movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9
  /* KERNEL5(address): multiply-accumulate step into xmm0..xmm3 using the  */
  /* xmm10/xmm11 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm10 and xmm11 are consumed as-is. It accumulates    */
  /* xmm10 * xmm11 into xmm0, then xmm10 * dup(BO[9]), dup(BO[10]),        */
  /* dup(BO[11]) into xmm1, xmm2, xmm3. On exit xmm10 = AO[10..11] and     */
  /* xmm11 = dup(BO[8]).                                                   */
  132. #define KERNEL5(address) \
  133. mulpd %xmm10, %xmm11;\
  134. addpd %xmm11, %xmm0;\
  135. movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  136. mulpd %xmm10, %xmm11;\
  137. addpd %xmm11, %xmm1;\
  138. movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  139. mulpd %xmm10, %xmm11;\
  140. addpd %xmm11, %xmm2;\
  141. movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  142. mulpd %xmm10, %xmm11;\
  143. movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  144. addpd %xmm11, %xmm3;\
  145. movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /* KERNEL6(address): multiply-accumulate step into xmm4..xmm7 using the  */
  /* xmm10/xmm11 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm10 and xmm11 are consumed as-is. It accumulates    */
  /* xmm10 * xmm11 into xmm4, then xmm10 * dup(BO[9]), dup(BO[10]),        */
  /* dup(BO[11]) into xmm5, xmm6, xmm7. On exit xmm10 = AO[12..13] and     */
  /* xmm11 = dup(BO[12]).                                                  */
  146. #define KERNEL6(address) \
  147. mulpd %xmm10, %xmm11;\
  148. addpd %xmm11, %xmm4;\
  149. movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  150. mulpd %xmm10, %xmm11;\
  151. addpd %xmm11, %xmm5;\
  152. movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  153. mulpd %xmm10, %xmm11;\
  154. addpd %xmm11, %xmm6;\
  155. movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  156. mulpd %xmm10, %xmm11;\
  157. movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  158. addpd %xmm11, %xmm7;\
  159. movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /* KERNEL7(address): multiply-accumulate step into xmm0..xmm3 using the  */
  /* xmm10/xmm11 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm10 and xmm11 are consumed as-is. It accumulates    */
  /* xmm10 * xmm11 into xmm0, then xmm10 * dup(BO[13]), dup(BO[14]),       */
  /* dup(BO[15]) into xmm1, xmm2, xmm3. On exit xmm10 = AO[14..15] and     */
  /* xmm11 = dup(BO[12]).                                                  */
  160. #define KERNEL7(address) \
  161. mulpd %xmm10, %xmm11;\
  162. addpd %xmm11, %xmm0;\
  163. movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  164. mulpd %xmm10, %xmm11;\
  165. addpd %xmm11, %xmm1;\
  166. movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  167. mulpd %xmm10, %xmm11;\
  168. addpd %xmm11, %xmm2;\
  169. movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  170. mulpd %xmm10, %xmm11;\
  171. movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  172. addpd %xmm11, %xmm3;\
  173. movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /* KERNEL8(address): multiply-accumulate step into xmm4..xmm7 using the  */
  /* xmm10/xmm11 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm10 and xmm11 are consumed as-is. It accumulates    */
  /* xmm10 * xmm11 into xmm4, then xmm10 * dup(BO[13]), dup(BO[14]),       */
  /* dup(BO[15]) into xmm5, xmm6, xmm7. On exit xmm10 = AO[40..41] and     */
  /* xmm11 = dup(BO[40]), i.e. the pair is reloaded from 32 elements past  */
  /* its starting offset of 8 -- presumably for the next unrolled pass.    */
  174. #define KERNEL8(address) \
  175. mulpd %xmm10, %xmm11;\
  176. addpd %xmm11, %xmm4;\
  177. movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  178. mulpd %xmm10, %xmm11;\
  179. addpd %xmm11, %xmm5;\
  180. movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  181. mulpd %xmm10, %xmm11;\
  182. addpd %xmm11, %xmm6;\
  183. movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
  184. mulpd %xmm10, %xmm11;\
  185. movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
  186. addpd %xmm11, %xmm7;\
  187. movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11
  /* KERNEL9(address): multiply-accumulate step into xmm0..xmm3 using the  */
  /* xmm12/xmm13 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm12 and xmm13 are consumed as-is. It accumulates    */
  /* xmm12 * xmm13 into xmm0, then xmm12 * dup(BO[17]), dup(BO[18]),       */
  /* dup(BO[19]) into xmm1, xmm2, xmm3, and issues a PREFETCH on AO at     */
  /* distance PREFETCHSIZE + 16. On exit xmm12 = AO[18..19] and            */
  /* xmm13 = dup(BO[16]).                                                  */
  188. #define KERNEL9(address) \
  189. mulpd %xmm12, %xmm13;\
  190. PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
  191. addpd %xmm13, %xmm0;\
  192. movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  193. mulpd %xmm12, %xmm13;\
  194. addpd %xmm13, %xmm1;\
  195. movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  196. mulpd %xmm12, %xmm13;\
  197. addpd %xmm13, %xmm2;\
  198. movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  199. mulpd %xmm12, %xmm13;\
  200. movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  201. addpd %xmm13, %xmm3;\
  202. movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /* KERNEL10(address): multiply-accumulate step into xmm4..xmm7 using the */
  /* xmm12/xmm13 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm12 and xmm13 are consumed as-is. It accumulates    */
  /* xmm12 * xmm13 into xmm4, then xmm12 * dup(BO[17]), dup(BO[18]),       */
  /* dup(BO[19]) into xmm5, xmm6, xmm7. On exit xmm12 = AO[20..21] and     */
  /* xmm13 = dup(BO[20]).                                                  */
  203. #define KERNEL10(address) \
  204. mulpd %xmm12, %xmm13;\
  205. addpd %xmm13, %xmm4;\
  206. movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  207. mulpd %xmm12, %xmm13;\
  208. addpd %xmm13, %xmm5;\
  209. movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  210. mulpd %xmm12, %xmm13;\
  211. addpd %xmm13, %xmm6;\
  212. movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  213. mulpd %xmm12, %xmm13;\
  214. movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  215. addpd %xmm13, %xmm7;\
  216. movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /* KERNEL11(address): multiply-accumulate step into xmm0..xmm3 using the */
  /* xmm12/xmm13 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm12 and xmm13 are consumed as-is. It accumulates    */
  /* xmm12 * xmm13 into xmm0, then xmm12 * dup(BO[21]), dup(BO[22]),       */
  /* dup(BO[23]) into xmm1, xmm2, xmm3. On exit xmm12 = AO[22..23] and     */
  /* xmm13 = dup(BO[20]).                                                  */
  217. #define KERNEL11(address) \
  218. mulpd %xmm12, %xmm13;\
  219. addpd %xmm13, %xmm0;\
  220. movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  221. mulpd %xmm12, %xmm13;\
  222. addpd %xmm13, %xmm1;\
  223. movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  224. mulpd %xmm12, %xmm13;\
  225. addpd %xmm13, %xmm2;\
  226. movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  227. mulpd %xmm12, %xmm13;\
  228. movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  229. addpd %xmm13, %xmm3;\
  230. movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /* KERNEL12(address): multiply-accumulate step into xmm4..xmm7 using the */
  /* xmm12/xmm13 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm12 and xmm13 are consumed as-is. It accumulates    */
  /* xmm12 * xmm13 into xmm4, then xmm12 * dup(BO[21]), dup(BO[22]),       */
  /* dup(BO[23]) into xmm5, xmm6, xmm7. On exit xmm12 = AO[48..49] and     */
  /* xmm13 = dup(BO[48]), i.e. the pair is reloaded from 32 elements past  */
  /* its starting offset of 16 -- presumably for the next unrolled pass.   */
  231. #define KERNEL12(address) \
  232. mulpd %xmm12, %xmm13;\
  233. addpd %xmm13, %xmm4;\
  234. movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  235. mulpd %xmm12, %xmm13;\
  236. addpd %xmm13, %xmm5;\
  237. movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  238. mulpd %xmm12, %xmm13;\
  239. addpd %xmm13, %xmm6;\
  240. movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
  241. mulpd %xmm12, %xmm13;\
  242. movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
  243. addpd %xmm13, %xmm7;\
  244. movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13
  /* KERNEL13(address): multiply-accumulate step into xmm0..xmm3 using the */
  /* xmm14/xmm15 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm14 and xmm15 are consumed as-is. It accumulates    */
  /* xmm14 * xmm15 into xmm0, then xmm14 * dup(BO[25]), dup(BO[26]),       */
  /* dup(BO[27]) into xmm1, xmm2, xmm3. On exit xmm14 = AO[26..27] and     */
  /* xmm15 = dup(BO[24]).                                                  */
  245. #define KERNEL13(address) \
  246. mulpd %xmm14, %xmm15;\
  247. addpd %xmm15, %xmm0;\
  248. movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  249. mulpd %xmm14, %xmm15;\
  250. addpd %xmm15, %xmm1;\
  251. movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  252. mulpd %xmm14, %xmm15;\
  253. addpd %xmm15, %xmm2;\
  254. movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  255. mulpd %xmm14, %xmm15;\
  256. movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  257. addpd %xmm15, %xmm3;\
  258. movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  /* KERNEL14(address): multiply-accumulate step into xmm4..xmm7 using the */
  /* xmm14/xmm15 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm14 and xmm15 are consumed as-is. It accumulates    */
  /* xmm14 * xmm15 into xmm4, then xmm14 * dup(BO[25]), dup(BO[26]),       */
  /* dup(BO[27]) into xmm5, xmm6, xmm7. On exit xmm14 = AO[28..29] and     */
  /* xmm15 = dup(BO[28]).                                                  */
  259. #define KERNEL14(address) \
  260. mulpd %xmm14, %xmm15;\
  261. addpd %xmm15, %xmm4;\
  262. movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  263. mulpd %xmm14, %xmm15;\
  264. addpd %xmm15, %xmm5;\
  265. movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  266. mulpd %xmm14, %xmm15;\
  267. addpd %xmm15, %xmm6;\
  268. movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  269. mulpd %xmm14, %xmm15;\
  270. movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  271. addpd %xmm15, %xmm7;\
  272. movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  /* KERNEL15(address): multiply-accumulate step into xmm0..xmm3 using the */
  /* xmm14/xmm15 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm14 and xmm15 are consumed as-is. It accumulates    */
  /* xmm14 * xmm15 into xmm0, then xmm14 * dup(BO[29]), dup(BO[30]),       */
  /* dup(BO[31]) into xmm1, xmm2, xmm3. On exit xmm14 = AO[30..31] and     */
  /* xmm15 = dup(BO[28]).                                                  */
  273. #define KERNEL15(address) \
  274. mulpd %xmm14, %xmm15;\
  275. addpd %xmm15, %xmm0;\
  276. movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  277. mulpd %xmm14, %xmm15;\
  278. addpd %xmm15, %xmm1;\
  279. movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  280. mulpd %xmm14, %xmm15;\
  281. addpd %xmm15, %xmm2;\
  282. movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  283. mulpd %xmm14, %xmm15;\
  284. movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  285. addpd %xmm15, %xmm3;\
  286. movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  /* KERNEL16(address): multiply-accumulate step into xmm4..xmm7 using the */
  /* xmm14/xmm15 operand pair.                                             */
  /* Notation: AO[n] / BO[n] mean n * SIZE + (address) * 2 * SIZE off AO   */
  /* or BO. On entry xmm14 and xmm15 are consumed as-is. It accumulates    */
  /* xmm14 * xmm15 into xmm4, then xmm14 * dup(BO[29]), dup(BO[30]),       */
  /* dup(BO[31]) into xmm5, xmm6, xmm7. On exit xmm14 = AO[56..57] and     */
  /* xmm15 = dup(BO[56]), i.e. the pair is reloaded from 32 elements past  */
  /* its starting offset of 24 -- presumably for the next unrolled pass.   */
  287. #define KERNEL16(address) \
  288. mulpd %xmm14, %xmm15;\
  289. addpd %xmm15, %xmm4;\
  290. movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  291. mulpd %xmm14, %xmm15;\
  292. addpd %xmm15, %xmm5;\
  293. movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  294. mulpd %xmm14, %xmm15;\
  295. addpd %xmm15, %xmm6;\
  296. movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
  297. mulpd %xmm14, %xmm15;\
  298. movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
  299. addpd %xmm15, %xmm7;\
  300. movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
  301. PROLOGUE
  302. PROFCODE
  303. subq $STACKSIZE, %rsp
  304. movq %rbx, 0(%rsp)
  305. movq %rbp, 8(%rsp)
  306. movq %r12, 16(%rsp)
  307. movq %r13, 24(%rsp)
  308. movq %r14, 32(%rsp)
  309. movq %r15, 40(%rsp)
  310. #ifdef WINDOWS_ABI
  311. movq %rdi, 48(%rsp)
  312. movq %rsi, 56(%rsp)
  313. movups %xmm6, 64(%rsp)
  314. movups %xmm7, 80(%rsp)
  315. movups %xmm8, 96(%rsp)
  316. movups %xmm9, 112(%rsp)
  317. movups %xmm10, 128(%rsp)
  318. movups %xmm11, 144(%rsp)
  319. movups %xmm12, 160(%rsp)
  320. movups %xmm13, 176(%rsp)
  321. movups %xmm14, 192(%rsp)
  322. movups %xmm15, 208(%rsp)
  323. movq ARG1, M
  324. movq ARG2, N
  325. movq ARG3, K
  326. movq OLD_A, A
  327. movq OLD_B, B
  328. movq OLD_C, C
  329. #endif
  330. movq OLD_LDC, LDC
  331. movq OLD_OFFSET, KK
  332. movq KK, OFFSET
  333. leaq (, LDC, SIZE), LDC
  334. #ifdef LN
  335. leaq (, M, SIZE), %rax
  336. addq %rax, C
  337. imulq K, %rax
  338. addq %rax, A
  339. #endif
  340. #ifdef RT
  341. leaq (, N, SIZE), %rax
  342. imulq K, %rax
  343. addq %rax, B
  344. movq N, %rax
  345. imulq LDC, %rax
  346. addq %rax, C
  347. #endif
  348. #ifdef RN
  349. negq KK
  350. #endif
  351. #ifdef RT
  352. movq N, %rax
  353. subq OFFSET, %rax
  354. movq %rax, KK
  355. #endif
  356. testq $1, N
  357. je .L80
  358. ALIGN_4
  359. #if defined(LT) || defined(RN)
  360. movq A, AO
  361. #else
  362. movq A, AORIG
  363. #endif
  364. #ifdef RT
  365. movq K, %rax
  366. salq $0 + BASE_SHIFT, %rax
  367. subq %rax, B
  368. subq LDC, C
  369. #endif
  370. movq C, CO1
  371. #ifndef RT
  372. addq LDC, C
  373. #endif
  374. #ifdef LN
  375. movq OFFSET, %rax
  376. addq M, %rax
  377. movq %rax, KK
  378. #endif
  379. #ifdef LT
  380. movq OFFSET, %rax
  381. movq %rax, KK
  382. #endif
  383. movq M, I
  384. sarq $2, I # i = (m >> 2)
  385. jle .L100
  386. ALIGN_4
  387. .L91:
  388. #ifdef LN
  389. movq K, %rax
  390. salq $2 + BASE_SHIFT, %rax
  391. subq %rax, AORIG
  392. #endif
  393. #if defined(LN) || defined(RT)
  394. movq KK, %rax
  395. leaq (, %rax, SIZE), %rax
  396. movq AORIG, AO
  397. leaq (AO, %rax, 4), AO
  398. leaq (B, %rax, 1), BO
  399. #else
  400. movq B, BO
  401. #endif
  402. movapd 0 * SIZE(AO), %xmm8
  403. pxor %xmm0, %xmm0
  404. movddup 0 * SIZE(BO), %xmm9
  405. pxor %xmm1, %xmm1
  406. movapd 8 * SIZE(AO), %xmm10
  407. pxor %xmm2, %xmm2
  408. movddup 4 * SIZE(BO), %xmm11
  409. pxor %xmm3, %xmm3
  410. #ifdef HAVE_3DNOW
  411. prefetchw 4 * SIZE(CO1)
  412. #else
  413. prefetchnta 4 * SIZE(CO1)
  414. #endif
  415. #if defined(LT) || defined(RN)
  416. movq KK, %rax
  417. #else
  418. movq K, %rax
  419. subq KK, %rax
  420. #endif
  421. sarq $3, %rax
  422. je .L95
  423. ALIGN_4
  424. .L92:
  425. mulpd %xmm9, %xmm8
  426. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  427. mulpd 2 * SIZE(AO), %xmm9
  428. addpd %xmm8, %xmm0
  429. movapd 4 * SIZE(AO), %xmm8
  430. addpd %xmm9, %xmm1
  431. movddup 1 * SIZE(BO), %xmm9
  432. mulpd %xmm9, %xmm8
  433. mulpd 6 * SIZE(AO), %xmm9
  434. addpd %xmm8, %xmm2
  435. movapd 16 * SIZE(AO), %xmm8
  436. addpd %xmm9, %xmm3
  437. movddup 2 * SIZE(BO), %xmm9
  438. mulpd %xmm9, %xmm10
  439. mulpd 10 * SIZE(AO), %xmm9
  440. addpd %xmm10, %xmm0
  441. movapd 12 * SIZE(AO), %xmm10
  442. addpd %xmm9, %xmm1
  443. movddup 3 * SIZE(BO), %xmm9
  444. mulpd %xmm9, %xmm10
  445. mulpd 14 * SIZE(AO), %xmm9
  446. addpd %xmm10, %xmm2
  447. movapd 24 * SIZE(AO), %xmm10
  448. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  449. addpd %xmm9, %xmm3
  450. movddup 8 * SIZE(BO), %xmm9
  451. mulpd %xmm11, %xmm8
  452. mulpd 18 * SIZE(AO), %xmm11
  453. addpd %xmm8, %xmm0
  454. movapd 20 * SIZE(AO), %xmm8
  455. addpd %xmm11, %xmm1
  456. movddup 5 * SIZE(BO), %xmm11
  457. mulpd %xmm11, %xmm8
  458. mulpd 22 * SIZE(AO), %xmm11
  459. addpd %xmm8, %xmm2
  460. movapd 32 * SIZE(AO), %xmm8
  461. addpd %xmm11, %xmm3
  462. movddup 6 * SIZE(BO), %xmm11
  463. mulpd %xmm11, %xmm10
  464. mulpd 26 * SIZE(AO), %xmm11
  465. addpd %xmm10, %xmm0
  466. movapd 28 * SIZE(AO), %xmm10
  467. addpd %xmm11, %xmm1
  468. movddup 7 * SIZE(BO), %xmm11
  469. mulpd %xmm11, %xmm10
  470. mulpd 30 * SIZE(AO), %xmm11
  471. addpd %xmm10, %xmm2
  472. movapd 40 * SIZE(AO), %xmm10
  473. addpd %xmm11, %xmm3
  474. movddup 12 * SIZE(BO), %xmm11
  475. addq $32 * SIZE, AO
  476. addq $8 * SIZE, BO
  477. decq %rax
  478. jne .L92
  479. ALIGN_4
  480. .L95:
  481. #if defined(LT) || defined(RN)
  482. movq KK, %rax
  483. #else
  484. movq K, %rax
  485. subq KK, %rax
  486. #endif
  487. andq $7, %rax # if (k & 1)
  488. BRANCH
  489. je .L99
  490. ALIGN_4
  491. .L96:
  492. mulpd %xmm9, %xmm8
  493. mulpd 2 * SIZE(AO), %xmm9
  494. addpd %xmm8, %xmm0
  495. movapd 4 * SIZE(AO), %xmm8
  496. addpd %xmm9, %xmm1
  497. movddup 1 * SIZE(BO), %xmm9
  498. addq $4 * SIZE, AO # aoffset += 4
  499. addq $1 * SIZE, BO # boffset1 += 8
  500. decq %rax
  501. jg .L96
  502. ALIGN_4
  503. .L99: # 4x1 tile: combine partial sums, solve, store, advance
  504. addpd %xmm2, %xmm0 # xmm0 = sums for tile elements 0-1
  505. addpd %xmm3, %xmm1 # xmm1 = sums for tile elements 2-3
  506. #if defined(LN) || defined(RT)
  507. movq KK, %rax
  508. #ifdef LN
  509. subq $4, %rax
  510. #else
  511. subq $1, %rax
  512. #endif
  513. leaq (, %rax, SIZE), %rax
  514. movq AORIG, AO
  515. leaq (AO, %rax, 4), AO # AO = AORIG + rax * 4
  516. leaq (B, %rax, 1), BO # BO = B + rax
  517. #endif
  518. #if defined(LN) || defined(LT)
  519. movapd 0 * SIZE(BO), %xmm2 # right-hand side is read from BO
  520. movapd 2 * SIZE(BO), %xmm3
  521. subpd %xmm0, %xmm2 # rhs - accumulated products
  522. subpd %xmm1, %xmm3
  523. #else
  524. movapd 0 * SIZE(AO), %xmm2 # right-hand side is read from AO
  525. movapd 2 * SIZE(AO), %xmm3
  526. subpd %xmm0, %xmm2 # rhs - accumulated products
  527. subpd %xmm1, %xmm3
  528. #endif
  529. #ifdef LN
  530. movapd %xmm2, %xmm0 # split into scalars: xmm2=e0, xmm0=e1, xmm3=e2, xmm1=e3
  531. unpckhpd %xmm0, %xmm0
  532. movapd %xmm3, %xmm1
  533. unpckhpd %xmm1, %xmm1
  534. movsd 15 * SIZE(AO), %xmm4 # e3 is scaled by AO[15] (multiply, not divide; presumably a pre-inverted diagonal - confirm)
  535. mulsd %xmm4, %xmm1
  536. movsd 14 * SIZE(AO), %xmm5 # then e3 is eliminated from e2, e1, e0
  537. mulsd %xmm1, %xmm5
  538. subsd %xmm5, %xmm3
  539. movsd 13 * SIZE(AO), %xmm6
  540. mulsd %xmm1, %xmm6
  541. subsd %xmm6, %xmm0
  542. movsd 12 * SIZE(AO), %xmm7
  543. mulsd %xmm1, %xmm7
  544. subsd %xmm7, %xmm2
  545. movsd 10 * SIZE(AO), %xmm4 # e2 scaled by AO[10], eliminated from e1, e0
  546. mulsd %xmm4, %xmm3
  547. movsd 9 * SIZE(AO), %xmm5
  548. mulsd %xmm3, %xmm5
  549. subsd %xmm5, %xmm0
  550. movsd 8 * SIZE(AO), %xmm6
  551. mulsd %xmm3, %xmm6
  552. subsd %xmm6, %xmm2
  553. movsd 5 * SIZE(AO), %xmm4 # e1 scaled by AO[5], eliminated from e0
  554. mulsd %xmm4, %xmm0
  555. movsd 4 * SIZE(AO), %xmm5
  556. mulsd %xmm0, %xmm5
  557. subsd %xmm5, %xmm2
  558. movsd 0 * SIZE(AO), %xmm4 # e0 scaled by AO[0]
  559. mulsd %xmm4, %xmm2
  560. unpcklpd %xmm0, %xmm2 # repack: xmm2 = (e0, e1)
  561. unpcklpd %xmm1, %xmm3 # repack: xmm3 = (e2, e3)
  562. #endif
  563. #ifdef LT
  564. movapd %xmm2, %xmm0 # split into scalars: xmm2=e0, xmm0=e1, xmm3=e2, xmm1=e3
  565. unpckhpd %xmm0, %xmm0
  566. movapd %xmm3, %xmm1
  567. unpckhpd %xmm1, %xmm1
  568. movsd 0 * SIZE(AO), %xmm4 # e0 scaled by AO[0], then eliminated from e1, e2, e3
  569. mulsd %xmm4, %xmm2
  570. movsd 1 * SIZE(AO), %xmm5
  571. mulsd %xmm2, %xmm5
  572. subsd %xmm5, %xmm0
  573. movsd 2 * SIZE(AO), %xmm6
  574. mulsd %xmm2, %xmm6
  575. subsd %xmm6, %xmm3
  576. movsd 3 * SIZE(AO), %xmm7
  577. mulsd %xmm2, %xmm7
  578. subsd %xmm7, %xmm1
  579. movsd 5 * SIZE(AO), %xmm4 # e1 scaled by AO[5], eliminated from e2, e3
  580. mulsd %xmm4, %xmm0
  581. movsd 6 * SIZE(AO), %xmm5
  582. mulsd %xmm0, %xmm5
  583. subsd %xmm5, %xmm3
  584. movsd 7 * SIZE(AO), %xmm6
  585. mulsd %xmm0, %xmm6
  586. subsd %xmm6, %xmm1
  587. movsd 10 * SIZE(AO), %xmm4 # e2 scaled by AO[10], eliminated from e3
  588. mulsd %xmm4, %xmm3
  589. movsd 11 * SIZE(AO), %xmm5
  590. mulsd %xmm3, %xmm5
  591. subsd %xmm5, %xmm1
  592. movsd 15 * SIZE(AO), %xmm4 # e3 scaled by AO[15]
  593. mulsd %xmm4, %xmm1
  594. unpcklpd %xmm0, %xmm2 # repack: xmm2 = (e0, e1)
  595. unpcklpd %xmm1, %xmm3 # repack: xmm3 = (e2, e3)
  596. #endif
  597. #ifdef RN
  598. movddup 0 * SIZE(BO), %xmm0 # single column: scale all four elements by BO[0]
  599. mulpd %xmm0, %xmm2
  600. mulpd %xmm0, %xmm3
  601. #endif
  602. #ifdef RT
  603. movddup 0 * SIZE(BO), %xmm0 # single column: scale all four elements by BO[0]
  604. mulpd %xmm0, %xmm2
  605. mulpd %xmm0, %xmm3
  606. #endif
  607. #ifdef LN
  608. subq $4 * SIZE, CO1 # LN moves the C pointer back before storing
  609. #endif
  610. #if defined(LN) || defined(LT)
  611. movsd %xmm2, 0 * SIZE(CO1) # store the four results to C (both branches are identical)
  612. movhpd %xmm2, 1 * SIZE(CO1)
  613. movsd %xmm3, 2 * SIZE(CO1)
  614. movhpd %xmm3, 3 * SIZE(CO1)
  615. #else
  616. movsd %xmm2, 0 * SIZE(CO1)
  617. movhpd %xmm2, 1 * SIZE(CO1)
  618. movsd %xmm3, 2 * SIZE(CO1)
  619. movhpd %xmm3, 3 * SIZE(CO1)
  620. #endif
  621. #if defined(LN) || defined(LT)
  622. movapd %xmm2, 0 * SIZE(BO) # write the results back over the rhs they were read from
  623. movapd %xmm3, 2 * SIZE(BO)
  624. #else
  625. movapd %xmm2, 0 * SIZE(AO)
  626. movapd %xmm3, 2 * SIZE(AO)
  627. #endif
  628. #ifndef LN
  629. addq $4 * SIZE, CO1
  630. #endif
  631. #if defined(LT) || defined(RN)
  632. movq K, %rax
  633. subq KK, %rax
  634. leaq (,%rax, SIZE), %rax
  635. leaq (AO, %rax, 4), AO # skip the remaining (K - KK) steps: AO += rax * 4
  636. leaq (BO, %rax, 1), BO # and BO += rax
  637. #endif
  638. #ifdef LN
  639. subq $4, KK
  640. #endif
  641. #ifdef LT
  642. addq $4, KK
  643. #endif
  644. #ifdef RT
  645. movq K, %rax
  646. salq $2 + BASE_SHIFT, %rax
  647. addq %rax, AORIG # AORIG += K << (2 + BASE_SHIFT)
  648. #endif
  649. decq I # i --
  650. jg .L91 # next 4-wide tile
  651. ALIGN_4
  652. .L100: # 2x1 tile, taken only when bit 1 of M is set
  653. testq $2, M
  654. je .L110
  655. ALIGN_4
  656. .L101: # set AO/BO, clear accumulators, preload operands
  657. #ifdef LN
  658. movq K, %rax
  659. salq $1 + BASE_SHIFT, %rax
  660. subq %rax, AORIG # AORIG -= K << (1 + BASE_SHIFT)
  661. #endif
  662. #if defined(LN) || defined(RT)
  663. movq KK, %rax
  664. leaq (, %rax, SIZE), %rax
  665. movq AORIG, AO
  666. leaq (AO, %rax, 2), AO # AO = AORIG + KK * SIZE * 2
  667. leaq (B, %rax, 1), BO # BO = B + KK * SIZE
  668. #else
  669. movq B, BO
  670. #endif
  671. movapd 0 * SIZE(AO), %xmm8 # A pair for k-step 0
  672. pxor %xmm0, %xmm0 # xmm0..xmm3 = zeroed accumulators
  673. movddup 0 * SIZE(BO), %xmm9 # duplicated B value for k-step 0
  674. pxor %xmm1, %xmm1
  675. movapd 8 * SIZE(AO), %xmm10 # A pair for k-step 4
  676. pxor %xmm2, %xmm2
  677. movddup 4 * SIZE(BO), %xmm11 # duplicated B value for k-step 4
  678. pxor %xmm3, %xmm3
  679. #if defined(LT) || defined(RN)
  680. movq KK, %rax
  681. #else
  682. movq K, %rax
  683. subq KK, %rax
  684. #endif
  685. sarq $3, %rax # rax = count / 8 passes of the unrolled loop
  686. je .L105
  687. ALIGN_4
  688. .L102: # 2x1 tile, 8 k-steps per pass, sums spread over xmm0..xmm3
  689. mulpd %xmm9, %xmm8 # k-step 0 (operands preloaded)
  690. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  691. movddup 1 * SIZE(BO), %xmm9
  692. addpd %xmm8, %xmm0
  693. mulpd 2 * SIZE(AO), %xmm9 # k-step 1
  694. movapd 16 * SIZE(AO), %xmm8 # preload A pair for k-step 0 of the next pass
  695. addpd %xmm9, %xmm1
  696. movddup 2 * SIZE(BO), %xmm9
  697. mulpd 4 * SIZE(AO), %xmm9 # k-step 2
  698. addpd %xmm9, %xmm2
  699. movddup 3 * SIZE(BO), %xmm9
  700. mulpd 6 * SIZE(AO), %xmm9 # k-step 3
  701. addpd %xmm9, %xmm3
  702. movddup 8 * SIZE(BO), %xmm9 # preload B value for k-step 0 of the next pass
  703. mulpd %xmm11, %xmm10 # k-step 4 (operands preloaded)
  704. movddup 5 * SIZE(BO), %xmm11
  705. addpd %xmm10, %xmm0
  706. mulpd 10 * SIZE(AO), %xmm11 # k-step 5
  707. movapd 24 * SIZE(AO), %xmm10 # preload A pair for k-step 4 of the next pass
  708. addpd %xmm11, %xmm1
  709. movddup 6 * SIZE(BO), %xmm11
  710. mulpd 12 * SIZE(AO), %xmm11 # k-step 6
  711. addpd %xmm11, %xmm2
  712. movddup 7 * SIZE(BO), %xmm11
  713. mulpd 14 * SIZE(AO), %xmm11 # k-step 7
  714. addpd %xmm11, %xmm3
  715. movddup 12 * SIZE(BO), %xmm11 # preload B value for k-step 4 of the next pass
  716. addq $16 * SIZE, AO # AO += 8 steps * 2 elements
  717. addq $ 8 * SIZE, BO # BO += 8 steps * 1 element
  718. decq %rax
  719. jne .L102
  720. ALIGN_4
  721. .L105: # leftover k-steps of the 2x1 tile after the unrolled loop .L102
  722. #if defined(LT) || defined(RN)
  723. movq KK, %rax
  724. #else
  725. movq K, %rax
  726. subq KK, %rax
  727. #endif
  728. andq $7, %rax # rax = count & 7
  729. BRANCH
  730. je .L109 # no leftover steps
  731. ALIGN_4
  732. .L106: # one k-step per pass; xmm8/xmm9 arrive preloaded
  733. mulpd %xmm9, %xmm8
  734. movddup 1 * SIZE(BO), %xmm9 # preload the duplicated B value for the next pass
  735. addpd %xmm8, %xmm0
  736. movapd 2 * SIZE(AO), %xmm8 # preload the A pair for the next pass
  737. addq $2 * SIZE, AO # AO += 2 * SIZE
  738. addq $1 * SIZE, BO # BO += 1 * SIZE
  739. decq %rax
  740. jg .L106
  741. ALIGN_4
  742. .L109: # 2x1 tile: combine partial sums, solve, store, advance
  743. addpd %xmm1, %xmm0 # fold the four accumulators into xmm0
  744. addpd %xmm3, %xmm2
  745. addpd %xmm2, %xmm0
  746. #if defined(LN) || defined(RT)
  747. movq KK, %rax
  748. #ifdef LN
  749. subq $2, %rax
  750. #else
  751. subq $1, %rax
  752. #endif
  753. leaq (, %rax, SIZE), %rax
  754. movq AORIG, AO
  755. leaq (AO, %rax, 2), AO # AO = AORIG + rax * 2
  756. leaq (B, %rax, 1), BO # BO = B + rax
  757. #endif
  758. #if defined(LN) || defined(LT)
  759. movapd 0 * SIZE(BO), %xmm2 # right-hand side is read from BO
  760. subpd %xmm0, %xmm2 # rhs - accumulated products
  761. #else
  762. movapd 0 * SIZE(AO), %xmm2 # right-hand side is read from AO
  763. subpd %xmm0, %xmm2 # rhs - accumulated products
  764. #endif
  765. #ifdef LN
  766. movapd %xmm2, %xmm0 # split into scalars: xmm2=e0, xmm0=e1
  767. unpckhpd %xmm0, %xmm0
  768. movsd 3 * SIZE(AO), %xmm4 # e1 scaled by AO[3] (multiply; presumably a pre-inverted diagonal - confirm)
  769. mulsd %xmm4, %xmm0
  770. movsd 2 * SIZE(AO), %xmm5 # eliminate e1 from e0
  771. mulsd %xmm0, %xmm5
  772. subsd %xmm5, %xmm2
  773. movsd 0 * SIZE(AO), %xmm4 # e0 scaled by AO[0]
  774. mulsd %xmm4, %xmm2
  775. unpcklpd %xmm0, %xmm2 # repack: xmm2 = (e0, e1)
  776. #endif
  777. #ifdef LT
  778. movapd %xmm2, %xmm0 # split into scalars: xmm2=e0, xmm0=e1
  779. unpckhpd %xmm0, %xmm0
  780. movsd 0 * SIZE(AO), %xmm4 # e0 scaled by AO[0]
  781. mulsd %xmm4, %xmm2
  782. movsd 1 * SIZE(AO), %xmm5 # eliminate e0 from e1
  783. mulsd %xmm2, %xmm5
  784. subsd %xmm5, %xmm0
  785. movsd 3 * SIZE(AO), %xmm4 # e1 scaled by AO[3]
  786. mulsd %xmm4, %xmm0
  787. unpcklpd %xmm0, %xmm2 # repack: xmm2 = (e0, e1)
  788. #endif
  789. #ifdef RN
  790. movddup 0 * SIZE(BO), %xmm0 # single column: scale both elements by BO[0]
  791. mulpd %xmm0, %xmm2
  792. #endif
  793. #ifdef RT
  794. movddup 0 * SIZE(BO), %xmm0 # single column: scale both elements by BO[0]
  795. mulpd %xmm0, %xmm2
  796. #endif
  797. #ifdef LN
  798. subq $2 * SIZE, CO1 # LN moves the C pointer back before storing
  799. #endif
  800. #if defined(LN) || defined(LT)
  801. movsd %xmm2, 0 * SIZE(CO1) # store both results to C (both branches are identical)
  802. movhpd %xmm2, 1 * SIZE(CO1)
  803. #else
  804. movsd %xmm2, 0 * SIZE(CO1)
  805. movhpd %xmm2, 1 * SIZE(CO1)
  806. #endif
  807. #if defined(LN) || defined(LT)
  808. movapd %xmm2, 0 * SIZE(BO) # write the results back over the rhs they were read from
  809. #else
  810. movapd %xmm2, 0 * SIZE(AO)
  811. #endif
  812. #ifndef LN
  813. addq $2 * SIZE, CO1
  814. #endif
  815. #if defined(LT) || defined(RN)
  816. movq K, %rax
  817. subq KK, %rax
  818. leaq (,%rax, SIZE), %rax
  819. leaq (AO, %rax, 2), AO # skip the remaining (K - KK) steps: AO += rax * 2
  820. leaq (BO, %rax, 1), BO # and BO += rax
  821. #endif
  822. #ifdef LN
  823. subq $2, KK
  824. #endif
  825. #ifdef LT
  826. addq $2, KK
  827. #endif
  828. #ifdef RT
  829. movq K, %rax
  830. salq $1 + BASE_SHIFT, %rax
  831. addq %rax, AORIG # AORIG += K << (1 + BASE_SHIFT)
  832. #endif
  833. ALIGN_4
  834. .L110: # 1x1 tile, taken only when bit 0 of M is set
  835. testq $1, M
  836. je .L119
  837. ALIGN_4
  838. .L111: # 1x1 tile: xmm0/xmm1 accumulate the sum of AO[k] * BO[k] over the k range
  839. #ifdef LN
  840. movq K, %rax
  841. salq $0 + BASE_SHIFT, %rax
  842. subq %rax, AORIG # AORIG -= K << BASE_SHIFT
  843. #endif
  844. #if defined(LN) || defined(RT)
  845. movq KK, %rax
  846. leaq (, %rax, SIZE), %rax
  847. movq AORIG, AO
  848. leaq (AO, %rax, 1), AO # AO = AORIG + KK * SIZE
  849. leaq (B, %rax, 1), BO # BO = B + KK * SIZE
  850. #else
  851. movq B, BO
  852. #endif
  853. movupd 0 * SIZE(AO), %xmm8 # both lanes (k = 0, 1): a movsd load zeroes the high lane and would drop the k = 1 product in .L112
  854. pxor %xmm0, %xmm0 # xmm0..xmm3 = zeroed accumulators
  855. movupd 0 * SIZE(BO), %xmm9 # movupd: no 16-byte alignment needed, this load also runs when .L112 is skipped
  856. pxor %xmm1, %xmm1
  857. movupd 4 * SIZE(AO), %xmm10 # both lanes (k = 4, 5), same reason as above
  858. pxor %xmm2, %xmm2
  859. movupd 4 * SIZE(BO), %xmm11
  860. pxor %xmm3, %xmm3
  861. #if defined(LT) || defined(RN)
  862. movq KK, %rax
  863. #else
  864. movq K, %rax
  865. subq KK, %rax
  866. #endif
  867. sarq $3, %rax # rax = count / 8 passes of the unrolled loop
  868. je .L115
  869. ALIGN_4
  870. .L112: # 8 k-steps per pass as four packed products; NOTE(review): movapd / mulpd-from-memory here need AO and BO 16-byte aligned - confirm
  871. mulpd %xmm9, %xmm8 # k = 0, 1 (operands preloaded)
  872. movapd 2 * SIZE(AO), %xmm9
  873. addpd %xmm8, %xmm0
  874. mulpd 2 * SIZE(BO), %xmm9 # k = 2, 3
  875. movapd 8 * SIZE(BO), %xmm8 # preload for the next pass: xmm8 = BO pair
  876. addpd %xmm9, %xmm1
  877. movapd 8 * SIZE(AO), %xmm9 # preload for the next pass: xmm9 = AO pair
  878. mulpd %xmm11, %xmm10 # k = 4, 5 (operands preloaded)
  879. movapd 6 * SIZE(AO), %xmm11
  880. addpd %xmm10, %xmm0
  881. mulpd 6 * SIZE(BO), %xmm11 # k = 6, 7
  882. movapd 12 * SIZE(BO), %xmm10
  883. addpd %xmm11, %xmm1
  884. movapd 12 * SIZE(AO), %xmm11
  885. addq $8 * SIZE, AO
  886. addq $8 * SIZE, BO
  887. decq %rax
  888. jne .L112
  889. ALIGN_4
  890. .L115: # leftover k-steps after the unrolled loop
  891. #if defined(LT) || defined(RN)
  892. movq KK, %rax
  893. #else
  894. movq K, %rax
  895. subq KK, %rax
  896. #endif
  897. andq $7, %rax # rax = count & 7
  898. BRANCH
  899. je .L118 # no leftover steps
  900. ALIGN_4
  901. .L116: # one scalar k-step per pass; loads its own operands so it is correct whether or not .L112 ran
  902. movsd 0 * SIZE(AO), %xmm9 # previously relied on a preloaded xmm9, which held BO[0] when .L112 was skipped
  903. mulsd 0 * SIZE(BO), %xmm9 # xmm9 = AO[0] * BO[0]
  904. addsd %xmm9, %xmm0
  905. addq $1 * SIZE, AO # AO += 1 * SIZE
  906. addq $1 * SIZE, BO # BO += 1 * SIZE
  907. decq %rax
  908. jg .L116
  909. ALIGN_4
  910. .L118: # 1x1 tile: reduce the sums to a scalar, solve, store, advance
  911. addpd %xmm1, %xmm0 # combine the two accumulators
  912. haddpd %xmm0, %xmm0 # horizontal add: each lane = low + high
  913. #if defined(LN) || defined(RT)
  914. movq KK, %rax
  915. #ifdef LN
  916. subq $1, %rax
  917. #else
  918. subq $1, %rax
  919. #endif
  920. leaq (, %rax, SIZE), %rax
  921. movq AORIG, AO
  922. leaq (AO, %rax, 1), AO # AO = AORIG + (KK - 1) * SIZE
  923. leaq (B, %rax, 1), BO # BO = B + (KK - 1) * SIZE
  924. #endif
  925. #if defined(LN) || defined(LT)
  926. movsd 0 * SIZE(BO), %xmm2 # right-hand side is read from BO
  927. subsd %xmm0, %xmm2 # rhs - accumulated products
  928. #else
  929. movsd 0 * SIZE(AO), %xmm2 # right-hand side is read from AO
  930. subsd %xmm0, %xmm2 # rhs - accumulated products
  931. #endif
  932. #ifdef LN
  933. movsd 0 * SIZE(AO), %xmm4 # scale by AO[0] (multiply; presumably a pre-inverted diagonal - confirm)
  934. mulsd %xmm4, %xmm2
  935. #endif
  936. #ifdef LT
  937. movsd 0 * SIZE(AO), %xmm4 # scale by AO[0]
  938. mulsd %xmm4, %xmm2
  939. #endif
  940. #ifdef RN
  941. movsd 0 * SIZE(BO), %xmm0 # scale by BO[0]
  942. mulsd %xmm0, %xmm2
  943. #endif
  944. #ifdef RT
  945. movsd 0 * SIZE(BO), %xmm0 # scale by BO[0]
  946. mulsd %xmm0, %xmm2
  947. #endif
  948. #ifdef LN
  949. subq $1 * SIZE, CO1 # LN moves the C pointer back before storing
  950. #endif
  951. #if defined(LN) || defined(LT)
  952. movsd %xmm2, 0 * SIZE(CO1) # store the result to C (both branches are identical)
  953. #else
  954. movsd %xmm2, 0 * SIZE(CO1)
  955. #endif
  956. #if defined(LN) || defined(LT)
  957. movsd %xmm2, 0 * SIZE(BO) # write the result back over the rhs it was read from
  958. #else
  959. movsd %xmm2, 0 * SIZE(AO)
  960. #endif
  961. #ifndef LN
  962. addq $1 * SIZE, CO1
  963. #endif
  964. #if defined(LT) || defined(RN)
  965. movq K, %rax
  966. subq KK, %rax
  967. leaq (,%rax, SIZE), %rax
  968. leaq (AO, %rax, 1), AO # skip the remaining (K - KK) steps in AO
  969. leaq (BO, %rax, 1), BO # and in BO
  970. #endif
  971. #ifdef LN
  972. subq $1, KK
  973. #endif
  974. #ifdef LT
  975. addq $1, KK
  976. #endif
  977. #ifdef RT
  978. movq K, %rax
  979. salq $0 + BASE_SHIFT, %rax
  980. addq %rax, AORIG # AORIG += K << BASE_SHIFT
  981. #endif
  982. ALIGN_4
  983. .L119: # end of the single-column pass: update B and KK
  984. #ifdef LN
  985. leaq (, K, SIZE), %rax
  986. leaq (B, %rax, 1), B # B += K * SIZE
  987. #endif
  988. #if defined(LT) || defined(RN)
  989. movq BO, B # B continues where BO stopped
  990. #endif
  991. #ifdef RN
  992. addq $1, KK
  993. #endif
  994. #ifdef RT
  995. subq $1, KK
  996. #endif
  997. ALIGN_2
  998. .L80: # two-column pass, taken only when bit 1 of N is set
  999. testq $2, N
  1000. je .L40
  1001. ALIGN_4
  1002. #if defined(LT) || defined(RN)
  1003. movq A, AO
  1004. #else
  1005. movq A, AORIG
  1006. #endif
  1007. #ifdef RT
  1008. movq K, %rax
  1009. salq $1 + BASE_SHIFT, %rax
  1010. subq %rax, B # B -= K << (1 + BASE_SHIFT)
  1011. leaq (, LDC, 2), %rax
  1012. subq %rax, C # C -= 2 * LDC
  1013. #endif
  1014. movq C, CO1 # CO1 = first output column
  1015. leaq (C, LDC, 1), CO2 # CO2 = C + LDC, second output column
  1016. #ifndef RT
  1017. leaq (C, LDC, 2), C # C += 2 * LDC
  1018. #endif
  1019. #ifdef LN
  1020. movq OFFSET, %rax
  1021. addq M, %rax
  1022. movq %rax, KK # KK = OFFSET + M
  1023. #endif
  1024. #ifdef LT
  1025. movq OFFSET, %rax
  1026. movq %rax, KK # KK = OFFSET
  1027. #endif
  1028. movq M, I
  1029. sarq $2, I # i = (m >> 2)
  1030. jle .L60 # no full 4-wide tiles
  1031. ALIGN_4
  1032. .L51: # 4x2 tile: set AO/BO, clear accumulators, preload operands
  1033. #ifdef LN
  1034. movq K, %rax
  1035. salq $2 + BASE_SHIFT, %rax
  1036. subq %rax, AORIG # AORIG -= K << (2 + BASE_SHIFT)
  1037. #endif
  1038. #if defined(LN) || defined(RT)
  1039. movq KK, %rax
  1040. leaq (, %rax, SIZE), %rax
  1041. movq AORIG, AO
  1042. leaq (AO, %rax, 4), AO # AO = AORIG + KK * SIZE * 4
  1043. leaq (B, %rax, 2), BO # BO = B + KK * SIZE * 2
  1044. #else
  1045. movq B, BO
  1046. #endif
  1047. movapd 0 * SIZE(AO), %xmm8 # A pair for k-step 0
  1048. pxor %xmm0, %xmm0 # xmm0, xmm1, xmm4, xmm5 = zeroed accumulators
  1049. movddup 0 * SIZE(BO), %xmm9 # duplicated B value for k-step 0
  1050. pxor %xmm1, %xmm1
  1051. movapd 8 * SIZE(AO), %xmm10 # A pair for k-step 2
  1052. pxor %xmm4, %xmm4
  1053. movddup 8 * SIZE(BO), %xmm11 # duplicated B value for k-step 4
  1054. pxor %xmm5, %xmm5
  1055. #ifdef HAVE_3DNOW
  1056. prefetchw 4 * SIZE(CO1) # prefetch near the C locations of this tile
  1057. prefetchw 4 * SIZE(CO2)
  1058. #else
  1059. prefetchnta 4 * SIZE(CO1) # prefetch near the C locations of this tile
  1060. prefetchnta 4 * SIZE(CO2)
  1061. #endif
  1062. #if defined(LT) || defined(RN)
  1063. movq KK, %rax
  1064. #else
  1065. movq K, %rax
  1066. subq KK, %rax
  1067. #endif
  1068. sarq $3, %rax # rax = count / 8 passes of the unrolled loop
  1069. je .L55
  1070. ALIGN_4
  1071. .L52: # 4x2 tile, 8 k-steps per pass; xmm0/xmm1 take the first A pair, xmm4/xmm5 the second
  1072. mulpd %xmm8, %xmm9 # k-step 0, first A pair
  1073. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1074. addpd %xmm9, %xmm0
  1075. movddup 1 * SIZE(BO), %xmm9
  1076. mulpd %xmm8, %xmm9
  1077. movapd 2 * SIZE(AO), %xmm8
  1078. addpd %xmm9, %xmm1
  1079. movddup 0 * SIZE(BO), %xmm9 # k-step 0, second A pair
  1080. mulpd %xmm8, %xmm9
  1081. addpd %xmm9, %xmm4
  1082. movddup 1 * SIZE(BO), %xmm9
  1083. mulpd %xmm8, %xmm9
  1084. movapd 4 * SIZE(AO), %xmm8
  1085. addpd %xmm9, %xmm5
  1086. movddup 2 * SIZE(BO), %xmm9 # k-step 1
  1087. mulpd %xmm8, %xmm9
  1088. addpd %xmm9, %xmm0
  1089. movddup 3 * SIZE(BO), %xmm9
  1090. mulpd %xmm8, %xmm9
  1091. movapd 6 * SIZE(AO), %xmm8
  1092. addpd %xmm9, %xmm1
  1093. movddup 2 * SIZE(BO), %xmm9
  1094. mulpd %xmm8, %xmm9
  1095. addpd %xmm9, %xmm4
  1096. movddup 3 * SIZE(BO), %xmm9
  1097. mulpd %xmm8, %xmm9
  1098. movapd 16 * SIZE(AO), %xmm8 # preload A pair for k-step 4
  1099. addpd %xmm9, %xmm5
  1100. movddup 4 * SIZE(BO), %xmm9 # k-step 2 (A pair preloaded in xmm10)
  1101. mulpd %xmm10, %xmm9
  1102. addpd %xmm9, %xmm0
  1103. movddup 5 * SIZE(BO), %xmm9
  1104. mulpd %xmm10, %xmm9
  1105. movapd 10 * SIZE(AO), %xmm10
  1106. addpd %xmm9, %xmm1
  1107. movddup 4 * SIZE(BO), %xmm9
  1108. mulpd %xmm10, %xmm9
  1109. addpd %xmm9, %xmm4
  1110. movddup 5 * SIZE(BO), %xmm9
  1111. mulpd %xmm10, %xmm9
  1112. movapd 12 * SIZE(AO), %xmm10
  1113. addpd %xmm9, %xmm5
  1114. movddup 6 * SIZE(BO), %xmm9 # k-step 3
  1115. mulpd %xmm10, %xmm9
  1116. addpd %xmm9, %xmm0
  1117. movddup 7 * SIZE(BO), %xmm9
  1118. mulpd %xmm10, %xmm9
  1119. movapd 14 * SIZE(AO), %xmm10
  1120. addpd %xmm9, %xmm1
  1121. movddup 6 * SIZE(BO), %xmm9
  1122. mulpd %xmm10, %xmm9
  1123. addpd %xmm9, %xmm4
  1124. movddup 7 * SIZE(BO), %xmm9
  1125. mulpd %xmm10, %xmm9
  1126. movapd 40 * SIZE(AO), %xmm10 # preload A pair for k-step 2 of the next pass
  1127. addpd %xmm9, %xmm5
  1128. movddup 16 * SIZE(BO), %xmm9 # preload B value for k-step 0 of the next pass
  1129. mulpd %xmm8, %xmm11 # k-step 4 (operands preloaded)
  1130. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  1131. addpd %xmm11, %xmm0
  1132. movddup 9 * SIZE(BO), %xmm11
  1133. mulpd %xmm8, %xmm11
  1134. movapd 18 * SIZE(AO), %xmm8
  1135. addpd %xmm11, %xmm1
  1136. movddup 8 * SIZE(BO), %xmm11
  1137. mulpd %xmm8, %xmm11
  1138. addpd %xmm11, %xmm4
  1139. movddup 9 * SIZE(BO), %xmm11
  1140. mulpd %xmm8, %xmm11
  1141. movapd 20 * SIZE(AO), %xmm8
  1142. addpd %xmm11, %xmm5
  1143. movddup 10 * SIZE(BO), %xmm11 # k-step 5
  1144. mulpd %xmm8, %xmm11
  1145. addpd %xmm11, %xmm0
  1146. movddup 11 * SIZE(BO), %xmm11
  1147. mulpd %xmm8, %xmm11
  1148. movapd 22 * SIZE(AO), %xmm8
  1149. addpd %xmm11, %xmm1
  1150. movddup 10 * SIZE(BO), %xmm11
  1151. mulpd %xmm8, %xmm11
  1152. addpd %xmm11, %xmm4
  1153. movddup 11 * SIZE(BO), %xmm11
  1154. mulpd %xmm8, %xmm11
  1155. movapd 24 * SIZE(AO), %xmm8
  1156. addpd %xmm11, %xmm5
  1157. movddup 12 * SIZE(BO), %xmm11 # k-step 6
  1158. mulpd %xmm8, %xmm11
  1159. addpd %xmm11, %xmm0
  1160. movddup 13 * SIZE(BO), %xmm11
  1161. mulpd %xmm8, %xmm11
  1162. movapd 26 * SIZE(AO), %xmm8
  1163. addpd %xmm11, %xmm1
  1164. movddup 12 * SIZE(BO), %xmm11
  1165. mulpd %xmm8, %xmm11
  1166. addpd %xmm11, %xmm4
  1167. movddup 13 * SIZE(BO), %xmm11
  1168. mulpd %xmm8, %xmm11
  1169. movapd 28 * SIZE(AO), %xmm8
  1170. addpd %xmm11, %xmm5
  1171. movddup 14 * SIZE(BO), %xmm11 # k-step 7
  1172. mulpd %xmm8, %xmm11
  1173. addpd %xmm11, %xmm0
  1174. movddup 15 * SIZE(BO), %xmm11
  1175. mulpd %xmm8, %xmm11
  1176. movapd 30 * SIZE(AO), %xmm8
  1177. addpd %xmm11, %xmm1
  1178. movddup 14 * SIZE(BO), %xmm11
  1179. mulpd %xmm8, %xmm11
  1180. addpd %xmm11, %xmm4
  1181. movddup 15 * SIZE(BO), %xmm11
  1182. mulpd %xmm8, %xmm11
  1183. movapd 32 * SIZE(AO), %xmm8 # preload A pair for k-step 0 of the next pass
  1184. addpd %xmm11, %xmm5
  1185. movddup 24 * SIZE(BO), %xmm11 # preload B value for k-step 4 of the next pass
  1186. addq $32 * SIZE, AO # AO += 8 steps * 4 elements
  1187. addq $16 * SIZE, BO # BO += 8 steps * 2 elements
  1188. decq %rax
  1189. jne .L52
  1190. ALIGN_4
  1191. .L55: # leftover k-steps of the 4x2 tile after the unrolled loop .L52
  1192. #if defined(LT) || defined(RN)
  1193. movq KK, %rax
  1194. #else
  1195. movq K, %rax
  1196. subq KK, %rax
  1197. #endif
  1198. andq $7, %rax # rax = count & 7
  1199. BRANCH
  1200. je .L59 # no leftover steps
  1201. ALIGN_4
  1202. .L56: # one k-step per pass; xmm8/xmm9 arrive preloaded
  1203. mulpd %xmm8, %xmm9
  1204. movapd 2 * SIZE(AO), %xmm10 # second A pair of this step
  1205. addpd %xmm9, %xmm0
  1206. movddup 1 * SIZE(BO), %xmm9
  1207. mulpd %xmm8, %xmm9
  1208. movddup 0 * SIZE(BO), %xmm11
  1209. addpd %xmm9, %xmm1
  1210. movddup 2 * SIZE(BO), %xmm9 # preload the duplicated B value for the next pass
  1211. mulpd %xmm10, %xmm11
  1212. movapd 4 * SIZE(AO), %xmm8 # preload the first A pair for the next pass
  1213. addpd %xmm11, %xmm4
  1214. movddup 1 * SIZE(BO), %xmm11
  1215. mulpd %xmm10, %xmm11
  1216. addpd %xmm11, %xmm5
  1217. addq $4 * SIZE, AO # AO += 4 * SIZE
  1218. addq $2 * SIZE, BO # BO += 2 * SIZE
  1219. decq %rax
  1220. jg .L56
  1221. ALIGN_4
  1222. .L59: # 4x2 tile: solve, store, advance
  1223. #if defined(LN) || defined(RT)
  1224. movq KK, %rax
  1225. #ifdef LN
  1226. subq $4, %rax
  1227. #else
  1228. subq $2, %rax
  1229. #endif
  1230. leaq (, %rax, SIZE), %rax
  1231. movq AORIG, AO
  1232. leaq (AO, %rax, 4), AO # AO = AORIG + rax * 4
  1233. leaq (B, %rax, 2), BO # BO = B + rax * 2
  1234. #endif
  1235. #if defined(LN) || defined(LT)
  1236. movapd %xmm0, %xmm8 # interleave so each register holds one tile row across both columns
  1237. unpcklpd %xmm1, %xmm0 # xmm0 = row 0
  1238. unpckhpd %xmm1, %xmm8 # xmm8 = row 1
  1239. movapd %xmm4, %xmm12
  1240. unpcklpd %xmm5, %xmm4 # xmm4 = row 2
  1241. unpckhpd %xmm5, %xmm12 # xmm12 = row 3
  1242. movapd 0 * SIZE(BO), %xmm1 # right-hand side is read from BO
  1243. movapd 2 * SIZE(BO), %xmm5
  1244. movapd 4 * SIZE(BO), %xmm9
  1245. movapd 6 * SIZE(BO), %xmm13
  1246. subpd %xmm0, %xmm1 # rhs - accumulated products; rows now live in xmm1, xmm5, xmm9, xmm13
  1247. subpd %xmm8, %xmm5
  1248. subpd %xmm4, %xmm9
  1249. subpd %xmm12, %xmm13
  1250. #else
  1251. movapd 0 * SIZE(AO), %xmm8 # right-hand side is read from AO
  1252. movapd 2 * SIZE(AO), %xmm9
  1253. movapd 4 * SIZE(AO), %xmm10
  1254. movapd 6 * SIZE(AO), %xmm11
  1255. subpd %xmm0, %xmm8 # xmm8/xmm9 = first column, xmm10/xmm11 = second column
  1256. subpd %xmm4, %xmm9
  1257. subpd %xmm1, %xmm10
  1258. subpd %xmm5, %xmm11
  1259. #endif
  1260. #ifdef LN
  1261. movddup 15 * SIZE(AO), %xmm0 # row 3 scaled by AO[15] (multiply; presumably a pre-inverted diagonal - confirm)
  1262. mulpd %xmm0, %xmm13
  1263. movddup 14 * SIZE(AO), %xmm2 # then row 3 is eliminated from rows 2, 1, 0
  1264. mulpd %xmm13, %xmm2
  1265. subpd %xmm2, %xmm9
  1266. movddup 13 * SIZE(AO), %xmm4
  1267. mulpd %xmm13, %xmm4
  1268. subpd %xmm4, %xmm5
  1269. movddup 12 * SIZE(AO), %xmm6
  1270. mulpd %xmm13, %xmm6
  1271. subpd %xmm6, %xmm1
  1272. movddup 10 * SIZE(AO), %xmm0 # row 2 scaled by AO[10], eliminated from rows 1, 0
  1273. mulpd %xmm0, %xmm9
  1274. movddup 9 * SIZE(AO), %xmm2
  1275. mulpd %xmm9, %xmm2
  1276. subpd %xmm2, %xmm5
  1277. movddup 8 * SIZE(AO), %xmm4
  1278. mulpd %xmm9, %xmm4
  1279. subpd %xmm4, %xmm1
  1280. movddup 5 * SIZE(AO), %xmm0 # row 1 scaled by AO[5], eliminated from row 0
  1281. mulpd %xmm0, %xmm5
  1282. movddup 4 * SIZE(AO), %xmm2
  1283. mulpd %xmm5, %xmm2
  1284. subpd %xmm2, %xmm1
  1285. movddup 0 * SIZE(AO), %xmm0 # row 0 scaled by AO[0]
  1286. mulpd %xmm0, %xmm1
  1287. #endif
  1288. #ifdef LT
  1289. movddup 0 * SIZE(AO), %xmm0 # row 0 scaled by AO[0], then eliminated from rows 1, 2, 3
  1290. mulpd %xmm0, %xmm1
  1291. movddup 1 * SIZE(AO), %xmm2
  1292. mulpd %xmm1, %xmm2
  1293. subpd %xmm2, %xmm5
  1294. movddup 2 * SIZE(AO), %xmm4
  1295. mulpd %xmm1, %xmm4
  1296. subpd %xmm4, %xmm9
  1297. movddup 3 * SIZE(AO), %xmm6
  1298. mulpd %xmm1, %xmm6
  1299. subpd %xmm6, %xmm13
  1300. movddup 5 * SIZE(AO), %xmm0 # row 1 scaled by AO[5], eliminated from rows 2, 3
  1301. mulpd %xmm0, %xmm5
  1302. movddup 6 * SIZE(AO), %xmm2
  1303. mulpd %xmm5, %xmm2
  1304. subpd %xmm2, %xmm9
  1305. movddup 7 * SIZE(AO), %xmm4
  1306. mulpd %xmm5, %xmm4
  1307. subpd %xmm4, %xmm13
  1308. movddup 10 * SIZE(AO), %xmm0 # row 2 scaled by AO[10], eliminated from row 3
  1309. mulpd %xmm0, %xmm9
  1310. movddup 11 * SIZE(AO), %xmm2
  1311. mulpd %xmm9, %xmm2
  1312. subpd %xmm2, %xmm13
  1313. movddup 15 * SIZE(AO), %xmm0 # row 3 scaled by AO[15]
  1314. mulpd %xmm0, %xmm13
  1315. #endif
  1316. #ifdef RN
  1317. movddup 0 * SIZE(BO), %xmm0 # first column scaled by BO[0]
  1318. mulpd %xmm0, %xmm8
  1319. mulpd %xmm0, %xmm9
  1320. movddup 1 * SIZE(BO), %xmm1 # first column times BO[1] is removed from the second column
  1321. mulpd %xmm8, %xmm1
  1322. subpd %xmm1, %xmm10
  1323. movddup 1 * SIZE(BO), %xmm1
  1324. mulpd %xmm9, %xmm1
  1325. subpd %xmm1, %xmm11
  1326. movddup 3 * SIZE(BO), %xmm0 # second column scaled by BO[3]
  1327. mulpd %xmm0, %xmm10
  1328. mulpd %xmm0, %xmm11
  1329. #endif
  1330. #ifdef RT
  1331. movddup 3 * SIZE(BO), %xmm0 # second column scaled by BO[3]
  1332. mulpd %xmm0, %xmm10
  1333. mulpd %xmm0, %xmm11
  1334. movddup 2 * SIZE(BO), %xmm1 # second column times BO[2] is removed from the first column
  1335. mulpd %xmm10, %xmm1
  1336. subpd %xmm1, %xmm8
  1337. movddup 2 * SIZE(BO), %xmm1
  1338. mulpd %xmm11, %xmm1
  1339. subpd %xmm1, %xmm9
  1340. movddup 0 * SIZE(BO), %xmm0 # first column scaled by BO[0]
  1341. mulpd %xmm0, %xmm8
  1342. mulpd %xmm0, %xmm9
  1343. #endif
  1344. #ifdef LN
  1345. subq $4 * SIZE, CO1 # LN moves the C pointers back before storing
  1346. subq $4 * SIZE, CO2
  1347. #endif
  1348. #if defined(LN) || defined(LT)
  1349. movsd %xmm1, 0 * SIZE(CO1) # row-per-register layout: low lanes go to CO1
  1350. movsd %xmm5, 1 * SIZE(CO1)
  1351. movsd %xmm9, 2 * SIZE(CO1)
  1352. movsd %xmm13, 3 * SIZE(CO1)
  1353. movhpd %xmm1, 0 * SIZE(CO2) # high lanes go to CO2
  1354. movhpd %xmm5, 1 * SIZE(CO2)
  1355. movhpd %xmm9, 2 * SIZE(CO2)
  1356. movhpd %xmm13, 3 * SIZE(CO2)
  1357. #else
  1358. movsd %xmm8, 0 * SIZE(CO1) # column-per-register layout: xmm8/xmm9 go to CO1
  1359. movhpd %xmm8, 1 * SIZE(CO1)
  1360. movsd %xmm9, 2 * SIZE(CO1)
  1361. movhpd %xmm9, 3 * SIZE(CO1)
  1362. movsd %xmm10, 0 * SIZE(CO2) # xmm10/xmm11 go to CO2
  1363. movhpd %xmm10, 1 * SIZE(CO2)
  1364. movsd %xmm11, 2 * SIZE(CO2)
  1365. movhpd %xmm11, 3 * SIZE(CO2)
  1366. #endif
  1367. #if defined(LN) || defined(LT)
  1368. movapd %xmm1, 0 * SIZE(BO) # write the results back over the rhs they were read from
  1369. movapd %xmm5, 2 * SIZE(BO)
  1370. movapd %xmm9, 4 * SIZE(BO)
  1371. movapd %xmm13, 6 * SIZE(BO)
  1372. #else
  1373. movapd %xmm8, 0 * SIZE(AO)
  1374. movapd %xmm9, 2 * SIZE(AO)
  1375. movapd %xmm10, 4 * SIZE(AO)
  1376. movapd %xmm11, 6 * SIZE(AO)
  1377. #endif
  1378. #ifndef LN
  1379. addq $4 * SIZE, CO1
  1380. addq $4 * SIZE, CO2
  1381. #endif
  1382. #if defined(LT) || defined(RN)
  1383. movq K, %rax
  1384. subq KK, %rax
  1385. leaq (,%rax, SIZE), %rax
  1386. leaq (AO, %rax, 4), AO # skip the remaining (K - KK) steps: AO += rax * 4
  1387. leaq (BO, %rax, 2), BO # and BO += rax * 2
  1388. #endif
  1389. #ifdef LN
  1390. subq $4, KK
  1391. #endif
  1392. #ifdef LT
  1393. addq $4, KK
  1394. #endif
  1395. #ifdef RT
  1396. movq K, %rax
  1397. salq $2 + BASE_SHIFT, %rax
  1398. addq %rax, AORIG # AORIG += K << (2 + BASE_SHIFT)
  1399. #endif
  1400. decq I # i --
  1401. jg .L51 # next 4-wide tile
  1402. ALIGN_4
  1403. .L60: # 2x2 tile, taken only when bit 1 of M is set
  1404. testq $2, M
  1405. je .L70
  1406. ALIGN_4
  1407. .L61: # set AO/BO, clear accumulators, preload operands
  1408. #ifdef LN
  1409. movq K, %rax
  1410. salq $1 + BASE_SHIFT, %rax
  1411. subq %rax, AORIG # AORIG -= K << (1 + BASE_SHIFT)
  1412. #endif
  1413. #if defined(LN) || defined(RT)
  1414. movq KK, %rax
  1415. leaq (, %rax, SIZE), %rax
  1416. movq AORIG, AO
  1417. leaq (AO, %rax, 2), AO # AO = AORIG + KK * SIZE * 2
  1418. leaq (B, %rax, 2), BO # BO = B + KK * SIZE * 2
  1419. #else
  1420. movq B, BO
  1421. #endif
  1422. movapd 0 * SIZE(AO), %xmm8 # A pair for k-step 0
  1423. pxor %xmm0, %xmm0 # xmm0..xmm3 = zeroed accumulators
  1424. movddup 0 * SIZE(BO), %xmm9 # duplicated B value for k-step 0
  1425. pxor %xmm1, %xmm1
  1426. movapd 8 * SIZE(AO), %xmm10 # A pair for k-step 4
  1427. pxor %xmm2, %xmm2
  1428. movddup 8 * SIZE(BO), %xmm11 # duplicated B value for k-step 4
  1429. pxor %xmm3, %xmm3
  1430. #if defined(LT) || defined(RN)
  1431. movq KK, %rax
  1432. #else
  1433. movq K, %rax
  1434. subq KK, %rax
  1435. #endif
  1436. sarq $3, %rax # rax = count / 8 passes of the unrolled loop
  1437. je .L65
  1438. ALIGN_4
  1439. .L62: # 2x2 tile, 8 k-steps per pass; even steps add to xmm0/xmm1, odd steps to xmm2/xmm3
  1440. mulpd %xmm8, %xmm9 # k-step 0 (operands preloaded)
  1441. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1442. addpd %xmm9, %xmm0
  1443. movddup 1 * SIZE(BO), %xmm9
  1444. mulpd %xmm8, %xmm9
  1445. movapd 2 * SIZE(AO), %xmm8
  1446. addpd %xmm9, %xmm1
  1447. movddup 2 * SIZE(BO), %xmm9 # k-step 1
  1448. mulpd %xmm8, %xmm9
  1449. addpd %xmm9, %xmm2
  1450. movddup 3 * SIZE(BO), %xmm9
  1451. mulpd %xmm8, %xmm9
  1452. movapd 4 * SIZE(AO), %xmm8
  1453. addpd %xmm9, %xmm3
  1454. movddup 4 * SIZE(BO), %xmm9 # k-step 2
  1455. mulpd %xmm8, %xmm9
  1456. addpd %xmm9, %xmm0
  1457. movddup 5 * SIZE(BO), %xmm9
  1458. mulpd %xmm8, %xmm9
  1459. movapd 6 * SIZE(AO), %xmm8
  1460. addpd %xmm9, %xmm1
  1461. movddup 6 * SIZE(BO), %xmm9 # k-step 3
  1462. mulpd %xmm8, %xmm9
  1463. addpd %xmm9, %xmm2
  1464. movddup 7 * SIZE(BO), %xmm9
  1465. mulpd %xmm8, %xmm9
  1466. movapd 16 * SIZE(AO), %xmm8 # preload A pair for k-step 0 of the next pass
  1467. addpd %xmm9, %xmm3
  1468. movddup 16 * SIZE(BO), %xmm9 # preload B value for k-step 0 of the next pass
  1469. mulpd %xmm10, %xmm11 # k-step 4 (operands preloaded)
  1470. addpd %xmm11, %xmm0
  1471. movddup 9 * SIZE(BO), %xmm11
  1472. mulpd %xmm10, %xmm11
  1473. movapd 10 * SIZE(AO), %xmm10
  1474. addpd %xmm11, %xmm1
  1475. movddup 10 * SIZE(BO), %xmm11 # k-step 5
  1476. mulpd %xmm10, %xmm11
  1477. addpd %xmm11, %xmm2
  1478. movddup 11 * SIZE(BO), %xmm11
  1479. mulpd %xmm10, %xmm11
  1480. movapd 12 * SIZE(AO), %xmm10
  1481. addpd %xmm11, %xmm3
  1482. movddup 12 * SIZE(BO), %xmm11 # k-step 6
  1483. mulpd %xmm10, %xmm11
  1484. addpd %xmm11, %xmm0
  1485. movddup 13 * SIZE(BO), %xmm11
  1486. mulpd %xmm10, %xmm11
  1487. movapd 14 * SIZE(AO), %xmm10
  1488. addpd %xmm11, %xmm1
  1489. movddup 14 * SIZE(BO), %xmm11 # k-step 7
  1490. mulpd %xmm10, %xmm11
  1491. addpd %xmm11, %xmm2
  1492. movddup 15 * SIZE(BO), %xmm11
  1493. mulpd %xmm10, %xmm11
  1494. movapd 24 * SIZE(AO), %xmm10 # preload A pair for k-step 4 of the next pass
  1495. addpd %xmm11, %xmm3
  1496. movddup 24 * SIZE(BO), %xmm11 # preload B value for k-step 4 of the next pass
  1497. addq $16 * SIZE, AO # AO += 8 steps * 2 elements
  1498. addq $16 * SIZE, BO # BO += 8 steps * 2 elements
  1499. decq %rax
  1500. jne .L62
  1501. ALIGN_4
  1502. .L65: # leftover k-steps of the 2x2 tile after the unrolled loop .L62
  1503. #if defined(LT) || defined(RN)
  1504. movq KK, %rax
  1505. #else
  1506. movq K, %rax
  1507. subq KK, %rax
  1508. #endif
  1509. andq $7, %rax # rax = count & 7
  1510. BRANCH
  1511. je .L69 # no leftover steps
  1512. ALIGN_4
  1513. .L66: # one k-step per pass; xmm8/xmm9 arrive preloaded
  1514. mulpd %xmm8, %xmm9
  1515. addpd %xmm9, %xmm0
  1516. movddup 1 * SIZE(BO), %xmm9
  1517. mulpd %xmm8, %xmm9
  1518. movapd 2 * SIZE(AO), %xmm8 # preload the A pair for the next pass
  1519. addpd %xmm9, %xmm1
  1520. movddup 2 * SIZE(BO), %xmm9 # preload the duplicated B value for the next pass
  1521. addq $2 * SIZE, AO # AO += 2 * SIZE
  1522. addq $2 * SIZE, BO # BO += 2 * SIZE
  1523. decq %rax
  1524. jg .L66
  1525. ALIGN_4
  1526. .L69: # 2x2 tile: combine partial sums, solve, store, advance
  1527. addpd %xmm2, %xmm0 # xmm0 = sums for the first column
  1528. addpd %xmm3, %xmm1 # xmm1 = sums for the second column
  1529. #if defined(LN) || defined(RT)
  1530. movq KK, %rax
  1531. #ifdef LN
  1532. subq $2, %rax
  1533. #else
  1534. subq $2, %rax
  1535. #endif
  1536. leaq (, %rax, SIZE), %rax
  1537. movq AORIG, AO
  1538. leaq (AO, %rax, 2), AO # AO = AORIG + (KK - 2) * SIZE * 2
  1539. leaq (B, %rax, 2), BO # BO = B + (KK - 2) * SIZE * 2
  1540. #endif
  1541. #if defined(LN) || defined(LT)
  1542. movapd %xmm0, %xmm8 # interleave so each register holds one tile row across both columns
  1543. unpcklpd %xmm1, %xmm0 # xmm0 = row 0
  1544. unpckhpd %xmm1, %xmm8 # xmm8 = row 1
  1545. movapd 0 * SIZE(BO), %xmm1 # right-hand side is read from BO
  1546. movapd 2 * SIZE(BO), %xmm5
  1547. subpd %xmm0, %xmm1 # rhs - accumulated products; rows now live in xmm1, xmm5
  1548. subpd %xmm8, %xmm5
  1549. #else
  1550. movapd 0 * SIZE(AO), %xmm8 # right-hand side is read from AO
  1551. movapd 2 * SIZE(AO), %xmm10
  1552. subpd %xmm0, %xmm8 # xmm8 = first column, xmm10 = second column
  1553. subpd %xmm1, %xmm10
  1554. #endif
  1555. #ifdef LN
  1556. movddup 3 * SIZE(AO), %xmm0 # row 1 scaled by AO[3] (multiply; presumably a pre-inverted diagonal - confirm)
  1557. mulpd %xmm0, %xmm5
  1558. movddup 2 * SIZE(AO), %xmm2 # eliminate row 1 from row 0
  1559. mulpd %xmm5, %xmm2
  1560. subpd %xmm2, %xmm1
  1561. movddup 0 * SIZE(AO), %xmm0 # row 0 scaled by AO[0]
  1562. mulpd %xmm0, %xmm1
  1563. #endif
  1564. #ifdef LT
  1565. movddup 0 * SIZE(AO), %xmm0 # row 0 scaled by AO[0]
  1566. mulpd %xmm0, %xmm1
  1567. movddup 1 * SIZE(AO), %xmm2 # eliminate row 0 from row 1
  1568. mulpd %xmm1, %xmm2
  1569. subpd %xmm2, %xmm5
  1570. movddup 3 * SIZE(AO), %xmm0 # row 1 scaled by AO[3]
  1571. mulpd %xmm0, %xmm5
  1572. #endif
  1573. #ifdef RN
  1574. movddup 0 * SIZE(BO), %xmm0 # first column scaled by BO[0]
  1575. mulpd %xmm0, %xmm8
  1576. movddup 1 * SIZE(BO), %xmm1 # first column times BO[1] is removed from the second column
  1577. mulpd %xmm8, %xmm1
  1578. subpd %xmm1, %xmm10
  1579. movddup 3 * SIZE(BO), %xmm0 # second column scaled by BO[3]
  1580. mulpd %xmm0, %xmm10
  1581. #endif
  1582. #ifdef RT
  1583. movddup 3 * SIZE(BO), %xmm0 # second column scaled by BO[3]
  1584. mulpd %xmm0, %xmm10
  1585. movddup 2 * SIZE(BO), %xmm1 # second column times BO[2] is removed from the first column
  1586. mulpd %xmm10, %xmm1
  1587. subpd %xmm1, %xmm8
  1588. movddup 0 * SIZE(BO), %xmm0 # first column scaled by BO[0]
  1589. mulpd %xmm0, %xmm8
  1590. #endif
  1591. #ifdef LN
  1592. subq $2 * SIZE, CO1 # LN moves the C pointers back before storing
  1593. subq $2 * SIZE, CO2
  1594. #endif
  1595. #if defined(LN) || defined(LT)
  1596. movsd %xmm1, 0 * SIZE(CO1) # row-per-register layout: low lanes go to CO1
  1597. movsd %xmm5, 1 * SIZE(CO1)
  1598. movhpd %xmm1, 0 * SIZE(CO2) # high lanes go to CO2
  1599. movhpd %xmm5, 1 * SIZE(CO2)
  1600. #else
  1601. movsd %xmm8, 0 * SIZE(CO1) # column-per-register layout: xmm8 goes to CO1
  1602. movhpd %xmm8, 1 * SIZE(CO1)
  1603. movsd %xmm10, 0 * SIZE(CO2) # xmm10 goes to CO2
  1604. movhpd %xmm10, 1 * SIZE(CO2)
  1605. #endif
  1606. #if defined(LN) || defined(LT)
  1607. movapd %xmm1, 0 * SIZE(BO) # write the results back over the rhs they were read from
  1608. movapd %xmm5, 2 * SIZE(BO)
  1609. #else
  1610. movapd %xmm8, 0 * SIZE(AO)
  1611. movapd %xmm10, 2 * SIZE(AO)
  1612. #endif
  1613. #ifndef LN
  1614. addq $2 * SIZE, CO1
  1615. addq $2 * SIZE, CO2
  1616. #endif
  1617. #if defined(LT) || defined(RN)
  1618. movq K, %rax
  1619. subq KK, %rax
  1620. leaq (,%rax, SIZE), %rax
  1621. leaq (AO, %rax, 2), AO # skip the remaining (K - KK) steps: AO += rax * 2
  1622. leaq (BO, %rax, 2), BO # and BO += rax * 2
  1623. #endif
  1624. #ifdef LN
  1625. subq $2, KK
  1626. #endif
  1627. #ifdef LT
  1628. addq $2, KK
  1629. #endif
  1630. #ifdef RT
  1631. movq K, %rax
  1632. salq $1 + BASE_SHIFT, %rax
  1633. addq %rax, AORIG # AORIG += K << (1 + BASE_SHIFT)
  1634. #endif
  1635. ALIGN_4
  1636. .L70: # 1x2 tile, taken only when bit 0 of M is set
  1637. testq $1, M
  1638. je .L79
  1639. ALIGN_4
  1640. .L71: # set AO/BO, clear accumulators, preload operands
  1641. #ifdef LN
  1642. movq K, %rax
  1643. salq $0 + BASE_SHIFT, %rax
  1644. subq %rax, AORIG # AORIG -= K << BASE_SHIFT
  1645. #endif
  1646. #if defined(LN) || defined(RT)
  1647. movq KK, %rax
  1648. leaq (, %rax, SIZE), %rax
  1649. movq AORIG, AO
  1650. leaq (AO, %rax, 1), AO # AO = AORIG + KK * SIZE
  1651. leaq (B, %rax, 2), BO # BO = B + KK * SIZE * 2
  1652. #else
  1653. movq B, BO
  1654. #endif
  1655. movddup 0 * SIZE(AO), %xmm8 # duplicated A value for k-step 0
  1656. pxor %xmm0, %xmm0 # xmm0..xmm3 = zeroed accumulators
  1657. movapd 0 * SIZE(BO), %xmm9 # B pair for k-step 0
  1658. pxor %xmm1, %xmm1
  1659. movddup 4 * SIZE(AO), %xmm10 # duplicated A value for k-step 4
  1660. pxor %xmm2, %xmm2
  1661. movapd 8 * SIZE(BO), %xmm11 # B pair for k-step 4
  1662. pxor %xmm3, %xmm3
  1663. #if defined(LT) || defined(RN)
  1664. movq KK, %rax
  1665. #else
  1666. movq K, %rax
  1667. subq KK, %rax
  1668. #endif
  1669. sarq $3, %rax # rax = count / 8 passes of the unrolled loop
  1670. je .L75
  1671. ALIGN_4
  1672. .L72: # 1x2 tile, 8 k-steps per pass, sums spread over xmm0..xmm3
  1673. mulpd %xmm8, %xmm9 # k-step 0 (operands preloaded)
  1674. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1675. movddup 1 * SIZE(AO), %xmm8
  1676. addpd %xmm9, %xmm0
  1677. mulpd 2 * SIZE(BO), %xmm8 # k-step 1
  1678. movapd 16 * SIZE(BO), %xmm9 # preload B pair for k-step 0 of the next pass
  1679. addpd %xmm8, %xmm1
  1680. movddup 2 * SIZE(AO), %xmm8
  1681. mulpd 4 * SIZE(BO), %xmm8 # k-step 2
  1682. addpd %xmm8, %xmm2
  1683. movddup 3 * SIZE(AO), %xmm8
  1684. mulpd 6 * SIZE(BO), %xmm8 # k-step 3
  1685. addpd %xmm8, %xmm3
  1686. movddup 8 * SIZE(AO), %xmm8 # preload A value for k-step 0 of the next pass
  1687. mulpd %xmm10, %xmm11 # k-step 4 (operands preloaded)
  1688. movddup 5 * SIZE(AO), %xmm10
  1689. addpd %xmm11, %xmm0
  1690. mulpd 10 * SIZE(BO), %xmm10 # k-step 5
  1691. movapd 24 * SIZE(BO), %xmm11 # preload B pair for k-step 4 of the next pass
  1692. addpd %xmm10, %xmm1
  1693. movddup 6 * SIZE(AO), %xmm10
  1694. mulpd 12 * SIZE(BO), %xmm10 # k-step 6
  1695. addpd %xmm10, %xmm2
  1696. movddup 7 * SIZE(AO), %xmm10
  1697. mulpd 14 * SIZE(BO), %xmm10 # k-step 7
  1698. addpd %xmm10, %xmm3
  1699. movddup 12 * SIZE(AO), %xmm10 # preload A value for k-step 4 of the next pass
  1700. addq $ 8 * SIZE, AO # AO += 8 steps * 1 element
  1701. addq $16 * SIZE, BO # BO += 8 steps * 2 elements
  1702. decq %rax
  1703. jne .L72
  1704. ALIGN_4
  1705. .L75: # leftover k-steps of the 1x2 tile after the unrolled loop .L72
  1706. #if defined(LT) || defined(RN)
  1707. movq KK, %rax
  1708. #else
  1709. movq K, %rax
  1710. subq KK, %rax
  1711. #endif
  1712. andq $7, %rax # rax = count & 7
  1713. BRANCH
  1714. je .L78 # no leftover steps
  1715. ALIGN_4
  1716. .L76: # one k-step per pass; xmm8/xmm9 arrive preloaded
  1717. mulpd %xmm8, %xmm9
  1718. movddup 1 * SIZE(AO), %xmm8 # preload the duplicated A value for the next pass
  1719. addpd %xmm9, %xmm0
  1720. movapd 2 * SIZE(BO), %xmm9 # preload the B pair for the next pass
  1721. addq $1 * SIZE, AO # AO += 1 * SIZE
  1722. addq $2 * SIZE, BO # BO += 2 * SIZE
  1723. decq %rax
  1724. jg .L76
  1725. ALIGN_4
  /* .L78: finish the 1x2 tile.
     Folds the four partial accumulators into xmm0, subtracts the result
     from the two stored values, applies the solve for the active variant
     (LN / LT / RN / RT), stores the two results to CO1 and CO2, writes them
     back into the packed buffer and advances the pointers and KK. */
  1726. .L78:
  /* xmm0 = xmm0 + xmm1 + xmm2 + xmm3 */
  1727. addpd %xmm1, %xmm0
  1728. addpd %xmm3, %xmm2
  1729. addpd %xmm2, %xmm0
  1730. #if defined(LN) || defined(RT)
  /* Re-derive AO/BO from AORIG/B at k index KK - 1 (LN) or KK - 2 (RT):
     one element of A and two elements of B per k index. */
  1731. movq KK, %rax
  1732. #ifdef LN
  1733. subq $1, %rax
  1734. #else
  1735. subq $2, %rax
  1736. #endif
  1737. leaq (, %rax, SIZE), %rax
  1738. movq AORIG, AO
  1739. leaq (AO, %rax, 1), AO
  1740. leaq (B, %rax, 2), BO
  1741. #endif
  /* xmm2 = stored pair - accumulated products; the pair is read from BO
     for LN/LT and from AO for RN/RT. */
  1742. #if defined(LN) || defined(LT)
  1743. movapd 0 * SIZE(BO), %xmm2
  1744. subpd %xmm0, %xmm2
  1745. #else
  1746. movapd 0 * SIZE(AO), %xmm2
  1747. subpd %xmm0, %xmm2
  1748. #endif
  /* LN and LT: scale both lanes by the element at AO[0].
     NOTE(review): a multiply is used where a triangular solve would divide,
     so the packed diagonal is presumably stored as a reciprocal -- confirm
     against the packing routine. */
  1749. #ifdef LN
  1750. movddup 0 * SIZE(AO), %xmm0
  1751. mulpd %xmm0, %xmm2
  1752. #endif
  1753. #ifdef LT
  1754. movddup 0 * SIZE(AO), %xmm0
  1755. mulpd %xmm0, %xmm2
  1756. #endif
  1757. #ifdef RN
  /* RN: 2x2 substitution using BO[0], BO[1], BO[3].
     low  = low * BO[0]
     high = (high - BO[1] * low) * BO[3]
     Diagonal entries BO[0] and BO[3] are multiplied (see note above). */
  1758. movapd %xmm2, %xmm0
  1759. unpckhpd %xmm0, %xmm0
  1760. movsd 0 * SIZE(BO), %xmm4
  1761. mulsd %xmm4, %xmm2
  1762. movsd 1 * SIZE(BO), %xmm5
  1763. mulsd %xmm2, %xmm5
  1764. subsd %xmm5, %xmm0
  1765. movsd 3 * SIZE(BO), %xmm4
  1766. mulsd %xmm4, %xmm0
  /* repack: xmm2 = { low, high } */
  1767. unpcklpd %xmm0, %xmm2
  1768. #endif
  1769. #ifdef RT
  /* RT: 2x2 substitution in the opposite order using BO[3], BO[2], BO[0].
     high = high * BO[3]
     low  = (low - BO[2] * high) * BO[0] */
  1770. movapd %xmm2, %xmm0
  1771. unpckhpd %xmm0, %xmm0
  1772. movsd 3 * SIZE(BO), %xmm4
  1773. mulsd %xmm4, %xmm0
  1774. movsd 2 * SIZE(BO), %xmm5
  1775. mulsd %xmm0, %xmm5
  1776. subsd %xmm5, %xmm2
  1777. movsd 0 * SIZE(BO), %xmm4
  1778. mulsd %xmm4, %xmm2
  /* repack: xmm2 = { low, high } */
  1779. unpcklpd %xmm0, %xmm2
  1780. #endif
  1781. #ifdef LN
  /* LN steps the C pointers back by one element before storing */
  1782. subq $1 * SIZE, CO1
  1783. subq $1 * SIZE, CO2
  1784. #endif
  /* Store: low lane to CO1, high lane to CO2 (both preprocessor branches
     contain the same two stores). */
  1785. #if defined(LN) || defined(LT)
  1786. movsd %xmm2, 0 * SIZE(CO1)
  1787. movhpd %xmm2, 0 * SIZE(CO2)
  1788. #else
  1789. movsd %xmm2, 0 * SIZE(CO1)
  1790. movhpd %xmm2, 0 * SIZE(CO2)
  1791. #endif
  /* Overwrite the stored pair with the solved values (BO for LN/LT,
     AO for RN/RT). */
  1792. #if defined(LN) || defined(LT)
  1793. movapd %xmm2, 0 * SIZE(BO)
  1794. #else
  1795. movapd %xmm2, 0 * SIZE(AO)
  1796. #endif
  1797. #ifndef LN
  /* every variant except LN steps the C pointers forward after storing */
  1798. addq $1 * SIZE, CO1
  1799. addq $1 * SIZE, CO2
  1800. #endif
  1801. #if defined(LT) || defined(RN)
  /* skip the remaining K - KK k indices: 1 element of A, 2 of B each */
  1802. movq K, %rax
  1803. subq KK, %rax
  1804. leaq (,%rax, SIZE), %rax
  1805. leaq (AO, %rax, 1), AO
  1806. leaq (BO, %rax, 2), BO
  1807. #endif
  1808. #ifdef LN
  1809. subq $1, KK
  1810. #endif
  1811. #ifdef LT
  1812. addq $1, KK
  1813. #endif
  1814. #ifdef RT
  /* AORIG += K << (0 + BASE_SHIFT) */
  1815. movq K, %rax
  1816. salq $0 + BASE_SHIFT, %rax
  1817. addq %rax, AORIG
  1818. #endif
  1819. ALIGN_4
  /* .L79: bookkeeping after the tiles that consume two B elements per
     k index: move B past that data and adjust KK by 2 where the variant
     requires it. */
  1820. .L79:
  1821. #ifdef LN
  /* B += 2 * K elements */
  1822. leaq (, K, SIZE), %rax
  1823. leaq (B, %rax, 2), B
  1824. #endif
  1825. #if defined(LT) || defined(RN)
  /* BO was already advanced by the tile code; adopt it as the new B */
  1826. movq BO, B
  1827. #endif
  1828. #ifdef RN
  1829. addq $2, KK
  1830. #endif
  1831. #ifdef RT
  1832. subq $2, KK
  1833. #endif
  1834. ALIGN_4
  /* .L40: set up the loop counter for the four-wide part: J = N >> 2.
     The jle tests the flags left by sarq, so J <= 0 goes straight to
     .L999 (defined outside this section). */
  1835. .L40:
  1836. movq N, J
  1837. sarq $2, J # j = (n >> 2)
  1838. jle .L999
  1839. ALIGN_4
  /* .L10: per-panel setup for the four-wide part.
     Initialises the A pointer, positions B and C (RT walks them backwards),
     derives CO1/CO2 from C, seeds KK for LN/LT and loads the row-tile
     counter I = M >> 2. */
  1840. .L10:
  1841. #if defined(LT) || defined(RN)
  1842. movq A, AO
  1843. #else
  1844. movq A, AORIG
  1845. #endif
  1846. #ifdef RT
  /* RT: B -= K << (2 + BASE_SHIFT), C -= 4 * LDC */
  1847. movq K, %rax
  1848. salq $2 + BASE_SHIFT, %rax
  1849. subq %rax, B
  1850. leaq (, LDC, 4), %rax
  1851. subq %rax, C
  1852. #endif
  /* CO1 = C, CO2 = C + LDC */
  1853. movq C, CO1
  1854. leaq (C, LDC, 1), CO2
  1855. #ifndef RT
  /* all other variants move C forward by 4 * LDC for the next panel */
  1856. leaq (C, LDC, 4), C
  1857. #endif
  1858. #ifdef LN
  /* LN: KK = OFFSET + M */
  1859. movq OFFSET, %rax
  1860. addq M, %rax
  1861. movq %rax, KK
  1862. #endif
  1863. #ifdef LT
  /* LT: KK = OFFSET */
  1864. movq OFFSET, %rax
  1865. movq %rax, KK
  1866. #endif
  /* I = M >> 2; when that is <= 0 there is no 4-row tile, go to .L20 */
  1867. movq M, I
  1868. sarq $2, I # i = (m >> 2)
  1869. jle .L20
  1870. ALIGN_4
  /* .L11: setup for one 4x4 tile.
     Positions AO/BO, preloads the first operands for the k-loop, clears the
     eight accumulators xmm0..xmm7, prefetches the four C locations that
     will be stored to, and loads the k trip count into %rax. */
  1871. .L11:
  1872. #ifdef LN
  /* LN: AORIG -= K << (2 + BASE_SHIFT) */
  1873. movq K, %rax
  1874. salq $2 + BASE_SHIFT, %rax
  1875. subq %rax, AORIG
  1876. #endif
  1877. #if defined(LN) || defined(RT)
  /* AO = AORIG + 4 * KK elements, BO = B + 4 * KK elements */
  1878. movq KK, %rax
  1879. leaq (, %rax, SIZE), %rax
  1880. movq AORIG, AO
  1881. leaq (AO, %rax, 4), AO
  1882. leaq (B, %rax, 4), BO
  1883. #else
  1884. movq B, BO
  1885. #endif
  /* Preload A pairs (movapd) and broadcast B elements (movddup) at element
     offsets 0, 8, 16 and 24, interleaved with zeroing the accumulators. */
  1886. movapd 0 * SIZE(AO), %xmm8
  1887. pxor %xmm0, %xmm0
  1888. movddup 0 * SIZE(BO), %xmm9
  1889. pxor %xmm1, %xmm1
  1890. movapd 8 * SIZE(AO), %xmm10
  1891. pxor %xmm2, %xmm2
  1892. movddup 8 * SIZE(BO), %xmm11
  1893. pxor %xmm3, %xmm3
  1894. movapd 16 * SIZE(AO), %xmm12
  1895. movddup 16 * SIZE(BO), %xmm13
  1896. movapd 24 * SIZE(AO), %xmm14
  1897. movddup 24 * SIZE(BO), %xmm15
  /* prefetch at CO1, CO2, CO1 + 2 * LDC and CO2 + 2 * LDC */
  1898. prefetchnta 4 * SIZE(CO1)
  1899. pxor %xmm4, %xmm4
  1900. prefetchnta 4 * SIZE(CO2)
  1901. pxor %xmm5, %xmm5
  1902. prefetchnta 4 * SIZE(CO1, LDC, 2)
  1903. pxor %xmm6, %xmm6
  1904. prefetchnta 4 * SIZE(CO2, LDC, 2)
  1905. pxor %xmm7, %xmm7
  /* trip count: KK for LT/RN, K - KK for LN/RT */
  1906. #if defined(LT) || defined(RN)
  1907. movq KK, %rax
  1908. #else
  1909. movq K, %rax
  1910. subq KK, %rax
  1911. #endif
  /* Main k-loop of the 4x4 tile.  The condition below is the literal 1, so
     only the first branch (.L1X, built from the KERNEL1..KERNEL16 macros
     defined outside this section) is assembled.
     %rax = (count & -8) << 4: the count rounded down to a multiple of 8
     and scaled to 16 units per k-step.  Zero means there is no full group
     of eight k-steps, so control goes to .L15.
     Each run of KERNEL1..KERNEL16 with argument 16 * i is followed by a
     compare against 128 * (i + 1); one run therefore appears to cover
     128 units, i.e. eight k-steps.  When %rax is not larger than that
     threshold the loop is left early through .L12. */
  1912. #if 1
  1913. andq $-8, %rax
  1914. salq $4, %rax
  1915. je .L15
  1916. .L1X:
  1917. KERNEL1 (16 * 0)
  1918. KERNEL2 (16 * 0)
  1919. KERNEL3 (16 * 0)
  1920. KERNEL4 (16 * 0)
  1921. KERNEL5 (16 * 0)
  1922. KERNEL6 (16 * 0)
  1923. KERNEL7 (16 * 0)
  1924. KERNEL8 (16 * 0)
  1925. KERNEL9 (16 * 0)
  1926. KERNEL10(16 * 0)
  1927. KERNEL11(16 * 0)
  1928. KERNEL12(16 * 0)
  1929. KERNEL13(16 * 0)
  1930. KERNEL14(16 * 0)
  1931. KERNEL15(16 * 0)
  1932. KERNEL16(16 * 0)
  1933. cmpq $128 * 1, %rax
  1934. NOBRANCH
  1935. jle .L12
  1936. KERNEL1 (16 * 1)
  1937. KERNEL2 (16 * 1)
  1938. KERNEL3 (16 * 1)
  1939. KERNEL4 (16 * 1)
  1940. KERNEL5 (16 * 1)
  1941. KERNEL6 (16 * 1)
  1942. KERNEL7 (16 * 1)
  1943. KERNEL8 (16 * 1)
  1944. KERNEL9 (16 * 1)
  1945. KERNEL10(16 * 1)
  1946. KERNEL11(16 * 1)
  1947. KERNEL12(16 * 1)
  1948. KERNEL13(16 * 1)
  1949. KERNEL14(16 * 1)
  1950. KERNEL15(16 * 1)
  1951. KERNEL16(16 * 1)
  1952. cmpq $128 * 2, %rax
  1953. NOBRANCH
  1954. jle .L12
  1955. KERNEL1 (16 * 2)
  1956. KERNEL2 (16 * 2)
  1957. KERNEL3 (16 * 2)
  1958. KERNEL4 (16 * 2)
  1959. KERNEL5 (16 * 2)
  1960. KERNEL6 (16 * 2)
  1961. KERNEL7 (16 * 2)
  1962. KERNEL8 (16 * 2)
  1963. KERNEL9 (16 * 2)
  1964. KERNEL10(16 * 2)
  1965. KERNEL11(16 * 2)
  1966. KERNEL12(16 * 2)
  1967. KERNEL13(16 * 2)
  1968. KERNEL14(16 * 2)
  1969. KERNEL15(16 * 2)
  1970. KERNEL16(16 * 2)
  1971. cmpq $128 * 3, %rax
  1972. NOBRANCH
  1973. jle .L12
  1974. KERNEL1 (16 * 3)
  1975. KERNEL2 (16 * 3)
  1976. KERNEL3 (16 * 3)
  1977. KERNEL4 (16 * 3)
  1978. KERNEL5 (16 * 3)
  1979. KERNEL6 (16 * 3)
  1980. KERNEL7 (16 * 3)
  1981. KERNEL8 (16 * 3)
  1982. KERNEL9 (16 * 3)
  1983. KERNEL10(16 * 3)
  1984. KERNEL11(16 * 3)
  1985. KERNEL12(16 * 3)
  1986. KERNEL13(16 * 3)
  1987. KERNEL14(16 * 3)
  1988. KERNEL15(16 * 3)
  1989. KERNEL16(16 * 3)
  1990. cmpq $128 * 4, %rax
  1991. NOBRANCH
  1992. jle .L12
  1993. KERNEL1 (16 * 4)
  1994. KERNEL2 (16 * 4)
  1995. KERNEL3 (16 * 4)
  1996. KERNEL4 (16 * 4)
  1997. KERNEL5 (16 * 4)
  1998. KERNEL6 (16 * 4)
  1999. KERNEL7 (16 * 4)
  2000. KERNEL8 (16 * 4)
  2001. KERNEL9 (16 * 4)
  2002. KERNEL10(16 * 4)
  2003. KERNEL11(16 * 4)
  2004. KERNEL12(16 * 4)
  2005. KERNEL13(16 * 4)
  2006. KERNEL14(16 * 4)
  2007. KERNEL15(16 * 4)
  2008. KERNEL16(16 * 4)
  2009. cmpq $128 * 5, %rax
  2010. NOBRANCH
  2011. jle .L12
  2012. KERNEL1 (16 * 5)
  2013. KERNEL2 (16 * 5)
  2014. KERNEL3 (16 * 5)
  2015. KERNEL4 (16 * 5)
  2016. KERNEL5 (16 * 5)
  2017. KERNEL6 (16 * 5)
  2018. KERNEL7 (16 * 5)
  2019. KERNEL8 (16 * 5)
  2020. KERNEL9 (16 * 5)
  2021. KERNEL10(16 * 5)
  2022. KERNEL11(16 * 5)
  2023. KERNEL12(16 * 5)
  2024. KERNEL13(16 * 5)
  2025. KERNEL14(16 * 5)
  2026. KERNEL15(16 * 5)
  2027. KERNEL16(16 * 5)
  2028. cmpq $128 * 6, %rax
  2029. NOBRANCH
  2030. jle .L12
  2031. KERNEL1 (16 * 6)
  2032. KERNEL2 (16 * 6)
  2033. KERNEL3 (16 * 6)
  2034. KERNEL4 (16 * 6)
  2035. KERNEL5 (16 * 6)
  2036. KERNEL6 (16 * 6)
  2037. KERNEL7 (16 * 6)
  2038. KERNEL8 (16 * 6)
  2039. KERNEL9 (16 * 6)
  2040. KERNEL10(16 * 6)
  2041. KERNEL11(16 * 6)
  2042. KERNEL12(16 * 6)
  2043. KERNEL13(16 * 6)
  2044. KERNEL14(16 * 6)
  2045. KERNEL15(16 * 6)
  2046. KERNEL16(16 * 6)
  2047. cmpq $128 * 7, %rax
  2048. NOBRANCH
  2049. jle .L12
  2050. KERNEL1 (16 * 7)
  2051. KERNEL2 (16 * 7)
  2052. KERNEL3 (16 * 7)
  2053. KERNEL4 (16 * 7)
  2054. KERNEL5 (16 * 7)
  2055. KERNEL6 (16 * 7)
  2056. KERNEL7 (16 * 7)
  2057. KERNEL8 (16 * 7)
  2058. KERNEL9 (16 * 7)
  2059. KERNEL10(16 * 7)
  2060. KERNEL11(16 * 7)
  2061. KERNEL12(16 * 7)
  2062. KERNEL13(16 * 7)
  2063. KERNEL14(16 * 7)
  2064. KERNEL15(16 * 7)
  2065. KERNEL16(16 * 7)
  /* A complete pass (eight runs): move AO and BO by 32 * 8 elements and
     take 128 * 8 units off %rax; repeat while anything is left. */
  2066. addq $32 * 8 * SIZE, AO
  2067. addq $32 * 8 * SIZE, BO
  2068. subq $128 * 8, %rax
  2069. jg .L1X
  /* .L12: %rax is what has not yet been applied to the pointers: 0 after a
     complete pass, otherwise the amount left when a pass was exited early.
     NOTE(review): 2 * 16 bytes per k-step equals four elements only if
     SIZE is 8 -- confirm against the SIZE definition. */
  2070. .L12:
  2071. leaq (AO, %rax, 2), AO # AO += 2 * rax bytes
  2072. leaq (BO, %rax, 2), BO # BO += 2 * rax bytes
  2073. #else
  /* Alternative hand-scheduled loop (eight k-steps per pass, %rax = count
     >> 3).  It is never assembled while the condition above stays 1. */
  2074. sarq $3, %rax
  2075. je .L15
  2076. ALIGN_4
  2077. .L12:
  2078. mulpd %xmm8, %xmm9
  2079. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2080. addpd %xmm9, %xmm0
  2081. movddup 1 * SIZE(BO), %xmm9
  2082. mulpd %xmm8, %xmm9
  2083. addpd %xmm9, %xmm1
  2084. movddup 2 * SIZE(BO), %xmm9
  2085. mulpd %xmm8, %xmm9
  2086. addpd %xmm9, %xmm2
  2087. movddup 3 * SIZE(BO), %xmm9
  2088. mulpd %xmm8, %xmm9
  2089. movapd 2 * SIZE(AO), %xmm8
  2090. addpd %xmm9, %xmm3
  2091. movddup 0 * SIZE(BO), %xmm9
  2092. mulpd %xmm8, %xmm9
  2093. addpd %xmm9, %xmm4
  2094. movddup 1 * SIZE(BO), %xmm9
  2095. mulpd %xmm8, %xmm9
  2096. addpd %xmm9, %xmm5
  2097. movddup 2 * SIZE(BO), %xmm9
  2098. mulpd %xmm8, %xmm9
  2099. addpd %xmm9, %xmm6
  2100. movddup 3 * SIZE(BO), %xmm9
  2101. mulpd %xmm8, %xmm9
  2102. movapd 4 * SIZE(AO), %xmm8
  2103. addpd %xmm9, %xmm7
  2104. movddup 4 * SIZE(BO), %xmm9
  2105. mulpd %xmm8, %xmm9
  2106. addpd %xmm9, %xmm0
  2107. movddup 5 * SIZE(BO), %xmm9
  2108. mulpd %xmm8, %xmm9
  2109. addpd %xmm9, %xmm1
  2110. movddup 6 * SIZE(BO), %xmm9
  2111. mulpd %xmm8, %xmm9
  2112. addpd %xmm9, %xmm2
  2113. movddup 7 * SIZE(BO), %xmm9
  2114. mulpd %xmm8, %xmm9
  2115. movapd 6 * SIZE(AO), %xmm8
  2116. addpd %xmm9, %xmm3
  2117. movddup 4 * SIZE(BO), %xmm9
  2118. mulpd %xmm8, %xmm9
  2119. addpd %xmm9, %xmm4
  2120. movddup 5 * SIZE(BO), %xmm9
  2121. mulpd %xmm8, %xmm9
  2122. addpd %xmm9, %xmm5
  2123. movddup 6 * SIZE(BO), %xmm9
  2124. mulpd %xmm8, %xmm9
  2125. addpd %xmm9, %xmm6
  2126. movddup 7 * SIZE(BO), %xmm9
  2127. mulpd %xmm8, %xmm9
  2128. movapd 32 * SIZE(AO), %xmm8
  2129. addpd %xmm9, %xmm7
  2130. movddup 32 * SIZE(BO), %xmm9
  2131. mulpd %xmm10, %xmm11
  2132. addpd %xmm11, %xmm0
  2133. movddup 9 * SIZE(BO), %xmm11
  2134. mulpd %xmm10, %xmm11
  2135. addpd %xmm11, %xmm1
  2136. movddup 10 * SIZE(BO), %xmm11
  2137. mulpd %xmm10, %xmm11
  2138. addpd %xmm11, %xmm2
  2139. movddup 11 * SIZE(BO), %xmm11
  2140. mulpd %xmm10, %xmm11
  2141. movapd 10 * SIZE(AO), %xmm10
  2142. addpd %xmm11, %xmm3
  2143. movddup 8 * SIZE(BO), %xmm11
  2144. mulpd %xmm10, %xmm11
  2145. addpd %xmm11, %xmm4
  2146. movddup 9 * SIZE(BO), %xmm11
  2147. mulpd %xmm10, %xmm11
  2148. addpd %xmm11, %xmm5
  2149. movddup 10 * SIZE(BO), %xmm11
  2150. mulpd %xmm10, %xmm11
  2151. addpd %xmm11, %xmm6
  2152. movddup 11 * SIZE(BO), %xmm11
  2153. mulpd %xmm10, %xmm11
  2154. movapd 12 * SIZE(AO), %xmm10
  2155. addpd %xmm11, %xmm7
  2156. movddup 12 * SIZE(BO), %xmm11
  2157. mulpd %xmm10, %xmm11
  2158. addpd %xmm11, %xmm0
  2159. movddup 13 * SIZE(BO), %xmm11
  2160. mulpd %xmm10, %xmm11
  2161. addpd %xmm11, %xmm1
  2162. movddup 14 * SIZE(BO), %xmm11
  2163. mulpd %xmm10, %xmm11
  2164. addpd %xmm11, %xmm2
  2165. movddup 15 * SIZE(BO), %xmm11
  2166. mulpd %xmm10, %xmm11
  2167. movapd 14 * SIZE(AO), %xmm10
  2168. addpd %xmm11, %xmm3
  2169. movddup 12 * SIZE(BO), %xmm11
  2170. mulpd %xmm10, %xmm11
  2171. addpd %xmm11, %xmm4
  2172. movddup 13 * SIZE(BO), %xmm11
  2173. mulpd %xmm10, %xmm11
  2174. addpd %xmm11, %xmm5
  2175. movddup 14 * SIZE(BO), %xmm11
  2176. mulpd %xmm10, %xmm11
  2177. addpd %xmm11, %xmm6
  2178. movddup 15 * SIZE(BO), %xmm11
  2179. mulpd %xmm10, %xmm11
  2180. movapd 40 * SIZE(AO), %xmm10
  2181. addpd %xmm11, %xmm7
  2182. movddup 40 * SIZE(BO), %xmm11
  2183. mulpd %xmm12, %xmm13
  2184. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  2185. addpd %xmm13, %xmm0
  2186. movddup 17 * SIZE(BO), %xmm13
  2187. mulpd %xmm12, %xmm13
  2188. addpd %xmm13, %xmm1
  2189. movddup 18 * SIZE(BO), %xmm13
  2190. mulpd %xmm12, %xmm13
  2191. addpd %xmm13, %xmm2
  2192. movddup 19 * SIZE(BO), %xmm13
  2193. mulpd %xmm12, %xmm13
  2194. movapd 18 * SIZE(AO), %xmm12
  2195. addpd %xmm13, %xmm3
  2196. movddup 16 * SIZE(BO), %xmm13
  2197. mulpd %xmm12, %xmm13
  2198. addpd %xmm13, %xmm4
  2199. movddup 17 * SIZE(BO), %xmm13
  2200. mulpd %xmm12, %xmm13
  2201. addpd %xmm13, %xmm5
  2202. movddup 18 * SIZE(BO), %xmm13
  2203. mulpd %xmm12, %xmm13
  2204. addpd %xmm13, %xmm6
  2205. movddup 19 * SIZE(BO), %xmm13
  2206. mulpd %xmm12, %xmm13
  2207. movapd 20 * SIZE(AO), %xmm12
  2208. addpd %xmm13, %xmm7
  2209. movddup 20 * SIZE(BO), %xmm13
  2210. mulpd %xmm12, %xmm13
  2211. addpd %xmm13, %xmm0
  2212. movddup 21 * SIZE(BO), %xmm13
  2213. mulpd %xmm12, %xmm13
  2214. addpd %xmm13, %xmm1
  2215. movddup 22 * SIZE(BO), %xmm13
  2216. mulpd %xmm12, %xmm13
  2217. addpd %xmm13, %xmm2
  2218. movddup 23 * SIZE(BO), %xmm13
  2219. mulpd %xmm12, %xmm13
  2220. movapd 22 * SIZE(AO), %xmm12
  2221. addpd %xmm13, %xmm3
  2222. movddup 20 * SIZE(BO), %xmm13
  2223. mulpd %xmm12, %xmm13
  2224. addpd %xmm13, %xmm4
  2225. movddup 21 * SIZE(BO), %xmm13
  2226. mulpd %xmm12, %xmm13
  2227. addpd %xmm13, %xmm5
  2228. movddup 22 * SIZE(BO), %xmm13
  2229. mulpd %xmm12, %xmm13
  2230. addpd %xmm13, %xmm6
  2231. movddup 23 * SIZE(BO), %xmm13
  2232. mulpd %xmm12, %xmm13
  2233. movapd 48 * SIZE(AO), %xmm12
  2234. addpd %xmm13, %xmm7
  2235. movddup 48 * SIZE(BO), %xmm13
  2236. mulpd %xmm14, %xmm15
  2237. addpd %xmm15, %xmm0
  2238. movddup 25 * SIZE(BO), %xmm15
  2239. mulpd %xmm14, %xmm15
  2240. addpd %xmm15, %xmm1
  2241. movddup 26 * SIZE(BO), %xmm15
  2242. mulpd %xmm14, %xmm15
  2243. addpd %xmm15, %xmm2
  2244. movddup 27 * SIZE(BO), %xmm15
  2245. mulpd %xmm14, %xmm15
  2246. movapd 26 * SIZE(AO), %xmm14
  2247. addpd %xmm15, %xmm3
  2248. movddup 24 * SIZE(BO), %xmm15
  2249. mulpd %xmm14, %xmm15
  2250. addpd %xmm15, %xmm4
  2251. movddup 25 * SIZE(BO), %xmm15
  2252. mulpd %xmm14, %xmm15
  2253. addpd %xmm15, %xmm5
  2254. movddup 26 * SIZE(BO), %xmm15
  2255. mulpd %xmm14, %xmm15
  2256. addpd %xmm15, %xmm6
  2257. movddup 27 * SIZE(BO), %xmm15
  2258. mulpd %xmm14, %xmm15
  2259. movapd 28 * SIZE(AO), %xmm14
  2260. addpd %xmm15, %xmm7
  2261. movddup 28 * SIZE(BO), %xmm15
  2262. mulpd %xmm14, %xmm15
  2263. addpd %xmm15, %xmm0
  2264. movddup 29 * SIZE(BO), %xmm15
  2265. mulpd %xmm14, %xmm15
  2266. addpd %xmm15, %xmm1
  2267. movddup 30 * SIZE(BO), %xmm15
  2268. mulpd %xmm14, %xmm15
  2269. addpd %xmm15, %xmm2
  2270. movddup 31 * SIZE(BO), %xmm15
  2271. mulpd %xmm14, %xmm15
  2272. movapd 30 * SIZE(AO), %xmm14
  2273. addpd %xmm15, %xmm3
  2274. movddup 28 * SIZE(BO), %xmm15
  2275. mulpd %xmm14, %xmm15
  2276. addpd %xmm15, %xmm4
  2277. movddup 29 * SIZE(BO), %xmm15
  2278. mulpd %xmm14, %xmm15
  2279. addpd %xmm15, %xmm5
  2280. movddup 30 * SIZE(BO), %xmm15
  2281. mulpd %xmm14, %xmm15
  2282. addpd %xmm15, %xmm6
  2283. movddup 31 * SIZE(BO), %xmm15
  2284. mulpd %xmm14, %xmm15
  2285. movapd 56 * SIZE(AO), %xmm14
  2286. addpd %xmm15, %xmm7
  2287. movddup 56 * SIZE(BO), %xmm15
  2288. addq $32 * SIZE, BO
  2289. addq $32 * SIZE, AO
  2290. decq %rax
  2291. BRANCH
  2292. jne .L12
  2293. #endif
  2294. ALIGN_4
  /* .L15: k-steps of the 4x4 tile left over after the grouped loop.
     %rax is reloaded with the trip count (KK for LT/RN, K - KK otherwise)
     and reduced to its low three bits; zero means nothing is left. */
  2295. .L15:
  2296. #if defined(LT) || defined(RN)
  2297. movq KK, %rax
  2298. #else
  2299. movq K, %rax
  2300. subq KK, %rax
  2301. #endif
  2302. andq $7, %rax # rax = count & 7 (leftover k-steps)
  2303. BRANCH
  2304. je .L19
  2305. ALIGN_4
  /* .L16: one k-step per pass.
     xmm8 holds A elements 0..1 (preloaded), xmm10 is loaded with A elements
     2..3; the four B elements are broadcast one at a time.
     xmm0..xmm3 += xmm8 * B[0..3], xmm4..xmm7 += xmm10 * B[0..3].
     xmm8 and xmm9 are reloaded from the next k-step before the pointers
     move. */
  2306. .L16:
  2307. mulpd %xmm8, %xmm9
  2308. movapd 2 * SIZE(AO), %xmm10
  2309. addpd %xmm9, %xmm0
  2310. movddup 1 * SIZE(BO), %xmm9
  2311. mulpd %xmm8, %xmm9
  2312. movddup 0 * SIZE(BO), %xmm11
  2313. addpd %xmm9, %xmm1
  2314. movddup 2 * SIZE(BO), %xmm9
  2315. mulpd %xmm8, %xmm9
  2316. addpd %xmm9, %xmm2
  2317. movddup 3 * SIZE(BO), %xmm9
  2318. mulpd %xmm8, %xmm9
  2319. movapd 4 * SIZE(AO), %xmm8
  2320. addpd %xmm9, %xmm3
  2321. movddup 4 * SIZE(BO), %xmm9
  2322. mulpd %xmm10, %xmm11
  2323. addpd %xmm11, %xmm4
  2324. movddup 1 * SIZE(BO), %xmm11
  2325. mulpd %xmm10, %xmm11
  2326. addpd %xmm11, %xmm5
  2327. movddup 2 * SIZE(BO), %xmm11
  2328. mulpd %xmm10, %xmm11
  2329. addpd %xmm11, %xmm6
  2330. movddup 3 * SIZE(BO), %xmm11
  2331. mulpd %xmm10, %xmm11
  2332. addpd %xmm11, %xmm7
  2333. addq $4 * SIZE, AO # AO += 4 elements
  2334. addq $4 * SIZE, BO # BO += 4 elements
  2335. decq %rax
  2336. jg .L16
  2337. ALIGN_4
  /* .L19: solve and store step of the 4x4 tile.
     On entry xmm0..xmm7 hold the accumulated products.  The code subtracts
     them from the 16 stored values, runs the substitution for the active
     variant (LN / LT / RN / RT), stores the 4x4 result through CO1, CO2,
     CO1 + 2 * LDC and CO2 + 2 * LDC, writes the solved values back into the
     packed buffer, updates the pointers and KK, and loops back to .L11
     while I > 0.
     NOTE(review): in every variant the entries used first for each
     row/column (indices 0, 5, 10, 15) are multiplied, not divided, so they
     are presumably stored as reciprocals -- confirm against the packing
     code. */
  2338. .L19:
  2339. #if defined(LN) || defined(RT)
  /* re-derive AO/BO at k index KK - 4 (four elements each per k index) */
  2340. movq KK, %rax
  2341. subq $4, %rax
  2342. leaq (, %rax, SIZE), %rax
  2343. movq AORIG, AO
  2344. leaq (AO, %rax, 4), AO
  2345. leaq (B, %rax, 4), BO
  2346. #endif
  2347. #if defined(LN) || defined(LT)
  /* LN/LT: interleave accumulator pairs with unpcklpd/unpckhpd so that they
     line up with the 16 values stored at BO, then form
     (stored value - accumulated product) in xmm1, xmm3, xmm5, xmm7, xmm9,
     xmm11, xmm13 and xmm15. */
  2348. movapd %xmm0, %xmm8
  2349. unpcklpd %xmm1, %xmm0
  2350. unpckhpd %xmm1, %xmm8
  2351. movapd %xmm2, %xmm10
  2352. unpcklpd %xmm3, %xmm2
  2353. unpckhpd %xmm3, %xmm10
  2354. movapd %xmm4, %xmm12
  2355. unpcklpd %xmm5, %xmm4
  2356. unpckhpd %xmm5, %xmm12
  2357. movapd %xmm6, %xmm14
  2358. unpcklpd %xmm7, %xmm6
  2359. unpckhpd %xmm7, %xmm14
  2360. movapd 0 * SIZE(BO), %xmm1
  2361. movapd 2 * SIZE(BO), %xmm3
  2362. movapd 4 * SIZE(BO), %xmm5
  2363. movapd 6 * SIZE(BO), %xmm7
  2364. movapd 8 * SIZE(BO), %xmm9
  2365. movapd 10 * SIZE(BO), %xmm11
  2366. movapd 12 * SIZE(BO), %xmm13
  2367. movapd 14 * SIZE(BO), %xmm15
  2368. subpd %xmm0, %xmm1
  2369. subpd %xmm2, %xmm3
  2370. subpd %xmm8, %xmm5
  2371. subpd %xmm10, %xmm7
  2372. subpd %xmm4, %xmm9
  2373. subpd %xmm6, %xmm11
  2374. subpd %xmm12, %xmm13
  2375. subpd %xmm14, %xmm15
  2376. #else
  /* RN/RT: the 16 stored values are read from AO into xmm8..xmm15 and the
     accumulators are subtracted without any interleave. */
  2377. movapd 0 * SIZE(AO), %xmm8
  2378. movapd 2 * SIZE(AO), %xmm9
  2379. movapd 4 * SIZE(AO), %xmm10
  2380. movapd 6 * SIZE(AO), %xmm11
  2381. movapd 8 * SIZE(AO), %xmm12
  2382. movapd 10 * SIZE(AO), %xmm13
  2383. movapd 12 * SIZE(AO), %xmm14
  2384. movapd 14 * SIZE(AO), %xmm15
  2385. subpd %xmm0, %xmm8
  2386. subpd %xmm4, %xmm9
  2387. subpd %xmm1, %xmm10
  2388. subpd %xmm5, %xmm11
  2389. subpd %xmm2, %xmm12
  2390. subpd %xmm6, %xmm13
  2391. subpd %xmm3, %xmm14
  2392. subpd %xmm7, %xmm15
  2393. #endif
  2394. #ifdef LN
  /* LN: substitution from the last register pair to the first.
     xmm13/xmm15 are scaled by AO[15], then eliminated from xmm9/xmm11,
     xmm5/xmm7 and xmm1/xmm3 with AO[14], AO[13], AO[12]; the same pattern
     repeats with AO[10] (AO[9], AO[8]), AO[5] (AO[4]) and AO[0]. */
  2395. movddup 15 * SIZE(AO), %xmm0
  2396. mulpd %xmm0, %xmm13
  2397. mulpd %xmm0, %xmm15
  2398. movddup 14 * SIZE(AO), %xmm2
  2399. mulpd %xmm13, %xmm2
  2400. subpd %xmm2, %xmm9
  2401. movddup 14 * SIZE(AO), %xmm2
  2402. mulpd %xmm15, %xmm2
  2403. subpd %xmm2, %xmm11
  2404. movddup 13 * SIZE(AO), %xmm4
  2405. mulpd %xmm13, %xmm4
  2406. subpd %xmm4, %xmm5
  2407. movddup 13 * SIZE(AO), %xmm4
  2408. mulpd %xmm15, %xmm4
  2409. subpd %xmm4, %xmm7
  2410. movddup 12 * SIZE(AO), %xmm6
  2411. mulpd %xmm13, %xmm6
  2412. subpd %xmm6, %xmm1
  2413. movddup 12 * SIZE(AO), %xmm6
  2414. mulpd %xmm15, %xmm6
  2415. subpd %xmm6, %xmm3
  2416. movddup 10 * SIZE(AO), %xmm0
  2417. mulpd %xmm0, %xmm9
  2418. mulpd %xmm0, %xmm11
  2419. movddup 9 * SIZE(AO), %xmm2
  2420. mulpd %xmm9, %xmm2
  2421. subpd %xmm2, %xmm5
  2422. movddup 9 * SIZE(AO), %xmm2
  2423. mulpd %xmm11, %xmm2
  2424. subpd %xmm2, %xmm7
  2425. movddup 8 * SIZE(AO), %xmm4
  2426. mulpd %xmm9, %xmm4
  2427. subpd %xmm4, %xmm1
  2428. movddup 8 * SIZE(AO), %xmm4
  2429. mulpd %xmm11, %xmm4
  2430. subpd %xmm4, %xmm3
  2431. movddup 5 * SIZE(AO), %xmm0
  2432. mulpd %xmm0, %xmm5
  2433. mulpd %xmm0, %xmm7
  2434. movddup 4 * SIZE(AO), %xmm2
  2435. mulpd %xmm5, %xmm2
  2436. subpd %xmm2, %xmm1
  2437. movddup 4 * SIZE(AO), %xmm2
  2438. mulpd %xmm7, %xmm2
  2439. subpd %xmm2, %xmm3
  2440. movddup 0 * SIZE(AO), %xmm0
  2441. mulpd %xmm0, %xmm1
  2442. mulpd %xmm0, %xmm3
  2443. #endif
  2444. #ifdef LT
  /* LT: substitution from the first register pair to the last.
     xmm1/xmm3 are scaled by AO[0], then eliminated from xmm5/xmm7,
     xmm9/xmm11 and xmm13/xmm15 with AO[1], AO[2], AO[3]; the same pattern
     repeats with AO[5] (AO[6], AO[7]), AO[10] (AO[11]) and AO[15]. */
  2445. movddup 0 * SIZE(AO), %xmm0
  2446. mulpd %xmm0, %xmm1
  2447. mulpd %xmm0, %xmm3
  2448. movddup 1 * SIZE(AO), %xmm2
  2449. mulpd %xmm1, %xmm2
  2450. subpd %xmm2, %xmm5
  2451. movddup 1 * SIZE(AO), %xmm2
  2452. mulpd %xmm3, %xmm2
  2453. subpd %xmm2, %xmm7
  2454. movddup 2 * SIZE(AO), %xmm4
  2455. mulpd %xmm1, %xmm4
  2456. subpd %xmm4, %xmm9
  2457. movddup 2 * SIZE(AO), %xmm4
  2458. mulpd %xmm3, %xmm4
  2459. subpd %xmm4, %xmm11
  2460. movddup 3 * SIZE(AO), %xmm6
  2461. mulpd %xmm1, %xmm6
  2462. subpd %xmm6, %xmm13
  2463. movddup 3 * SIZE(AO), %xmm6
  2464. mulpd %xmm3, %xmm6
  2465. subpd %xmm6, %xmm15
  2466. movddup 5 * SIZE(AO), %xmm0
  2467. mulpd %xmm0, %xmm5
  2468. mulpd %xmm0, %xmm7
  2469. movddup 6 * SIZE(AO), %xmm2
  2470. mulpd %xmm5, %xmm2
  2471. subpd %xmm2, %xmm9
  2472. movddup 6 * SIZE(AO), %xmm2
  2473. mulpd %xmm7, %xmm2
  2474. subpd %xmm2, %xmm11
  2475. movddup 7 * SIZE(AO), %xmm4
  2476. mulpd %xmm5, %xmm4
  2477. subpd %xmm4, %xmm13
  2478. movddup 7 * SIZE(AO), %xmm4
  2479. mulpd %xmm7, %xmm4
  2480. subpd %xmm4, %xmm15
  2481. movddup 10 * SIZE(AO), %xmm0
  2482. mulpd %xmm0, %xmm9
  2483. mulpd %xmm0, %xmm11
  2484. movddup 11 * SIZE(AO), %xmm2
  2485. mulpd %xmm9, %xmm2
  2486. subpd %xmm2, %xmm13
  2487. movddup 11 * SIZE(AO), %xmm2
  2488. mulpd %xmm11, %xmm2
  2489. subpd %xmm2, %xmm15
  2490. movddup 15 * SIZE(AO), %xmm0
  2491. mulpd %xmm0, %xmm13
  2492. mulpd %xmm0, %xmm15
  2493. #endif
  2494. #ifdef RN
  /* RN: substitution from xmm8/xmm9 to xmm14/xmm15 with coefficients from
     BO: BO[0] (BO[1], BO[2], BO[3]), BO[5] (BO[6], BO[7]),
     BO[10] (BO[11]) and BO[15]. */
  2495. movddup 0 * SIZE(BO), %xmm0
  2496. mulpd %xmm0, %xmm8
  2497. mulpd %xmm0, %xmm9
  2498. movddup 1 * SIZE(BO), %xmm1
  2499. mulpd %xmm8, %xmm1
  2500. subpd %xmm1, %xmm10
  2501. movddup 1 * SIZE(BO), %xmm1
  2502. mulpd %xmm9, %xmm1
  2503. subpd %xmm1, %xmm11
  2504. movddup 2 * SIZE(BO), %xmm2
  2505. mulpd %xmm8, %xmm2
  2506. subpd %xmm2, %xmm12
  2507. movddup 2 * SIZE(BO), %xmm2
  2508. mulpd %xmm9, %xmm2
  2509. subpd %xmm2, %xmm13
  2510. movddup 3 * SIZE(BO), %xmm3
  2511. mulpd %xmm8, %xmm3
  2512. subpd %xmm3, %xmm14
  2513. movddup 3 * SIZE(BO), %xmm3
  2514. mulpd %xmm9, %xmm3
  2515. subpd %xmm3, %xmm15
  2516. movddup 5 * SIZE(BO), %xmm0
  2517. mulpd %xmm0, %xmm10
  2518. mulpd %xmm0, %xmm11
  2519. movddup 6 * SIZE(BO), %xmm1
  2520. mulpd %xmm10, %xmm1
  2521. subpd %xmm1, %xmm12
  2522. movddup 6 * SIZE(BO), %xmm1
  2523. mulpd %xmm11, %xmm1
  2524. subpd %xmm1, %xmm13
  2525. movddup 7 * SIZE(BO), %xmm2
  2526. mulpd %xmm10, %xmm2
  2527. subpd %xmm2, %xmm14
  2528. movddup 7 * SIZE(BO), %xmm2
  2529. mulpd %xmm11, %xmm2
  2530. subpd %xmm2, %xmm15
  2531. movddup 10 * SIZE(BO), %xmm0
  2532. mulpd %xmm0, %xmm12
  2533. mulpd %xmm0, %xmm13
  2534. movddup 11 * SIZE(BO), %xmm1
  2535. mulpd %xmm12, %xmm1
  2536. subpd %xmm1, %xmm14
  2537. movddup 11 * SIZE(BO), %xmm1
  2538. mulpd %xmm13, %xmm1
  2539. subpd %xmm1, %xmm15
  2540. movddup 15 * SIZE(BO), %xmm0
  2541. mulpd %xmm0, %xmm14
  2542. mulpd %xmm0, %xmm15
  2543. #endif
  2544. #ifdef RT
  /* RT: substitution from xmm14/xmm15 back to xmm8/xmm9 with coefficients
     from BO: BO[15] (BO[14], BO[13], BO[12]), BO[10] (BO[9], BO[8]),
     BO[5] (BO[4]) and BO[0]. */
  2545. movddup 15 * SIZE(BO), %xmm0
  2546. mulpd %xmm0, %xmm14
  2547. mulpd %xmm0, %xmm15
  2548. movddup 14 * SIZE(BO), %xmm1
  2549. mulpd %xmm14, %xmm1
  2550. subpd %xmm1, %xmm12
  2551. movddup 14 * SIZE(BO), %xmm1
  2552. mulpd %xmm15, %xmm1
  2553. subpd %xmm1, %xmm13
  2554. movddup 13 * SIZE(BO), %xmm2
  2555. mulpd %xmm14, %xmm2
  2556. subpd %xmm2, %xmm10
  2557. movddup 13 * SIZE(BO), %xmm2
  2558. mulpd %xmm15, %xmm2
  2559. subpd %xmm2, %xmm11
  2560. movddup 12 * SIZE(BO), %xmm3
  2561. mulpd %xmm14, %xmm3
  2562. subpd %xmm3, %xmm8
  2563. movddup 12 * SIZE(BO), %xmm3
  2564. mulpd %xmm15, %xmm3
  2565. subpd %xmm3, %xmm9
  2566. movddup 10 * SIZE(BO), %xmm0
  2567. mulpd %xmm0, %xmm12
  2568. mulpd %xmm0, %xmm13
  2569. movddup 9 * SIZE(BO), %xmm1
  2570. mulpd %xmm12, %xmm1
  2571. subpd %xmm1, %xmm10
  2572. movddup 9 * SIZE(BO), %xmm1
  2573. mulpd %xmm13, %xmm1
  2574. subpd %xmm1, %xmm11
  2575. movddup 8 * SIZE(BO), %xmm2
  2576. mulpd %xmm12, %xmm2
  2577. subpd %xmm2, %xmm8
  2578. movddup 8 * SIZE(BO), %xmm2
  2579. mulpd %xmm13, %xmm2
  2580. subpd %xmm2, %xmm9
  2581. movddup 5 * SIZE(BO), %xmm0
  2582. mulpd %xmm0, %xmm10
  2583. mulpd %xmm0, %xmm11
  2584. movddup 4 * SIZE(BO), %xmm1
  2585. mulpd %xmm10, %xmm1
  2586. subpd %xmm1, %xmm8
  2587. movddup 4 * SIZE(BO), %xmm1
  2588. mulpd %xmm11, %xmm1
  2589. subpd %xmm1, %xmm9
  2590. movddup 0 * SIZE(BO), %xmm0
  2591. mulpd %xmm0, %xmm8
  2592. mulpd %xmm0, %xmm9
  2593. #endif
  2594. #ifdef LN
  /* LN steps CO1/CO2 back by four elements before storing */
  2595. subq $4 * SIZE, CO1
  2596. subq $4 * SIZE, CO2
  2597. #endif
  2598. #if defined(LN) || defined(LT)
  /* LN/LT store layout: the low lanes of xmm1, xmm5, xmm9, xmm13 go to
     CO1[0..3] and the high lanes to CO2[0..3]; xmm3, xmm7, xmm11, xmm15 go
     the same way to CO1 + 2 * LDC and CO2 + 2 * LDC. */
  2599. movsd %xmm1, 0 * SIZE(CO1)
  2600. movsd %xmm5, 1 * SIZE(CO1)
  2601. movsd %xmm9, 2 * SIZE(CO1)
  2602. movsd %xmm13, 3 * SIZE(CO1)
  2603. movhpd %xmm1, 0 * SIZE(CO2)
  2604. movhpd %xmm5, 1 * SIZE(CO2)
  2605. movhpd %xmm9, 2 * SIZE(CO2)
  2606. movhpd %xmm13, 3 * SIZE(CO2)
  2607. movsd %xmm3, 0 * SIZE(CO1, LDC, 2)
  2608. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  2609. movsd %xmm11, 2 * SIZE(CO1, LDC, 2)
  2610. movsd %xmm15, 3 * SIZE(CO1, LDC, 2)
  2611. movhpd %xmm3, 0 * SIZE(CO2, LDC, 2)
  2612. movhpd %xmm7, 1 * SIZE(CO2, LDC, 2)
  2613. movhpd %xmm11, 2 * SIZE(CO2, LDC, 2)
  2614. movhpd %xmm15, 3 * SIZE(CO2, LDC, 2)
  2615. #else
  /* RN/RT store layout: each register pair holds four consecutive elements
     of one destination: xmm8/xmm9 -> CO1, xmm10/xmm11 -> CO2,
     xmm12/xmm13 -> CO1 + 2 * LDC, xmm14/xmm15 -> CO2 + 2 * LDC. */
  2616. movsd %xmm8, 0 * SIZE(CO1)
  2617. movhpd %xmm8, 1 * SIZE(CO1)
  2618. movsd %xmm9, 2 * SIZE(CO1)
  2619. movhpd %xmm9, 3 * SIZE(CO1)
  2620. movsd %xmm10, 0 * SIZE(CO2)
  2621. movhpd %xmm10, 1 * SIZE(CO2)
  2622. movsd %xmm11, 2 * SIZE(CO2)
  2623. movhpd %xmm11, 3 * SIZE(CO2)
  2624. movsd %xmm12, 0 * SIZE(CO1, LDC, 2)
  2625. movhpd %xmm12, 1 * SIZE(CO1, LDC, 2)
  2626. movsd %xmm13, 2 * SIZE(CO1, LDC, 2)
  2627. movhpd %xmm13, 3 * SIZE(CO1, LDC, 2)
  2628. movsd %xmm14, 0 * SIZE(CO2, LDC, 2)
  2629. movhpd %xmm14, 1 * SIZE(CO2, LDC, 2)
  2630. movsd %xmm15, 2 * SIZE(CO2, LDC, 2)
  2631. movhpd %xmm15, 3 * SIZE(CO2, LDC, 2)
  2632. #endif
  /* Overwrite the 16 stored values with the solved ones (BO for LN/LT,
     AO for RN/RT). */
  2633. #if defined(LN) || defined(LT)
  2634. movapd %xmm1, 0 * SIZE(BO)
  2635. movapd %xmm3, 2 * SIZE(BO)
  2636. movapd %xmm5, 4 * SIZE(BO)
  2637. movapd %xmm7, 6 * SIZE(BO)
  2638. movapd %xmm9, 8 * SIZE(BO)
  2639. movapd %xmm11, 10 * SIZE(BO)
  2640. movapd %xmm13, 12 * SIZE(BO)
  2641. movapd %xmm15, 14 * SIZE(BO)
  2642. #else
  2643. movapd %xmm8, 0 * SIZE(AO)
  2644. movapd %xmm9, 2 * SIZE(AO)
  2645. movapd %xmm10, 4 * SIZE(AO)
  2646. movapd %xmm11, 6 * SIZE(AO)
  2647. movapd %xmm12, 8 * SIZE(AO)
  2648. movapd %xmm13, 10 * SIZE(AO)
  2649. movapd %xmm14, 12 * SIZE(AO)
  2650. movapd %xmm15, 14 * SIZE(AO)
  2651. #endif
  2652. #ifndef LN
  /* every variant except LN steps CO1/CO2 forward by four elements */
  2653. addq $4 * SIZE, CO1
  2654. addq $4 * SIZE, CO2
  2655. #endif
  2656. #if defined(LT) || defined(RN)
  /* skip the remaining K - KK k indices (four elements each in A and B) */
  2657. movq K, %rax
  2658. subq KK, %rax
  2659. leaq (,%rax, SIZE), %rax
  2660. leaq (AO, %rax, 4), AO
  2661. leaq (BO, %rax, 4), BO
  2662. #endif
  2663. #ifdef LN
  2664. subq $4, KK
  2665. #endif
  2666. #ifdef LT
  2667. addq $4, KK
  2668. #endif
  2669. #ifdef RT
  /* AORIG += K << (2 + BASE_SHIFT) */
  2670. movq K, %rax
  2671. salq $2 + BASE_SHIFT, %rax
  2672. addq %rax, AORIG
  2673. #endif
  /* next 4-row tile while I > 0 */
  2674. decq I # i --
  2675. jg .L11
  2676. ALIGN_4
  /* .L20: the 2-row tile below runs only when bit 1 of M is set;
     otherwise control goes to .L30 (defined outside this section). */
  2677. .L20:
  2678. testq $2, M
  2679. BRANCH
  2680. je .L30
  2681. ALIGN_4
  /* .L21: setup for the 2x4 tile.
     Positions AO/BO, preloads the first operands, clears the accumulators
     xmm0..xmm3 and computes the number of 8-step passes in %rax. */
  2682. .L21:
  2683. #ifdef LN
  /* LN: AORIG -= K << (1 + BASE_SHIFT) */
  2684. movq K, %rax
  2685. salq $1 + BASE_SHIFT, %rax
  2686. subq %rax, AORIG
  2687. #endif
  2688. #if defined(LN) || defined(RT)
  /* AO = AORIG + 2 * KK elements, BO = B + 4 * KK elements */
  2689. movq KK, %rax
  2690. leaq (, %rax, SIZE), %rax
  2691. movq AORIG, AO
  2692. leaq (AO, %rax, 2), AO
  2693. leaq (B, %rax, 4), BO
  2694. #else
  2695. movq B, BO
  2696. #endif
  /* preload A pairs and broadcast B elements at element offsets 0 and 8 */
  2697. movapd 0 * SIZE(AO), %xmm8
  2698. pxor %xmm0, %xmm0
  2699. movddup 0 * SIZE(BO), %xmm9
  2700. pxor %xmm1, %xmm1
  2701. movapd 8 * SIZE(AO), %xmm10
  2702. pxor %xmm2, %xmm2
  2703. movddup 8 * SIZE(BO), %xmm11
  2704. pxor %xmm3, %xmm3
  /* trip count: KK for LT/RN, K - KK for LN/RT */
  2705. #if defined(LT) || defined(RN)
  2706. movq KK, %rax
  2707. #else
  2708. movq K, %rax
  2709. subq KK, %rax
  2710. #endif
  /* %rax = count >> 3; zero means no full pass, go to the remainder */
  2711. sarq $3, %rax
  2712. je .L25
  2713. ALIGN_4
  /* .L22: main k-loop of the 2x4 tile, eight k-steps per pass.
     Each k-step multiplies one pair of A elements (movapd) by four
     broadcast B elements (movddup) and adds the products to xmm0..xmm3.
     The A pair sits in xmm8 for k-steps 0..3 and in xmm10 for k-steps 4..7;
     the B broadcast register alternates between xmm9 and xmm11 every two
     k-steps.  Operands for the next pass are reloaded before AO/BO
     advance.  %rax holds the number of passes still to run. */
  2714. .L22:
  /* k-steps 0..1: A in xmm8, B in xmm9 */
  2715. mulpd %xmm8, %xmm9
  2716. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2717. addpd %xmm9, %xmm0
  2718. movddup 1 * SIZE(BO), %xmm9
  2719. mulpd %xmm8, %xmm9
  2720. addpd %xmm9, %xmm1
  2721. movddup 2 * SIZE(BO), %xmm9
  2722. mulpd %xmm8, %xmm9
  2723. addpd %xmm9, %xmm2
  2724. movddup 3 * SIZE(BO), %xmm9
  2725. mulpd %xmm8, %xmm9
  2726. movapd 2 * SIZE(AO), %xmm8
  2727. addpd %xmm9, %xmm3
  2728. movddup 4 * SIZE(BO), %xmm9
  2729. mulpd %xmm8, %xmm9
  2730. addpd %xmm9, %xmm0
  2731. movddup 5 * SIZE(BO), %xmm9
  2732. mulpd %xmm8, %xmm9
  2733. addpd %xmm9, %xmm1
  2734. movddup 6 * SIZE(BO), %xmm9
  2735. mulpd %xmm8, %xmm9
  2736. addpd %xmm9, %xmm2
  2737. movddup 7 * SIZE(BO), %xmm9
  2738. mulpd %xmm8, %xmm9
  2739. movapd 4 * SIZE(AO), %xmm8
  2740. addpd %xmm9, %xmm3
  /* xmm9 is reloaded for k-step 4; k-steps 2..3 use A in xmm8, B in xmm11 */
  2741. movddup 16 * SIZE(BO), %xmm9
  2742. mulpd %xmm8, %xmm11
  2743. addpd %xmm11, %xmm0
  2744. movddup 9 * SIZE(BO), %xmm11
  2745. mulpd %xmm8, %xmm11
  2746. addpd %xmm11, %xmm1
  2747. movddup 10 * SIZE(BO), %xmm11
  2748. mulpd %xmm8, %xmm11
  2749. addpd %xmm11, %xmm2
  2750. movddup 11 * SIZE(BO), %xmm11
  2751. mulpd %xmm8, %xmm11
  2752. movapd 6 * SIZE(AO), %xmm8
  2753. addpd %xmm11, %xmm3
  2754. movddup 12 * SIZE(BO), %xmm11
  2755. mulpd %xmm8, %xmm11
  2756. addpd %xmm11, %xmm0
  2757. movddup 13 * SIZE(BO), %xmm11
  2758. mulpd %xmm8, %xmm11
  2759. addpd %xmm11, %xmm1
  2760. movddup 14 * SIZE(BO), %xmm11
  2761. mulpd %xmm8, %xmm11
  2762. addpd %xmm11, %xmm2
  2763. movddup 15 * SIZE(BO), %xmm11
  2764. mulpd %xmm8, %xmm11
  /* xmm8 is reloaded for k-step 0 of the next pass */
  2765. movapd 16 * SIZE(AO), %xmm8
  2766. addpd %xmm11, %xmm3
  /* xmm11 is reloaded for k-step 6; k-steps 4..5 use A in xmm10, B in xmm9 */
  2767. movddup 24 * SIZE(BO), %xmm11
  2768. mulpd %xmm10, %xmm9
  2769. addpd %xmm9, %xmm0
  2770. movddup 17 * SIZE(BO), %xmm9
  2771. mulpd %xmm10, %xmm9
  2772. addpd %xmm9, %xmm1
  2773. movddup 18 * SIZE(BO), %xmm9
  2774. mulpd %xmm10, %xmm9
  2775. addpd %xmm9, %xmm2
  2776. movddup 19 * SIZE(BO), %xmm9
  2777. mulpd %xmm10, %xmm9
  2778. movapd 10 * SIZE(AO), %xmm10
  2779. addpd %xmm9, %xmm3
  2780. movddup 20 * SIZE(BO), %xmm9
  2781. mulpd %xmm10, %xmm9
  2782. addpd %xmm9, %xmm0
  2783. movddup 21 * SIZE(BO), %xmm9
  2784. mulpd %xmm10, %xmm9
  2785. addpd %xmm9, %xmm1
  2786. movddup 22 * SIZE(BO), %xmm9
  2787. mulpd %xmm10, %xmm9
  2788. addpd %xmm9, %xmm2
  2789. movddup 23 * SIZE(BO), %xmm9
  2790. mulpd %xmm10, %xmm9
  2791. movapd 12 * SIZE(AO), %xmm10
  2792. addpd %xmm9, %xmm3
  /* xmm9 is reloaded for k-step 0 of the next pass; k-steps 6..7 use A in
     xmm10, B in xmm11 */
  2793. movddup 32 * SIZE(BO), %xmm9
  2794. mulpd %xmm10, %xmm11
  2795. addpd %xmm11, %xmm0
  2796. movddup 25 * SIZE(BO), %xmm11
  2797. mulpd %xmm10, %xmm11
  2798. addpd %xmm11, %xmm1
  2799. movddup 26 * SIZE(BO), %xmm11
  2800. mulpd %xmm10, %xmm11
  2801. addpd %xmm11, %xmm2
  2802. movddup 27 * SIZE(BO), %xmm11
  2803. mulpd %xmm10, %xmm11
  2804. movapd 14 * SIZE(AO), %xmm10
  2805. addpd %xmm11, %xmm3
  2806. movddup 28 * SIZE(BO), %xmm11
  2807. mulpd %xmm10, %xmm11
  2808. addpd %xmm11, %xmm0
  2809. movddup 29 * SIZE(BO), %xmm11
  2810. mulpd %xmm10, %xmm11
  2811. addpd %xmm11, %xmm1
  2812. movddup 30 * SIZE(BO), %xmm11
  2813. mulpd %xmm10, %xmm11
  2814. addpd %xmm11, %xmm2
  2815. movddup 31 * SIZE(BO), %xmm11
  2816. mulpd %xmm10, %xmm11
  /* xmm10 and xmm11 are reloaded for the next pass */
  2817. movapd 24 * SIZE(AO), %xmm10
  2818. addpd %xmm11, %xmm3
  2819. movddup 40 * SIZE(BO), %xmm11
  /* consume 16 elements of A and 32 elements of B per pass */
  2820. addq $16 * SIZE, AO
  2821. addq $32 * SIZE, BO
  2822. decq %rax
  2823. jne .L22
  2824. ALIGN_4
  /* .L25: k-steps of the 2x4 tile left over after the unrolled loop.
     %rax is reloaded with the trip count (KK for LT/RN, K - KK otherwise)
     and reduced to its low three bits; zero means nothing is left. */
  2825. .L25:
  2826. #if defined(LT) || defined(RN)
  2827. movq KK, %rax
  2828. #else
  2829. movq K, %rax
  2830. subq KK, %rax
  2831. #endif
  2832. andq $7, %rax # rax = count & 7 (leftover k-steps)
  2833. BRANCH
  2834. je .L29
  2835. ALIGN_4
  /* .L26: one k-step per pass: the A pair in xmm8 times the four broadcast
     B elements is added to xmm0..xmm3; xmm8 and xmm9 are then reloaded from
     the next k-step. */
  2836. .L26:
  2837. mulpd %xmm8, %xmm9
  2838. addpd %xmm9, %xmm0
  2839. movddup 1 * SIZE(BO), %xmm9
  2840. mulpd %xmm8, %xmm9
  2841. addpd %xmm9, %xmm1
  2842. movddup 2 * SIZE(BO), %xmm9
  2843. mulpd %xmm8, %xmm9
  2844. addpd %xmm9, %xmm2
  2845. movddup 3 * SIZE(BO), %xmm9
  2846. mulpd %xmm8, %xmm9
  2847. movapd 2 * SIZE(AO), %xmm8
  2848. addpd %xmm9, %xmm3
  2849. movddup 4 * SIZE(BO), %xmm9
  2850. addq $2 * SIZE, AO # AO += 2 elements
  2851. addq $4 * SIZE, BO # BO += 4 elements
  2852. decq %rax
  2853. jg .L26
  2854. ALIGN_4
  2855. .L29:
        /* Solve-and-store stage for the 2-wide tile.
           The products accumulated in xmm0..xmm3 are subtracted from values
           loaded from BO (LN/LT) or AO (RN/RT), a triangular substitution
           selected by LN / LT / RN / RT is applied, and the result is
           written to C (through CO1, CO2 and their LDC*2 offsets) and back
           into the buffer it was loaded from. Pointers and KK are then
           updated for the next tile.
           NOTE(review): every diagonal coefficient is applied with a
           multiply, so the diagonal is presumably stored pre-inverted by
           the packing code - confirm. */

        /* LN/RT: reposition AO and BO. rax = KK - 2 (LN) or KK - 4 (RT),
           scaled by SIZE; AO = AORIG + 2 * rax, BO = B + 4 * rax. */
  2856. #if defined(LN) || defined(RT)
  2857. movq KK, %rax
  2858. #ifdef LN
  2859. subq $2, %rax
  2860. #else
  2861. subq $4, %rax
  2862. #endif
  2863. leaq (, %rax, SIZE), %rax
  2864. movq AORIG, AO
  2865. leaq (AO, %rax, 2), AO
  2866. leaq (B, %rax, 4), BO
  2867. #endif
        /* LN/LT: regroup the accumulators by A lane instead of by B index:
             xmm0  = { xmm0.lo, xmm1.lo }   (A lane 0, B index 0 and 1)
             xmm8  = { xmm0.hi, xmm1.hi }   (A lane 1, B index 0 and 1)
             xmm2  = { xmm2.lo, xmm3.lo }   (A lane 0, B index 2 and 3)
             xmm10 = { xmm2.hi, xmm3.hi }   (A lane 1, B index 2 and 3)
           then xmm1, xmm3, xmm5, xmm7 = BO[0..7] minus those registers.
           RN/RT: xmm8, xmm10, xmm12, xmm14 = AO[0..7] minus xmm0..xmm3,
           one register per B index. */
  2868. #if defined(LN) || defined(LT)
  2869. movapd %xmm0, %xmm8
  2870. unpcklpd %xmm1, %xmm0
  2871. unpckhpd %xmm1, %xmm8
  2872. movapd %xmm2, %xmm10
  2873. unpcklpd %xmm3, %xmm2
  2874. unpckhpd %xmm3, %xmm10
  2875. movapd 0 * SIZE(BO), %xmm1
  2876. movapd 2 * SIZE(BO), %xmm3
  2877. movapd 4 * SIZE(BO), %xmm5
  2878. movapd 6 * SIZE(BO), %xmm7
  2879. subpd %xmm0, %xmm1
  2880. subpd %xmm2, %xmm3
  2881. subpd %xmm8, %xmm5
  2882. subpd %xmm10, %xmm7
  2883. #else
  2884. movapd 0 * SIZE(AO), %xmm8
  2885. movapd 2 * SIZE(AO), %xmm10
  2886. movapd 4 * SIZE(AO), %xmm12
  2887. movapd 6 * SIZE(AO), %xmm14
  2888. subpd %xmm0, %xmm8
  2889. subpd %xmm1, %xmm10
  2890. subpd %xmm2, %xmm12
  2891. subpd %xmm3, %xmm14
  2892. #endif
        /* LN: 2x2 substitution starting from A lane 1.
           lane1 (xmm5, xmm7) *= AO[3]; lane0 (xmm1, xmm3) -= AO[2] * lane1;
           lane0 *= AO[0]. */
  2893. #ifdef LN
  2894. movddup 3 * SIZE(AO), %xmm0
  2895. mulpd %xmm0, %xmm5
  2896. mulpd %xmm0, %xmm7
  2897. movddup 2 * SIZE(AO), %xmm2
  2898. mulpd %xmm5, %xmm2
  2899. subpd %xmm2, %xmm1
  2900. movddup 2 * SIZE(AO), %xmm2
  2901. mulpd %xmm7, %xmm2
  2902. subpd %xmm2, %xmm3
  2903. movddup 0 * SIZE(AO), %xmm0
  2904. mulpd %xmm0, %xmm1
  2905. mulpd %xmm0, %xmm3
  2906. #endif
        /* LT: 2x2 substitution starting from A lane 0.
           lane0 (xmm1, xmm3) *= AO[0]; lane1 (xmm5, xmm7) -= AO[1] * lane0;
           lane1 *= AO[3]. */
  2907. #ifdef LT
  2908. movddup 0 * SIZE(AO), %xmm0
  2909. mulpd %xmm0, %xmm1
  2910. mulpd %xmm0, %xmm3
  2911. movddup 1 * SIZE(AO), %xmm2
  2912. mulpd %xmm1, %xmm2
  2913. subpd %xmm2, %xmm5
  2914. movddup 1 * SIZE(AO), %xmm2
  2915. mulpd %xmm3, %xmm2
  2916. subpd %xmm2, %xmm7
  2917. movddup 3 * SIZE(AO), %xmm0
  2918. mulpd %xmm0, %xmm5
  2919. mulpd %xmm0, %xmm7
  2920. #endif
        /* RN: 4x4 substitution in the order xmm8, xmm10, xmm12, xmm14.
           Coefficients come from BO[4*i + j] with j >= i
           (offsets 0,1,2,3 / 5,6,7 / 10,11 / 15). */
  2921. #ifdef RN
  2922. movddup 0 * SIZE(BO), %xmm0
  2923. mulpd %xmm0, %xmm8
  2924. movddup 1 * SIZE(BO), %xmm1
  2925. mulpd %xmm8, %xmm1
  2926. subpd %xmm1, %xmm10
  2927. movddup 2 * SIZE(BO), %xmm2
  2928. mulpd %xmm8, %xmm2
  2929. subpd %xmm2, %xmm12
  2930. movddup 3 * SIZE(BO), %xmm3
  2931. mulpd %xmm8, %xmm3
  2932. subpd %xmm3, %xmm14
  2933. movddup 5 * SIZE(BO), %xmm0
  2934. mulpd %xmm0, %xmm10
  2935. movddup 6 * SIZE(BO), %xmm1
  2936. mulpd %xmm10, %xmm1
  2937. subpd %xmm1, %xmm12
  2938. movddup 7 * SIZE(BO), %xmm2
  2939. mulpd %xmm10, %xmm2
  2940. subpd %xmm2, %xmm14
  2941. movddup 10 * SIZE(BO), %xmm0
  2942. mulpd %xmm0, %xmm12
  2943. movddup 11 * SIZE(BO), %xmm1
  2944. mulpd %xmm12, %xmm1
  2945. subpd %xmm1, %xmm14
  2946. movddup 15 * SIZE(BO), %xmm0
  2947. mulpd %xmm0, %xmm14
  2948. #endif
        /* RT: 4x4 substitution in the reverse order xmm14, xmm12, xmm10,
           xmm8. Coefficients come from BO[4*i + j] with j <= i
           (offsets 15,14,13,12 / 10,9,8 / 5,4 / 0). */
  2949. #ifdef RT
  2950. movddup 15 * SIZE(BO), %xmm0
  2951. mulpd %xmm0, %xmm14
  2952. movddup 14 * SIZE(BO), %xmm1
  2953. mulpd %xmm14, %xmm1
  2954. subpd %xmm1, %xmm12
  2955. movddup 13 * SIZE(BO), %xmm2
  2956. mulpd %xmm14, %xmm2
  2957. subpd %xmm2, %xmm10
  2958. movddup 12 * SIZE(BO), %xmm3
  2959. mulpd %xmm14, %xmm3
  2960. subpd %xmm3, %xmm8
  2961. movddup 10 * SIZE(BO), %xmm0
  2962. mulpd %xmm0, %xmm12
  2963. movddup 9 * SIZE(BO), %xmm1
  2964. mulpd %xmm12, %xmm1
  2965. subpd %xmm1, %xmm10
  2966. movddup 8 * SIZE(BO), %xmm2
  2967. mulpd %xmm12, %xmm2
  2968. subpd %xmm2, %xmm8
  2969. movddup 5 * SIZE(BO), %xmm0
  2970. mulpd %xmm0, %xmm10
  2971. movddup 4 * SIZE(BO), %xmm1
  2972. mulpd %xmm10, %xmm1
  2973. subpd %xmm1, %xmm8
  2974. movddup 0 * SIZE(BO), %xmm0
  2975. mulpd %xmm0, %xmm8
  2976. #endif
        /* LN: step the C pointers back by 2 elements before storing. */
  2977. #ifdef LN
  2978. subq $2 * SIZE, CO1
  2979. subq $2 * SIZE, CO2
  2980. #endif
        /* Store 2 elements at each of CO1, CO2, CO1 + LDC*2 and CO2 + LDC*2.
           LN/LT registers are grouped by A lane, so each register is split
           across two C pointers; RN/RT registers map one-to-one. */
  2981. #if defined(LN) || defined(LT)
  2982. movsd %xmm1, 0 * SIZE(CO1)
  2983. movsd %xmm5, 1 * SIZE(CO1)
  2984. movhpd %xmm1, 0 * SIZE(CO2)
  2985. movhpd %xmm5, 1 * SIZE(CO2)
  2986. movsd %xmm3, 0 * SIZE(CO1, LDC, 2)
  2987. movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
  2988. movhpd %xmm3, 0 * SIZE(CO2, LDC, 2)
  2989. movhpd %xmm7, 1 * SIZE(CO2, LDC, 2)
  2990. #else
  2991. movsd %xmm8, 0 * SIZE(CO1)
  2992. movhpd %xmm8, 1 * SIZE(CO1)
  2993. movsd %xmm10, 0 * SIZE(CO2)
  2994. movhpd %xmm10, 1 * SIZE(CO2)
  2995. movsd %xmm12, 0 * SIZE(CO1, LDC, 2)
  2996. movhpd %xmm12, 1 * SIZE(CO1, LDC, 2)
  2997. movsd %xmm14, 0 * SIZE(CO2, LDC, 2)
  2998. movhpd %xmm14, 1 * SIZE(CO2, LDC, 2)
  2999. #endif
        /* Write the solved values back over the 8 elements that were loaded
           above (BO for LN/LT, AO for RN/RT). */
  3000. #if defined(LN) || defined(LT)
  3001. movapd %xmm1, 0 * SIZE(BO)
  3002. movapd %xmm3, 2 * SIZE(BO)
  3003. movapd %xmm5, 4 * SIZE(BO)
  3004. movapd %xmm7, 6 * SIZE(BO)
  3005. #else
  3006. movapd %xmm8, 0 * SIZE(AO)
  3007. movapd %xmm10, 2 * SIZE(AO)
  3008. movapd %xmm12, 4 * SIZE(AO)
  3009. movapd %xmm14, 6 * SIZE(AO)
  3010. #endif
        /* Every variant except LN advances the C pointers by 2 elements. */
  3011. #ifndef LN
  3012. addq $2 * SIZE, CO1
  3013. addq $2 * SIZE, CO2
  3014. #endif
        /* LT/RN: skip the remaining K - KK steps:
           AO += 2 * (K - KK) elements, BO += 4 * (K - KK) elements. */
  3015. #if defined(LT) || defined(RN)
  3016. movq K, %rax
  3017. subq KK, %rax
  3018. leaq (,%rax, SIZE), %rax
  3019. leaq (AO, %rax, 2), AO
  3020. leaq (BO, %rax, 4), BO
  3021. #endif
        /* KK moves by the tile width (2) for the left-side variants. */
  3022. #ifdef LN
  3023. subq $2, KK
  3024. #endif
  3025. #ifdef LT
  3026. addq $2, KK
  3027. #endif
        /* RT: AORIG += K << (1 + BASE_SHIFT). BASE_SHIFT is defined outside
           this view (presumably log2 of SIZE - confirm). */
  3028. #ifdef RT
  3029. movq K, %rax
  3030. salq $1 + BASE_SHIFT, %rax
  3031. addq %rax, AORIG
  3032. #endif
  3033. ALIGN_4
  3034. .L30:
        /* If bit 0 of M is clear, skip the 1-wide tail (.L31 through .L38)
           and go straight to the per-column-block bookkeeping at .L39. */
  3035. testq $1, M
  3036. je .L39
  3037. ALIGN_4
  3038. .L31:
        /* Setup for the 1-wide tail: one A value and four B values are
           consumed per k step (see .L32 and .L36). */

        /* LN: AORIG -= K << BASE_SHIFT. */
  3039. #ifdef LN
  3040. movq K, %rax
  3041. salq $0 + BASE_SHIFT, %rax
  3042. subq %rax, AORIG
  3043. #endif
        /* LN/RT: AO = AORIG + KK * SIZE, BO = B + 4 * KK * SIZE.
           LT/RN: BO = B (AO is left as it is). */
  3044. #if defined(LN) || defined(RT)
  3045. movq KK, %rax
  3046. leaq (, %rax, SIZE), %rax
  3047. movq AORIG, AO
  3048. leaq (AO, %rax, 1), AO
  3049. leaq (B, %rax, 4), BO
  3050. #else
  3051. movq B, BO
  3052. #endif
        /* Preload the operands for k steps 0, 2 (B half) and 4 (A value) of
           the loop at .L32 and clear the accumulators.
           Only xmm0 and xmm1 are accumulated into on this path; xmm2 and
           xmm3 are zeroed here and later overwritten by loads in .L38. */
  3053. movddup 0 * SIZE(AO), %xmm8
  3054. pxor %xmm0, %xmm0
  3055. movapd 0 * SIZE(BO), %xmm9
  3056. pxor %xmm1, %xmm1
  3057. movddup 4 * SIZE(AO), %xmm10
  3058. pxor %xmm2, %xmm2
  3059. movapd 8 * SIZE(BO), %xmm11
  3060. pxor %xmm3, %xmm3
        /* rax = KK (LT/RN) or K - KK (LN/RT), divided by 8 to give the
           number of passes through .L32; zero passes jumps to .L35. */
  3061. #if defined(LT) || defined(RN)
  3062. movq KK, %rax
  3063. #else
  3064. movq K, %rax
  3065. subq KK, %rax
  3066. #endif
  3067. sarq $3, %rax
  3068. je .L35
  3069. ALIGN_4
  3070. .L32:
        /* Main loop of the 1-wide tail: 8 k steps per pass, rax passes.
           Per step, with a = one A value duplicated into both lanes:
             xmm0 += a * { B[0], B[1] },  xmm1 += a * { B[2], B[3] }.
           A values cycle through xmm8 (steps 0-3) and xmm10 (steps 4-7);
           B pairs cycle through xmm9 and xmm11. Each register is reloaded
           for a later step right after its last use, so instruction order
           matters here.
           NOTE(review): the look-ahead loads at AO offsets 8 and 12 and BO
           offsets 16, 24, 32 and 40 execute on the final pass too, where
           they lie beyond the data that pass multiplies; presumably the
           packed buffers make that safe - confirm.
           PREFETCH and PREFETCHSIZE are macros defined outside this view. */

        /* Steps 0-1: A from xmm8, B from xmm9 (BO offsets 0..7). */
  3071. mulpd %xmm8, %xmm9
  3072. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  3073. addpd %xmm9, %xmm0
  3074. movapd 2 * SIZE(BO), %xmm9
  3075. mulpd %xmm8, %xmm9
  3076. movddup 1 * SIZE(AO), %xmm8
  3077. addpd %xmm9, %xmm1
  3078. movapd 4 * SIZE(BO), %xmm9
  3079. mulpd %xmm8, %xmm9
  3080. addpd %xmm9, %xmm0
  3081. movapd 6 * SIZE(BO), %xmm9
  3082. mulpd %xmm8, %xmm9
  3083. movddup 2 * SIZE(AO), %xmm8
  3084. addpd %xmm9, %xmm1
  3085. movapd 16 * SIZE(BO), %xmm9
        /* Steps 2-3: A from xmm8, B from xmm11 (BO offsets 8..15). */
  3086. mulpd %xmm8, %xmm11
  3087. addpd %xmm11, %xmm0
  3088. movapd 10 * SIZE(BO), %xmm11
  3089. mulpd %xmm8, %xmm11
  3090. movddup 3 * SIZE(AO), %xmm8
  3091. addpd %xmm11, %xmm1
  3092. movapd 12 * SIZE(BO), %xmm11
  3093. mulpd %xmm8, %xmm11
  3094. addpd %xmm11, %xmm0
  3095. movapd 14 * SIZE(BO), %xmm11
  3096. mulpd %xmm8, %xmm11
  3097. movddup 8 * SIZE(AO), %xmm8
  3098. addpd %xmm11, %xmm1
  3099. movapd 24 * SIZE(BO), %xmm11
        /* Steps 4-5: A from xmm10, B from xmm9 (BO offsets 16..23). */
  3100. mulpd %xmm10, %xmm9
  3101. addpd %xmm9, %xmm0
  3102. movapd 18 * SIZE(BO), %xmm9
  3103. mulpd %xmm10, %xmm9
  3104. movddup 5 * SIZE(AO), %xmm10
  3105. addpd %xmm9, %xmm1
  3106. movapd 20 * SIZE(BO), %xmm9
  3107. mulpd %xmm10, %xmm9
  3108. addpd %xmm9, %xmm0
  3109. movapd 22 * SIZE(BO), %xmm9
  3110. mulpd %xmm10, %xmm9
  3111. movddup 6 * SIZE(AO), %xmm10
  3112. addpd %xmm9, %xmm1
  3113. movapd 32 * SIZE(BO), %xmm9
        /* Steps 6-7: A from xmm10, B from xmm11 (BO offsets 24..31). */
  3114. mulpd %xmm10, %xmm11
  3115. addpd %xmm11, %xmm0
  3116. movapd 26 * SIZE(BO), %xmm11
  3117. mulpd %xmm10, %xmm11
  3118. movddup 7 * SIZE(AO), %xmm10
  3119. addpd %xmm11, %xmm1
  3120. movapd 28 * SIZE(BO), %xmm11
  3121. mulpd %xmm10, %xmm11
  3122. addpd %xmm11, %xmm0
  3123. movapd 30 * SIZE(BO), %xmm11
  3124. mulpd %xmm10, %xmm11
  3125. movddup 12 * SIZE(AO), %xmm10
  3126. addpd %xmm11, %xmm1
  3127. movapd 40 * SIZE(BO), %xmm11
        /* Advance by 8 A elements and 32 B elements; after the bump the
           preloaded registers line up with offsets 0, 4 (AO) and 0, 8 (BO). */
  3128. addq $ 8 * SIZE, AO
  3129. addq $32 * SIZE, BO
  3130. decq %rax
  3131. jne .L32
  3132. ALIGN_4
  3133. .L35:
        /* Leftover-iteration setup for the 1-wide tail.
           rax = KK (LT/RN) or K - KK (LN/RT), masked to its low three bits,
           i.e. the steps not covered by the 8-step loop at .L32. When zero,
           the single-step loop at .L36 is skipped and control goes to .L38.
           BRANCH is a macro defined outside this view. */
  3134. #if defined(LT) || defined(RN)
  3135. movq KK, %rax
  3136. #else
  3137. movq K, %rax
  3138. subq KK, %rax
  3139. #endif
  3140. andq $7, %rax # if (k & 7)
  3141. BRANCH
  3142. je .L38
  3143. ALIGN_4
  3144. .L36:
        /* One k step per pass of the 1-wide tail; rax counts the passes.
           xmm8 = current A value in both lanes, xmm9 = { BO[0], BO[1] }
           (both already loaded on entry, either by .L31 or by the look-ahead
           loads at the end of .L32).
           xmm0 += xmm8 * { BO[0], BO[1] }, xmm1 += xmm8 * { BO[2], BO[3] }. */
  3145. mulpd %xmm8, %xmm9
  3146. addpd %xmm9, %xmm0
  3147. movapd 2 * SIZE(BO), %xmm9
  3148. mulpd %xmm8, %xmm9
        /* Load the A value and first B pair of the next step before the
           pointers are advanced. */
  3149. movddup 1 * SIZE(AO), %xmm8
  3150. addpd %xmm9, %xmm1
  3151. movapd 4 * SIZE(BO), %xmm9
  3152. addq $1 * SIZE, AO # aoffset += 1 element
  3153. addq $4 * SIZE, BO # boffset1 += 4 elements
  3154. decq %rax
  3155. jg .L36
  3156. ALIGN_4
  3157. .L38:
        /* Solve-and-store stage for the 1-wide tail.
           xmm0 / xmm1 hold the accumulated products for B indices {0,1} and
           {2,3}. They are subtracted from four values loaded from BO (LN/LT)
           or AO (RN/RT); the variant-specific substitution is applied; the
           four results go to C and back into the buffer they came from.
           NOTE(review): diagonal coefficients are applied with multiplies,
           so they are presumably stored pre-inverted - confirm. */

        /* LN/RT: rax = KK - 1 (LN) or KK - 4 (RT), scaled by SIZE;
           AO = AORIG + rax, BO = B + 4 * rax. */
  3158. #if defined(LN) || defined(RT)
  3159. movq KK, %rax
  3160. #ifdef LN
  3161. subq $1, %rax
  3162. #else
  3163. subq $4, %rax
  3164. #endif
  3165. leaq (, %rax, SIZE), %rax
  3166. movq AORIG, AO
  3167. leaq (AO, %rax, 1), AO
  3168. leaq (B, %rax, 4), BO
  3169. #endif
        /* xmm2 = loaded[0..1] - xmm0, xmm3 = loaded[2..3] - xmm1. */
  3170. #if defined(LN) || defined(LT)
  3171. movapd 0 * SIZE(BO), %xmm2
  3172. movapd 2 * SIZE(BO), %xmm3
  3173. subpd %xmm0, %xmm2
  3174. subpd %xmm1, %xmm3
  3175. #else
  3176. movapd 0 * SIZE(AO), %xmm2
  3177. movapd 2 * SIZE(AO), %xmm3
  3178. subpd %xmm0, %xmm2
  3179. subpd %xmm1, %xmm3
  3180. #endif
        /* LN and LT: the triangular block is a single coefficient AO[0], so
           both variants just scale all four values by it. */
  3181. #ifdef LN
  3182. movddup 0 * SIZE(AO), %xmm0
  3183. mulpd %xmm0, %xmm2
  3184. mulpd %xmm0, %xmm3
  3185. #endif
  3186. #ifdef LT
  3187. movddup 0 * SIZE(AO), %xmm0
  3188. mulpd %xmm0, %xmm2
  3189. mulpd %xmm0, %xmm3
  3190. #endif
        /* RN: scalar 4x4 substitution. The high halves of xmm2 / xmm3 are
           copied down into xmm0 / xmm1, so the four values live in the low
           lanes of xmm2, xmm0, xmm3, xmm1 (in that order). Coefficients come
           from BO[4*i + j] with j >= i. The unpcklpd pair at the end packs
           the results back as xmm2 = { v0, v1 }, xmm3 = { v2, v3 }.
           xmm4..xmm7 are used as scratch. */
  3191. #ifdef RN
  3192. movapd %xmm2, %xmm0
  3193. unpckhpd %xmm0, %xmm0
  3194. movapd %xmm3, %xmm1
  3195. unpckhpd %xmm1, %xmm1
  3196. movsd 0 * SIZE(BO), %xmm4
  3197. mulsd %xmm4, %xmm2
  3198. movsd 1 * SIZE(BO), %xmm5
  3199. mulsd %xmm2, %xmm5
  3200. subsd %xmm5, %xmm0
  3201. movsd 2 * SIZE(BO), %xmm6
  3202. mulsd %xmm2, %xmm6
  3203. subsd %xmm6, %xmm3
  3204. movsd 3 * SIZE(BO), %xmm7
  3205. mulsd %xmm2, %xmm7
  3206. subsd %xmm7, %xmm1
  3207. movsd 5 * SIZE(BO), %xmm4
  3208. mulsd %xmm4, %xmm0
  3209. movsd 6 * SIZE(BO), %xmm5
  3210. mulsd %xmm0, %xmm5
  3211. subsd %xmm5, %xmm3
  3212. movsd 7 * SIZE(BO), %xmm6
  3213. mulsd %xmm0, %xmm6
  3214. subsd %xmm6, %xmm1
  3215. movsd 10 * SIZE(BO), %xmm4
  3216. mulsd %xmm4, %xmm3
  3217. movsd 11 * SIZE(BO), %xmm5
  3218. mulsd %xmm3, %xmm5
  3219. subsd %xmm5, %xmm1
  3220. movsd 15 * SIZE(BO), %xmm4
  3221. mulsd %xmm4, %xmm1
  3222. unpcklpd %xmm0, %xmm2
  3223. unpcklpd %xmm1, %xmm3
  3224. #endif
        /* RT: same register layout as RN, but the substitution runs from the
           last value (xmm1) back to the first (xmm2), with coefficients
           BO[4*i + j], j <= i, read from offset 15 down to 0. */
  3225. #ifdef RT
  3226. movapd %xmm2, %xmm0
  3227. unpckhpd %xmm0, %xmm0
  3228. movapd %xmm3, %xmm1
  3229. unpckhpd %xmm1, %xmm1
  3230. movsd 15 * SIZE(BO), %xmm4
  3231. mulsd %xmm4, %xmm1
  3232. movsd 14 * SIZE(BO), %xmm5
  3233. mulsd %xmm1, %xmm5
  3234. subsd %xmm5, %xmm3
  3235. movsd 13 * SIZE(BO), %xmm6
  3236. mulsd %xmm1, %xmm6
  3237. subsd %xmm6, %xmm0
  3238. movsd 12 * SIZE(BO), %xmm7
  3239. mulsd %xmm1, %xmm7
  3240. subsd %xmm7, %xmm2
  3241. movsd 10 * SIZE(BO), %xmm4
  3242. mulsd %xmm4, %xmm3
  3243. movsd 9 * SIZE(BO), %xmm5
  3244. mulsd %xmm3, %xmm5
  3245. subsd %xmm5, %xmm0
  3246. movsd 8 * SIZE(BO), %xmm6
  3247. mulsd %xmm3, %xmm6
  3248. subsd %xmm6, %xmm2
  3249. movsd 5 * SIZE(BO), %xmm4
  3250. mulsd %xmm4, %xmm0
  3251. movsd 4 * SIZE(BO), %xmm5
  3252. mulsd %xmm0, %xmm5
  3253. subsd %xmm5, %xmm2
  3254. movsd 0 * SIZE(BO), %xmm4
  3255. mulsd %xmm4, %xmm2
  3256. unpcklpd %xmm0, %xmm2
  3257. unpcklpd %xmm1, %xmm3
  3258. #endif
        /* LN: step the C pointers back by 1 element before storing. */
  3259. #ifdef LN
  3260. subq $1 * SIZE, CO1
  3261. subq $1 * SIZE, CO2
  3262. #endif
        /* Store one element at each of CO1, CO2, CO1 + LDC*2, CO2 + LDC*2.
           Both arms of this conditional contain the same four stores. */
  3263. #if defined(LN) || defined(LT)
  3264. movsd %xmm2, 0 * SIZE(CO1)
  3265. movhpd %xmm2, 0 * SIZE(CO2)
  3266. movsd %xmm3, 0 * SIZE(CO1, LDC, 2)
  3267. movhpd %xmm3, 0 * SIZE(CO2, LDC, 2)
  3268. #else
  3269. movsd %xmm2, 0 * SIZE(CO1)
  3270. movhpd %xmm2, 0 * SIZE(CO2)
  3271. movsd %xmm3, 0 * SIZE(CO1, LDC, 2)
  3272. movhpd %xmm3, 0 * SIZE(CO2, LDC, 2)
  3273. #endif
        /* Write the solved values back over the 4 elements loaded above
           (BO for LN/LT, AO for RN/RT). */
  3274. #if defined(LN) || defined(LT)
  3275. movapd %xmm2, 0 * SIZE(BO)
  3276. movapd %xmm3, 2 * SIZE(BO)
  3277. #else
  3278. movapd %xmm2, 0 * SIZE(AO)
  3279. movapd %xmm3, 2 * SIZE(AO)
  3280. #endif
        /* Every variant except LN advances the C pointers by 1 element. */
  3281. #ifndef LN
  3282. addq $1 * SIZE, CO1
  3283. addq $1 * SIZE, CO2
  3284. #endif
        /* LT/RN: AO += (K - KK) elements, BO += 4 * (K - KK) elements. */
  3285. #if defined(LT) || defined(RN)
  3286. movq K, %rax
  3287. subq KK, %rax
  3288. leaq (,%rax, SIZE), %rax
  3289. leaq (AO, %rax, 1), AO
  3290. leaq (BO, %rax, 4), BO
  3291. #endif
        /* KK moves by the tile width (1) for the left-side variants. */
  3292. #ifdef LN
  3293. subq $1, KK
  3294. #endif
  3295. #ifdef LT
  3296. addq $1, KK
  3297. #endif
        /* RT: AORIG += K << BASE_SHIFT. */
  3298. #ifdef RT
  3299. movq K, %rax
  3300. salq $0 + BASE_SHIFT, %rax
  3301. addq %rax, AORIG
  3302. #endif
  3303. ALIGN_4
  3304. .L39:
        /* End of one pass over the current group of 4 B-indexed columns:
           update B and KK, then decrement J and loop back to .L10 (defined
           outside this view) while J is still positive.
             LN:    B += 4 * K elements.
             LT/RN: B = BO (BO was already advanced past this group).
             RN:    KK += 4.    RT: KK -= 4. */
  3305. #ifdef LN
  3306. leaq (, K, SIZE), %rax
  3307. leaq (B, %rax, 4), B
  3308. #endif
  3309. #if defined(LT) || defined(RN)
  3310. movq BO, B
  3311. #endif
  3312. #ifdef RN
  3313. addq $4, KK
  3314. #endif
  3315. #ifdef RT
  3316. subq $4, KK
  3317. #endif
  3318. decq J # j --
  3319. jg .L10
  3320. ALIGN_4
  3321. .L999:
        /* Common exit: reload the saved registers from the stack frame,
           release the frame and return. The matching stores are presumably
           in the function prologue, which is outside this view. */

        /* General-purpose registers kept in slots 0..40 of the frame. */
  3322. movq 0(%rsp), %rbx
  3323. movq 8(%rsp), %rbp
  3324. movq 16(%rsp), %r12
  3325. movq 24(%rsp), %r13
  3326. movq 32(%rsp), %r14
  3327. movq 40(%rsp), %r15
        /* Windows builds additionally reload rdi, rsi and xmm6..xmm15, which
           the Microsoft x64 convention treats as callee-saved. movups is
           used, so the xmm slots need not be 16-byte aligned. */
  3328. #ifdef WINDOWS_ABI
  3329. movq 48(%rsp), %rdi
  3330. movq 56(%rsp), %rsi
  3331. movups 64(%rsp), %xmm6
  3332. movups 80(%rsp), %xmm7
  3333. movups 96(%rsp), %xmm8
  3334. movups 112(%rsp), %xmm9
  3335. movups 128(%rsp), %xmm10
  3336. movups 144(%rsp), %xmm11
  3337. movups 160(%rsp), %xmm12
  3338. movups 176(%rsp), %xmm13
  3339. movups 192(%rsp), %xmm14
  3340. movups 208(%rsp), %xmm15
  3341. #endif
        /* Drop the STACKSIZE-byte frame and return. EPILOGUE is a macro
           defined outside this view. */
  3342. addq $STACKSIZE, %rsp
  3343. ret
  3344. EPILOGUE