
dgemm_kernel_4x8_haswell.S 89 kB

/*********************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#define BO3 %rbp
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#define L_BUFFER_SIZE 256*8*12+4096
#else
#define STACKSIZE 256
#define L_BUFFER_SIZE 128*8*12+512
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif
#define Ndiv12 24(%rsp)
#define Nmod12 32(%rsp)
#define N 40(%rsp)
#define ALPHA 48(%rsp)
#define OFFSET 56(%rsp)
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $ 0, 4096 * 4(%rsp);\
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $ 0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
#define A_PR1 512
#define B_PR1 512
/*******************************************************************************************
* Macro definitions
*******************************************************************************************/
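/*
 * Kernel macro families (one per m x n block size of C):
 *   INITmxn       clears the accumulator registers for an m x n block
 *   KERNELmxn_SUB performs one k-step update of that block from A (AO) and B (BO)
 *   KERNELmxn_I / _M1 / _M2 / _E are the software-pipelined forms of the same update
 *                 (first, middle and last iterations of an unrolled inner loop)
 *   SAVEmxn       scales the accumulators by ALPHA and writes the block to C (CO1);
 *                 unless TRMMKERNEL is defined, the existing values in C are added first
 *
 * In the 4xN kernels the A vector is cycled with vpermpd $ 0xb1 (swap the two doubles
 * inside each 128-bit lane) and vpermpd $ 0x1b (reverse all four doubles), so the four
 * accumulators per column group of B hold rotated element pairings; the SAVE macros undo
 * that rotation with the vblendpd/vpermpd sequences before the results are stored.
 */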
.macro INIT4x12
vxorpd %ymm4 , %ymm4 , %ymm4
vxorpd %ymm5 , %ymm5 , %ymm5
vxorpd %ymm6 , %ymm6 , %ymm6
vxorpd %ymm7 , %ymm7 , %ymm7
vxorpd %ymm8 , %ymm8 , %ymm8
vxorpd %ymm9 , %ymm9 , %ymm9
vxorpd %ymm10, %ymm10, %ymm10
vxorpd %ymm11, %ymm11, %ymm11
vxorpd %ymm12, %ymm12, %ymm12
vxorpd %ymm13, %ymm13, %ymm13
vxorpd %ymm14, %ymm14, %ymm14
vxorpd %ymm15, %ymm15, %ymm15
.endm
.macro KERNEL4x12_I
prefetcht0 A_PR1(AO)
vmovups -12 * SIZE(BO), %ymm1
prefetcht0 B_PR1(BO)
vmovups -16 * SIZE(AO), %ymm0
prefetcht0 B_PR1+64(BO)
vmovups -8 * SIZE(BO), %ymm2
prefetcht0 B_PR1+128(BO)
vmovups -4 * SIZE(BO), %ymm3
vmulpd %ymm0 ,%ymm1 , %ymm4
prefetcht0 B_PR1+192(BO)
vmulpd %ymm0 ,%ymm2 , %ymm8
vmulpd %ymm0 ,%ymm3 , %ymm12
prefetcht0 B_PR1+256(BO)
vpermpd $ 0xb1, %ymm0 , %ymm0
vmulpd %ymm0 ,%ymm1 , %ymm5
vmulpd %ymm0 ,%ymm2 , %ymm9
vmulpd %ymm0 ,%ymm3 , %ymm13
vpermpd $ 0x1b, %ymm0 , %ymm0
vmulpd %ymm0 ,%ymm1 , %ymm6
vmulpd %ymm0 ,%ymm2 , %ymm10
addq $ 12*SIZE, BO
vmulpd %ymm0 ,%ymm3 , %ymm14
vpermpd $ 0xb1, %ymm0 , %ymm0
vmulpd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vmulpd %ymm0 ,%ymm2 , %ymm11
vmovups -8 * SIZE(BO), %ymm2
vmulpd %ymm0 ,%ymm3 , %ymm15
vmovups -4 * SIZE(BO), %ymm3
.endm
.macro KERNEL4x12_M1
prefetcht0 A_PR1(AO)
vmovups -16 * SIZE(AO), %ymm0
prefetcht0 B_PR1(BO)
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
prefetcht0 B_PR1+64(BO)
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
prefetcht0 B_PR1+128(BO)
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
vpermpd $ 0x1b, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups -12 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vmovups -8 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
vmovups -4 * SIZE(BO), %ymm3
.endm
.macro KERNEL4x12_M2
vmovups -12 * SIZE(AO), %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
vpermpd $ 0x1b, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vmovups 0 * SIZE(BO), %ymm1
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vmovups 4 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
vmovups 8 * SIZE(BO), %ymm3
addq $ 24*SIZE, BO
.endm
.macro KERNEL4x12_E
vmovups -12 * SIZE(AO), %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
vpermpd $ 0x1b, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 8*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
addq $ 12*SIZE, BO
.endm
.macro KERNEL4x12_SUB
vmovups -12 * SIZE(BO), %ymm1
vmovups -16 * SIZE(AO), %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm4
vmovups -8 * SIZE(BO), %ymm2
vfmadd231pd %ymm0 ,%ymm2 , %ymm8
vmovups -4 * SIZE(BO), %ymm3
vfmadd231pd %ymm0 ,%ymm3 , %ymm12
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm5
vfmadd231pd %ymm0 ,%ymm2 , %ymm9
addq $ 12*SIZE, BO
vfmadd231pd %ymm0 ,%ymm3 , %ymm13
vpermpd $ 0x1b, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm6
vfmadd231pd %ymm0 ,%ymm2 , %ymm10
addq $ 4*SIZE, AO
vfmadd231pd %ymm0 ,%ymm3 , %ymm14
vpermpd $ 0xb1, %ymm0 , %ymm0
vfmadd231pd %ymm0 ,%ymm1 , %ymm7
vfmadd231pd %ymm0 ,%ymm2 , %ymm11
vfmadd231pd %ymm0 ,%ymm3 , %ymm15
.endm
.macro SAVE4x12
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm6 , %ymm6
vmulpd %ymm0 , %ymm7 , %ymm7
vmulpd %ymm0 , %ymm8 , %ymm8
vmulpd %ymm0 , %ymm9 , %ymm9
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm11, %ymm11
vmulpd %ymm0 , %ymm12, %ymm12
vmulpd %ymm0 , %ymm13, %ymm13
vmulpd %ymm0 , %ymm14, %ymm14
vmulpd %ymm0 , %ymm15, %ymm15
vpermpd $ 0xb1 , %ymm5, %ymm5
vpermpd $ 0xb1 , %ymm7, %ymm7
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
vpermpd $ 0x1b , %ymm2, %ymm2
vpermpd $ 0x1b , %ymm3, %ymm3
vpermpd $ 0xb1 , %ymm2, %ymm2
vpermpd $ 0xb1 , %ymm3, %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
leaq (CO1, LDC, 2), %rax
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4, %ymm4
vaddpd (CO1, LDC), %ymm5, %ymm5
vaddpd (%rax), %ymm6, %ymm6
vaddpd (%rax, LDC), %ymm7, %ymm7
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm5 , (CO1, LDC)
vmovups %ymm6 , (%rax)
vmovups %ymm7 , (%rax, LDC)
prefetcht0 32(CO1)
prefetcht0 32(CO1,LDC)
prefetcht0 32(%rax)
prefetcht0 32(%rax,LDC)
vpermpd $ 0xb1 , %ymm9 , %ymm9
vpermpd $ 0xb1 , %ymm11, %ymm11
vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
vpermpd $ 0x1b , %ymm2, %ymm2
vpermpd $ 0x1b , %ymm3, %ymm3
vpermpd $ 0xb1 , %ymm2, %ymm2
vpermpd $ 0xb1 , %ymm3, %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
#if !defined(TRMMKERNEL)
vaddpd (%rax), %ymm4, %ymm4
vaddpd (%rax, LDC), %ymm5, %ymm5
vaddpd (%rbp), %ymm6, %ymm6
vaddpd (%rbp, LDC), %ymm7, %ymm7
#endif
vmovups %ymm4 , (%rax)
vmovups %ymm5 , (%rax, LDC)
vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
prefetcht0 32(%rax)
prefetcht0 32(%rax,LDC)
prefetcht0 32(%rbp)
prefetcht0 32(%rbp,LDC)
vpermpd $ 0xb1 , %ymm13, %ymm13
vpermpd $ 0xb1 , %ymm15, %ymm15
vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0
vblendpd $ 0x05, %ymm13, %ymm12, %ymm1
vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2
vblendpd $ 0x05, %ymm15, %ymm14, %ymm3
vpermpd $ 0x1b , %ymm2, %ymm2
vpermpd $ 0x1b , %ymm3, %ymm3
vpermpd $ 0xb1 , %ymm2, %ymm2
vpermpd $ 0xb1 , %ymm3, %ymm3
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
leaq (%rax, LDC, 4), %rax
leaq (%rbp, LDC, 4), %rbp
#if !defined(TRMMKERNEL)
vaddpd (%rax), %ymm4, %ymm4
vaddpd (%rax, LDC), %ymm5, %ymm5
vaddpd (%rbp), %ymm6, %ymm6
vaddpd (%rbp, LDC), %ymm7, %ymm7
#endif
vmovups %ymm4 , (%rax)
vmovups %ymm5 , (%rax, LDC)
vmovups %ymm6 , (%rbp)
vmovups %ymm7 , (%rbp, LDC)
prefetcht0 32(%rax)
prefetcht0 32(%rax,LDC)
prefetcht0 32(%rbp)
prefetcht0 32(%rbp,LDC)
addq $ 4*SIZE, CO1
.endm
/******************************************************************************************/
.macro INIT2x12
vxorpd %xmm4 , %xmm4 , %xmm4
vxorpd %xmm5 , %xmm5 , %xmm5
vxorpd %xmm6 , %xmm6 , %xmm6
vxorpd %xmm7 , %xmm7 , %xmm7
vxorpd %xmm8 , %xmm8 , %xmm8
vxorpd %xmm9 , %xmm9 , %xmm9
vxorpd %xmm10, %xmm10, %xmm10
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm12, %xmm12, %xmm12
vxorpd %xmm13, %xmm13, %xmm13
vxorpd %xmm14, %xmm14, %xmm14
vxorpd %xmm15, %xmm15, %xmm15
.endm
.macro KERNEL2x12_SUB
vmovups -16 * SIZE(AO), %xmm0
vmovddup -12 * SIZE(BO), %xmm1
vmovddup -11 * SIZE(BO), %xmm2
vmovddup -10 * SIZE(BO), %xmm3
vfmadd231pd %xmm0 ,%xmm1 , %xmm4
vmovddup -9 * SIZE(BO), %xmm1
vfmadd231pd %xmm0 ,%xmm2 , %xmm5
vmovddup -8 * SIZE(BO), %xmm2
vfmadd231pd %xmm0 ,%xmm3 , %xmm6
vmovddup -7 * SIZE(BO), %xmm3
vfmadd231pd %xmm0 ,%xmm1 , %xmm7
vmovddup -6 * SIZE(BO), %xmm1
vfmadd231pd %xmm0 ,%xmm2 , %xmm8
vmovddup -5 * SIZE(BO), %xmm2
vfmadd231pd %xmm0 ,%xmm3 , %xmm9
vmovddup -4 * SIZE(BO), %xmm3
vfmadd231pd %xmm0 ,%xmm1 , %xmm10
vmovddup -3 * SIZE(BO), %xmm1
vfmadd231pd %xmm0 ,%xmm2 , %xmm11
vmovddup -2 * SIZE(BO), %xmm2
vfmadd231pd %xmm0 ,%xmm3 , %xmm12
vmovddup -1 * SIZE(BO), %xmm3
vfmadd231pd %xmm0 ,%xmm1 , %xmm13
addq $ 12*SIZE, BO
vfmadd231pd %xmm0 ,%xmm2 , %xmm14
addq $ 2*SIZE, AO
vfmadd231pd %xmm0 ,%xmm3 , %xmm15
.endm
.macro SAVE2x12
vmovddup ALPHA, %xmm0
vmulpd %xmm0 , %xmm4 , %xmm4
vmulpd %xmm0 , %xmm5 , %xmm5
vmulpd %xmm0 , %xmm6 , %xmm6
vmulpd %xmm0 , %xmm7 , %xmm7
vmulpd %xmm0 , %xmm8 , %xmm8
vmulpd %xmm0 , %xmm9 , %xmm9
vmulpd %xmm0 , %xmm10, %xmm10
vmulpd %xmm0 , %xmm11, %xmm11
vmulpd %xmm0 , %xmm12, %xmm12
vmulpd %xmm0 , %xmm13, %xmm13
vmulpd %xmm0 , %xmm14, %xmm14
vmulpd %xmm0 , %xmm15, %xmm15
leaq (CO1, LDC, 2), %rax
#if !defined(TRMMKERNEL)
vaddpd (CO1), %xmm4, %xmm4
vaddpd (CO1, LDC), %xmm5, %xmm5
vaddpd (%rax), %xmm6, %xmm6
vaddpd (%rax, LDC), %xmm7, %xmm7
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm5 , (CO1, LDC)
vmovups %xmm6 , (%rax)
vmovups %xmm7 , (%rax, LDC)
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
#if !defined(TRMMKERNEL)
vaddpd (%rax), %xmm8 , %xmm4
vaddpd (%rax, LDC), %xmm9 , %xmm5
vaddpd (%rbp), %xmm10, %xmm6
vaddpd (%rbp, LDC), %xmm11, %xmm7
#endif
vmovups %xmm4 , (%rax)
vmovups %xmm5 , (%rax, LDC)
vmovups %xmm6 , (%rbp)
vmovups %xmm7 , (%rbp, LDC)
leaq (%rax, LDC, 4), %rax
leaq (%rbp, LDC, 4), %rbp
#if !defined(TRMMKERNEL)
vaddpd (%rax), %xmm12, %xmm4
vaddpd (%rax, LDC), %xmm13, %xmm5
vaddpd (%rbp), %xmm14, %xmm6
vaddpd (%rbp, LDC), %xmm15, %xmm7
#endif
vmovups %xmm4 , (%rax)
vmovups %xmm5 , (%rax, LDC)
vmovups %xmm6 , (%rbp)
vmovups %xmm7 , (%rbp, LDC)
addq $ 2*SIZE, CO1
.endm
/******************************************************************************************/
.macro INIT1x12
vxorpd %xmm4 , %xmm4 , %xmm4
vxorpd %xmm5 , %xmm5 , %xmm5
vxorpd %xmm6 , %xmm6 , %xmm6
vxorpd %xmm7 , %xmm7 , %xmm7
vxorpd %xmm8 , %xmm8 , %xmm8
vxorpd %xmm9 , %xmm9 , %xmm9
vxorpd %xmm10, %xmm10, %xmm10
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm12, %xmm12, %xmm12
vxorpd %xmm13, %xmm13, %xmm13
vxorpd %xmm14, %xmm14, %xmm14
vxorpd %xmm15, %xmm15, %xmm15
.endm
.macro KERNEL1x12_SUB
vmovsd -16 * SIZE(AO), %xmm0
vmovsd -12 * SIZE(BO), %xmm1
vmovsd -11 * SIZE(BO), %xmm2
vmovsd -10 * SIZE(BO), %xmm3
vfmadd231sd %xmm0 ,%xmm1 , %xmm4
vmovsd -9 * SIZE(BO), %xmm1
vfmadd231sd %xmm0 ,%xmm2 , %xmm5
vmovsd -8 * SIZE(BO), %xmm2
vfmadd231sd %xmm0 ,%xmm3 , %xmm6
vmovsd -7 * SIZE(BO), %xmm3
vfmadd231sd %xmm0 ,%xmm1 , %xmm7
vmovsd -6 * SIZE(BO), %xmm1
vfmadd231sd %xmm0 ,%xmm2 , %xmm8
vmovsd -5 * SIZE(BO), %xmm2
vfmadd231sd %xmm0 ,%xmm3 , %xmm9
vmovsd -4 * SIZE(BO), %xmm3
vfmadd231sd %xmm0 ,%xmm1 , %xmm10
vmovsd -3 * SIZE(BO), %xmm1
vfmadd231sd %xmm0 ,%xmm2 , %xmm11
vmovsd -2 * SIZE(BO), %xmm2
vfmadd231sd %xmm0 ,%xmm3 , %xmm12
vmovsd -1 * SIZE(BO), %xmm3
vfmadd231sd %xmm0 ,%xmm1 , %xmm13
addq $ 12*SIZE, BO
vfmadd231sd %xmm0 ,%xmm2 , %xmm14
addq $ 1*SIZE, AO
vfmadd231sd %xmm0 ,%xmm3 , %xmm15
.endm
.macro SAVE1x12
vmovsd ALPHA, %xmm0
vmulsd %xmm0 , %xmm4 , %xmm4
vmulsd %xmm0 , %xmm5 , %xmm5
vmulsd %xmm0 , %xmm6 , %xmm6
vmulsd %xmm0 , %xmm7 , %xmm7
vmulsd %xmm0 , %xmm8 , %xmm8
vmulsd %xmm0 , %xmm9 , %xmm9
vmulsd %xmm0 , %xmm10, %xmm10
vmulsd %xmm0 , %xmm11, %xmm11
vmulsd %xmm0 , %xmm12, %xmm12
vmulsd %xmm0 , %xmm13, %xmm13
vmulsd %xmm0 , %xmm14, %xmm14
vmulsd %xmm0 , %xmm15, %xmm15
leaq (CO1, LDC, 2), %rax
#if !defined(TRMMKERNEL)
vaddsd (CO1), %xmm4, %xmm4
vaddsd (CO1, LDC), %xmm5, %xmm5
vaddsd (%rax), %xmm6, %xmm6
vaddsd (%rax, LDC), %xmm7, %xmm7
#endif
vmovsd %xmm4 , (CO1)
vmovsd %xmm5 , (CO1, LDC)
vmovsd %xmm6 , (%rax)
vmovsd %xmm7 , (%rax, LDC)
leaq (%rax, LDC, 2), %rax
leaq (%rax, LDC, 2), %rbp
#if !defined(TRMMKERNEL)
vaddsd (%rax), %xmm8 , %xmm4
vaddsd (%rax, LDC), %xmm9 , %xmm5
vaddsd (%rbp), %xmm10, %xmm6
vaddsd (%rbp, LDC), %xmm11, %xmm7
#endif
vmovsd %xmm4 , (%rax)
vmovsd %xmm5 , (%rax, LDC)
vmovsd %xmm6 , (%rbp)
vmovsd %xmm7 , (%rbp, LDC)
leaq (%rax, LDC, 4), %rax
leaq (%rbp, LDC, 4), %rbp
#if !defined(TRMMKERNEL)
vaddsd (%rax), %xmm12, %xmm4
vaddsd (%rax, LDC), %xmm13, %xmm5
vaddsd (%rbp), %xmm14, %xmm6
vaddsd (%rbp, LDC), %xmm15, %xmm7
#endif
vmovsd %xmm4 , (%rax)
vmovsd %xmm5 , (%rax, LDC)
vmovsd %xmm6 , (%rbp)
vmovsd %xmm7 , (%rbp, LDC)
addq $ 1*SIZE, CO1
.endm
/******************************************************************************************/
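/*
 * The 4x8, 4x4 and narrower kernels below follow the same pattern as the 4x12 family,
 * just with fewer columns of B kept in registers (two ymm vectors of B for 4x8, one for
 * 4x4) and with xmm or scalar arithmetic for the 2xN and 1xN edge cases.
 */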
  529. .macro INIT4x8
  530. vxorpd %ymm4 , %ymm4 , %ymm4
  531. vxorpd %ymm5 , %ymm5 , %ymm5
  532. vxorpd %ymm6 , %ymm6 , %ymm6
  533. vxorpd %ymm7 , %ymm7 , %ymm7
  534. vxorpd %ymm8 , %ymm8 , %ymm8
  535. vxorpd %ymm9 , %ymm9 , %ymm9
  536. vxorpd %ymm10, %ymm10, %ymm10
  537. vxorpd %ymm11, %ymm11, %ymm11
  538. .endm
  539. .macro KERNEL4x8_I
  540. vmovups -12 * SIZE(BO), %ymm1
  541. vmovups -16 * SIZE(AO), %ymm0
  542. vmovups -8 * SIZE(BO), %ymm2
  543. vmulpd %ymm0 ,%ymm1 , %ymm4
  544. vmulpd %ymm0 ,%ymm2 , %ymm8
  545. vpermpd $ 0xb1, %ymm0 , %ymm0
  546. vmulpd %ymm0 ,%ymm1 , %ymm5
  547. vmulpd %ymm0 ,%ymm2 , %ymm9
  548. vpermpd $ 0x1b, %ymm0 , %ymm0
  549. vmulpd %ymm0 ,%ymm1 , %ymm6
  550. vmulpd %ymm0 ,%ymm2 , %ymm10
  551. addq $ 8*SIZE, BO
  552. vpermpd $ 0xb1, %ymm0 , %ymm0
  553. vmulpd %ymm0 ,%ymm1 , %ymm7
  554. vmovups -12 * SIZE(BO), %ymm1
  555. vmulpd %ymm0 ,%ymm2 , %ymm11
  556. vmovups -8 * SIZE(BO), %ymm2
  557. .endm
  558. .macro KERNEL4x8_M1
  559. prefetcht0 A_PR1(AO)
  560. vmovups -16 * SIZE(AO), %ymm0
  561. prefetcht0 B_PR1(BO)
  562. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  563. prefetcht0 B_PR1+64(BO)
  564. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  565. vpermpd $ 0xb1, %ymm0 , %ymm0
  566. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  567. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  568. vpermpd $ 0x1b, %ymm0 , %ymm0
  569. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  570. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  571. vpermpd $ 0xb1, %ymm0 , %ymm0
  572. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  573. vmovups -12 * SIZE(BO), %ymm1
  574. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  575. vmovups -8 * SIZE(BO), %ymm2
  576. .endm
  577. .macro KERNEL4x8_M2
  578. vmovups -12 * SIZE(AO), %ymm0
  579. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  580. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  581. vpermpd $ 0xb1, %ymm0 , %ymm0
  582. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  583. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  584. vpermpd $ 0x1b, %ymm0 , %ymm0
  585. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  586. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  587. addq $ 8*SIZE, AO
  588. vpermpd $ 0xb1, %ymm0 , %ymm0
  589. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  590. vmovups -4 * SIZE(BO), %ymm1
  591. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  592. vmovups 0 * SIZE(BO), %ymm2
  593. addq $ 16*SIZE, BO
  594. .endm
  595. .macro KERNEL4x8_E
  596. vmovups -12 * SIZE(AO), %ymm0
  597. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  598. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  599. vpermpd $ 0xb1, %ymm0 , %ymm0
  600. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  601. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  602. vpermpd $ 0x1b, %ymm0 , %ymm0
  603. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  604. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  605. addq $ 8*SIZE, AO
  606. vpermpd $ 0xb1, %ymm0 , %ymm0
  607. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  608. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  609. addq $ 8*SIZE, BO
  610. .endm
  611. .macro KERNEL4x8_SUB
  612. vmovups -12 * SIZE(BO), %ymm1
  613. vmovups -16 * SIZE(AO), %ymm0
  614. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  615. vmovups -8 * SIZE(BO), %ymm2
  616. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  617. vpermpd $ 0xb1, %ymm0 , %ymm0
  618. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  619. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  620. addq $ 8*SIZE, BO
  621. vpermpd $ 0x1b, %ymm0 , %ymm0
  622. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  623. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  624. addq $ 4*SIZE, AO
  625. vpermpd $ 0xb1, %ymm0 , %ymm0
  626. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  627. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  628. .endm
  629. .macro SAVE4x8
  630. vbroadcastsd ALPHA, %ymm0
  631. vmulpd %ymm0 , %ymm4 , %ymm4
  632. vmulpd %ymm0 , %ymm5 , %ymm5
  633. vmulpd %ymm0 , %ymm6 , %ymm6
  634. vmulpd %ymm0 , %ymm7 , %ymm7
  635. vmulpd %ymm0 , %ymm8 , %ymm8
  636. vmulpd %ymm0 , %ymm9 , %ymm9
  637. vmulpd %ymm0 , %ymm10, %ymm10
  638. vmulpd %ymm0 , %ymm11, %ymm11
  639. vpermpd $ 0xb1 , %ymm5, %ymm5
  640. vpermpd $ 0xb1 , %ymm7, %ymm7
  641. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  642. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  643. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  644. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  645. vpermpd $ 0x1b , %ymm2, %ymm2
  646. vpermpd $ 0x1b , %ymm3, %ymm3
  647. vpermpd $ 0xb1 , %ymm2, %ymm2
  648. vpermpd $ 0xb1 , %ymm3, %ymm3
  649. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  650. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  651. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  652. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  653. leaq (CO1, LDC, 2), %rax
  654. #if !defined(TRMMKERNEL)
  655. vaddpd (CO1), %ymm4, %ymm4
  656. vaddpd (CO1, LDC), %ymm5, %ymm5
  657. vaddpd (%rax), %ymm6, %ymm6
  658. vaddpd (%rax, LDC), %ymm7, %ymm7
  659. #endif
  660. vmovups %ymm4 , (CO1)
  661. vmovups %ymm5 , (CO1, LDC)
  662. vmovups %ymm6 , (%rax)
  663. vmovups %ymm7 , (%rax, LDC)
  664. prefetcht0 32(CO1)
  665. prefetcht0 32(CO1,LDC)
  666. prefetcht0 32(%rax)
  667. prefetcht0 32(%rax,LDC)
  668. vpermpd $ 0xb1 , %ymm9 , %ymm9
  669. vpermpd $ 0xb1 , %ymm11, %ymm11
  670. vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
  671. vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
  672. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
  673. vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
  674. vpermpd $ 0x1b , %ymm2, %ymm2
  675. vpermpd $ 0x1b , %ymm3, %ymm3
  676. vpermpd $ 0xb1 , %ymm2, %ymm2
  677. vpermpd $ 0xb1 , %ymm3, %ymm3
  678. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  679. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  680. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  681. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  682. leaq (%rax, LDC, 2), %rax
  683. leaq (%rax, LDC, 2), %rbp
  684. #if !defined(TRMMKERNEL)
  685. vaddpd (%rax), %ymm4, %ymm4
  686. vaddpd (%rax, LDC), %ymm5, %ymm5
  687. vaddpd (%rbp), %ymm6, %ymm6
  688. vaddpd (%rbp, LDC), %ymm7, %ymm7
  689. #endif
  690. vmovups %ymm4 , (%rax)
  691. vmovups %ymm5 , (%rax, LDC)
  692. vmovups %ymm6 , (%rbp)
  693. vmovups %ymm7 , (%rbp, LDC)
  694. prefetcht0 32(%rax)
  695. prefetcht0 32(%rax,LDC)
  696. prefetcht0 32(%rbp)
  697. prefetcht0 32(%rbp,LDC)
  698. addq $ 4*SIZE, CO1
  699. .endm
  700. /******************************************************************************************/
  701. .macro INIT2x8
  702. vxorpd %xmm4 , %xmm4 , %xmm4
  703. vxorpd %xmm5 , %xmm5 , %xmm5
  704. vxorpd %xmm6 , %xmm6 , %xmm6
  705. vxorpd %xmm7 , %xmm7 , %xmm7
  706. vxorpd %xmm8 , %xmm8 , %xmm8
  707. vxorpd %xmm9 , %xmm9 , %xmm9
  708. vxorpd %xmm10, %xmm10, %xmm10
  709. vxorpd %xmm11, %xmm11, %xmm11
  710. .endm
  711. .macro KERNEL2x8_SUB
  712. vmovups -16 * SIZE(AO), %xmm0
  713. vmovddup -12 * SIZE(BO), %xmm1
  714. vmovddup -11 * SIZE(BO), %xmm2
  715. vmovddup -10 * SIZE(BO), %xmm3
  716. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  717. vmovddup -9 * SIZE(BO), %xmm1
  718. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  719. vmovddup -8 * SIZE(BO), %xmm2
  720. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  721. vmovddup -7 * SIZE(BO), %xmm3
  722. vfmadd231pd %xmm0 ,%xmm1 , %xmm7
  723. vmovddup -6 * SIZE(BO), %xmm1
  724. vfmadd231pd %xmm0 ,%xmm2 , %xmm8
  725. vmovddup -5 * SIZE(BO), %xmm2
  726. vfmadd231pd %xmm0 ,%xmm3 , %xmm9
  727. vfmadd231pd %xmm0 ,%xmm1 , %xmm10
  728. vfmadd231pd %xmm0 ,%xmm2 , %xmm11
  729. addq $ 8*SIZE, BO
  730. addq $ 2*SIZE, AO
  731. .endm
  732. .macro SAVE2x8
  733. vmovddup ALPHA, %xmm0
  734. vmulpd %xmm0 , %xmm4 , %xmm4
  735. vmulpd %xmm0 , %xmm5 , %xmm5
  736. vmulpd %xmm0 , %xmm6 , %xmm6
  737. vmulpd %xmm0 , %xmm7 , %xmm7
  738. vmulpd %xmm0 , %xmm8 , %xmm8
  739. vmulpd %xmm0 , %xmm9 , %xmm9
  740. vmulpd %xmm0 , %xmm10, %xmm10
  741. vmulpd %xmm0 , %xmm11, %xmm11
  742. leaq (CO1, LDC, 2), %rax
  743. #if !defined(TRMMKERNEL)
  744. vaddpd (CO1), %xmm4, %xmm4
  745. vaddpd (CO1, LDC), %xmm5, %xmm5
  746. vaddpd (%rax), %xmm6, %xmm6
  747. vaddpd (%rax, LDC), %xmm7, %xmm7
  748. #endif
  749. vmovups %xmm4 , (CO1)
  750. vmovups %xmm5 , (CO1, LDC)
  751. vmovups %xmm6 , (%rax)
  752. vmovups %xmm7 , (%rax, LDC)
  753. leaq (%rax, LDC, 2), %rax
  754. leaq (%rax, LDC, 2), %rbp
  755. #if !defined(TRMMKERNEL)
  756. vaddpd (%rax), %xmm8 , %xmm4
  757. vaddpd (%rax, LDC), %xmm9 , %xmm5
  758. vaddpd (%rbp), %xmm10, %xmm6
  759. vaddpd (%rbp, LDC), %xmm11, %xmm7
  760. #endif
  761. vmovups %xmm4 , (%rax)
  762. vmovups %xmm5 , (%rax, LDC)
  763. vmovups %xmm6 , (%rbp)
  764. vmovups %xmm7 , (%rbp, LDC)
  765. addq $ 2*SIZE, CO1
  766. .endm
  767. /******************************************************************************************/
  768. .macro INIT1x8
  769. vxorpd %xmm4 , %xmm4 , %xmm4
  770. vxorpd %xmm5 , %xmm5 , %xmm5
  771. vxorpd %xmm6 , %xmm6 , %xmm6
  772. vxorpd %xmm7 , %xmm7 , %xmm7
  773. vxorpd %xmm8 , %xmm8 , %xmm8
  774. vxorpd %xmm9 , %xmm9 , %xmm9
  775. vxorpd %xmm10, %xmm10, %xmm10
  776. vxorpd %xmm11, %xmm11, %xmm11
  777. .endm
  778. .macro KERNEL1x8_SUB
  779. vmovsd -16 * SIZE(AO), %xmm0
  780. vmovsd -12 * SIZE(BO), %xmm1
  781. vmovsd -11 * SIZE(BO), %xmm2
  782. vmovsd -10 * SIZE(BO), %xmm3
  783. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  784. vmovsd -9 * SIZE(BO), %xmm1
  785. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  786. vmovsd -8 * SIZE(BO), %xmm2
  787. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  788. vmovsd -7 * SIZE(BO), %xmm3
  789. vfmadd231sd %xmm0 ,%xmm1 , %xmm7
  790. vmovsd -6 * SIZE(BO), %xmm1
  791. vfmadd231sd %xmm0 ,%xmm2 , %xmm8
  792. vmovsd -5 * SIZE(BO), %xmm2
  793. vfmadd231sd %xmm0 ,%xmm3 , %xmm9
  794. vfmadd231sd %xmm0 ,%xmm1 , %xmm10
  795. vfmadd231sd %xmm0 ,%xmm2 , %xmm11
  796. addq $ 8*SIZE, BO
  797. addq $ 1*SIZE, AO
  798. .endm
  799. .macro SAVE1x8
  800. vmovsd ALPHA, %xmm0
  801. vmulsd %xmm0 , %xmm4 , %xmm4
  802. vmulsd %xmm0 , %xmm5 , %xmm5
  803. vmulsd %xmm0 , %xmm6 , %xmm6
  804. vmulsd %xmm0 , %xmm7 , %xmm7
  805. vmulsd %xmm0 , %xmm8 , %xmm8
  806. vmulsd %xmm0 , %xmm9 , %xmm9
  807. vmulsd %xmm0 , %xmm10, %xmm10
  808. vmulsd %xmm0 , %xmm11, %xmm11
  809. leaq (CO1, LDC, 2), %rax
  810. #if !defined(TRMMKERNEL)
  811. vaddsd (CO1), %xmm4, %xmm4
  812. vaddsd (CO1, LDC), %xmm5, %xmm5
  813. vaddsd (%rax), %xmm6, %xmm6
  814. vaddsd (%rax, LDC), %xmm7, %xmm7
  815. #endif
  816. vmovsd %xmm4 , (CO1)
  817. vmovsd %xmm5 , (CO1, LDC)
  818. vmovsd %xmm6 , (%rax)
  819. vmovsd %xmm7 , (%rax, LDC)
  820. leaq (%rax, LDC, 2), %rax
  821. leaq (%rax, LDC, 2), %rbp
  822. #if !defined(TRMMKERNEL)
  823. vaddsd (%rax), %xmm8 , %xmm4
  824. vaddsd (%rax, LDC), %xmm9 , %xmm5
  825. vaddsd (%rbp), %xmm10, %xmm6
  826. vaddsd (%rbp, LDC), %xmm11, %xmm7
  827. #endif
  828. vmovsd %xmm4 , (%rax)
  829. vmovsd %xmm5 , (%rax, LDC)
  830. vmovsd %xmm6 , (%rbp)
  831. vmovsd %xmm7 , (%rbp, LDC)
  832. addq $ 1*SIZE, CO1
  833. .endm
  834. /******************************************************************************************/
  835. .macro INIT4x4
  836. vxorpd %ymm4 , %ymm4 , %ymm4
  837. vxorpd %ymm5 , %ymm5 , %ymm5
  838. vxorpd %ymm6 , %ymm6 , %ymm6
  839. vxorpd %ymm7 , %ymm7 , %ymm7
  840. .endm
  841. .macro KERNEL4x4_I
  842. prefetcht0 A_PR1(AO)
  843. vmovups -12 * SIZE(BO), %ymm1
  844. vmovups -16 * SIZE(AO), %ymm0
  845. vmulpd %ymm0 ,%ymm1 , %ymm4
  846. vpermpd $ 0xb1, %ymm0 , %ymm0
  847. vmulpd %ymm0 ,%ymm1 , %ymm5
  848. vpermpd $ 0x1b, %ymm0 , %ymm0
  849. vmulpd %ymm0 ,%ymm1 , %ymm6
  850. addq $ 4*SIZE, BO
  851. vpermpd $ 0xb1, %ymm0 , %ymm0
  852. vmulpd %ymm0 ,%ymm1 , %ymm7
  853. vmovups -12 * SIZE(BO), %ymm1
  854. .endm
  855. .macro KERNEL4x4_M1
  856. prefetcht0 A_PR1(AO)
  857. vmovups -16 * SIZE(AO), %ymm0
  858. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  859. vpermpd $ 0xb1, %ymm0 , %ymm0
  860. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  861. vpermpd $ 0x1b, %ymm0 , %ymm0
  862. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  863. vpermpd $ 0xb1, %ymm0 , %ymm0
  864. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  865. vmovups -12 * SIZE(BO), %ymm1
  866. .endm
  867. .macro KERNEL4x4_M2
  868. vmovups -12 * SIZE(AO), %ymm0
  869. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  870. vpermpd $ 0xb1, %ymm0 , %ymm0
  871. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  872. vpermpd $ 0x1b, %ymm0 , %ymm0
  873. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  874. addq $ 8*SIZE, AO
  875. vpermpd $ 0xb1, %ymm0 , %ymm0
  876. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  877. vmovups -8 * SIZE(BO), %ymm1
  878. addq $ 8*SIZE, BO
  879. .endm
  880. .macro KERNEL4x4_E
  881. vmovups -12 * SIZE(AO), %ymm0
  882. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  883. vpermpd $ 0xb1, %ymm0 , %ymm0
  884. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  885. vpermpd $ 0x1b, %ymm0 , %ymm0
  886. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  887. addq $ 8*SIZE, AO
  888. vpermpd $ 0xb1, %ymm0 , %ymm0
  889. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  890. addq $ 4*SIZE, BO
  891. .endm
  892. .macro KERNEL4x4_SUB
  893. vmovups -12 * SIZE(BO), %ymm1
  894. vmovups -16 * SIZE(AO), %ymm0
  895. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  896. vpermpd $ 0xb1, %ymm0 , %ymm0
  897. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  898. addq $ 4*SIZE, BO
  899. vpermpd $ 0x1b, %ymm0 , %ymm0
  900. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  901. addq $ 4*SIZE, AO
  902. vpermpd $ 0xb1, %ymm0 , %ymm0
  903. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  904. .endm
  905. .macro SAVE4x4
  906. vbroadcastsd ALPHA, %ymm0
  907. vmulpd %ymm0 , %ymm4 , %ymm4
  908. vmulpd %ymm0 , %ymm7 , %ymm7
  909. vmulpd %ymm0 , %ymm5 , %ymm5
  910. vmulpd %ymm0 , %ymm6 , %ymm6
  911. vpermpd $ 0xb1 , %ymm5, %ymm5
  912. vpermpd $ 0xb1 , %ymm7, %ymm7
  913. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  914. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  915. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  916. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  917. vpermpd $ 0x1b , %ymm2, %ymm2
  918. vpermpd $ 0x1b , %ymm3, %ymm3
  919. vpermpd $ 0xb1 , %ymm2, %ymm2
  920. vpermpd $ 0xb1 , %ymm3, %ymm3
  921. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  922. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  923. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  924. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  925. leaq (CO1, LDC, 2), %rax
  926. #if !defined(TRMMKERNEL)
  927. vaddpd (CO1), %ymm4, %ymm4
  928. vaddpd (CO1, LDC), %ymm5, %ymm5
  929. vaddpd (%rax), %ymm6, %ymm6
  930. vaddpd (%rax, LDC), %ymm7, %ymm7
  931. #endif
  932. vmovups %ymm4 , (CO1)
  933. vmovups %ymm5 , (CO1, LDC)
  934. vmovups %ymm6 , (%rax)
  935. vmovups %ymm7 , (%rax, LDC)
  936. addq $ 4*SIZE, CO1
  937. .endm
  938. /******************************************************************************************/
  939. /******************************************************************************************/
  940. .macro INIT2x4
  941. vxorpd %xmm4 , %xmm4 , %xmm4
  942. vxorpd %xmm5 , %xmm5 , %xmm5
  943. vxorpd %xmm6 , %xmm6 , %xmm6
  944. vxorpd %xmm7 , %xmm7 , %xmm7
  945. .endm
  946. .macro KERNEL2x4_SUB
  947. vmovddup -12 * SIZE(BO), %xmm1
  948. vmovups -16 * SIZE(AO), %xmm0
  949. vmovddup -11 * SIZE(BO), %xmm2
  950. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  951. vmovddup -10 * SIZE(BO), %xmm3
  952. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  953. vmovddup -9 * SIZE(BO), %xmm8
  954. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  955. addq $ 4*SIZE, BO
  956. vfmadd231pd %xmm0 ,%xmm8 , %xmm7
  957. addq $ 2*SIZE, AO
  958. .endm
  959. .macro SAVE2x4
  960. vmovddup ALPHA, %xmm0
  961. vmulpd %xmm0 , %xmm4 , %xmm4
  962. vmulpd %xmm0 , %xmm5 , %xmm5
  963. vmulpd %xmm0 , %xmm6 , %xmm6
  964. vmulpd %xmm0 , %xmm7 , %xmm7
  965. leaq (CO1, LDC, 2), %rax
  966. #if !defined(TRMMKERNEL)
  967. vaddpd (CO1), %xmm4, %xmm4
  968. vaddpd (CO1, LDC), %xmm5, %xmm5
  969. vaddpd (%rax), %xmm6, %xmm6
  970. vaddpd (%rax, LDC), %xmm7, %xmm7
  971. #endif
  972. vmovups %xmm4 , (CO1)
  973. vmovups %xmm5 , (CO1, LDC)
  974. vmovups %xmm6 , (%rax)
  975. vmovups %xmm7 , (%rax, LDC)
  976. addq $ 2*SIZE, CO1
  977. .endm
  978. /******************************************************************************************/
  979. /******************************************************************************************/
  980. .macro INIT1x4
  981. vxorpd %xmm4 , %xmm4 , %xmm4
  982. vxorpd %xmm5 , %xmm5 , %xmm5
  983. vxorpd %xmm6 , %xmm6 , %xmm6
  984. vxorpd %xmm7 , %xmm7 , %xmm7
  985. .endm
  986. .macro KERNEL1x4_SUB
  987. vmovsd -12 * SIZE(BO), %xmm1
  988. vmovsd -16 * SIZE(AO), %xmm0
  989. vmovsd -11 * SIZE(BO), %xmm2
  990. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  991. vmovsd -10 * SIZE(BO), %xmm3
  992. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  993. vmovsd -9 * SIZE(BO), %xmm8
  994. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  995. addq $ 4*SIZE, BO
  996. vfmadd231sd %xmm0 ,%xmm8 , %xmm7
  997. addq $ 1*SIZE, AO
  998. .endm
  999. .macro SAVE1x4
  1000. vmovsd ALPHA, %xmm0
  1001. vmulsd %xmm0 , %xmm4 , %xmm4
  1002. vmulsd %xmm0 , %xmm5 , %xmm5
  1003. vmulsd %xmm0 , %xmm6 , %xmm6
  1004. vmulsd %xmm0 , %xmm7 , %xmm7
  1005. leaq (CO1, LDC, 2), %rax
  1006. #if !defined(TRMMKERNEL)
  1007. vaddsd (CO1), %xmm4, %xmm4
  1008. vaddsd (CO1, LDC), %xmm5, %xmm5
  1009. vaddsd (%rax), %xmm6, %xmm6
  1010. vaddsd (%rax, LDC), %xmm7, %xmm7
  1011. #endif
  1012. vmovsd %xmm4 , (CO1)
  1013. vmovsd %xmm5 , (CO1, LDC)
  1014. vmovsd %xmm6 , (%rax)
  1015. vmovsd %xmm7 , (%rax, LDC)
  1016. addq $ 1*SIZE, CO1
  1017. .endm
  1018. /******************************************************************************************/
  1019. /******************************************************************************************/
  1020. .macro INIT4x2
  1021. vxorpd %xmm4 , %xmm4 , %xmm4
  1022. vxorpd %xmm5 , %xmm5 , %xmm5
  1023. vxorpd %xmm6 , %xmm6 , %xmm6
  1024. vxorpd %xmm7 , %xmm7 , %xmm7
  1025. .endm
  1026. .macro KERNEL4x2_SUB
  1027. vmovddup -12 * SIZE(BO), %xmm2
  1028. vmovups -16 * SIZE(AO), %xmm0
  1029. vmovups -14 * SIZE(AO), %xmm1
  1030. vmovddup -11 * SIZE(BO), %xmm3
  1031. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  1032. vfmadd231pd %xmm1 ,%xmm2 , %xmm5
  1033. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  1034. vfmadd231pd %xmm1 ,%xmm3 , %xmm7
  1035. addq $ 2*SIZE, BO
  1036. addq $ 4*SIZE, AO
  1037. .endm
  1038. .macro SAVE4x2
  1039. vmovddup ALPHA, %xmm0
  1040. vmulpd %xmm0 , %xmm4 , %xmm4
  1041. vmulpd %xmm0 , %xmm5 , %xmm5
  1042. vmulpd %xmm0 , %xmm6 , %xmm6
  1043. vmulpd %xmm0 , %xmm7 , %xmm7
  1044. #if !defined(TRMMKERNEL)
  1045. vaddpd (CO1) , %xmm4, %xmm4
  1046. vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
  1047. vaddpd (CO1, LDC), %xmm6, %xmm6
  1048. vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7
  1049. #endif
  1050. vmovups %xmm4 , (CO1)
  1051. vmovups %xmm5 , 2 * SIZE(CO1)
  1052. vmovups %xmm6 , (CO1, LDC)
  1053. vmovups %xmm7 , 2 * SIZE(CO1, LDC)
  1054. addq $ 4*SIZE, CO1
  1055. .endm
  1056. /******************************************************************************************/
  1057. /******************************************************************************************/
  1058. .macro INIT2x2
  1059. vxorpd %xmm4 , %xmm4 , %xmm4
  1060. vxorpd %xmm6 , %xmm6 , %xmm6
  1061. .endm
  1062. .macro KERNEL2x2_SUB
  1063. vmovddup -12 * SIZE(BO), %xmm2
  1064. vmovups -16 * SIZE(AO), %xmm0
  1065. vmovddup -11 * SIZE(BO), %xmm3
  1066. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  1067. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  1068. addq $ 2*SIZE, BO
  1069. addq $ 2*SIZE, AO
  1070. .endm
  1071. .macro SAVE2x2
  1072. vmovddup ALPHA, %xmm0
  1073. vmulpd %xmm0 , %xmm4 , %xmm4
  1074. vmulpd %xmm0 , %xmm6 , %xmm6
  1075. #if !defined(TRMMKERNEL)
  1076. vaddpd (CO1) , %xmm4, %xmm4
  1077. vaddpd (CO1, LDC), %xmm6, %xmm6
  1078. #endif
  1079. vmovups %xmm4 , (CO1)
  1080. vmovups %xmm6 , (CO1, LDC)
  1081. addq $ 2*SIZE, CO1
  1082. .endm
  1083. /******************************************************************************************/
  1084. /******************************************************************************************/
  1085. .macro INIT1x2
  1086. vxorpd %xmm4 , %xmm4 , %xmm4
  1087. vxorpd %xmm5 , %xmm5 , %xmm5
  1088. .endm
  1089. .macro KERNEL1x2_SUB
  1090. vmovsd -12 * SIZE(BO), %xmm1
  1091. vmovsd -16 * SIZE(AO), %xmm0
  1092. vmovsd -11 * SIZE(BO), %xmm2
  1093. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  1094. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  1095. addq $ 2*SIZE, BO
  1096. addq $ 1*SIZE, AO
  1097. .endm
  1098. .macro SAVE1x2
  1099. vmovsd ALPHA, %xmm0
  1100. vmulsd %xmm0 , %xmm4 , %xmm4
  1101. vmulsd %xmm0 , %xmm5 , %xmm5
  1102. #if !defined(TRMMKERNEL)
  1103. vaddsd (CO1), %xmm4, %xmm4
  1104. vaddsd (CO1, LDC), %xmm5, %xmm5
  1105. #endif
  1106. vmovsd %xmm4 , (CO1)
  1107. vmovsd %xmm5 , (CO1, LDC)
  1108. addq $ 1*SIZE, CO1
  1109. .endm
  1110. /******************************************************************************************/
  1111. /******************************************************************************************/
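// 4x1 tile: KERNEL4x1 unrolls eight k-iterations and spreads the FMAs over
// ymm4-ymm7; the four partial sums are reduced into ymm4 in SAVE4x1.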
  1112. .macro INIT4x1
  1113. vxorpd %ymm4 , %ymm4 , %ymm4
  1114. vxorpd %ymm5 , %ymm5 , %ymm5
  1115. vxorpd %ymm6 , %ymm6 , %ymm6
  1116. vxorpd %ymm7 , %ymm7 , %ymm7
  1117. .endm
  1118. .macro KERNEL4x1
  1119. vbroadcastsd -12 * SIZE(BO), %ymm0
  1120. vbroadcastsd -11 * SIZE(BO), %ymm1
  1121. vbroadcastsd -10 * SIZE(BO), %ymm2
  1122. vbroadcastsd -9 * SIZE(BO), %ymm3
  1123. vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
  1124. vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
  1125. vbroadcastsd -8 * SIZE(BO), %ymm0
  1126. vbroadcastsd -7 * SIZE(BO), %ymm1
  1127. vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
  1128. vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
  1129. vbroadcastsd -6 * SIZE(BO), %ymm2
  1130. vbroadcastsd -5 * SIZE(BO), %ymm3
  1131. vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
  1132. vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
  1133. vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
  1134. vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
  1135. addq $ 8 *SIZE, BO
  1136. addq $ 32*SIZE, AO
  1137. .endm
  1138. .macro KERNEL4x1_SUB
  1139. vbroadcastsd -12 * SIZE(BO), %ymm2
  1140. vmovups -16 * SIZE(AO), %ymm0
  1141. vfmadd231pd %ymm0 ,%ymm2 , %ymm4
  1142. addq $ 1*SIZE, BO
  1143. addq $ 4*SIZE, AO
  1144. .endm
  1145. .macro SAVE4x1
  1146. vbroadcastsd ALPHA, %ymm0
  1147. vaddpd %ymm4,%ymm5, %ymm4
  1148. vaddpd %ymm6,%ymm7, %ymm6
  1149. vaddpd %ymm4,%ymm6, %ymm4
  1150. vmulpd %ymm0 , %ymm4 , %ymm4
  1151. #if !defined(TRMMKERNEL)
  1152. vaddpd (CO1) , %ymm4, %ymm4
  1153. #endif
  1154. vmovups %ymm4 , (CO1)
  1155. addq $ 4*SIZE, CO1
  1156. .endm
  1157. /******************************************************************************************/
  1158. /******************************************************************************************/
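// 2x1 tile: xmm4 accumulates the two rows of the single column.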
  1159. .macro INIT2x1
  1160. vxorpd %xmm4 , %xmm4 , %xmm4
  1161. .endm
  1162. .macro KERNEL2x1_SUB
  1163. vmovddup -12 * SIZE(BO), %xmm2
  1164. vmovups -16 * SIZE(AO), %xmm0
  1165. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  1166. addq $ 1*SIZE, BO
  1167. addq $ 2*SIZE, AO
  1168. .endm
  1169. .macro SAVE2x1
  1170. vmovddup ALPHA, %xmm0
  1171. vmulpd %xmm0 , %xmm4 , %xmm4
  1172. #if !defined(TRMMKERNEL)
  1173. vaddpd (CO1) , %xmm4, %xmm4
  1174. #endif
  1175. vmovups %xmm4 , (CO1)
  1176. addq $ 2*SIZE, CO1
  1177. .endm
  1178. /******************************************************************************************/
  1179. /******************************************************************************************/
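// 1x1 tile: one scalar FMA per k-iteration.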
  1180. .macro INIT1x1
  1181. vxorpd %xmm4 , %xmm4 , %xmm4
  1182. .endm
  1183. .macro KERNEL1x1_SUB
  1184. vmovsd -12 * SIZE(BO), %xmm1
  1185. vmovsd -16 * SIZE(AO), %xmm0
  1186. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  1187. addq $ 1*SIZE, BO
  1188. addq $ 1*SIZE, AO
  1189. .endm
  1190. .macro SAVE1x1
  1191. vmovsd ALPHA, %xmm0
  1192. vmulsd %xmm0 , %xmm4 , %xmm4
  1193. #if !defined(TRMMKERNEL)
  1194. vaddsd (CO1), %xmm4, %xmm4
  1195. #endif
  1196. vmovsd %xmm4 , (CO1)
  1197. addq $ 1*SIZE, CO1
  1198. .endm
  1199. /*******************************************************************************************/
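// GEMM driver (non-TRMM build). N is consumed 24 columns at a time: each outer
// iteration packs B into BUFFER1 twice and runs two 12-wide passes (.L12_xx and
// .L13_xx); the N remainder is then handled 8, 4, 2 and 1 columns wide, and M
// is tiled by 4, 2 and 1 rows inside every N block. Note that Ndiv12/Nmod12
// actually hold N/24 and N%24 in this build.
/*
   Illustrative C sketch of the blocking. This is a hypothetical reference
   routine (dgemm_ref is not part of this file); it assumes column-major
   storage and m, n multiples of 4 and 12, and it ignores the B packing and
   the remainder paths handled by the assembly below.

   static void dgemm_ref(int m, int n, int k, double alpha,
                         const double *a, int lda,
                         const double *b, int ldb,
                         double *c, int ldc)
   {
       for (int j = 0; j < n; j += 12)
           for (int i = 0; i < m; i += 4) {
               double acc[4][12] = {{0.0}};           // per-tile accumulators (registers in asm)
               for (int l = 0; l < k; l++)            // rank-1 updates, 8x unrolled in the asm
                   for (int jj = 0; jj < 12; jj++)
                       for (int ii = 0; ii < 4; ii++)
                           acc[ii][jj] += a[(i+ii) + l*lda] * b[l + (j+jj)*ldb];
               for (int jj = 0; jj < 12; jj++)        // alpha applied once at save time
                   for (int ii = 0; ii < 4; ii++)
                       c[(i+ii) + (j+jj)*ldc] += alpha * acc[ii][jj];
           }
   }
*/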
  1200. #if !defined(TRMMKERNEL)
  1201. PROLOGUE
  1202. PROFCODE
  1203. subq $STACKSIZE, %rsp
  1204. movq %rbx, (%rsp)
  1205. movq %rbp, 8(%rsp)
  1206. movq %r12, 16(%rsp)
  1207. movq %r13, 24(%rsp)
  1208. movq %r14, 32(%rsp)
  1209. movq %r15, 40(%rsp)
  1210. vzeroupper
  1211. #ifdef WINDOWS_ABI
  1212. movq %rdi, 48(%rsp)
  1213. movq %rsi, 56(%rsp)
  1214. vmovups %xmm6, 64(%rsp)
  1215. vmovups %xmm7, 80(%rsp)
  1216. vmovups %xmm8, 96(%rsp)
  1217. vmovups %xmm9, 112(%rsp)
  1218. vmovups %xmm10, 128(%rsp)
  1219. vmovups %xmm11, 144(%rsp)
  1220. vmovups %xmm12, 160(%rsp)
  1221. vmovups %xmm13, 176(%rsp)
  1222. vmovups %xmm14, 192(%rsp)
  1223. vmovups %xmm15, 208(%rsp)
  1224. movq ARG1, OLD_M
  1225. movq ARG2, OLD_N
  1226. movq ARG3, OLD_K
  1227. movq OLD_A, A
  1228. movq OLD_B, B
  1229. movq OLD_C, C
  1230. movq OLD_LDC, LDC
  1231. vmovups %xmm3, %xmm0
  1232. #else
  1233. movq STACKSIZE + 8(%rsp), LDC
  1234. #endif
  1235. movq %rsp, SP # save old stack
  1236. subq $128 + L_BUFFER_SIZE, %rsp
  1237. andq $-4096, %rsp # align stack
  1238. STACK_TOUCH
  1239. cmpq $ 0, OLD_M
  1240. je .L999
  1241. cmpq $ 0, OLD_N
  1242. je .L999
  1243. cmpq $ 0, OLD_K
  1244. je .L999
  1245. movq OLD_M, M
  1246. movq OLD_N, N
  1247. movq OLD_K, K
  1248. vmovsd %xmm0, ALPHA
  1249. salq $BASE_SHIFT, LDC
  1250. movq N, %rax
  1251. xorq %rdx, %rdx
  1252. movq $24, %rdi
  1253. divq %rdi // N / 24
  1254. movq %rax, Ndiv12 // N / 24
  1255. movq %rdx, Nmod12 // N % 24
  1256. movq Ndiv12, J
  1257. cmpq $ 0, J
  1258. je .L8_0
  1259. ALIGN_4
  1260. .L12_01:
  1261. // copy to sub buffer
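// Each k-iteration packs 12 consecutive doubles into BUFFER1: the 8 values of
// this row from the first panel (BO1) plus the first 4 values from the second
// panel (BO2). BO2's remaining 4 values per row are packed by the second pass
// below (.L13_02b).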
  1262. movq K, %rax
  1263. salq $3,%rax // K * 8 ; read 8 values from BO1
  1264. movq B, BO1
  1265. leaq (B,%rax, SIZE), BO2 // next offset to BO2
  1266. movq BO2 , B
  1267. leaq BUFFER1, BO // first buffer to BO
  1268. movq K, %rax
  1269. ALIGN_4
  1270. .L12_02b:
  1271. vmovups 0 * SIZE(BO1), %ymm1
  1272. vmovups 4 * SIZE(BO1), %ymm2
  1273. vmovups 0 * SIZE(BO2), %ymm3
  1274. vmovups %ymm1, 0 * SIZE(BO)
  1275. vmovups %ymm2, 4 * SIZE(BO)
  1276. vmovups %ymm3, 8 * SIZE(BO)
  1277. addq $ 8*SIZE,BO1
  1278. addq $ 8*SIZE,BO2
  1279. addq $ 12*SIZE,BO
  1280. decq %rax
  1281. jnz .L12_02b
  1282. .L12_03c:
  1283. .L12_10:
  1284. movq C, CO1
  1285. leaq (C, LDC, 8), C
  1286. leaq (C, LDC, 4), C // c += 12 * ldc
  1287. movq A, AO // aoffset = a
  1288. addq $16 * SIZE, AO
  1289. movq M, I
  1290. sarq $2, I // i = m / 4
  1291. je .L12_20
  1292. ALIGN_4
  1293. .L12_11:
  1294. leaq BUFFER1, BO // first buffer to BO
  1295. addq $12 * SIZE, BO
  1296. movq K, %rax
  1297. sarq $3, %rax // K / 8
  1298. cmpq $2, %rax
  1299. jl .L12_13
  1300. KERNEL4x12_I
  1301. KERNEL4x12_M2
  1302. KERNEL4x12_M1
  1303. KERNEL4x12_M2
  1304. KERNEL4x12_M1
  1305. KERNEL4x12_M2
  1306. KERNEL4x12_M1
  1307. KERNEL4x12_M2
  1308. subq $2, %rax
  1309. je .L12_12a
  1310. ALIGN_5
  1311. .L12_12:
  1312. KERNEL4x12_M1
  1313. KERNEL4x12_M2
  1314. KERNEL4x12_M1
  1315. KERNEL4x12_M2
  1316. KERNEL4x12_M1
  1317. KERNEL4x12_M2
  1318. KERNEL4x12_M1
  1319. KERNEL4x12_M2
  1320. dec %rax
  1321. jne .L12_12
  1322. .L12_12a:
  1323. KERNEL4x12_M1
  1324. KERNEL4x12_M2
  1325. KERNEL4x12_M1
  1326. KERNEL4x12_M2
  1327. KERNEL4x12_M1
  1328. KERNEL4x12_M2
  1329. KERNEL4x12_M1
  1330. KERNEL4x12_E
  1331. jmp .L12_16
  1332. .L12_13:
  1333. test $1, %rax
  1334. jz .L12_14
  1335. KERNEL4x12_I
  1336. KERNEL4x12_M2
  1337. KERNEL4x12_M1
  1338. KERNEL4x12_M2
  1339. KERNEL4x12_M1
  1340. KERNEL4x12_M2
  1341. KERNEL4x12_M1
  1342. KERNEL4x12_E
  1343. jmp .L12_16
  1344. .L12_14:
  1345. INIT4x12
  1346. .L12_16:
  1347. movq K, %rax
  1348. andq $7, %rax # if (k & 1)
  1349. je .L12_19
  1350. ALIGN_4
  1351. .L12_17:
  1352. KERNEL4x12_SUB
  1353. dec %rax
  1354. jne .L12_17
  1355. ALIGN_4
  1356. .L12_19:
  1357. SAVE4x12
  1358. decq I # i --
  1359. jne .L12_11
  1360. ALIGN_4
  1361. /**************************************************************************
  1362. * Rest of M
  1363. ***************************************************************************/
  1364. .L12_20:
  1365. // Test rest of M
  1366. testq $3, M
1367. jz .L12_100 // to next lines of N
  1368. .L12_30:
  1369. testq $2, M
  1370. jz .L12_40
  1371. ALIGN_4
  1372. .L12_31:
  1373. leaq BUFFER1, BO // first buffer to BO
  1374. addq $12 * SIZE, BO
  1375. INIT2x12
  1376. movq K, %rax
  1377. sarq $3, %rax
  1378. je .L12_36
  1379. ALIGN_4
  1380. .L12_32:
  1381. KERNEL2x12_SUB
  1382. KERNEL2x12_SUB
  1383. KERNEL2x12_SUB
  1384. KERNEL2x12_SUB
  1385. KERNEL2x12_SUB
  1386. KERNEL2x12_SUB
  1387. KERNEL2x12_SUB
  1388. KERNEL2x12_SUB
  1389. dec %rax
  1390. jne .L12_32
  1391. ALIGN_4
  1392. .L12_36:
  1393. movq K, %rax
  1394. andq $7, %rax # if (k & 1)
  1395. je .L12_39
  1396. ALIGN_4
  1397. .L12_37:
  1398. KERNEL2x12_SUB
  1399. dec %rax
  1400. jne .L12_37
  1401. ALIGN_4
  1402. .L12_39:
  1403. SAVE2x12
  1404. ALIGN_4
  1405. .L12_40:
  1406. testq $1, M
  1407. jz .L12_100 // to next 3 lines of N
  1408. ALIGN_4
  1409. .L12_41:
  1410. leaq BUFFER1, BO // first buffer to BO
  1411. addq $12 * SIZE, BO
  1412. INIT1x12
  1413. movq K, %rax
  1414. sarq $3,%rax
  1415. je .L12_46
  1416. ALIGN_4
  1417. .L12_42:
  1418. KERNEL1x12_SUB
  1419. KERNEL1x12_SUB
  1420. KERNEL1x12_SUB
  1421. KERNEL1x12_SUB
  1422. KERNEL1x12_SUB
  1423. KERNEL1x12_SUB
  1424. KERNEL1x12_SUB
  1425. KERNEL1x12_SUB
  1426. dec %rax
  1427. jne .L12_42
  1428. ALIGN_4
  1429. .L12_46:
  1430. movq K, %rax
  1431. andq $7, %rax # if (k & 1)
  1432. je .L12_49
  1433. ALIGN_4
  1434. .L12_47:
  1435. KERNEL1x12_SUB
  1436. dec %rax
  1437. jne .L12_47
  1438. ALIGN_4
  1439. .L12_49:
  1440. SAVE1x12
  1441. ALIGN_4
  1442. .L12_100:
  1443. /**************************************************************************************************/
  1444. .L13_01:
  1445. // copy to sub buffer
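// Second 12-wide pass of this 24-column block: packs the upper 4 values per
// row of BO2 together with the 8 values of the third panel (BO3) into BUFFER1.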
  1446. movq K, %rax
  1447. salq $3,%rax // K * 8 ; read 8 values
  1448. movq B, BO2
  1449. leaq (B,%rax, SIZE), BO3 // next offset to BO2
  1450. leaq (BO3,%rax, SIZE), B // next offset to B
  1451. leaq BUFFER1, BO // first buffer to BO
  1452. movq K, %rax
  1453. ALIGN_4
  1454. .L13_02b:
  1455. vmovups 4 * SIZE(BO2), %ymm1
  1456. vmovups 0 * SIZE(BO3), %ymm2
  1457. vmovups 4 * SIZE(BO3), %ymm3
  1458. vmovups %ymm1, 0 * SIZE(BO)
  1459. vmovups %ymm2, 4 * SIZE(BO)
  1460. vmovups %ymm3, 8 * SIZE(BO)
  1461. addq $ 8*SIZE,BO2
  1462. addq $ 8*SIZE,BO3
  1463. addq $ 12*SIZE,BO
  1464. decq %rax
  1465. jnz .L13_02b
  1466. .L13_10:
  1467. movq C, CO1
  1468. leaq (C, LDC, 8), C
  1469. leaq (C, LDC, 4), C // c += 12 * ldc
  1470. movq A, AO // aoffset = a
  1471. addq $16 * SIZE, AO
  1472. movq M, I
  1473. sarq $2, I // i = m / 4
  1474. je .L13_20
  1475. ALIGN_4
  1476. .L13_11:
  1477. leaq BUFFER1, BO // first buffer to BO
  1478. addq $12 * SIZE, BO
  1479. movq K, %rax
  1480. sarq $3, %rax // K / 8
  1481. cmpq $2, %rax
  1482. jl .L13_13
  1483. KERNEL4x12_I
  1484. KERNEL4x12_M2
  1485. KERNEL4x12_M1
  1486. KERNEL4x12_M2
  1487. KERNEL4x12_M1
  1488. KERNEL4x12_M2
  1489. KERNEL4x12_M1
  1490. KERNEL4x12_M2
  1491. subq $2, %rax
  1492. je .L13_12a
  1493. ALIGN_5
  1494. .L13_12:
  1495. KERNEL4x12_M1
  1496. KERNEL4x12_M2
  1497. KERNEL4x12_M1
  1498. KERNEL4x12_M2
  1499. KERNEL4x12_M1
  1500. KERNEL4x12_M2
  1501. KERNEL4x12_M1
  1502. KERNEL4x12_M2
  1503. dec %rax
  1504. jne .L13_12
  1505. .L13_12a:
  1506. KERNEL4x12_M1
  1507. KERNEL4x12_M2
  1508. KERNEL4x12_M1
  1509. KERNEL4x12_M2
  1510. KERNEL4x12_M1
  1511. KERNEL4x12_M2
  1512. KERNEL4x12_M1
  1513. KERNEL4x12_E
  1514. jmp .L13_16
  1515. .L13_13:
  1516. test $1, %rax
  1517. jz .L13_14
  1518. KERNEL4x12_I
  1519. KERNEL4x12_M2
  1520. KERNEL4x12_M1
  1521. KERNEL4x12_M2
  1522. KERNEL4x12_M1
  1523. KERNEL4x12_M2
  1524. KERNEL4x12_M1
  1525. KERNEL4x12_E
  1526. jmp .L13_16
  1527. .L13_14:
  1528. INIT4x12
  1529. .L13_16:
  1530. movq K, %rax
  1531. andq $7, %rax # if (k & 1)
  1532. je .L13_19
  1533. ALIGN_4
  1534. .L13_17:
  1535. KERNEL4x12_SUB
  1536. dec %rax
  1537. jne .L13_17
  1538. ALIGN_4
  1539. .L13_19:
  1540. SAVE4x12
  1541. decq I # i --
  1542. jne .L13_11
  1543. ALIGN_4
  1544. /**************************************************************************
  1545. * Rest of M
  1546. ***************************************************************************/
  1547. .L13_20:
  1548. // Test rest of M
  1549. testq $3, M
1550. jz .L13_100 // to next lines of N
  1551. .L13_30:
  1552. testq $2, M
  1553. jz .L13_40
  1554. ALIGN_4
  1555. .L13_31:
  1556. leaq BUFFER1, BO // first buffer to BO
  1557. addq $12 * SIZE, BO
  1558. INIT2x12
  1559. movq K, %rax
  1560. sarq $3, %rax
  1561. je .L13_36
  1562. ALIGN_4
  1563. .L13_32:
  1564. KERNEL2x12_SUB
  1565. KERNEL2x12_SUB
  1566. KERNEL2x12_SUB
  1567. KERNEL2x12_SUB
  1568. KERNEL2x12_SUB
  1569. KERNEL2x12_SUB
  1570. KERNEL2x12_SUB
  1571. KERNEL2x12_SUB
  1572. dec %rax
  1573. jne .L13_32
  1574. ALIGN_4
  1575. .L13_36:
  1576. movq K, %rax
  1577. andq $7, %rax # if (k & 1)
  1578. je .L13_39
  1579. ALIGN_4
  1580. .L13_37:
  1581. KERNEL2x12_SUB
  1582. dec %rax
  1583. jne .L13_37
  1584. ALIGN_4
  1585. .L13_39:
  1586. SAVE2x12
  1587. ALIGN_4
  1588. .L13_40:
  1589. testq $1, M
  1590. jz .L13_100 // to next 3 lines of N
  1591. ALIGN_4
  1592. .L13_41:
  1593. leaq BUFFER1, BO // first buffer to BO
  1594. addq $12 * SIZE, BO
  1595. INIT1x12
  1596. movq K, %rax
  1597. sarq $3,%rax
  1598. je .L13_46
  1599. ALIGN_4
  1600. .L13_42:
  1601. KERNEL1x12_SUB
  1602. KERNEL1x12_SUB
  1603. KERNEL1x12_SUB
  1604. KERNEL1x12_SUB
  1605. KERNEL1x12_SUB
  1606. KERNEL1x12_SUB
  1607. KERNEL1x12_SUB
  1608. KERNEL1x12_SUB
  1609. dec %rax
  1610. jne .L13_42
  1611. ALIGN_4
  1612. .L13_46:
  1613. movq K, %rax
  1614. andq $7, %rax # if (k & 1)
  1615. je .L13_49
  1616. ALIGN_4
  1617. .L13_47:
  1618. KERNEL1x12_SUB
  1619. dec %rax
  1620. jne .L13_47
  1621. ALIGN_4
  1622. .L13_49:
  1623. SAVE1x12
  1624. ALIGN_4
  1625. .L13_100:
  1626. decq J // j --
  1627. jg .L12_01
  1628. /**************************************************************************************************/
  1629. .L8_0:
1630. cmpq $ 0, Nmod12 // N % 24 == 0
  1631. je .L999
  1632. movq Nmod12, J
  1633. sarq $3, J // j = j / 8
  1634. je .L4_0
  1635. .L8_10:
  1636. movq C, CO1
1637. leaq (C, LDC, 8), C // c += 8 * ldc
  1638. movq A, AO // aoffset = a
  1639. addq $16 * SIZE, AO
  1640. movq M, I
  1641. sarq $2, I // i = m / 4
  1642. je .L8_20
  1643. ALIGN_4
  1644. .L8_11:
  1645. movq B, BO
  1646. addq $12 * SIZE, BO
  1647. movq K, %rax
  1648. sarq $3, %rax // K / 8
  1649. cmpq $2, %rax
  1650. jl .L8_13
  1651. KERNEL4x8_I
  1652. KERNEL4x8_M2
  1653. KERNEL4x8_M1
  1654. KERNEL4x8_M2
  1655. KERNEL4x8_M1
  1656. KERNEL4x8_M2
  1657. KERNEL4x8_M1
  1658. KERNEL4x8_M2
  1659. subq $2, %rax
  1660. je .L8_12a
  1661. ALIGN_5
  1662. .L8_12:
  1663. KERNEL4x8_M1
  1664. KERNEL4x8_M2
  1665. KERNEL4x8_M1
  1666. KERNEL4x8_M2
  1667. KERNEL4x8_M1
  1668. KERNEL4x8_M2
  1669. KERNEL4x8_M1
  1670. KERNEL4x8_M2
  1671. dec %rax
  1672. jne .L8_12
  1673. .L8_12a:
  1674. KERNEL4x8_M1
  1675. KERNEL4x8_M2
  1676. KERNEL4x8_M1
  1677. KERNEL4x8_M2
  1678. KERNEL4x8_M1
  1679. KERNEL4x8_M2
  1680. KERNEL4x8_M1
  1681. KERNEL4x8_E
  1682. jmp .L8_16
  1683. .L8_13:
  1684. test $1, %rax
  1685. jz .L8_14
  1686. KERNEL4x8_I
  1687. KERNEL4x8_M2
  1688. KERNEL4x8_M1
  1689. KERNEL4x8_M2
  1690. KERNEL4x8_M1
  1691. KERNEL4x8_M2
  1692. KERNEL4x8_M1
  1693. KERNEL4x8_E
  1694. jmp .L8_16
  1695. .L8_14:
  1696. INIT4x8
  1697. .L8_16:
  1698. movq K, %rax
  1699. andq $7, %rax # if (k & 1)
  1700. je .L8_19
  1701. ALIGN_4
  1702. .L8_17:
  1703. KERNEL4x8_SUB
  1704. dec %rax
  1705. jne .L8_17
  1706. ALIGN_4
  1707. .L8_19:
  1708. SAVE4x8
  1709. decq I # i --
  1710. jg .L8_11
  1711. ALIGN_4
  1712. /**************************************************************************
  1713. * Rest of M
  1714. ***************************************************************************/
  1715. .L8_20:
  1716. // Test rest of M
  1717. testq $3, M
1718. jz .L8_100 // to next lines of N
  1719. .L8_30:
  1720. testq $2, M
  1721. jz .L8_40
  1722. ALIGN_4
  1723. .L8_31:
  1724. movq B, BO // first buffer to BO
  1725. addq $12 * SIZE, BO
  1726. INIT2x8
  1727. movq K, %rax
  1728. sarq $3, %rax
  1729. je .L8_36
  1730. ALIGN_4
  1731. .L8_32:
  1732. KERNEL2x8_SUB
  1733. KERNEL2x8_SUB
  1734. KERNEL2x8_SUB
  1735. KERNEL2x8_SUB
  1736. KERNEL2x8_SUB
  1737. KERNEL2x8_SUB
  1738. KERNEL2x8_SUB
  1739. KERNEL2x8_SUB
  1740. dec %rax
  1741. jne .L8_32
  1742. ALIGN_4
  1743. .L8_36:
  1744. movq K, %rax
  1745. andq $7, %rax # if (k & 1)
  1746. je .L8_39
  1747. ALIGN_4
  1748. .L8_37:
  1749. KERNEL2x8_SUB
  1750. dec %rax
  1751. jne .L8_37
  1752. .L8_39:
  1753. SAVE2x8
  1754. .L8_40:
  1755. testq $1, M
  1756. jz .L8_100 // to next 3 lines of N
  1757. ALIGN_4
  1758. .L8_41:
  1759. movq B, BO // first buffer to BO
  1760. addq $12 * SIZE, BO
  1761. INIT1x8
  1762. movq K, %rax
  1763. sarq $3,%rax
  1764. je .L8_46
  1765. ALIGN_4
  1766. .L8_42:
  1767. KERNEL1x8_SUB
  1768. KERNEL1x8_SUB
  1769. KERNEL1x8_SUB
  1770. KERNEL1x8_SUB
  1771. KERNEL1x8_SUB
  1772. KERNEL1x8_SUB
  1773. KERNEL1x8_SUB
  1774. KERNEL1x8_SUB
  1775. dec %rax
  1776. jne .L8_42
  1777. ALIGN_4
  1778. .L8_46:
  1779. movq K, %rax
  1780. andq $7, %rax # if (k & 1)
  1781. je .L8_49
  1782. ALIGN_4
  1783. .L8_47:
  1784. KERNEL1x8_SUB
  1785. dec %rax
  1786. jne .L8_47
  1787. ALIGN_4
  1788. .L8_49:
  1789. SAVE1x8
  1790. ALIGN_4
  1791. .L8_100:
  1792. movq K, %rax
  1793. salq $3, %rax // * 8
  1794. leaq (B , %rax, SIZE), B
  1795. decq J // j --
  1796. jg .L8_10
  1797. /**************************************************************************************************/
  1798. .L4_0:
1799. cmpq $ 0, Nmod12 // N % 24 == 0
  1800. je .L999
  1801. movq Nmod12, J
1802. testq $4, J // test N & 4
  1803. je .L2_0
  1804. .L4_10:
  1805. movq C, CO1
  1806. leaq (C, LDC, 4), C // c += 4 * ldc
  1807. movq A, AO // aoffset = a
  1808. addq $16 * SIZE, AO
  1809. movq M, I
  1810. sarq $2, I // i = m / 4
  1811. je .L4_20
  1812. ALIGN_4
  1813. .L4_11:
  1814. movq B, BO
  1815. addq $12 * SIZE, BO
  1816. movq K, %rax
  1817. sarq $3, %rax // K / 8
  1818. cmpq $2, %rax
  1819. jl .L4_13
  1820. KERNEL4x4_I
  1821. KERNEL4x4_M2
  1822. KERNEL4x4_M1
  1823. KERNEL4x4_M2
  1824. KERNEL4x4_M1
  1825. KERNEL4x4_M2
  1826. KERNEL4x4_M1
  1827. KERNEL4x4_M2
  1828. subq $2, %rax
  1829. je .L4_12a
  1830. ALIGN_5
  1831. .L4_12:
  1832. KERNEL4x4_M1
  1833. KERNEL4x4_M2
  1834. KERNEL4x4_M1
  1835. KERNEL4x4_M2
  1836. KERNEL4x4_M1
  1837. KERNEL4x4_M2
  1838. KERNEL4x4_M1
  1839. KERNEL4x4_M2
  1840. dec %rax
  1841. jne .L4_12
  1842. .L4_12a:
  1843. KERNEL4x4_M1
  1844. KERNEL4x4_M2
  1845. KERNEL4x4_M1
  1846. KERNEL4x4_M2
  1847. KERNEL4x4_M1
  1848. KERNEL4x4_M2
  1849. KERNEL4x4_M1
  1850. KERNEL4x4_E
  1851. jmp .L4_16
  1852. .L4_13:
  1853. test $1, %rax
  1854. jz .L4_14
  1855. KERNEL4x4_I
  1856. KERNEL4x4_M2
  1857. KERNEL4x4_M1
  1858. KERNEL4x4_M2
  1859. KERNEL4x4_M1
  1860. KERNEL4x4_M2
  1861. KERNEL4x4_M1
  1862. KERNEL4x4_E
  1863. jmp .L4_16
  1864. .L4_14:
  1865. INIT4x4
  1866. .L4_16:
  1867. movq K, %rax
  1868. andq $7, %rax # if (k & 1)
  1869. je .L4_19
  1870. ALIGN_4
  1871. .L4_17:
  1872. KERNEL4x4_SUB
  1873. dec %rax
  1874. jne .L4_17
  1875. ALIGN_4
  1876. .L4_19:
  1877. SAVE4x4
  1878. decq I # i --
  1879. jg .L4_11
  1880. ALIGN_4
  1881. /**************************************************************************
  1882. * Rest of M
  1883. ***************************************************************************/
  1884. .L4_20:
  1885. // Test rest of M
  1886. testq $3, M
1887. jz .L4_100 // to next lines of N
  1888. .L4_30:
  1889. testq $2, M
  1890. jz .L4_40
  1891. ALIGN_4
  1892. .L4_31:
  1893. movq B, BO // first buffer to BO
  1894. addq $12 * SIZE, BO
  1895. INIT2x4
  1896. movq K, %rax
  1897. sarq $3, %rax
  1898. je .L4_36
  1899. ALIGN_4
  1900. .L4_32:
  1901. KERNEL2x4_SUB
  1902. KERNEL2x4_SUB
  1903. KERNEL2x4_SUB
  1904. KERNEL2x4_SUB
  1905. KERNEL2x4_SUB
  1906. KERNEL2x4_SUB
  1907. KERNEL2x4_SUB
  1908. KERNEL2x4_SUB
  1909. dec %rax
  1910. jne .L4_32
  1911. ALIGN_4
  1912. .L4_36:
  1913. movq K, %rax
  1914. andq $7, %rax # if (k & 1)
  1915. je .L4_39
  1916. ALIGN_4
  1917. .L4_37:
  1918. KERNEL2x4_SUB
  1919. dec %rax
  1920. jne .L4_37
  1921. .L4_39:
  1922. SAVE2x4
  1923. .L4_40:
  1924. testq $1, M
  1925. jz .L4_100 // to next 3 lines of N
  1926. ALIGN_4
  1927. .L4_41:
  1928. movq B, BO // first buffer to BO
  1929. addq $12 * SIZE, BO
  1930. INIT1x4
  1931. movq K, %rax
  1932. sarq $3,%rax
  1933. je .L4_46
  1934. ALIGN_4
  1935. .L4_42:
  1936. KERNEL1x4_SUB
  1937. KERNEL1x4_SUB
  1938. KERNEL1x4_SUB
  1939. KERNEL1x4_SUB
  1940. KERNEL1x4_SUB
  1941. KERNEL1x4_SUB
  1942. KERNEL1x4_SUB
  1943. KERNEL1x4_SUB
  1944. dec %rax
  1945. jne .L4_42
  1946. ALIGN_4
  1947. .L4_46:
  1948. movq K, %rax
  1949. andq $7, %rax # if (k & 1)
  1950. je .L4_49
  1951. ALIGN_4
  1952. .L4_47:
  1953. KERNEL1x4_SUB
  1954. dec %rax
  1955. jne .L4_47
  1956. ALIGN_4
  1957. .L4_49:
  1958. SAVE1x4
  1959. ALIGN_4
  1960. .L4_100:
  1961. movq K, %rax
  1962. salq $2, %rax // * 4
  1963. leaq (B , %rax, SIZE), B
  1964. /***************************************************************************************************************/
  1965. .L2_0:
  1966. movq Nmod12, J
  1967. testq $2, J
  1968. je .L1_0
  1969. .L2_10:
  1970. movq C, CO1
  1971. leaq (C, LDC, 2), C // c += 2 * ldc
  1972. movq A, AO // aoffset = a
  1973. addq $16 * SIZE, AO
  1974. movq M, I
  1975. sarq $2, I // i = m / 4
  1976. je .L2_20
  1977. ALIGN_4
  1978. .L2_11:
  1979. movq B, BO
  1980. addq $12 * SIZE, BO
  1981. INIT4x2
  1982. movq K, %rax
  1983. sarq $3, %rax // K / 8
  1984. je .L2_16
  1985. ALIGN_5
  1986. .L2_12:
  1987. KERNEL4x2_SUB
  1988. KERNEL4x2_SUB
  1989. KERNEL4x2_SUB
  1990. KERNEL4x2_SUB
  1991. KERNEL4x2_SUB
  1992. KERNEL4x2_SUB
  1993. KERNEL4x2_SUB
  1994. KERNEL4x2_SUB
  1995. dec %rax
  1996. jne .L2_12
  1997. .L2_16:
  1998. movq K, %rax
  1999. andq $7, %rax # if (k & 1)
  2000. je .L2_19
  2001. ALIGN_4
  2002. .L2_17:
  2003. KERNEL4x2_SUB
  2004. dec %rax
  2005. jne .L2_17
  2006. ALIGN_4
  2007. .L2_19:
  2008. SAVE4x2
  2009. decq I # i --
  2010. jg .L2_11
  2011. ALIGN_4
  2012. /**************************************************************************
  2013. * Rest of M
  2014. ***************************************************************************/
  2015. .L2_20:
  2016. // Test rest of M
  2017. testq $3, M
2018. jz .L2_100 // to next lines of N
  2019. .L2_30:
  2020. testq $2, M
  2021. jz .L2_40
  2022. ALIGN_4
  2023. .L2_31:
  2024. movq B, BO // first buffer to BO
  2025. addq $12 * SIZE, BO
  2026. INIT2x2
  2027. movq K, %rax
  2028. sarq $3, %rax
  2029. je .L2_36
  2030. ALIGN_4
  2031. .L2_32:
  2032. KERNEL2x2_SUB
  2033. KERNEL2x2_SUB
  2034. KERNEL2x2_SUB
  2035. KERNEL2x2_SUB
  2036. KERNEL2x2_SUB
  2037. KERNEL2x2_SUB
  2038. KERNEL2x2_SUB
  2039. KERNEL2x2_SUB
  2040. dec %rax
  2041. jne .L2_32
  2042. .L2_36:
  2043. movq K, %rax
  2044. andq $7, %rax # if (k & 1)
  2045. je .L2_39
  2046. ALIGN_4
  2047. .L2_37:
  2048. KERNEL2x2_SUB
  2049. dec %rax
  2050. jne .L2_37
  2051. .L2_39:
  2052. SAVE2x2
  2053. .L2_40:
  2054. testq $1, M
  2055. jz .L2_100 // to next 3 lines of N
  2056. .L2_41:
  2057. movq B, BO // first buffer to BO
  2058. addq $12 * SIZE, BO
  2059. INIT1x2
  2060. movq K, %rax
  2061. sarq $3,%rax
  2062. je .L2_46
  2063. ALIGN_4
  2064. .L2_42:
  2065. KERNEL1x2_SUB
  2066. KERNEL1x2_SUB
  2067. KERNEL1x2_SUB
  2068. KERNEL1x2_SUB
  2069. KERNEL1x2_SUB
  2070. KERNEL1x2_SUB
  2071. KERNEL1x2_SUB
  2072. KERNEL1x2_SUB
  2073. dec %rax
  2074. jne .L2_42
  2075. .L2_46:
  2076. movq K, %rax
  2077. andq $7, %rax # if (k & 1)
  2078. je .L2_49
  2079. ALIGN_4
  2080. .L2_47:
  2081. KERNEL1x2_SUB
  2082. dec %rax
  2083. jne .L2_47
  2084. .L2_49:
  2085. SAVE1x2
  2086. .L2_100:
  2087. movq K, %rax
  2088. salq $1, %rax // * 2
  2089. leaq (B , %rax, SIZE), B
  2090. /***************************************************************************************************************/
  2091. .L1_0:
  2092. movq Nmod12, J
  2093. testq $1, J
  2094. je .L999
  2095. .L1_10:
  2096. movq C, CO1
  2097. leaq (C, LDC, 1), C // c += 1 * ldc
  2098. movq A, AO // aoffset = a
  2099. addq $16 * SIZE, AO
  2100. movq M, I
  2101. sarq $2, I // i = m / 4
  2102. je .L1_20
  2103. ALIGN_4
  2104. .L1_11:
  2105. movq B, BO
  2106. addq $12 * SIZE, BO
  2107. INIT4x1
  2108. movq K, %rax
  2109. sarq $3, %rax // K / 8
  2110. je .L1_16
  2111. ALIGN_5
  2112. .L1_12:
  2113. KERNEL4x1
  2114. dec %rax
  2115. jne .L1_12
  2116. .L1_16:
  2117. movq K, %rax
  2118. andq $7, %rax # if (k & 1)
  2119. je .L1_19
  2120. ALIGN_4
  2121. .L1_17:
  2122. KERNEL4x1_SUB
  2123. dec %rax
  2124. jne .L1_17
  2125. ALIGN_4
  2126. .L1_19:
  2127. SAVE4x1
  2128. decq I # i --
  2129. jg .L1_11
  2130. /**************************************************************************
  2131. * Rest of M
  2132. ***************************************************************************/
  2133. .L1_20:
  2134. // Test rest of M
  2135. testq $3, M
  2136. jz .L1_100
  2137. .L1_30:
  2138. testq $2, M
  2139. jz .L1_40
  2140. ALIGN_4
  2141. .L1_31:
  2142. movq B, BO // first buffer to BO
  2143. addq $12 * SIZE, BO
  2144. INIT2x1
  2145. movq K, %rax
  2146. sarq $3, %rax
  2147. je .L1_36
  2148. ALIGN_4
  2149. .L1_32:
  2150. KERNEL2x1_SUB
  2151. KERNEL2x1_SUB
  2152. KERNEL2x1_SUB
  2153. KERNEL2x1_SUB
  2154. KERNEL2x1_SUB
  2155. KERNEL2x1_SUB
  2156. KERNEL2x1_SUB
  2157. KERNEL2x1_SUB
  2158. dec %rax
  2159. jne .L1_32
  2160. .L1_36:
  2161. movq K, %rax
  2162. andq $7, %rax # if (k & 1)
  2163. je .L1_39
  2164. ALIGN_4
  2165. .L1_37:
  2166. KERNEL2x1_SUB
  2167. dec %rax
  2168. jne .L1_37
  2169. .L1_39:
  2170. SAVE2x1
  2171. .L1_40:
  2172. testq $1, M
  2173. jz .L1_100 // to next 3 lines of N
  2174. .L1_41:
  2175. movq B, BO // first buffer to BO
  2176. addq $12 * SIZE, BO
  2177. INIT1x1
  2178. movq K, %rax
  2179. sarq $3,%rax
  2180. je .L1_46
  2181. ALIGN_4
  2182. .L1_42:
  2183. KERNEL1x1_SUB
  2184. KERNEL1x1_SUB
  2185. KERNEL1x1_SUB
  2186. KERNEL1x1_SUB
  2187. KERNEL1x1_SUB
  2188. KERNEL1x1_SUB
  2189. KERNEL1x1_SUB
  2190. KERNEL1x1_SUB
  2191. dec %rax
  2192. jne .L1_42
  2193. .L1_46:
  2194. movq K, %rax
  2195. andq $7, %rax # if (k & 1)
  2196. je .L1_49
  2197. ALIGN_4
  2198. .L1_47:
  2199. KERNEL1x1_SUB
  2200. dec %rax
  2201. jne .L1_47
  2202. .L1_49:
  2203. SAVE1x1
  2204. .L1_100:
  2205. .L999:
  2206. vzeroupper
  2207. movq SP, %rsp
  2208. movq (%rsp), %rbx
  2209. movq 8(%rsp), %rbp
  2210. movq 16(%rsp), %r12
  2211. movq 24(%rsp), %r13
  2212. movq 32(%rsp), %r14
  2213. movq 40(%rsp), %r15
  2214. #ifdef WINDOWS_ABI
  2215. movq 48(%rsp), %rdi
  2216. movq 56(%rsp), %rsi
  2217. vmovups 64(%rsp), %xmm6
  2218. vmovups 80(%rsp), %xmm7
  2219. vmovups 96(%rsp), %xmm8
  2220. vmovups 112(%rsp), %xmm9
  2221. vmovups 128(%rsp), %xmm10
  2222. vmovups 144(%rsp), %xmm11
  2223. vmovups 160(%rsp), %xmm12
  2224. vmovups 176(%rsp), %xmm13
  2225. vmovups 192(%rsp), %xmm14
  2226. vmovups 208(%rsp), %xmm15
  2227. #endif
  2228. addq $STACKSIZE, %rsp
  2229. ret
  2230. EPILOGUE
  2231. #else
  2232. /*************************************************************************************
  2233. * TRMM Kernel
  2234. *************************************************************************************/
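// The TRMM build reuses the same micro-kernels; KK/KKK limit each tile's
// k-range and offset AO/BO so that only the relevant (triangular) part of the
// operand is multiplied, and the SAVE macros skip the C load/add. N is blocked
// by 8, 4, 2 and 1 here; there is no 12-wide packed path.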
  2235. PROLOGUE
  2236. PROFCODE
  2237. subq $STACKSIZE, %rsp
  2238. movq %rbx, (%rsp)
  2239. movq %rbp, 8(%rsp)
  2240. movq %r12, 16(%rsp)
  2241. movq %r13, 24(%rsp)
  2242. movq %r14, 32(%rsp)
  2243. movq %r15, 40(%rsp)
  2244. vzeroupper
  2245. #ifdef WINDOWS_ABI
  2246. movq %rdi, 48(%rsp)
  2247. movq %rsi, 56(%rsp)
  2248. vmovups %xmm6, 64(%rsp)
  2249. vmovups %xmm7, 80(%rsp)
  2250. vmovups %xmm8, 96(%rsp)
  2251. vmovups %xmm9, 112(%rsp)
  2252. vmovups %xmm10, 128(%rsp)
  2253. vmovups %xmm11, 144(%rsp)
  2254. vmovups %xmm12, 160(%rsp)
  2255. vmovups %xmm13, 176(%rsp)
  2256. vmovups %xmm14, 192(%rsp)
  2257. vmovups %xmm15, 208(%rsp)
  2258. movq ARG1, OLD_M
  2259. movq ARG2, OLD_N
  2260. movq ARG3, OLD_K
  2261. movq OLD_A, A
  2262. movq OLD_B, B
  2263. movq OLD_C, C
  2264. movq OLD_LDC, LDC
  2265. #ifdef TRMMKERNEL
  2266. vmovsd OLD_OFFSET, %xmm12
  2267. #endif
  2268. vmovups %xmm3, %xmm0
  2269. #else
  2270. movq STACKSIZE + 8(%rsp), LDC
  2271. #ifdef TRMMKERNEL
  2272. vmovsd STACKSIZE + 16(%rsp), %xmm12
  2273. #endif
  2274. #endif
  2275. movq %rsp, SP # save old stack
  2276. subq $128 + L_BUFFER_SIZE, %rsp
  2277. andq $-4096, %rsp # align stack
  2278. STACK_TOUCH
  2279. cmpq $ 0, OLD_M
  2280. je .L999
  2281. cmpq $ 0, OLD_N
  2282. je .L999
  2283. cmpq $ 0, OLD_K
  2284. je .L999
  2285. movq OLD_M, M
  2286. movq OLD_N, N
  2287. movq OLD_K, K
  2288. vmovsd %xmm0, ALPHA
  2289. salq $BASE_SHIFT, LDC
  2290. movq N, %rax
  2291. xorq %rdx, %rdx
  2292. movq $8, %rdi
  2293. divq %rdi // N / 8
  2294. movq %rax, Ndiv12 // N / 8
  2295. movq %rdx, Nmod12 // N % 8
  2296. #ifdef TRMMKERNEL
  2297. vmovsd %xmm12, OFFSET
  2298. vmovsd %xmm12, KK
  2299. #ifndef LEFT
  2300. negq KK
  2301. #endif
  2302. #endif
  2303. /*************************************************************************************************/
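// KK tracks the diagonal offset for TRMM: with LEFT it is reloaded from OFFSET
// at the start of each column block and advanced by the tile height (4/2/1)
// after every saved tile; without LEFT it is negated once above and advanced
// by the block width (8/4/2/1) at the end of each column block.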
  2304. .L8_0:
  2305. movq Ndiv12, J
  2306. cmpq $ 0, J
  2307. je .L4_0
  2308. ALIGN_4
  2309. .L8_10:
  2310. movq C, CO1
  2311. leaq (C, LDC, 8), C // c += 8 * ldc
  2312. #if defined(TRMMKERNEL) && defined(LEFT)
  2313. movq OFFSET, %rax
  2314. movq %rax, KK
  2315. #endif
  2316. movq A, AO // aoffset = a
  2317. addq $16 * SIZE, AO
  2318. movq M, I
  2319. sarq $2, I // i = m / 4
  2320. je .L8_20
  2321. ALIGN_4
  2322. .L8_11:
  2323. #if !defined(TRMMKERNEL) || \
  2324. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2325. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2326. movq B, BO
  2327. addq $12 * SIZE, BO
  2328. #else
  2329. movq B, BO
  2330. addq $12 * SIZE, BO
  2331. movq KK, %rax
  2332. salq $3, %rax // rax * SIZE
  2333. leaq (BO,%rax,8), BO // add number of values in B
  2334. leaq (AO,%rax,4), AO // add number of values in A
  2335. #endif
  2336. #ifndef TRMMKERNEL
  2337. movq K, %rax
  2338. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2339. movq K, %rax
  2340. subq KK, %rax
  2341. movq %rax, KKK
  2342. #else
  2343. movq KK, %rax
  2344. #ifdef LEFT
  2345. addq $4, %rax // number of values in AO
  2346. #else
  2347. addq $8, %rax // number of values in BO
  2348. #endif
  2349. movq %rax, KKK
  2350. #endif
  2351. sarq $3, %rax // K / 8
  2352. cmpq $2, %rax
  2353. jl .L8_13
  2354. KERNEL4x8_I
  2355. KERNEL4x8_M2
  2356. KERNEL4x8_M1
  2357. KERNEL4x8_M2
  2358. KERNEL4x8_M1
  2359. KERNEL4x8_M2
  2360. KERNEL4x8_M1
  2361. KERNEL4x8_M2
  2362. subq $2, %rax
  2363. je .L8_12a
  2364. ALIGN_5
  2365. .L8_12:
  2366. KERNEL4x8_M1
  2367. KERNEL4x8_M2
  2368. KERNEL4x8_M1
  2369. KERNEL4x8_M2
  2370. KERNEL4x8_M1
  2371. KERNEL4x8_M2
  2372. KERNEL4x8_M1
  2373. KERNEL4x8_M2
  2374. dec %rax
  2375. jne .L8_12
  2376. .L8_12a:
  2377. KERNEL4x8_M1
  2378. KERNEL4x8_M2
  2379. KERNEL4x8_M1
  2380. KERNEL4x8_M2
  2381. KERNEL4x8_M1
  2382. KERNEL4x8_M2
  2383. KERNEL4x8_M1
  2384. KERNEL4x8_E
  2385. jmp .L8_16
  2386. .L8_13:
  2387. test $1, %rax
  2388. jz .L8_14
  2389. KERNEL4x8_I
  2390. KERNEL4x8_M2
  2391. KERNEL4x8_M1
  2392. KERNEL4x8_M2
  2393. KERNEL4x8_M1
  2394. KERNEL4x8_M2
  2395. KERNEL4x8_M1
  2396. KERNEL4x8_E
  2397. jmp .L8_16
  2398. .L8_14:
  2399. INIT4x8
  2400. .L8_16:
  2401. movq KKK, %rax
  2402. andq $7, %rax # if (k & 1)
  2403. je .L8_19
  2404. ALIGN_4
  2405. .L8_17:
  2406. KERNEL4x8_SUB
  2407. dec %rax
  2408. jne .L8_17
  2409. ALIGN_4
  2410. .L8_19:
  2411. SAVE4x8
  2412. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2413. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2414. movq K, %rax
  2415. subq KKK, %rax
2416. salq $3, %rax // rax * SIZE
  2417. leaq (BO, %rax, 8), BO // number of values in B
  2418. leaq (AO, %rax, 4), AO // number of values in A
  2419. #endif
  2420. #if defined(TRMMKERNEL) && defined(LEFT)
  2421. addq $4, KK // number of values in A
  2422. #endif
  2423. decq I # i --
  2424. jg .L8_11
  2425. ALIGN_4
  2426. /**************************************************************************
  2427. * Rest of M
  2428. ***************************************************************************/
  2429. .L8_20:
  2430. // Test rest of M
  2431. testq $3, M
2432. jz .L8_100 // to next lines of N
  2433. .L8_30:
  2434. testq $2, M
  2435. jz .L8_40
  2436. ALIGN_4
  2437. .L8_31:
  2438. #if !defined(TRMMKERNEL) || \
  2439. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2440. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2441. movq B, BO
  2442. addq $12 * SIZE, BO
  2443. #else
  2444. movq B, BO
  2445. addq $12 * SIZE, BO
  2446. movq KK, %rax
  2447. salq $3, %rax // rax * SIZE
  2448. leaq (BO,%rax,8), BO // add number of values in B
  2449. leaq (AO,%rax,2), AO // add number of values in A
  2450. #endif
  2451. #ifndef TRMMKERNEL
  2452. movq K, %rax
  2453. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2454. movq K, %rax
  2455. subq KK, %rax
  2456. movq %rax, KKK
  2457. #else
  2458. movq KK, %rax
  2459. #ifdef LEFT
  2460. addq $2, %rax // number of values in AO
  2461. #else
  2462. addq $8, %rax // number of values in BO
  2463. #endif
  2464. movq %rax, KKK
  2465. #endif
  2466. INIT2x8
  2467. sarq $3, %rax
  2468. je .L8_36
  2469. ALIGN_4
  2470. .L8_32:
  2471. KERNEL2x8_SUB
  2472. KERNEL2x8_SUB
  2473. KERNEL2x8_SUB
  2474. KERNEL2x8_SUB
  2475. KERNEL2x8_SUB
  2476. KERNEL2x8_SUB
  2477. KERNEL2x8_SUB
  2478. KERNEL2x8_SUB
  2479. dec %rax
  2480. jne .L8_32
  2481. ALIGN_4
  2482. .L8_36:
  2483. movq KKK, %rax
  2484. andq $7, %rax # if (k & 1)
  2485. je .L8_39
  2486. ALIGN_4
  2487. .L8_37:
  2488. KERNEL2x8_SUB
  2489. dec %rax
  2490. jne .L8_37
  2491. .L8_39:
  2492. SAVE2x8
  2493. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2494. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2495. movq K, %rax
  2496. subq KKK, %rax
2497. salq $3, %rax // rax * SIZE
  2498. leaq (BO, %rax, 8), BO // number of values in B
  2499. leaq (AO, %rax, 2), AO // number of values in A
  2500. #endif
  2501. #if defined(TRMMKERNEL) && defined(LEFT)
  2502. addq $2, KK // number of values in A
  2503. #endif
  2504. .L8_40:
  2505. testq $1, M
  2506. jz .L8_100 // to next 3 lines of N
  2507. ALIGN_4
  2508. .L8_41:
  2509. #if !defined(TRMMKERNEL) || \
  2510. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2511. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2512. movq B, BO
  2513. addq $12 * SIZE, BO
  2514. #else
  2515. movq B, BO
  2516. addq $12 * SIZE, BO
  2517. movq KK, %rax
  2518. salq $3, %rax // rax * SIZE
  2519. leaq (BO,%rax,8), BO // add number of values in B
  2520. leaq (AO,%rax,1), AO // add number of values in A
  2521. #endif
  2522. #ifndef TRMMKERNEL
  2523. movq K, %rax
  2524. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2525. movq K, %rax
  2526. subq KK, %rax
  2527. movq %rax, KKK
  2528. #else
  2529. movq KK, %rax
  2530. #ifdef LEFT
  2531. addq $1, %rax // number of values in AO
  2532. #else
  2533. addq $8, %rax // number of values in BO
  2534. #endif
  2535. movq %rax, KKK
  2536. #endif
  2537. INIT1x8
  2538. sarq $3,%rax
  2539. je .L8_46
  2540. ALIGN_4
  2541. .L8_42:
  2542. KERNEL1x8_SUB
  2543. KERNEL1x8_SUB
  2544. KERNEL1x8_SUB
  2545. KERNEL1x8_SUB
  2546. KERNEL1x8_SUB
  2547. KERNEL1x8_SUB
  2548. KERNEL1x8_SUB
  2549. KERNEL1x8_SUB
  2550. dec %rax
  2551. jne .L8_42
  2552. ALIGN_4
  2553. .L8_46:
  2554. movq KKK, %rax
  2555. andq $7, %rax # if (k & 1)
  2556. je .L8_49
  2557. ALIGN_4
  2558. .L8_47:
  2559. KERNEL1x8_SUB
  2560. dec %rax
  2561. jne .L8_47
  2562. ALIGN_4
  2563. .L8_49:
  2564. SAVE1x8
  2565. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2566. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2567. movq K, %rax
  2568. subq KKK, %rax
2569. salq $3, %rax // rax * SIZE
  2570. leaq (BO, %rax, 8), BO // number of values in B
  2571. leaq (AO, %rax, 1), AO // number of values in A
  2572. #endif
  2573. #if defined(TRMMKERNEL) && defined(LEFT)
  2574. addq $1, KK // number of values in A
  2575. #endif
  2576. .L8_100:
  2577. #if defined(TRMMKERNEL) && !defined(LEFT)
  2578. addq $8, KK // number of values in B
  2579. #endif
  2580. decq J // j --
  2581. jg .L8_10
  2582. /*************************************************************************************************/
  2583. .L4_0:
  2584. movq Nmod12, J
  2585. testq $4, J
  2586. je .L2_0
  2587. ALIGN_4
  2588. .L4_10:
  2589. movq C, CO1
  2590. leaq (C, LDC, 4), C // c += 4 * ldc
  2591. #if defined(TRMMKERNEL) && defined(LEFT)
  2592. movq OFFSET, %rax
  2593. movq %rax, KK
  2594. #endif
  2595. movq A, AO // aoffset = a
  2596. addq $16 * SIZE, AO
  2597. movq M, I
  2598. sarq $2, I // i = m / 4
  2599. je .L4_20
  2600. ALIGN_4
  2601. .L4_11:
  2602. #if !defined(TRMMKERNEL) || \
  2603. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2604. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2605. movq B, BO
  2606. addq $12 * SIZE, BO
  2607. #else
  2608. movq B, BO
  2609. addq $12 * SIZE, BO
  2610. movq KK, %rax
  2611. salq $3, %rax // rax * SIZE
  2612. leaq (BO,%rax,4), BO // add number of values in B
  2613. leaq (AO,%rax,4), AO // add number of values in A
  2614. #endif
  2615. #ifndef TRMMKERNEL
  2616. movq K, %rax
  2617. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2618. movq K, %rax
  2619. subq KK, %rax
  2620. movq %rax, KKK
  2621. #else
  2622. movq KK, %rax
  2623. #ifdef LEFT
  2624. addq $4, %rax // number of values in AO
  2625. #else
  2626. addq $4, %rax // number of values in BO
  2627. #endif
  2628. movq %rax, KKK
  2629. #endif
  2630. sarq $3, %rax // K / 8
  2631. cmpq $2, %rax
  2632. jl .L4_13
  2633. KERNEL4x4_I
  2634. KERNEL4x4_M2
  2635. KERNEL4x4_M1
  2636. KERNEL4x4_M2
  2637. KERNEL4x4_M1
  2638. KERNEL4x4_M2
  2639. KERNEL4x4_M1
  2640. KERNEL4x4_M2
  2641. subq $2, %rax
  2642. je .L4_12a
  2643. ALIGN_5
  2644. .L4_12:
  2645. KERNEL4x4_M1
  2646. KERNEL4x4_M2
  2647. KERNEL4x4_M1
  2648. KERNEL4x4_M2
  2649. KERNEL4x4_M1
  2650. KERNEL4x4_M2
  2651. KERNEL4x4_M1
  2652. KERNEL4x4_M2
  2653. dec %rax
  2654. jne .L4_12
  2655. .L4_12a:
  2656. KERNEL4x4_M1
  2657. KERNEL4x4_M2
  2658. KERNEL4x4_M1
  2659. KERNEL4x4_M2
  2660. KERNEL4x4_M1
  2661. KERNEL4x4_M2
  2662. KERNEL4x4_M1
  2663. KERNEL4x4_E
  2664. jmp .L4_16
  2665. .L4_13:
  2666. test $1, %rax
  2667. jz .L4_14
  2668. KERNEL4x4_I
  2669. KERNEL4x4_M2
  2670. KERNEL4x4_M1
  2671. KERNEL4x4_M2
  2672. KERNEL4x4_M1
  2673. KERNEL4x4_M2
  2674. KERNEL4x4_M1
  2675. KERNEL4x4_E
  2676. jmp .L4_16
  2677. .L4_14:
  2678. INIT4x4
  2679. .L4_16:
  2680. movq KKK, %rax
  2681. andq $7, %rax # if (k & 1)
  2682. je .L4_19
  2683. ALIGN_4
  2684. .L4_17:
  2685. KERNEL4x4_SUB
  2686. dec %rax
  2687. jne .L4_17
  2688. ALIGN_4
  2689. .L4_19:
  2690. SAVE4x4
  2691. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2692. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2693. movq K, %rax
  2694. subq KKK, %rax
2695. salq $3, %rax // rax * SIZE
  2696. leaq (BO, %rax, 4), BO // number of values in B
  2697. leaq (AO, %rax, 4), AO // number of values in A
  2698. #endif
  2699. #if defined(TRMMKERNEL) && defined(LEFT)
  2700. addq $4, KK // number of values in A
  2701. #endif
  2702. decq I # i --
  2703. jg .L4_11
  2704. ALIGN_4
  2705. /**************************************************************************
  2706. * Rest of M
  2707. ***************************************************************************/
  2708. .L4_20:
  2709. // Test rest of M
  2710. testq $3, M
2711. jz .L4_100 // to next lines of N
  2712. .L4_30:
  2713. testq $2, M
  2714. jz .L4_40
  2715. ALIGN_4
  2716. .L4_31:
  2717. #if !defined(TRMMKERNEL) || \
  2718. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2719. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2720. movq B, BO
  2721. addq $12 * SIZE, BO
  2722. #else
  2723. movq B, BO
  2724. addq $12 * SIZE, BO
  2725. movq KK, %rax
  2726. salq $3, %rax // rax * SIZE
  2727. leaq (BO,%rax,4), BO // add number of values in B
  2728. leaq (AO,%rax,2), AO // add number of values in A
  2729. #endif
  2730. #ifndef TRMMKERNEL
  2731. movq K, %rax
  2732. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2733. movq K, %rax
  2734. subq KK, %rax
  2735. movq %rax, KKK
  2736. #else
  2737. movq KK, %rax
  2738. #ifdef LEFT
  2739. addq $2, %rax // number of values in AO
  2740. #else
  2741. addq $4, %rax // number of values in BO
  2742. #endif
  2743. movq %rax, KKK
  2744. #endif
  2745. INIT2x4
  2746. sarq $3, %rax
  2747. je .L4_36
  2748. ALIGN_4
  2749. .L4_32:
  2750. KERNEL2x4_SUB
  2751. KERNEL2x4_SUB
  2752. KERNEL2x4_SUB
  2753. KERNEL2x4_SUB
  2754. KERNEL2x4_SUB
  2755. KERNEL2x4_SUB
  2756. KERNEL2x4_SUB
  2757. KERNEL2x4_SUB
  2758. dec %rax
  2759. jne .L4_32
  2760. ALIGN_4
  2761. .L4_36:
  2762. movq KKK, %rax
  2763. andq $7, %rax # if (k & 1)
  2764. je .L4_39
  2765. ALIGN_4
  2766. .L4_37:
  2767. KERNEL2x4_SUB
  2768. dec %rax
  2769. jne .L4_37
  2770. .L4_39:
  2771. SAVE2x4
  2772. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2773. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2774. movq K, %rax
  2775. subq KKK, %rax
2776. salq $3, %rax // rax * SIZE
  2777. leaq (BO, %rax, 4), BO // number of values in B
  2778. leaq (AO, %rax, 2), AO // number of values in A
  2779. #endif
  2780. #if defined(TRMMKERNEL) && defined(LEFT)
  2781. addq $2, KK // number of values in A
  2782. #endif
  2783. .L4_40:
  2784. testq $1, M
  2785. jz .L4_100 // to next 3 lines of N
  2786. ALIGN_4
  2787. .L4_41:
  2788. #if !defined(TRMMKERNEL) || \
  2789. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2790. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2791. movq B, BO
  2792. addq $12 * SIZE, BO
  2793. #else
  2794. movq B, BO
  2795. addq $12 * SIZE, BO
  2796. movq KK, %rax
  2797. salq $3, %rax // rax * SIZE
  2798. leaq (BO,%rax,4), BO // add number of values in B
  2799. leaq (AO,%rax,1), AO // add number of values in A
  2800. #endif
  2801. #ifndef TRMMKERNEL
  2802. movq K, %rax
  2803. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2804. movq K, %rax
  2805. subq KK, %rax
  2806. movq %rax, KKK
  2807. #else
  2808. movq KK, %rax
  2809. #ifdef LEFT
  2810. addq $1, %rax // number of values in AO
  2811. #else
  2812. addq $4, %rax // number of values in BO
  2813. #endif
  2814. movq %rax, KKK
  2815. #endif
  2816. INIT1x4
  2817. sarq $3,%rax
  2818. je .L4_46
  2819. ALIGN_4
  2820. .L4_42:
  2821. KERNEL1x4_SUB
  2822. KERNEL1x4_SUB
  2823. KERNEL1x4_SUB
  2824. KERNEL1x4_SUB
  2825. KERNEL1x4_SUB
  2826. KERNEL1x4_SUB
  2827. KERNEL1x4_SUB
  2828. KERNEL1x4_SUB
  2829. dec %rax
  2830. jne .L4_42
  2831. ALIGN_4
  2832. .L4_46:
  2833. movq KKK, %rax
  2834. andq $7, %rax # if (k & 1)
  2835. je .L4_49
  2836. ALIGN_4
  2837. .L4_47:
  2838. KERNEL1x4_SUB
  2839. dec %rax
  2840. jne .L4_47
  2841. ALIGN_4
  2842. .L4_49:
  2843. SAVE1x4
  2844. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2845. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2846. movq K, %rax
  2847. subq KKK, %rax
2848. salq $3, %rax // rax * SIZE
  2849. leaq (BO, %rax, 4), BO // number of values in B
  2850. leaq (AO, %rax, 1), AO // number of values in A
  2851. #endif
  2852. #if defined(TRMMKERNEL) && defined(LEFT)
  2853. addq $1, KK // number of values in A
  2854. #endif
  2855. .L4_100:
  2856. #if defined(TRMMKERNEL) && !defined(LEFT)
  2857. addq $4, KK // number of values in B
  2858. #endif
  2859. movq K, %rax
  2860. salq $2, %rax // * 4
  2861. leaq (B , %rax, SIZE), B
  2862. /***************************************************************************************************************/
  2863. .L2_0:
  2864. movq Nmod12, J
  2865. testq $2, J
  2866. je .L1_0
  2867. .L2_10:
  2868. movq C, CO1
  2869. leaq (C, LDC, 2), C // c += 2 * ldc
  2870. #if defined(TRMMKERNEL) && defined(LEFT)
  2871. movq OFFSET, %rax
  2872. movq %rax, KK
  2873. #endif
  2874. movq A, AO // aoffset = a
  2875. addq $16 * SIZE, AO
  2876. movq M, I
  2877. sarq $2, I // i = m / 4
  2878. je .L2_20
  2879. ALIGN_4
  2880. .L2_11:
  2881. #if !defined(TRMMKERNEL) || \
  2882. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2883. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2884. movq B, BO
  2885. addq $12 * SIZE, BO
  2886. #else
  2887. movq B, BO
  2888. addq $12 * SIZE, BO
  2889. movq KK, %rax
  2890. salq $3, %rax // rax * SIZE
  2891. leaq (BO,%rax,2), BO // add number of values in B
  2892. leaq (AO,%rax,4), AO // add number of values in A
  2893. #endif
  2894. #ifndef TRMMKERNEL
  2895. movq K, %rax
  2896. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2897. movq K, %rax
  2898. subq KK, %rax
  2899. movq %rax, KKK
  2900. #else
  2901. movq KK, %rax
  2902. #ifdef LEFT
  2903. addq $4, %rax // number of values in AO
  2904. #else
  2905. addq $2, %rax // number of values in BO
  2906. #endif
  2907. movq %rax, KKK
  2908. #endif
  2909. INIT4x2
  2910. sarq $3, %rax // K / 8
  2911. je .L2_16
  2912. ALIGN_5
  2913. .L2_12:
  2914. KERNEL4x2_SUB
  2915. KERNEL4x2_SUB
  2916. KERNEL4x2_SUB
  2917. KERNEL4x2_SUB
  2918. KERNEL4x2_SUB
  2919. KERNEL4x2_SUB
  2920. KERNEL4x2_SUB
  2921. KERNEL4x2_SUB
  2922. dec %rax
  2923. jne .L2_12
  2924. .L2_16:
  2925. movq KKK, %rax
  2926. andq $7, %rax # if (k & 1)
  2927. je .L2_19
  2928. ALIGN_4
  2929. .L2_17:
  2930. KERNEL4x2_SUB
  2931. dec %rax
  2932. jne .L2_17
  2933. ALIGN_4
  2934. .L2_19:
  2935. SAVE4x2
  2936. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2937. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2938. movq K, %rax
  2939. subq KKK, %rax
2940. salq $3, %rax // rax * SIZE
  2941. leaq (BO, %rax, 2), BO // number of values in B
  2942. leaq (AO, %rax, 4), AO // number of values in A
  2943. #endif
  2944. #if defined(TRMMKERNEL) && defined(LEFT)
  2945. addq $4, KK // number of values in A
  2946. #endif
  2947. decq I # i --
  2948. jg .L2_11
  2949. ALIGN_4
  2950. /**************************************************************************
  2951. * Rest of M
  2952. ***************************************************************************/
  2953. .L2_20:
  2954. // Test rest of M
  2955. testq $3, M
2956. jz .L2_100 // to next lines of N
  2957. .L2_30:
  2958. testq $2, M
  2959. jz .L2_40
  2960. ALIGN_4
  2961. .L2_31:
  2962. #if !defined(TRMMKERNEL) || \
  2963. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2964. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2965. movq B, BO
  2966. addq $12 * SIZE, BO
  2967. #else
  2968. movq B, BO
  2969. addq $12 * SIZE, BO
  2970. movq KK, %rax
  2971. salq $3, %rax // rax * SIZE
  2972. leaq (BO,%rax,2), BO // add number of values in B
  2973. leaq (AO,%rax,2), AO // add number of values in A
  2974. #endif
  2975. #ifndef TRMMKERNEL
  2976. movq K, %rax
  2977. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2978. movq K, %rax
  2979. subq KK, %rax
  2980. movq %rax, KKK
  2981. #else
  2982. movq KK, %rax
  2983. #ifdef LEFT
  2984. addq $2, %rax // number of values in AO
  2985. #else
  2986. addq $2, %rax // number of values in BO
  2987. #endif
  2988. movq %rax, KKK
  2989. #endif
  2990. INIT2x2
  2991. sarq $3, %rax
  2992. je .L2_36
  2993. ALIGN_4
  2994. .L2_32:
  2995. KERNEL2x2_SUB
  2996. KERNEL2x2_SUB
  2997. KERNEL2x2_SUB
  2998. KERNEL2x2_SUB
  2999. KERNEL2x2_SUB
  3000. KERNEL2x2_SUB
  3001. KERNEL2x2_SUB
  3002. KERNEL2x2_SUB
  3003. dec %rax
  3004. jne .L2_32
  3005. .L2_36:
  3006. movq KKK, %rax
  3007. andq $7, %rax # if (k & 1)
  3008. je .L2_39
  3009. ALIGN_4
  3010. .L2_37:
  3011. KERNEL2x2_SUB
  3012. dec %rax
  3013. jne .L2_37
  3014. .L2_39:
  3015. SAVE2x2
  3016. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3017. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3018. movq K, %rax
  3019. subq KKK, %rax
3020. salq $3, %rax // rax * SIZE
  3021. leaq (BO, %rax, 2), BO // number of values in B
  3022. leaq (AO, %rax, 2), AO // number of values in A
  3023. #endif
  3024. #if defined(TRMMKERNEL) && defined(LEFT)
  3025. addq $2, KK // number of values in A
  3026. #endif
  3027. .L2_40:
  3028. testq $1, M
  3029. jz .L2_100 // to next 3 lines of N
  3030. .L2_41:
  3031. #if !defined(TRMMKERNEL) || \
  3032. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3033. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3034. movq B, BO
  3035. addq $12 * SIZE, BO
  3036. #else
  3037. movq B, BO
  3038. addq $12 * SIZE, BO
  3039. movq KK, %rax
  3040. salq $3, %rax // rax * SIZE
  3041. leaq (BO,%rax,2), BO // add number of values in B
  3042. leaq (AO,%rax,1), AO // add number of values in A
  3043. #endif
  3044. #ifndef TRMMKERNEL
  3045. movq K, %rax
  3046. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3047. movq K, %rax
  3048. subq KK, %rax
  3049. movq %rax, KKK
  3050. #else
  3051. movq KK, %rax
  3052. #ifdef LEFT
  3053. addq $1, %rax // number of values in AO
  3054. #else
  3055. addq $2, %rax // number of values in BO
  3056. #endif
  3057. movq %rax, KKK
  3058. #endif
  3059. INIT1x2
  3060. sarq $3,%rax
  3061. je .L2_46
  3062. ALIGN_4
  3063. .L2_42:
  3064. KERNEL1x2_SUB
  3065. KERNEL1x2_SUB
  3066. KERNEL1x2_SUB
  3067. KERNEL1x2_SUB
  3068. KERNEL1x2_SUB
  3069. KERNEL1x2_SUB
  3070. KERNEL1x2_SUB
  3071. KERNEL1x2_SUB
  3072. dec %rax
  3073. jne .L2_42
  3074. .L2_46:
  3075. movq KKK, %rax
  3076. andq $7, %rax # if (k & 1)
  3077. je .L2_49
  3078. ALIGN_4
  3079. .L2_47:
  3080. KERNEL1x2_SUB
  3081. dec %rax
  3082. jne .L2_47
  3083. .L2_49:
  3084. SAVE1x2
  3085. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3086. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3087. movq K, %rax
  3088. subq KKK, %rax
  3089. salq $3, %rax // rax * SIZE
  3090. leaq (BO, %rax, 2), BO // number of values in B
  3091. leaq (AO, %rax, 1), AO // number of values in A
  3092. #endif
  3093. #if defined(TRMMKERNEL) && defined(LEFT)
  3094. addq $1, KK // number of values in A
  3095. #endif
  3096. .L2_100:
  3097. #if defined(TRMMKERNEL) && !defined(LEFT)
  3098. addq $2, KK // number of values in B
  3099. #endif
  3100. movq K, %rax
  3101. salq $1, %rax // * 2
  3102. leaq (B , %rax, SIZE), B
  3103. /***************************************************************************************************************/
  3104. .L1_0:
  3105. movq Nmod12, J
  3106. testq $1, J
  3107. je .L999
  3108. .L1_10:
  3109. movq C, CO1
  3110. leaq (C, LDC, 1), C // c += 1 * ldc
  3111. #if defined(TRMMKERNEL) && defined(LEFT)
  3112. movq OFFSET, %rax
  3113. movq %rax, KK
  3114. #endif
  3115. movq A, AO // aoffset = a
  3116. addq $16 * SIZE, AO
  3117. movq M, I
  3118. sarq $2, I // i = m / 4
  3119. je .L1_20
  3120. ALIGN_4
  3121. .L1_11:
  3122. #if !defined(TRMMKERNEL) || \
  3123. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3124. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3125. movq B, BO
  3126. addq $12 * SIZE, BO
  3127. #else
  3128. movq B, BO
  3129. addq $12 * SIZE, BO
  3130. movq KK, %rax
  3131. salq $3, %rax // rax * SIZE
  3132. leaq (BO,%rax,1), BO // add number of values in B
  3133. leaq (AO,%rax,4), AO // add number of values in A
  3134. #endif
  3135. #ifndef TRMMKERNEL
  3136. movq K, %rax
  3137. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3138. movq K, %rax
  3139. subq KK, %rax
  3140. movq %rax, KKK
  3141. #else
  3142. movq KK, %rax
  3143. #ifdef LEFT
  3144. addq $4, %rax // number of values in AO
  3145. #else
  3146. addq $1, %rax // number of values in BO
  3147. #endif
  3148. movq %rax, KKK
  3149. #endif
  3150. INIT4x1
  3151. sarq $3, %rax // K / 8
  3152. je .L1_16
  3153. ALIGN_5
  3154. .L1_12:
  3155. KERNEL4x1
  3156. dec %rax
  3157. jne .L1_12
  3158. .L1_16:
  3159. movq KKK, %rax
  3160. andq $7, %rax # if (k & 1)
  3161. je .L1_19
  3162. ALIGN_4
  3163. .L1_17:
  3164. KERNEL4x1_SUB
  3165. dec %rax
  3166. jne .L1_17
  3167. ALIGN_4
  3168. .L1_19:
  3169. SAVE4x1
  3170. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3171. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3172. movq K, %rax
  3173. subq KKK, %rax
  3174. salq $3, %rax // rax * SIZE
  3175. leaq (BO, %rax, 1), BO // number of values in B
  3176. leaq (AO, %rax, 4), AO // number of values in A
  3177. #endif
  3178. #if defined(TRMMKERNEL) && defined(LEFT)
  3179. addq $4, KK // number of values in A
  3180. #endif
  3181. decq I # i --
  3182. jg .L1_11
  3183. /**************************************************************************
  3184. * Rest of M
  3185. ***************************************************************************/
  3186. .L1_20:
  3187. // Test rest of M
  3188. testq $3, M
  3189. jz .L1_100
  3190. .L1_30:
  3191. testq $2, M
  3192. jz .L1_40
  3193. ALIGN_4
  3194. .L1_31:
  3195. #if !defined(TRMMKERNEL) || \
  3196. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3197. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3198. movq B, BO
  3199. addq $12 * SIZE, BO
  3200. #else
  3201. movq B, BO
  3202. addq $12 * SIZE, BO
  3203. movq KK, %rax
  3204. salq $3, %rax // rax * SIZE
  3205. leaq (BO,%rax,1), BO // add number of values in B
  3206. leaq (AO,%rax,2), AO // add number of values in A
  3207. #endif
  3208. #ifndef TRMMKERNEL
  3209. movq K, %rax
  3210. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3211. movq K, %rax
  3212. subq KK, %rax
  3213. movq %rax, KKK
  3214. #else
  3215. movq KK, %rax
  3216. #ifdef LEFT
  3217. addq $2, %rax // number of values in AO
  3218. #else
  3219. addq $1, %rax // number of values in BO
  3220. #endif
  3221. movq %rax, KKK
  3222. #endif
  3223. INIT2x1
  3224. sarq $3, %rax
  3225. je .L1_36
  3226. ALIGN_4
  3227. .L1_32:
  3228. KERNEL2x1_SUB
  3229. KERNEL2x1_SUB
  3230. KERNEL2x1_SUB
  3231. KERNEL2x1_SUB
  3232. KERNEL2x1_SUB
  3233. KERNEL2x1_SUB
  3234. KERNEL2x1_SUB
  3235. KERNEL2x1_SUB
  3236. dec %rax
  3237. jne .L1_32
  3238. .L1_36:
  3239. movq KKK, %rax
  3240. andq $7, %rax # if (k & 1)
  3241. je .L1_39
  3242. ALIGN_4
  3243. .L1_37:
  3244. KERNEL2x1_SUB
  3245. dec %rax
  3246. jne .L1_37
  3247. .L1_39:
  3248. SAVE2x1
  3249. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3250. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3251. movq K, %rax
  3252. subq KKK, %rax
  3253. salq $3, %rax // rax * SIZE
  3254. leaq (BO, %rax, 1), BO // number of values in B
  3255. leaq (AO, %rax, 2), AO // number of values in A
  3256. #endif
  3257. #if defined(TRMMKERNEL) && defined(LEFT)
  3258. addq $2, KK // number of values in A
  3259. #endif
  3260. .L1_40:
  3261. testq $1, M
  3262. jz .L1_100 // to next 3 lines of N
  3263. .L1_41:
  3264. #if !defined(TRMMKERNEL) || \
  3265. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3266. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3267. movq B, BO
  3268. addq $12 * SIZE, BO
  3269. #else
  3270. movq B, BO
  3271. addq $12 * SIZE, BO
  3272. movq KK, %rax
  3273. salq $3, %rax // rax * SIZE
  3274. leaq (BO,%rax,1), BO // add number of values in B
  3275. leaq (AO,%rax,1), AO // add number of values in A
  3276. #endif
  3277. #ifndef TRMMKERNEL
  3278. movq K, %rax
  3279. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3280. movq K, %rax
  3281. subq KK, %rax
  3282. movq %rax, KKK
  3283. #else
  3284. movq KK, %rax
  3285. #ifdef LEFT
  3286. addq $1, %rax // number of values in AO
  3287. #else
  3288. addq $1, %rax // number of values in BO
  3289. #endif
  3290. movq %rax, KKK
  3291. #endif
  3292. INIT1x1
  3293. sarq $3,%rax
  3294. je .L1_46
  3295. ALIGN_4
  3296. .L1_42:
  3297. KERNEL1x1_SUB
  3298. KERNEL1x1_SUB
  3299. KERNEL1x1_SUB
  3300. KERNEL1x1_SUB
  3301. KERNEL1x1_SUB
  3302. KERNEL1x1_SUB
  3303. KERNEL1x1_SUB
  3304. KERNEL1x1_SUB
  3305. dec %rax
  3306. jne .L1_42
  3307. .L1_46:
  3308. movq KKK, %rax
  3309. andq $7, %rax # if (k & 1)
  3310. je .L1_49
  3311. ALIGN_4
  3312. .L1_47:
  3313. KERNEL1x1_SUB
  3314. dec %rax
  3315. jne .L1_47
  3316. .L1_49:
  3317. SAVE1x1
  3318. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3319. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3320. movq K, %rax
  3321. subq KKK, %rax
  3322. salq $3, %rax // rax * SIZE
  3323. leaq (BO, %rax, 1), BO // number of values in B
  3324. leaq (AO, %rax, 1), AO // number of values in A
  3325. #endif
  3326. #if defined(TRMMKERNEL) && defined(LEFT)
  3327. addq $1, KK // number of values in A
  3328. #endif
  3329. .L1_100:
  3330. #if defined(TRMMKERNEL) && !defined(LEFT)
  3331. addq $1, KK // number of values in B
  3332. #endif
  3333. .L999:
  3334. vzeroupper
  3335. movq SP, %rsp
  3336. movq (%rsp), %rbx
  3337. movq 8(%rsp), %rbp
  3338. movq 16(%rsp), %r12
  3339. movq 24(%rsp), %r13
  3340. movq 32(%rsp), %r14
  3341. movq 40(%rsp), %r15
  3342. #ifdef WINDOWS_ABI
  3343. movq 48(%rsp), %rdi
  3344. movq 56(%rsp), %rsi
  3345. vmovups 64(%rsp), %xmm6
  3346. vmovups 80(%rsp), %xmm7
  3347. vmovups 96(%rsp), %xmm8
  3348. vmovups 112(%rsp), %xmm9
  3349. vmovups 128(%rsp), %xmm10
  3350. vmovups 144(%rsp), %xmm11
  3351. vmovups 160(%rsp), %xmm12
  3352. vmovups 176(%rsp), %xmm13
  3353. vmovups 192(%rsp), %xmm14
  3354. vmovups 208(%rsp), %xmm15
  3355. #endif
  3356. addq $STACKSIZE, %rsp
  3357. ret
  3358. EPILOGUE
  3359. #endif