
dgemm_kernel_16x2_haswell.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*********************************************************************
* 2013/10/20 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2013/10/20 Saar
* Parameter:
* DGEMM_DEFAULT_UNROLL_N 2
* DGEMM_DEFAULT_UNROLL_M 16
* DGEMM_DEFAULT_P 192
* DGEMM_DEFAULT_Q 128
* A_PR1 512
*
*
* Performance without prefetch of B:
* 1 thread: 45.8 GFLOPS (MKL: 45)
* 2 threads: 80.0 GFLOPS (MKL: 91)
* 4 threads: 135.0 GFLOPS (MKL: 135)
*********************************************************************/
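/*********************************************************************
* Tuning notes (roughly): DGEMM_DEFAULT_UNROLL_M/N give the register-
* blocking tile computed per inner-loop iteration, DGEMM_DEFAULT_P and
* DGEMM_DEFAULT_Q are the cache-blocking sizes the level-3 driver uses
* when packing panels of A and B, and A_PR1 is the prefetch distance
* in bytes used by the prefetcht0 instructions on the A panel.
*********************************************************************/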
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA 48(%rsp)
#define OFFSET 56(%rsp)
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
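/* Ndiv6 .. BUFFER2 are stack-resident locals; BUFFER1/BUFFER2 appear to be
 * the two halves of the L_BUFFER_SIZE scratch area used for repacked B data.
 * Because that area can exceed one 4 KiB page, the STACK_TOUCH macro below
 * stores into each page under %rsp on Windows so the guard-page mechanism
 * commits the stack before the buffer is written. */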
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $0, 4096 * 4(%rsp);\
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
#if defined(BULLDOZER)
.macro VFMADD231PD_ y0,y1,y2
vfmaddpd \y0,\y1,\y2,\y0
.endm
.macro VFMADD231SD_ x0,x1,x2
vfmaddsd \x0,\x1,\x2,\x0
.endm
#else
.macro VFMADD231PD_ y0,y1,y2
vfmadd231pd \y2,\y1,\y0
.endm
.macro VFMADD231SD_ x0,x1,x2
vfmadd231sd \x2,\x1,\x0
.endm
#endif
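/* Both variants compute dst += src1 * src2, i.e. \y0 += \y1 * \y2:
 * Bulldozer-class CPUs get the 4-operand FMA4 encoding, everything else
 * the 3-operand FMA3 encoding with the operands reordered for AT&T syntax. */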
#define A_PR1 512
#define B_PR1 256
/*******************************************************************************************
* 3 lines of N
*******************************************************************************************/
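/* The KERNEL16x3_* macros compute one k-step of a 16x3 register tile of C:
 * four 4-wide loads of A into ymm0, three broadcasts of B into ymm1..ymm3,
 * and twelve FMAs accumulating into ymm4..ymm15 (one ymm register per
 * 4x1 sub-block of the tile). Roughly, in C terms:
 *
 *   for (int j = 0; j < 3; j++)
 *     for (int i = 0; i < 16; i++)
 *       c[i][j] += a[i] * b[j];
 *
 * The _SUBN variants advance the AO/BO pointers themselves. */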
.macro KERNEL16x3_SUBN
prefetcht0 A_PR1(AO)
vbroadcastsd -12 * SIZE(BO), %ymm1
vmovaps -16 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -11 * SIZE(BO), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -10 * SIZE(BO), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovaps -12 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 A_PR1+64(AO)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
vmovaps -8 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
VFMADD231PD_ %ymm12,%ymm3,%ymm0
vmovaps -4 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
VFMADD231PD_ %ymm15,%ymm3,%ymm0
addq $ 3*SIZE , BO
addq $ 16*SIZE, AO
.endm
.macro KERNEL8x3_SUBN
//prefetcht0 A_PR1(AO)
vbroadcastsd -12 * SIZE(BO), %ymm1
vmovaps -16 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -11 * SIZE(BO), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -10 * SIZE(BO), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovaps -12 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
//prefetcht0 A_PR1+64(AO)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
prefetcht0 B_PR1(BO)
addq $ 3*SIZE , BO
addq $ 8*SIZE, AO
.endm
.macro KERNEL4x3_SUBN
vbroadcastsd -12 * SIZE(BO), %ymm1
vmovaps -16 * SIZE(AO), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -11 * SIZE(BO), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -10 * SIZE(BO), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
addq $ 3*SIZE , BO
addq $ 4*SIZE, AO
.endm
.macro KERNEL2x3_SUBN
vmovsd -12 * SIZE(BO), %xmm1
vmovsd -16 * SIZE(AO), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -11 * SIZE(BO), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -10 * SIZE(BO), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
vmovsd -15 * SIZE(AO), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
VFMADD231SD_ %xmm12,%xmm3,%xmm0
addq $ 3*SIZE , BO
addq $ 2*SIZE, AO
.endm
.macro KERNEL1x3_SUBN
vmovsd -12 * SIZE(BO), %xmm1
vmovsd -16 * SIZE(AO), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -11 * SIZE(BO), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -10 * SIZE(BO), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
addq $ 3*SIZE , BO
addq $ 1*SIZE, AO
.endm
/******************************************************************************************/
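/* The kernels below index A and B through %rax and BI instead of moving
 * AO/BO: KERNEL16x3_1 .. KERNEL16x3_4 are the four steps of a k-loop
 * unrolled by four, and the index registers are bumped only once per
 * unrolled group (in the _4 step); the _SUB variants handle the k
 * remainder one step at a time. */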
.macro KERNEL16x3_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 64+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
VFMADD231PD_ %ymm12,%ymm3,%ymm0
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm15,%ymm3,%ymm0
.endm
.macro KERNEL16x3_2
prefetcht0 128+A_PR1(AO, %rax, SIZE)
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
prefetcht0 A_PR1+64(AO,%rax,SIZE)
VFMADD231PD_ %ymm9,%ymm3,%ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
prefetcht0 192+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm11,%ymm2,%ymm0
VFMADD231PD_ %ymm12,%ymm3,%ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm15,%ymm3,%ymm0
.endm
.macro KERNEL16x3_3
prefetcht0 256+A_PR1(AO, %rax, SIZE)
vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 320+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
VFMADD231PD_ %ymm12,%ymm3,%ymm0
vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1
vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm15,%ymm3,%ymm0
.endm
.macro KERNEL16x3_4
prefetcht0 384+A_PR1(AO, %rax, SIZE)
vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 448+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
addq $12, BI
VFMADD231PD_ %ymm12,%ymm3,%ymm0
vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
addq $64, %rax
VFMADD231PD_ %ymm15,%ymm3,%ymm0
.endm
.macro KERNEL16x3_SUB
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
VFMADD231PD_ %ymm12,%ymm3,%ymm0
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
VFMADD231PD_ %ymm15,%ymm3,%ymm0
addq $3 , BI
addq $16, %rax
.endm
.macro SAVE16x3
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm7 , %ymm7
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm13, %ymm13
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm8 , %ymm8
vmulpd %ymm0 , %ymm11, %ymm11
vmulpd %ymm0 , %ymm14, %ymm14
vmulpd %ymm0 , %ymm6 , %ymm6
vmulpd %ymm0 , %ymm9 , %ymm9
vmulpd %ymm0 , %ymm12, %ymm12
vmulpd %ymm0 , %ymm15, %ymm15
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4,%ymm4
vaddpd 4 * SIZE(CO1), %ymm7,%ymm7
vaddpd 8 * SIZE(CO1), %ymm10,%ymm10
vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
vaddpd (CO1, LDC), %ymm5,%ymm5
vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8
vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11
vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14
vaddpd (CO1, LDC, 2), %ymm6,%ymm6
vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9
vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12
vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm7 , 4 * SIZE(CO1)
vmovups %ymm10, 8 * SIZE(CO1)
vmovups %ymm13,12 * SIZE(CO1)
vmovups %ymm5 , (CO1, LDC)
vmovups %ymm8 , 4 * SIZE(CO1, LDC)
vmovups %ymm11, 8 * SIZE(CO1, LDC)
vmovups %ymm14,12 * SIZE(CO1, LDC)
vmovups %ymm6 , (CO1, LDC, 2)
vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2)
vmovups %ymm12, 8 * SIZE(CO1, LDC, 2)
vmovups %ymm15,12 * SIZE(CO1, LDC, 2)
.endm
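/* SAVE16x3 writes the tile back: every accumulator is scaled by ALPHA and,
 * unless this is built as a TRMM kernel, the existing C values are added
 * first, so the effect is C = alpha*A*B + C over a 16x3 block spanning the
 * columns at CO1, CO1+LDC and CO1+2*LDC. */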
/*******************************************************************************************/
.macro KERNEL8x3_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
.endm
.macro KERNEL8x3_2
prefetcht0 64+A_PR1(AO, %rax, SIZE)
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
.endm
.macro KERNEL8x3_3
prefetcht0 128+A_PR1(AO, %rax, SIZE)
vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
.endm
.macro KERNEL8x3_4
prefetcht0 192+A_PR1(AO, %rax, SIZE)
vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
addq $12, BI
addq $32, %rax
.endm
.macro KERNEL8x3_SUB
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
VFMADD231PD_ %ymm9,%ymm3,%ymm0
addq $3 , BI
addq $8 , %rax
.endm
.macro SAVE8x3
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm7 , %ymm7
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm8 , %ymm8
vmulpd %ymm0 , %ymm6 , %ymm6
vmulpd %ymm0 , %ymm9 , %ymm9
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4,%ymm4
vaddpd 4 * SIZE(CO1), %ymm7,%ymm7
vaddpd (CO1, LDC), %ymm5,%ymm5
vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8
vaddpd (CO1, LDC, 2), %ymm6,%ymm6
vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm7 , 4 * SIZE(CO1)
vmovups %ymm5 , (CO1, LDC)
vmovups %ymm8 , 4 * SIZE(CO1, LDC)
vmovups %ymm6 , (CO1, LDC, 2)
vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2)
.endm
/*******************************************************************************************/
.macro KERNEL4x3_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
.endm
.macro KERNEL4x3_2
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
.endm
.macro KERNEL4x3_3
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
.endm
.macro KERNEL4x3_4
vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
addq $12, BI
addq $16, %rax
.endm
.macro KERNEL4x3_SUB
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PD_ %ymm6,%ymm3,%ymm0
addq $3 , BI
addq $4 , %rax
.endm
.macro SAVE4x3
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm6 , %ymm6
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4,%ymm4
vaddpd (CO1, LDC), %ymm5,%ymm5
vaddpd (CO1, LDC, 2), %ymm6,%ymm6
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm5 , (CO1, LDC)
vmovups %ymm6 , (CO1, LDC, 2)
.endm
/*******************************************************************************************/
.macro KERNEL2x3_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
VFMADD231SD_ %xmm12,%xmm3,%xmm0
.endm
.macro KERNEL2x3_2
vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
VFMADD231SD_ %xmm12,%xmm3,%xmm0
.endm
.macro KERNEL2x3_3
vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
VFMADD231SD_ %xmm12,%xmm3,%xmm0
.endm
.macro KERNEL2x3_4
vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
VFMADD231SD_ %xmm12,%xmm3,%xmm0
addq $12, BI
addq $8, %rax
.endm
.macro KERNEL2x3_SUB
vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
VFMADD231SD_ %xmm12,%xmm3,%xmm0
addq $3 , BI
addq $2 , %rax
.endm
.macro SAVE2x3
vmovsd ALPHA, %xmm0
vmulsd %xmm0 , %xmm4 , %xmm4
vmulsd %xmm0 , %xmm8 , %xmm8
vmulsd %xmm0 , %xmm5 , %xmm5
vmulsd %xmm0 , %xmm10, %xmm10
vmulsd %xmm0 , %xmm6 , %xmm6
vmulsd %xmm0 , %xmm12, %xmm12
#if !defined(TRMMKERNEL)
vaddsd (CO1), %xmm4,%xmm4
vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
vaddsd (CO1, LDC), %xmm5,%xmm5
vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10
vaddsd (CO1, LDC, 2), %xmm6,%xmm6
vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12
#endif
vmovsd %xmm4 , (CO1)
vmovsd %xmm8 , 1 * SIZE(CO1)
vmovsd %xmm5 , (CO1, LDC)
vmovsd %xmm10, 1 * SIZE(CO1, LDC)
vmovsd %xmm6 , (CO1, LDC, 2)
vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2)
.endm
/*******************************************************************************************/
.macro KERNEL1x3_1
vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
.endm
.macro KERNEL1x3_2
vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
.endm
.macro KERNEL1x3_3
vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
.endm
.macro KERNEL1x3_4
vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
addq $12, BI
addq $4, %rax
.endm
.macro KERNEL1x3_SUB
vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SD_ %xmm6,%xmm3,%xmm0
addq $3 , BI
addq $1 , %rax
.endm
.macro SAVE1x3
vmovsd ALPHA, %xmm0
vmulsd %xmm0 , %xmm4 , %xmm4
vmulsd %xmm0 , %xmm5 , %xmm5
vmulsd %xmm0 , %xmm6 , %xmm6
#if !defined(TRMMKERNEL)
vaddsd (CO1), %xmm4,%xmm4
vaddsd (CO1, LDC), %xmm5,%xmm5
vaddsd (CO1, LDC, 2), %xmm6,%xmm6
#endif
vmovsd %xmm4 , (CO1)
vmovsd %xmm5 , (CO1, LDC)
vmovsd %xmm6 , (CO1, LDC, 2)
.endm
/*******************************************************************************************/
/*******************************************************************************************
* 2 lines of N
*******************************************************************************************/
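/* Same register-tiling idea with only two columns of B: the 16x2 kernels
 * broadcast two B values per k-step and accumulate into eight ymm registers
 * (ymm4/ymm7/ymm10/ymm13 for the first column, ymm5/ymm8/ymm11/ymm14 for
 * the second). */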
.macro KERNEL16x2_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 64+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
.endm
.macro KERNEL16x2_2
prefetcht0 128+A_PR1(AO, %rax, SIZE)
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 192+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
.endm
.macro KERNEL16x2_3
prefetcht0 256+A_PR1(AO, %rax, SIZE)
vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 320+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
.endm
.macro KERNEL16x2_4
prefetcht0 384+A_PR1(AO, %rax, SIZE)
vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1
vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
prefetcht0 448+A_PR1(AO, %rax, SIZE)
VFMADD231PD_ %ymm8,%ymm2,%ymm0
vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
addq $8, BI
addq $64, %rax
.endm
.macro KERNEL16x2_SUB
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm10,%ymm1,%ymm0
VFMADD231PD_ %ymm11,%ymm2,%ymm0
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm13,%ymm1,%ymm0
VFMADD231PD_ %ymm14,%ymm2,%ymm0
addq $2, BI
addq $16, %rax
.endm
.macro SAVE16x2
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm7 , %ymm7
vmulpd %ymm0 , %ymm10, %ymm10
vmulpd %ymm0 , %ymm13, %ymm13
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm8 , %ymm8
vmulpd %ymm0 , %ymm11, %ymm11
vmulpd %ymm0 , %ymm14, %ymm14
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4,%ymm4
vaddpd 4 * SIZE(CO1), %ymm7,%ymm7
vaddpd 8 * SIZE(CO1), %ymm10,%ymm10
vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
vaddpd (CO1, LDC), %ymm5,%ymm5
vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8
vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11
vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm7 , 4 * SIZE(CO1)
vmovups %ymm10, 8 * SIZE(CO1)
vmovups %ymm13,12 * SIZE(CO1)
vmovups %ymm5 , (CO1, LDC)
vmovups %ymm8 , 4 * SIZE(CO1, LDC)
vmovups %ymm11, 8 * SIZE(CO1, LDC)
vmovups %ymm14,12 * SIZE(CO1, LDC)
.endm
/*******************************************************************************************/
.macro KERNEL8x2_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
.endm
.macro KERNEL8x2_2
prefetcht0 64+A_PR1(AO, %rax, SIZE)
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
.endm
.macro KERNEL8x2_3
prefetcht0 128+A_PR1(AO, %rax, SIZE)
vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
.endm
.macro KERNEL8x2_4
prefetcht0 192+A_PR1(AO, %rax, SIZE)
vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
addq $8, BI
addq $32, %rax
.endm
.macro KERNEL8x2_SUB
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm7,%ymm1,%ymm0
VFMADD231PD_ %ymm8,%ymm2,%ymm0
addq $2, BI
addq $8 , %rax
.endm
.macro SAVE8x2
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm7 , %ymm7
vmulpd %ymm0 , %ymm5 , %ymm5
vmulpd %ymm0 , %ymm8 , %ymm8
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4,%ymm4
vaddpd 4 * SIZE(CO1), %ymm7,%ymm7
vaddpd (CO1, LDC), %ymm5,%ymm5
vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm7 , 4 * SIZE(CO1)
vmovups %ymm5 , (CO1, LDC)
vmovups %ymm8 , 4 * SIZE(CO1, LDC)
.endm
/*******************************************************************************************/
.macro KERNEL4x2_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
.endm
.macro KERNEL4x2_2
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
.endm
.macro KERNEL4x2_3
prefetcht0 64+A_PR1(AO, %rax, SIZE)
vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
.endm
.macro KERNEL4x2_4
vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1
vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
addq $8, BI
addq $16, %rax
.endm
.macro KERNEL4x2_SUB
vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1
vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
VFMADD231PD_ %ymm4,%ymm1,%ymm0
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PD_ %ymm5,%ymm2,%ymm0
addq $2, BI
addq $4 , %rax
.endm
.macro SAVE4x2
vbroadcastsd ALPHA, %ymm0
vmulpd %ymm0 , %ymm4 , %ymm4
vmulpd %ymm0 , %ymm5 , %ymm5
#if !defined(TRMMKERNEL)
vaddpd (CO1), %ymm4,%ymm4
vaddpd (CO1, LDC), %ymm5,%ymm5
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm5 , (CO1, LDC)
.endm
/*******************************************************************************************/
.macro KERNEL2x2_1
prefetcht0 A_PR1(AO, %rax, SIZE)
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
.endm
.macro KERNEL2x2_2
vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
.endm
.macro KERNEL2x2_3
vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
.endm
.macro KERNEL2x2_4
vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
addq $8, BI
addq $8, %rax
.endm
.macro KERNEL2x2_SUB
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm8,%xmm1,%xmm0
VFMADD231SD_ %xmm10,%xmm2,%xmm0
addq $2, BI
addq $2, %rax
.endm
.macro SAVE2x2
vmovsd ALPHA, %xmm0
vmulsd %xmm0 , %xmm4 , %xmm4
vmulsd %xmm0 , %xmm8 , %xmm8
vmulsd %xmm0 , %xmm5 , %xmm5
vmulsd %xmm0 , %xmm10, %xmm10
#if !defined(TRMMKERNEL)
vaddsd (CO1), %xmm4,%xmm4
vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
vaddsd (CO1, LDC), %xmm5,%xmm5
vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10
#endif
vmovsd %xmm4 , (CO1)
vmovsd %xmm8 , 1 * SIZE(CO1)
vmovsd %xmm5 , (CO1, LDC)
vmovsd %xmm10, 1 * SIZE(CO1, LDC)
.endm
/*******************************************************************************************/
.macro KERNEL1x2_1
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
.endm
.macro KERNEL1x2_2
vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
.endm
.macro KERNEL1x2_3
vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
.endm
.macro KERNEL1x2_4
vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
addq $8, BI
addq $4, %rax
.endm
.macro KERNEL1x2_SUB
vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1
vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
VFMADD231SD_ %xmm4,%xmm1,%xmm0
vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SD_ %xmm5,%xmm2,%xmm0
addq $2, BI
addq $1, %rax
.endm
.macro SAVE1x2
vmovsd ALPHA, %xmm0
vmulsd %xmm0 , %xmm4 , %xmm4
vmulsd %xmm0 , %xmm5 , %xmm5
#if !defined(TRMMKERNEL)
vaddsd (CO1), %xmm4,%xmm4
vaddsd (CO1, LDC), %xmm5,%xmm5
  1080. #endif
  1081. vmovsd %xmm4 , (CO1)
  1082. vmovsd %xmm5 , (CO1, LDC)
  1083. .endm
  1084. /*******************************************************************************************/
  1085. /*******************************************************************************************
  1086. * 1 line of N
  1087. *******************************************************************************************/
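/* The *x1 kernels below compute a single output column: one vbroadcastsd (or
   vmovsd) per step supplies the B value, which is reused against every A
   vector (or scalar) of the row block, so only one accumulator per A register
   is needed. */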
  1088. .macro KERNEL16x1_1
  1089. vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
  1090. vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
  1091. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1092. vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
  1093. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1094. vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
  1095. VFMADD231PD_ %ymm10,%ymm1,%ymm0
  1096. vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
  1097. VFMADD231PD_ %ymm13,%ymm1,%ymm0
  1098. .endm
  1099. .macro KERNEL16x1_2
  1100. vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1
  1101. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  1102. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1103. vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0
  1104. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1105. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
  1106. VFMADD231PD_ %ymm10,%ymm1,%ymm0
  1107. vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0
  1108. VFMADD231PD_ %ymm13,%ymm1,%ymm0
  1109. .endm
  1110. .macro KERNEL16x1_3
  1111. vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
  1112. vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0
  1113. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1114. vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0
  1115. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1116. vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0
  1117. VFMADD231PD_ %ymm10,%ymm1,%ymm0
  1118. vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0
  1119. VFMADD231PD_ %ymm13,%ymm1,%ymm0
  1120. .endm
  1121. .macro KERNEL16x1_4
  1122. vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1
  1123. vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0
  1124. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1125. vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0
  1126. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1127. vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0
  1128. VFMADD231PD_ %ymm10,%ymm1,%ymm0
  1129. vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0
  1130. VFMADD231PD_ %ymm13,%ymm1,%ymm0
  1131. addq $4, BI
  1132. addq $64, %rax
  1133. .endm
  1134. .macro KERNEL16x1_SUB
  1135. vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
  1136. vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
  1137. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1138. vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
  1139. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1140. vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
  1141. VFMADD231PD_ %ymm10,%ymm1,%ymm0
  1142. vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
  1143. VFMADD231PD_ %ymm13,%ymm1,%ymm0
  1144. addq $1, BI
  1145. addq $16, %rax
  1146. .endm
  1147. .macro SAVE16x1
  1148. vbroadcastsd ALPHA, %ymm0
  1149. vmulpd %ymm0 , %ymm4 , %ymm4
  1150. vmulpd %ymm0 , %ymm7 , %ymm7
  1151. vmulpd %ymm0 , %ymm10, %ymm10
  1152. vmulpd %ymm0 , %ymm13, %ymm13
  1153. #if !defined(TRMMKERNEL)
  1154. vaddpd (CO1), %ymm4,%ymm4
  1155. vaddpd 4 * SIZE(CO1), %ymm7,%ymm7
  1156. vaddpd 8 * SIZE(CO1), %ymm10,%ymm10
  1157. vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
  1158. #endif
  1159. vmovups %ymm4 , (CO1)
  1160. vmovups %ymm7 , 4 * SIZE(CO1)
  1161. vmovups %ymm10, 8 * SIZE(CO1)
  1162. vmovups %ymm13,12 * SIZE(CO1)
  1163. .endm
  1164. /*******************************************************************************************/
  1165. .macro KERNEL8x1_1
  1166. vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
  1167. vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
  1168. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1169. vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
  1170. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1171. .endm
  1172. .macro KERNEL8x1_2
  1173. vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1
  1174. vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
  1175. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1176. vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
  1177. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1178. .endm
  1179. .macro KERNEL8x1_3
  1180. vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
  1181. vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
  1182. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1183. vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0
  1184. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1185. .endm
  1186. .macro KERNEL8x1_4
  1187. vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1
  1188. vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
  1189. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1190. vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0
  1191. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1192. addq $4, BI
  1193. addq $32, %rax
  1194. .endm
  1195. .macro KERNEL8x1_SUB
  1196. vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
  1197. vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
  1198. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1199. vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
  1200. VFMADD231PD_ %ymm7,%ymm1,%ymm0
  1201. addq $1, BI
  1202. addq $8 , %rax
  1203. .endm
  1204. .macro SAVE8x1
  1205. vbroadcastsd ALPHA, %ymm0
  1206. vmulpd %ymm0 , %ymm4 , %ymm4
  1207. vmulpd %ymm0 , %ymm7 , %ymm7
  1208. #if !defined(TRMMKERNEL)
  1209. vaddpd (CO1), %ymm4,%ymm4
  1210. vaddpd 4 * SIZE(CO1), %ymm7,%ymm7
  1211. #endif
  1212. vmovups %ymm4 , (CO1)
  1213. vmovups %ymm7 , 4 * SIZE(CO1)
  1214. .endm
  1215. /*******************************************************************************************/
  1216. .macro KERNEL4x1_1
  1217. vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
  1218. vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
  1219. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1220. .endm
  1221. .macro KERNEL4x1_2
  1222. vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1
  1223. vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0
  1224. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1225. .endm
  1226. .macro KERNEL4x1_3
  1227. vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1
  1228. vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0
  1229. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1230. .endm
  1231. .macro KERNEL4x1_4
  1232. vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1
  1233. vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0
  1234. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1235. addq $4, BI
  1236. addq $16, %rax
  1237. .endm
  1238. .macro KERNEL4x1_SUB
  1239. vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1
  1240. vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0
  1241. VFMADD231PD_ %ymm4,%ymm1,%ymm0
  1242. addq $1, BI
  1243. addq $4 , %rax
  1244. .endm
  1245. .macro SAVE4x1
  1246. vbroadcastsd ALPHA, %ymm0
  1247. vmulpd %ymm0 , %ymm4 , %ymm4
  1248. #if !defined(TRMMKERNEL)
  1249. vaddpd (CO1), %ymm4,%ymm4
  1250. #endif
  1251. vmovups %ymm4 , (CO1)
  1252. .endm
  1253. /*******************************************************************************************/
  1254. .macro KERNEL2x1_1
  1255. vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1
  1256. vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
  1257. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1258. vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
  1259. VFMADD231SD_ %xmm8,%xmm1,%xmm0
  1260. .endm
  1261. .macro KERNEL2x1_2
  1262. vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1
  1263. vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0
  1264. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1265. vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0
  1266. VFMADD231SD_ %xmm8,%xmm1,%xmm0
  1267. .endm
  1268. .macro KERNEL2x1_3
  1269. vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1
  1270. vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0
  1271. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1272. vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0
  1273. VFMADD231SD_ %xmm8,%xmm1,%xmm0
  1274. .endm
  1275. .macro KERNEL2x1_4
  1276. vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1
  1277. vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0
  1278. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1279. vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0
  1280. VFMADD231SD_ %xmm8,%xmm1,%xmm0
  1281. addq $4, BI
  1282. addq $8, %rax
  1283. .endm
  1284. .macro KERNEL2x1_SUB
  1285. vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1
  1286. vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
  1287. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1288. vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
  1289. VFMADD231SD_ %xmm8,%xmm1,%xmm0
  1290. addq $1, BI
  1291. addq $2 , %rax
  1292. .endm
  1293. .macro SAVE2x1
  1294. vmovsd ALPHA, %xmm0
  1295. vmulsd %xmm0 , %xmm4 , %xmm4
  1296. vmulsd %xmm0 , %xmm8 , %xmm8
  1297. #if !defined(TRMMKERNEL)
  1298. vaddsd (CO1), %xmm4,%xmm4
  1299. vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
  1300. #endif
  1301. vmovsd %xmm4 , (CO1)
  1302. vmovsd %xmm8 , 1 * SIZE(CO1)
  1303. .endm
  1304. /*******************************************************************************************/
  1305. .macro KERNEL1x1_1
  1306. vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1
  1307. vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
  1308. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1309. .endm
  1310. .macro KERNEL1x1_2
  1311. vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1
  1312. vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0
  1313. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1314. .endm
  1315. .macro KERNEL1x1_3
  1316. vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1
  1317. vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0
  1318. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1319. .endm
  1320. .macro KERNEL1x1_4
  1321. vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1
  1322. vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0
  1323. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1324. addq $ 4, BI
  1325. addq $ 4, %rax
  1326. .endm
  1327. .macro KERNEL1x1_SUB
  1328. vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1
  1329. vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0
  1330. VFMADD231SD_ %xmm4,%xmm1,%xmm0
  1331. addq $ 1, BI
  1332. addq $ 1 , %rax
  1333. .endm
  1334. .macro SAVE1x1
  1335. vmovsd ALPHA, %xmm0
  1336. vmulsd %xmm0 , %xmm4 , %xmm4
  1337. #if !defined(TRMMKERNEL)
  1338. vaddsd (CO1), %xmm4,%xmm4
  1339. #endif
  1340. vmovsd %xmm4 , (CO1)
  1341. .endm
  1342. /*******************************************************************************************/
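/* End of the compute/store macros.  Throughout them %ymm0/%xmm0 holds packed A
   values, the low registers (%ymm1, %ymm2, ...) hold the broadcast B values,
   and the remaining vector registers are the C accumulators; in the two- and
   one-column kernels %rax indexes into AO and BI indexes into BO.  The GEMM
   driver follows; the #else branch further down holds the TRMM driver. */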
  1343. #if !defined(TRMMKERNEL)
  1344. PROLOGUE
  1345. PROFCODE
  1346. subq $STACKSIZE, %rsp
  1347. movq %rbx, (%rsp)
  1348. movq %rbp, 8(%rsp)
  1349. movq %r12, 16(%rsp)
  1350. movq %r13, 24(%rsp)
  1351. movq %r14, 32(%rsp)
  1352. movq %r15, 40(%rsp)
  1353. vzeroupper
  1354. #ifdef WINDOWS_ABI
  1355. movq %rdi, 48(%rsp)
  1356. movq %rsi, 56(%rsp)
  1357. movups %xmm6, 64(%rsp)
  1358. movups %xmm7, 80(%rsp)
  1359. movups %xmm8, 96(%rsp)
  1360. movups %xmm9, 112(%rsp)
  1361. movups %xmm10, 128(%rsp)
  1362. movups %xmm11, 144(%rsp)
  1363. movups %xmm12, 160(%rsp)
  1364. movups %xmm13, 176(%rsp)
  1365. movups %xmm14, 192(%rsp)
  1366. movups %xmm15, 208(%rsp)
  1367. movq ARG1, OLD_M
  1368. movq ARG2, OLD_N
  1369. movq ARG3, OLD_K
  1370. movq OLD_A, A
  1371. movq OLD_B, B
  1372. movq OLD_C, C
  1373. movq OLD_LDC, LDC
  1374. vmovaps %xmm3, %xmm0
  1375. #else
  1376. movq STACKSIZE + 8(%rsp), LDC
  1377. #endif
  1378. movq %rsp, SP # save old stack
  1379. subq $128 + L_BUFFER_SIZE, %rsp
  1380. andq $-4096, %rsp # align stack
  1381. STACK_TOUCH
  1382. cmpq $0, OLD_M
  1383. je .L999
  1384. cmpq $0, OLD_N
  1385. je .L999
  1386. cmpq $0, OLD_K
  1387. je .L999
  1388. movq OLD_M, M
  1389. movq OLD_N, N
  1390. movq OLD_K, K
  1391. vmovsd %xmm0, ALPHA
  1392. salq $BASE_SHIFT, LDC
  1393. movq N, %rax
  1394. xorq %rdx, %rdx
  1395. movq $6, %rdi
  1396. divq %rdi // N / 6
  1397. movq %rax, Ndiv6 // N / 6
  1398. movq %rdx, Nmod6 // N % 6
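/* Column blocking of the driver below (a summary of the control flow, not code
   from the original): N is consumed in blocks of 6 columns (handled as 3 + 3
   via BUFFER1 and BUFFER2), then the remainder in blocks of 2, then a final
   single column:

       for (j = 0; j < N / 6; j++)       { ... 16x3 / 8x3 / 4x3 / 2x3 / 1x3 ... }
       for (j = 0; j < (N % 6) / 2; j++) { ... 16x2 / 8x2 / 4x2 / 2x2 / 1x2 ... }
       if ((N % 6) & 1)                  { ... 16x1 / 8x1 / 4x1 / 2x1 / 1x1 ... }
*/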
  1399. movq Ndiv6, J
  1400. cmpq $0, J
  1401. je .L2_0
  1402. ALIGN_4
  1403. .L6_01:
  1404. // copy to sub buffer
  1405. movq K, %rax
  1406. salq $1,%rax // K * 2 ; read 2 values
  1407. movq B, BO1
  1408. leaq (B,%rax, SIZE), BO2 // next offset to BO2
  1409. leaq BUFFER1, BO // first buffer to BO
  1410. movq K, %rax
  1411. sarq $3 , %rax // K / 8
  1412. jz .L6_01a_2
  1413. ALIGN_4
  1414. .L6_01a_1:
  1415. prefetcht0 512(BO1)
  1416. prefetcht0 512(BO2)
  1417. prefetchw 512(BO)
  1418. vmovups 0 * SIZE(BO1), %xmm0
  1419. vmovups 2 * SIZE(BO1), %xmm2
  1420. vmovups 4 * SIZE(BO1), %xmm4
  1421. vmovups 6 * SIZE(BO1), %xmm6
  1422. vmovsd 0 * SIZE(BO2), %xmm1
  1423. vmovsd 2 * SIZE(BO2), %xmm3
  1424. vmovsd 4 * SIZE(BO2), %xmm5
  1425. vmovsd 6 * SIZE(BO2), %xmm7
  1426. vmovups %xmm0, 0*SIZE(BO)
  1427. vmovsd %xmm1, 2*SIZE(BO)
  1428. vmovups %xmm2, 3*SIZE(BO)
  1429. vmovsd %xmm3, 5*SIZE(BO)
  1430. vmovups %xmm4, 6*SIZE(BO)
  1431. vmovsd %xmm5, 8*SIZE(BO)
  1432. vmovups %xmm6, 9*SIZE(BO)
  1433. vmovsd %xmm7,11*SIZE(BO)
  1434. addq $ 8*SIZE,BO1
  1435. addq $ 8*SIZE,BO2
  1436. addq $ 12*SIZE,BO
  1437. vmovups 0 * SIZE(BO1), %xmm0
  1438. vmovups 2 * SIZE(BO1), %xmm2
  1439. vmovups 4 * SIZE(BO1), %xmm4
  1440. vmovups 6 * SIZE(BO1), %xmm6
  1441. vmovsd 0 * SIZE(BO2), %xmm1
  1442. vmovsd 2 * SIZE(BO2), %xmm3
  1443. vmovsd 4 * SIZE(BO2), %xmm5
  1444. vmovsd 6 * SIZE(BO2), %xmm7
  1445. vmovups %xmm0, 0*SIZE(BO)
  1446. vmovsd %xmm1, 2*SIZE(BO)
  1447. vmovups %xmm2, 3*SIZE(BO)
  1448. vmovsd %xmm3, 5*SIZE(BO)
  1449. vmovups %xmm4, 6*SIZE(BO)
  1450. vmovsd %xmm5, 8*SIZE(BO)
  1451. vmovups %xmm6, 9*SIZE(BO)
  1452. vmovsd %xmm7,11*SIZE(BO)
  1453. addq $ 8*SIZE,BO1
  1454. addq $ 8*SIZE,BO2
  1455. addq $ 12*SIZE,BO
  1456. decq %rax
  1457. jnz .L6_01a_1
  1458. .L6_01a_2:
  1459. movq K, %rax
  1460. andq $7, %rax // K % 8
  1461. jz .L6_02c
  1462. ALIGN_4
  1463. .L6_02b:
  1464. vmovups 0 * SIZE(BO1), %xmm0
  1465. vmovsd 0 * SIZE(BO2), %xmm2
  1466. vmovups %xmm0, 0*SIZE(BO)
  1467. vmovsd %xmm2, 2*SIZE(BO)
  1468. addq $ 2*SIZE,BO1
  1469. addq $ 2*SIZE,BO2
  1470. addq $ 3*SIZE,BO
  1471. decq %rax
  1472. jnz .L6_02b
  1473. .L6_02c:
  1474. movq K, %rax
  1475. salq $1,%rax // K * 2
  1476. leaq (B,%rax, SIZE), BO1 // next offset to BO1
  1477. leaq (BO1,%rax, SIZE), BO2 // next offset to BO2
  1478. leaq BUFFER2, BO // second buffer to BO
  1479. movq K, %rax
  1480. sarq $3 , %rax // K / 8
  1481. jz .L6_02c_2
  1482. ALIGN_4
  1483. .L6_02c_1:
  1484. prefetcht0 512(BO2)
  1485. prefetchw 512(BO)
  1486. vmovups 0 * SIZE(BO2), %xmm0
  1487. vmovups 2 * SIZE(BO2), %xmm2
  1488. vmovups 4 * SIZE(BO2), %xmm4
  1489. vmovups 6 * SIZE(BO2), %xmm6
  1490. vmovsd 1 * SIZE(BO1), %xmm1
  1491. vmovsd 3 * SIZE(BO1), %xmm3
  1492. vmovsd 5 * SIZE(BO1), %xmm5
  1493. vmovsd 7 * SIZE(BO1), %xmm7
  1494. vmovsd %xmm1, 0*SIZE(BO)
  1495. vmovups %xmm0, 1*SIZE(BO)
  1496. vmovsd %xmm3, 3*SIZE(BO)
  1497. vmovups %xmm2, 4*SIZE(BO)
  1498. vmovsd %xmm5, 6*SIZE(BO)
  1499. vmovups %xmm4, 7*SIZE(BO)
  1500. vmovsd %xmm7, 9*SIZE(BO)
  1501. vmovups %xmm6,10*SIZE(BO)
  1502. addq $8*SIZE,BO1
  1503. addq $8*SIZE,BO2
  1504. addq $12*SIZE,BO
  1505. vmovups 0 * SIZE(BO2), %xmm0
  1506. vmovups 2 * SIZE(BO2), %xmm2
  1507. vmovups 4 * SIZE(BO2), %xmm4
  1508. vmovups 6 * SIZE(BO2), %xmm6
  1509. vmovsd 1 * SIZE(BO1), %xmm1
  1510. vmovsd 3 * SIZE(BO1), %xmm3
  1511. vmovsd 5 * SIZE(BO1), %xmm5
  1512. vmovsd 7 * SIZE(BO1), %xmm7
  1513. vmovsd %xmm1, 0*SIZE(BO)
  1514. vmovups %xmm0, 1*SIZE(BO)
  1515. vmovsd %xmm3, 3*SIZE(BO)
  1516. vmovups %xmm2, 4*SIZE(BO)
  1517. vmovsd %xmm5, 6*SIZE(BO)
  1518. vmovups %xmm4, 7*SIZE(BO)
  1519. vmovsd %xmm7, 9*SIZE(BO)
  1520. vmovups %xmm6,10*SIZE(BO)
  1521. addq $8*SIZE,BO1
  1522. addq $8*SIZE,BO2
  1523. addq $12*SIZE,BO
  1524. decq %rax
  1525. jnz .L6_02c_1
  1526. .L6_02c_2:
  1527. movq K, %rax
  1528. andq $7, %rax // K % 8
  1529. jz .L6_03c
  1530. ALIGN_4
  1531. .L6_03b:
  1532. vmovsd 1*SIZE(BO1), %xmm0
  1533. vmovups 0*SIZE(BO2), %xmm1
  1534. vmovsd %xmm0, 0*SIZE(BO)
  1535. vmovups %xmm1, 1*SIZE(BO)
  1536. addq $2*SIZE,BO1
  1537. addq $2*SIZE,BO2
  1538. addq $3*SIZE,BO
  1539. decq %rax
  1540. jnz .L6_03b
  1541. .L6_03c:
  1542. movq BO2, B // next offset of B
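/* At this point the six B columns of the current block have been repacked
   (assuming B arrives in 2-wide column panels, as the 2*K stride between BO1
   and BO2 suggests): BUFFER1 holds three B values per k (columns 0-2 of the
   block), BUFFER2 the other three (columns 3-5), and B has been advanced past
   all six columns.  BUFFER1 is consumed by the .L6_1x loops, BUFFER2 by .L7_1x. */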
  1543. .L6_10:
  1544. movq C, CO1
  1545. leaq (C, LDC, 2), C
  1546. leaq (C, LDC, 1), C // c += 3 * ldc
  1547. movq A, AO // aoffset = a
  1548. addq $16 * SIZE, AO
  1549. movq M, I
  1550. sarq $4, I // i = (m >> 4)
  1551. je .L6_20
  1552. ALIGN_4
  1553. .L6_11:
  1554. leaq BUFFER1, BO // first buffer to BO
  1555. addq $12 * SIZE, BO
  1556. prefetcht0 (CO1)
  1557. prefetcht0 (CO1,LDC,1)
  1558. prefetcht0 (CO1,LDC,2)
  1559. prefetcht0 64(CO1)
  1560. prefetcht0 64(CO1,LDC,1)
  1561. prefetcht0 64(CO1,LDC,2)
  1562. vzeroall
  1563. movq K, %rax
1564. sarq $1, %rax // K / 2 ; two KERNEL16x3_SUBN per iteration
  1565. je .L6_16
  1566. ALIGN_5
  1567. .L6_12:
  1568. /*
  1569. prefetcht0 B_PR1(BO)
  1570. prefetcht0 B_PR1+64(BO)
  1571. prefetcht0 B_PR1+128(BO)
  1572. */
  1573. KERNEL16x3_SUBN
  1574. KERNEL16x3_SUBN
  1575. /*
  1576. KERNEL16x3_SUBN
  1577. KERNEL16x3_SUBN
  1578. KERNEL16x3_SUBN
  1579. KERNEL16x3_SUBN
  1580. KERNEL16x3_SUBN
  1581. KERNEL16x3_SUBN
  1582. */
  1583. dec %rax
  1584. jne .L6_12
  1585. .L6_16:
  1586. movq K, %rax
  1587. andq $1, %rax # if (k & 1)
  1588. je .L6_19
  1589. ALIGN_4
  1590. .L6_17:
  1591. KERNEL16x3_SUBN
  1592. dec %rax
  1593. jne .L6_17
  1594. ALIGN_4
  1595. .L6_19:
  1596. SAVE16x3
  1597. addq $16 * SIZE, CO1 # coffset += 16
  1598. decq I # i --
  1599. jg .L6_11
  1600. ALIGN_4
  1601. /**************************************************************************
  1602. * Rest of M
  1603. ***************************************************************************/
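/* The remainder rows (M & 15) are handled one bit at a time: if (m & 8) run
   the 8x3 kernel over K, then if (m & 4) the 4x3 kernel, if (m & 2) the 2x3
   kernel, and if (m & 1) the 1x3 kernel, each step advancing CO1 by the rows
   it wrote. */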
  1604. .L6_20:
  1605. // Test rest of M
  1606. testq $15, M
  1607. jz .L7_10 // to next 3 lines of N
  1608. testq $8, M
  1609. jz .L6_21pre
  1610. ALIGN_4
  1611. /**************************************************************************/
  1612. .L6_20_1:
  1613. leaq BUFFER1, BO // first buffer to BO
  1614. addq $12 * SIZE, BO
  1615. vzeroall
  1616. movq K, %rax
  1617. sarq $3, %rax
  1618. je .L6_20_6
  1619. ALIGN_4
  1620. .L6_20_2:
  1621. KERNEL8x3_SUBN
  1622. KERNEL8x3_SUBN
  1623. KERNEL8x3_SUBN
  1624. KERNEL8x3_SUBN
  1625. KERNEL8x3_SUBN
  1626. KERNEL8x3_SUBN
  1627. KERNEL8x3_SUBN
  1628. KERNEL8x3_SUBN
  1629. dec %rax
  1630. jne .L6_20_2
  1631. ALIGN_4
  1632. .L6_20_6:
  1633. movq K, %rax
  1634. andq $7, %rax # if (k & 1)
  1635. je .L6_20_9
  1636. ALIGN_4
  1637. .L6_20_7:
  1638. KERNEL8x3_SUBN
  1639. dec %rax
  1640. jne .L6_20_7
  1641. ALIGN_4
  1642. .L6_20_9:
  1643. SAVE8x3
  1644. addq $8 * SIZE, CO1 # coffset += 8
  1645. ALIGN_4
  1646. /**************************************************************************/
  1647. .L6_21pre:
  1648. testq $4, M
  1649. jz .L6_30
  1650. ALIGN_4
  1651. .L6_21:
  1652. leaq BUFFER1, BO // first buffer to BO
  1653. addq $12 * SIZE, BO
  1654. vzeroall
  1655. movq K, %rax
  1656. sarq $3, %rax
  1657. je .L6_26
  1658. ALIGN_4
  1659. .L6_22:
  1660. KERNEL4x3_SUBN
  1661. KERNEL4x3_SUBN
  1662. KERNEL4x3_SUBN
  1663. KERNEL4x3_SUBN
  1664. KERNEL4x3_SUBN
  1665. KERNEL4x3_SUBN
  1666. KERNEL4x3_SUBN
  1667. KERNEL4x3_SUBN
  1668. dec %rax
  1669. jne .L6_22
  1670. ALIGN_4
  1671. .L6_26:
  1672. movq K, %rax
  1673. andq $7, %rax # if (k & 1)
  1674. je .L6_29
  1675. ALIGN_4
  1676. .L6_27:
  1677. KERNEL4x3_SUBN
  1678. dec %rax
  1679. jne .L6_27
  1680. ALIGN_4
  1681. .L6_29:
  1682. SAVE4x3
  1683. addq $4 * SIZE, CO1 # coffset += 4
  1684. ALIGN_4
  1685. .L6_30:
  1686. testq $2, M
  1687. jz .L6_40
  1688. ALIGN_4
  1689. .L6_31:
  1690. leaq BUFFER1, BO // first buffer to BO
  1691. addq $12 * SIZE, BO
  1692. vzeroall
  1693. movq K, %rax
  1694. sarq $3, %rax
  1695. je .L6_36
  1696. ALIGN_4
  1697. .L6_32:
  1698. KERNEL2x3_SUBN
  1699. KERNEL2x3_SUBN
  1700. KERNEL2x3_SUBN
  1701. KERNEL2x3_SUBN
  1702. KERNEL2x3_SUBN
  1703. KERNEL2x3_SUBN
  1704. KERNEL2x3_SUBN
  1705. KERNEL2x3_SUBN
  1706. dec %rax
  1707. jne .L6_32
  1708. ALIGN_4
  1709. .L6_36:
  1710. movq K, %rax
  1711. andq $7, %rax # if (k & 1)
  1712. je .L6_39
  1713. ALIGN_4
  1714. .L6_37:
  1715. KERNEL2x3_SUBN
  1716. dec %rax
  1717. jne .L6_37
  1718. ALIGN_4
  1719. .L6_39:
  1720. SAVE2x3
  1721. addq $2 * SIZE, CO1 # coffset += 2
  1722. ALIGN_4
  1723. .L6_40:
  1724. testq $1, M
  1725. jz .L7_10 // to next 3 lines of N
  1726. ALIGN_4
  1727. .L6_41:
  1728. leaq BUFFER1, BO // first buffer to BO
  1729. addq $12 * SIZE, BO
  1730. vzeroall
  1731. movq K, %rax
  1732. sarq $3,%rax
  1733. je .L6_46
  1734. ALIGN_4
  1735. .L6_42:
  1736. KERNEL1x3_SUBN
  1737. KERNEL1x3_SUBN
  1738. KERNEL1x3_SUBN
  1739. KERNEL1x3_SUBN
  1740. KERNEL1x3_SUBN
  1741. KERNEL1x3_SUBN
  1742. KERNEL1x3_SUBN
  1743. KERNEL1x3_SUBN
  1744. dec %rax
  1745. jne .L6_42
  1746. ALIGN_4
  1747. .L6_46:
  1748. movq K, %rax
  1749. andq $7, %rax # if (k & 1)
  1750. je .L6_49
  1751. ALIGN_4
  1752. .L6_47:
  1753. KERNEL1x3_SUBN
  1754. dec %rax
  1755. jne .L6_47
  1756. ALIGN_4
  1757. .L6_49:
  1758. SAVE1x3
  1759. addq $1 * SIZE, CO1 # coffset += 1
  1760. ALIGN_4
  1761. /***************************************************************************************************************/
  1762. .L7_10:
  1763. movq C, CO1
  1764. leaq (C, LDC, 2), C
  1765. leaq (C, LDC, 1), C // c += 3 * ldc
  1766. movq A, AO // aoffset = a
  1767. addq $16 * SIZE, AO
  1768. movq M, I
  1769. sarq $4, I // i = (m >> 4)
  1770. je .L7_20
  1771. ALIGN_4
  1772. .L7_11:
  1773. leaq BUFFER2, BO // second buffer to BO
  1774. addq $12 * SIZE, BO
  1775. prefetcht0 (CO1)
  1776. prefetcht0 (CO1,LDC,1)
  1777. prefetcht0 (CO1,LDC,2)
  1778. prefetcht0 64(CO1)
  1779. prefetcht0 64(CO1,LDC,1)
  1780. prefetcht0 64(CO1,LDC,2)
  1781. vzeroall
  1782. movq K, %rax
  1783. sarq $3, %rax // K / 8
  1784. je .L7_16
  1785. ALIGN_5
  1786. .L7_12:
  1787. /*
  1788. prefetcht0 B_PR1(BO)
  1789. prefetcht0 B_PR1+64(BO)
  1790. prefetcht0 B_PR1+128(BO)
  1791. */
  1792. KERNEL16x3_SUBN
  1793. KERNEL16x3_SUBN
  1794. KERNEL16x3_SUBN
  1795. KERNEL16x3_SUBN
  1796. KERNEL16x3_SUBN
  1797. KERNEL16x3_SUBN
  1798. KERNEL16x3_SUBN
  1799. KERNEL16x3_SUBN
  1800. dec %rax
  1801. jne .L7_12
  1802. ALIGN_4
  1803. .L7_16:
  1804. movq K, %rax
  1805. andq $7, %rax # if (k & 1)
  1806. je .L7_19
  1807. ALIGN_5
  1808. .L7_17:
  1809. KERNEL16x3_SUBN
  1810. dec %rax
  1811. jne .L7_17
  1812. .L7_19:
  1813. SAVE16x3
  1814. addq $16 * SIZE, CO1 # coffset += 16
  1815. decq I # i --
  1816. jg .L7_11
  1817. ALIGN_4
  1818. /**************************************************************************
  1819. * Rest of M
  1820. ***************************************************************************/
  1821. .L7_20:
  1822. // Test rest of M
  1823. testq $15, M
  1824. jz .L7_60 // to next 3 lines of N
  1825. testq $8, M
  1826. jz .L7_21pre
  1827. ALIGN_4
  1828. /**************************************************************************/
  1829. .L7_20_1:
1830. leaq BUFFER2, BO // second buffer to BO
  1831. addq $12 * SIZE, BO
  1832. vzeroall
  1833. movq K, %rax
  1834. sarq $3, %rax
  1835. je .L7_20_6
  1836. ALIGN_4
  1837. .L7_20_2:
  1838. KERNEL8x3_SUBN
  1839. KERNEL8x3_SUBN
  1840. KERNEL8x3_SUBN
  1841. KERNEL8x3_SUBN
  1842. KERNEL8x3_SUBN
  1843. KERNEL8x3_SUBN
  1844. KERNEL8x3_SUBN
  1845. KERNEL8x3_SUBN
  1846. dec %rax
  1847. jne .L7_20_2
  1848. ALIGN_4
  1849. .L7_20_6:
  1850. movq K, %rax
  1851. andq $7, %rax # if (k & 1)
  1852. je .L7_20_9
  1853. ALIGN_4
  1854. .L7_20_7:
  1855. KERNEL8x3_SUBN
  1856. dec %rax
  1857. jne .L7_20_7
  1858. ALIGN_4
  1859. .L7_20_9:
  1860. SAVE8x3
  1861. addq $8 * SIZE, CO1 # coffset += 8
  1862. ALIGN_4
  1863. /**************************************************************************/
  1864. .L7_21pre:
  1865. testq $4, M
  1866. jz .L7_30
  1867. ALIGN_4
  1868. .L7_21:
  1869. leaq BUFFER2, BO // second buffer to BO
  1870. addq $12 * SIZE, BO
  1871. vzeroall
  1872. movq K, %rax
  1873. sarq $3, %rax
  1874. je .L7_26
  1875. ALIGN_4
  1876. .L7_22:
  1877. KERNEL4x3_SUBN
  1878. KERNEL4x3_SUBN
  1879. KERNEL4x3_SUBN
  1880. KERNEL4x3_SUBN
  1881. KERNEL4x3_SUBN
  1882. KERNEL4x3_SUBN
  1883. KERNEL4x3_SUBN
  1884. KERNEL4x3_SUBN
  1885. dec %rax
  1886. jne .L7_22
  1887. ALIGN_4
  1888. .L7_26:
  1889. movq K, %rax
  1890. andq $7, %rax # if (k & 1)
  1891. je .L7_29
  1892. ALIGN_4
  1893. .L7_27:
  1894. KERNEL4x3_SUBN
  1895. dec %rax
  1896. jne .L7_27
  1897. ALIGN_4
  1898. .L7_29:
  1899. SAVE4x3
  1900. addq $4 * SIZE, CO1 # coffset += 4
  1901. ALIGN_4
  1902. .L7_30:
  1903. testq $2, M
  1904. jz .L7_40
  1905. ALIGN_4
  1906. .L7_31:
  1907. leaq BUFFER2, BO // second buffer to BO
  1908. addq $12 * SIZE, BO
  1909. vzeroall
  1910. movq K, %rax
  1911. sarq $3, %rax
  1912. je .L7_36
  1913. ALIGN_4
  1914. .L7_32:
  1915. KERNEL2x3_SUBN
  1916. KERNEL2x3_SUBN
  1917. KERNEL2x3_SUBN
  1918. KERNEL2x3_SUBN
  1919. KERNEL2x3_SUBN
  1920. KERNEL2x3_SUBN
  1921. KERNEL2x3_SUBN
  1922. KERNEL2x3_SUBN
  1923. dec %rax
  1924. jne .L7_32
  1925. ALIGN_4
  1926. .L7_36:
  1927. movq K, %rax
  1928. andq $7, %rax # if (k & 1)
  1929. je .L7_39
  1930. ALIGN_4
  1931. .L7_37:
  1932. KERNEL2x3_SUBN
  1933. dec %rax
  1934. jne .L7_37
  1935. ALIGN_4
  1936. .L7_39:
  1937. SAVE2x3
  1938. addq $2 * SIZE, CO1 # coffset += 2
  1939. ALIGN_4
  1940. .L7_40:
  1941. testq $1, M
  1942. jz .L7_60 // to next 3 lines of N
  1943. ALIGN_4
  1944. .L7_41:
  1945. leaq BUFFER2, BO // second buffer to BO
  1946. addq $12 * SIZE, BO
  1947. vzeroall
  1948. movq K, %rax
  1949. sarq $3, %rax
  1950. je .L7_46
  1951. ALIGN_4
  1952. .L7_42:
  1953. KERNEL1x3_SUBN
  1954. KERNEL1x3_SUBN
  1955. KERNEL1x3_SUBN
  1956. KERNEL1x3_SUBN
  1957. KERNEL1x3_SUBN
  1958. KERNEL1x3_SUBN
  1959. KERNEL1x3_SUBN
  1960. KERNEL1x3_SUBN
  1961. dec %rax
  1962. jne .L7_42
  1963. ALIGN_4
  1964. .L7_46:
  1965. movq K, %rax
  1966. andq $7, %rax # if (k & 1)
  1967. je .L7_49
  1968. ALIGN_4
  1969. .L7_47:
  1970. KERNEL1x3_SUBN
  1971. dec %rax
  1972. jne .L7_47
  1973. ALIGN_4
  1974. .L7_49:
  1975. SAVE1x3
  1976. addq $1 * SIZE, CO1 # coffset += 1
  1977. ALIGN_4
  1978. .L7_60:
  1979. decq J // j --
  1980. jg .L6_01
  1981. .L2_0:
  1982. cmpq $0, Nmod6 // N % 6 == 0
  1983. je .L999
  1984. /************************************************************************************************
  1985. * Loop for Nmod6 / 2 > 0
  1986. *************************************************************************************************/
  1987. movq Nmod6, J
  1988. sarq $1, J // j = j / 2
  1989. je .L1_0
  1990. ALIGN_4
  1991. .L2_01:
  1992. // copy to sub buffer
  1993. movq B, BO1
  1994. leaq BUFFER1, BO // first buffer to BO
  1995. movq K, %rax
  1996. sarq $2, %rax // K / 4
  1997. jz .L2_01b
  1998. ALIGN_4
  1999. .L2_01a:
  2000. prefetcht0 512(BO1)
  2001. prefetchw 512(BO)
  2002. vmovups (BO1), %xmm0
  2003. vmovups 2*SIZE(BO1), %xmm1
  2004. vmovups 4*SIZE(BO1), %xmm2
  2005. vmovups 6*SIZE(BO1), %xmm3
  2006. vmovups %xmm0, (BO)
  2007. vmovups %xmm1, 2*SIZE(BO)
  2008. vmovups %xmm2, 4*SIZE(BO)
  2009. vmovups %xmm3, 6*SIZE(BO)
  2010. addq $8*SIZE,BO1
  2011. addq $8*SIZE,BO
  2012. decq %rax
  2013. jnz .L2_01a
  2014. .L2_01b:
  2015. movq K, %rax
  2016. andq $3, %rax // K % 4
  2017. jz .L2_02d
  2018. ALIGN_4
  2019. .L2_02c:
  2020. vmovups (BO1), %xmm0
  2021. vmovups %xmm0, (BO)
  2022. addq $2*SIZE,BO1
  2023. addq $2*SIZE,BO
  2024. decq %rax
  2025. jnz .L2_02c
  2026. .L2_02d:
  2027. movq BO1, B // next offset of B
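// For the 2-column blocks B needs no reshuffling: the loops above simply copy
// K*2 values from B into BUFFER1, four pairs per iteration plus a K%4 tail.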
  2028. .L2_10:
  2029. movq C, CO1
  2030. leaq (C, LDC, 2), C // c += 2 * ldc
  2031. movq A, AO // aoffset = a
  2032. addq $32 * SIZE, AO
  2033. movq M, I
  2034. sarq $4, I // i = (m >> 4)
  2035. je .L2_20
  2036. ALIGN_4
  2037. .L2_11:
  2038. leaq BUFFER1, BO // first buffer to BO
  2039. addq $4 * SIZE, BO
  2040. vzeroall
  2041. movq K, %rax
  2042. andq $-8, %rax // K = K - ( K % 8 )
  2043. je .L2_16
  2044. movq %rax, BI // Index for BO
  2045. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2046. salq $4, %rax // rax = rax * 16 ; number of values
  2047. leaq (AO, %rax, SIZE), AO
  2048. leaq (BO, BI, SIZE), BO
  2049. negq BI
  2050. negq %rax
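/* AO and BO were advanced to the end of the block and %rax/BI negated, so the
   kernel macros address A and B with negative offsets that count up towards
   zero; the addq at the end of each *_4 (or *_SUB) macro sets the flags that
   the following je/jl tests, which is how the unrolled loop below terminates
   without a separate counter. */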
  2051. ALIGN_4
  2052. .L2_12:
  2053. prefetcht0 B_PR1(BO,BI,8)
  2054. KERNEL16x2_1
  2055. KERNEL16x2_2
  2056. KERNEL16x2_3
  2057. KERNEL16x2_4
  2058. prefetcht0 B_PR1(BO,BI,8)
  2059. KERNEL16x2_1
  2060. KERNEL16x2_2
  2061. KERNEL16x2_3
  2062. KERNEL16x2_4
  2063. je .L2_16
  2064. prefetcht0 B_PR1(BO,BI,8)
  2065. KERNEL16x2_1
  2066. KERNEL16x2_2
  2067. KERNEL16x2_3
  2068. KERNEL16x2_4
  2069. prefetcht0 B_PR1(BO,BI,8)
  2070. KERNEL16x2_1
  2071. KERNEL16x2_2
  2072. KERNEL16x2_3
  2073. KERNEL16x2_4
  2074. je .L2_16
  2075. jmp .L2_12
  2076. ALIGN_4
  2077. .L2_16:
  2078. movq K, %rax
  2079. andq $7, %rax # if (k & 1)
  2080. je .L2_19
  2081. movq %rax, BI // Index for BO
  2082. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2083. salq $4, %rax // rax = rax * 16 ; number of values
  2084. leaq (AO, %rax, SIZE), AO
  2085. leaq (BO, BI, SIZE), BO
  2086. negq BI
  2087. negq %rax
  2088. ALIGN_4
  2089. .L2_17:
  2090. KERNEL16x2_SUB
  2091. jl .L2_17
  2092. ALIGN_4
  2093. .L2_19:
  2094. SAVE16x2
  2095. addq $16 * SIZE, CO1 # coffset += 16
  2096. decq I # i --
  2097. jg .L2_11
  2098. ALIGN_4
  2099. /**************************************************************************
  2100. * Rest of M
  2101. ***************************************************************************/
  2102. .L2_20:
  2103. // Test rest of M
  2104. testq $15, M
2105. jz .L2_60 // to next 2 lines of N
  2106. testq $8, M
  2107. jz .L2_21pre
  2108. ALIGN_4
  2109. /**************************************************************************/
  2110. .L2_20_1:
  2111. leaq BUFFER1, BO // first buffer to BO
  2112. addq $4 * SIZE, BO
  2113. vzeroall
  2114. movq K, %rax
  2115. andq $-8, %rax
  2116. je .L2_20_6
  2117. movq %rax, BI // Index for BO
  2118. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2119. salq $3, %rax // rax = rax * 8 ; number of values
  2120. leaq (AO, %rax, SIZE), AO
  2121. leaq (BO, BI, SIZE), BO
  2122. negq BI
  2123. negq %rax
  2124. ALIGN_4
  2125. .L2_20_2:
  2126. prefetcht0 B_PR1(BO,BI,8)
  2127. KERNEL8x2_1
  2128. KERNEL8x2_2
  2129. KERNEL8x2_3
  2130. KERNEL8x2_4
  2131. prefetcht0 B_PR1(BO,BI,8)
  2132. KERNEL8x2_1
  2133. KERNEL8x2_2
  2134. KERNEL8x2_3
  2135. KERNEL8x2_4
  2136. je .L2_20_6
  2137. prefetcht0 B_PR1(BO,BI,8)
  2138. KERNEL8x2_1
  2139. KERNEL8x2_2
  2140. KERNEL8x2_3
  2141. KERNEL8x2_4
  2142. prefetcht0 B_PR1(BO,BI,8)
  2143. KERNEL8x2_1
  2144. KERNEL8x2_2
  2145. KERNEL8x2_3
  2146. KERNEL8x2_4
  2147. je .L2_20_6
  2148. jmp .L2_20_2
  2149. ALIGN_4
  2150. .L2_20_6:
  2151. movq K, %rax
  2152. andq $7, %rax # if (k & 1)
  2153. je .L2_20_9
  2154. movq %rax, BI // Index for BO
  2155. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2156. salq $3, %rax // rax = rax * 8 ; number of values
  2157. leaq (AO, %rax, SIZE), AO
  2158. leaq (BO, BI, SIZE), BO
  2159. negq BI
  2160. negq %rax
  2161. ALIGN_4
  2162. .L2_20_7:
  2163. KERNEL8x2_SUB
  2164. jl .L2_20_7
  2165. ALIGN_4
  2166. .L2_20_9:
  2167. SAVE8x2
  2168. addq $8 * SIZE, CO1 # coffset += 8
  2169. ALIGN_4
  2170. /**************************************************************************/
  2171. .L2_21pre:
  2172. testq $4, M
  2173. jz .L2_30
  2174. ALIGN_4
  2175. .L2_21:
  2176. leaq BUFFER1, BO // first buffer to BO
  2177. addq $4 * SIZE, BO
  2178. vzeroall
  2179. movq K, %rax
  2180. andq $-8, %rax
  2181. je .L2_26
  2182. movq %rax, BI // Index for BO
2183. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2184. salq $2, %rax // rax = rax * 4 ; number of values
  2185. leaq (AO, %rax, SIZE), AO
  2186. leaq (BO, BI, SIZE), BO
  2187. negq BI
  2188. negq %rax
  2189. ALIGN_4
  2190. .L2_22:
  2191. prefetcht0 B_PR1(BO,BI,8)
  2192. KERNEL4x2_1
  2193. KERNEL4x2_2
  2194. KERNEL4x2_3
  2195. KERNEL4x2_4
  2196. prefetcht0 B_PR1(BO,BI,8)
  2197. KERNEL4x2_1
  2198. KERNEL4x2_2
  2199. KERNEL4x2_3
  2200. KERNEL4x2_4
  2201. je .L2_26
  2202. prefetcht0 B_PR1(BO,BI,8)
  2203. KERNEL4x2_1
  2204. KERNEL4x2_2
  2205. KERNEL4x2_3
  2206. KERNEL4x2_4
  2207. prefetcht0 B_PR1(BO,BI,8)
  2208. KERNEL4x2_1
  2209. KERNEL4x2_2
  2210. KERNEL4x2_3
  2211. KERNEL4x2_4
  2212. je .L2_26
  2213. jmp .L2_22
  2214. ALIGN_4
  2215. .L2_26:
  2216. movq K, %rax
  2217. andq $7, %rax # if (k & 1)
  2218. je .L2_29
  2219. movq %rax, BI // Index for BO
  2220. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2221. salq $2, %rax // rax = rax * 4 ; number of values
  2222. leaq (AO, %rax, SIZE), AO
  2223. leaq (BO, BI, SIZE), BO
  2224. negq BI
  2225. negq %rax
  2226. ALIGN_4
  2227. .L2_27:
  2228. KERNEL4x2_SUB
  2229. jl .L2_27
  2230. ALIGN_4
  2231. .L2_29:
  2232. SAVE4x2
  2233. addq $4 * SIZE, CO1 # coffset += 4
  2234. ALIGN_4
  2235. .L2_30:
  2236. testq $2, M
  2237. jz .L2_40
  2238. ALIGN_4
  2239. .L2_31:
  2240. leaq BUFFER1, BO // first buffer to BO
  2241. addq $4 * SIZE, BO
  2242. vzeroall
  2243. movq K, %rax
  2244. andq $-8, %rax
  2245. je .L2_36
  2246. movq %rax, BI // Index for BO
  2247. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2248. salq $1, %rax // rax = rax *2 ; number of values
  2249. leaq (AO, %rax, SIZE), AO
  2250. leaq (BO, BI, SIZE), BO
  2251. negq BI
  2252. negq %rax
  2253. ALIGN_4
  2254. .L2_32:
  2255. KERNEL2x2_1
  2256. KERNEL2x2_2
  2257. KERNEL2x2_3
  2258. KERNEL2x2_4
  2259. KERNEL2x2_1
  2260. KERNEL2x2_2
  2261. KERNEL2x2_3
  2262. KERNEL2x2_4
  2263. je .L2_36
  2264. KERNEL2x2_1
  2265. KERNEL2x2_2
  2266. KERNEL2x2_3
  2267. KERNEL2x2_4
  2268. KERNEL2x2_1
  2269. KERNEL2x2_2
  2270. KERNEL2x2_3
  2271. KERNEL2x2_4
  2272. je .L2_36
  2273. jmp .L2_32
  2274. ALIGN_4
  2275. .L2_36:
  2276. movq K, %rax
  2277. andq $7, %rax # if (k & 1)
  2278. je .L2_39
  2279. movq %rax, BI // Index for BO
  2280. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2281. salq $1, %rax // rax = rax *2 ; number of values
  2282. leaq (AO, %rax, SIZE), AO
  2283. leaq (BO, BI, SIZE), BO
  2284. negq BI
  2285. negq %rax
  2286. ALIGN_4
  2287. .L2_37:
  2288. KERNEL2x2_SUB
  2289. jl .L2_37
  2290. ALIGN_4
  2291. .L2_39:
  2292. SAVE2x2
  2293. addq $2 * SIZE, CO1 # coffset += 2
  2294. ALIGN_4
  2295. .L2_40:
  2296. testq $1, M
  2297. jz .L2_60 // to next 2 lines of N
  2298. ALIGN_4
  2299. .L2_41:
  2300. leaq BUFFER1, BO // first buffer to BO
  2301. addq $4 * SIZE, BO
  2302. vzeroall
  2303. movq K, %rax
  2304. andq $-8, %rax
  2305. je .L2_46
  2306. movq %rax, BI // Index for BO
  2307. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2308. leaq (AO, %rax, SIZE), AO
  2309. leaq (BO, BI, SIZE), BO
  2310. negq BI
  2311. negq %rax
  2312. ALIGN_4
  2313. .L2_42:
  2314. KERNEL1x2_1
  2315. KERNEL1x2_2
  2316. KERNEL1x2_3
  2317. KERNEL1x2_4
  2318. KERNEL1x2_1
  2319. KERNEL1x2_2
  2320. KERNEL1x2_3
  2321. KERNEL1x2_4
  2322. je .L2_46
  2323. KERNEL1x2_1
  2324. KERNEL1x2_2
  2325. KERNEL1x2_3
  2326. KERNEL1x2_4
  2327. KERNEL1x2_1
  2328. KERNEL1x2_2
  2329. KERNEL1x2_3
  2330. KERNEL1x2_4
  2331. je .L2_46
  2332. jmp .L2_42
  2333. ALIGN_4
  2334. .L2_46:
  2335. movq K, %rax
  2336. andq $7, %rax # if (k & 1)
  2337. je .L2_49
  2338. movq %rax, BI // Index for BO
  2339. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2340. leaq (AO, %rax, SIZE), AO
  2341. leaq (BO, BI, SIZE), BO
  2342. negq BI
  2343. negq %rax
  2344. ALIGN_4
  2345. .L2_47:
  2346. KERNEL1x2_SUB
  2347. jl .L2_47
  2348. ALIGN_4
  2349. .L2_49:
  2350. SAVE1x2
  2351. addq $1 * SIZE, CO1 # coffset += 1
  2352. ALIGN_4
  2353. .L2_60:
  2354. decq J // j --
  2355. jg .L2_01 // next 2 lines of N
  2356. .L1_0:
  2357. /************************************************************************************************
  2358. * Loop for Nmod6 % 2 > 0
  2359. *************************************************************************************************/
  2360. movq Nmod6, J
  2361. andq $1, J // j % 2
  2362. je .L999
  2363. ALIGN_4
  2364. .L1_01:
  2365. // copy to sub buffer
  2366. movq B, BO1
  2367. leaq BUFFER1, BO // first buffer to BO
  2368. movq K, %rax
  2369. ALIGN_4
  2370. .L1_02b:
  2371. vmovsd (BO1), %xmm0
  2372. vmovsd %xmm0, (BO)
  2373. addq $1*SIZE,BO1
  2374. addq $1*SIZE,BO
  2375. decq %rax
  2376. jnz .L1_02b
  2377. .L1_02c:
  2378. movq BO1, B // next offset of B
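// Last single column of B: copied one value per k into BUFFER1, no interleaving needed.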
  2379. .L1_10:
  2380. movq C, CO1
  2381. leaq (C, LDC, 1), C // c += 1 * ldc
  2382. movq A, AO // aoffset = a
  2383. addq $32 * SIZE, AO
  2384. movq M, I
  2385. sarq $4, I // i = (m >> 4)
  2386. je .L1_20
  2387. ALIGN_4
  2388. .L1_11:
  2389. leaq BUFFER1, BO // first buffer to BO
  2390. addq $2 * SIZE, BO
  2391. vzeroall
  2392. movq K, %rax
  2393. andq $-8, %rax // K = K - ( K % 8 )
  2394. je .L1_16
  2395. movq %rax, BI // Index for BO
  2396. salq $4, %rax // rax = rax * 16 ; number of values
  2397. leaq (AO, %rax, SIZE), AO
  2398. leaq (BO, BI, SIZE), BO
  2399. negq BI
  2400. negq %rax
  2401. ALIGN_4
  2402. .L1_12:
  2403. prefetcht0 B_PR1(BO,BI,8)
  2404. KERNEL16x1_1
  2405. KERNEL16x1_2
  2406. KERNEL16x1_3
  2407. KERNEL16x1_4
  2408. KERNEL16x1_1
  2409. KERNEL16x1_2
  2410. KERNEL16x1_3
  2411. KERNEL16x1_4
  2412. je .L1_16
  2413. prefetcht0 B_PR1(BO,BI,8)
  2414. KERNEL16x1_1
  2415. KERNEL16x1_2
  2416. KERNEL16x1_3
  2417. KERNEL16x1_4
  2418. KERNEL16x1_1
  2419. KERNEL16x1_2
  2420. KERNEL16x1_3
  2421. KERNEL16x1_4
  2422. je .L1_16
  2423. jmp .L1_12
  2424. ALIGN_4
  2425. .L1_16:
  2426. movq K, %rax
  2427. andq $7, %rax # if (k & 1)
  2428. je .L1_19
  2429. movq %rax, BI // Index for BO
  2430. salq $4, %rax // rax = rax * 16 ; number of values
  2431. leaq (AO, %rax, SIZE), AO
  2432. leaq (BO, BI, SIZE), BO
  2433. negq BI
  2434. negq %rax
  2435. ALIGN_4
  2436. .L1_17:
  2437. KERNEL16x1_SUB
  2438. jl .L1_17
  2439. ALIGN_4
  2440. .L1_19:
  2441. SAVE16x1
  2442. addq $16 * SIZE, CO1 # coffset += 16
  2443. decq I # i --
  2444. jg .L1_11
  2445. ALIGN_4
  2446. /**************************************************************************
  2447. * Rest of M
  2448. ***************************************************************************/
  2449. .L1_20:
  2450. // Test rest of M
  2451. testq $15, M
  2452. jz .L999
  2453. testq $8, M
  2454. jz .L1_21pre
  2455. ALIGN_4
  2456. /**************************************************************************/
  2457. .L1_20_1:
  2458. leaq BUFFER1, BO // first buffer to BO
  2459. addq $2 * SIZE, BO
  2460. vzeroall
  2461. movq K, %rax
  2462. andq $-8, %rax
  2463. je .L1_20_6
  2464. movq %rax, BI // Index for BO
  2465. salq $3, %rax // rax = rax * 8 ; number of values
  2466. leaq (AO, %rax, SIZE), AO
  2467. leaq (BO, BI, SIZE), BO
  2468. negq BI
  2469. negq %rax
  2470. ALIGN_4
  2471. .L1_20_2:
  2472. prefetcht0 B_PR1(BO,BI,8)
  2473. KERNEL8x1_1
  2474. KERNEL8x1_2
  2475. KERNEL8x1_3
  2476. KERNEL8x1_4
  2477. KERNEL8x1_1
  2478. KERNEL8x1_2
  2479. KERNEL8x1_3
  2480. KERNEL8x1_4
  2481. je .L1_20_6
  2482. prefetcht0 B_PR1(BO,BI,8)
  2483. KERNEL8x1_1
  2484. KERNEL8x1_2
  2485. KERNEL8x1_3
  2486. KERNEL8x1_4
  2487. KERNEL8x1_1
  2488. KERNEL8x1_2
  2489. KERNEL8x1_3
  2490. KERNEL8x1_4
  2491. je .L1_20_6
  2492. jmp .L1_20_2
  2493. ALIGN_4
  2494. .L1_20_6:
  2495. movq K, %rax
  2496. andq $7, %rax # if (k & 1)
  2497. je .L1_20_9
  2498. movq %rax, BI // Index for BO
  2499. salq $3, %rax // rax = rax * 8 ; number of values
  2500. leaq (AO, %rax, SIZE), AO
  2501. leaq (BO, BI, SIZE), BO
  2502. negq BI
  2503. negq %rax
  2504. ALIGN_4
  2505. .L1_20_7:
  2506. KERNEL8x1_SUB
  2507. jl .L1_20_7
  2508. ALIGN_4
  2509. .L1_20_9:
  2510. SAVE8x1
  2511. addq $8 * SIZE, CO1 # coffset += 8
  2512. ALIGN_4
  2513. /**************************************************************************/
  2514. .L1_21pre:
  2515. testq $4, M
  2516. jz .L1_30
  2517. ALIGN_4
  2518. .L1_21:
  2519. leaq BUFFER1, BO // first buffer to BO
  2520. addq $2 * SIZE, BO
  2521. vzeroall
  2522. movq K, %rax
  2523. andq $-8, %rax
  2524. je .L1_26
  2525. movq %rax, BI // Index for BO
  2526. salq $2, %rax // rax = rax * 4 ; number of values
  2527. leaq (AO, %rax, SIZE), AO
  2528. leaq (BO, BI, SIZE), BO
  2529. negq BI
  2530. negq %rax
  2531. ALIGN_4
  2532. .L1_22:
  2533. prefetcht0 B_PR1(BO,BI,8)
  2534. KERNEL4x1_1
  2535. KERNEL4x1_2
  2536. KERNEL4x1_3
  2537. KERNEL4x1_4
  2538. KERNEL4x1_1
  2539. KERNEL4x1_2
  2540. KERNEL4x1_3
  2541. KERNEL4x1_4
  2542. je .L1_26
  2543. prefetcht0 B_PR1(BO,BI,8)
  2544. KERNEL4x1_1
  2545. KERNEL4x1_2
  2546. KERNEL4x1_3
  2547. KERNEL4x1_4
  2548. KERNEL4x1_1
  2549. KERNEL4x1_2
  2550. KERNEL4x1_3
  2551. KERNEL4x1_4
  2552. je .L1_26
  2553. jmp .L1_22
  2554. ALIGN_4
  2555. .L1_26:
  2556. movq K, %rax
  2557. andq $7, %rax # if (k & 1)
  2558. je .L1_29
  2559. movq %rax, BI // Index for BO
  2560. salq $2, %rax // rax = rax * 4 ; number of values
  2561. leaq (AO, %rax, SIZE), AO
  2562. leaq (BO, BI, SIZE), BO
  2563. negq BI
  2564. negq %rax
  2565. ALIGN_4
  2566. .L1_27:
  2567. KERNEL4x1_SUB
  2568. jl .L1_27
  2569. ALIGN_4
  2570. .L1_29:
  2571. SAVE4x1
  2572. addq $4 * SIZE, CO1 # coffset += 4
  2573. ALIGN_4
  2574. .L1_30:
  2575. testq $2, M
  2576. jz .L1_40
  2577. ALIGN_4
  2578. .L1_31:
  2579. leaq BUFFER1, BO // first buffer to BO
  2580. addq $2 * SIZE, BO
  2581. vzeroall
  2582. movq K, %rax
  2583. andq $-8, %rax
  2584. je .L1_36
  2585. movq %rax, BI // Index for BO
  2586. salq $1, %rax // rax = rax *2 ; number of values
  2587. leaq (AO, %rax, SIZE), AO
  2588. leaq (BO, BI, SIZE), BO
  2589. negq BI
  2590. negq %rax
  2591. ALIGN_4
  2592. .L1_32:
  2593. KERNEL2x1_1
  2594. KERNEL2x1_2
  2595. KERNEL2x1_3
  2596. KERNEL2x1_4
  2597. KERNEL2x1_1
  2598. KERNEL2x1_2
  2599. KERNEL2x1_3
  2600. KERNEL2x1_4
  2601. je .L1_36
  2602. KERNEL2x1_1
  2603. KERNEL2x1_2
  2604. KERNEL2x1_3
  2605. KERNEL2x1_4
  2606. KERNEL2x1_1
  2607. KERNEL2x1_2
  2608. KERNEL2x1_3
  2609. KERNEL2x1_4
  2610. je .L1_36
  2611. jmp .L1_32
  2612. ALIGN_4
  2613. .L1_36:
  2614. movq K, %rax
  2615. andq $7, %rax # if (k & 1)
  2616. je .L1_39
  2617. movq %rax, BI // Index for BO
  2618. salq $1, %rax // rax = rax *2 ; number of values
  2619. leaq (AO, %rax, SIZE), AO
  2620. leaq (BO, BI, SIZE), BO
  2621. negq BI
  2622. negq %rax
  2623. ALIGN_4
  2624. .L1_37:
  2625. KERNEL2x1_SUB
  2626. jl .L1_37
  2627. ALIGN_4
  2628. .L1_39:
  2629. SAVE2x1
  2630. addq $2 * SIZE, CO1 # coffset += 2
  2631. ALIGN_4
  2632. .L1_40:
  2633. testq $1, M
  2634. jz .L999
  2635. ALIGN_4
  2636. .L1_41:
  2637. leaq BUFFER1, BO // first buffer to BO
  2638. addq $2 * SIZE, BO
  2639. vzeroall
  2640. movq K, %rax
  2641. andq $-8, %rax
  2642. je .L1_46
  2643. movq %rax, BI // Index for BO
  2644. leaq (AO, %rax, SIZE), AO
  2645. leaq (BO, BI, SIZE), BO
  2646. negq BI
  2647. negq %rax
  2648. ALIGN_4
  2649. .L1_42:
  2650. KERNEL1x1_1
  2651. KERNEL1x1_2
  2652. KERNEL1x1_3
  2653. KERNEL1x1_4
  2654. KERNEL1x1_1
  2655. KERNEL1x1_2
  2656. KERNEL1x1_3
  2657. KERNEL1x1_4
  2658. je .L1_46
  2659. KERNEL1x1_1
  2660. KERNEL1x1_2
  2661. KERNEL1x1_3
  2662. KERNEL1x1_4
  2663. KERNEL1x1_1
  2664. KERNEL1x1_2
  2665. KERNEL1x1_3
  2666. KERNEL1x1_4
  2667. je .L1_46
  2668. jmp .L1_42
  2669. ALIGN_4
  2670. .L1_46:
  2671. movq K, %rax
  2672. andq $7, %rax # if (k & 1)
  2673. je .L1_49
  2674. movq %rax, BI // Index for BO
  2675. leaq (AO, %rax, SIZE), AO
  2676. leaq (BO, BI, SIZE), BO
  2677. negq BI
  2678. negq %rax
  2679. ALIGN_4
  2680. .L1_47:
  2681. KERNEL1x1_SUB
  2682. jl .L1_47
  2683. ALIGN_4
  2684. .L1_49:
  2685. SAVE1x1
  2686. addq $1 * SIZE, CO1 # coffset += 1
  2687. ALIGN_4
  2688. .L999:
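// Common exit: restore the stack pointer saved in SP and the callee-saved registers, then return.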
  2689. movq SP, %rsp
  2690. movq (%rsp), %rbx
  2691. movq 8(%rsp), %rbp
  2692. movq 16(%rsp), %r12
  2693. movq 24(%rsp), %r13
  2694. movq 32(%rsp), %r14
  2695. movq 40(%rsp), %r15
  2696. #ifdef WINDOWS_ABI
  2697. movq 48(%rsp), %rdi
  2698. movq 56(%rsp), %rsi
  2699. movups 64(%rsp), %xmm6
  2700. movups 80(%rsp), %xmm7
  2701. movups 96(%rsp), %xmm8
  2702. movups 112(%rsp), %xmm9
  2703. movups 128(%rsp), %xmm10
  2704. movups 144(%rsp), %xmm11
  2705. movups 160(%rsp), %xmm12
  2706. movups 176(%rsp), %xmm13
  2707. movups 192(%rsp), %xmm14
  2708. movups 208(%rsp), %xmm15
  2709. #endif
  2710. addq $STACKSIZE, %rsp
  2711. ret
  2712. EPILOGUE
  2713. #else
  2714. /*************************************************************************************
  2715. * TRMM Kernel
  2716. *************************************************************************************/
  2717. PROLOGUE
  2718. PROFCODE
  2719. subq $STACKSIZE, %rsp
  2720. movq %rbx, (%rsp)
  2721. movq %rbp, 8(%rsp)
  2722. movq %r12, 16(%rsp)
  2723. movq %r13, 24(%rsp)
  2724. movq %r14, 32(%rsp)
  2725. movq %r15, 40(%rsp)
  2726. vzeroupper
  2727. #ifdef WINDOWS_ABI
  2728. movq %rdi, 48(%rsp)
  2729. movq %rsi, 56(%rsp)
  2730. movups %xmm6, 64(%rsp)
  2731. movups %xmm7, 80(%rsp)
  2732. movups %xmm8, 96(%rsp)
  2733. movups %xmm9, 112(%rsp)
  2734. movups %xmm10, 128(%rsp)
  2735. movups %xmm11, 144(%rsp)
  2736. movups %xmm12, 160(%rsp)
  2737. movups %xmm13, 176(%rsp)
  2738. movups %xmm14, 192(%rsp)
  2739. movups %xmm15, 208(%rsp)
  2740. movq ARG1, OLD_M
  2741. movq ARG2, OLD_N
  2742. movq ARG3, OLD_K
  2743. movq OLD_A, A
  2744. movq OLD_B, B
  2745. movq OLD_C, C
  2746. movq OLD_LDC, LDC
  2747. #ifdef TRMMKERNEL
  2748. movsd OLD_OFFSET, %xmm12
  2749. #endif
  2750. vmovaps %xmm3, %xmm0
  2751. #else
  2752. movq STACKSIZE + 8(%rsp), LDC
  2753. #ifdef TRMMKERNEL
  2754. movsd STACKSIZE + 16(%rsp), %xmm12
  2755. #endif
  2756. #endif
  2757. movq %rsp, SP # save old stack
  2758. subq $128 + L_BUFFER_SIZE, %rsp
  2759. andq $-4096, %rsp # align stack
  2760. STACK_TOUCH
  2761. cmpq $0, OLD_M
  2762. je .L999
  2763. cmpq $0, OLD_N
  2764. je .L999
  2765. cmpq $0, OLD_K
  2766. je .L999
  2767. movq OLD_M, M
  2768. movq OLD_N, N
  2769. movq OLD_K, K
  2770. vmovsd %xmm0, ALPHA
  2771. salq $BASE_SHIFT, LDC
  2772. movq N, %rax
  2773. xorq %rdx, %rdx
  2774. movq $2, %rdi
2775. divq %rdi // N / 2
2776. movq %rax, Ndiv6 // N / 2
2777. movq %rdx, Nmod6 // N % 2
  2778. #ifdef TRMMKERNEL
  2779. vmovsd %xmm12, OFFSET
  2780. vmovsd %xmm12, KK
  2781. #ifndef LEFT
  2782. negq KK
  2783. #endif
  2784. #endif
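/* TRMM driver: unlike the GEMM path above, N is blocked only by 2 here, so
   Ndiv6/Nmod6 actually hold N/2 and N%2.  OFFSET/KK track the diagonal offset,
   i.e. how many of the K iterations each tile really multiplies; KKK below
   receives the per-tile trip count. */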
  2785. movq Ndiv6, J
  2786. cmpq $0, J
  2787. je .L1_0
  2788. ALIGN_4
  2789. .L2_01:
  2790. // copy to sub buffer
  2791. movq B, BO1
  2792. leaq BUFFER1, BO // first buffer to BO
  2793. movq K, %rax
  2794. sarq $2, %rax // K / 4
  2795. jz .L2_01b
  2796. ALIGN_4
  2797. .L2_01a:
  2798. prefetcht0 512(BO1)
  2799. prefetchw 512(BO)
  2800. vmovups (BO1), %xmm0
  2801. vmovups 2*SIZE(BO1), %xmm1
  2802. vmovups 4*SIZE(BO1), %xmm2
  2803. vmovups 6*SIZE(BO1), %xmm3
  2804. vmovups %xmm0, (BO)
  2805. vmovups %xmm1, 2*SIZE(BO)
  2806. vmovups %xmm2, 4*SIZE(BO)
  2807. vmovups %xmm3, 6*SIZE(BO)
  2808. addq $8*SIZE,BO1
  2809. addq $8*SIZE,BO
  2810. decq %rax
  2811. jnz .L2_01a
  2812. .L2_01b:
  2813. movq K, %rax
  2814. andq $3, %rax // K % 4
  2815. jz .L2_02d
  2816. ALIGN_4
  2817. .L2_02c:
  2818. vmovups (BO1), %xmm0
  2819. vmovups %xmm0, (BO)
  2820. addq $2*SIZE,BO1
  2821. addq $2*SIZE,BO
  2822. decq %rax
  2823. jnz .L2_02c
  2824. .L2_02d:
  2825. movq BO1, B // next offset of B
  2826. .L2_10:
  2827. movq C, CO1
  2828. leaq (C, LDC, 2), C // c += 2 * ldc
  2829. #if defined(TRMMKERNEL) && defined(LEFT)
  2830. movq OFFSET, %rax
  2831. movq %rax, KK
  2832. #endif
  2833. movq A, AO // aoffset = a
  2834. addq $32 * SIZE, AO
  2835. movq M, I
  2836. sarq $4, I // i = (m >> 4)
  2837. je .L2_20
  2838. ALIGN_4
  2839. .L2_11:
  2840. #if !defined(TRMMKERNEL) || \
  2841. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2842. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2843. leaq BUFFER1, BO // first buffer to BO
  2844. addq $4 * SIZE, BO
  2845. #else
  2846. movq KK, %rax
  2847. leaq BUFFER1, BO // first buffer to BO
  2848. addq $4 * SIZE, BO
  2849. movq %rax, BI // Index for BO
  2850. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2851. leaq (BO, BI, SIZE), BO
  2852. salq $4, %rax // rax = rax * 16 ; number of values
  2853. leaq (AO, %rax, SIZE), AO
  2854. #endif
  2855. vzeroall
  2856. #ifndef TRMMKERNEL
  2857. movq K, %rax
  2858. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2859. movq K, %rax
  2860. subq KK, %rax
  2861. movq %rax, KKK
  2862. #else
  2863. movq KK, %rax
  2864. #ifdef LEFT
  2865. addq $16, %rax // number of values in AO
  2866. #else
  2867. addq $2, %rax // number of values in BO
  2868. #endif
  2869. movq %rax, KKK
  2870. #endif
  2871. andq $-8, %rax // K = K - ( K % 8 )
  2872. je .L2_16
  2873. movq %rax, BI // Index for BO
  2874. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2875. salq $4, %rax // rax = rax * 16 ; number of values
  2876. leaq (AO, %rax, SIZE), AO
  2877. leaq (BO, BI, SIZE), BO
  2878. negq BI
  2879. negq %rax
  2880. ALIGN_4
  2881. .L2_12:
  2882. prefetcht0 B_PR1(BO,BI,8)
  2883. KERNEL16x2_1
  2884. KERNEL16x2_2
  2885. KERNEL16x2_3
  2886. KERNEL16x2_4
  2887. prefetcht0 B_PR1(BO,BI,8)
  2888. KERNEL16x2_1
  2889. KERNEL16x2_2
  2890. KERNEL16x2_3
  2891. KERNEL16x2_4
  2892. je .L2_16
  2893. prefetcht0 B_PR1(BO,BI,8)
  2894. KERNEL16x2_1
  2895. KERNEL16x2_2
  2896. KERNEL16x2_3
  2897. KERNEL16x2_4
  2898. prefetcht0 B_PR1(BO,BI,8)
  2899. KERNEL16x2_1
  2900. KERNEL16x2_2
  2901. KERNEL16x2_3
  2902. KERNEL16x2_4
  2903. je .L2_16
  2904. jmp .L2_12
  2905. ALIGN_4
  2906. .L2_16:
  2907. #ifndef TRMMKERNEL
  2908. movq K, %rax
  2909. #else
  2910. movq KKK, %rax
  2911. #endif
  2912. andq $7, %rax # if (k & 1)
  2913. je .L2_19
  2914. movq %rax, BI // Index for BO
  2915. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2916. salq $4, %rax // rax = rax * 16 ; number of values
  2917. leaq (AO, %rax, SIZE), AO
  2918. leaq (BO, BI, SIZE), BO
  2919. negq BI
  2920. negq %rax
  2921. ALIGN_4
  2922. .L2_17:
  2923. KERNEL16x2_SUB
  2924. jl .L2_17
  2925. ALIGN_4
  2926. .L2_19:
  2927. SAVE16x2
  2928. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2929. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2930. movq K, %rax
  2931. subq KKK, %rax
  2932. movq %rax, BI // Index for BO
  2933. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2934. leaq (BO, BI, SIZE), BO
  2935. salq $4, %rax // rax = rax * 16 ; number of values
  2936. leaq (AO, %rax, SIZE), AO
  2937. #endif
  2938. #if defined(TRMMKERNEL) && defined(LEFT)
  2939. addq $16, KK
  2940. #endif
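/* TRMM bookkeeping after the 16x2 tile: when the tile did not traverse all of
   K, AO and BO are advanced past the untouched remainder (2*(K - KKK) values
   of B and 16*(K - KKK) values of A), and with LEFT defined KK grows by the 16
   rows just processed so the next row block starts at the right diagonal
   position. */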
  2941. addq $16 * SIZE, CO1 # coffset += 16
  2942. decq I # i --
  2943. jg .L2_11
  2944. ALIGN_4
  2945. /**************************************************************************
  2946. * Rest of M
  2947. ***************************************************************************/
  2948. .L2_20:
  2949. // Test rest of M
  2950. testq $15, M
2951. jz .L2_60 // to next 2 lines of N
  2952. testq $8, M
  2953. jz .L2_21pre
  2954. ALIGN_4
  2955. /**************************************************************************/
  2956. .L2_20_1:
  2957. #if !defined(TRMMKERNEL) || \
  2958. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2959. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2960. leaq BUFFER1, BO // first buffer to BO
  2961. addq $4 * SIZE, BO
  2962. #else
  2963. movq KK, %rax
  2964. leaq BUFFER1, BO // first buffer to BO
  2965. addq $4 * SIZE, BO
  2966. movq %rax, BI // Index for BO
  2967. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2968. leaq (BO, BI, SIZE), BO
  2969. salq $3, %rax // rax = rax * 8 ; number of values
  2970. leaq (AO, %rax, SIZE), AO
  2971. #endif
  2972. vzeroall
  2973. #ifndef TRMMKERNEL
  2974. movq K, %rax
  2975. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2976. movq K, %rax
  2977. subq KK, %rax
  2978. movq %rax, KKK
  2979. #else
  2980. movq KK, %rax
  2981. #ifdef LEFT
  2982. addq $8, %rax // number of values in A
  2983. #else
  2984. addq $2, %rax // number of values in BO
  2985. #endif
  2986. movq %rax, KKK
  2987. #endif
  2988. andq $-8, %rax
  2989. je .L2_20_6
  2990. movq %rax, BI // Index for BO
  2991. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2992. salq $3, %rax // rax = rax * 8 ; number of values
  2993. leaq (AO, %rax, SIZE), AO
  2994. leaq (BO, BI, SIZE), BO
  2995. negq BI
  2996. negq %rax
  2997. ALIGN_4
  2998. .L2_20_2:
  2999. prefetcht0 B_PR1(BO,BI,8)
  3000. KERNEL8x2_1
  3001. KERNEL8x2_2
  3002. KERNEL8x2_3
  3003. KERNEL8x2_4
  3004. prefetcht0 B_PR1(BO,BI,8)
  3005. KERNEL8x2_1
  3006. KERNEL8x2_2
  3007. KERNEL8x2_3
  3008. KERNEL8x2_4
  3009. je .L2_20_6
  3010. prefetcht0 B_PR1(BO,BI,8)
  3011. KERNEL8x2_1
  3012. KERNEL8x2_2
  3013. KERNEL8x2_3
  3014. KERNEL8x2_4
  3015. prefetcht0 B_PR1(BO,BI,8)
  3016. KERNEL8x2_1
  3017. KERNEL8x2_2
  3018. KERNEL8x2_3
  3019. KERNEL8x2_4
  3020. je .L2_20_6
  3021. jmp .L2_20_2
  3022. ALIGN_4
  3023. .L2_20_6:
  3024. #ifndef TRMMKERNEL
  3025. movq K, %rax
  3026. #else
  3027. movq KKK, %rax
  3028. #endif
  3029. andq $7, %rax # if (k & 1)
  3030. je .L2_20_9
  3031. movq %rax, BI // Index for BO
  3032. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3033. salq $3, %rax // rax = rax * 8 ; number of values
  3034. leaq (AO, %rax, SIZE), AO
  3035. leaq (BO, BI, SIZE), BO
  3036. negq BI
  3037. negq %rax
  3038. ALIGN_4
  3039. .L2_20_7:
  3040. KERNEL8x2_SUB
  3041. jl .L2_20_7
  3042. ALIGN_4
  3043. .L2_20_9:
  3044. SAVE8x2
  3045. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3046. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3047. movq K, %rax
  3048. subq KKK, %rax
  3049. movq %rax, BI // Index for BO
  3050. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3051. leaq (BO, BI, SIZE), BO
  3052. salq $3, %rax // rax = rax * 8 ; number of values
  3053. leaq (AO, %rax, SIZE), AO
  3054. #endif
  3055. #if defined(TRMMKERNEL) && defined(LEFT)
  3056. addq $8, KK
  3057. #endif
  3058. addq $8 * SIZE, CO1 # coffset += 8
  3059. ALIGN_4
  3060. /**************************************************************************/
  3061. .L2_21pre:
  3062. testq $4, M
  3063. jz .L2_30
  3064. ALIGN_4
  3065. .L2_21:
  3066. #if !defined(TRMMKERNEL) || \
  3067. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3068. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3069. leaq BUFFER1, BO // first buffer to BO
  3070. addq $4 * SIZE, BO
  3071. #else
  3072. movq KK, %rax
  3073. leaq BUFFER1, BO // first buffer to BO
  3074. addq $4 * SIZE, BO
  3075. movq %rax, BI // Index for BO
  3076. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3077. leaq (BO, BI, SIZE), BO
  3078. salq $2, %rax // rax = rax * 4 ; number of values
  3079. leaq (AO, %rax, SIZE), AO
  3080. #endif
  3081. vzeroall
  3082. #ifndef TRMMKERNEL
  3083. movq K, %rax
  3084. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3085. movq K, %rax
  3086. subq KK, %rax
  3087. movq %rax, KKK
  3088. #else
  3089. movq KK, %rax
  3090. #ifdef LEFT
  3091. addq $4, %rax // number of values in A
  3092. #else
  3093. addq $2, %rax // number of values in BO
  3094. #endif
  3095. movq %rax, KKK
  3096. #endif
  3097. andq $-8, %rax
  3098. je .L2_26
  3099. movq %rax, BI // Index for BO
3100. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3101. salq $2, %rax // rax = rax * 4 ; number of values
  3102. leaq (AO, %rax, SIZE), AO
  3103. leaq (BO, BI, SIZE), BO
  3104. negq BI
  3105. negq %rax
  3106. ALIGN_4
  3107. .L2_22:
  3108. prefetcht0 B_PR1(BO,BI,8)
  3109. KERNEL4x2_1
  3110. KERNEL4x2_2
  3111. KERNEL4x2_3
  3112. KERNEL4x2_4
  3113. prefetcht0 B_PR1(BO,BI,8)
  3114. KERNEL4x2_1
  3115. KERNEL4x2_2
  3116. KERNEL4x2_3
  3117. KERNEL4x2_4
  3118. je .L2_26
  3119. prefetcht0 B_PR1(BO,BI,8)
  3120. KERNEL4x2_1
  3121. KERNEL4x2_2
  3122. KERNEL4x2_3
  3123. KERNEL4x2_4
  3124. prefetcht0 B_PR1(BO,BI,8)
  3125. KERNEL4x2_1
  3126. KERNEL4x2_2
  3127. KERNEL4x2_3
  3128. KERNEL4x2_4
  3129. je .L2_26
  3130. jmp .L2_22
  3131. ALIGN_4
  3132. .L2_26:
  3133. #ifndef TRMMKERNEL
  3134. movq K, %rax
  3135. #else
  3136. movq KKK, %rax
  3137. #endif
3138. andq $7, %rax # if (k & 7)
  3139. je .L2_29
  3140. movq %rax, BI // Index for BO
  3141. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3142. salq $2, %rax // rax = rax * 4 ; number of values
  3143. leaq (AO, %rax, SIZE), AO
  3144. leaq (BO, BI, SIZE), BO
  3145. negq BI
  3146. negq %rax
  3147. ALIGN_4
  3148. .L2_27:
  3149. KERNEL4x2_SUB
  3150. jl .L2_27
  3151. ALIGN_4
  3152. .L2_29:
  3153. SAVE4x2
  3154. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3155. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3156. movq K, %rax
  3157. subq KKK, %rax
  3158. movq %rax, BI // Index for BO
  3159. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3160. leaq (BO, BI, SIZE), BO
  3161. salq $2, %rax // rax = rax * 4 ; number of values
  3162. leaq (AO, %rax, SIZE), AO
  3163. #endif
  3164. #if defined(TRMMKERNEL) && defined(LEFT)
  3165. addq $4, KK
  3166. #endif
  3167. addq $4 * SIZE, CO1 # coffset += 4
  3168. ALIGN_4
  3169. .L2_30:
  3170. testq $2, M
  3171. jz .L2_40
  3172. ALIGN_4
  3173. .L2_31:
  3174. #if !defined(TRMMKERNEL) || \
  3175. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3176. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3177. leaq BUFFER1, BO // first buffer to BO
  3178. addq $4 * SIZE, BO
  3179. #else
  3180. movq KK, %rax
  3181. leaq BUFFER1, BO // first buffer to BO
  3182. addq $4 * SIZE, BO
  3183. movq %rax, BI // Index for BO
  3184. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3185. leaq (BO, BI, SIZE), BO
  3186. salq $1, %rax // rax = rax * 2 ; number of values
  3187. leaq (AO, %rax, SIZE), AO
  3188. #endif
  3189. vzeroall
  3190. #ifndef TRMMKERNEL
  3191. movq K, %rax
  3192. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3193. movq K, %rax
  3194. subq KK, %rax
  3195. movq %rax, KKK
  3196. #else
  3197. movq KK, %rax
  3198. #ifdef LEFT
  3199. addq $2, %rax // number of values in AO
  3200. #else
  3201. addq $2, %rax // number of values in BO
  3202. #endif
  3203. movq %rax, KKK
  3204. #endif
  3205. andq $-8, %rax
  3206. je .L2_36
  3207. movq %rax, BI // Index for BO
  3208. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
3209. salq $1, %rax // rax = rax * 2 ; number of values
  3210. leaq (AO, %rax, SIZE), AO
  3211. leaq (BO, BI, SIZE), BO
  3212. negq BI
  3213. negq %rax
  3214. ALIGN_4
  3215. .L2_32:
  3216. KERNEL2x2_1
  3217. KERNEL2x2_2
  3218. KERNEL2x2_3
  3219. KERNEL2x2_4
  3220. KERNEL2x2_1
  3221. KERNEL2x2_2
  3222. KERNEL2x2_3
  3223. KERNEL2x2_4
  3224. je .L2_36
  3225. KERNEL2x2_1
  3226. KERNEL2x2_2
  3227. KERNEL2x2_3
  3228. KERNEL2x2_4
  3229. KERNEL2x2_1
  3230. KERNEL2x2_2
  3231. KERNEL2x2_3
  3232. KERNEL2x2_4
  3233. je .L2_36
  3234. jmp .L2_32
  3235. ALIGN_4
  3236. .L2_36:
  3237. #ifndef TRMMKERNEL
  3238. movq K, %rax
  3239. #else
  3240. movq KKK, %rax
  3241. #endif
3242. andq $7, %rax # if (k & 7)
  3243. je .L2_39
  3244. movq %rax, BI // Index for BO
  3245. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
3246. salq $1, %rax // rax = rax * 2 ; number of values
  3247. leaq (AO, %rax, SIZE), AO
  3248. leaq (BO, BI, SIZE), BO
  3249. negq BI
  3250. negq %rax
  3251. ALIGN_4
  3252. .L2_37:
  3253. KERNEL2x2_SUB
  3254. jl .L2_37
  3255. ALIGN_4
  3256. .L2_39:
  3257. SAVE2x2
  3258. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3259. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3260. movq K, %rax
  3261. subq KKK, %rax
  3262. movq %rax, BI // Index for BO
  3263. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3264. leaq (BO, BI, SIZE), BO
  3265. salq $1, %rax // rax = rax * 2 ; number of values
  3266. leaq (AO, %rax, SIZE), AO
  3267. #endif
  3268. #if defined(TRMMKERNEL) && defined(LEFT)
  3269. addq $2, KK
  3270. #endif
  3271. addq $2 * SIZE, CO1 # coffset += 2
  3272. ALIGN_4
  3273. .L2_40:
  3274. testq $1, M
  3275. jz .L2_60 // to next 2 lines of N
  3276. ALIGN_4
  3277. .L2_41:
  3278. #if !defined(TRMMKERNEL) || \
  3279. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3280. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3281. leaq BUFFER1, BO // first buffer to BO
  3282. addq $4 * SIZE, BO
  3283. #else
  3284. movq KK, %rax
  3285. leaq BUFFER1, BO // first buffer to BO
  3286. addq $4 * SIZE, BO
  3287. movq %rax, BI // Index for BO
  3288. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3289. leaq (BO, BI, SIZE), BO
  3290. leaq (AO, %rax, SIZE), AO
  3291. #endif
  3292. vzeroall
  3293. #ifndef TRMMKERNEL
  3294. movq K, %rax
  3295. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3296. movq K, %rax
  3297. subq KK, %rax
  3298. movq %rax, KKK
  3299. #else
  3300. movq KK, %rax
  3301. #ifdef LEFT
  3302. addq $1, %rax // number of values in AO
  3303. #else
  3304. addq $2, %rax // number of values in BO
  3305. #endif
  3306. movq %rax, KKK
  3307. #endif
  3308. andq $-8, %rax
  3309. je .L2_46
  3310. movq %rax, BI // Index for BO
  3311. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3312. leaq (AO, %rax, SIZE), AO
  3313. leaq (BO, BI, SIZE), BO
  3314. negq BI
  3315. negq %rax
  3316. ALIGN_4
  3317. .L2_42:
  3318. KERNEL1x2_1
  3319. KERNEL1x2_2
  3320. KERNEL1x2_3
  3321. KERNEL1x2_4
  3322. KERNEL1x2_1
  3323. KERNEL1x2_2
  3324. KERNEL1x2_3
  3325. KERNEL1x2_4
  3326. je .L2_46
  3327. KERNEL1x2_1
  3328. KERNEL1x2_2
  3329. KERNEL1x2_3
  3330. KERNEL1x2_4
  3331. KERNEL1x2_1
  3332. KERNEL1x2_2
  3333. KERNEL1x2_3
  3334. KERNEL1x2_4
  3335. je .L2_46
  3336. jmp .L2_42
  3337. ALIGN_4
  3338. .L2_46:
  3339. #ifndef TRMMKERNEL
  3340. movq K, %rax
  3341. #else
  3342. movq KKK, %rax
  3343. #endif
3344. andq $7, %rax # if (k & 7)
  3345. je .L2_49
  3346. movq %rax, BI // Index for BO
  3347. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3348. leaq (AO, %rax, SIZE), AO
  3349. leaq (BO, BI, SIZE), BO
  3350. negq BI
  3351. negq %rax
  3352. ALIGN_4
  3353. .L2_47:
  3354. KERNEL1x2_SUB
  3355. jl .L2_47
  3356. ALIGN_4
  3357. .L2_49:
  3358. SAVE1x2
  3359. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3360. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3361. movq K, %rax
  3362. subq KKK, %rax
  3363. movq %rax, BI // Index for BO
  3364. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3365. leaq (BO, BI, SIZE), BO
  3366. leaq (AO, %rax, SIZE), AO
  3367. #endif
  3368. #if defined(TRMMKERNEL) && defined(LEFT)
  3369. addq $1, KK
  3370. #endif
  3371. addq $1 * SIZE, CO1 # coffset += 1
  3372. ALIGN_4
  3373. .L2_60:
  3374. #if defined(TRMMKERNEL) && !defined(LEFT)
  3375. addq $2, KK
  3376. #endif
  3377. decq J // j --
  3378. jg .L2_01 // next 2 lines of N
  3379. .L1_0:
  3380. /************************************************************************************************
  3381. * Loop for Nmod6 % 2 > 0
  3382. *************************************************************************************************/
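// Handles the last single column of B when N is odd, using the *x1 kernels below.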
  3383. movq Nmod6, J
  3384. andq $1, J // j % 2
  3385. je .L999
  3386. ALIGN_4
  3387. .L1_01:
  3388. // copy to sub buffer
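/*
 * Rough C equivalent of the copy loop below (a sketch only; BO1 walks the
 * current column of B, BO the packed buffer, both in steps of SIZE):
 *
 *     for (k = 0; k < K; k++)
 *         buffer1[k] = b[k];    // one B value per k for the single column
 */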
  3389. movq B, BO1
  3390. leaq BUFFER1, BO // first buffer to BO
  3391. movq K, %rax
  3392. ALIGN_4
  3393. .L1_02b:
  3394. vmovsd (BO1), %xmm0
  3395. vmovsd %xmm0, (BO)
  3396. addq $1*SIZE,BO1
  3397. addq $1*SIZE,BO
  3398. decq %rax
  3399. jnz .L1_02b
  3400. .L1_02c:
  3401. movq BO1, B // next offset of B
  3402. .L1_10:
  3403. movq C, CO1
  3404. leaq (C, LDC, 1), C // c += 1 * ldc
  3405. #if defined(TRMMKERNEL) && defined(LEFT)
  3406. movq OFFSET, %rax
  3407. movq %rax, KK
  3408. #endif
  3409. movq A, AO // aoffset = a
  3410. addq $32 * SIZE, AO
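// AO is pre-biased by 32 * SIZE; the KERNEL macros are assumed to use matching
// negative displacements when addressing A.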
  3411. movq M, I
  3412. sarq $4, I // i = (m >> 4)
  3413. je .L1_20
  3414. ALIGN_4
  3415. .L1_11:
  3416. #if !defined(TRMMKERNEL) || \
  3417. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3418. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3419. leaq BUFFER1, BO // first buffer to BO
  3420. addq $2 * SIZE, BO
  3421. #else
  3422. movq KK, %rax
  3423. leaq BUFFER1, BO // first buffer to BO
  3424. addq $2 * SIZE, BO
  3425. movq %rax, BI // Index for BO
  3426. leaq (BO, BI, SIZE), BO
  3427. salq $4, %rax // rax = rax * 16 ; number of values
  3428. leaq (AO, %rax, SIZE), AO
  3429. #endif
  3430. vzeroall
  3431. #ifndef TRMMKERNEL
  3432. movq K, %rax
  3433. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3434. movq K, %rax
  3435. subq KK, %rax
  3436. movq %rax, KKK
  3437. #else
  3438. movq KK, %rax
  3439. #ifdef LEFT
  3440. addq $16, %rax // number of values in AO
  3441. #else
  3442. addq $1, %rax // number of values in BO
  3443. #endif
  3444. movq %rax, KKK
  3445. #endif
  3446. andq $-8, %rax // K = K - ( K % 8 )
  3447. je .L1_16
  3448. movq %rax, BI // Index for BO
  3449. salq $4, %rax // rax = rax * 16 ; number of values
  3450. leaq (AO, %rax, SIZE), AO
  3451. leaq (BO, BI, SIZE), BO
  3452. negq BI
  3453. negq %rax
  3454. ALIGN_4
  3455. .L1_12:
  3456. prefetcht0 B_PR1(BO,BI,8)
  3457. KERNEL16x1_1
  3458. KERNEL16x1_2
  3459. KERNEL16x1_3
  3460. KERNEL16x1_4
  3461. KERNEL16x1_1
  3462. KERNEL16x1_2
  3463. KERNEL16x1_3
  3464. KERNEL16x1_4
  3465. je .L1_16
  3466. prefetcht0 B_PR1(BO,BI,8)
  3467. KERNEL16x1_1
  3468. KERNEL16x1_2
  3469. KERNEL16x1_3
  3470. KERNEL16x1_4
  3471. KERNEL16x1_1
  3472. KERNEL16x1_2
  3473. KERNEL16x1_3
  3474. KERNEL16x1_4
  3475. je .L1_16
  3476. jmp .L1_12
  3477. ALIGN_4
  3478. .L1_16:
  3479. #ifndef TRMMKERNEL
  3480. movq K, %rax
  3481. #else
  3482. movq KKK, %rax
  3483. #endif
3484. andq $7, %rax # if (k & 7)
  3485. je .L1_19
  3486. movq %rax, BI // Index for BO
  3487. salq $4, %rax // rax = rax * 16 ; number of values
  3488. leaq (AO, %rax, SIZE), AO
  3489. leaq (BO, BI, SIZE), BO
  3490. negq BI
  3491. negq %rax
  3492. ALIGN_4
  3493. .L1_17:
  3494. KERNEL16x1_SUB
  3495. jl .L1_17
  3496. ALIGN_4
  3497. .L1_19:
  3498. SAVE16x1
  3499. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3500. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3501. movq K, %rax
  3502. subq KKK, %rax
  3503. movq %rax, BI // Index for BO
  3504. leaq (BO, BI, SIZE), BO
  3505. salq $4, %rax // rax = rax * 16 ; number of values
  3506. leaq (AO, %rax, SIZE), AO
  3507. #endif
  3508. #if defined(TRMMKERNEL) && defined(LEFT)
  3509. addq $16, KK
  3510. #endif
  3511. addq $16 * SIZE, CO1 # coffset += 16
  3512. decq I # i --
  3513. jg .L1_11
  3514. ALIGN_4
  3515. /**************************************************************************
  3516. * Rest of M
  3517. ***************************************************************************/
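// Same M-tail dispatch as in the two-column case above, now with the
// 8x1, 4x1, 2x1 and 1x1 kernels.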
  3518. .L1_20:
  3519. // Test rest of M
  3520. testq $15, M
  3521. jz .L999
  3522. testq $8, M
  3523. jz .L1_21pre
  3524. ALIGN_4
  3525. /**************************************************************************/
  3526. .L1_20_1:
  3527. #if !defined(TRMMKERNEL) || \
  3528. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3529. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3530. leaq BUFFER1, BO // first buffer to BO
  3531. addq $2 * SIZE, BO
  3532. #else
  3533. movq KK, %rax
  3534. leaq BUFFER1, BO // first buffer to BO
  3535. addq $2 * SIZE, BO
  3536. movq %rax, BI // Index for BO
  3537. leaq (BO, BI, SIZE), BO
  3538. salq $3, %rax // rax = rax * 8 ; number of values
  3539. leaq (AO, %rax, SIZE), AO
  3540. #endif
  3541. vzeroall
  3542. #ifndef TRMMKERNEL
  3543. movq K, %rax
  3544. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3545. movq K, %rax
  3546. subq KK, %rax
  3547. movq %rax, KKK
  3548. #else
  3549. movq KK, %rax
  3550. #ifdef LEFT
  3551. addq $8, %rax // number of values in A
  3552. #else
  3553. addq $1, %rax // number of values in BO
  3554. #endif
  3555. movq %rax, KKK
  3556. #endif
  3557. andq $-8, %rax
  3558. je .L1_20_6
  3559. movq %rax, BI // Index for BO
  3560. salq $3, %rax // rax = rax * 8 ; number of values
  3561. leaq (AO, %rax, SIZE), AO
  3562. leaq (BO, BI, SIZE), BO
  3563. negq BI
  3564. negq %rax
  3565. ALIGN_4
  3566. .L1_20_2:
  3567. prefetcht0 B_PR1(BO,BI,8)
  3568. KERNEL8x1_1
  3569. KERNEL8x1_2
  3570. KERNEL8x1_3
  3571. KERNEL8x1_4
  3572. KERNEL8x1_1
  3573. KERNEL8x1_2
  3574. KERNEL8x1_3
  3575. KERNEL8x1_4
  3576. je .L1_20_6
  3577. prefetcht0 B_PR1(BO,BI,8)
  3578. KERNEL8x1_1
  3579. KERNEL8x1_2
  3580. KERNEL8x1_3
  3581. KERNEL8x1_4
  3582. KERNEL8x1_1
  3583. KERNEL8x1_2
  3584. KERNEL8x1_3
  3585. KERNEL8x1_4
  3586. je .L1_20_6
  3587. jmp .L1_20_2
  3588. ALIGN_4
  3589. .L1_20_6:
  3590. #ifndef TRMMKERNEL
  3591. movq K, %rax
  3592. #else
  3593. movq KKK, %rax
  3594. #endif
3595. andq $7, %rax # if (k & 7)
  3596. je .L1_20_9
  3597. movq %rax, BI // Index for BO
  3598. salq $3, %rax // rax = rax * 8 ; number of values
  3599. leaq (AO, %rax, SIZE), AO
  3600. leaq (BO, BI, SIZE), BO
  3601. negq BI
  3602. negq %rax
  3603. ALIGN_4
  3604. .L1_20_7:
  3605. KERNEL8x1_SUB
  3606. jl .L1_20_7
  3607. ALIGN_4
  3608. .L1_20_9:
  3609. SAVE8x1
  3610. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3611. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3612. movq K, %rax
  3613. subq KKK, %rax
  3614. movq %rax, BI // Index for BO
  3615. leaq (BO, BI, SIZE), BO
  3616. salq $3, %rax // rax = rax * 8 ; number of values
  3617. leaq (AO, %rax, SIZE), AO
  3618. #endif
  3619. #if defined(TRMMKERNEL) && defined(LEFT)
  3620. addq $8, KK
  3621. #endif
  3622. addq $8 * SIZE, CO1 # coffset += 8
  3623. ALIGN_4
  3624. /**************************************************************************/
  3625. .L1_21pre:
  3626. testq $4, M
  3627. jz .L1_30
  3628. ALIGN_4
  3629. .L1_21:
  3630. #if !defined(TRMMKERNEL) || \
  3631. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3632. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3633. leaq BUFFER1, BO // first buffer to BO
  3634. addq $2 * SIZE, BO
  3635. #else
  3636. movq KK, %rax
  3637. leaq BUFFER1, BO // first buffer to BO
  3638. addq $2 * SIZE, BO
  3639. movq %rax, BI // Index for BO
  3640. leaq (BO, BI, SIZE), BO
  3641. salq $2, %rax // rax = rax * 4 ; number of values
  3642. leaq (AO, %rax, SIZE), AO
  3643. #endif
  3644. vzeroall
  3645. #ifndef TRMMKERNEL
  3646. movq K, %rax
  3647. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3648. movq K, %rax
  3649. subq KK, %rax
  3650. movq %rax, KKK
  3651. #else
  3652. movq KK, %rax
  3653. #ifdef LEFT
  3654. addq $4, %rax // number of values in A
  3655. #else
  3656. addq $1, %rax // number of values in BO
  3657. #endif
  3658. movq %rax, KKK
  3659. #endif
  3660. andq $-8, %rax
  3661. je .L1_26
  3662. movq %rax, BI // Index for BO
  3663. salq $2, %rax // rax = rax * 4 ; number of values
  3664. leaq (AO, %rax, SIZE), AO
  3665. leaq (BO, BI, SIZE), BO
  3666. negq BI
  3667. negq %rax
  3668. ALIGN_4
  3669. .L1_22:
  3670. prefetcht0 B_PR1(BO,BI,8)
  3671. KERNEL4x1_1
  3672. KERNEL4x1_2
  3673. KERNEL4x1_3
  3674. KERNEL4x1_4
  3675. KERNEL4x1_1
  3676. KERNEL4x1_2
  3677. KERNEL4x1_3
  3678. KERNEL4x1_4
  3679. je .L1_26
  3680. prefetcht0 B_PR1(BO,BI,8)
  3681. KERNEL4x1_1
  3682. KERNEL4x1_2
  3683. KERNEL4x1_3
  3684. KERNEL4x1_4
  3685. KERNEL4x1_1
  3686. KERNEL4x1_2
  3687. KERNEL4x1_3
  3688. KERNEL4x1_4
  3689. je .L1_26
  3690. jmp .L1_22
  3691. ALIGN_4
  3692. .L1_26:
  3693. #ifndef TRMMKERNEL
  3694. movq K, %rax
  3695. #else
  3696. movq KKK, %rax
  3697. #endif
3698. andq $7, %rax # if (k & 7)
  3699. je .L1_29
  3700. movq %rax, BI // Index for BO
  3701. salq $2, %rax // rax = rax * 4 ; number of values
  3702. leaq (AO, %rax, SIZE), AO
  3703. leaq (BO, BI, SIZE), BO
  3704. negq BI
  3705. negq %rax
  3706. ALIGN_4
  3707. .L1_27:
  3708. KERNEL4x1_SUB
  3709. jl .L1_27
  3710. ALIGN_4
  3711. .L1_29:
  3712. SAVE4x1
  3713. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3714. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3715. movq K, %rax
  3716. subq KKK, %rax
  3717. movq %rax, BI // Index for BO
  3718. leaq (BO, BI, SIZE), BO
  3719. salq $2, %rax // rax = rax * 4 ; number of values
  3720. leaq (AO, %rax, SIZE), AO
  3721. #endif
  3722. #if defined(TRMMKERNEL) && defined(LEFT)
  3723. addq $4, KK
  3724. #endif
  3725. addq $4 * SIZE, CO1 # coffset += 4
  3726. ALIGN_4
  3727. .L1_30:
  3728. testq $2, M
  3729. jz .L1_40
  3730. ALIGN_4
  3731. .L1_31:
  3732. #if !defined(TRMMKERNEL) || \
  3733. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3734. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3735. leaq BUFFER1, BO // first buffer to BO
  3736. addq $2 * SIZE, BO
  3737. #else
  3738. movq KK, %rax
  3739. leaq BUFFER1, BO // first buffer to BO
  3740. addq $2 * SIZE, BO
  3741. movq %rax, BI // Index for BO
  3742. leaq (BO, BI, SIZE), BO
  3743. salq $1, %rax // rax = rax * 2 ; number of values
  3744. leaq (AO, %rax, SIZE), AO
  3745. #endif
  3746. vzeroall
  3747. #ifndef TRMMKERNEL
  3748. movq K, %rax
  3749. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3750. movq K, %rax
  3751. subq KK, %rax
  3752. movq %rax, KKK
  3753. #else
  3754. movq KK, %rax
  3755. #ifdef LEFT
  3756. addq $2, %rax // number of values in AO
  3757. #else
  3758. addq $1, %rax // number of values in BO
  3759. #endif
  3760. movq %rax, KKK
  3761. #endif
  3762. andq $-8, %rax
  3763. je .L1_36
  3764. movq %rax, BI // Index for BO
3765. salq $1, %rax // rax = rax * 2 ; number of values
  3766. leaq (AO, %rax, SIZE), AO
  3767. leaq (BO, BI, SIZE), BO
  3768. negq BI
  3769. negq %rax
  3770. ALIGN_4
  3771. .L1_32:
  3772. KERNEL2x1_1
  3773. KERNEL2x1_2
  3774. KERNEL2x1_3
  3775. KERNEL2x1_4
  3776. KERNEL2x1_1
  3777. KERNEL2x1_2
  3778. KERNEL2x1_3
  3779. KERNEL2x1_4
  3780. je .L1_36
  3781. KERNEL2x1_1
  3782. KERNEL2x1_2
  3783. KERNEL2x1_3
  3784. KERNEL2x1_4
  3785. KERNEL2x1_1
  3786. KERNEL2x1_2
  3787. KERNEL2x1_3
  3788. KERNEL2x1_4
  3789. je .L1_36
  3790. jmp .L1_32
  3791. ALIGN_4
  3792. .L1_36:
  3793. #ifndef TRMMKERNEL
  3794. movq K, %rax
  3795. #else
  3796. movq KKK, %rax
  3797. #endif
3798. andq $7, %rax # if (k & 7)
  3799. je .L1_39
  3800. movq %rax, BI // Index for BO
3801. salq $1, %rax // rax = rax * 2 ; number of values
  3802. leaq (AO, %rax, SIZE), AO
  3803. leaq (BO, BI, SIZE), BO
  3804. negq BI
  3805. negq %rax
  3806. ALIGN_4
  3807. .L1_37:
  3808. KERNEL2x1_SUB
  3809. jl .L1_37
  3810. ALIGN_4
  3811. .L1_39:
  3812. SAVE2x1
  3813. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3814. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3815. movq K, %rax
  3816. subq KKK, %rax
  3817. movq %rax, BI // Index for BO
  3818. leaq (BO, BI, SIZE), BO
  3819. salq $1, %rax // rax = rax * 2 ; number of values
  3820. leaq (AO, %rax, SIZE), AO
  3821. #endif
  3822. #if defined(TRMMKERNEL) && defined(LEFT)
  3823. addq $2, KK
  3824. #endif
  3825. addq $2 * SIZE, CO1 # coffset += 2
  3826. ALIGN_4
  3827. .L1_40:
  3828. testq $1, M
  3829. jz .L999
  3830. ALIGN_4
  3831. .L1_41:
  3832. #if !defined(TRMMKERNEL) || \
  3833. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3834. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3835. leaq BUFFER1, BO // first buffer to BO
  3836. addq $2 * SIZE, BO
  3837. #else
  3838. movq KK, %rax
  3839. leaq BUFFER1, BO // first buffer to BO
  3840. addq $2 * SIZE, BO
  3841. movq %rax, BI // Index for BO
  3842. leaq (BO, BI, SIZE), BO
  3843. leaq (AO, %rax, SIZE), AO
  3844. #endif
  3845. vzeroall
  3846. #ifndef TRMMKERNEL
  3847. movq K, %rax
  3848. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3849. movq K, %rax
  3850. subq KK, %rax
  3851. movq %rax, KKK
  3852. #else
  3853. movq KK, %rax
  3854. #ifdef LEFT
  3855. addq $1, %rax // number of values in AO
  3856. #else
  3857. addq $1, %rax // number of values in BO
  3858. #endif
  3859. movq %rax, KKK
  3860. #endif
  3861. andq $-8, %rax
  3862. je .L1_46
  3863. movq %rax, BI // Index for BO
  3864. leaq (AO, %rax, SIZE), AO
  3865. leaq (BO, BI, SIZE), BO
  3866. negq BI
  3867. negq %rax
  3868. ALIGN_4
  3869. .L1_42:
  3870. KERNEL1x1_1
  3871. KERNEL1x1_2
  3872. KERNEL1x1_3
  3873. KERNEL1x1_4
  3874. KERNEL1x1_1
  3875. KERNEL1x1_2
  3876. KERNEL1x1_3
  3877. KERNEL1x1_4
  3878. je .L1_46
  3879. KERNEL1x1_1
  3880. KERNEL1x1_2
  3881. KERNEL1x1_3
  3882. KERNEL1x1_4
  3883. KERNEL1x1_1
  3884. KERNEL1x1_2
  3885. KERNEL1x1_3
  3886. KERNEL1x1_4
  3887. je .L1_46
  3888. jmp .L1_42
  3889. ALIGN_4
  3890. .L1_46:
  3891. #ifndef TRMMKERNEL
  3892. movq K, %rax
  3893. #else
  3894. movq KKK, %rax
  3895. #endif
3896. andq $7, %rax # if (k & 7)
  3897. je .L1_49
  3898. movq %rax, BI // Index for BO
  3899. leaq (AO, %rax, SIZE), AO
  3900. leaq (BO, BI, SIZE), BO
  3901. negq BI
  3902. negq %rax
  3903. ALIGN_4
  3904. .L1_47:
  3905. KERNEL1x1_SUB
  3906. jl .L1_47
  3907. ALIGN_4
  3908. .L1_49:
  3909. SAVE1x1
  3910. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3911. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3912. movq K, %rax
  3913. subq KKK, %rax
  3914. movq %rax, BI // Index for BO
  3915. leaq (BO, BI, SIZE), BO
  3916. leaq (AO, %rax, SIZE), AO
  3917. #endif
  3918. #if defined(TRMMKERNEL) && defined(LEFT)
  3919. addq $1, KK
  3920. #endif
  3921. addq $1 * SIZE, CO1 # coffset += 1
  3922. ALIGN_4
  3923. .L999:
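// Epilogue: restore the callee-saved registers spilled in the prologue
// (plus rdi/rsi and xmm6-xmm15 under WINDOWS_ABI) and release the stack frame.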
  3924. movq SP, %rsp
  3925. movq (%rsp), %rbx
  3926. movq 8(%rsp), %rbp
  3927. movq 16(%rsp), %r12
  3928. movq 24(%rsp), %r13
  3929. movq 32(%rsp), %r14
  3930. movq 40(%rsp), %r15
  3931. #ifdef WINDOWS_ABI
  3932. movq 48(%rsp), %rdi
  3933. movq 56(%rsp), %rsi
  3934. movups 64(%rsp), %xmm6
  3935. movups 80(%rsp), %xmm7
  3936. movups 96(%rsp), %xmm8
  3937. movups 112(%rsp), %xmm9
  3938. movups 128(%rsp), %xmm10
  3939. movups 144(%rsp), %xmm11
  3940. movups 160(%rsp), %xmm12
  3941. movups 176(%rsp), %xmm13
  3942. movups 192(%rsp), %xmm14
  3943. movups 208(%rsp), %xmm15
  3944. #endif
  3945. addq $STACKSIZE, %rsp
  3946. ret
  3947. EPILOGUE
  3948. #endif