
cgemm_kernel_8x2_haswell.S 106 kB

/*********************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************
* 2014/07/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameter:
* CGEMM_DEFAULT_UNROLL_N 2
* CGEMM_DEFAULT_UNROLL_M 8
* CGEMM_DEFAULT_P 384
* CGEMM_DEFAULT_Q 192
* A_PR1 512
* B_PR1 512
*
* 2014/07/29 Saar
* Performance at 6912x6912x6912:
* 1 thread: 107 GFLOPS (SANDYBRIDGE: 60) (MKL: 86)
* 2 threads: 208 GFLOPS (SANDYBRIDGE: 114) (MKL: 155)
* 3 threads: 289 GFLOPS (SANDYBRIDGE: 162) (MKL: 222)
* 4 threads: 377 GFLOPS (SANDYBRIDGE: 223) (MKL: 279)
*
*
*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %rbp
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 320
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $ 0, 4096 * 4(%rsp);\
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $ 0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
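// STACK_TOUCH is a stack probe for Windows builds: it writes one dword into
// each 4 KiB page of the local buffer region so every page of the freshly
// reserved stack area is committed before it is used.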
#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#else
#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
#endif
#else
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#else
#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
#endif
#endif
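// VFMADDPS_R accumulates the partial products built from the real part of B,
// VFMADDPS_I those built from the imaginary part. The two-letter build defines
// (NN, NT, ..., CC) select the conjugation case, and the sign flips required
// for conjugated operands are folded in here by expanding to a negated FMA
// (vfnmaddps on Bulldozer's FMA4, vfnmadd231ps otherwise) instead of a plain one.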
#define A_PR1 512
#define B_PR1 512
/***************************************************************************************************************************/
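// KERNEL8x3_SUB: one k-iteration of the 8x3 micro-kernel. ymm0/ymm1 hold eight
// complex elements of A, ymm2/ymm3 hold the broadcast real/imaginary part of a
// B element. Per B column the real-part products accumulate in ymm8/ymm12,
// ymm10/ymm14 and ymm4/ymm6, the imaginary-part products in ymm9/ymm13,
// ymm11/ymm15 and ymm5/ymm7. AO advances by 8 complex values, BO by 3.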
.macro KERNEL8x3_SUB
vmovups -16 * SIZE(AO), %ymm0
vmovups -8 * SIZE(AO), %ymm1
vbroadcastss -8 * SIZE(BO), %ymm2
vbroadcastss -7 * SIZE(BO), %ymm3
prefetcht0 A_PR1(AO)
VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 )
VFMADDPS_R( %ymm12,%ymm2,%ymm1 )
VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 )
VFMADDPS_I( %ymm13,%ymm3,%ymm1 )
vbroadcastss -6 * SIZE(BO), %ymm2
vbroadcastss -5 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm10,%ymm2,%ymm0 )
VFMADDPS_R( %ymm14,%ymm2,%ymm1 )
VFMADDPS_I( %ymm11,%ymm3,%ymm0 )
VFMADDPS_I( %ymm15,%ymm3,%ymm1 )
vbroadcastss -4 * SIZE(BO), %ymm2
vbroadcastss -3 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_R( %ymm6 ,%ymm2,%ymm1 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 )
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
.endm
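// SAVE8x3 (and the other SAVE macros below): vshufps $0xb1 swaps the real and
// imaginary lane of every complex element, and vaddsubps then combines the two
// partial-sum registers into the complex products. The result is scaled by
// alpha (ALPHA_R/ALPHA_I, again via shuffle + addsub), added to the existing C
// tile unless TRMMKERNEL is defined, and stored back.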
.macro SAVE8x3
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
vaddsubps %ymm5, %ymm4 , %ymm4
vaddsubps %ymm7, %ymm6 , %ymm6
vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm10, %ymm11,%ymm11
vaddsubps %ymm12, %ymm13,%ymm13
vaddsubps %ymm14, %ymm15,%ymm15
vaddsubps %ymm4, %ymm5 ,%ymm5
vaddsubps %ymm6, %ymm7 ,%ymm7
vmovaps %ymm9, %ymm8
vmovaps %ymm11, %ymm10
vmovaps %ymm13, %ymm12
vmovaps %ymm15, %ymm14
vmovaps %ymm5, %ymm4
vmovaps %ymm7, %ymm6
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm10, %ymm0, %ymm10
vmulps %ymm12, %ymm0, %ymm12
vmulps %ymm14, %ymm0, %ymm14
vmulps %ymm4 , %ymm0, %ymm4
vmulps %ymm6 , %ymm0, %ymm6
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm11, %ymm1, %ymm11
vmulps %ymm13, %ymm1, %ymm13
vmulps %ymm15, %ymm1, %ymm15
vmulps %ymm5 , %ymm1, %ymm5
vmulps %ymm7 , %ymm1, %ymm7
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
vaddsubps %ymm5, %ymm4 , %ymm4
vaddsubps %ymm7, %ymm6 , %ymm6
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps 8 * SIZE(CO1), %ymm12, %ymm12
vaddps (CO1, LDC), %ymm10, %ymm10
vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14
vaddps (CO1, LDC,2), %ymm4, %ymm4
vaddps 8 * SIZE(CO1, LDC,2), %ymm6, %ymm6
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 8 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 8 * SIZE(CO1, LDC)
vmovups %ymm4 , (CO1, LDC,2)
vmovups %ymm6 , 8 * SIZE(CO1, LDC,2)
.endm
/***************************************************************************************************************************/
.macro KERNEL4x3_SUB
vmovups -16 * SIZE(AO), %ymm0
vbroadcastss -8 * SIZE(BO), %ymm2
vbroadcastss -7 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 )
vbroadcastss -6 * SIZE(BO), %ymm2
vbroadcastss -5 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm12,%ymm2,%ymm0 )
VFMADDPS_I( %ymm13,%ymm3,%ymm0 )
vbroadcastss -4 * SIZE(BO), %ymm2
vbroadcastss -3 * SIZE(BO), %ymm3
VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 )
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
.macro SAVE4x3
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm5, %ymm4 , %ymm4
vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm12, %ymm13,%ymm13
vaddsubps %ymm4, %ymm5 ,%ymm5
vmovaps %ymm9, %ymm8
vmovaps %ymm13, %ymm12
vmovaps %ymm5, %ymm4
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm12, %ymm0, %ymm12
vmulps %ymm4 , %ymm0, %ymm4
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm13, %ymm1, %ymm13
vmulps %ymm5 , %ymm1, %ymm5
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm5, %ymm4 , %ymm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps (CO1, LDC), %ymm12, %ymm12
vaddps (CO1, LDC,2), %ymm4, %ymm4
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , (CO1, LDC)
vmovups %ymm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************************************/
.macro KERNEL2x3_SUB
vmovups -16 * SIZE(AO), %xmm0
vbroadcastss -8 * SIZE(BO), %xmm2
vbroadcastss -7 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 )
vbroadcastss -6 * SIZE(BO), %xmm2
vbroadcastss -5 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm12,%xmm2,%xmm0 )
VFMADDPS_I( %xmm13,%xmm3,%xmm0 )
vbroadcastss -4 * SIZE(BO), %xmm2
vbroadcastss -3 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
.macro SAVE2x3
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm4, %xmm5 ,%xmm5
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
vmovaps %xmm5, %xmm4
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm4 , %xmm0, %xmm4
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm5 , %xmm1, %xmm5
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm12, %xmm12
vaddps (CO1, LDC,2), %xmm4, %xmm4
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , (CO1, LDC)
vmovups %xmm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************************************/
.macro KERNEL1x3_SUB
vmovsd -16 * SIZE(AO), %xmm0
vbroadcastss -8 * SIZE(BO), %xmm2
vbroadcastss -7 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 )
vbroadcastss -6 * SIZE(BO), %xmm2
vbroadcastss -5 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm12,%xmm2,%xmm0 )
VFMADDPS_I( %xmm13,%xmm3,%xmm0 )
vbroadcastss -4 * SIZE(BO), %xmm2
vbroadcastss -3 * SIZE(BO), %xmm3
VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm
.macro SAVE1x3
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm4, %xmm5 ,%xmm5
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
vmovaps %xmm5, %xmm4
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm4 , %xmm0, %xmm4
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm5 , %xmm1, %xmm5
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm5, %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vmovsd (CO1) , %xmm9
vmovsd (CO1,LDC) , %xmm13
vmovsd (CO1,LDC,2), %xmm5
vaddps %xmm9 , %xmm8 , %xmm8
vaddps %xmm13, %xmm12, %xmm12
vaddps %xmm5 , %xmm4, %xmm4
#endif
vmovsd %xmm8 , (CO1)
vmovsd %xmm12 , (CO1, LDC)
vmovsd %xmm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************************************/
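// The x2 and x1 kernels below address A and B through the running indices
// %rax and BI (both advanced inside each macro) rather than by moving the
// AO/BO pointers themselves.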
.macro KERNEL8x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPS_R( %ymm10,%ymm6,%ymm0 )
VFMADDPS_R( %ymm14,%ymm6,%ymm1 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPS_I( %ymm11,%ymm7,%ymm0 )
VFMADDPS_I( %ymm15,%ymm7,%ymm1 )
addq $ 4 , BI
addq $ 16, %rax
.endm
.macro SAVE8x2
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm10, %ymm11,%ymm11
vaddsubps %ymm12, %ymm13,%ymm13
vaddsubps %ymm14, %ymm15,%ymm15
vmovaps %ymm9, %ymm8
vmovaps %ymm11, %ymm10
vmovaps %ymm13, %ymm12
vmovaps %ymm15, %ymm14
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm10, %ymm0, %ymm10
vmulps %ymm12, %ymm0, %ymm12
vmulps %ymm14, %ymm0, %ymm14
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm11, %ymm1, %ymm11
vmulps %ymm13, %ymm1, %ymm13
vmulps %ymm15, %ymm1, %ymm15
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm11,%ymm10, %ymm10
vaddsubps %ymm13,%ymm12, %ymm12
vaddsubps %ymm15,%ymm14, %ymm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps 8 * SIZE(CO1), %ymm12, %ymm12
vaddps (CO1, LDC), %ymm10, %ymm10
vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 8 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 8 * SIZE(CO1, LDC)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
.endm
/***************************************************************************************************************************/
.macro KERNEL4x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
VFMADDPS_R( %xmm14,%xmm6,%xmm1 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
VFMADDPS_I( %xmm15,%xmm7,%xmm1 )
addq $ 4, BI
addq $ 8, %rax
.endm
.macro SAVE4x2
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm15,%xmm14, %xmm14
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
vshufps $ 0xb1, %xmm14, %xmm14, %xmm15
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vaddsubps %xmm12, %xmm13,%xmm13
vaddsubps %xmm14, %xmm15,%xmm15
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
vmovaps %xmm13, %xmm12
vmovaps %xmm15, %xmm14
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
vmulps %xmm12, %xmm0, %xmm12
vmulps %xmm14, %xmm0, %xmm14
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vmulps %xmm13, %xmm1, %xmm13
vmulps %xmm15, %xmm1, %xmm15
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vaddsubps %xmm13,%xmm12, %xmm12
vaddsubps %xmm15,%xmm14, %xmm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
vaddps 4 * SIZE(CO1), %xmm12, %xmm12
vaddps (CO1, LDC), %xmm10, %xmm10
vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 4 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 4 * SIZE(CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL2x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
addq $ 4, BI
addq $ 4, %rax
.endm
.macro SAVE2x2
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 4 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap high and low 4 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
vaddps (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL1x2_SUB
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPS_R( %xmm10,%xmm6,%xmm0 )
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPS_I( %xmm11,%xmm7,%xmm0 )
addq $ 4, BI
addq $ 2, %rax
.endm
.macro SAVE1x2
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm10, %xmm11,%xmm11
vmovaps %xmm9, %xmm8
vmovaps %xmm11, %xmm10
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm11, %xmm1, %xmm11
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm11,%xmm10, %xmm10
#if !defined(TRMMKERNEL)
vmovsd (CO1), %xmm14
vaddps %xmm14, %xmm8 , %xmm8
vmovsd (CO1, LDC), %xmm15
vaddps %xmm15, %xmm10, %xmm10
#endif
vmovsd %xmm8 , (CO1)
vmovsd %xmm10 , (CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL8x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4
VFMADDPS_R( %ymm8,%ymm4,%ymm0 )
VFMADDPS_R( %ymm12,%ymm4,%ymm1 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPS_I( %ymm9,%ymm5,%ymm0 )
VFMADDPS_I( %ymm13,%ymm5,%ymm1 )
addq $ 2 , BI
addq $ 16, %rax
.endm
.macro SAVE8x1
vbroadcastss ALPHA_R, %ymm0
vbroadcastss ALPHA_I, %ymm1
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
#else
vaddsubps %ymm8, %ymm9 ,%ymm9
vaddsubps %ymm12, %ymm13,%ymm13
vmovaps %ymm9, %ymm8
vmovaps %ymm13, %ymm12
// swap high and low 64 bytes
vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
#endif
// multiply with ALPHA_R
vmulps %ymm8 , %ymm0, %ymm8
vmulps %ymm12, %ymm0, %ymm12
// multiply with ALPHA_I
vmulps %ymm9 , %ymm1, %ymm9
vmulps %ymm13, %ymm1, %ymm13
vaddsubps %ymm9, %ymm8 , %ymm8
vaddsubps %ymm13,%ymm12, %ymm12
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm8 , %ymm8
vaddps 8 * SIZE(CO1), %ymm12, %ymm12
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 8 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL4x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPS_R( %xmm12,%xmm4,%xmm1 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
VFMADDPS_I( %xmm13,%xmm5,%xmm1 )
addq $ 2, BI
addq $ 8, %rax
.endm
.macro SAVE4x1
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 4 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vaddsubps %xmm12, %xmm13,%xmm13
vmovaps %xmm9, %xmm8
vmovaps %xmm13, %xmm12
// swap high and low 4 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
vmulps %xmm12, %xmm0, %xmm12
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vmulps %xmm13, %xmm1, %xmm13
vaddsubps %xmm9, %xmm8 , %xmm8
vaddsubps %xmm13,%xmm12, %xmm12
#ifndef TRMMKERNEL
vaddps (CO1), %xmm8 , %xmm8
vaddps 4 * SIZE(CO1), %xmm12, %xmm12
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 4 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL2x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
addq $ 2, BI
addq $ 4, %rax
.endm
.macro SAVE2x1
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vmovaps %xmm9, %xmm8
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vaddsubps %xmm9, %xmm8 , %xmm8
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm8 , %xmm8
#endif
vmovups %xmm8 , (CO1)
.endm
/************************************************************************************************/
.macro KERNEL1x1_SUB
vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPS_R( %xmm8,%xmm4,%xmm0 )
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPS_I( %xmm9,%xmm5,%xmm0 )
addq $ 2, BI
addq $ 2, %rax
.endm
.macro SAVE1x1
vbroadcastss ALPHA_R, %xmm0
vbroadcastss ALPHA_I, %xmm1
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubps %xmm9, %xmm8 , %xmm8
vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
#else
vaddsubps %xmm8, %xmm9 ,%xmm9
vmovaps %xmm9, %xmm8
// swap high and low 64 bytes
vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulps %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulps %xmm9 , %xmm1, %xmm9
vaddsubps %xmm9, %xmm8 , %xmm8
#if !defined(TRMMKERNEL)
vmovsd (CO1), %xmm14
vaddps %xmm14, %xmm8 , %xmm8
#endif
vmovsd %xmm8 , (CO1)
.endm
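// Non-TRMM entry point: save the callee-saved registers, pick up the
// arguments (Windows and SysV ABI variants), reserve a page-aligned scratch
// area on the stack for the packed copy of B (BUFFER1), and split N into
// Ndiv6 blocks of six columns plus an Nmod6 remainder.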
#if !defined(TRMMKERNEL)
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovss %xmm0, ALPHA_R
vmovss %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 6, %rdi
divq %rdi // N / 6
movq %rax, Ndiv6 // N / 6
movq %rdx, Nmod6 // N % 6
/************************************************************************************************/
.L6_0:
movq Ndiv6, J
cmpq $ 0, J
je .L2_00
ALIGN_4
.L6_01:
// copy to sub buffer
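// per k iteration: two complex values of B are read from BO1 and one from
// BO2 and stored 6*SIZE apart in BUFFER1, giving a packed copy of three
// B columns for the 8x3/4x3/2x3/1x3 kernels above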
  949. movq B, BO1
  950. leaq BUFFER1, BO // first buffer to BO
  951. movq K, %rax
  952. salq $2, %rax // 2 * COMPSIZE
  953. leaq (B, %rax,4), BO2
  954. movq BO2, B // next offset of B
  955. movq K, %rax
  956. ALIGN_4
  957. .L6_02b:
  958. vmovups (BO1), %xmm0
  959. vmovsd (BO2), %xmm1
  960. vmovups %xmm0, (BO)
  961. vmovsd %xmm1, 4*SIZE(BO)
  962. addq $ 4*SIZE,BO1
  963. addq $ 4*SIZE,BO2
  964. addq $ 6*SIZE,BO
  965. decq %rax
  966. jnz .L6_02b
  967. .L6_10:
  968. movq C, CO1
  969. leaq (C, LDC, 2), C // c += 2 * ldc
  970. leaq (C, LDC, 1), C // c += 1 * ldc
  971. movq A, AO // aoffset = a
  972. addq $ 16 * SIZE, AO
  973. movq M, I
  974. sarq $ 3, I // i = (m >> 3)
  975. je .L6_4_10
  976. ALIGN_4
  977. /**********************************************************************************************************/
  978. .L6_8_11:
  979. leaq BUFFER1, BO // first buffer to BO
  980. addq $ 8 * SIZE, BO
  981. vzeroall
  982. movq K, %rax
  983. andq $ -8, %rax // K = K - ( K % 8 )
  984. je .L6_8_16
  985. ALIGN_4
  986. .L6_8_12:
  987. KERNEL8x3_SUB
  988. KERNEL8x3_SUB
  989. KERNEL8x3_SUB
  990. KERNEL8x3_SUB
  991. KERNEL8x3_SUB
  992. KERNEL8x3_SUB
  993. KERNEL8x3_SUB
  994. KERNEL8x3_SUB
  995. je .L6_8_16
  996. KERNEL8x3_SUB
  997. KERNEL8x3_SUB
  998. KERNEL8x3_SUB
  999. KERNEL8x3_SUB
  1000. KERNEL8x3_SUB
  1001. KERNEL8x3_SUB
  1002. KERNEL8x3_SUB
  1003. KERNEL8x3_SUB
  1004. je .L6_8_16
  1005. jmp .L6_8_12
  1006. ALIGN_4
  1007. .L6_8_16:
  1008. movq K, %rax
  1009. andq $ 7, %rax # if (k & 1)
  1010. je .L6_8_19
  1011. ALIGN_4
  1012. .L6_8_17:
  1013. KERNEL8x3_SUB
  1014. jnz .L6_8_17
  1015. ALIGN_4
  1016. .L6_8_19:
  1017. SAVE8x3
  1018. addq $ 16 * SIZE, CO1 # coffset += 16
  1019. decq I # i --
  1020. jg .L6_8_11
  1021. ALIGN_4
  1022. /**********************************************************************************************************/
  1023. .L6_4_10:
  1024. testq $ 7, M
  1025. jz .L6_4_60 // to next 2 lines of N
  1026. testq $ 4, M
  1027. jz .L6_4_20
  1028. ALIGN_4
  1029. .L6_4_11:
  1030. leaq BUFFER1, BO // first buffer to BO
  1031. addq $ 8 * SIZE, BO
  1032. vzeroall
  1033. movq K, %rax
  1034. andq $ -8, %rax // K = K - ( K % 8 )
  1035. je .L6_4_16
  1036. ALIGN_4
  1037. .L6_4_12:
  1038. prefetcht0 A_PR1(AO)
  1039. KERNEL4x3_SUB
  1040. KERNEL4x3_SUB
  1041. prefetcht0 A_PR1(AO)
  1042. KERNEL4x3_SUB
  1043. KERNEL4x3_SUB
  1044. prefetcht0 A_PR1(AO)
  1045. KERNEL4x3_SUB
  1046. KERNEL4x3_SUB
  1047. prefetcht0 A_PR1(AO)
  1048. KERNEL4x3_SUB
  1049. KERNEL4x3_SUB
  1050. je .L6_4_16
  1051. prefetcht0 A_PR1(AO)
  1052. KERNEL4x3_SUB
  1053. KERNEL4x3_SUB
  1054. prefetcht0 A_PR1(AO)
  1055. KERNEL4x3_SUB
  1056. KERNEL4x3_SUB
  1057. prefetcht0 A_PR1(AO)
  1058. KERNEL4x3_SUB
  1059. KERNEL4x3_SUB
  1060. prefetcht0 A_PR1(AO)
  1061. KERNEL4x3_SUB
  1062. KERNEL4x3_SUB
  1063. je .L6_4_16
  1064. jmp .L6_4_12
  1065. ALIGN_4
  1066. .L6_4_16:
  1067. movq K, %rax
  1068. andq $ 7, %rax # if (k & 7)
  1069. je .L6_4_19
  1070. ALIGN_4
  1071. .L6_4_17:
  1072. KERNEL4x3_SUB
  1073. jnz .L6_4_17
  1074. ALIGN_4
  1075. .L6_4_19:
  1076. SAVE4x3
  1077. addq $ 8 * SIZE, CO1 # coffset += 8
  1078. ALIGN_4
  1079. /**************************************************************************
  1080. * Rest of M
  1081. ***************************************************************************/
  1082. .L6_4_20:
  1083. testq $ 2, M
  1084. jz .L6_4_40
  1085. ALIGN_4
  1086. .L6_4_21:
  1087. leaq BUFFER1, BO // first buffer to BO
  1088. addq $ 8 * SIZE, BO
  1089. vzeroall
  1090. movq K, %rax
  1091. andq $ -8, %rax // K = K - ( K % 8 )
  1092. je .L6_4_26
  1093. ALIGN_4
  1094. .L6_4_22:
  1095. prefetcht0 A_PR1(AO)
  1096. KERNEL2x3_SUB
  1097. KERNEL2x3_SUB
  1098. KERNEL2x3_SUB
  1099. KERNEL2x3_SUB
  1100. prefetcht0 A_PR1(AO)
  1101. KERNEL2x3_SUB
  1102. KERNEL2x3_SUB
  1103. KERNEL2x3_SUB
  1104. KERNEL2x3_SUB
  1105. je .L6_4_26
  1106. prefetcht0 A_PR1(AO)
  1107. KERNEL2x3_SUB
  1108. KERNEL2x3_SUB
  1109. KERNEL2x3_SUB
  1110. KERNEL2x3_SUB
  1111. prefetcht0 A_PR1(AO)
  1112. KERNEL2x3_SUB
  1113. KERNEL2x3_SUB
  1114. KERNEL2x3_SUB
  1115. KERNEL2x3_SUB
  1116. je .L6_4_26
  1117. jmp .L6_4_22
  1118. ALIGN_4
  1119. .L6_4_26:
  1120. movq K, %rax
  1121. andq $ 7, %rax # if (k & 7)
  1122. je .L6_4_29
  1123. ALIGN_4
  1124. .L6_4_27:
  1125. KERNEL2x3_SUB
  1126. jnz .L6_4_27
  1127. ALIGN_4
  1128. .L6_4_29:
  1129. SAVE2x3
  1130. addq $ 4 * SIZE, CO1 # coffset += 4
  1131. decq I # i --
  1132. jg .L6_4_21
  1133. ALIGN_4
  1134. /**************************************************************************/
  1135. .L6_4_40:
  1136. testq $ 1, M
  1137. jz .L6_4_60 // to next 3 lines of N
  1138. ALIGN_4
  1139. .L6_4_41:
  1140. leaq BUFFER1, BO // first buffer to BO
  1141. addq $ 8 * SIZE, BO
  1142. vzeroall
  1143. movq K, %rax
  1144. andq $ -8, %rax // K = K - ( K % 8 )
  1145. je .L6_4_46
  1146. ALIGN_4
  1147. .L6_4_42:
  1148. prefetcht0 A_PR1(AO)
  1149. KERNEL1x3_SUB
  1150. KERNEL1x3_SUB
  1151. KERNEL1x3_SUB
  1152. KERNEL1x3_SUB
  1153. KERNEL1x3_SUB
  1154. KERNEL1x3_SUB
  1155. KERNEL1x3_SUB
  1156. KERNEL1x3_SUB
  1157. je .L6_4_46
  1158. prefetcht0 A_PR1(AO)
  1159. KERNEL1x3_SUB
  1160. KERNEL1x3_SUB
  1161. KERNEL1x3_SUB
  1162. KERNEL1x3_SUB
  1163. KERNEL1x3_SUB
  1164. KERNEL1x3_SUB
  1165. KERNEL1x3_SUB
  1166. KERNEL1x3_SUB
  1167. je .L6_4_46
  1168. jmp .L6_4_42
  1169. ALIGN_4
  1170. .L6_4_46:
  1171. movq K, %rax
  1172. andq $ 7, %rax # if (k & 7)
  1173. je .L6_4_49
  1174. ALIGN_4
  1175. .L6_4_47:
  1176. KERNEL1x3_SUB
  1177. jnz .L6_4_47
  1178. ALIGN_4
  1179. .L6_4_49:
  1180. SAVE1x3
  1181. addq $ 2 * SIZE, CO1 # coffset += 2
  1182. decq I # i --
  1183. jg .L6_4_41
  1184. ALIGN_4
  1185. .L6_4_60:
  1186. /*******************************************************************************************/
  1187. .L7_01:
  1188. // copy to sub buffer
  1189. movq B, BO1
  1190. leaq BUFFER1, BO // first buffer to BO
  1191. movq K, %rax
  1192. salq $2, %rax // 2 * COMPSIZE
  1193. leaq (B, %rax,4), BO2
  1194. movq K, %rax
  1195. ALIGN_4
  1196. .L7_02b:
  1197. vmovsd 2*SIZE(BO1), %xmm0
  1198. vmovups (BO2), %xmm1
  1199. vmovsd %xmm0, (BO)
  1200. vmovups %xmm1, 2*SIZE(BO)
  1201. addq $ 4*SIZE,BO1
  1202. addq $ 4*SIZE,BO2
  1203. addq $ 6*SIZE,BO
  1204. decq %rax
  1205. jnz .L7_02b
  1206. movq BO2, B // next offset of B
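/* Sketch of the copy loop above (illustration only): this second pass packs
 * columns 3..5 of the 6-column block; BO1 still points at the pair (2,3), so
 * only its second element is used, while BO2 supplies the pair (4,5).
 *
 *   // for (k = 0; k < K; k++) {
 *   //     bo[0..1] = bo1[2..3];          // column 3 (re,im)
 *   //     bo[2..5] = bo2[0..3];          // columns 4 and 5
 *   //     bo1 += 4; bo2 += 4; bo += 6;
 *   // }
 *   // B = BO2;   // advance B past all 6 columns for the next J iteration
 */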
  1207. .L7_10:
  1208. movq C, CO1
  1209. leaq (C, LDC, 2), C // c += 2 * ldc
  1210. leaq (C, LDC, 1), C // c += 1 * ldc
  1211. movq A, AO // aoffset = a
  1212. addq $ 16 * SIZE, AO
  1213. movq M, I
  1214. sarq $ 3, I // i = (m >> 3)
  1215. je .L7_4_10
  1216. ALIGN_4
  1217. /**********************************************************************************************************/
  1218. .L7_8_11:
  1219. leaq BUFFER1, BO // first buffer to BO
  1220. addq $ 8 * SIZE, BO
  1221. vzeroall
  1222. movq K, %rax
  1223. andq $ -8, %rax // K = K - ( K % 8 )
  1224. je .L7_8_16
  1225. ALIGN_4
  1226. .L7_8_12:
  1227. KERNEL8x3_SUB
  1228. KERNEL8x3_SUB
  1229. KERNEL8x3_SUB
  1230. KERNEL8x3_SUB
  1231. KERNEL8x3_SUB
  1232. KERNEL8x3_SUB
  1233. KERNEL8x3_SUB
  1234. KERNEL8x3_SUB
  1235. je .L7_8_16
  1236. KERNEL8x3_SUB
  1237. KERNEL8x3_SUB
  1238. KERNEL8x3_SUB
  1239. KERNEL8x3_SUB
  1240. KERNEL8x3_SUB
  1241. KERNEL8x3_SUB
  1242. KERNEL8x3_SUB
  1243. KERNEL8x3_SUB
  1244. je .L7_8_16
  1245. jmp .L7_8_12
  1246. ALIGN_4
  1247. .L7_8_16:
  1248. movq K, %rax
  1249. andq $ 7, %rax # if (k & 7)
  1250. je .L7_8_19
  1251. ALIGN_4
  1252. .L7_8_17:
  1253. KERNEL8x3_SUB
  1254. jnz .L7_8_17
  1255. ALIGN_4
  1256. .L7_8_19:
  1257. SAVE8x3
  1258. addq $ 16 * SIZE, CO1 # coffset += 16
  1259. decq I # i --
  1260. jg .L7_8_11
  1261. ALIGN_4
  1262. /**********************************************************************************************************/
  1263. .L7_4_10:
  1264. testq $ 7, M
  1265. jz .L7_4_60 // to next 6 lines of N
  1266. testq $ 4, M
  1267. jz .L7_4_20
  1268. ALIGN_4
  1269. .L7_4_11:
  1270. leaq BUFFER1, BO // first buffer to BO
  1271. addq $ 8 * SIZE, BO
  1272. vzeroall
  1273. movq K, %rax
  1274. andq $ -8, %rax // K = K - ( K % 8 )
  1275. je .L7_4_16
  1276. ALIGN_4
  1277. .L7_4_12:
  1278. prefetcht0 A_PR1(AO)
  1279. KERNEL4x3_SUB
  1280. KERNEL4x3_SUB
  1281. prefetcht0 A_PR1(AO)
  1282. KERNEL4x3_SUB
  1283. KERNEL4x3_SUB
  1284. prefetcht0 A_PR1(AO)
  1285. KERNEL4x3_SUB
  1286. KERNEL4x3_SUB
  1287. prefetcht0 A_PR1(AO)
  1288. KERNEL4x3_SUB
  1289. KERNEL4x3_SUB
  1290. je .L7_4_16
  1291. prefetcht0 A_PR1(AO)
  1292. KERNEL4x3_SUB
  1293. KERNEL4x3_SUB
  1294. prefetcht0 A_PR1(AO)
  1295. KERNEL4x3_SUB
  1296. KERNEL4x3_SUB
  1297. prefetcht0 A_PR1(AO)
  1298. KERNEL4x3_SUB
  1299. KERNEL4x3_SUB
  1300. prefetcht0 A_PR1(AO)
  1301. KERNEL4x3_SUB
  1302. KERNEL4x3_SUB
  1303. je .L7_4_16
  1304. jmp .L7_4_12
  1305. ALIGN_4
  1306. .L7_4_16:
  1307. movq K, %rax
  1308. andq $ 7, %rax # if (k & 7)
  1309. je .L7_4_19
  1310. ALIGN_4
  1311. .L7_4_17:
  1312. KERNEL4x3_SUB
  1313. jnz .L7_4_17
  1314. ALIGN_4
  1315. .L7_4_19:
  1316. SAVE4x3
  1317. addq $ 8 * SIZE, CO1 # coffset += 8
  1318. ALIGN_4
  1319. /**************************************************************************
  1320. * Rest of M
  1321. ***************************************************************************/
  1322. .L7_4_20:
  1323. testq $ 2, M
  1324. jz .L7_4_40
  1325. ALIGN_4
  1326. .L7_4_21:
  1327. leaq BUFFER1, BO // first buffer to BO
  1328. addq $ 8 * SIZE, BO
  1329. vzeroall
  1330. movq K, %rax
  1331. andq $ -8, %rax // K = K - ( K % 8 )
  1332. je .L7_4_26
  1333. ALIGN_4
  1334. .L7_4_22:
  1335. prefetcht0 A_PR1(AO)
  1336. KERNEL2x3_SUB
  1337. KERNEL2x3_SUB
  1338. KERNEL2x3_SUB
  1339. KERNEL2x3_SUB
  1340. prefetcht0 A_PR1(AO)
  1341. KERNEL2x3_SUB
  1342. KERNEL2x3_SUB
  1343. KERNEL2x3_SUB
  1344. KERNEL2x3_SUB
  1345. je .L7_4_26
  1346. prefetcht0 A_PR1(AO)
  1347. KERNEL2x3_SUB
  1348. KERNEL2x3_SUB
  1349. KERNEL2x3_SUB
  1350. KERNEL2x3_SUB
  1351. prefetcht0 A_PR1(AO)
  1352. KERNEL2x3_SUB
  1353. KERNEL2x3_SUB
  1354. KERNEL2x3_SUB
  1355. KERNEL2x3_SUB
  1356. je .L7_4_26
  1357. jmp .L7_4_22
  1358. ALIGN_4
  1359. .L7_4_26:
  1360. movq K, %rax
  1361. andq $ 7, %rax # if (k & 7)
  1362. je .L7_4_29
  1363. ALIGN_4
  1364. .L7_4_27:
  1365. KERNEL2x3_SUB
  1366. jnz .L7_4_27
  1367. ALIGN_4
  1368. .L7_4_29:
  1369. SAVE2x3
  1370. addq $ 4 * SIZE, CO1 # coffset += 4
  1371. decq I # i --
  1372. jg .L7_4_21
  1373. ALIGN_4
  1374. /**************************************************************************/
  1375. .L7_4_40:
  1376. testq $ 1, M
  1377. jz .L7_4_60 // to next 6 lines of N
  1378. ALIGN_4
  1379. .L7_4_41:
  1380. leaq BUFFER1, BO // first buffer to BO
  1381. addq $ 8 * SIZE, BO
  1382. vzeroall
  1383. movq K, %rax
  1384. andq $ -8, %rax // K = K - ( K % 8 )
  1385. je .L7_4_46
  1386. ALIGN_4
  1387. .L7_4_42:
  1388. prefetcht0 A_PR1(AO)
  1389. KERNEL1x3_SUB
  1390. KERNEL1x3_SUB
  1391. KERNEL1x3_SUB
  1392. KERNEL1x3_SUB
  1393. KERNEL1x3_SUB
  1394. KERNEL1x3_SUB
  1395. KERNEL1x3_SUB
  1396. KERNEL1x3_SUB
  1397. je .L7_4_46
  1398. prefetcht0 A_PR1(AO)
  1399. KERNEL1x3_SUB
  1400. KERNEL1x3_SUB
  1401. KERNEL1x3_SUB
  1402. KERNEL1x3_SUB
  1403. KERNEL1x3_SUB
  1404. KERNEL1x3_SUB
  1405. KERNEL1x3_SUB
  1406. KERNEL1x3_SUB
  1407. je .L7_4_46
  1408. jmp .L7_4_42
  1409. ALIGN_4
  1410. .L7_4_46:
  1411. movq K, %rax
  1412. andq $ 7, %rax # if (k & 7)
  1413. je .L7_4_49
  1414. ALIGN_4
  1415. .L7_4_47:
  1416. KERNEL1x3_SUB
  1417. jnz .L7_4_47
  1418. ALIGN_4
  1419. .L7_4_49:
  1420. SAVE1x3
  1421. addq $ 2 * SIZE, CO1 # coffset += 2
  1422. decq I # i --
  1423. jg .L7_4_41
  1424. ALIGN_4
  1425. .L7_4_60:
  1426. decq J // j --
  1427. jg .L6_01 // next 6 lines of N
  1428. /************************************************************************************************/
  1429. .L2_00:
  1430. movq Nmod6, J
  1431. sarq $1, J // j = j / 2
  1432. cmpq $ 0, J
  1433. je .L1_0
  1434. ALIGN_4
  1435. .L2_01:
  1436. // copy to sub buffer
  1437. movq B, BO1
  1438. leaq BUFFER1, BO // first buffer to BO
  1439. movq K, %rax
  1440. ALIGN_4
  1441. .L2_02b:
  1442. vmovups (BO1), %xmm0
  1443. vmovups %xmm0, (BO)
  1444. addq $ 4*SIZE,BO1
  1445. addq $ 4*SIZE,BO
  1446. decq %rax
  1447. jnz .L2_02b
  1448. .L2_02c:
  1449. movq BO1, B // next offset of B
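/* Sketch of the copy loop above (illustration only): for the 2-column strips
 * B is already packed pairwise, so the K rows of two complex values are copied
 * straight into BUFFER1.
 *
 *   // for (k = 0; k < K; k++) { bo[0..3] = bo1[0..3]; bo1 += 4; bo += 4; }
 *   // B = BO1;
 */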
  1450. .L2_10:
  1451. movq C, CO1
  1452. leaq (C, LDC, 2), C // c += 2 * ldc
  1453. #if defined(TRMMKERNEL) && defined(LEFT)
  1454. movq OFFSET, %rax
  1455. movq %rax, KK
  1456. #endif
  1457. movq A, AO // aoffset = a
  1458. addq $ 16 * SIZE, AO
  1459. movq M, I
  1460. sarq $ 3, I // i = (m >> 3)
  1461. je .L2_4_10
  1462. ALIGN_4
  1463. /**********************************************************************************************************/
  1464. .L2_8_11:
  1465. #if !defined(TRMMKERNEL) || \
  1466. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1467. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1468. leaq BUFFER1, BO // first buffer to BO
  1469. addq $ 8 * SIZE, BO
  1470. #else
  1471. movq KK, %rax
  1472. leaq BUFFER1, BO // first buffer to BO
  1473. addq $ 8 * SIZE, BO
  1474. movq %rax, BI // Index for BO
  1475. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1476. leaq (BO, BI, SIZE), BO
  1477. salq $ 4, %rax // rax = rax *16 ; number of values
  1478. leaq (AO, %rax, SIZE), AO
  1479. #endif
  1480. vzeroall
  1481. #ifndef TRMMKERNEL
  1482. movq K, %rax
  1483. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1484. movq K, %rax
  1485. subq KK, %rax
  1486. movq %rax, KKK
  1487. #else
  1488. movq KK, %rax
  1489. #ifdef LEFT
  1490. addq $ 8, %rax // number of values in AO
  1491. #else
  1492. addq $ 2, %rax // number of values in BO
  1493. #endif
  1494. movq %rax, KKK
  1495. #endif
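/* Sketch of the effective inner-loop length chosen above (illustration only,
 * relevant to TRMM builds): a plain GEMM runs all K iterations, while a
 * triangular (TRMMKERNEL) build derives KKK from the offset KK and the tile
 * sizes (8 rows of A, 2 columns of B for this 8x2 tile).
 *
 *   // #ifndef TRMMKERNEL                           kkk = K;
 *   // #elif (LEFT && !TRANSA) || (!LEFT && TRANSA) kkk = K - kk;
 *   // #else                                        kkk = kk + (LEFT ? 8 : 2);
 */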
  1496. andq $ -8, %rax // K = K - ( K % 8 )
  1497. je .L2_8_16
  1498. movq %rax, BI // Index for BO
  1499. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1500. salq $ 4, %rax // rax = rax *16 ; number of values
  1501. leaq (AO, %rax, SIZE), AO
  1502. leaq (BO, BI, SIZE), BO
  1503. negq BI
  1504. negq %rax
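/* Note on the pointer setup above (descriptive comment only): AO and BO have
 * been advanced to the end of the data consumed by the unrolled part, and
 * %rax / BI are negated so they run from -count up toward zero.  The kernel
 * macros can then address the operands as (AO,%rax,SIZE) and (BO,BI,SIZE),
 * and the je branches fire once the index has counted back up to zero.
 */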
  1505. ALIGN_4
  1506. .L2_8_12:
  1507. prefetcht0 A_PR1(AO,%rax,SIZE)
  1508. prefetcht0 B_PR1(BO,BI,SIZE)
  1509. KERNEL8x2_SUB
  1510. prefetcht0 A_PR1(AO,%rax,SIZE)
  1511. KERNEL8x2_SUB
  1512. prefetcht0 A_PR1(AO,%rax,SIZE)
  1513. KERNEL8x2_SUB
  1514. prefetcht0 A_PR1(AO,%rax,SIZE)
  1515. KERNEL8x2_SUB
  1516. prefetcht0 A_PR1(AO,%rax,SIZE)
  1517. prefetcht0 B_PR1(BO,BI,SIZE)
  1518. KERNEL8x2_SUB
  1519. prefetcht0 A_PR1(AO,%rax,SIZE)
  1520. KERNEL8x2_SUB
  1521. prefetcht0 A_PR1(AO,%rax,SIZE)
  1522. KERNEL8x2_SUB
  1523. prefetcht0 A_PR1(AO,%rax,SIZE)
  1524. KERNEL8x2_SUB
  1525. je .L2_8_16
  1526. prefetcht0 A_PR1(AO,%rax,SIZE)
  1527. prefetcht0 B_PR1(BO,BI,SIZE)
  1528. KERNEL8x2_SUB
  1529. prefetcht0 A_PR1(AO,%rax,SIZE)
  1530. KERNEL8x2_SUB
  1531. prefetcht0 A_PR1(AO,%rax,SIZE)
  1532. KERNEL8x2_SUB
  1533. prefetcht0 A_PR1(AO,%rax,SIZE)
  1534. KERNEL8x2_SUB
  1535. prefetcht0 A_PR1(AO,%rax,SIZE)
  1536. prefetcht0 B_PR1(BO,BI,SIZE)
  1537. KERNEL8x2_SUB
  1538. prefetcht0 A_PR1(AO,%rax,SIZE)
  1539. KERNEL8x2_SUB
  1540. prefetcht0 A_PR1(AO,%rax,SIZE)
  1541. KERNEL8x2_SUB
  1542. prefetcht0 A_PR1(AO,%rax,SIZE)
  1543. KERNEL8x2_SUB
  1544. je .L2_8_16
  1545. jmp .L2_8_12
  1546. ALIGN_4
  1547. .L2_8_16:
  1548. #ifndef TRMMKERNEL
  1549. movq K, %rax
  1550. #else
  1551. movq KKK, %rax
  1552. #endif
  1553. andq $ 7, %rax # if (k & 7)
  1554. je .L2_8_19
  1555. movq %rax, BI // Index for BO
  1556. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1557. salq $ 4, %rax // rax = rax *16 ; number of values
  1558. leaq (AO, %rax, SIZE), AO
  1559. leaq (BO, BI, SIZE), BO
  1560. negq BI
  1561. negq %rax
  1562. ALIGN_4
  1563. .L2_8_17:
  1564. KERNEL8x2_SUB
  1565. jl .L2_8_17
  1566. ALIGN_4
  1567. .L2_8_19:
  1568. SAVE8x2
  1569. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1570. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1571. movq K, %rax
  1572. subq KKK, %rax
  1573. movq %rax, BI // Index for BO
  1574. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1575. leaq (BO, BI, SIZE), BO
  1576. salq $ 4, %rax // rax = rax *16 ; number of values
  1577. leaq (AO, %rax, SIZE), AO
  1578. #endif
  1579. #if defined(TRMMKERNEL) && defined(LEFT)
  1580. addq $ 8, KK
  1581. #endif
  1582. addq $ 16 * SIZE, CO1 # coffset += 16
  1583. decq I # i --
  1584. jg .L2_8_11
  1585. ALIGN_4
  1586. /**********************************************************************************************************/
  1587. .L2_4_10:
  1588. testq $ 7, M
  1589. jz .L2_4_60 // to next 2 lines of N
  1590. testq $ 4, M
  1591. jz .L2_4_20
  1592. ALIGN_4
  1593. .L2_4_11:
  1594. #if !defined(TRMMKERNEL) || \
  1595. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1596. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1597. leaq BUFFER1, BO // first buffer to BO
  1598. addq $ 8 * SIZE, BO
  1599. #else
  1600. movq KK, %rax
  1601. leaq BUFFER1, BO // first buffer to BO
  1602. addq $ 8 * SIZE, BO
  1603. movq %rax, BI // Index for BO
  1604. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1605. leaq (BO, BI, SIZE), BO
  1606. salq $ 3, %rax // rax = rax * 8 ; number of values
  1607. leaq (AO, %rax, SIZE), AO
  1608. #endif
  1609. vzeroall
  1610. #ifndef TRMMKERNEL
  1611. movq K, %rax
  1612. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1613. movq K, %rax
  1614. subq KK, %rax
  1615. movq %rax, KKK
  1616. #else
  1617. movq KK, %rax
  1618. #ifdef LEFT
  1619. addq $ 4, %rax // number of values in AO
  1620. #else
  1621. addq $ 2, %rax // number of values in BO
  1622. #endif
  1623. movq %rax, KKK
  1624. #endif
  1625. andq $ -8, %rax // K = K - ( K % 8 )
  1626. je .L2_4_16
  1627. movq %rax, BI // Index for BO
  1628. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1629. salq $ 3, %rax // rax = rax * 8 ; number of values
  1630. leaq (AO, %rax, SIZE), AO
  1631. leaq (BO, BI, SIZE), BO
  1632. negq BI
  1633. negq %rax
  1634. ALIGN_4
  1635. .L2_4_12:
  1636. prefetcht0 A_PR1(AO,%rax,SIZE)
  1637. prefetcht0 B_PR1(BO,BI,SIZE)
  1638. KERNEL4x2_SUB
  1639. KERNEL4x2_SUB
  1640. prefetcht0 A_PR1(AO,%rax,SIZE)
  1641. KERNEL4x2_SUB
  1642. KERNEL4x2_SUB
  1643. prefetcht0 A_PR1(AO,%rax,SIZE)
  1644. prefetcht0 B_PR1(BO,BI,SIZE)
  1645. KERNEL4x2_SUB
  1646. KERNEL4x2_SUB
  1647. prefetcht0 A_PR1(AO,%rax,SIZE)
  1648. KERNEL4x2_SUB
  1649. KERNEL4x2_SUB
  1650. je .L2_4_16
  1651. prefetcht0 A_PR1(AO,%rax,SIZE)
  1652. prefetcht0 B_PR1(BO,BI,SIZE)
  1653. KERNEL4x2_SUB
  1654. KERNEL4x2_SUB
  1655. prefetcht0 A_PR1(AO,%rax,SIZE)
  1656. KERNEL4x2_SUB
  1657. KERNEL4x2_SUB
  1658. prefetcht0 A_PR1(AO,%rax,SIZE)
  1659. prefetcht0 B_PR1(BO,BI,SIZE)
  1660. KERNEL4x2_SUB
  1661. KERNEL4x2_SUB
  1662. prefetcht0 A_PR1(AO,%rax,SIZE)
  1663. KERNEL4x2_SUB
  1664. KERNEL4x2_SUB
  1665. je .L2_4_16
  1666. jmp .L2_4_12
  1667. ALIGN_4
  1668. .L2_4_16:
  1669. #ifndef TRMMKERNEL
  1670. movq K, %rax
  1671. #else
  1672. movq KKK, %rax
  1673. #endif
  1674. andq $ 7, %rax # if (k & 7)
  1675. je .L2_4_19
  1676. movq %rax, BI // Index for BO
  1677. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1678. salq $ 3, %rax // rax = rax * 8 ; number of values
  1679. leaq (AO, %rax, SIZE), AO
  1680. leaq (BO, BI, SIZE), BO
  1681. negq BI
  1682. negq %rax
  1683. ALIGN_4
  1684. .L2_4_17:
  1685. KERNEL4x2_SUB
  1686. jl .L2_4_17
  1687. ALIGN_4
  1688. .L2_4_19:
  1689. SAVE4x2
  1690. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1691. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1692. movq K, %rax
  1693. subq KKK, %rax
  1694. movq %rax, BI // Index for BO
  1695. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1696. leaq (BO, BI, SIZE), BO
  1697. salq $ 3, %rax // rax = rax * 8 ; number of values
  1698. leaq (AO, %rax, SIZE), AO
  1699. #endif
  1700. #if defined(TRMMKERNEL) && defined(LEFT)
  1701. addq $ 4, KK
  1702. #endif
  1703. addq $ 8 * SIZE, CO1 # coffset += 8
  1704. ALIGN_4
  1705. /**************************************************************************
  1706. * Rest of M
  1707. ***************************************************************************/
  1708. .L2_4_20:
  1709. testq $ 2, M
  1710. jz .L2_4_40
  1711. ALIGN_4
  1712. .L2_4_21:
  1713. #if !defined(TRMMKERNEL) || \
  1714. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1715. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1716. leaq BUFFER1, BO // first buffer to BO
  1717. addq $ 8 * SIZE, BO
  1718. #else
  1719. movq KK, %rax
  1720. leaq BUFFER1, BO // first buffer to BO
  1721. addq $ 8 * SIZE, BO
  1722. movq %rax, BI // Index for BO
  1723. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1724. leaq (BO, BI, SIZE), BO
  1725. salq $ 2, %rax // rax = rax * 4 ; number of values
  1726. leaq (AO, %rax, SIZE), AO
  1727. #endif
  1728. vzeroall
  1729. #ifndef TRMMKERNEL
  1730. movq K, %rax
  1731. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1732. movq K, %rax
  1733. subq KK, %rax
  1734. movq %rax, KKK
  1735. #else
  1736. movq KK, %rax
  1737. #ifdef LEFT
  1738. addq $ 2, %rax // number of values in AO
  1739. #else
  1740. addq $ 2, %rax // number of values in BO
  1741. #endif
  1742. movq %rax, KKK
  1743. #endif
  1744. andq $ -8, %rax // K = K - ( K % 8 )
  1745. je .L2_4_26
  1746. movq %rax, BI // Index for BO
  1747. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1748. salq $ 2, %rax // rax = rax * 4 ; number of values
  1749. leaq (AO, %rax, SIZE), AO
  1750. leaq (BO, BI, SIZE), BO
  1751. negq BI
  1752. negq %rax
  1753. ALIGN_4
  1754. .L2_4_22:
  1755. prefetcht0 A_PR1(AO,%rax,SIZE)
  1756. prefetcht0 B_PR1(BO,BI,SIZE)
  1757. KERNEL2x2_SUB
  1758. KERNEL2x2_SUB
  1759. KERNEL2x2_SUB
  1760. KERNEL2x2_SUB
  1761. prefetcht0 A_PR1(AO,%rax,SIZE)
  1762. prefetcht0 B_PR1(BO,BI,SIZE)
  1763. KERNEL2x2_SUB
  1764. KERNEL2x2_SUB
  1765. KERNEL2x2_SUB
  1766. KERNEL2x2_SUB
  1767. je .L2_4_26
  1768. prefetcht0 A_PR1(AO,%rax,SIZE)
  1769. prefetcht0 B_PR1(BO,BI,SIZE)
  1770. KERNEL2x2_SUB
  1771. KERNEL2x2_SUB
  1772. KERNEL2x2_SUB
  1773. KERNEL2x2_SUB
  1774. prefetcht0 A_PR1(AO,%rax,SIZE)
  1775. prefetcht0 B_PR1(BO,BI,SIZE)
  1776. KERNEL2x2_SUB
  1777. KERNEL2x2_SUB
  1778. KERNEL2x2_SUB
  1779. KERNEL2x2_SUB
  1780. je .L2_4_26
  1781. jmp .L2_4_22
  1782. ALIGN_4
  1783. .L2_4_26:
  1784. #ifndef TRMMKERNEL
  1785. movq K, %rax
  1786. #else
  1787. movq KKK, %rax
  1788. #endif
  1789. andq $ 7, %rax # if (k & 7)
  1790. je .L2_4_29
  1791. movq %rax, BI // Index for BO
  1792. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1793. salq $ 2, %rax // rax = rax * 4 ; number of values
  1794. leaq (AO, %rax, SIZE), AO
  1795. leaq (BO, BI, SIZE), BO
  1796. negq BI
  1797. negq %rax
  1798. ALIGN_4
  1799. .L2_4_27:
  1800. KERNEL2x2_SUB
  1801. jl .L2_4_27
  1802. ALIGN_4
  1803. .L2_4_29:
  1804. vbroadcastss ALPHA_R, %xmm0
  1805. vbroadcastss ALPHA_I, %xmm1
  1806. // swap real and imaginary parts (32-bit lanes) within each complex value
  1807. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  1808. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  1809. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1810. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1811. vaddsubps %xmm9, %xmm8 , %xmm8
  1812. vaddsubps %xmm11,%xmm10, %xmm10
  1813. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  1814. vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
  1815. #else
  1816. vaddsubps %xmm8, %xmm9 ,%xmm9
  1817. vaddsubps %xmm10, %xmm11,%xmm11
  1818. vmovaps %xmm9, %xmm8
  1819. vmovaps %xmm11, %xmm10
  1820. // swap real and imaginary parts (32-bit lanes) within each complex value
  1821. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  1822. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  1823. #endif
  1824. // multiply with ALPHA_R
  1825. vmulps %xmm8 , %xmm0, %xmm8
  1826. vmulps %xmm10, %xmm0, %xmm10
  1827. // multiply with ALPHA_I
  1828. vmulps %xmm9 , %xmm1, %xmm9
  1829. vmulps %xmm11, %xmm1, %xmm11
  1830. vaddsubps %xmm9, %xmm8 , %xmm8
  1831. vaddsubps %xmm11,%xmm10, %xmm10
  1832. #ifndef TRMMKERNEL
  1833. vaddps (CO1), %xmm8 , %xmm8
  1834. vaddps (CO1, LDC), %xmm10, %xmm10
  1835. #endif
  1836. vmovups %xmm8 , (CO1)
  1837. vmovups %xmm10 , (CO1, LDC)
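/* The block above is the in-line 2x2 store path (the 2x2 counterpart of the
 * SAVE*x2 macros used elsewhere): the accumulator pairs (xmm8,xmm9) and
 * (xmm10,xmm11) are first combined with vaddsubps, then scaled by alpha.
 * Worked per-element formula (illustration only):
 *
 *   //   t = tr + i*ti,  alpha = ar + i*ai
 *   //   alpha * t = (ar*tr - ai*ti) + i*(ar*ti + ai*tr)
 *
 * which is what the two vmulps with the broadcast ar / ai followed by the
 * final vaddsubps compute before C is added (non-TRMM case) and stored.
 */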
  1838. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1839. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1840. movq K, %rax
  1841. subq KKK, %rax
  1842. movq %rax, BI // Index for BO
  1843. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1844. leaq (BO, BI, SIZE), BO
  1845. salq $ 2, %rax // rax = rax * 4 ; number of values
  1846. leaq (AO, %rax, SIZE), AO
  1847. #endif
  1848. #if defined(TRMMKERNEL) && defined(LEFT)
  1849. addq $ 2, KK
  1850. #endif
  1851. addq $ 4 * SIZE, CO1 # coffset += 4
  1852. decq I # i --
  1853. jg .L2_4_21
  1854. ALIGN_4
  1855. /**************************************************************************/
  1856. .L2_4_40:
  1857. testq $ 1, M
  1858. jz .L2_4_60 // to next 2 lines of N
  1859. ALIGN_4
  1860. .L2_4_41:
  1861. #if !defined(TRMMKERNEL) || \
  1862. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1863. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1864. leaq BUFFER1, BO // first buffer to BO
  1865. addq $ 8 * SIZE, BO
  1866. #else
  1867. movq KK, %rax
  1868. leaq BUFFER1, BO // first buffer to BO
  1869. addq $ 8 * SIZE, BO
  1870. movq %rax, BI // Index for BO
  1871. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  1872. leaq (BO, BI, SIZE), BO
  1873. salq $ 1, %rax // rax = rax * 2 ; number of values
  1874. leaq (AO, %rax, SIZE), AO
  1875. #endif
  1876. vzeroall
  1877. #ifndef TRMMKERNEL
  1878. movq K, %rax
  1879. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1880. movq K, %rax
  1881. subq KK, %rax
  1882. movq %rax, KKK
  1883. #else
  1884. movq KK, %rax
  1885. #ifdef LEFT
  1886. addq $ 1, %rax // number of values in AO
  1887. #else
  1888. addq $ 2, %rax // number of values in BO
  1889. #endif
  1890. movq %rax, KKK
  1891. #endif
  1892. andq $ -8, %rax // K = K - ( K % 8 )
  1893. je .L2_4_46
  1894. movq %rax, BI // Index for BO
  1895. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1896. salq $ 1, %rax // rax = rax * 2 ; number of values
  1897. leaq (AO, %rax, SIZE), AO
  1898. leaq (BO, BI, SIZE), BO
  1899. negq BI
  1900. negq %rax
  1901. ALIGN_4
  1902. .L2_4_42:
  1903. prefetcht0 A_PR1(AO,%rax,SIZE)
  1904. prefetcht0 B_PR1(BO,BI,SIZE)
  1905. KERNEL1x2_SUB
  1906. KERNEL1x2_SUB
  1907. KERNEL1x2_SUB
  1908. KERNEL1x2_SUB
  1909. prefetcht0 B_PR1(BO,BI,SIZE)
  1910. KERNEL1x2_SUB
  1911. KERNEL1x2_SUB
  1912. KERNEL1x2_SUB
  1913. KERNEL1x2_SUB
  1914. je .L2_4_46
  1915. prefetcht0 A_PR1(AO,%rax,SIZE)
  1916. prefetcht0 B_PR1(BO,BI,SIZE)
  1917. KERNEL1x2_SUB
  1918. KERNEL1x2_SUB
  1919. KERNEL1x2_SUB
  1920. KERNEL1x2_SUB
  1921. prefetcht0 B_PR1(BO,BI,SIZE)
  1922. KERNEL1x2_SUB
  1923. KERNEL1x2_SUB
  1924. KERNEL1x2_SUB
  1925. KERNEL1x2_SUB
  1926. je .L2_4_46
  1927. jmp .L2_4_42
  1928. ALIGN_4
  1929. .L2_4_46:
  1930. #ifndef TRMMKERNEL
  1931. movq K, %rax
  1932. #else
  1933. movq KKK, %rax
  1934. #endif
  1935. andq $ 7, %rax # if (k & 7)
  1936. je .L2_4_49
  1937. movq %rax, BI // Index for BO
  1938. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1939. salq $ 1, %rax // rax = rax * 2 ; number of values
  1940. leaq (AO, %rax, SIZE), AO
  1941. leaq (BO, BI, SIZE), BO
  1942. negq BI
  1943. negq %rax
  1944. ALIGN_4
  1945. .L2_4_47:
  1946. KERNEL1x2_SUB
  1947. jl .L2_4_47
  1948. ALIGN_4
  1949. .L2_4_49:
  1950. SAVE1x2
  1951. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1952. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1953. movq K, %rax
  1954. subq KKK, %rax
  1955. movq %rax, BI // Index for BO
  1956. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1957. leaq (BO, BI, SIZE), BO
  1958. salq $ 1, %rax // rax = rax * 2 ; number of values
  1959. leaq (AO, %rax, SIZE), AO
  1960. #endif
  1961. #if defined(TRMMKERNEL) && defined(LEFT)
  1962. addq $ 1, KK
  1963. #endif
  1964. addq $ 2 * SIZE, CO1 # coffset += 2
  1965. decq I # i --
  1966. jg .L2_4_41
  1967. ALIGN_4
  1968. .L2_4_60:
  1969. #if defined(TRMMKERNEL) && !defined(LEFT)
  1970. addq $ 2, KK
  1971. #endif
  1972. decq J // j --
  1973. jg .L2_01 // next 2 lines of N
  1974. .L1_0:
  1975. /************************************************************************************************
  1976. * Loop for Nmod6 % 2 > 0
  1977. *************************************************************************************************/
  1978. movq Nmod6, J
  1979. andq $ 1, J // j % 2
  1980. je .L999
  1981. ALIGN_4
  1982. .L1_01:
  1983. // copy to sub buffer
  1984. movq B, BO1
  1985. leaq BUFFER1, BO // first buffer to BO
  1986. movq K, %rax
  1987. ALIGN_4
  1988. .L1_02b:
  1989. vmovsd (BO1), %xmm0
  1990. vmovsd %xmm0, (BO)
  1991. addq $ 2*SIZE,BO1
  1992. addq $ 2*SIZE,BO
  1993. decq %rax
  1994. jnz .L1_02b
  1995. .L1_02c:
  1996. movq BO1, B // next offset of B
  1997. .L1_10:
  1998. movq C, CO1
  1999. leaq (C, LDC, 1), C // c += 1 * ldc
  2000. #if defined(TRMMKERNEL) && defined(LEFT)
  2001. movq OFFSET, %rax
  2002. movq %rax, KK
  2003. #endif
  2004. movq A, AO // aoffset = a
  2005. addq $ 16 * SIZE, AO
  2006. movq M, I
  2007. sarq $ 3, I // i = (m >> 3)
  2008. je .L1_4_10
  2009. ALIGN_4
  2010. /**************************************************************************************************/
  2011. .L1_8_11:
  2012. #if !defined(TRMMKERNEL) || \
  2013. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2014. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2015. leaq BUFFER1, BO // first buffer to BO
  2016. addq $ 4 * SIZE, BO
  2017. #else
  2018. movq KK, %rax
  2019. leaq BUFFER1, BO // first buffer to BO
  2020. addq $ 4 * SIZE, BO
  2021. movq %rax, BI // Index for BO
  2022. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  2023. leaq (BO, BI, SIZE), BO
  2024. salq $ 4, %rax // rax = rax *16 ; number of values
  2025. leaq (AO, %rax, SIZE), AO
  2026. #endif
  2027. vzeroall
  2028. #ifndef TRMMKERNEL
  2029. movq K, %rax
  2030. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2031. movq K, %rax
  2032. subq KK, %rax
  2033. movq %rax, KKK
  2034. #else
  2035. movq KK, %rax
  2036. #ifdef LEFT
  2037. addq $ 8, %rax // number of values in AO
  2038. #else
  2039. addq $ 1, %rax // number of values in BO
  2040. #endif
  2041. movq %rax, KKK
  2042. #endif
  2043. andq $ -8, %rax // K = K - ( K % 8 )
  2044. je .L1_8_16
  2045. movq %rax, BI // Index for BO
  2046. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2047. salq $ 4, %rax // rax = rax *16 ; number of values
  2048. leaq (AO, %rax, SIZE), AO
  2049. leaq (BO, BI, SIZE), BO
  2050. negq BI
  2051. negq %rax
  2052. ALIGN_4
  2053. .L1_8_12:
  2054. prefetcht0 A_PR1(AO,%rax,SIZE)
  2055. prefetcht0 B_PR1(BO,BI,SIZE)
  2056. KERNEL8x1_SUB
  2057. prefetcht0 A_PR1(AO,%rax,SIZE)
  2058. KERNEL8x1_SUB
  2059. prefetcht0 A_PR1(AO,%rax,SIZE)
  2060. KERNEL8x1_SUB
  2061. prefetcht0 A_PR1(AO,%rax,SIZE)
  2062. KERNEL8x1_SUB
  2063. prefetcht0 A_PR1(AO,%rax,SIZE)
  2064. KERNEL8x1_SUB
  2065. prefetcht0 A_PR1(AO,%rax,SIZE)
  2066. KERNEL8x1_SUB
  2067. prefetcht0 A_PR1(AO,%rax,SIZE)
  2068. KERNEL8x1_SUB
  2069. prefetcht0 A_PR1(AO,%rax,SIZE)
  2070. KERNEL8x1_SUB
  2071. je .L1_8_16
  2072. prefetcht0 A_PR1(AO,%rax,SIZE)
  2073. prefetcht0 B_PR1(BO,BI,SIZE)
  2074. KERNEL8x1_SUB
  2075. prefetcht0 A_PR1(AO,%rax,SIZE)
  2076. KERNEL8x1_SUB
  2077. prefetcht0 A_PR1(AO,%rax,SIZE)
  2078. KERNEL8x1_SUB
  2079. prefetcht0 A_PR1(AO,%rax,SIZE)
  2080. KERNEL8x1_SUB
  2081. prefetcht0 A_PR1(AO,%rax,SIZE)
  2082. KERNEL8x1_SUB
  2083. prefetcht0 A_PR1(AO,%rax,SIZE)
  2084. KERNEL8x1_SUB
  2085. prefetcht0 A_PR1(AO,%rax,SIZE)
  2086. KERNEL8x1_SUB
  2087. prefetcht0 A_PR1(AO,%rax,SIZE)
  2088. KERNEL8x1_SUB
  2089. je .L1_8_16
  2090. jmp .L1_8_12
  2091. ALIGN_4
  2092. .L1_8_16:
  2093. #ifndef TRMMKERNEL
  2094. movq K, %rax
  2095. #else
  2096. movq KKK, %rax
  2097. #endif
  2098. andq $ 7, %rax # if (k & 7)
  2099. je .L1_8_19
  2100. movq %rax, BI // Index for BO
  2101. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2102. salq $ 4, %rax // rax = rax *16 ; number of values
  2103. leaq (AO, %rax, SIZE), AO
  2104. leaq (BO, BI, SIZE), BO
  2105. negq BI
  2106. negq %rax
  2107. ALIGN_4
  2108. .L1_8_17:
  2109. KERNEL8x1_SUB
  2110. jl .L1_8_17
  2111. ALIGN_4
  2112. .L1_8_19:
  2113. SAVE8x1
  2114. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2115. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2116. movq K, %rax
  2117. subq KKK, %rax
  2118. movq %rax, BI // Index for BO
  2119. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2120. leaq (BO, BI, SIZE), BO
  2121. salq $ 4, %rax // rax = rax *16 ; number of values
  2122. leaq (AO, %rax, SIZE), AO
  2123. #endif
  2124. #if defined(TRMMKERNEL) && defined(LEFT)
  2125. addq $ 8, KK
  2126. #endif
  2127. addq $ 16 * SIZE, CO1 # coffset += 16
  2128. decq I # i --
  2129. jg .L1_8_11
  2130. ALIGN_4
  2131. /**************************************************************************************************/
  2132. .L1_4_10:
  2133. testq $ 7, M
  2134. jz .L999
  2135. testq $ 4, M
  2136. jz .L1_4_20
  2137. .L1_4_11:
  2138. #if !defined(TRMMKERNEL) || \
  2139. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2140. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2141. leaq BUFFER1, BO // first buffer to BO
  2142. addq $ 4 * SIZE, BO
  2143. #else
  2144. movq KK, %rax
  2145. leaq BUFFER1, BO // first buffer to BO
  2146. addq $ 4 * SIZE, BO
  2147. movq %rax, BI // Index for BO
  2148. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  2149. leaq (BO, BI, SIZE), BO
  2150. salq $ 3, %rax // rax = rax * 8 ; number of values
  2151. leaq (AO, %rax, SIZE), AO
  2152. #endif
  2153. vzeroall
  2154. #ifndef TRMMKERNEL
  2155. movq K, %rax
  2156. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2157. movq K, %rax
  2158. subq KK, %rax
  2159. movq %rax, KKK
  2160. #else
  2161. movq KK, %rax
  2162. #ifdef LEFT
  2163. addq $ 4, %rax // number of values in AO
  2164. #else
  2165. addq $ 1, %rax // number of values in BO
  2166. #endif
  2167. movq %rax, KKK
  2168. #endif
  2169. andq $ -8, %rax // K = K - ( K % 8 )
  2170. je .L1_4_16
  2171. movq %rax, BI // Index for BO
  2172. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2173. salq $ 3, %rax // rax = rax * 8 ; number of values
  2174. leaq (AO, %rax, SIZE), AO
  2175. leaq (BO, BI, SIZE), BO
  2176. negq BI
  2177. negq %rax
  2178. ALIGN_4
  2179. .L1_4_12:
  2180. prefetcht0 A_PR1(AO,%rax,SIZE)
  2181. prefetcht0 B_PR1(BO,BI,SIZE)
  2182. KERNEL4x1_SUB
  2183. KERNEL4x1_SUB
  2184. prefetcht0 A_PR1(AO,%rax,SIZE)
  2185. KERNEL4x1_SUB
  2186. KERNEL4x1_SUB
  2187. prefetcht0 A_PR1(AO,%rax,SIZE)
  2188. KERNEL4x1_SUB
  2189. KERNEL4x1_SUB
  2190. prefetcht0 A_PR1(AO,%rax,SIZE)
  2191. KERNEL4x1_SUB
  2192. KERNEL4x1_SUB
  2193. je .L1_4_16
  2194. prefetcht0 A_PR1(AO,%rax,SIZE)
  2195. prefetcht0 B_PR1(BO,BI,SIZE)
  2196. KERNEL4x1_SUB
  2197. KERNEL4x1_SUB
  2198. prefetcht0 A_PR1(AO,%rax,SIZE)
  2199. KERNEL4x1_SUB
  2200. KERNEL4x1_SUB
  2201. prefetcht0 A_PR1(AO,%rax,SIZE)
  2202. KERNEL4x1_SUB
  2203. KERNEL4x1_SUB
  2204. prefetcht0 A_PR1(AO,%rax,SIZE)
  2205. KERNEL4x1_SUB
  2206. KERNEL4x1_SUB
  2207. je .L1_4_16
  2208. jmp .L1_4_12
  2209. ALIGN_4
  2210. .L1_4_16:
  2211. #ifndef TRMMKERNEL
  2212. movq K, %rax
  2213. #else
  2214. movq KKK, %rax
  2215. #endif
  2216. andq $ 7, %rax # if (k & 7)
  2217. je .L1_4_19
  2218. movq %rax, BI // Index for BO
  2219. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2220. salq $ 3, %rax // rax = rax * 8 ; number of values
  2221. leaq (AO, %rax, SIZE), AO
  2222. leaq (BO, BI, SIZE), BO
  2223. negq BI
  2224. negq %rax
  2225. ALIGN_4
  2226. .L1_4_17:
  2227. KERNEL4x1_SUB
  2228. jl .L1_4_17
  2229. ALIGN_4
  2230. .L1_4_19:
  2231. SAVE4x1
  2232. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2233. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2234. movq K, %rax
  2235. subq KKK, %rax
  2236. movq %rax, BI // Index for BO
  2237. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2238. leaq (BO, BI, SIZE), BO
  2239. salq $ 3, %rax // rax = rax * 8 ; number of values
  2240. leaq (AO, %rax, SIZE), AO
  2241. #endif
  2242. #if defined(TRMMKERNEL) && defined(LEFT)
  2243. addq $ 4, KK
  2244. #endif
  2245. addq $ 8 * SIZE, CO1 # coffset += 8
  2246. ALIGN_4
  2247. /**************************************************************************
  2248. * Rest of M
  2249. ***************************************************************************/
  2250. .L1_4_20:
  2251. testq $ 2, M
  2252. jz .L1_4_40
  2253. ALIGN_4
  2254. .L1_4_21:
  2255. #if !defined(TRMMKERNEL) || \
  2256. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2257. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2258. leaq BUFFER1, BO // first buffer to BO
  2259. addq $ 4 * SIZE, BO
  2260. #else
  2261. movq KK, %rax
  2262. leaq BUFFER1, BO // first buffer to BO
  2263. addq $ 4 * SIZE, BO
  2264. movq %rax, BI // Index for BO
  2265. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  2266. leaq (BO, BI, SIZE), BO
  2267. salq $ 2, %rax // rax = rax * 4 ; number of values
  2268. leaq (AO, %rax, SIZE), AO
  2269. #endif
  2270. vzeroall
  2271. #ifndef TRMMKERNEL
  2272. movq K, %rax
  2273. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2274. movq K, %rax
  2275. subq KK, %rax
  2276. movq %rax, KKK
  2277. #else
  2278. movq KK, %rax
  2279. #ifdef LEFT
  2280. addq $ 2, %rax // number of values in AO
  2281. #else
  2282. addq $ 1, %rax // number of values in BO
  2283. #endif
  2284. movq %rax, KKK
  2285. #endif
  2286. andq $ -8, %rax // K = K - ( K % 8 )
  2287. je .L1_4_26
  2288. movq %rax, BI // Index for BO
  2289. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2290. salq $ 2, %rax // rax = rax * 4 ; number of values
  2291. leaq (AO, %rax, SIZE), AO
  2292. leaq (BO, BI, SIZE), BO
  2293. negq BI
  2294. negq %rax
  2295. ALIGN_4
  2296. .L1_4_22:
  2297. prefetcht0 A_PR1(AO,%rax,SIZE)
  2298. prefetcht0 B_PR1(BO,BI,SIZE)
  2299. KERNEL2x1_SUB
  2300. KERNEL2x1_SUB
  2301. KERNEL2x1_SUB
  2302. KERNEL2x1_SUB
  2303. prefetcht0 A_PR1(AO,%rax,SIZE)
  2304. KERNEL2x1_SUB
  2305. KERNEL2x1_SUB
  2306. KERNEL2x1_SUB
  2307. KERNEL2x1_SUB
  2308. je .L1_4_26
  2309. prefetcht0 A_PR1(AO,%rax,SIZE)
  2310. prefetcht0 B_PR1(BO,BI,SIZE)
  2311. KERNEL2x1_SUB
  2312. KERNEL2x1_SUB
  2313. KERNEL2x1_SUB
  2314. KERNEL2x1_SUB
  2315. prefetcht0 A_PR1(AO,%rax,SIZE)
  2316. KERNEL2x1_SUB
  2317. KERNEL2x1_SUB
  2318. KERNEL2x1_SUB
  2319. KERNEL2x1_SUB
  2320. je .L1_4_26
  2321. jmp .L1_4_22
  2322. ALIGN_4
  2323. .L1_4_26:
  2324. #ifndef TRMMKERNEL
  2325. movq K, %rax
  2326. #else
  2327. movq KKK, %rax
  2328. #endif
  2329. andq $ 7, %rax # if (k & 7)
  2330. je .L1_4_29
  2331. movq %rax, BI // Index for BO
  2332. leaq ( ,BI,2), BI // BI = BI * 2; number of values
  2333. salq $ 2, %rax // rax = rax * 4 ; number of values
  2334. leaq (AO, %rax, SIZE), AO
  2335. leaq (BO, BI, SIZE), BO
  2336. negq BI
  2337. negq %rax
  2338. ALIGN_4
  2339. .L1_4_27:
  2340. KERNEL2x1_SUB
  2341. jl .L1_4_27
  2342. ALIGN_4
  2343. .L1_4_29:
  2344. SAVE2x1
  2345. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2346. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2347. movq K, %rax
  2348. subq KKK, %rax
  2349. movq %rax, BI // Index for BO
  2350. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2351. leaq (BO, BI, SIZE), BO
  2352. salq $ 2, %rax // rax = rax * 4 ; number of values
  2353. leaq (AO, %rax, SIZE), AO
  2354. #endif
  2355. #if defined(TRMMKERNEL) && defined(LEFT)
  2356. addq $ 2, KK
  2357. #endif
  2358. addq $ 4 * SIZE, CO1 # coffset += 4
  2359. ALIGN_4
  2360. /**************************************************************************/
  2361. .L1_4_40:
  2362. testq $ 1, M
  2363. jz .L999 // done
  2364. ALIGN_4
  2365. .L1_4_41:
  2366. #if !defined(TRMMKERNEL) || \
  2367. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2368. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2369. leaq BUFFER1, BO // first buffer to BO
  2370. addq $ 4 * SIZE, BO
  2371. #else
  2372. movq KK, %rax
  2373. leaq BUFFER1, BO // first buffer to BO
  2374. addq $ 4 * SIZE, BO
  2375. movq %rax, BI // Index for BO
  2376. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  2377. leaq (BO, BI, SIZE), BO
  2378. salq $ 1, %rax // rax = rax * 2 ; number of values
  2379. leaq (AO, %rax, SIZE), AO
  2380. #endif
  2381. vzeroall
  2382. #ifndef TRMMKERNEL
  2383. movq K, %rax
  2384. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2385. movq K, %rax
  2386. subq KK, %rax
  2387. movq %rax, KKK
  2388. #else
  2389. movq KK, %rax
  2390. #ifdef LEFT
  2391. addq $ 1, %rax // number of values in AO
  2392. #else
  2393. addq $ 1, %rax // number of values in BO
  2394. #endif
  2395. movq %rax, KKK
  2396. #endif
  2397. andq $ -8, %rax // K = K - ( K % 8 )
  2398. je .L1_4_46
  2399. movq %rax, BI // Index for BO
  2400. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2401. salq $ 1, %rax // rax = rax * 2 ; number of values
  2402. leaq (AO, %rax, SIZE), AO
  2403. leaq (BO, BI, SIZE), BO
  2404. negq BI
  2405. negq %rax
  2406. ALIGN_4
  2407. .L1_4_42:
  2408. prefetcht0 A_PR1(AO,%rax,SIZE)
  2409. prefetcht0 B_PR1(BO,BI,SIZE)
  2410. KERNEL1x1_SUB
  2411. KERNEL1x1_SUB
  2412. KERNEL1x1_SUB
  2413. KERNEL1x1_SUB
  2414. KERNEL1x1_SUB
  2415. KERNEL1x1_SUB
  2416. KERNEL1x1_SUB
  2417. KERNEL1x1_SUB
  2418. je .L1_4_46
  2419. prefetcht0 A_PR1(AO,%rax,SIZE)
  2420. prefetcht0 B_PR1(BO,BI,SIZE)
  2421. KERNEL1x1_SUB
  2422. KERNEL1x1_SUB
  2423. KERNEL1x1_SUB
  2424. KERNEL1x1_SUB
  2425. KERNEL1x1_SUB
  2426. KERNEL1x1_SUB
  2427. KERNEL1x1_SUB
  2428. KERNEL1x1_SUB
  2429. je .L1_4_46
  2430. jmp .L1_4_42
  2431. ALIGN_4
  2432. .L1_4_46:
  2433. #ifndef TRMMKERNEL
  2434. movq K, %rax
  2435. #else
  2436. movq KKK, %rax
  2437. #endif
  2438. andq $ 7, %rax # if (k & 7)
  2439. je .L1_4_49
  2440. movq %rax, BI // Index for BO
  2441. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2442. salq $ 1, %rax // rax = rax * 2 ; number of values
  2443. leaq (AO, %rax, SIZE), AO
  2444. leaq (BO, BI, SIZE), BO
  2445. negq BI
  2446. negq %rax
  2447. ALIGN_4
  2448. .L1_4_47:
  2449. KERNEL1x1_SUB
  2450. jl .L1_4_47
  2451. ALIGN_4
  2452. .L1_4_49:
  2453. SAVE1x1
  2454. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2455. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2456. movq K, %rax
  2457. subq KKK, %rax
  2458. movq %rax, BI // Index for BO
  2459. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  2460. leaq (BO, BI, SIZE), BO
  2461. salq $ 1, %rax // rax = rax * 2 ; number of values
  2462. leaq (AO, %rax, SIZE), AO
  2463. #endif
  2464. #if defined(TRMMKERNEL) && defined(LEFT)
  2465. addq $ 1, KK
  2466. #endif
  2467. addq $ 2 * SIZE, CO1 # coffset += 2
  2468. ALIGN_4
  2469. .L999:
  2470. vzeroupper
  2471. movq SP, %rsp
  2472. movq (%rsp), %rbx
  2473. movq 8(%rsp), %rbp
  2474. movq 16(%rsp), %r12
  2475. movq 24(%rsp), %r13
  2476. movq 32(%rsp), %r14
  2477. movq 40(%rsp), %r15
  2478. #ifdef WINDOWS_ABI
  2479. movq 48(%rsp), %rdi
  2480. movq 56(%rsp), %rsi
  2481. vmovups 64(%rsp), %xmm6
  2482. vmovups 80(%rsp), %xmm7
  2483. vmovups 96(%rsp), %xmm8
  2484. vmovups 112(%rsp), %xmm9
  2485. vmovups 128(%rsp), %xmm10
  2486. vmovups 144(%rsp), %xmm11
  2487. vmovups 160(%rsp), %xmm12
  2488. vmovups 176(%rsp), %xmm13
  2489. vmovups 192(%rsp), %xmm14
  2490. vmovups 208(%rsp), %xmm15
  2491. #endif
  2492. addq $ STACKSIZE, %rsp
  2493. ret
  2494. EPILOGUE
  2495. #else
  2496. /************************************************************************************************/
  2497. PROLOGUE
  2498. PROFCODE
  2499. subq $ STACKSIZE, %rsp
  2500. movq %rbx, (%rsp)
  2501. movq %rbp, 8(%rsp)
  2502. movq %r12, 16(%rsp)
  2503. movq %r13, 24(%rsp)
  2504. movq %r14, 32(%rsp)
  2505. movq %r15, 40(%rsp)
  2506. vzeroupper
  2507. #ifdef WINDOWS_ABI
  2508. movq %rdi, 48(%rsp)
  2509. movq %rsi, 56(%rsp)
  2510. vmovups %xmm6, 64(%rsp)
  2511. vmovups %xmm7, 80(%rsp)
  2512. vmovups %xmm8, 96(%rsp)
  2513. vmovups %xmm9, 112(%rsp)
  2514. vmovups %xmm10, 128(%rsp)
  2515. vmovups %xmm11, 144(%rsp)
  2516. vmovups %xmm12, 160(%rsp)
  2517. vmovups %xmm13, 176(%rsp)
  2518. vmovups %xmm14, 192(%rsp)
  2519. vmovups %xmm15, 208(%rsp)
  2520. movq ARG1, OLD_M
  2521. movq ARG2, OLD_N
  2522. movq ARG3, OLD_K
  2523. movq OLD_A, A
  2524. movq OLD_B, B
  2525. movq OLD_C, C
  2526. movq OLD_LDC, LDC
  2527. #ifdef TRMMKERNEL
  2528. movsd OLD_OFFSET, %xmm12
  2529. #endif
  2530. vmovaps %xmm3, %xmm0
  2531. vmovsd OLD_ALPHA_I, %xmm1
  2532. #else
  2533. movq STACKSIZE + 8(%rsp), LDC
  2534. #ifdef TRMMKERNEL
  2535. movsd STACKSIZE + 16(%rsp), %xmm12
  2536. #endif
  2537. #endif
  2538. movq %rsp, SP # save old stack
  2539. subq $ 128 + L_BUFFER_SIZE, %rsp
  2540. andq $ -4096, %rsp # align stack
  2541. STACK_TOUCH
  2542. cmpq $ 0, OLD_M
  2543. je .L999
  2544. cmpq $ 0, OLD_N
  2545. je .L999
  2546. cmpq $ 0, OLD_K
  2547. je .L999
  2548. movq OLD_M, M
  2549. movq OLD_N, N
  2550. movq OLD_K, K
  2551. vmovss %xmm0, ALPHA_R
  2552. vmovss %xmm1, ALPHA_I
  2553. salq $ ZBASE_SHIFT, LDC
  2554. movq N, %rax
  2555. xorq %rdx, %rdx
  2556. movq $ 2, %rdi
  2557. divq %rdi // N / 2
  2558. movq %rax, Ndiv6 // N / 2
  2559. movq %rdx, Nmod6 // N % 2
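// Note: in this second build of the kernel (the branch following the #else
// above) the Ndiv6 / Nmod6 slots actually hold N / 2 and N % 2, as the
// comments say; only the 2-column (.L2_xx) and 1-column (.L1_xx) paths exist
// in this variant.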
  2560. #ifdef TRMMKERNEL
  2561. vmovsd %xmm12, OFFSET
  2562. vmovsd %xmm12, KK
  2563. #ifndef LEFT
  2564. negq KK
  2565. #endif
  2566. #endif
  2567. .L2_0:
  2568. movq Ndiv6, J
  2569. cmpq $ 0, J
  2570. je .L1_0
  2571. ALIGN_4
  2572. .L2_01:
  2573. // copy to sub buffer
  2574. movq B, BO1
  2575. leaq BUFFER1, BO // first buffer to BO
  2576. movq K, %rax
  2577. ALIGN_4
  2578. .L2_02b:
  2579. vmovups (BO1), %xmm0
  2580. vmovups %xmm0, (BO)
  2581. addq $ 4*SIZE,BO1
  2582. addq $ 4*SIZE,BO
  2583. decq %rax
  2584. jnz .L2_02b
  2585. .L2_02c:
  2586. movq BO1, B // next offset of B
  2587. .L2_10:
  2588. movq C, CO1
  2589. leaq (C, LDC, 2), C // c += 2 * ldc
  2590. #if defined(TRMMKERNEL) && defined(LEFT)
  2591. movq OFFSET, %rax
  2592. movq %rax, KK
  2593. #endif
  2594. movq A, AO // aoffset = a
  2595. addq $ 16 * SIZE, AO
  2596. movq M, I
  2597. sarq $ 3, I // i = (m >> 3)
  2598. je .L2_4_10
  2599. ALIGN_4
  2600. /**********************************************************************************************************/
  2601. .L2_8_11:
  2602. #if !defined(TRMMKERNEL) || \
  2603. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2604. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2605. leaq BUFFER1, BO // first buffer to BO
  2606. addq $ 8 * SIZE, BO
  2607. #else
  2608. movq KK, %rax
  2609. leaq BUFFER1, BO // first buffer to BO
  2610. addq $ 8 * SIZE, BO
  2611. movq %rax, BI // Index for BO
  2612. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  2613. leaq (BO, BI, SIZE), BO
  2614. salq $ 4, %rax // rax = rax *16 ; number of values
  2615. leaq (AO, %rax, SIZE), AO
  2616. #endif
  2617. vzeroall
  2618. #ifndef TRMMKERNEL
  2619. movq K, %rax
  2620. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2621. movq K, %rax
  2622. subq KK, %rax
  2623. movq %rax, KKK
  2624. #else
  2625. movq KK, %rax
  2626. #ifdef LEFT
  2627. addq $ 8, %rax // number of values in AO
  2628. #else
  2629. addq $ 2, %rax // number of values in BO
  2630. #endif
  2631. movq %rax, KKK
  2632. #endif
  2633. andq $ -8, %rax // K = K - ( K % 8 )
  2634. je .L2_8_16
  2635. movq %rax, BI // Index for BO
  2636. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2637. salq $ 4, %rax // rax = rax *16 ; number of values
  2638. leaq (AO, %rax, SIZE), AO
  2639. leaq (BO, BI, SIZE), BO
  2640. negq BI
  2641. negq %rax
  2642. ALIGN_4
  2643. .L2_8_12:
  2644. prefetcht0 A_PR1(AO,%rax,SIZE)
  2645. prefetcht0 B_PR1(BO,BI,SIZE)
  2646. KERNEL8x2_SUB
  2647. prefetcht0 A_PR1(AO,%rax,SIZE)
  2648. KERNEL8x2_SUB
  2649. prefetcht0 A_PR1(AO,%rax,SIZE)
  2650. KERNEL8x2_SUB
  2651. prefetcht0 A_PR1(AO,%rax,SIZE)
  2652. KERNEL8x2_SUB
  2653. prefetcht0 A_PR1(AO,%rax,SIZE)
  2654. prefetcht0 B_PR1(BO,BI,SIZE)
  2655. KERNEL8x2_SUB
  2656. prefetcht0 A_PR1(AO,%rax,SIZE)
  2657. KERNEL8x2_SUB
  2658. prefetcht0 A_PR1(AO,%rax,SIZE)
  2659. KERNEL8x2_SUB
  2660. prefetcht0 A_PR1(AO,%rax,SIZE)
  2661. KERNEL8x2_SUB
  2662. je .L2_8_16
  2663. prefetcht0 A_PR1(AO,%rax,SIZE)
  2664. prefetcht0 B_PR1(BO,BI,SIZE)
  2665. KERNEL8x2_SUB
  2666. prefetcht0 A_PR1(AO,%rax,SIZE)
  2667. KERNEL8x2_SUB
  2668. prefetcht0 A_PR1(AO,%rax,SIZE)
  2669. KERNEL8x2_SUB
  2670. prefetcht0 A_PR1(AO,%rax,SIZE)
  2671. KERNEL8x2_SUB
  2672. prefetcht0 A_PR1(AO,%rax,SIZE)
  2673. prefetcht0 B_PR1(BO,BI,SIZE)
  2674. KERNEL8x2_SUB
  2675. prefetcht0 A_PR1(AO,%rax,SIZE)
  2676. KERNEL8x2_SUB
  2677. prefetcht0 A_PR1(AO,%rax,SIZE)
  2678. KERNEL8x2_SUB
  2679. prefetcht0 A_PR1(AO,%rax,SIZE)
  2680. KERNEL8x2_SUB
  2681. je .L2_8_16
  2682. jmp .L2_8_12
  2683. ALIGN_4
  2684. .L2_8_16:
  2685. #ifndef TRMMKERNEL
  2686. movq K, %rax
  2687. #else
  2688. movq KKK, %rax
  2689. #endif
  2690. andq $ 7, %rax # if (k & 7)
  2691. je .L2_8_19
  2692. movq %rax, BI // Index for BO
  2693. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2694. salq $ 4, %rax // rax = rax *16 ; number of values
  2695. leaq (AO, %rax, SIZE), AO
  2696. leaq (BO, BI, SIZE), BO
  2697. negq BI
  2698. negq %rax
  2699. ALIGN_4
  2700. .L2_8_17:
  2701. KERNEL8x2_SUB
  2702. jl .L2_8_17
  2703. ALIGN_4
  2704. .L2_8_19:
  2705. SAVE8x2
  2706. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2707. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2708. movq K, %rax
  2709. subq KKK, %rax
  2710. movq %rax, BI // Index for BO
  2711. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2712. leaq (BO, BI, SIZE), BO
  2713. salq $ 4, %rax // rax = rax *16 ; number of values
  2714. leaq (AO, %rax, SIZE), AO
  2715. #endif
  2716. #if defined(TRMMKERNEL) && defined(LEFT)
  2717. addq $ 8, KK
  2718. #endif
  2719. addq $ 16 * SIZE, CO1 # coffset += 16
  2720. decq I # i --
  2721. jg .L2_8_11
  2722. ALIGN_4
  2723. /**********************************************************************************************************/
  2724. .L2_4_10:
  2725. testq $ 7, M
  2726. jz .L2_4_60 // to next 2 lines of N
  2727. testq $ 4, M
  2728. jz .L2_4_20
  2729. ALIGN_4
  2730. .L2_4_11:
  2731. #if !defined(TRMMKERNEL) || \
  2732. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2733. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2734. leaq BUFFER1, BO // first buffer to BO
  2735. addq $ 8 * SIZE, BO
  2736. #else
  2737. movq KK, %rax
  2738. leaq BUFFER1, BO // first buffer to BO
  2739. addq $ 8 * SIZE, BO
  2740. movq %rax, BI // Index for BO
  2741. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  2742. leaq (BO, BI, SIZE), BO
  2743. salq $ 3, %rax // rax = rax * 8 ; number of values
  2744. leaq (AO, %rax, SIZE), AO
  2745. #endif
  2746. vzeroall
  2747. #ifndef TRMMKERNEL
  2748. movq K, %rax
  2749. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2750. movq K, %rax
  2751. subq KK, %rax
  2752. movq %rax, KKK
  2753. #else
  2754. movq KK, %rax
  2755. #ifdef LEFT
  2756. addq $ 4, %rax // number of values in AO
  2757. #else
  2758. addq $ 2, %rax // number of values in BO
  2759. #endif
  2760. movq %rax, KKK
  2761. #endif
  2762. andq $ -8, %rax // K = K - ( K % 8 )
  2763. je .L2_4_16
  2764. movq %rax, BI // Index for BO
  2765. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2766. salq $ 3, %rax // rax = rax * 8 ; number of values
  2767. leaq (AO, %rax, SIZE), AO
  2768. leaq (BO, BI, SIZE), BO
  2769. negq BI
  2770. negq %rax
  2771. ALIGN_4
  2772. .L2_4_12:
  2773. prefetcht0 A_PR1(AO,%rax,SIZE)
  2774. prefetcht0 B_PR1(BO,BI,SIZE)
  2775. KERNEL4x2_SUB
  2776. KERNEL4x2_SUB
  2777. prefetcht0 A_PR1(AO,%rax,SIZE)
  2778. KERNEL4x2_SUB
  2779. KERNEL4x2_SUB
  2780. prefetcht0 A_PR1(AO,%rax,SIZE)
  2781. prefetcht0 B_PR1(BO,BI,SIZE)
  2782. KERNEL4x2_SUB
  2783. KERNEL4x2_SUB
  2784. prefetcht0 A_PR1(AO,%rax,SIZE)
  2785. KERNEL4x2_SUB
  2786. KERNEL4x2_SUB
  2787. je .L2_4_16
  2788. prefetcht0 A_PR1(AO,%rax,SIZE)
  2789. prefetcht0 B_PR1(BO,BI,SIZE)
  2790. KERNEL4x2_SUB
  2791. KERNEL4x2_SUB
  2792. prefetcht0 A_PR1(AO,%rax,SIZE)
  2793. KERNEL4x2_SUB
  2794. KERNEL4x2_SUB
  2795. prefetcht0 A_PR1(AO,%rax,SIZE)
  2796. prefetcht0 B_PR1(BO,BI,SIZE)
  2797. KERNEL4x2_SUB
  2798. KERNEL4x2_SUB
  2799. prefetcht0 A_PR1(AO,%rax,SIZE)
  2800. KERNEL4x2_SUB
  2801. KERNEL4x2_SUB
  2802. je .L2_4_16
  2803. jmp .L2_4_12
  2804. ALIGN_4
  2805. .L2_4_16:
  2806. #ifndef TRMMKERNEL
  2807. movq K, %rax
  2808. #else
  2809. movq KKK, %rax
  2810. #endif
  2811. andq $ 7, %rax # if (k & 7)
  2812. je .L2_4_19
  2813. movq %rax, BI // Index for BO
  2814. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2815. salq $ 3, %rax // rax = rax * 8 ; number of values
  2816. leaq (AO, %rax, SIZE), AO
  2817. leaq (BO, BI, SIZE), BO
  2818. negq BI
  2819. negq %rax
  2820. ALIGN_4
  2821. .L2_4_17:
  2822. KERNEL4x2_SUB
  2823. jl .L2_4_17
  2824. ALIGN_4
  2825. .L2_4_19:
  2826. SAVE4x2
  2827. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2828. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2829. movq K, %rax
  2830. subq KKK, %rax
  2831. movq %rax, BI // Index for BO
  2832. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2833. leaq (BO, BI, SIZE), BO
  2834. salq $ 3, %rax // rax = rax * 8 ; number of values
  2835. leaq (AO, %rax, SIZE), AO
  2836. #endif
  2837. #if defined(TRMMKERNEL) && defined(LEFT)
  2838. addq $ 4, KK
  2839. #endif
  2840. addq $ 8 * SIZE, CO1 # coffset += 8
  2841. ALIGN_4
  2842. /**************************************************************************
  2843. * Rest of M
  2844. ***************************************************************************/
  2845. .L2_4_20:
  2846. testq $ 2, M
  2847. jz .L2_4_40
  2848. ALIGN_4
  2849. .L2_4_21:
  2850. #if !defined(TRMMKERNEL) || \
  2851. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2852. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2853. leaq BUFFER1, BO // first buffer to BO
  2854. addq $ 8 * SIZE, BO
  2855. #else
  2856. movq KK, %rax
  2857. leaq BUFFER1, BO // first buffer to BO
  2858. addq $ 8 * SIZE, BO
  2859. movq %rax, BI // Index for BO
  2860. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  2861. leaq (BO, BI, SIZE), BO
  2862. salq $ 2, %rax // rax = rax * 4 ; number of values
  2863. leaq (AO, %rax, SIZE), AO
  2864. #endif
  2865. vzeroall
  2866. #ifndef TRMMKERNEL
  2867. movq K, %rax
  2868. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2869. movq K, %rax
  2870. subq KK, %rax
  2871. movq %rax, KKK
  2872. #else
  2873. movq KK, %rax
  2874. #ifdef LEFT
  2875. addq $ 2, %rax // number of values in AO
  2876. #else
  2877. addq $ 2, %rax // number of values in BO
  2878. #endif
  2879. movq %rax, KKK
  2880. #endif
  2881. andq $ -8, %rax // K = K - ( K % 8 )
  2882. je .L2_4_26
  2883. movq %rax, BI // Index for BO
  2884. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2885. salq $ 2, %rax // rax = rax * 4 ; number of values
  2886. leaq (AO, %rax, SIZE), AO
  2887. leaq (BO, BI, SIZE), BO
  2888. negq BI
  2889. negq %rax
  2890. ALIGN_4
  2891. .L2_4_22:
  2892. prefetcht0 A_PR1(AO,%rax,SIZE)
  2893. prefetcht0 B_PR1(BO,BI,SIZE)
  2894. KERNEL2x2_SUB
  2895. KERNEL2x2_SUB
  2896. KERNEL2x2_SUB
  2897. KERNEL2x2_SUB
  2898. prefetcht0 A_PR1(AO,%rax,SIZE)
  2899. prefetcht0 B_PR1(BO,BI,SIZE)
  2900. KERNEL2x2_SUB
  2901. KERNEL2x2_SUB
  2902. KERNEL2x2_SUB
  2903. KERNEL2x2_SUB
  2904. je .L2_4_26
  2905. prefetcht0 A_PR1(AO,%rax,SIZE)
  2906. prefetcht0 B_PR1(BO,BI,SIZE)
  2907. KERNEL2x2_SUB
  2908. KERNEL2x2_SUB
  2909. KERNEL2x2_SUB
  2910. KERNEL2x2_SUB
  2911. prefetcht0 A_PR1(AO,%rax,SIZE)
  2912. prefetcht0 B_PR1(BO,BI,SIZE)
  2913. KERNEL2x2_SUB
  2914. KERNEL2x2_SUB
  2915. KERNEL2x2_SUB
  2916. KERNEL2x2_SUB
  2917. je .L2_4_26
  2918. jmp .L2_4_22
  2919. ALIGN_4
  2920. .L2_4_26:
  2921. #ifndef TRMMKERNEL
  2922. movq K, %rax
  2923. #else
  2924. movq KKK, %rax
  2925. #endif
  2926. andq $ 7, %rax # if (k & 7)
  2927. je .L2_4_29
  2928. movq %rax, BI // Index for BO
  2929. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  2930. salq $ 2, %rax // rax = rax * 4 ; number of values
  2931. leaq (AO, %rax, SIZE), AO
  2932. leaq (BO, BI, SIZE), BO
  2933. negq BI
  2934. negq %rax
  2935. ALIGN_4
  2936. .L2_4_27:
  2937. KERNEL2x2_SUB
  2938. jl .L2_4_27
  2939. ALIGN_4
  2940. .L2_4_29:
  2941. vbroadcastss ALPHA_R, %xmm0
  2942. vbroadcastss ALPHA_I, %xmm1
  2943. // swap real and imaginary parts (32-bit lanes) within each complex value
  2944. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  2945. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  2946. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  2947. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2948. vaddsubps %xmm9, %xmm8 , %xmm8
  2949. vaddsubps %xmm11,%xmm10, %xmm10
  2950. vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
  2951. vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
  2952. #else
  2953. vaddsubps %xmm8, %xmm9 ,%xmm9
  2954. vaddsubps %xmm10, %xmm11,%xmm11
  2955. vmovaps %xmm9, %xmm8
  2956. vmovaps %xmm11, %xmm10
  2957. // swap real and imaginary parts (32-bit lanes) within each complex value
  2958. vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
  2959. vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
  2960. #endif
  2961. // multiply with ALPHA_R
  2962. vmulps %xmm8 , %xmm0, %xmm8
  2963. vmulps %xmm10, %xmm0, %xmm10
  2964. // multiply with ALPHA_I
  2965. vmulps %xmm9 , %xmm1, %xmm9
  2966. vmulps %xmm11, %xmm1, %xmm11
  2967. vaddsubps %xmm9, %xmm8 , %xmm8
  2968. vaddsubps %xmm11,%xmm10, %xmm10
  2969. #ifndef TRMMKERNEL
  2970. vaddps (CO1), %xmm8 , %xmm8
  2971. vaddps (CO1, LDC), %xmm10, %xmm10
  2972. #endif
  2973. vmovups %xmm8 , (CO1)
  2974. vmovups %xmm10 , (CO1, LDC)
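/* Same in-line 2x2 store sequence as in the first build of the kernel above:
 * the #if separates the NN/NT/TN/TT/NR/NC/TR/TC cases from the remaining
 * conjugation variants, selecting the add/subtract pattern applied before the
 * result is scaled by alpha (see the worked formula at the earlier occurrence)
 * and, when TRMMKERNEL is not defined, accumulated into C.
 */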
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
decq I # i --
jg .L2_4_21
ALIGN_4
/**************************************************************************/
.L2_4_40:
testq $ 1, M
jz .L2_4_60 // to next 2 lines of N
ALIGN_4
.L2_4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_4_46
jmp .L2_4_42
ALIGN_4
.L2_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # k & 7 ; remainder iterations
je .L2_4_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_47:
KERNEL1x2_SUB
jl .L2_4_47
ALIGN_4
.L2_4_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_4_41
ALIGN_4
.L2_4_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N
.L1_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
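// Copy loop for the last (single) column of B: each iteration moves one complex
// single-precision value (2 * SIZE bytes) from B into the BUFFER1 packing buffer,
// K times in total.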
.L1_02b:
vmovsd (BO1), %xmm0
vmovsd %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_02b
.L1_02c:
movq BO1, B // next offset of B
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 16 * SIZE, AO
movq M, I
sarq $ 3, I // i = (m >> 3)
je .L1_4_10
ALIGN_4
/**************************************************************************************************/
.L1_8_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 8, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_8_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_8_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
je .L1_8_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL8x1_SUB
je .L1_8_16
jmp .L1_8_12
ALIGN_4
.L1_8_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # k & 7 ; remainder iterations
je .L1_8_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_8_17:
KERNEL8x1_SUB
jl .L1_8_17
ALIGN_4
.L1_8_19:
SAVE8x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 4, %rax // rax = rax *16 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 8, KK
#endif
addq $ 16 * SIZE, CO1 # coffset += 16
decq I # i --
jg .L1_8_11
ALIGN_4
/**************************************************************************************************/
.L1_4_10:
testq $ 7, M
jz .L999
testq $ 4, M
jz .L1_4_20
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # k & 7 ; remainder iterations
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_4_20:
testq $ 2, M
jz .L1_4_40
ALIGN_4
.L1_4_21:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_26
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_22:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_4_26
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_4_26
jmp .L1_4_22
ALIGN_4
.L1_4_26:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # k & 7 ; remainder iterations
je .L1_4_29
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_27:
KERNEL2x1_SUB
jl .L1_4_27
ALIGN_4
.L1_4_29:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************/
.L1_4_40:
testq $ 1, M
jz .L999 // no rows left; jump to epilogue
ALIGN_4
.L1_4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_4_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_4_46
jmp .L1_4_42
ALIGN_4
.L1_4_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # k & 7 ; remainder iterations
je .L1_4_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_47:
KERNEL1x1_SUB
jl .L1_4_47
ALIGN_4
.L1_4_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
ALIGN_4
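// Common epilogue: restore the stack pointer saved in SP, reload the callee-saved
// general-purpose registers, and under WINDOWS_ABI also %rdi/%rsi and xmm6-xmm15,
// then release the stack frame and return.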
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#endif