You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_4x8_haswell.S 100 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153
  1. /*********************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used throughout the kernel.
   *
   * Several names share one physical register: OLD_M, AO and BO1 are all
   * %rdi; OLD_N and BO are both %rsi; CO1 and BO2 are both %r15.  Aliased
   * names therefore cannot hold distinct live values at the same time.
   * NOTE(review): the OLD_* names presumably denote incoming arguments that
   * are copied elsewhere before %rdi/%rsi are reused -- confirm against the
   * function prologue, which is outside this view.
   */
  29. #define OLD_M %rdi
  30. #define OLD_N %rsi
  31. #define M %r13
  32. #define J %r14
  33. #define OLD_K %rdx
  34. #define A %rcx
  35. #define B %r8
  36. #define C %r9
  37. #define LDC %r10
  38. #define I %r11
  39. #define AO %rdi
  40. #define BO %rsi
  41. #define CO1 %r15
  42. #define K %r12
  43. #define SP %rbx
  44. #define BO1 %rdi
  45. #define BO2 %r15
  46. #define BO3 %rbp
  /*
   * Frame size and scratch-buffer size, selected by WINDOWS_ABI.
   * L_BUFFER_SIZE expands to an unparenthesised expression, so take care
   * when using it inside a larger arithmetic expression.
   * The OLD_A .. OLD_OFFSET operands exist only when WINDOWS_ABI is defined
   * and address 40..72 bytes above STACKSIZE(%rsp).
   * NOTE(review): presumably these are the stack-passed arguments of the
   * Windows calling convention -- confirm against the prologue.
   */
  47. #ifndef WINDOWS_ABI
  48. #define STACKSIZE 96
  49. #define L_BUFFER_SIZE 256*8*12+4096
  50. #else
  51. #define STACKSIZE 256
  52. #define L_BUFFER_SIZE 128*8*12+512
  53. #define OLD_A 40 + STACKSIZE(%rsp)
  54. #define OLD_B 48 + STACKSIZE(%rsp)
  55. #define OLD_C 56 + STACKSIZE(%rsp)
  56. #define OLD_LDC 64 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  58. #endif
  /*
   * Fixed %rsp-relative 8-byte-spaced slots (offsets 24..72) and BUFFER1 at
   * 128(%rsp).  These are memory operands, not registers; ALPHA is read
   * with vbroadcastsd and BUFFER1 is used as a prefetch target in SAVE4x12.
   */
  59. #define Ndiv12 24(%rsp)
  60. #define Nmod12 32(%rsp)
  61. #define N 40(%rsp)
  62. #define ALPHA 48(%rsp)
  63. #define OFFSET 56(%rsp)
  64. #define KK 64(%rsp)
  65. #define KKK 72(%rsp)
  66. #define BUFFER1 128(%rsp)
  /*
   * STACK_TOUCH: when OS_WINDOWS is defined, expands to one 32-bit zero
   * store per 4096-byte step above %rsp, highest offset first.  The number
   * of stores (4, 3, 2 or 1) is chosen from L_BUFFER_SIZE; it expands to
   * nothing when L_BUFFER_SIZE <= 4096 or when OS_WINDOWS is not defined.
   * Note that this test uses OS_WINDOWS, whereas L_BUFFER_SIZE itself is
   * selected by WINDOWS_ABI.
   * NOTE(review): presumably this touches each stack page in turn before
   * the buffer is used -- confirm at the use site, which is outside this
   * view.
   */
  67. #if defined(OS_WINDOWS)
  68. #if L_BUFFER_SIZE > 16384
  69. #define STACK_TOUCH \
  70. movl $ 0, 4096 * 4(%rsp);\
  71. movl $ 0, 4096 * 3(%rsp);\
  72. movl $ 0, 4096 * 2(%rsp);\
  73. movl $ 0, 4096 * 1(%rsp);
  74. #elif L_BUFFER_SIZE > 12288
  75. #define STACK_TOUCH \
  76. movl $ 0, 4096 * 3(%rsp);\
  77. movl $ 0, 4096 * 2(%rsp);\
  78. movl $ 0, 4096 * 1(%rsp);
  79. #elif L_BUFFER_SIZE > 8192
  80. #define STACK_TOUCH \
  81. movl $ 0, 4096 * 2(%rsp);\
  82. movl $ 0, 4096 * 1(%rsp);
  83. #elif L_BUFFER_SIZE > 4096
  84. #define STACK_TOUCH \
  85. movl $ 0, 4096 * 1(%rsp);
  86. #else
  87. #define STACK_TOUCH
  88. #endif
  89. #else
  90. #define STACK_TOUCH
  91. #endif
  /*
   * A_PR1 / B_PR1: byte displacements added to AO / BO in the prefetcht0
   * instructions of the kernel macros.  B_PR1 is also tested by the
   * "#if B_PR1 > ..." guards in SAVE4x12.
   * BROADCASTKERNEL: when defined, the kernel macros load each A element
   * with vbroadcastsd; otherwise they load a full vector of A and permute
   * it with vpermilpd/vpermpd.
   */
  92. #define A_PR1 512
  93. #define B_PR1 160
  94. #define BROADCASTKERNEL
  95. /*******************************************************************************************
  96. * Macro definitions
  97. *******************************************************************************************/
  /*
   * INIT4x12: zero the twelve accumulator registers ymm4-ymm15 (each is
   * XORed with itself).  These are the registers the KERNEL4x12_* macros
   * accumulate into.  Touches no general-purpose register and no memory.
   */
  98. .macro INIT4x12
  99. vxorpd %ymm4 , %ymm4 , %ymm4
  100. vxorpd %ymm5 , %ymm5 , %ymm5
  101. vxorpd %ymm6 , %ymm6 , %ymm6
  102. vxorpd %ymm7 , %ymm7 , %ymm7
  103. vxorpd %ymm8 , %ymm8 , %ymm8
  104. vxorpd %ymm9 , %ymm9 , %ymm9
  105. vxorpd %ymm10, %ymm10, %ymm10
  106. vxorpd %ymm11, %ymm11, %ymm11
  107. vxorpd %ymm12, %ymm12, %ymm12
  108. vxorpd %ymm13, %ymm13, %ymm13
  109. vxorpd %ymm14, %ymm14, %ymm14
  110. vxorpd %ymm15, %ymm15, %ymm15
  111. .endm
  /*
   * KERNEL4x12_I: first step of the pipelined 4x12 sequence.
   *
   * Loads three B vectors from -12/-8/-4 * SIZE(BO) into ymm1-ymm3 and,
   * with BROADCASTKERNEL, four A elements from -16..-13 * SIZE(AO) one at a
   * time into ymm0.  Products are written with vmulpd (not FMA), so the
   * previous contents of ymm4-ymm15 are overwritten rather than added to.
   *
   * Result layout: A element 0 -> ymm4/ymm8/ymm12, element 1 ->
   * ymm5/ymm9/ymm13, element 2 -> ymm6/ymm10/ymm14, element 3 ->
   * ymm7/ymm11/ymm15 (paired with ymm1/ymm2/ymm3 respectively).
   *
   * Side effects: BO += 12*SIZE; AO is unchanged.  On exit ymm1-ymm3 hold
   * the B vectors at the new -12/-8/-4 * SIZE(BO), ready for the next
   * macro.  Issues prefetcht0 at A_PR1(AO) and B_PR1 .. B_PR1+256(BO).
   * SIZE is defined outside this view.
   */
  112. .macro KERNEL4x12_I
  113. prefetcht0 A_PR1(AO)
  114. vmovups -12 * SIZE(BO), %ymm1
  115. prefetcht0 B_PR1(BO)
  116. # if defined BROADCASTKERNEL
  117. vbroadcastsd -16 * SIZE(AO), %ymm0
  118. # else
  119. vmovups -16 * SIZE(AO), %ymm0
  120. # endif
  121. prefetcht0 B_PR1+64(BO)
  122. vmovups -8 * SIZE(BO), %ymm2
  123. prefetcht0 B_PR1+128(BO)
  124. vmovups -4 * SIZE(BO), %ymm3
  125. vmulpd %ymm0 ,%ymm1 , %ymm4
  126. prefetcht0 B_PR1+192(BO)
  127. vmulpd %ymm0 ,%ymm2 , %ymm8
  128. vmulpd %ymm0 ,%ymm3 , %ymm12
  129. prefetcht0 B_PR1+256(BO)
  130. # if defined BROADCASTKERNEL
  131. vbroadcastsd -15 * SIZE(AO), %ymm0
  132. # else
  133. vpermilpd $ 0x05, %ymm0 , %ymm0
  134. # endif
  135. vmulpd %ymm0 ,%ymm1 , %ymm5
  136. vmulpd %ymm0 ,%ymm2 , %ymm9
  137. vmulpd %ymm0 ,%ymm3 , %ymm13
  138. # if defined BROADCASTKERNEL
  139. vbroadcastsd -14 * SIZE(AO), %ymm0
  140. # else
  141. vpermpd $ 0x1b, %ymm0 , %ymm0
  142. # endif
  143. vmulpd %ymm0 ,%ymm1 , %ymm6
  144. vmulpd %ymm0 ,%ymm2 , %ymm10
  /* BO advances here; the ymm1-ymm3 reloads below read the next 12 B values. */
  145. addq $ 12*SIZE, BO
  146. vmulpd %ymm0 ,%ymm3 , %ymm14
  147. # if defined BROADCASTKERNEL
  148. vbroadcastsd -13 * SIZE(AO), %ymm0
  149. # else
  150. vpermilpd $ 0x05, %ymm0 , %ymm0
  151. # endif
  152. vmulpd %ymm0 ,%ymm1 , %ymm7
  153. vmovups -12 * SIZE(BO), %ymm1
  154. vmulpd %ymm0 ,%ymm2 , %ymm11
  155. vmovups -8 * SIZE(BO), %ymm2
  156. vmulpd %ymm0 ,%ymm3 , %ymm15
  157. vmovups -4 * SIZE(BO), %ymm3
  158. .endm
  /*
   * KERNEL4x12_M1: middle step of the pipelined 4x12 sequence.
   *
   * Expects ymm1-ymm3 to already hold three B vectors (they are consumed
   * before being reloaded).  Accumulates with vfmadd231pd into ymm4-ymm15
   * using the A elements at -16..-13 * SIZE(AO), with the same
   * element-to-register layout as KERNEL4x12_I.
   *
   * Side effects: neither AO nor BO is modified.  On exit ymm1-ymm3 are
   * reloaded from -12/-8/-4 * SIZE(BO).  Issues prefetcht0 at A_PR1(AO) and
   * B_PR1 .. B_PR1+128(BO).
   */
  159. .macro KERNEL4x12_M1
  160. prefetcht0 A_PR1(AO)
  161. # if defined BROADCASTKERNEL
  162. vbroadcastsd -16 * SIZE(AO), %ymm0
  163. # else
  164. vmovups -16 * SIZE(AO), %ymm0
  165. # endif
  166. prefetcht0 B_PR1(BO)
  167. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  168. prefetcht0 B_PR1+64(BO)
  169. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  170. prefetcht0 B_PR1+128(BO)
  171. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  172. # if defined BROADCASTKERNEL
  173. vbroadcastsd -15 * SIZE(AO), %ymm0
  174. # else
  175. vpermilpd $ 0x05, %ymm0 , %ymm0
  176. # endif
  177. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  178. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  179. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  180. # if defined BROADCASTKERNEL
  181. vbroadcastsd -14 * SIZE(AO), %ymm0
  182. # else
  183. vpermpd $ 0x1b, %ymm0 , %ymm0
  184. # endif
  185. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  186. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  187. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  188. # if defined BROADCASTKERNEL
  189. vbroadcastsd -13 * SIZE(AO), %ymm0
  190. # else
  191. vpermilpd $ 0x05, %ymm0 , %ymm0
  192. # endif
  /* Each B register is reloaded right after its last use in this step. */
  193. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  194. vmovups -12 * SIZE(BO), %ymm1
  195. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  196. vmovups -8 * SIZE(BO), %ymm2
  197. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  198. vmovups -4 * SIZE(BO), %ymm3
  199. .endm
  /*
   * KERNEL4x12_M2: middle step of the pipelined 4x12 sequence that also
   * advances both pointers.
   *
   * Expects ymm1-ymm3 to already hold three B vectors.  Accumulates with
   * vfmadd231pd into ymm4-ymm15 using four A elements starting at
   * -12 * SIZE(AO) (entry value of AO), with the same element-to-register
   * layout as KERNEL4x12_I.
   *
   * Side effects: AO += 8*SIZE; ymm1-ymm3 are reloaded from 0/4/8 *
   * SIZE(BO) and then BO += 24*SIZE.  No prefetches are issued.
   */
  200. .macro KERNEL4x12_M2
  201. # if defined BROADCASTKERNEL
  202. vbroadcastsd -12 * SIZE(AO), %ymm0
  203. # else
  204. vmovups -12 * SIZE(AO), %ymm0
  205. # endif
  206. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  207. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  208. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  209. # if defined BROADCASTKERNEL
  210. vbroadcastsd -11 * SIZE(AO), %ymm0
  211. # else
  212. vpermilpd $ 0x05, %ymm0 , %ymm0
  213. # endif
  214. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  215. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  216. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  217. # if defined BROADCASTKERNEL
  218. vbroadcastsd -10 * SIZE(AO), %ymm0
  219. # else
  220. vpermpd $ 0x1b, %ymm0 , %ymm0
  221. # endif
  222. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  223. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  224. addq $ 8*SIZE, AO
  225. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  /* AO was just advanced by 8*SIZE, so -17*SIZE(AO) is the element at
     -9*SIZE relative to AO on entry (the fourth element of this step). */
  226. # if defined BROADCASTKERNEL
  227. vbroadcastsd -17 * SIZE(AO), %ymm0
  228. # else
  229. vpermilpd $ 0x05, %ymm0 , %ymm0
  230. # endif
  231. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  232. vmovups 0 * SIZE(BO), %ymm1
  233. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  234. vmovups 4 * SIZE(BO), %ymm2
  235. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  236. vmovups 8 * SIZE(BO), %ymm3
  237. addq $ 24*SIZE, BO
  238. .endm
  /*
   * KERNEL4x12_E: final step of the pipelined 4x12 sequence.
   *
   * Same arithmetic as KERNEL4x12_M2 (four A elements starting at
   * -12 * SIZE(AO) on entry, FMA into ymm4-ymm15 using the B vectors
   * already in ymm1-ymm3), but it does not reload ymm1-ymm3.
   *
   * Side effects: AO += 8*SIZE; BO += 12*SIZE.  No prefetches are issued.
   */
  239. .macro KERNEL4x12_E
  240. # if defined BROADCASTKERNEL
  241. vbroadcastsd -12 * SIZE(AO), %ymm0
  242. # else
  243. vmovups -12 * SIZE(AO), %ymm0
  244. # endif
  245. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  246. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  247. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  248. # if defined BROADCASTKERNEL
  249. vbroadcastsd -11 * SIZE(AO), %ymm0
  250. # else
  251. vpermilpd $ 0x05, %ymm0 , %ymm0
  252. # endif
  253. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  254. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  255. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  256. # if defined BROADCASTKERNEL
  257. vbroadcastsd -10 * SIZE(AO), %ymm0
  258. # else
  259. vpermpd $ 0x1b, %ymm0 , %ymm0
  260. # endif
  261. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  262. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  263. addq $ 8*SIZE, AO
  264. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  /* AO was just advanced by 8*SIZE, so -17*SIZE(AO) is the element at
     -9*SIZE relative to AO on entry. */
  265. # if defined BROADCASTKERNEL
  266. vbroadcastsd -17 * SIZE(AO), %ymm0
  267. # else
  268. vpermilpd $ 0x05, %ymm0 , %ymm0
  269. # endif
  270. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  271. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  272. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  273. addq $ 12*SIZE, BO
  274. .endm
  /*
   * KERNEL4x12_SUB: one self-contained 4x12 accumulation step.
   *
   * Unlike the _M1/_M2/_E macros it does not rely on ymm1-ymm3 being
   * preloaded: it loads three B vectors from -12/-8/-4 * SIZE(BO) itself,
   * then accumulates with vfmadd231pd into ymm4-ymm15 using four A elements
   * starting at -16 * SIZE(AO) (entry value of AO).  Element-to-register
   * layout matches KERNEL4x12_I.
   *
   * Side effects: AO += 4*SIZE; BO += 12*SIZE.  No prefetches are issued.
   */
  275. .macro KERNEL4x12_SUB
  276. vmovups -12 * SIZE(BO), %ymm1
  277. # if defined BROADCASTKERNEL
  278. vbroadcastsd -16 * SIZE(AO), %ymm0
  279. # else
  280. vmovups -16 * SIZE(AO), %ymm0
  281. # endif
  282. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  283. vmovups -8 * SIZE(BO), %ymm2
  284. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  285. vmovups -4 * SIZE(BO), %ymm3
  286. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  287. # if defined BROADCASTKERNEL
  288. vbroadcastsd -15 * SIZE(AO), %ymm0
  289. # else
  290. vpermilpd $ 0x05, %ymm0 , %ymm0
  291. # endif
  292. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  293. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  /* All B loads for this step are done, so BO can advance here. */
  294. addq $ 12*SIZE, BO
  295. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  296. # if defined BROADCASTKERNEL
  297. vbroadcastsd -14 * SIZE(AO), %ymm0
  298. # else
  299. vpermpd $ 0x1b, %ymm0 , %ymm0
  300. # endif
  301. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  302. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  303. addq $ 4*SIZE, AO
  304. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  /* AO was just advanced by 4*SIZE, so -17*SIZE(AO) is the element at
     -13*SIZE relative to AO on entry (the fourth element of this step). */
  305. # if defined BROADCASTKERNEL
  306. vbroadcastsd -17 * SIZE(AO), %ymm0
  307. # else
  308. vpermilpd $ 0x05, %ymm0 , %ymm0
  309. # endif
  310. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  311. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  312. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  313. .endm
  /*
   * SAVE4x12: write back a tile held in accumulators ymm4..ymm15
   * (three groups of four registers).
   *
   *   1. Every accumulator is multiplied by ALPHA (broadcast into ymm0).
   *   2. Each group of four is reshuffled into ymm4..ymm7:
   *        BROADCASTKERNEL : vperm2f128 + vunpck{l,h}pd, i.e. a 4x4 transpose;
   *        otherwise       : vpermilpd / vblendpd / vperm2f128 regrouping of
   *                          accumulators built from the permuted-A kernels.
   *   3. Unless TRMMKERNEL is defined, the current memory contents are added.
   *   4. The four vectors go to base, base+LDC, base+2*LDC, base+3*LDC, where
   *      base is CO1, CO1+4*LDC and CO1+8*LDC for the three groups.
   *
   * prefetcht0 requests on BUFFER1 (offsets gated by B_PR1) are interleaved
   * with the arithmetic; prefetcht1 is issued 56 bytes past each store target.
   * Uses ymm0..ymm3, %rax and %rbp as scratch; ends with CO1 += 4*SIZE.
   */
  314. .macro SAVE4x12
  315. prefetcht0 BUFFER1
  316. vbroadcastsd ALPHA, %ymm0
  317. vmulpd %ymm0 , %ymm4 , %ymm4
  318. vmulpd %ymm0 , %ymm5 , %ymm5
  319. vmulpd %ymm0 , %ymm6 , %ymm6
  320. vmulpd %ymm0 , %ymm7 , %ymm7
  321. prefetcht0 64 + BUFFER1
  322. vmulpd %ymm0 , %ymm8 , %ymm8
  323. vmulpd %ymm0 , %ymm9 , %ymm9
  324. vmulpd %ymm0 , %ymm10, %ymm10
  325. vmulpd %ymm0 , %ymm11, %ymm11
  326. #if B_PR1 > 32
  327. prefetcht0 128 + BUFFER1
  328. #endif
  329. vmulpd %ymm0 , %ymm12, %ymm12
  330. vmulpd %ymm0 , %ymm13, %ymm13
  331. vmulpd %ymm0 , %ymm14, %ymm14
  332. vmulpd %ymm0 , %ymm15, %ymm15
  333. #if B_PR1 > 96
  334. prefetcht0 192 + BUFFER1
  335. #endif
  /* Group 1 (ymm4..ymm7): the shuffle is split into stages so the optional
     BUFFER1 prefetches can be slotted in between them. */
  336. #if defined BROADCASTKERNEL
  337. vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
  338. vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
  339. vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
  340. vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
  341. #else
  342. vpermilpd $ 0x05 , %ymm5, %ymm5
  343. vpermilpd $ 0x05 , %ymm7, %ymm7
  344. #endif
  345. #if B_PR1 > 160
  346. prefetcht0 256 + BUFFER1
  347. #endif
  348. #if defined BROADCASTKERNEL
  349. vunpcklpd %ymm1, %ymm0, %ymm4
  350. vunpckhpd %ymm1, %ymm0, %ymm5
  351. vunpcklpd %ymm3, %ymm2, %ymm6
  352. vunpckhpd %ymm3, %ymm2, %ymm7
  353. #else
  354. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  355. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  356. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  357. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  358. #endif
  359. #if B_PR1 > 224
  360. prefetcht0 320 + BUFFER1
  361. #endif
  362. #ifndef BROADCASTKERNEL
  363. vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
  364. vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
  365. #endif
  366. #if B_PR1 > 288
  367. prefetcht0 384 + BUFFER1
  368. #endif
  369. #ifndef BROADCASTKERNEL
  370. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  371. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  372. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  373. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  374. #endif
  375. #if B_PR1 > 352
  376. prefetcht0 448 + BUFFER1
  377. #endif
  /* %rax = CO1 + 2*LDC, so the four targets are CO1 + {0,1,2,3}*LDC. */
  378. leaq (CO1, LDC, 2), %rax
  379. #if B_PR1 > 416
  380. prefetcht0 512 + BUFFER1
  381. #endif
  382. #if !defined(TRMMKERNEL)
  383. vaddpd (CO1), %ymm4, %ymm4
  384. vaddpd (CO1, LDC), %ymm5, %ymm5
  385. vaddpd (%rax), %ymm6, %ymm6
  386. vaddpd (%rax, LDC), %ymm7, %ymm7
  387. #endif
  388. vmovups %ymm4 , (CO1)
  389. vmovups %ymm5 , (CO1, LDC)
  390. vmovups %ymm6 , (%rax)
  391. vmovups %ymm7 , (%rax, LDC)
  392. prefetcht1 56(CO1)
  393. prefetcht1 56(CO1,LDC)
  394. prefetcht1 56(%rax)
  395. prefetcht1 56(%rax,LDC)
  /* Group 2 (ymm8..ymm11): same shuffle, result again in ymm4..ymm7. */
  396. #if defined BROADCASTKERNEL
  397. vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0
  398. vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1
  399. vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2
  400. vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3
  401. vunpcklpd %ymm1, %ymm0, %ymm4
  402. vunpckhpd %ymm1, %ymm0, %ymm5
  403. vunpcklpd %ymm3, %ymm2, %ymm6
  404. vunpckhpd %ymm3, %ymm2, %ymm7
  405. #else
  406. vpermilpd $ 0x05 , %ymm9, %ymm9
  407. vpermilpd $ 0x05 , %ymm11, %ymm11
  408. vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0
  409. vblendpd $ 0x05, %ymm9, %ymm8, %ymm1
  410. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
  411. vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
  412. vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
  413. vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
  414. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  415. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  416. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  417. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  418. #endif
  /* %rax = CO1 + 4*LDC, %rbp = CO1 + 6*LDC. */
  419. leaq (%rax, LDC, 2), %rax
  420. leaq (%rax, LDC, 2), %rbp
  421. #if !defined(TRMMKERNEL)
  422. vaddpd (%rax), %ymm4, %ymm4
  423. vaddpd (%rax, LDC), %ymm5, %ymm5
  424. vaddpd (%rbp), %ymm6, %ymm6
  425. vaddpd (%rbp, LDC), %ymm7, %ymm7
  426. #endif
  427. vmovups %ymm4 , (%rax)
  428. vmovups %ymm5 , (%rax, LDC)
  429. vmovups %ymm6 , (%rbp)
  430. vmovups %ymm7 , (%rbp, LDC)
  431. prefetcht1 56(%rax)
  432. prefetcht1 56(%rax,LDC)
  433. prefetcht1 56(%rbp)
  434. prefetcht1 56(%rbp,LDC)
  /* Group 3 (ymm12..ymm15): same shuffle, result again in ymm4..ymm7. */
  435. #if defined BROADCASTKERNEL
  436. vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0
  437. vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1
  438. vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2
  439. vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3
  440. vunpcklpd %ymm1, %ymm0, %ymm4
  441. vunpckhpd %ymm1, %ymm0, %ymm5
  442. vunpcklpd %ymm3, %ymm2, %ymm6
  443. vunpckhpd %ymm3, %ymm2, %ymm7
  444. #else
  445. vpermilpd $ 0x05 , %ymm13, %ymm13
  446. vpermilpd $ 0x05 , %ymm15, %ymm15
  447. vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0
  448. vblendpd $ 0x05, %ymm13, %ymm12, %ymm1
  449. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2
  450. vblendpd $ 0x05, %ymm15, %ymm14, %ymm3
  451. vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
  452. vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
  453. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  454. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  455. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  456. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  457. #endif
  /* %rax = CO1 + 8*LDC, %rbp = CO1 + 10*LDC. */
  458. leaq (%rax, LDC, 4), %rax
  459. leaq (%rbp, LDC, 4), %rbp
  460. #if !defined(TRMMKERNEL)
  461. vaddpd (%rax), %ymm4, %ymm4
  462. vaddpd (%rax, LDC), %ymm5, %ymm5
  463. vaddpd (%rbp), %ymm6, %ymm6
  464. vaddpd (%rbp, LDC), %ymm7, %ymm7
  465. #endif
  466. vmovups %ymm4 , (%rax)
  467. vmovups %ymm5 , (%rax, LDC)
  468. vmovups %ymm6 , (%rbp)
  469. vmovups %ymm7 , (%rbp, LDC)
  470. prefetcht1 56(%rax)
  471. prefetcht1 56(%rax,LDC)
  472. prefetcht1 56(%rbp)
  473. prefetcht1 56(%rbp,LDC)
  474. addq $ 4*SIZE, CO1
  475. .endm
  476. /******************************************************************************************/
  /*
   * INIT2x12: clear the twelve accumulators xmm4..xmm15 (xor with self)
   * before a run of KERNEL2x12_SUB steps.
   */
  477. .macro INIT2x12
  478. vxorpd %xmm4 , %xmm4 , %xmm4
  479. vxorpd %xmm5 , %xmm5 , %xmm5
  480. vxorpd %xmm6 , %xmm6 , %xmm6
  481. vxorpd %xmm7 , %xmm7 , %xmm7
  482. vxorpd %xmm8 , %xmm8 , %xmm8
  483. vxorpd %xmm9 , %xmm9 , %xmm9
  484. vxorpd %xmm10, %xmm10, %xmm10
  485. vxorpd %xmm11, %xmm11, %xmm11
  486. vxorpd %xmm12, %xmm12, %xmm12
  487. vxorpd %xmm13, %xmm13, %xmm13
  488. vxorpd %xmm14, %xmm14, %xmm14
  489. vxorpd %xmm15, %xmm15, %xmm15
  490. .endm
  /*
   * KERNEL2x12_SUB: one accumulation step for a 2-row by 12-column tile.
   *
   * Loads one xmm (two packed doubles) from -16*SIZE(AO) into xmm0, then for
   * each of the twelve values at -12*SIZE(BO) .. -1*SIZE(BO) duplicates the
   * value into both lanes (vmovddup) and accumulates
   *     xmm(4+j) += xmm0 * b[j]        j = 0..11
   * The duplicated values rotate through xmm1..xmm3 so each load is issued
   * ahead of the FMA that consumes it.
   * Ends with BO += 12*SIZE and AO += 2*SIZE; the pointer bumps are placed
   * between the last FMAs, which only use already-loaded registers.
   */
  491. .macro KERNEL2x12_SUB
  492. vmovups -16 * SIZE(AO), %xmm0
  493. vmovddup -12 * SIZE(BO), %xmm1
  494. vmovddup -11 * SIZE(BO), %xmm2
  495. vmovddup -10 * SIZE(BO), %xmm3
  496. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  497. vmovddup -9 * SIZE(BO), %xmm1
  498. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  499. vmovddup -8 * SIZE(BO), %xmm2
  500. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  501. vmovddup -7 * SIZE(BO), %xmm3
  502. vfmadd231pd %xmm0 ,%xmm1 , %xmm7
  503. vmovddup -6 * SIZE(BO), %xmm1
  504. vfmadd231pd %xmm0 ,%xmm2 , %xmm8
  505. vmovddup -5 * SIZE(BO), %xmm2
  506. vfmadd231pd %xmm0 ,%xmm3 , %xmm9
  507. vmovddup -4 * SIZE(BO), %xmm3
  508. vfmadd231pd %xmm0 ,%xmm1 , %xmm10
  509. vmovddup -3 * SIZE(BO), %xmm1
  510. vfmadd231pd %xmm0 ,%xmm2 , %xmm11
  511. vmovddup -2 * SIZE(BO), %xmm2
  512. vfmadd231pd %xmm0 ,%xmm3 , %xmm12
  513. vmovddup -1 * SIZE(BO), %xmm3
  514. vfmadd231pd %xmm0 ,%xmm1 , %xmm13
  515. addq $ 12*SIZE, BO
  516. vfmadd231pd %xmm0 ,%xmm2 , %xmm14
  517. addq $ 2*SIZE, AO
  518. vfmadd231pd %xmm0 ,%xmm3 , %xmm15
  519. .endm
  /*
   * SAVE2x12: write back a 2-row by 12-column tile held in xmm4..xmm15
   * (one xmm, i.e. two packed doubles, per column).
   *
   *   1. Every accumulator is multiplied by ALPHA (duplicated into xmm0).
   *   2. Unless TRMMKERNEL is defined, the current memory contents of each
   *      column are added to the accumulator.
   *   3. Column j is stored at CO1 + j*LDC (addresses formed in %rax / %rbp).
   *
   * Each accumulator is updated and stored in place.  The previous version
   * routed columns 4..11 through xmm4..xmm7 using the vaddpd destination as
   * the only move; with TRMMKERNEL defined those vaddpd lines are compiled
   * out, so columns 4..11 were overwritten with stale copies of columns 0..3.
   *
   * Uses xmm0, %rax and %rbp as scratch; ends with CO1 += 2*SIZE.
   */
  520. .macro SAVE2x12
  521. vmovddup ALPHA, %xmm0
  522. vmulpd %xmm0 , %xmm4 , %xmm4
  523. vmulpd %xmm0 , %xmm5 , %xmm5
  524. vmulpd %xmm0 , %xmm6 , %xmm6
  525. vmulpd %xmm0 , %xmm7 , %xmm7
  526. vmulpd %xmm0 , %xmm8 , %xmm8
  527. vmulpd %xmm0 , %xmm9 , %xmm9
  528. vmulpd %xmm0 , %xmm10, %xmm10
  529. vmulpd %xmm0 , %xmm11, %xmm11
  530. vmulpd %xmm0 , %xmm12, %xmm12
  531. vmulpd %xmm0 , %xmm13, %xmm13
  532. vmulpd %xmm0 , %xmm14, %xmm14
  533. vmulpd %xmm0 , %xmm15, %xmm15
  /* Columns 0..3: %rax = CO1 + 2*LDC. */
  534. leaq (CO1, LDC, 2), %rax
  535. #if !defined(TRMMKERNEL)
  536. vaddpd (CO1), %xmm4, %xmm4
  537. vaddpd (CO1, LDC), %xmm5, %xmm5
  538. vaddpd (%rax), %xmm6, %xmm6
  539. vaddpd (%rax, LDC), %xmm7, %xmm7
  540. #endif
  541. vmovups %xmm4 , (CO1)
  542. vmovups %xmm5 , (CO1, LDC)
  543. vmovups %xmm6 , (%rax)
  544. vmovups %xmm7 , (%rax, LDC)
  /* Columns 4..7: %rax = CO1 + 4*LDC, %rbp = CO1 + 6*LDC. */
  545. leaq (%rax, LDC, 2), %rax
  546. leaq (%rax, LDC, 2), %rbp
  547. #if !defined(TRMMKERNEL)
  548. vaddpd (%rax), %xmm8 , %xmm8
  549. vaddpd (%rax, LDC), %xmm9 , %xmm9
  550. vaddpd (%rbp), %xmm10, %xmm10
  551. vaddpd (%rbp, LDC), %xmm11, %xmm11
  552. #endif
  553. vmovups %xmm8 , (%rax)
  554. vmovups %xmm9 , (%rax, LDC)
  555. vmovups %xmm10, (%rbp)
  556. vmovups %xmm11, (%rbp, LDC)
  /* Columns 8..11: %rax = CO1 + 8*LDC, %rbp = CO1 + 10*LDC. */
  557. leaq (%rax, LDC, 4), %rax
  558. leaq (%rbp, LDC, 4), %rbp
  559. #if !defined(TRMMKERNEL)
  560. vaddpd (%rax), %xmm12, %xmm12
  561. vaddpd (%rax, LDC), %xmm13, %xmm13
  562. vaddpd (%rbp), %xmm14, %xmm14
  563. vaddpd (%rbp, LDC), %xmm15, %xmm15
  564. #endif
  565. vmovups %xmm12, (%rax)
  566. vmovups %xmm13, (%rax, LDC)
  567. vmovups %xmm14, (%rbp)
  568. vmovups %xmm15, (%rbp, LDC)
  569. addq $ 2*SIZE, CO1
  570. .endm
  571. /******************************************************************************************/
  /*
   * INIT1x12: clear the twelve accumulators xmm4..xmm15 (xor with self)
   * before a run of KERNEL1x12_SUB steps.
   */
  572. .macro INIT1x12
  573. vxorpd %xmm4 , %xmm4 , %xmm4
  574. vxorpd %xmm5 , %xmm5 , %xmm5
  575. vxorpd %xmm6 , %xmm6 , %xmm6
  576. vxorpd %xmm7 , %xmm7 , %xmm7
  577. vxorpd %xmm8 , %xmm8 , %xmm8
  578. vxorpd %xmm9 , %xmm9 , %xmm9
  579. vxorpd %xmm10, %xmm10, %xmm10
  580. vxorpd %xmm11, %xmm11, %xmm11
  581. vxorpd %xmm12, %xmm12, %xmm12
  582. vxorpd %xmm13, %xmm13, %xmm13
  583. vxorpd %xmm14, %xmm14, %xmm14
  584. vxorpd %xmm15, %xmm15, %xmm15
  585. .endm
  /*
   * KERNEL1x12_SUB: one accumulation step for a 1-row by 12-column tile,
   * using scalar-double instructions.
   *
   * Loads the scalar at -16*SIZE(AO) into xmm0 and, for each of the twelve
   * scalars at -12*SIZE(BO) .. -1*SIZE(BO), accumulates
   *     xmm(4+j) += xmm0 * b[j]        j = 0..11   (low lane only)
   * The B scalars rotate through xmm1..xmm3 so each load is issued ahead of
   * the FMA that consumes it.
   * Ends with BO += 12*SIZE and AO += 1*SIZE.
   */
  586. .macro KERNEL1x12_SUB
  587. vmovsd -16 * SIZE(AO), %xmm0
  588. vmovsd -12 * SIZE(BO), %xmm1
  589. vmovsd -11 * SIZE(BO), %xmm2
  590. vmovsd -10 * SIZE(BO), %xmm3
  591. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  592. vmovsd -9 * SIZE(BO), %xmm1
  593. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  594. vmovsd -8 * SIZE(BO), %xmm2
  595. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  596. vmovsd -7 * SIZE(BO), %xmm3
  597. vfmadd231sd %xmm0 ,%xmm1 , %xmm7
  598. vmovsd -6 * SIZE(BO), %xmm1
  599. vfmadd231sd %xmm0 ,%xmm2 , %xmm8
  600. vmovsd -5 * SIZE(BO), %xmm2
  601. vfmadd231sd %xmm0 ,%xmm3 , %xmm9
  602. vmovsd -4 * SIZE(BO), %xmm3
  603. vfmadd231sd %xmm0 ,%xmm1 , %xmm10
  604. vmovsd -3 * SIZE(BO), %xmm1
  605. vfmadd231sd %xmm0 ,%xmm2 , %xmm11
  606. vmovsd -2 * SIZE(BO), %xmm2
  607. vfmadd231sd %xmm0 ,%xmm3 , %xmm12
  608. vmovsd -1 * SIZE(BO), %xmm3
  609. vfmadd231sd %xmm0 ,%xmm1 , %xmm13
  610. addq $ 12*SIZE, BO
  611. vfmadd231sd %xmm0 ,%xmm2 , %xmm14
  612. addq $ 1*SIZE, AO
  613. vfmadd231sd %xmm0 ,%xmm3 , %xmm15
  614. .endm
  /*
   * SAVE1x12: write back a 1-row by 12-column tile held in the low lanes of
   * xmm4..xmm15 (one scalar double per column).
   *
   *   1. Every accumulator is multiplied by ALPHA (loaded into xmm0).
   *   2. Unless TRMMKERNEL is defined, the current memory value of each
   *      column is added to the accumulator.
   *   3. Column j is stored at CO1 + j*LDC (addresses formed in %rax / %rbp).
   *
   * Each accumulator is updated and stored in place.  The previous version
   * routed columns 4..11 through xmm4..xmm7 using the vaddsd destination as
   * the only move; with TRMMKERNEL defined those vaddsd lines are compiled
   * out, so columns 4..11 were overwritten with stale copies of columns 0..3.
   *
   * Uses xmm0, %rax and %rbp as scratch; ends with CO1 += 1*SIZE.
   */
  615. .macro SAVE1x12
  616. vmovsd ALPHA, %xmm0
  617. vmulsd %xmm0 , %xmm4 , %xmm4
  618. vmulsd %xmm0 , %xmm5 , %xmm5
  619. vmulsd %xmm0 , %xmm6 , %xmm6
  620. vmulsd %xmm0 , %xmm7 , %xmm7
  621. vmulsd %xmm0 , %xmm8 , %xmm8
  622. vmulsd %xmm0 , %xmm9 , %xmm9
  623. vmulsd %xmm0 , %xmm10, %xmm10
  624. vmulsd %xmm0 , %xmm11, %xmm11
  625. vmulsd %xmm0 , %xmm12, %xmm12
  626. vmulsd %xmm0 , %xmm13, %xmm13
  627. vmulsd %xmm0 , %xmm14, %xmm14
  628. vmulsd %xmm0 , %xmm15, %xmm15
  /* Columns 0..3: %rax = CO1 + 2*LDC. */
  629. leaq (CO1, LDC, 2), %rax
  630. #if !defined(TRMMKERNEL)
  631. vaddsd (CO1), %xmm4, %xmm4
  632. vaddsd (CO1, LDC), %xmm5, %xmm5
  633. vaddsd (%rax), %xmm6, %xmm6
  634. vaddsd (%rax, LDC), %xmm7, %xmm7
  635. #endif
  636. vmovsd %xmm4 , (CO1)
  637. vmovsd %xmm5 , (CO1, LDC)
  638. vmovsd %xmm6 , (%rax)
  639. vmovsd %xmm7 , (%rax, LDC)
  /* Columns 4..7: %rax = CO1 + 4*LDC, %rbp = CO1 + 6*LDC. */
  640. leaq (%rax, LDC, 2), %rax
  641. leaq (%rax, LDC, 2), %rbp
  642. #if !defined(TRMMKERNEL)
  643. vaddsd (%rax), %xmm8 , %xmm8
  644. vaddsd (%rax, LDC), %xmm9 , %xmm9
  645. vaddsd (%rbp), %xmm10, %xmm10
  646. vaddsd (%rbp, LDC), %xmm11, %xmm11
  647. #endif
  648. vmovsd %xmm8 , (%rax)
  649. vmovsd %xmm9 , (%rax, LDC)
  650. vmovsd %xmm10, (%rbp)
  651. vmovsd %xmm11, (%rbp, LDC)
  /* Columns 8..11: %rax = CO1 + 8*LDC, %rbp = CO1 + 10*LDC. */
  652. leaq (%rax, LDC, 4), %rax
  653. leaq (%rbp, LDC, 4), %rbp
  654. #if !defined(TRMMKERNEL)
  655. vaddsd (%rax), %xmm12, %xmm12
  656. vaddsd (%rax, LDC), %xmm13, %xmm13
  657. vaddsd (%rbp), %xmm14, %xmm14
  658. vaddsd (%rbp, LDC), %xmm15, %xmm15
  659. #endif
  660. vmovsd %xmm12, (%rax)
  661. vmovsd %xmm13, (%rax, LDC)
  662. vmovsd %xmm14, (%rbp)
  663. vmovsd %xmm15, (%rbp, LDC)
  664. addq $ 1*SIZE, CO1
  665. .endm
  666. /******************************************************************************************/
  /*
   * INIT4x8: clear the eight accumulators ymm4..ymm11 (xor with self)
   * before a run of KERNEL4x8_SUB steps.
   */
  667. .macro INIT4x8
  668. vxorpd %ymm4 , %ymm4 , %ymm4
  669. vxorpd %ymm5 , %ymm5 , %ymm5
  670. vxorpd %ymm6 , %ymm6 , %ymm6
  671. vxorpd %ymm7 , %ymm7 , %ymm7
  672. vxorpd %ymm8 , %ymm8 , %ymm8
  673. vxorpd %ymm9 , %ymm9 , %ymm9
  674. vxorpd %ymm10, %ymm10, %ymm10
  675. vxorpd %ymm11, %ymm11, %ymm11
  676. .endm
  /*
   * KERNEL4x8_I: opening step of the software-pipelined 4x8 loop.
   *
   * Uses vmulpd instead of FMA, so ymm4..ymm11 are overwritten rather than
   * accumulated into (their previous contents are not read).
   *
   * B: two vectors from -12*SIZE(BO) and -8*SIZE(BO) into ymm1 / ymm2.
   * A: BROADCASTKERNEL broadcasts the four elements at -16..-13*SIZE(AO)
   *    one at a time; otherwise one 4-element vector is loaded and then
   *    permuted in place (0x05 swaps the pair in each 128-bit half,
   *    0x1b reverses all four elements).
   *
   * BO is advanced by 8*SIZE and ymm1 / ymm2 are reloaded from the new
   * -12 / -8 offsets for the macro that follows.  AO is not advanced here.
   */
  677. .macro KERNEL4x8_I
  678. vmovups -12 * SIZE(BO), %ymm1
  679. #if defined BROADCASTKERNEL
  680. vbroadcastsd -16 * SIZE(AO), %ymm0
  681. #else
  682. vmovups -16 * SIZE(AO), %ymm0
  683. #endif
  684. vmovups -8 * SIZE(BO), %ymm2
  685. vmulpd %ymm0 ,%ymm1 , %ymm4
  686. vmulpd %ymm0 ,%ymm2 , %ymm8
  687. #if defined BROADCASTKERNEL
  688. vbroadcastsd -15 * SIZE(AO), %ymm0
  689. #else
  690. vpermilpd $ 0x05, %ymm0 , %ymm0
  691. #endif
  692. vmulpd %ymm0 ,%ymm1 , %ymm5
  693. vmulpd %ymm0 ,%ymm2 , %ymm9
  694. #if defined BROADCASTKERNEL
  695. vbroadcastsd -14 * SIZE(AO), %ymm0
  696. #else
  697. vpermpd $ 0x1b, %ymm0 , %ymm0
  698. #endif
  699. vmulpd %ymm0 ,%ymm1 , %ymm6
  700. vmulpd %ymm0 ,%ymm2 , %ymm10
  701. addq $ 8*SIZE, BO
  702. #if defined BROADCASTKERNEL
  703. vbroadcastsd -13 * SIZE(AO), %ymm0
  704. #else
  705. vpermilpd $ 0x05, %ymm0 , %ymm0
  706. #endif
  707. vmulpd %ymm0 ,%ymm1 , %ymm7
  708. vmovups -12 * SIZE(BO), %ymm1
  709. vmulpd %ymm0 ,%ymm2 , %ymm11
  710. vmovups -8 * SIZE(BO), %ymm2
  711. .endm
  /*
   * KERNEL4x8_M1: middle step of the pipelined 4x8 loop that reads A at
   * -16..-13*SIZE(AO).
   *
   * Expects ymm1 / ymm2 to already hold the B vectors for this step (they
   * are read before being written).  Accumulates into ymm4..ymm11 with FMA,
   * then reloads ymm1 / ymm2 from -12*SIZE(BO) / -8*SIZE(BO) for the next
   * step.  Neither AO nor BO is advanced.
   * Issues prefetcht0 at A_PR1(AO), B_PR1(BO) and B_PR1+64(BO).
   */
  712. .macro KERNEL4x8_M1
  713. prefetcht0 A_PR1(AO)
  714. #if defined BROADCASTKERNEL
  715. vbroadcastsd -16 * SIZE(AO), %ymm0
  716. #else
  717. vmovups -16 * SIZE(AO), %ymm0
  718. #endif
  719. prefetcht0 B_PR1(BO)
  720. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  721. prefetcht0 B_PR1+64(BO)
  722. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  723. #if defined BROADCASTKERNEL
  724. vbroadcastsd -15 * SIZE(AO), %ymm0
  725. #else
  726. vpermilpd $ 0x05, %ymm0 , %ymm0
  727. #endif
  728. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  729. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  730. #if defined BROADCASTKERNEL
  731. vbroadcastsd -14 * SIZE(AO), %ymm0
  732. #else
  733. vpermpd $ 0x1b, %ymm0 , %ymm0
  734. #endif
  735. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  736. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  737. #if defined BROADCASTKERNEL
  738. vbroadcastsd -13 * SIZE(AO), %ymm0
  739. #else
  740. vpermilpd $ 0x05, %ymm0 , %ymm0
  741. #endif
  742. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  743. vmovups -12 * SIZE(BO), %ymm1
  744. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  745. vmovups -8 * SIZE(BO), %ymm2
  746. .endm
  /*
   * KERNEL4x8_M2: middle step of the pipelined 4x8 loop that reads A at
   * -12..-9*SIZE(AO).
   *
   * Expects ymm1 / ymm2 to already hold the B vectors for this step.
   * AO is advanced by 8*SIZE before the last A element is fetched, so in
   * the BROADCASTKERNEL path -17*SIZE(AO) addresses what was -9*SIZE(AO).
   * After the FMAs, ymm1 / ymm2 are reloaded from -4*SIZE(BO) / 0*SIZE(BO)
   * and BO is advanced by 16*SIZE.
   */
  747. .macro KERNEL4x8_M2
  748. #if defined BROADCASTKERNEL
  749. vbroadcastsd -12 * SIZE(AO), %ymm0
  750. #else
  751. vmovups -12 * SIZE(AO), %ymm0
  752. #endif
  753. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  754. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  755. #if defined BROADCASTKERNEL
  756. vbroadcastsd -11 * SIZE(AO), %ymm0
  757. #else
  758. vpermilpd $ 0x05, %ymm0 , %ymm0
  759. #endif
  760. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  761. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  762. #if defined BROADCASTKERNEL
  763. vbroadcastsd -10 * SIZE(AO), %ymm0
  764. #else
  765. vpermpd $ 0x1b, %ymm0 , %ymm0
  766. #endif
  767. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  768. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  769. addq $ 8*SIZE, AO
  770. #if defined BROADCASTKERNEL
  771. vbroadcastsd -17 * SIZE(AO), %ymm0
  772. #else
  773. vpermilpd $ 0x05, %ymm0 , %ymm0
  774. #endif
  775. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  776. vmovups -4 * SIZE(BO), %ymm1
  777. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  778. vmovups 0 * SIZE(BO), %ymm2
  779. addq $ 16*SIZE, BO
  780. .endm
  /*
   * KERNEL4x8_E: closing step of the pipelined 4x8 loop.
   *
   * Same arithmetic as KERNEL4x8_M2 (A at -12..-9*SIZE(AO), B taken from
   * the already-loaded ymm1 / ymm2) but nothing is reloaded afterwards.
   * AO is advanced by 8*SIZE before the last A element is fetched (hence
   * the -17*SIZE(AO) offset in the BROADCASTKERNEL path); BO is advanced by
   * 8*SIZE at the end.
   */
  781. .macro KERNEL4x8_E
  782. #if defined BROADCASTKERNEL
  783. vbroadcastsd -12 * SIZE(AO), %ymm0
  784. #else
  785. vmovups -12 * SIZE(AO), %ymm0
  786. #endif
  787. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  788. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  789. #if defined BROADCASTKERNEL
  790. vbroadcastsd -11 * SIZE(AO), %ymm0
  791. #else
  792. vpermilpd $ 0x05, %ymm0 , %ymm0
  793. #endif
  794. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  795. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  796. #if defined BROADCASTKERNEL
  797. vbroadcastsd -10 * SIZE(AO), %ymm0
  798. #else
  799. vpermpd $ 0x1b, %ymm0 , %ymm0
  800. #endif
  801. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  802. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  803. addq $ 8*SIZE, AO
  804. #if defined BROADCASTKERNEL
  805. vbroadcastsd -17 * SIZE(AO), %ymm0
  806. #else
  807. vpermilpd $ 0x05, %ymm0 , %ymm0
  808. #endif
  809. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  810. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  811. addq $ 8*SIZE, BO
  812. .endm
  /*
   * KERNEL4x8_SUB: self-contained single accumulation step for the 4x8
   * tile (no dependence on preloaded registers).
   *
   * Loads B from -12*SIZE(BO) / -8*SIZE(BO) into ymm1 / ymm2 and A from
   * -16..-13*SIZE(AO) (broadcast per element, or one vector load followed
   * by in-register permutes), accumulating into ymm4..ymm11 with FMA.
   * BO is advanced by 8*SIZE and AO by 4*SIZE; the AO bump happens before
   * the last A fetch, hence the -17*SIZE(AO) offset in the BROADCASTKERNEL
   * path.
   */
  813. .macro KERNEL4x8_SUB
  814. vmovups -12 * SIZE(BO), %ymm1
  815. #if defined BROADCASTKERNEL
  816. vbroadcastsd -16 * SIZE(AO), %ymm0
  817. #else
  818. vmovups -16 * SIZE(AO), %ymm0
  819. #endif
  820. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  821. vmovups -8 * SIZE(BO), %ymm2
  822. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  823. #if defined BROADCASTKERNEL
  824. vbroadcastsd -15 * SIZE(AO), %ymm0
  825. #else
  826. vpermilpd $ 0x05, %ymm0 , %ymm0
  827. #endif
  828. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  829. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  830. addq $ 8*SIZE, BO
  831. #if defined BROADCASTKERNEL
  832. vbroadcastsd -14 * SIZE(AO), %ymm0
  833. #else
  834. vpermpd $ 0x1b, %ymm0 , %ymm0
  835. #endif
  836. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  837. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  838. addq $ 4*SIZE, AO
  839. #if defined BROADCASTKERNEL
  840. vbroadcastsd -17 * SIZE(AO), %ymm0
  841. #else
  842. vpermilpd $ 0x05, %ymm0 , %ymm0
  843. #endif
  844. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  845. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  846. .endm
  /*
   * SAVE4x8: write back a tile held in accumulators ymm4..ymm11
   * (two groups of four registers).
   *
   *   1. Every accumulator is multiplied by ALPHA (broadcast into ymm0).
   *   2. Each group of four is reshuffled into ymm4..ymm7:
   *        BROADCASTKERNEL : vperm2f128 + vunpck{l,h}pd, i.e. a 4x4 transpose;
   *        otherwise       : vpermilpd / vblendpd / vperm2f128 regrouping.
   *      Because the shuffle result always lands in ymm4..ymm7 outside the
   *      TRMMKERNEL guard, the stores are correct in both build modes.
   *   3. Unless TRMMKERNEL is defined, the current memory contents are added.
   *   4. The vectors are stored at CO1 + {0..3}*LDC and CO1 + {4..7}*LDC,
   *      each store group followed by prefetcht0 56 bytes past the targets.
   *
   * Uses ymm0..ymm3, %rax and %rbp as scratch; ends with CO1 += 4*SIZE.
   */
  847. .macro SAVE4x8
  848. vbroadcastsd ALPHA, %ymm0
  849. vmulpd %ymm0 , %ymm4 , %ymm4
  850. vmulpd %ymm0 , %ymm5 , %ymm5
  851. vmulpd %ymm0 , %ymm6 , %ymm6
  852. vmulpd %ymm0 , %ymm7 , %ymm7
  853. vmulpd %ymm0 , %ymm8 , %ymm8
  854. vmulpd %ymm0 , %ymm9 , %ymm9
  855. vmulpd %ymm0 , %ymm10, %ymm10
  856. vmulpd %ymm0 , %ymm11, %ymm11
  /* Group 1: ymm4..ymm7. */
  857. #if defined BROADCASTKERNEL
  858. vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
  859. vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
  860. vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
  861. vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
  862. vunpcklpd %ymm1, %ymm0, %ymm4
  863. vunpckhpd %ymm1, %ymm0, %ymm5
  864. vunpcklpd %ymm3, %ymm2, %ymm6
  865. vunpckhpd %ymm3, %ymm2, %ymm7
  866. #else
  867. vpermilpd $ 0x05 , %ymm5, %ymm5
  868. vpermilpd $ 0x05 , %ymm7, %ymm7
  869. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  870. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  871. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  872. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  873. vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
  874. vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
  875. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  876. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  877. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  878. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  879. #endif
  /* %rax = CO1 + 2*LDC. */
  880. leaq (CO1, LDC, 2), %rax
  881. #if !defined(TRMMKERNEL)
  882. vaddpd (CO1), %ymm4, %ymm4
  883. vaddpd (CO1, LDC), %ymm5, %ymm5
  884. vaddpd (%rax), %ymm6, %ymm6
  885. vaddpd (%rax, LDC), %ymm7, %ymm7
  886. #endif
  887. vmovups %ymm4 , (CO1)
  888. vmovups %ymm5 , (CO1, LDC)
  889. vmovups %ymm6 , (%rax)
  890. vmovups %ymm7 , (%rax, LDC)
  891. prefetcht0 56(CO1)
  892. prefetcht0 56(CO1,LDC)
  893. prefetcht0 56(%rax)
  894. prefetcht0 56(%rax,LDC)
  /* Group 2: ymm8..ymm11, shuffled into ymm4..ymm7. */
  895. #if defined BROADCASTKERNEL
  896. vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0
  897. vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1
  898. vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2
  899. vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3
  900. vunpcklpd %ymm1, %ymm0, %ymm4
  901. vunpckhpd %ymm1, %ymm0, %ymm5
  902. vunpcklpd %ymm3, %ymm2, %ymm6
  903. vunpckhpd %ymm3, %ymm2, %ymm7
  904. #else
  905. vpermilpd $ 0x05 , %ymm9 , %ymm9
  906. vpermilpd $ 0x05 , %ymm11, %ymm11
  907. vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
  908. vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
  909. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
  910. vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
  911. vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
  912. vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
  913. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  914. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  915. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  916. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  917. #endif
  /* %rax = CO1 + 4*LDC, %rbp = CO1 + 6*LDC. */
  918. leaq (%rax, LDC, 2), %rax
  919. leaq (%rax, LDC, 2), %rbp
  920. #if !defined(TRMMKERNEL)
  921. vaddpd (%rax), %ymm4, %ymm4
  922. vaddpd (%rax, LDC), %ymm5, %ymm5
  923. vaddpd (%rbp), %ymm6, %ymm6
  924. vaddpd (%rbp, LDC), %ymm7, %ymm7
  925. #endif
  926. vmovups %ymm4 , (%rax)
  927. vmovups %ymm5 , (%rax, LDC)
  928. vmovups %ymm6 , (%rbp)
  929. vmovups %ymm7 , (%rbp, LDC)
  930. prefetcht0 56(%rax)
  931. prefetcht0 56(%rax,LDC)
  932. prefetcht0 56(%rbp)
  933. prefetcht0 56(%rbp,LDC)
  934. addq $ 4*SIZE, CO1
  935. .endm
  936. /******************************************************************************************/
  /*
   * INIT2x8: clear the eight accumulators xmm4..xmm11 (xor with self)
   * before a run of KERNEL2x8_SUB steps.
   */
  937. .macro INIT2x8
  938. vxorpd %xmm4 , %xmm4 , %xmm4
  939. vxorpd %xmm5 , %xmm5 , %xmm5
  940. vxorpd %xmm6 , %xmm6 , %xmm6
  941. vxorpd %xmm7 , %xmm7 , %xmm7
  942. vxorpd %xmm8 , %xmm8 , %xmm8
  943. vxorpd %xmm9 , %xmm9 , %xmm9
  944. vxorpd %xmm10, %xmm10, %xmm10
  945. vxorpd %xmm11, %xmm11, %xmm11
  946. .endm
  /*
   * KERNEL2x8_SUB: one accumulation step for a 2-row by 8-column tile.
   *
   * Loads one xmm (two packed doubles) from -16*SIZE(AO) into xmm0, then for
   * each of the eight values at -12*SIZE(BO) .. -5*SIZE(BO) duplicates the
   * value into both lanes (vmovddup) and accumulates
   *     xmm(4+j) += xmm0 * b[j]        j = 0..7
   * The duplicated values rotate through xmm1..xmm3.
   * Ends with BO += 8*SIZE and AO += 2*SIZE.
   */
  947. .macro KERNEL2x8_SUB
  948. vmovups -16 * SIZE(AO), %xmm0
  949. vmovddup -12 * SIZE(BO), %xmm1
  950. vmovddup -11 * SIZE(BO), %xmm2
  951. vmovddup -10 * SIZE(BO), %xmm3
  952. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  953. vmovddup -9 * SIZE(BO), %xmm1
  954. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  955. vmovddup -8 * SIZE(BO), %xmm2
  956. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  957. vmovddup -7 * SIZE(BO), %xmm3
  958. vfmadd231pd %xmm0 ,%xmm1 , %xmm7
  959. vmovddup -6 * SIZE(BO), %xmm1
  960. vfmadd231pd %xmm0 ,%xmm2 , %xmm8
  961. vmovddup -5 * SIZE(BO), %xmm2
  962. vfmadd231pd %xmm0 ,%xmm3 , %xmm9
  963. vfmadd231pd %xmm0 ,%xmm1 , %xmm10
  964. vfmadd231pd %xmm0 ,%xmm2 , %xmm11
  965. addq $ 8*SIZE, BO
  966. addq $ 2*SIZE, AO
  967. .endm
  /*
   * SAVE2x8: write back a 2-row by 8-column tile held in xmm4..xmm11
   * (one xmm, i.e. two packed doubles, per column).
   *
   *   1. Every accumulator is multiplied by ALPHA (duplicated into xmm0).
   *   2. Unless TRMMKERNEL is defined, the current memory contents of each
   *      column are added to the accumulator.
   *   3. Column j is stored at CO1 + j*LDC (addresses formed in %rax / %rbp).
   *
   * Each accumulator is updated and stored in place.  The previous version
   * routed columns 4..7 through xmm4..xmm7 using the vaddpd destination as
   * the only move; with TRMMKERNEL defined those vaddpd lines are compiled
   * out, so columns 4..7 were overwritten with stale copies of columns 0..3.
   *
   * Uses xmm0, %rax and %rbp as scratch; ends with CO1 += 2*SIZE.
   */
  968. .macro SAVE2x8
  969. vmovddup ALPHA, %xmm0
  970. vmulpd %xmm0 , %xmm4 , %xmm4
  971. vmulpd %xmm0 , %xmm5 , %xmm5
  972. vmulpd %xmm0 , %xmm6 , %xmm6
  973. vmulpd %xmm0 , %xmm7 , %xmm7
  974. vmulpd %xmm0 , %xmm8 , %xmm8
  975. vmulpd %xmm0 , %xmm9 , %xmm9
  976. vmulpd %xmm0 , %xmm10, %xmm10
  977. vmulpd %xmm0 , %xmm11, %xmm11
  /* Columns 0..3: %rax = CO1 + 2*LDC. */
  978. leaq (CO1, LDC, 2), %rax
  979. #if !defined(TRMMKERNEL)
  980. vaddpd (CO1), %xmm4, %xmm4
  981. vaddpd (CO1, LDC), %xmm5, %xmm5
  982. vaddpd (%rax), %xmm6, %xmm6
  983. vaddpd (%rax, LDC), %xmm7, %xmm7
  984. #endif
  985. vmovups %xmm4 , (CO1)
  986. vmovups %xmm5 , (CO1, LDC)
  987. vmovups %xmm6 , (%rax)
  988. vmovups %xmm7 , (%rax, LDC)
  /* Columns 4..7: %rax = CO1 + 4*LDC, %rbp = CO1 + 6*LDC. */
  989. leaq (%rax, LDC, 2), %rax
  990. leaq (%rax, LDC, 2), %rbp
  991. #if !defined(TRMMKERNEL)
  992. vaddpd (%rax), %xmm8 , %xmm8
  993. vaddpd (%rax, LDC), %xmm9 , %xmm9
  994. vaddpd (%rbp), %xmm10, %xmm10
  995. vaddpd (%rbp, LDC), %xmm11, %xmm11
  996. #endif
  997. vmovups %xmm8 , (%rax)
  998. vmovups %xmm9 , (%rax, LDC)
  999. vmovups %xmm10, (%rbp)
  1000. vmovups %xmm11, (%rbp, LDC)
  1001. addq $ 2*SIZE, CO1
  1002. .endm
  1003. /******************************************************************************************/
  /*
   * INIT1x8: clear the eight accumulators xmm4..xmm11 (xor with self)
   * before a run of KERNEL1x8_SUB steps.
   */
  1004. .macro INIT1x8
  1005. vxorpd %xmm4 , %xmm4 , %xmm4
  1006. vxorpd %xmm5 , %xmm5 , %xmm5
  1007. vxorpd %xmm6 , %xmm6 , %xmm6
  1008. vxorpd %xmm7 , %xmm7 , %xmm7
  1009. vxorpd %xmm8 , %xmm8 , %xmm8
  1010. vxorpd %xmm9 , %xmm9 , %xmm9
  1011. vxorpd %xmm10, %xmm10, %xmm10
  1012. vxorpd %xmm11, %xmm11, %xmm11
  1013. .endm
  /*
   * KERNEL1x8_SUB: one accumulation step for a 1-row by 8-column tile,
   * using scalar-double instructions.
   *
   * Loads the scalar at -16*SIZE(AO) into xmm0 and, for each of the eight
   * scalars at -12*SIZE(BO) .. -5*SIZE(BO), accumulates
   *     xmm(4+j) += xmm0 * b[j]        j = 0..7   (low lane only)
   * The B scalars rotate through xmm1..xmm3.
   * Ends with BO += 8*SIZE and AO += 1*SIZE.
   */
  1014. .macro KERNEL1x8_SUB
  1015. vmovsd -16 * SIZE(AO), %xmm0
  1016. vmovsd -12 * SIZE(BO), %xmm1
  1017. vmovsd -11 * SIZE(BO), %xmm2
  1018. vmovsd -10 * SIZE(BO), %xmm3
  1019. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  1020. vmovsd -9 * SIZE(BO), %xmm1
  1021. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  1022. vmovsd -8 * SIZE(BO), %xmm2
  1023. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  1024. vmovsd -7 * SIZE(BO), %xmm3
  1025. vfmadd231sd %xmm0 ,%xmm1 , %xmm7
  1026. vmovsd -6 * SIZE(BO), %xmm1
  1027. vfmadd231sd %xmm0 ,%xmm2 , %xmm8
  1028. vmovsd -5 * SIZE(BO), %xmm2
  1029. vfmadd231sd %xmm0 ,%xmm3 , %xmm9
  1030. vfmadd231sd %xmm0 ,%xmm1 , %xmm10
  1031. vfmadd231sd %xmm0 ,%xmm2 , %xmm11
  1032. addq $ 8*SIZE, BO
  1033. addq $ 1*SIZE, AO
  1034. .endm
  /*
   * SAVE1x8: write back a 1-row by 8-column tile held in the low lanes of
   * xmm4..xmm11 (one scalar double per column).
   *
   *   1. Every accumulator is multiplied by ALPHA (loaded into xmm0).
   *   2. Unless TRMMKERNEL is defined, the current memory value of each
   *      column is added to the accumulator.
   *   3. Column j is stored at CO1 + j*LDC (addresses formed in %rax / %rbp).
   *
   * Each accumulator is updated and stored in place.  The previous version
   * routed columns 4..7 through xmm4..xmm7 using the vaddsd destination as
   * the only move; with TRMMKERNEL defined those vaddsd lines are compiled
   * out, so columns 4..7 were overwritten with stale copies of columns 0..3.
   *
   * Uses xmm0, %rax and %rbp as scratch; ends with CO1 += 1*SIZE.
   */
  1035. .macro SAVE1x8
  1036. vmovsd ALPHA, %xmm0
  1037. vmulsd %xmm0 , %xmm4 , %xmm4
  1038. vmulsd %xmm0 , %xmm5 , %xmm5
  1039. vmulsd %xmm0 , %xmm6 , %xmm6
  1040. vmulsd %xmm0 , %xmm7 , %xmm7
  1041. vmulsd %xmm0 , %xmm8 , %xmm8
  1042. vmulsd %xmm0 , %xmm9 , %xmm9
  1043. vmulsd %xmm0 , %xmm10, %xmm10
  1044. vmulsd %xmm0 , %xmm11, %xmm11
  /* Columns 0..3: %rax = CO1 + 2*LDC. */
  1045. leaq (CO1, LDC, 2), %rax
  1046. #if !defined(TRMMKERNEL)
  1047. vaddsd (CO1), %xmm4, %xmm4
  1048. vaddsd (CO1, LDC), %xmm5, %xmm5
  1049. vaddsd (%rax), %xmm6, %xmm6
  1050. vaddsd (%rax, LDC), %xmm7, %xmm7
  1051. #endif
  1052. vmovsd %xmm4 , (CO1)
  1053. vmovsd %xmm5 , (CO1, LDC)
  1054. vmovsd %xmm6 , (%rax)
  1055. vmovsd %xmm7 , (%rax, LDC)
  /* Columns 4..7: %rax = CO1 + 4*LDC, %rbp = CO1 + 6*LDC. */
  1056. leaq (%rax, LDC, 2), %rax
  1057. leaq (%rax, LDC, 2), %rbp
  1058. #if !defined(TRMMKERNEL)
  1059. vaddsd (%rax), %xmm8 , %xmm8
  1060. vaddsd (%rax, LDC), %xmm9 , %xmm9
  1061. vaddsd (%rbp), %xmm10, %xmm10
  1062. vaddsd (%rbp, LDC), %xmm11, %xmm11
  1063. #endif
  1064. vmovsd %xmm8 , (%rax)
  1065. vmovsd %xmm9 , (%rax, LDC)
  1066. vmovsd %xmm10, (%rbp)
  1067. vmovsd %xmm11, (%rbp, LDC)
  1068. addq $ 1*SIZE, CO1
  1069. .endm
  1070. /******************************************************************************************/
  /*
   * INIT4x4: clear the four accumulators ymm4..ymm7 (xor with self)
   * before a run of KERNEL4x4_SUB steps.
   */
  1071. .macro INIT4x4
  1072. vxorpd %ymm4 , %ymm4 , %ymm4
  1073. vxorpd %ymm5 , %ymm5 , %ymm5
  1074. vxorpd %ymm6 , %ymm6 , %ymm6
  1075. vxorpd %ymm7 , %ymm7 , %ymm7
  1076. .endm
  /*
   * KERNEL4x4_I: opening step of the software-pipelined 4x4 loop.
   *
   * Uses vmulpd instead of FMA, so ymm4..ymm7 are overwritten rather than
   * accumulated into (their previous contents are not read).
   *
   * B: one vector from -12*SIZE(BO) into ymm1.
   * A: BROADCASTKERNEL broadcasts the four elements at -16..-13*SIZE(AO)
   *    one at a time; otherwise one 4-element vector is loaded and then
   *    permuted in place (0x05 swaps the pair in each 128-bit half,
   *    0x1b reverses all four elements).
   *
   * BO is advanced by 4*SIZE and ymm1 is reloaded from the new -12 offset
   * for the macro that follows.  AO is not advanced here.
   * Issues prefetcht0 at A_PR1(AO).
   */
  1077. .macro KERNEL4x4_I
  1078. prefetcht0 A_PR1(AO)
  1079. vmovups -12 * SIZE(BO), %ymm1
  1080. #if defined BROADCASTKERNEL
  1081. vbroadcastsd -16 * SIZE(AO), %ymm0
  1082. #else
  1083. vmovups -16 * SIZE(AO), %ymm0
  1084. #endif
  1085. vmulpd %ymm0 ,%ymm1 , %ymm4
  1086. #if defined BROADCASTKERNEL
  1087. vbroadcastsd -15 * SIZE(AO), %ymm0
  1088. #else
  1089. vpermilpd $ 0x05, %ymm0 , %ymm0
  1090. #endif
  1091. vmulpd %ymm0 ,%ymm1 , %ymm5
  1092. #if defined BROADCASTKERNEL
  1093. vbroadcastsd -14 * SIZE(AO), %ymm0
  1094. #else
  1095. vpermpd $ 0x1b, %ymm0 , %ymm0
  1096. #endif
  1097. vmulpd %ymm0 ,%ymm1 , %ymm6
  1098. addq $ 4*SIZE, BO
  1099. #if defined BROADCASTKERNEL
  1100. vbroadcastsd -13 * SIZE(AO), %ymm0
  1101. #else
  1102. vpermilpd $ 0x05, %ymm0 , %ymm0
  1103. #endif
  1104. vmulpd %ymm0 ,%ymm1 , %ymm7
  1105. vmovups -12 * SIZE(BO), %ymm1
  1106. .endm
  /*
   * KERNEL4x4_M1: middle step of the pipelined 4x4 loop that reads A at
   * -16..-13*SIZE(AO).
   *
   * Expects ymm1 to already hold the B vector for this step (it is read
   * before being written).  Accumulates into ymm4..ymm7 with FMA, then
   * reloads ymm1 from -12*SIZE(BO) for the next step.  Neither AO nor BO is
   * advanced.  Issues prefetcht0 at A_PR1(AO).
   */
  1107. .macro KERNEL4x4_M1
  1108. prefetcht0 A_PR1(AO)
  1109. #if defined BROADCASTKERNEL
  1110. vbroadcastsd -16 * SIZE(AO), %ymm0
  1111. #else
  1112. vmovups -16 * SIZE(AO), %ymm0
  1113. #endif
  1114. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  1115. #if defined BROADCASTKERNEL
  1116. vbroadcastsd -15 * SIZE(AO), %ymm0
  1117. #else
  1118. vpermilpd $ 0x05, %ymm0 , %ymm0
  1119. #endif
  1120. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  1121. #if defined BROADCASTKERNEL
  1122. vbroadcastsd -14 * SIZE(AO), %ymm0
  1123. #else
  1124. vpermpd $ 0x1b, %ymm0 , %ymm0
  1125. #endif
  1126. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  1127. #if defined BROADCASTKERNEL
  1128. vbroadcastsd -13 * SIZE(AO), %ymm0
  1129. #else
  1130. vpermilpd $ 0x05, %ymm0 , %ymm0
  1131. #endif
  1132. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  1133. vmovups -12 * SIZE(BO), %ymm1
  1134. .endm
  /*
   * KERNEL4x4_M2: middle step of the pipelined 4x4 loop that reads A at
   * -12..-9*SIZE(AO).
   *
   * Expects ymm1 to already hold the B vector for this step.
   * AO is advanced by 8*SIZE before the last A element is fetched, so in
   * the BROADCASTKERNEL path -17*SIZE(AO) addresses what was -9*SIZE(AO).
   * After the FMAs, ymm1 is reloaded from -8*SIZE(BO) and BO is advanced by
   * 8*SIZE.
   */
  1135. .macro KERNEL4x4_M2
  1136. #if defined BROADCASTKERNEL
  1137. vbroadcastsd -12 * SIZE(AO), %ymm0
  1138. #else
  1139. vmovups -12 * SIZE(AO), %ymm0
  1140. #endif
  1141. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  1142. #if defined BROADCASTKERNEL
  1143. vbroadcastsd -11 * SIZE(AO), %ymm0
  1144. #else
  1145. vpermilpd $ 0x05, %ymm0 , %ymm0
  1146. #endif
  1147. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  1148. #if defined BROADCASTKERNEL
  1149. vbroadcastsd -10 * SIZE(AO), %ymm0
  1150. #else
  1151. vpermpd $ 0x1b, %ymm0 , %ymm0
  1152. #endif
  1153. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  1154. addq $ 8*SIZE, AO
  1155. #if defined BROADCASTKERNEL
  1156. vbroadcastsd -17 * SIZE(AO), %ymm0
  1157. #else
  1158. vpermilpd $ 0x05, %ymm0 , %ymm0
  1159. #endif
  1160. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  1161. vmovups -8 * SIZE(BO), %ymm1
  1162. addq $ 8*SIZE, BO
  1163. .endm
  /*
   * KERNEL4x4_E: closing step of the pipelined 4x4 loop.
   *
   * Same arithmetic as KERNEL4x4_M2 (A at -12..-9*SIZE(AO), B taken from
   * the already-loaded ymm1) but nothing is reloaded afterwards.
   * AO is advanced by 8*SIZE before the last A element is fetched (hence
   * the -17*SIZE(AO) offset in the BROADCASTKERNEL path); BO is advanced by
   * 4*SIZE at the end.
   */
  1164. .macro KERNEL4x4_E
  1165. #if defined BROADCASTKERNEL
  1166. vbroadcastsd -12 * SIZE(AO), %ymm0
  1167. #else
  1168. vmovups -12 * SIZE(AO), %ymm0
  1169. #endif
  1170. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  1171. #if defined BROADCASTKERNEL
  1172. vbroadcastsd -11 * SIZE(AO), %ymm0
  1173. #else
  1174. vpermilpd $ 0x05, %ymm0 , %ymm0
  1175. #endif
  1176. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  1177. #if defined BROADCASTKERNEL
  1178. vbroadcastsd -10 * SIZE(AO), %ymm0
  1179. #else
  1180. vpermpd $ 0x1b, %ymm0 , %ymm0
  1181. #endif
  1182. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  1183. addq $ 8*SIZE, AO
  1184. #if defined BROADCASTKERNEL
  1185. vbroadcastsd -17 * SIZE(AO), %ymm0
  1186. #else
  1187. vpermilpd $ 0x05, %ymm0 , %ymm0
  1188. #endif
  1189. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  1190. addq $ 4*SIZE, BO
  1191. .endm
  /*
   * KERNEL4x4_SUB: self-contained single accumulation step for the 4x4
   * tile (no dependence on preloaded registers).
   *
   * Loads B from -12*SIZE(BO) into ymm1 and A from -16..-13*SIZE(AO)
   * (broadcast per element, or one vector load followed by in-register
   * permutes), accumulating into ymm4..ymm7 with FMA.
   * BO and AO are each advanced by 4*SIZE; the AO bump happens before the
   * last A fetch, hence the -17*SIZE(AO) offset in the BROADCASTKERNEL path.
   */
  1192. .macro KERNEL4x4_SUB
  1193. vmovups -12 * SIZE(BO), %ymm1
  1194. #if defined BROADCASTKERNEL
  1195. vbroadcastsd -16 * SIZE(AO), %ymm0
  1196. #else
  1197. vmovups -16 * SIZE(AO), %ymm0
  1198. #endif
  1199. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  1200. #if defined BROADCASTKERNEL
  1201. vbroadcastsd -15 * SIZE(AO), %ymm0
  1202. #else
  1203. vpermilpd $ 0x05, %ymm0 , %ymm0
  1204. #endif
  1205. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  1206. addq $ 4*SIZE, BO
  1207. #if defined BROADCASTKERNEL
  1208. vbroadcastsd -14 * SIZE(AO), %ymm0
  1209. #else
  1210. vpermpd $ 0x1b, %ymm0 , %ymm0
  1211. #endif
  1212. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  1213. addq $ 4*SIZE, AO
  1214. #if defined BROADCASTKERNEL
  1215. vbroadcastsd -17 * SIZE(AO), %ymm0
  1216. #else
  1217. vpermilpd $ 0x05, %ymm0 , %ymm0
  1218. #endif
  1219. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  1220. .endm
  /*
   * SAVE4x4: write back a tile held in accumulators ymm4..ymm7.
   *
   *   1. Every accumulator is multiplied by ALPHA (broadcast into ymm0).
   *   2. The four registers are reshuffled in place:
   *        BROADCASTKERNEL : vperm2f128 + vunpck{l,h}pd, i.e. a 4x4 transpose;
   *        otherwise       : vpermilpd / vblendpd / vperm2f128 regrouping.
   *   3. Unless TRMMKERNEL is defined, the current memory contents are added.
   *   4. ymm4..ymm7 are stored at CO1, CO1+LDC, CO1+2*LDC, CO1+3*LDC.
   *
   * Uses ymm0..ymm3 and %rax as scratch; ends with CO1 += 4*SIZE.
   */
  1221. .macro SAVE4x4
  1222. vbroadcastsd ALPHA, %ymm0
  1223. vmulpd %ymm0 , %ymm4 , %ymm4
  1224. vmulpd %ymm0 , %ymm7 , %ymm7
  1225. vmulpd %ymm0 , %ymm5 , %ymm5
  1226. vmulpd %ymm0 , %ymm6 , %ymm6
  1227. #if defined BROADCASTKERNEL
  1228. vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0
  1229. vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1
  1230. vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2
  1231. vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3
  1232. vunpcklpd %ymm1, %ymm0, %ymm4
  1233. vunpckhpd %ymm1, %ymm0, %ymm5
  1234. vunpcklpd %ymm3, %ymm2, %ymm6
  1235. vunpckhpd %ymm3, %ymm2, %ymm7
  1236. #else
  1237. vpermilpd $ 0x05 , %ymm5, %ymm5
  1238. vpermilpd $ 0x05 , %ymm7, %ymm7
  1239. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  1240. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  1241. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  1242. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  1243. vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2
  1244. vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3
  1245. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  1246. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  1247. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  1248. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  1249. #endif
  /* %rax = CO1 + 2*LDC. */
  1250. leaq (CO1, LDC, 2), %rax
  1251. #if !defined(TRMMKERNEL)
  1252. vaddpd (CO1), %ymm4, %ymm4
  1253. vaddpd (CO1, LDC), %ymm5, %ymm5
  1254. vaddpd (%rax), %ymm6, %ymm6
  1255. vaddpd (%rax, LDC), %ymm7, %ymm7
  1256. #endif
  1257. vmovups %ymm4 , (CO1)
  1258. vmovups %ymm5 , (CO1, LDC)
  1259. vmovups %ymm6 , (%rax)
  1260. vmovups %ymm7 , (%rax, LDC)
  1261. addq $ 4*SIZE, CO1
  1262. .endm
  1263. /******************************************************************************************/
  1264. /******************************************************************************************/
  /*
   * INIT2x4: clear the four accumulators xmm4..xmm7 (xor with self)
   * before a run of KERNEL2x4_SUB steps.
   */
  1265. .macro INIT2x4
  1266. vxorpd %xmm4 , %xmm4 , %xmm4
  1267. vxorpd %xmm5 , %xmm5 , %xmm5
  1268. vxorpd %xmm6 , %xmm6 , %xmm6
  1269. vxorpd %xmm7 , %xmm7 , %xmm7
  1270. .endm
  /*
   * KERNEL2x4_SUB: one accumulation step for a 2-row by 4-column tile.
   *
   * Loads one xmm (two packed doubles) from -16*SIZE(AO) into xmm0 and
   * duplicates each of the four values at -12*SIZE(BO) .. -9*SIZE(BO) into
   * xmm1, xmm2, xmm3 and xmm8 respectively, accumulating
   *     xmm(4+j) += xmm0 * b[j]        j = 0..3
   * Note that xmm8 is used as a temporary here, not as an accumulator.
   * Ends with BO += 4*SIZE and AO += 2*SIZE.
   */
  1271. .macro KERNEL2x4_SUB
  1272. vmovddup -12 * SIZE(BO), %xmm1
  1273. vmovups -16 * SIZE(AO), %xmm0
  1274. vmovddup -11 * SIZE(BO), %xmm2
  1275. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  1276. vmovddup -10 * SIZE(BO), %xmm3
  1277. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  1278. vmovddup -9 * SIZE(BO), %xmm8
  1279. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  1280. addq $ 4*SIZE, BO
  1281. vfmadd231pd %xmm0 ,%xmm8 , %xmm7
  1282. addq $ 2*SIZE, AO
  1283. .endm
  /*
   * SAVE2x4: write back a 2-row by 4-column tile held in xmm4..xmm7
   * (one xmm, i.e. two packed doubles, per column).
   *
   * Multiplies each accumulator by ALPHA (duplicated into xmm0), adds the
   * current memory contents unless TRMMKERNEL is defined, and stores
   * column j at CO1 + j*LDC.
   * Uses xmm0 and %rax as scratch; ends with CO1 += 2*SIZE.
   */
  1284. .macro SAVE2x4
  1285. vmovddup ALPHA, %xmm0
  1286. vmulpd %xmm0 , %xmm4 , %xmm4
  1287. vmulpd %xmm0 , %xmm5 , %xmm5
  1288. vmulpd %xmm0 , %xmm6 , %xmm6
  1289. vmulpd %xmm0 , %xmm7 , %xmm7
  /* %rax = CO1 + 2*LDC. */
  1290. leaq (CO1, LDC, 2), %rax
  1291. #if !defined(TRMMKERNEL)
  1292. vaddpd (CO1), %xmm4, %xmm4
  1293. vaddpd (CO1, LDC), %xmm5, %xmm5
  1294. vaddpd (%rax), %xmm6, %xmm6
  1295. vaddpd (%rax, LDC), %xmm7, %xmm7
  1296. #endif
  1297. vmovups %xmm4 , (CO1)
  1298. vmovups %xmm5 , (CO1, LDC)
  1299. vmovups %xmm6 , (%rax)
  1300. vmovups %xmm7 , (%rax, LDC)
  1301. addq $ 2*SIZE, CO1
  1302. .endm
  1303. /******************************************************************************************/
  1304. /******************************************************************************************/
  /* INIT1x4: zero the four accumulators %xmm4..%xmm7 before a 1x4 tile is
     accumulated by KERNEL1x4_SUB. */
  .macro INIT1x4
      .irp acc, 4, 5, 6, 7
          vxorpd %xmm\acc, %xmm\acc, %xmm\acc
      .endr
  .endm
  /* KERNEL1x4_SUB: one k-step of the 1x4 tile (scalar).
     Loads one double from -16*SIZE(AO) into %xmm0 and the four doubles at
     -12..-9*SIZE(BO) into %xmm1, %xmm2, %xmm3, %xmm8; then accumulates the
     scalar products into the low lanes of %xmm4, %xmm5, %xmm6, %xmm7.
     Advances BO by 4*SIZE and AO by 1*SIZE. */
  1311. .macro KERNEL1x4_SUB
  1312. vmovsd -12 * SIZE(BO), %xmm1
  1313. vmovsd -16 * SIZE(AO), %xmm0
  1314. vmovsd -11 * SIZE(BO), %xmm2
  1315. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  1316. vmovsd -10 * SIZE(BO), %xmm3
  1317. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  1318. vmovsd -9 * SIZE(BO), %xmm8
  1319. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  1320. addq $ 4*SIZE, BO
  1321. vfmadd231sd %xmm0 ,%xmm8 , %xmm7
  1322. addq $ 1*SIZE, AO
  1323. .endm
  /* SAVE1x4: write back a 1x4 tile (scalar).
     Multiplies the low lanes of %xmm4..%xmm7 by ALPHA and, unless
     TRMMKERNEL is defined, adds the values already stored in C.  The four
     scalars go to (CO1), (CO1,LDC), (CO1 + 2*LDC) and (CO1 + 3*LDC); CO1
     then advances by 1*SIZE.
     Clobbers %rax (= CO1 + 2*LDC) and %xmm0. */
  1324. .macro SAVE1x4
  1325. vmovsd ALPHA, %xmm0
  1326. vmulsd %xmm0 , %xmm4 , %xmm4
  1327. vmulsd %xmm0 , %xmm5 , %xmm5
  1328. vmulsd %xmm0 , %xmm6 , %xmm6
  1329. vmulsd %xmm0 , %xmm7 , %xmm7
  1330. leaq (CO1, LDC, 2), %rax
  1331. #if !defined(TRMMKERNEL)
  1332. vaddsd (CO1), %xmm4, %xmm4
  1333. vaddsd (CO1, LDC), %xmm5, %xmm5
  1334. vaddsd (%rax), %xmm6, %xmm6
  1335. vaddsd (%rax, LDC), %xmm7, %xmm7
  1336. #endif
  1337. vmovsd %xmm4 , (CO1)
  1338. vmovsd %xmm5 , (CO1, LDC)
  1339. vmovsd %xmm6 , (%rax)
  1340. vmovsd %xmm7 , (%rax, LDC)
  1341. addq $ 1*SIZE, CO1
  1342. .endm
  1343. /******************************************************************************************/
  1344. /******************************************************************************************/
  /* INIT4x2: zero the four accumulators %xmm4..%xmm7 before a 4x2 tile is
     accumulated by KERNEL4x2_SUB. */
  .macro INIT4x2
      .irp acc, 4, 5, 6, 7
          vxorpd %xmm\acc, %xmm\acc, %xmm\acc
      .endr
  .endm
  /* KERNEL4x2_SUB: one k-step of the 4x2 tile.
     %xmm0/%xmm1 receive the four doubles at -16..-13*SIZE(AO) (two per
     register); %xmm2/%xmm3 receive the doubles at -12 and -11*SIZE(BO),
     each duplicated into both lanes.  Then
       %xmm4 += %xmm0*%xmm2   %xmm5 += %xmm1*%xmm2
       %xmm6 += %xmm0*%xmm3   %xmm7 += %xmm1*%xmm3
     Advances BO by 2*SIZE and AO by 4*SIZE. */
  1351. .macro KERNEL4x2_SUB
  1352. vmovddup -12 * SIZE(BO), %xmm2
  1353. vmovups -16 * SIZE(AO), %xmm0
  1354. vmovups -14 * SIZE(AO), %xmm1
  1355. vmovddup -11 * SIZE(BO), %xmm3
  1356. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  1357. vfmadd231pd %xmm1 ,%xmm2 , %xmm5
  1358. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  1359. vfmadd231pd %xmm1 ,%xmm3 , %xmm7
  1360. addq $ 2*SIZE, BO
  1361. addq $ 4*SIZE, AO
  1362. .endm
  /* SAVE4x2: write back a 4x2 tile.
     Multiplies %xmm4..%xmm7 by ALPHA (duplicated into both lanes of %xmm0)
     and, unless TRMMKERNEL is defined, adds the values already stored in C.
     %xmm4/%xmm5 go to (CO1) and 2*SIZE(CO1); %xmm6/%xmm7 go to (CO1,LDC)
     and 2*SIZE(CO1,LDC).  CO1 then advances by 4*SIZE.  Clobbers %xmm0. */
  1363. .macro SAVE4x2
  1364. vmovddup ALPHA, %xmm0
  1365. vmulpd %xmm0 , %xmm4 , %xmm4
  1366. vmulpd %xmm0 , %xmm5 , %xmm5
  1367. vmulpd %xmm0 , %xmm6 , %xmm6
  1368. vmulpd %xmm0 , %xmm7 , %xmm7
  1369. #if !defined(TRMMKERNEL)
  1370. vaddpd (CO1) , %xmm4, %xmm4
  1371. vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
  1372. vaddpd (CO1, LDC), %xmm6, %xmm6
  1373. vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7
  1374. #endif
  1375. vmovups %xmm4 , (CO1)
  1376. vmovups %xmm5 , 2 * SIZE(CO1)
  1377. vmovups %xmm6 , (CO1, LDC)
  1378. vmovups %xmm7 , 2 * SIZE(CO1, LDC)
  1379. addq $ 4*SIZE, CO1
  1380. .endm
  1381. /******************************************************************************************/
  1382. /******************************************************************************************/
  /* INIT2x2: zero the two accumulators %xmm4 and %xmm6 before a 2x2 tile is
     accumulated by KERNEL2x2_SUB. */
  .macro INIT2x2
      .irp acc, 4, 6
          vxorpd %xmm\acc, %xmm\acc, %xmm\acc
      .endr
  .endm
  /* KERNEL2x2_SUB: one k-step of the 2x2 tile.
     %xmm0 = two packed doubles at -16*SIZE(AO); %xmm2/%xmm3 = the doubles at
     -12 and -11*SIZE(BO), each duplicated into both lanes.
     %xmm4 += %xmm0*%xmm2, %xmm6 += %xmm0*%xmm3.
     Advances BO and AO by 2*SIZE each. */
  1387. .macro KERNEL2x2_SUB
  1388. vmovddup -12 * SIZE(BO), %xmm2
  1389. vmovups -16 * SIZE(AO), %xmm0
  1390. vmovddup -11 * SIZE(BO), %xmm3
  1391. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  1392. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  1393. addq $ 2*SIZE, BO
  1394. addq $ 2*SIZE, AO
  1395. .endm
  /* SAVE2x2: write back a 2x2 tile.
     Multiplies %xmm4 and %xmm6 by ALPHA (duplicated into both lanes of
     %xmm0) and, unless TRMMKERNEL is defined, adds the values already
     stored in C.  %xmm4 goes to (CO1), %xmm6 to (CO1,LDC); CO1 then
     advances by 2*SIZE.  Clobbers %xmm0. */
  1396. .macro SAVE2x2
  1397. vmovddup ALPHA, %xmm0
  1398. vmulpd %xmm0 , %xmm4 , %xmm4
  1399. vmulpd %xmm0 , %xmm6 , %xmm6
  1400. #if !defined(TRMMKERNEL)
  1401. vaddpd (CO1) , %xmm4, %xmm4
  1402. vaddpd (CO1, LDC), %xmm6, %xmm6
  1403. #endif
  1404. vmovups %xmm4 , (CO1)
  1405. vmovups %xmm6 , (CO1, LDC)
  1406. addq $ 2*SIZE, CO1
  1407. .endm
  1408. /******************************************************************************************/
  1409. /******************************************************************************************/
  /* INIT1x2: zero the two accumulators %xmm4 and %xmm5 before a 1x2 tile is
     accumulated by KERNEL1x2_SUB. */
  .macro INIT1x2
      .irp acc, 4, 5
          vxorpd %xmm\acc, %xmm\acc, %xmm\acc
      .endr
  .endm
  /* KERNEL1x2_SUB: one k-step of the 1x2 tile (scalar).
     %xmm0 = double at -16*SIZE(AO); %xmm1/%xmm2 = doubles at -12 and
     -11*SIZE(BO).  Low lanes: %xmm4 += %xmm0*%xmm1, %xmm5 += %xmm0*%xmm2.
     Advances BO by 2*SIZE and AO by 1*SIZE. */
  1414. .macro KERNEL1x2_SUB
  1415. vmovsd -12 * SIZE(BO), %xmm1
  1416. vmovsd -16 * SIZE(AO), %xmm0
  1417. vmovsd -11 * SIZE(BO), %xmm2
  1418. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  1419. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  1420. addq $ 2*SIZE, BO
  1421. addq $ 1*SIZE, AO
  1422. .endm
  /* SAVE1x2: write back a 1x2 tile (scalar).
     Multiplies the low lanes of %xmm4 and %xmm5 by ALPHA and, unless
     TRMMKERNEL is defined, adds the values already stored in C.  %xmm4 goes
     to (CO1), %xmm5 to (CO1,LDC); CO1 then advances by 1*SIZE.
     Clobbers %xmm0. */
  1423. .macro SAVE1x2
  1424. vmovsd ALPHA, %xmm0
  1425. vmulsd %xmm0 , %xmm4 , %xmm4
  1426. vmulsd %xmm0 , %xmm5 , %xmm5
  1427. #if !defined(TRMMKERNEL)
  1428. vaddsd (CO1), %xmm4, %xmm4
  1429. vaddsd (CO1, LDC), %xmm5, %xmm5
  1430. #endif
  1431. vmovsd %xmm4 , (CO1)
  1432. vmovsd %xmm5 , (CO1, LDC)
  1433. addq $ 1*SIZE, CO1
  1434. .endm
  1435. /******************************************************************************************/
  1436. /******************************************************************************************/
  /* INIT4x1: zero the four 256-bit accumulators %ymm4..%ymm7.  KERNEL4x1
     spreads its partial sums over all four; SAVE4x1 adds them together. */
  .macro INIT4x1
      .irp acc, 4, 5, 6, 7
          vxorpd %ymm\acc, %ymm\acc, %ymm\acc
      .endr
  .endm
  /* KERNEL4x1: eight k-steps of the 4x1 tile, fully unrolled.
     Step j (j = 0..7) broadcasts the double at (-12 + j)*SIZE(BO) and
     multiplies it with the four packed doubles at (-16 + 4*j)*SIZE(AO).
     The products are accumulated round-robin into %ymm4, %ymm5, %ymm6,
     %ymm7 (steps 0-3) and again into %ymm4..%ymm7 (steps 4-7), so the four
     registers hold partial sums that must be added together afterwards.
     Advances BO by 8*SIZE and AO by 32*SIZE.
     Clobbers %ymm0..%ymm3. */
  1443. .macro KERNEL4x1
  1444. vbroadcastsd -12 * SIZE(BO), %ymm0
  1445. vbroadcastsd -11 * SIZE(BO), %ymm1
  1446. vbroadcastsd -10 * SIZE(BO), %ymm2
  1447. vbroadcastsd -9 * SIZE(BO), %ymm3
  1448. vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
  1449. vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
  1450. vbroadcastsd -8 * SIZE(BO), %ymm0
  1451. vbroadcastsd -7 * SIZE(BO), %ymm1
  1452. vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
  1453. vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
  1454. vbroadcastsd -6 * SIZE(BO), %ymm2
  1455. vbroadcastsd -5 * SIZE(BO), %ymm3
  1456. vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
  1457. vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
  1458. vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
  1459. vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
  1460. addq $ 8 *SIZE, BO
  1461. addq $ 32*SIZE, AO
  1462. .endm
  /* KERNEL4x1_SUB: a single k-step of the 4x1 tile.
     Broadcasts the double at -12*SIZE(BO) into %ymm2, loads four packed
     doubles from -16*SIZE(AO) into %ymm0 and accumulates the product into
     %ymm4 only.  Advances BO by 1*SIZE and AO by 4*SIZE. */
  1463. .macro KERNEL4x1_SUB
  1464. vbroadcastsd -12 * SIZE(BO), %ymm2
  1465. vmovups -16 * SIZE(AO), %ymm0
  1466. vfmadd231pd %ymm0 ,%ymm2 , %ymm4
  1467. addq $ 1*SIZE, BO
  1468. addq $ 4*SIZE, AO
  1469. .endm
  /* SAVE4x1: write back a 4x1 tile.
     First reduces the four partial-sum registers into %ymm4
     (%ymm4 = (%ymm4 + %ymm5) + (%ymm6 + %ymm7)), then multiplies by ALPHA
     (broadcast into %ymm0) and, unless TRMMKERNEL is defined, adds the four
     doubles already stored at (CO1).  Stores the result to (CO1) and
     advances CO1 by 4*SIZE.  Clobbers %ymm0 and %ymm6. */
  1470. .macro SAVE4x1
  1471. vbroadcastsd ALPHA, %ymm0
  1472. vaddpd %ymm4,%ymm5, %ymm4
  1473. vaddpd %ymm6,%ymm7, %ymm6
  1474. vaddpd %ymm4,%ymm6, %ymm4
  1475. vmulpd %ymm0 , %ymm4 , %ymm4
  1476. #if !defined(TRMMKERNEL)
  1477. vaddpd (CO1) , %ymm4, %ymm4
  1478. #endif
  1479. vmovups %ymm4 , (CO1)
  1480. addq $ 4*SIZE, CO1
  1481. .endm
  1482. /******************************************************************************************/
  1483. /******************************************************************************************/
  /* INIT2x1: zero %xmm4, the only accumulator of the 2x1 tile. */
  1484. .macro INIT2x1
  1485. vxorpd %xmm4 , %xmm4 , %xmm4
  1486. .endm
  /* KERNEL2x1_SUB: one k-step of the 2x1 tile.
     %xmm2 = double at -12*SIZE(BO) duplicated into both lanes; %xmm0 = two
     packed doubles at -16*SIZE(AO); %xmm4 += %xmm0*%xmm2.
     Advances BO by 1*SIZE and AO by 2*SIZE. */
  1487. .macro KERNEL2x1_SUB
  1488. vmovddup -12 * SIZE(BO), %xmm2
  1489. vmovups -16 * SIZE(AO), %xmm0
  1490. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  1491. addq $ 1*SIZE, BO
  1492. addq $ 2*SIZE, AO
  1493. .endm
  /* SAVE2x1: write back a 2x1 tile.
     Multiplies %xmm4 by ALPHA (duplicated into both lanes of %xmm0) and,
     unless TRMMKERNEL is defined, adds the two doubles already stored at
     (CO1).  Stores to (CO1) and advances CO1 by 2*SIZE.  Clobbers %xmm0. */
  1494. .macro SAVE2x1
  1495. vmovddup ALPHA, %xmm0
  1496. vmulpd %xmm0 , %xmm4 , %xmm4
  1497. #if !defined(TRMMKERNEL)
  1498. vaddpd (CO1) , %xmm4, %xmm4
  1499. #endif
  1500. vmovups %xmm4 , (CO1)
  1501. addq $ 2*SIZE, CO1
  1502. .endm
  1503. /******************************************************************************************/
  1504. /******************************************************************************************/
  /* INIT1x1: zero %xmm4, the only accumulator of the 1x1 tile. */
  1505. .macro INIT1x1
  1506. vxorpd %xmm4 , %xmm4 , %xmm4
  1507. .endm
  /* KERNEL1x1_SUB: one k-step of the 1x1 tile (scalar).
     %xmm1 = double at -12*SIZE(BO), %xmm0 = double at -16*SIZE(AO);
     low lane of %xmm4 += %xmm0*%xmm1.  Advances BO and AO by 1*SIZE each. */
  1508. .macro KERNEL1x1_SUB
  1509. vmovsd -12 * SIZE(BO), %xmm1
  1510. vmovsd -16 * SIZE(AO), %xmm0
  1511. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  1512. addq $ 1*SIZE, BO
  1513. addq $ 1*SIZE, AO
  1514. .endm
  /* SAVE1x1: write back a single element.
     Multiplies the low lane of %xmm4 by ALPHA and, unless TRMMKERNEL is
     defined, adds the double already stored at (CO1).  Stores to (CO1) and
     advances CO1 by 1*SIZE.  Clobbers %xmm0. */
  1515. .macro SAVE1x1
  1516. vmovsd ALPHA, %xmm0
  1517. vmulsd %xmm0 , %xmm4 , %xmm4
  1518. #if !defined(TRMMKERNEL)
  1519. vaddsd (CO1), %xmm4, %xmm4
  1520. #endif
  1521. vmovsd %xmm4 , (CO1)
  1522. addq $ 1*SIZE, CO1
  1523. .endm
  /* PREFETCHT0_C: issue prefetcht0 hints for C.
     Touches byte offsets 0 and 24 at three addresses: CO1, CO1 + 4*LDC and
     CO1 + 8*LDC.  Modifies no registers; callers shift CO1 between
     invocations to reach the remaining columns of a 12-wide tile. */
  1524. .macro PREFETCHT0_C
  1525. prefetcht0 (CO1)
  1526. prefetcht0 24(CO1)
  1527. prefetcht0 (CO1,LDC,4)
  1528. prefetcht0 24(CO1,LDC,4)
  1529. prefetcht0 (CO1,LDC,8)
  1530. prefetcht0 24(CO1,LDC,8)
  1531. .endm
  1532. /*******************************************************************************************/
  1533. #if !defined(TRMMKERNEL)
  /* Kernel entry (non-TRMM build).
     - Reserves STACKSIZE bytes and saves %rbx, %rbp, %r12-%r15 there (plus
       %rdi, %rsi and %xmm6-%xmm15 under WINDOWS_ABI).
     - Loads LDC (and, under WINDOWS_ABI, the remaining arguments).
     - Remembers the frame pointer in SP, then carves a 4096-byte-aligned
       scratch area of at least 128 + L_BUFFER_SIZE bytes below it.
     - Returns through .L999 when M, N or K is zero.
     - Splits N into J = N / 24 full groups and a remainder N % 24. */
  1534. PROLOGUE
  1535. PROFCODE
  1536. subq $STACKSIZE, %rsp
  1537. movq %rbx, (%rsp)
  1538. movq %rbp, 8(%rsp)
  1539. movq %r12, 16(%rsp)
  1540. movq %r13, 24(%rsp)
  1541. movq %r14, 32(%rsp)
  1542. movq %r15, 40(%rsp)
  1543. vzeroupper
  1544. #ifdef WINDOWS_ABI
  1545. movq %rdi, 48(%rsp)
  1546. movq %rsi, 56(%rsp)
  1547. vmovups %xmm6, 64(%rsp)
  1548. vmovups %xmm7, 80(%rsp)
  1549. vmovups %xmm8, 96(%rsp)
  1550. vmovups %xmm9, 112(%rsp)
  1551. vmovups %xmm10, 128(%rsp)
  1552. vmovups %xmm11, 144(%rsp)
  1553. vmovups %xmm12, 160(%rsp)
  1554. vmovups %xmm13, 176(%rsp)
  1555. vmovups %xmm14, 192(%rsp)
  1556. vmovups %xmm15, 208(%rsp)
  1557. movq ARG1, OLD_M
  1558. movq ARG2, OLD_N
  1559. movq ARG3, OLD_K
  1560. movq OLD_A, A
  1561. movq OLD_B, B
  1562. movq OLD_C, C
  1563. movq OLD_LDC, LDC
  1564. vmovups %xmm3, %xmm0 // move %xmm3 into %xmm0, which is stored to ALPHA below
  1565. #else
  1566. movq STACKSIZE + 8(%rsp), LDC
  1567. #endif
  1568. movq %rsp, SP # save old stack
  1569. subq $128 + L_BUFFER_SIZE, %rsp
  1570. andq $-4096, %rsp # align stack down to a 4096-byte boundary
  1571. STACK_TOUCH
  1572. cmpq $ 0, OLD_M
  1573. je .L999
  1574. cmpq $ 0, OLD_N
  1575. je .L999
  1576. cmpq $ 0, OLD_K
  1577. je .L999
  1578. movq OLD_M, M
  1579. movq OLD_N, N
  1580. movq OLD_K, K
  1581. vmovsd %xmm0, ALPHA
  1582. salq $BASE_SHIFT, LDC // LDC <<= BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT is defined elsewhere)
  1583. movq N, %rax
  1584. xorq %rdx, %rdx // clear the high half of the dividend for divq
  1585. movq $24, %rdi
  1586. divq %rdi // rax = N / 24, rdx = N % 24
  1587. movq %rax, Ndiv12 // quotient N / 24 (24, not 12, despite the name)
  1588. movq %rdx, Nmod12 // remainder N % 24 (24, not 12, despite the name)
  1589. movq Ndiv12, J
  1590. cmpq $ 0, J
  1591. je .L8_0 // fewer than 24 columns: go straight to the remainder code
  1592. ALIGN_4
  /* Pack B for the first 12-wide half of a 24-column group.
     For each of K steps, 8 values from BO1 and the first 4 of the 8 values
     at BO2 are copied into 12 consecutive slots of BUFFER1.  On exit B
     points at the data BO2 started from. */
  1593. .L12_01:
  1594. // copy to sub buffer
  1595. movq K, %rax
  1596. salq $3,%rax // rax = K * 8 ; 8 values are read from BO1 per k-step
  1597. movq B, BO1
  1598. leaq (B,%rax, SIZE), BO2 // BO2 = B + K*8*SIZE
  1599. movq BO2 , B // B = BO2 (consumed further by .L13_01)
  1600. leaq BUFFER1, BO // first buffer to BO
  1601. movq K, %rax // loop counter: K iterations
  1602. ALIGN_4
  1603. .L12_02b:
  1604. vmovups 0 * SIZE(BO1), %ymm1
  1605. vmovups 4 * SIZE(BO1), %ymm2
  1606. vmovups 0 * SIZE(BO2), %ymm3
  1607. vmovups %ymm1, 0 * SIZE(BO)
  1608. vmovups %ymm2, 4 * SIZE(BO)
  1609. vmovups %ymm3, 8 * SIZE(BO)
  1610. addq $ 8*SIZE,BO1
  1611. addq $ 8*SIZE,BO2
  1612. addq $ 12*SIZE,BO
  1613. decq %rax
  1614. jnz .L12_02b
  1615. .L12_03c:
  /* 4x12 tiles over the first 12-wide half of a 24-column group.
     CO1 = C, C advances by 12*LDC, and the loop at .L12_11 runs M/4 times.
     Per tile: K/8 unrolled passes (8 KERNEL4x12_* invocations each;
     the macros are defined earlier in the file), then K%8 single steps via
     KERNEL4x12_SUB, then SAVE4x12.  B is used only as a prefetch cursor
     here and is restored after the loop. */
  1616. .L12_10:
  1617. movq C, CO1
  1618. leaq (C, LDC, 8), C
  1619. leaq (C, LDC, 4), C // c += 12 * ldc
  1620. movq A, AO // aoffset = a
  1621. addq $16 * SIZE, AO // bias AO by 16*SIZE
  1622. movq M, I
  1623. sarq $2, I // i = m / 4
  1624. je .L12_20
  1625. ALIGN_4
  1626. .L12_11:
  1627. leaq BUFFER1, BO // first buffer to BO
  1628. addq $12 * SIZE, BO // bias BO by 12*SIZE
  1629. movq K, %rax
  1630. sarq $3, %rax // K / 8
  1631. cmpq $2, %rax
  1632. jl .L12_13 // fewer than two passes of 8: handled at .L12_13
  1633. KERNEL4x12_I
  1634. KERNEL4x12_M2
  1635. KERNEL4x12_M1
  1636. KERNEL4x12_M2
  1637. KERNEL4x12_M1
  1638. KERNEL4x12_M2
  1639. KERNEL4x12_M1
  1640. KERNEL4x12_M2
  1641. subq $2, %rax // first pass done above, last pass done at .L12_12a
  1642. je .L12_12a
  1643. ALIGN_5
  1644. .L12_12:
  1645. KERNEL4x12_M1
  1646. KERNEL4x12_M2
  1647. KERNEL4x12_M1
  1648. KERNEL4x12_M2
  1649. KERNEL4x12_M1
  1650. KERNEL4x12_M2
  1651. KERNEL4x12_M1
  1652. KERNEL4x12_M2
  1653. dec %rax
  1654. jne .L12_12
  1655. .L12_12a:
  1656. prefetcht0 ALPHA
  1657. PREFETCHT0_C
  1658. addq LDC,CO1 // CO1 is moved +1, +2, -1, -2 LDC (net 0) so the prefetches cover different C addresses
  1659. KERNEL4x12_M1
  1660. PREFETCHT0_C
  1661. leaq (CO1,LDC,2),CO1
  1662. KERNEL4x12_M2
  1663. PREFETCHT0_C
  1664. subq LDC,CO1
  1665. KERNEL4x12_M1
  1666. PREFETCHT0_C
  1667. subq LDC,CO1
  1668. subq LDC,CO1 // CO1 is back at its original value
  1669. KERNEL4x12_M2
  1670. KERNEL4x12_M1
  1671. KERNEL4x12_M2
  1672. KERNEL4x12_M1
  1673. KERNEL4x12_E
  1674. jmp .L12_16
  1675. .L12_13:
  1676. test $1, %rax // K / 8 < 2 here: is there exactly one pass of 8?
  1677. jz .L12_14
  1678. KERNEL4x12_I
  1679. KERNEL4x12_M2
  1680. KERNEL4x12_M1
  1681. KERNEL4x12_M2
  1682. KERNEL4x12_M1
  1683. KERNEL4x12_M2
  1684. KERNEL4x12_M1
  1685. KERNEL4x12_E
  1686. jmp .L12_16
  1687. .L12_14:
  1688. INIT4x12
  1689. .L12_16:
  1690. movq K, %rax
  1691. andq $7, %rax // rax = K % 8 (leftover k-steps)
  1692. je .L12_19
  1693. ALIGN_4
  1694. .L12_17:
  1695. KERNEL4x12_SUB
  1696. dec %rax
  1697. jne .L12_17
  1698. ALIGN_4
  1699. .L12_19:
  1700. SAVE4x12
  1701. /* here for the prefetch of next b source block */
  1702. /* the increment should be proportional to GEMM_Q/GEMM_P */
  1703. salq $3, K // temporarily K *= 8; used as the index in (B, K, 8) below
  1704. #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
  1705. prefetcht2 32(B)
  1706. prefetcht2 32(B, K, 8)
  1707. addq $64, B /* increment */
  1708. #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
  1709. prefetcht2 32(B)
  1710. prefetcht2 32(B, K, 8)
  1711. prefetcht2 96(B)
  1712. prefetcht2 96(B, K, 8)
  1713. addq $128, B /* increment */
  1714. #endif
  1715. sarq $3, K // restore K
  1716. decq I # i --
  1717. jne .L12_11
  1718. ALIGN_4
  1719. /**************************************************************************
  1720. * Rest of M
  1721. ***************************************************************************/
  1722. /* recover the original value of pointer B after prefetch */
  1723. movq M, I
  1724. sarq $2, I // number of tiles processed above
  1725. #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
  1726. salq $6, I // * 64: matches the per-tile increment of B
  1727. #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
  1728. salq $7, I // * 128: matches the per-tile increment of B
  1729. #endif
  1730. subq I, B
  /* M remainder for the first 12-wide half: one 2x12 tile when bit 1 of M
     is set, then one 1x12 tile when bit 0 of M is set.  Each tile runs K/8
     passes of eight *_SUB steps followed by K%8 single steps.
     INIT/KERNEL/SAVE 2x12 and 1x12 macros are defined earlier in the file. */
  1731. .L12_20:
  1732. // Test rest of M
  1733. testq $3, M
  1734. jz .L12_100 // M % 4 == 0: nothing left for this 12-column half
  1735. .L12_30:
  1736. testq $2, M
  1737. jz .L12_40
  1738. ALIGN_4
  1739. .L12_31:
  1740. leaq BUFFER1, BO // first buffer to BO
  1741. addq $12 * SIZE, BO
  1742. INIT2x12
  1743. movq K, %rax
  1744. sarq $3, %rax // K / 8
  1745. je .L12_36
  1746. ALIGN_4
  1747. .L12_32:
  1748. KERNEL2x12_SUB
  1749. KERNEL2x12_SUB
  1750. KERNEL2x12_SUB
  1751. KERNEL2x12_SUB
  1752. KERNEL2x12_SUB
  1753. KERNEL2x12_SUB
  1754. KERNEL2x12_SUB
  1755. KERNEL2x12_SUB
  1756. dec %rax
  1757. jne .L12_32
  1758. ALIGN_4
  1759. .L12_36:
  1760. movq K, %rax
  1761. andq $7, %rax // rax = K % 8 (leftover k-steps)
  1762. je .L12_39
  1763. ALIGN_4
  1764. .L12_37:
  1765. KERNEL2x12_SUB
  1766. dec %rax
  1767. jne .L12_37
  1768. ALIGN_4
  1769. .L12_39:
  1770. SAVE2x12
  1771. ALIGN_4
  1772. .L12_40:
  1773. testq $1, M
  1774. jz .L12_100 // bit 0 of M clear: no 1x12 tile
  1775. ALIGN_4
  1776. .L12_41:
  1777. leaq BUFFER1, BO // first buffer to BO
  1778. addq $12 * SIZE, BO
  1779. INIT1x12
  1780. movq K, %rax
  1781. sarq $3,%rax // K / 8
  1782. je .L12_46
  1783. ALIGN_4
  1784. .L12_42:
  1785. KERNEL1x12_SUB
  1786. KERNEL1x12_SUB
  1787. KERNEL1x12_SUB
  1788. KERNEL1x12_SUB
  1789. KERNEL1x12_SUB
  1790. KERNEL1x12_SUB
  1791. KERNEL1x12_SUB
  1792. KERNEL1x12_SUB
  1793. dec %rax
  1794. jne .L12_42
  1795. ALIGN_4
  1796. .L12_46:
  1797. movq K, %rax
  1798. andq $7, %rax // rax = K % 8 (leftover k-steps)
  1799. je .L12_49
  1800. ALIGN_4
  1801. .L12_47:
  1802. KERNEL1x12_SUB
  1803. dec %rax
  1804. jne .L12_47
  1805. ALIGN_4
  1806. .L12_49:
  1807. SAVE1x12
  1808. ALIGN_4
  1809. .L12_100:
  1810. /**************************************************************************************************/
  /* Pack B for the second 12-wide half of a 24-column group.
     For each of K steps, the last 4 of the 8 values at BO2 and all 8 values
     at BO3 are copied into 12 consecutive slots of BUFFER1.  On exit B
     points K*8*SIZE past the start of BO3. */
  1811. .L13_01:
  1812. // copy to sub buffer
  1813. movq K, %rax
  1814. salq $3,%rax // K * 8 ; read 8 values
  1815. movq B, BO2
  1816. leaq (B,%rax, SIZE), BO3 // BO3 = B + K*8*SIZE
  1817. leaq (BO3,%rax, SIZE), B // B = BO3 + K*8*SIZE
  1818. leaq BUFFER1, BO // first buffer to BO
  1819. movq K, %rax // loop counter: K iterations
  1820. ALIGN_4
  1821. .L13_02b:
  1822. vmovups 4 * SIZE(BO2), %ymm1
  1823. vmovups 0 * SIZE(BO3), %ymm2
  1824. vmovups 4 * SIZE(BO3), %ymm3
  1825. vmovups %ymm1, 0 * SIZE(BO)
  1826. vmovups %ymm2, 4 * SIZE(BO)
  1827. vmovups %ymm3, 8 * SIZE(BO)
  1828. addq $ 8*SIZE,BO2
  1829. addq $ 8*SIZE,BO3
  1830. addq $ 12*SIZE,BO
  1831. decq %rax
  1832. jnz .L13_02b
  /* 4x12 tiles over the second 12-wide half of a 24-column group.
     Same structure as the .L12_10 section: CO1 = C, C advances by 12*LDC,
     M/4 tiles, each with K/8 unrolled passes, K%8 single steps and
     SAVE4x12.  B is used only as a prefetch cursor and is restored after
     the loop. */
  1833. .L13_10:
  1834. movq C, CO1
  1835. leaq (C, LDC, 8), C
  1836. leaq (C, LDC, 4), C // c += 12 * ldc
  1837. movq A, AO // aoffset = a
  1838. addq $16 * SIZE, AO // bias AO by 16*SIZE
  1839. movq M, I
  1840. sarq $2, I // i = m / 4
  1841. je .L13_20
  1842. ALIGN_4
  1843. .L13_11:
  1844. leaq BUFFER1, BO // first buffer to BO
  1845. addq $12 * SIZE, BO // bias BO by 12*SIZE
  1846. movq K, %rax
  1847. sarq $3, %rax // K / 8
  1848. cmpq $2, %rax
  1849. jl .L13_13 // fewer than two passes of 8: handled at .L13_13
  1850. KERNEL4x12_I
  1851. KERNEL4x12_M2
  1852. KERNEL4x12_M1
  1853. KERNEL4x12_M2
  1854. KERNEL4x12_M1
  1855. KERNEL4x12_M2
  1856. KERNEL4x12_M1
  1857. KERNEL4x12_M2
  1858. subq $2, %rax // first pass done above, last pass done at .L13_12a
  1859. je .L13_12a
  1860. ALIGN_5
  1861. .L13_12:
  1862. KERNEL4x12_M1
  1863. KERNEL4x12_M2
  1864. KERNEL4x12_M1
  1865. KERNEL4x12_M2
  1866. KERNEL4x12_M1
  1867. KERNEL4x12_M2
  1868. KERNEL4x12_M1
  1869. KERNEL4x12_M2
  1870. dec %rax
  1871. jne .L13_12
  1872. .L13_12a:
  1873. prefetcht0 ALPHA
  1874. PREFETCHT0_C
  1875. addq LDC,CO1 // CO1 is moved +1, +2, -1, -2 LDC (net 0) so the prefetches cover different C addresses
  1876. KERNEL4x12_M1
  1877. PREFETCHT0_C
  1878. leaq (CO1,LDC,2),CO1
  1879. KERNEL4x12_M2
  1880. PREFETCHT0_C
  1881. subq LDC,CO1
  1882. KERNEL4x12_M1
  1883. PREFETCHT0_C
  1884. subq LDC,CO1
  1885. subq LDC,CO1 // CO1 is back at its original value
  1886. KERNEL4x12_M2
  1887. KERNEL4x12_M1
  1888. KERNEL4x12_M2
  1889. KERNEL4x12_M1
  1890. KERNEL4x12_E
  1891. jmp .L13_16
  1892. .L13_13:
  1893. test $1, %rax // K / 8 < 2 here: is there exactly one pass of 8?
  1894. jz .L13_14
  1895. KERNEL4x12_I
  1896. KERNEL4x12_M2
  1897. KERNEL4x12_M1
  1898. KERNEL4x12_M2
  1899. KERNEL4x12_M1
  1900. KERNEL4x12_M2
  1901. KERNEL4x12_M1
  1902. KERNEL4x12_E
  1903. jmp .L13_16
  1904. .L13_14:
  1905. INIT4x12
  1906. .L13_16:
  1907. movq K, %rax
  1908. andq $7, %rax // rax = K % 8 (leftover k-steps)
  1909. je .L13_19
  1910. ALIGN_4
  1911. .L13_17:
  1912. KERNEL4x12_SUB
  1913. dec %rax
  1914. jne .L13_17
  1915. ALIGN_4
  1916. .L13_19:
  1917. SAVE4x12
  1918. /* here for the prefetch of next b source block */
  1919. /* the increment should be proportional to GEMM_Q/GEMM_P */
  1920. salq $3, K // temporarily K *= 8; used as the index in (B, K, 8) below
  1921. #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
  1922. prefetcht2 (B)
  1923. prefetcht2 (B, K, 8)
  1924. addq $64, B /* increment */
  1925. #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
  1926. prefetcht2 (B)
  1927. prefetcht2 (B, K, 8)
  1928. prefetcht2 64(B)
  1929. prefetcht2 64(B, K, 8)
  1930. addq $128, B /* increment */
  1931. #endif
  1932. sarq $3, K // restore K
  1933. decq I # i --
  1934. jne .L13_11
  1935. ALIGN_4
  1936. /**************************************************************************
  1937. * Rest of M
  1938. ***************************************************************************/
  1939. /* recover the original value of pointer B */
  1940. movq M, I
  1941. sarq $2, I // number of tiles processed above
  1942. #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */
  1943. salq $6, I // * 64: matches the per-tile increment of B
  1944. #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */
  1945. salq $7, I // * 128: matches the per-tile increment of B
  1946. #endif
  1947. subq I, B
  /* M remainder for the second 12-wide half (one 2x12 tile if bit 1 of M is
     set, one 1x12 tile if bit 0 is set), followed by the end of the
     24-column loop: J is decremented and control returns to .L12_01 while
     J > 0. */
  1948. .L13_20:
  1949. // Test rest of M
  1950. testq $3, M
  1951. jz .L13_100 // M % 4 == 0: nothing left for this 12-column half
  1952. .L13_30:
  1953. testq $2, M
  1954. jz .L13_40
  1955. ALIGN_4
  1956. .L13_31:
  1957. leaq BUFFER1, BO // first buffer to BO
  1958. addq $12 * SIZE, BO
  1959. INIT2x12
  1960. movq K, %rax
  1961. sarq $3, %rax // K / 8
  1962. je .L13_36
  1963. ALIGN_4
  1964. .L13_32:
  1965. KERNEL2x12_SUB
  1966. KERNEL2x12_SUB
  1967. KERNEL2x12_SUB
  1968. KERNEL2x12_SUB
  1969. KERNEL2x12_SUB
  1970. KERNEL2x12_SUB
  1971. KERNEL2x12_SUB
  1972. KERNEL2x12_SUB
  1973. dec %rax
  1974. jne .L13_32
  1975. ALIGN_4
  1976. .L13_36:
  1977. movq K, %rax
  1978. andq $7, %rax // rax = K % 8 (leftover k-steps)
  1979. je .L13_39
  1980. ALIGN_4
  1981. .L13_37:
  1982. KERNEL2x12_SUB
  1983. dec %rax
  1984. jne .L13_37
  1985. ALIGN_4
  1986. .L13_39:
  1987. SAVE2x12
  1988. ALIGN_4
  1989. .L13_40:
  1990. testq $1, M
  1991. jz .L13_100 // bit 0 of M clear: no 1x12 tile
  1992. ALIGN_4
  1993. .L13_41:
  1994. leaq BUFFER1, BO // first buffer to BO
  1995. addq $12 * SIZE, BO
  1996. INIT1x12
  1997. movq K, %rax
  1998. sarq $3,%rax // K / 8
  1999. je .L13_46
  2000. ALIGN_4
  2001. .L13_42:
  2002. KERNEL1x12_SUB
  2003. KERNEL1x12_SUB
  2004. KERNEL1x12_SUB
  2005. KERNEL1x12_SUB
  2006. KERNEL1x12_SUB
  2007. KERNEL1x12_SUB
  2008. KERNEL1x12_SUB
  2009. KERNEL1x12_SUB
  2010. dec %rax
  2011. jne .L13_42
  2012. ALIGN_4
  2013. .L13_46:
  2014. movq K, %rax
  2015. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2016. je .L13_49
  2017. ALIGN_4
  2018. .L13_47:
  2019. KERNEL1x12_SUB
  2020. dec %rax
  2021. jne .L13_47
  2022. ALIGN_4
  2023. .L13_49:
  2024. SAVE1x12
  2025. ALIGN_4
  2026. .L13_100:
  2027. decq J // j --
  2028. jg .L12_01 // next group of 24 columns
  2029. /**************************************************************************************************/
  /* Remainder columns, 8 at a time.
     J = (N % 24) / 8 groups; for each group CO1 = C, C advances by 8*LDC and
     M/4 tiles of 4x8 are computed.  BO reads B directly (no copy into
     BUFFER1).  KERNEL4x8_*, INIT4x8 and SAVE4x8 are defined earlier in the
     file. */
  2030. .L8_0:
  2031. cmpq $ 0, Nmod12 // N % 24 == 0: no remainder columns
  2032. je .L999
  2033. movq Nmod12, J
  2034. sarq $3, J // j = (N % 24) / 8
  2035. je .L4_0
  2036. .L8_10:
  2037. movq C, CO1
  2038. leaq (C, LDC, 8), C // c += 8 * ldc
  2039. movq A, AO // aoffset = a
  2040. addq $16 * SIZE, AO // bias AO by 16*SIZE
  2041. movq M, I
  2042. sarq $2, I // i = m / 4
  2043. je .L8_20
  2044. ALIGN_4
  2045. .L8_11:
  2046. movq B, BO
  2047. addq $12 * SIZE, BO // bias BO by 12*SIZE
  2048. movq K, %rax
  2049. sarq $3, %rax // K / 8
  2050. cmpq $2, %rax
  2051. jl .L8_13 // fewer than two passes of 8: handled at .L8_13
  2052. KERNEL4x8_I
  2053. KERNEL4x8_M2
  2054. KERNEL4x8_M1
  2055. KERNEL4x8_M2
  2056. KERNEL4x8_M1
  2057. KERNEL4x8_M2
  2058. KERNEL4x8_M1
  2059. KERNEL4x8_M2
  2060. subq $2, %rax // first pass done above, last pass done at .L8_12a
  2061. je .L8_12a
  2062. ALIGN_5
  2063. .L8_12:
  2064. KERNEL4x8_M1
  2065. KERNEL4x8_M2
  2066. KERNEL4x8_M1
  2067. KERNEL4x8_M2
  2068. KERNEL4x8_M1
  2069. KERNEL4x8_M2
  2070. KERNEL4x8_M1
  2071. KERNEL4x8_M2
  2072. dec %rax
  2073. jne .L8_12
  2074. .L8_12a:
  2075. KERNEL4x8_M1
  2076. KERNEL4x8_M2
  2077. KERNEL4x8_M1
  2078. KERNEL4x8_M2
  2079. KERNEL4x8_M1
  2080. KERNEL4x8_M2
  2081. KERNEL4x8_M1
  2082. KERNEL4x8_E
  2083. jmp .L8_16
  2084. .L8_13:
  2085. test $1, %rax // K / 8 < 2 here: is there exactly one pass of 8?
  2086. jz .L8_14
  2087. KERNEL4x8_I
  2088. KERNEL4x8_M2
  2089. KERNEL4x8_M1
  2090. KERNEL4x8_M2
  2091. KERNEL4x8_M1
  2092. KERNEL4x8_M2
  2093. KERNEL4x8_M1
  2094. KERNEL4x8_E
  2095. jmp .L8_16
  2096. .L8_14:
  2097. INIT4x8
  2098. .L8_16:
  2099. movq K, %rax
  2100. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2101. je .L8_19
  2102. ALIGN_4
  2103. .L8_17:
  2104. KERNEL4x8_SUB
  2105. dec %rax
  2106. jne .L8_17
  2107. ALIGN_4
  2108. .L8_19:
  2109. SAVE4x8
  2110. decq I # i --
  2111. jg .L8_11
  2112. ALIGN_4
  2113. /**************************************************************************
  2114. * Rest of M
  2115. ***************************************************************************/
  /* M remainder for an 8-column group (one 2x8 tile if bit 1 of M is set,
     one 1x8 tile if bit 0 is set), then the end of the 8-column loop:
     B advances by K*8*SIZE and J is decremented. */
  2116. .L8_20:
  2117. // Test rest of M
  2118. testq $3, M
  2119. jz .L8_100 // M % 4 == 0: nothing left for this 8-column group
  2120. .L8_30:
  2121. testq $2, M
  2122. jz .L8_40
  2123. ALIGN_4
  2124. .L8_31:
  2125. movq B, BO // BO = B (B is read directly, no copy buffer)
  2126. addq $12 * SIZE, BO
  2127. INIT2x8
  2128. movq K, %rax
  2129. sarq $3, %rax // K / 8
  2130. je .L8_36
  2131. ALIGN_4
  2132. .L8_32:
  2133. KERNEL2x8_SUB
  2134. KERNEL2x8_SUB
  2135. KERNEL2x8_SUB
  2136. KERNEL2x8_SUB
  2137. KERNEL2x8_SUB
  2138. KERNEL2x8_SUB
  2139. KERNEL2x8_SUB
  2140. KERNEL2x8_SUB
  2141. dec %rax
  2142. jne .L8_32
  2143. ALIGN_4
  2144. .L8_36:
  2145. movq K, %rax
  2146. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2147. je .L8_39
  2148. ALIGN_4
  2149. .L8_37:
  2150. KERNEL2x8_SUB
  2151. dec %rax
  2152. jne .L8_37
  2153. .L8_39:
  2154. SAVE2x8
  2155. .L8_40:
  2156. testq $1, M
  2157. jz .L8_100 // bit 0 of M clear: no 1x8 tile
  2158. ALIGN_4
  2159. .L8_41:
  2160. movq B, BO // BO = B (B is read directly, no copy buffer)
  2161. addq $12 * SIZE, BO
  2162. INIT1x8
  2163. movq K, %rax
  2164. sarq $3,%rax // K / 8
  2165. je .L8_46
  2166. ALIGN_4
  2167. .L8_42:
  2168. KERNEL1x8_SUB
  2169. KERNEL1x8_SUB
  2170. KERNEL1x8_SUB
  2171. KERNEL1x8_SUB
  2172. KERNEL1x8_SUB
  2173. KERNEL1x8_SUB
  2174. KERNEL1x8_SUB
  2175. KERNEL1x8_SUB
  2176. dec %rax
  2177. jne .L8_42
  2178. ALIGN_4
  2179. .L8_46:
  2180. movq K, %rax
  2181. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2182. je .L8_49
  2183. ALIGN_4
  2184. .L8_47:
  2185. KERNEL1x8_SUB
  2186. dec %rax
  2187. jne .L8_47
  2188. ALIGN_4
  2189. .L8_49:
  2190. SAVE1x8
  2191. ALIGN_4
  2192. .L8_100:
  2193. movq K, %rax
  2194. salq $3, %rax // * 8
  2195. leaq (B , %rax, SIZE), B // B += K*8*SIZE
  2196. decq J // j --
  2197. jg .L8_10
  2198. /**************************************************************************************************/
  /* Remainder columns: one group of 4 when bit 2 of N % 24 is set.
     CO1 = C, C advances by 4*LDC and M/4 tiles of 4x4 are computed with BO
     reading B directly.  KERNEL4x4_*, INIT4x4 and SAVE4x4 are defined
     earlier in the file. */
  2199. .L4_0:
  2200. cmpq $ 0, Nmod12 // N % 24 == 0: no remainder columns
  2201. je .L999
  2202. movq Nmod12, J
  2203. testq $4, J // bit 2 of N % 24 set: a 4-column group remains
  2204. je .L2_0
  2205. .L4_10:
  2206. movq C, CO1
  2207. leaq (C, LDC, 4), C // c += 4 * ldc
  2208. movq A, AO // aoffset = a
  2209. addq $16 * SIZE, AO // bias AO by 16*SIZE
  2210. movq M, I
  2211. sarq $2, I // i = m / 4
  2212. je .L4_20
  2213. ALIGN_4
  2214. .L4_11:
  2215. movq B, BO
  2216. addq $12 * SIZE, BO // bias BO by 12*SIZE
  2217. movq K, %rax
  2218. sarq $3, %rax // K / 8
  2219. cmpq $2, %rax
  2220. jl .L4_13 // fewer than two passes of 8: handled at .L4_13
  2221. KERNEL4x4_I
  2222. KERNEL4x4_M2
  2223. KERNEL4x4_M1
  2224. KERNEL4x4_M2
  2225. KERNEL4x4_M1
  2226. KERNEL4x4_M2
  2227. KERNEL4x4_M1
  2228. KERNEL4x4_M2
  2229. subq $2, %rax // first pass done above, last pass done at .L4_12a
  2230. je .L4_12a
  2231. ALIGN_5
  2232. .L4_12:
  2233. KERNEL4x4_M1
  2234. KERNEL4x4_M2
  2235. KERNEL4x4_M1
  2236. KERNEL4x4_M2
  2237. KERNEL4x4_M1
  2238. KERNEL4x4_M2
  2239. KERNEL4x4_M1
  2240. KERNEL4x4_M2
  2241. dec %rax
  2242. jne .L4_12
  2243. .L4_12a:
  2244. KERNEL4x4_M1
  2245. KERNEL4x4_M2
  2246. KERNEL4x4_M1
  2247. KERNEL4x4_M2
  2248. KERNEL4x4_M1
  2249. KERNEL4x4_M2
  2250. KERNEL4x4_M1
  2251. KERNEL4x4_E
  2252. jmp .L4_16
  2253. .L4_13:
  2254. test $1, %rax // K / 8 < 2 here: is there exactly one pass of 8?
  2255. jz .L4_14
  2256. KERNEL4x4_I
  2257. KERNEL4x4_M2
  2258. KERNEL4x4_M1
  2259. KERNEL4x4_M2
  2260. KERNEL4x4_M1
  2261. KERNEL4x4_M2
  2262. KERNEL4x4_M1
  2263. KERNEL4x4_E
  2264. jmp .L4_16
  2265. .L4_14:
  2266. INIT4x4
  2267. .L4_16:
  2268. movq K, %rax
  2269. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2270. je .L4_19
  2271. ALIGN_4
  2272. .L4_17:
  2273. KERNEL4x4_SUB
  2274. dec %rax
  2275. jne .L4_17
  2276. ALIGN_4
  2277. .L4_19:
  2278. SAVE4x4
  2279. decq I # i --
  2280. jg .L4_11
  2281. ALIGN_4
  2282. /**************************************************************************
  2283. * Rest of M
  2284. ***************************************************************************/
  /* M remainder for the 4-column group (one 2x4 tile if bit 1 of M is set,
     one 1x4 tile if bit 0 is set); afterwards B advances by K*4*SIZE. */
  2285. .L4_20:
  2286. // Test rest of M
  2287. testq $3, M
  2288. jz .L4_100 // M % 4 == 0: nothing left for this 4-column group
  2289. .L4_30:
  2290. testq $2, M
  2291. jz .L4_40
  2292. ALIGN_4
  2293. .L4_31:
  2294. movq B, BO // BO = B (B is read directly, no copy buffer)
  2295. addq $12 * SIZE, BO
  2296. INIT2x4
  2297. movq K, %rax
  2298. sarq $3, %rax // K / 8
  2299. je .L4_36
  2300. ALIGN_4
  2301. .L4_32:
  2302. KERNEL2x4_SUB
  2303. KERNEL2x4_SUB
  2304. KERNEL2x4_SUB
  2305. KERNEL2x4_SUB
  2306. KERNEL2x4_SUB
  2307. KERNEL2x4_SUB
  2308. KERNEL2x4_SUB
  2309. KERNEL2x4_SUB
  2310. dec %rax
  2311. jne .L4_32
  2312. ALIGN_4
  2313. .L4_36:
  2314. movq K, %rax
  2315. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2316. je .L4_39
  2317. ALIGN_4
  2318. .L4_37:
  2319. KERNEL2x4_SUB
  2320. dec %rax
  2321. jne .L4_37
  2322. .L4_39:
  2323. SAVE2x4
  2324. .L4_40:
  2325. testq $1, M
  2326. jz .L4_100 // bit 0 of M clear: no 1x4 tile
  2327. ALIGN_4
  2328. .L4_41:
  2329. movq B, BO // BO = B (B is read directly, no copy buffer)
  2330. addq $12 * SIZE, BO
  2331. INIT1x4
  2332. movq K, %rax
  2333. sarq $3,%rax // K / 8
  2334. je .L4_46
  2335. ALIGN_4
  2336. .L4_42:
  2337. KERNEL1x4_SUB
  2338. KERNEL1x4_SUB
  2339. KERNEL1x4_SUB
  2340. KERNEL1x4_SUB
  2341. KERNEL1x4_SUB
  2342. KERNEL1x4_SUB
  2343. KERNEL1x4_SUB
  2344. KERNEL1x4_SUB
  2345. dec %rax
  2346. jne .L4_42
  2347. ALIGN_4
  2348. .L4_46:
  2349. movq K, %rax
  2350. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2351. je .L4_49
  2352. ALIGN_4
  2353. .L4_47:
  2354. KERNEL1x4_SUB
  2355. dec %rax
  2356. jne .L4_47
  2357. ALIGN_4
  2358. .L4_49:
  2359. SAVE1x4
  2360. ALIGN_4
  2361. .L4_100:
  2362. movq K, %rax
  2363. salq $2, %rax // * 4
  2364. leaq (B , %rax, SIZE), B // B += K*4*SIZE
  2365. /***************************************************************************************************************/
  /* Remainder columns: one group of 2 when bit 1 of N % 24 is set.
     CO1 = C, C advances by 2*LDC and M/4 tiles of 4x2 are computed: K/8
     passes of eight KERNEL4x2_SUB steps, then K%8 single steps, then
     SAVE4x2. */
  2366. .L2_0:
  2367. movq Nmod12, J
  2368. testq $2, J // bit 1 of N % 24 set: a 2-column group remains
  2369. je .L1_0
  2370. .L2_10:
  2371. movq C, CO1
  2372. leaq (C, LDC, 2), C // c += 2 * ldc
  2373. movq A, AO // aoffset = a
  2374. addq $16 * SIZE, AO // bias AO by 16*SIZE (KERNEL4x2_SUB reads from -16*SIZE(AO))
  2375. movq M, I
  2376. sarq $2, I // i = m / 4
  2377. je .L2_20
  2378. ALIGN_4
  2379. .L2_11:
  2380. movq B, BO
  2381. addq $12 * SIZE, BO // bias BO by 12*SIZE (KERNEL4x2_SUB reads from -12*SIZE(BO))
  2382. INIT4x2
  2383. movq K, %rax
  2384. sarq $3, %rax // K / 8
  2385. je .L2_16
  2386. ALIGN_5
  2387. .L2_12:
  2388. KERNEL4x2_SUB
  2389. KERNEL4x2_SUB
  2390. KERNEL4x2_SUB
  2391. KERNEL4x2_SUB
  2392. KERNEL4x2_SUB
  2393. KERNEL4x2_SUB
  2394. KERNEL4x2_SUB
  2395. KERNEL4x2_SUB
  2396. dec %rax
  2397. jne .L2_12
  2398. .L2_16:
  2399. movq K, %rax
  2400. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2401. je .L2_19
  2402. ALIGN_4
  2403. .L2_17:
  2404. KERNEL4x2_SUB
  2405. dec %rax
  2406. jne .L2_17
  2407. ALIGN_4
  2408. .L2_19:
  2409. SAVE4x2
  2410. decq I # i --
  2411. jg .L2_11
  2412. ALIGN_4
  2413. /**************************************************************************
  2414. * Rest of M
  2415. ***************************************************************************/
  /* M remainder for the 2-column group (one 2x2 tile if bit 1 of M is set,
     one 1x2 tile if bit 0 is set); afterwards B advances by K*2*SIZE. */
  2416. .L2_20:
  2417. // Test rest of M
  2418. testq $3, M
  2419. jz .L2_100 // M % 4 == 0: nothing left for this 2-column group
  2420. .L2_30:
  2421. testq $2, M
  2422. jz .L2_40
  2423. ALIGN_4
  2424. .L2_31:
  2425. movq B, BO // BO = B (B is read directly, no copy buffer)
  2426. addq $12 * SIZE, BO
  2427. INIT2x2
  2428. movq K, %rax
  2429. sarq $3, %rax // K / 8
  2430. je .L2_36
  2431. ALIGN_4
  2432. .L2_32:
  2433. KERNEL2x2_SUB
  2434. KERNEL2x2_SUB
  2435. KERNEL2x2_SUB
  2436. KERNEL2x2_SUB
  2437. KERNEL2x2_SUB
  2438. KERNEL2x2_SUB
  2439. KERNEL2x2_SUB
  2440. KERNEL2x2_SUB
  2441. dec %rax
  2442. jne .L2_32
  2443. .L2_36:
  2444. movq K, %rax
  2445. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2446. je .L2_39
  2447. ALIGN_4
  2448. .L2_37:
  2449. KERNEL2x2_SUB
  2450. dec %rax
  2451. jne .L2_37
  2452. .L2_39:
  2453. SAVE2x2
  2454. .L2_40:
  2455. testq $1, M
  2456. jz .L2_100 // bit 0 of M clear: no 1x2 tile
  2457. .L2_41:
  2458. movq B, BO // BO = B (B is read directly, no copy buffer)
  2459. addq $12 * SIZE, BO
  2460. INIT1x2
  2461. movq K, %rax
  2462. sarq $3,%rax // K / 8
  2463. je .L2_46
  2464. ALIGN_4
  2465. .L2_42:
  2466. KERNEL1x2_SUB
  2467. KERNEL1x2_SUB
  2468. KERNEL1x2_SUB
  2469. KERNEL1x2_SUB
  2470. KERNEL1x2_SUB
  2471. KERNEL1x2_SUB
  2472. KERNEL1x2_SUB
  2473. KERNEL1x2_SUB
  2474. dec %rax
  2475. jne .L2_42
  2476. .L2_46:
  2477. movq K, %rax
  2478. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2479. je .L2_49
  2480. ALIGN_4
  2481. .L2_47:
  2482. KERNEL1x2_SUB
  2483. dec %rax
  2484. jne .L2_47
  2485. .L2_49:
  2486. SAVE1x2
  2487. .L2_100:
  2488. movq K, %rax
  2489. salq $1, %rax // * 2
  2490. leaq (B , %rax, SIZE), B // B += K*2*SIZE
  2491. /***************************************************************************************************************/
  /* Remainder columns: the last single column when bit 0 of N % 24 is set.
     CO1 = C, C advances by LDC and M/4 tiles of 4x1 are computed.  Each
     KERNEL4x1 invocation performs eight k-steps, so the .L1_12 loop runs
     K/8 times; the K%8 leftover steps use KERNEL4x1_SUB. */
  2492. .L1_0:
  2493. movq Nmod12, J
  2494. testq $1, J // bit 0 of N % 24 set: a single column remains
  2495. je .L999
  2496. .L1_10:
  2497. movq C, CO1
  2498. leaq (C, LDC, 1), C // c += 1 * ldc
  2499. movq A, AO // aoffset = a
  2500. addq $16 * SIZE, AO // bias AO by 16*SIZE (KERNEL4x1 reads from -16*SIZE(AO))
  2501. movq M, I
  2502. sarq $2, I // i = m / 4
  2503. je .L1_20
  2504. ALIGN_4
  2505. .L1_11:
  2506. movq B, BO
  2507. addq $12 * SIZE, BO // bias BO by 12*SIZE (KERNEL4x1 reads from -12*SIZE(BO))
  2508. INIT4x1
  2509. movq K, %rax
  2510. sarq $3, %rax // K / 8
  2511. je .L1_16
  2512. ALIGN_5
  2513. .L1_12:
  2514. KERNEL4x1
  2515. dec %rax
  2516. jne .L1_12
  2517. .L1_16:
  2518. movq K, %rax
  2519. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2520. je .L1_19
  2521. ALIGN_4
  2522. .L1_17:
  2523. KERNEL4x1_SUB
  2524. dec %rax
  2525. jne .L1_17
  2526. ALIGN_4
  2527. .L1_19:
  2528. SAVE4x1
  2529. decq I # i --
  2530. jg .L1_11
  2531. /**************************************************************************
  2532. * Rest of M
  2533. ***************************************************************************/
  /* M remainder for the single-column group: one 2x1 tile if bit 1 of M is
     set, one 1x1 tile if bit 0 is set.  Falls through to .L999 at the end. */
  2534. .L1_20:
  2535. // Test rest of M
  2536. testq $3, M
  2537. jz .L1_100 // M % 4 == 0: nothing left for this column
  2538. .L1_30:
  2539. testq $2, M
  2540. jz .L1_40
  2541. ALIGN_4
  2542. .L1_31:
  2543. movq B, BO // BO = B (B is read directly, no copy buffer)
  2544. addq $12 * SIZE, BO
  2545. INIT2x1
  2546. movq K, %rax
  2547. sarq $3, %rax // K / 8
  2548. je .L1_36
  2549. ALIGN_4
  2550. .L1_32:
  2551. KERNEL2x1_SUB
  2552. KERNEL2x1_SUB
  2553. KERNEL2x1_SUB
  2554. KERNEL2x1_SUB
  2555. KERNEL2x1_SUB
  2556. KERNEL2x1_SUB
  2557. KERNEL2x1_SUB
  2558. KERNEL2x1_SUB
  2559. dec %rax
  2560. jne .L1_32
  2561. .L1_36:
  2562. movq K, %rax
  2563. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2564. je .L1_39
  2565. ALIGN_4
  2566. .L1_37:
  2567. KERNEL2x1_SUB
  2568. dec %rax
  2569. jne .L1_37
  2570. .L1_39:
  2571. SAVE2x1
  2572. .L1_40:
  2573. testq $1, M
  2574. jz .L1_100 // bit 0 of M clear: no 1x1 tile
  2575. .L1_41:
  2576. movq B, BO // BO = B (B is read directly, no copy buffer)
  2577. addq $12 * SIZE, BO
  2578. INIT1x1
  2579. movq K, %rax
  2580. sarq $3,%rax // K / 8
  2581. je .L1_46
  2582. ALIGN_4
  2583. .L1_42:
  2584. KERNEL1x1_SUB
  2585. KERNEL1x1_SUB
  2586. KERNEL1x1_SUB
  2587. KERNEL1x1_SUB
  2588. KERNEL1x1_SUB
  2589. KERNEL1x1_SUB
  2590. KERNEL1x1_SUB
  2591. KERNEL1x1_SUB
  2592. dec %rax
  2593. jne .L1_42
  2594. .L1_46:
  2595. movq K, %rax
  2596. andq $7, %rax // rax = K % 8 (leftover k-steps)
  2597. je .L1_49
  2598. ALIGN_4
  2599. .L1_47:
  2600. KERNEL1x1_SUB
  2601. dec %rax
  2602. jne .L1_47
  2603. .L1_49:
  2604. SAVE1x1
  2605. .L1_100:
  2606. .L999:
  2607. vzeroupper
  2608. movq SP, %rsp
  2609. movq (%rsp), %rbx
  2610. movq 8(%rsp), %rbp
  2611. movq 16(%rsp), %r12
  2612. movq 24(%rsp), %r13
  2613. movq 32(%rsp), %r14
  2614. movq 40(%rsp), %r15
  2615. #ifdef WINDOWS_ABI
  2616. movq 48(%rsp), %rdi
  2617. movq 56(%rsp), %rsi
  2618. vmovups 64(%rsp), %xmm6
  2619. vmovups 80(%rsp), %xmm7
  2620. vmovups 96(%rsp), %xmm8
  2621. vmovups 112(%rsp), %xmm9
  2622. vmovups 128(%rsp), %xmm10
  2623. vmovups 144(%rsp), %xmm11
  2624. vmovups 160(%rsp), %xmm12
  2625. vmovups 176(%rsp), %xmm13
  2626. vmovups 192(%rsp), %xmm14
  2627. vmovups 208(%rsp), %xmm15
  2628. #endif
  2629. addq $STACKSIZE, %rsp
  2630. ret
  2631. EPILOGUE
  2632. #else
  2633. /*************************************************************************************
  2634. * TRMM Kernel
  2635. *************************************************************************************/
  // TRMM kernel entry. Allocates the STACKSIZE frame, spills the registers
  // that .L999 reloads, fetches the arguments, carves a 4096-byte-aligned
  // scratch area and splits N into N / 8 and N % 8.
  // PROLOGUE, PROFCODE, STACK_TOUCH and the OLD_*, ARG*, M, N, K, A, B, C,
  // LDC, SP, ALPHA, OFFSET and KK names are macros defined outside this section.
  2636. PROLOGUE
  2637. PROFCODE
  2638. subq $STACKSIZE, %rsp
  // Spill rbx, rbp and r12-r15 into the new frame.
  2639. movq %rbx, (%rsp)
  2640. movq %rbp, 8(%rsp)
  2641. movq %r12, 16(%rsp)
  2642. movq %r13, 24(%rsp)
  2643. movq %r14, 32(%rsp)
  2644. movq %r15, 40(%rsp)
  2645. vzeroupper
  2646. #ifdef WINDOWS_ABI
  // WINDOWS_ABI build: rdi, rsi and xmm6-xmm15 are spilled as well.
  2647. movq %rdi, 48(%rsp)
  2648. movq %rsi, 56(%rsp)
  2649. vmovups %xmm6, 64(%rsp)
  2650. vmovups %xmm7, 80(%rsp)
  2651. vmovups %xmm8, 96(%rsp)
  2652. vmovups %xmm9, 112(%rsp)
  2653. vmovups %xmm10, 128(%rsp)
  2654. vmovups %xmm11, 144(%rsp)
  2655. vmovups %xmm12, 160(%rsp)
  2656. vmovups %xmm13, 176(%rsp)
  2657. vmovups %xmm14, 192(%rsp)
  2658. vmovups %xmm15, 208(%rsp)
  // Move the incoming arguments into the working names.
  2659. movq ARG1, OLD_M
  2660. movq ARG2, OLD_N
  2661. movq ARG3, OLD_K
  2662. movq OLD_A, A
  2663. movq OLD_B, B
  2664. movq OLD_C, C
  2665. movq OLD_LDC, LDC
  2666. #ifdef TRMMKERNEL
  2667. vmovsd OLD_OFFSET, %xmm12
  2668. #endif
  2669. vmovups %xmm3, %xmm0 // xmm0 is stored to ALPHA below; presumably alpha arrives in xmm3 here - confirm
  2670. #else
  2671. movq STACKSIZE + 8(%rsp), LDC // LDC is read from the caller stack
  2672. #ifdef TRMMKERNEL
  2673. vmovsd STACKSIZE + 16(%rsp), %xmm12 // offset argument, kept in xmm12 until stored below
  2674. #endif
  2675. #endif
  2676. movq %rsp, SP # save old stack
  2677. subq $128 + L_BUFFER_SIZE, %rsp
  2678. andq $-4096, %rsp # align stack
  2679. STACK_TOUCH
  // Nothing to compute when M, N or K is zero.
  2680. cmpq $ 0, OLD_M
  2681. je .L999
  2682. cmpq $ 0, OLD_N
  2683. je .L999
  2684. cmpq $ 0, OLD_K
  2685. je .L999
  2686. movq OLD_M, M
  2687. movq OLD_N, N
  2688. movq OLD_K, K
  2689. vmovsd %xmm0, ALPHA
  2690. salq $BASE_SHIFT, LDC // LDC <<= BASE_SHIFT (presumably elements -> bytes - confirm)
  // Unsigned divide of N by 8. The results keep the names Ndiv12 / Nmod12
  // although the divisor used here is 8.
  2691. movq N, %rax
  2692. xorq %rdx, %rdx
  2693. movq $8, %rdi
  2694. divq %rdi // N / 8
  2695. movq %rax, Ndiv12 // N / 8
  2696. movq %rdx, Nmod12 // N % 8
  2697. #ifdef TRMMKERNEL
  // Seed OFFSET and KK from the offset argument; without LEFT, KK starts negated.
  2698. vmovsd %xmm12, OFFSET
  2699. vmovsd %xmm12, KK
  2700. #ifndef LEFT
  2701. negq KK
  2702. #endif
  2703. #endif
  2704. /*************************************************************************************************/
  // Outer loop over the Ndiv12 (= N / 8) panels of 8 columns; skipped when zero.
  2705. .L8_0:
  2706. movq Ndiv12, J
  2707. cmpq $ 0, J
  2708. je .L4_0
  2709. ALIGN_4
  // Panel setup: CO1 = C, then C moves on by 8 * LDC for the next panel.
  2710. .L8_10:
  2711. movq C, CO1
  2712. leaq (C, LDC, 8), C // c += 8 * ldc
  2713. #if defined(TRMMKERNEL) && defined(LEFT)
  // LEFT: KK restarts from OFFSET for every panel.
  2714. movq OFFSET, %rax
  2715. movq %rax, KK
  2716. #endif
  2717. movq A, AO // aoffset = a
  2718. addq $16 * SIZE, AO // AO carries a 16 * SIZE bias
  2719. movq M, I
  2720. sarq $2, I // i = m / 4
  2721. je .L8_20 // no full 4-row tile
  2722. ALIGN_4
  // 4x8 tile loop: runs I = M / 4 times for the current 8-column panel.
  // KERNEL4x8_I/_M1/_M2/_E/_SUB, INIT4x8 and SAVE4x8 are macros defined
  // outside this section.
  2723. .L8_11:
  2724. #if !defined(TRMMKERNEL) || \
  2725. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2726. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2727. movq B, BO
  2728. addq $12 * SIZE, BO // BO carries a 12 * SIZE bias
  2729. #else
  // Remaining TRMM variants: start KK steps into both operands.
  2730. movq B, BO
  2731. addq $12 * SIZE, BO
  2732. movq KK, %rax
  2733. salq $3, %rax // rax * SIZE
  2734. leaq (BO,%rax,8), BO // add number of values in B
  2735. leaq (AO,%rax,4), AO // add number of values in A
  2736. #endif
  // Step count in rax: K (no TRMM), K - KK, or KK + tile size. The TRMM
  // branches also store it in KKK.
  // NOTE(review): the !TRMMKERNEL branch does not write KKK although .L8_16
  // reads it; this section appears to be built only with TRMMKERNEL - confirm.
  2737. #ifndef TRMMKERNEL
  2738. movq K, %rax
  2739. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2740. movq K, %rax
  2741. subq KK, %rax
  2742. movq %rax, KKK
  2743. #else
  2744. movq KK, %rax
  2745. #ifdef LEFT
  2746. addq $4, %rax // number of values in AO
  2747. #else
  2748. addq $8, %rax // number of values in BO
  2749. #endif
  2750. movq %rax, KKK
  2751. #endif
  // rax = count / 8 groups of 8 macro steps. With rax >= 2 the sequence opens
  // with KERNEL4x8_I, loops rax - 2 times at .L8_12 and closes with
  // KERNEL4x8_E at .L8_12a.
  2752. sarq $3, %rax // count / 8
  2753. cmpq $2, %rax
  2754. jl .L8_13
  2755. KERNEL4x8_I
  2756. KERNEL4x8_M2
  2757. KERNEL4x8_M1
  2758. KERNEL4x8_M2
  2759. KERNEL4x8_M1
  2760. KERNEL4x8_M2
  2761. KERNEL4x8_M1
  2762. KERNEL4x8_M2
  2763. subq $2, %rax
  2764. je .L8_12a
  2765. ALIGN_5
  2766. .L8_12:
  2767. KERNEL4x8_M1
  2768. KERNEL4x8_M2
  2769. KERNEL4x8_M1
  2770. KERNEL4x8_M2
  2771. KERNEL4x8_M1
  2772. KERNEL4x8_M2
  2773. KERNEL4x8_M1
  2774. KERNEL4x8_M2
  2775. dec %rax
  2776. jne .L8_12
  2777. .L8_12a:
  2778. KERNEL4x8_M1
  2779. KERNEL4x8_M2
  2780. KERNEL4x8_M1
  2781. KERNEL4x8_M2
  2782. KERNEL4x8_M1
  2783. KERNEL4x8_M2
  2784. KERNEL4x8_M1
  2785. KERNEL4x8_E
  2786. jmp .L8_16
  // Fewer than two groups: one I ... E group when bit 0 of rax is set,
  // otherwise only INIT4x8.
  2787. .L8_13:
  2788. test $1, %rax
  2789. jz .L8_14
  2790. KERNEL4x8_I
  2791. KERNEL4x8_M2
  2792. KERNEL4x8_M1
  2793. KERNEL4x8_M2
  2794. KERNEL4x8_M1
  2795. KERNEL4x8_M2
  2796. KERNEL4x8_M1
  2797. KERNEL4x8_E
  2798. jmp .L8_16
  2799. .L8_14:
  2800. INIT4x8
  // Leftover steps, one KERNEL4x8_SUB each.
  2801. .L8_16:
  2802. movq KKK, %rax
  2803. andq $7, %rax # leftover steps: KKK & 7
  2804. je .L8_19
  2805. ALIGN_4
  2806. .L8_17:
  2807. KERNEL4x8_SUB
  2808. dec %rax
  2809. jne .L8_17
  2810. ALIGN_4
  2811. .L8_19:
  2812. SAVE4x8
  2813. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2814. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  2815. movq K, %rax
  2816. subq KKK, %rax
  2817. salq $3, %rax // rax * SIZE
  2818. leaq (BO, %rax, 8), BO // number of values in B
  2819. leaq (AO, %rax, 4), AO // number of values in A
  2820. #endif
  2821. #if defined(TRMMKERNEL) && defined(LEFT)
  2822. addq $4, KK // number of values in A
  2823. #endif
  2824. decq I # i --
  2825. jg .L8_11
  2826. ALIGN_4
  2827. /**************************************************************************
  2828. * Rest of M
  2829. ***************************************************************************/
  // M remainder for the 8-column panel: M & 3 rows are left after the 4-row tiles.
  2830. .L8_20:
  2831. // Test rest of M
  2832. testq $3, M
  2833. jz .L8_100 // no rows left: panel done
  2834. .L8_30:
  2835. testq $2, M
  2836. jz .L8_40
  2837. ALIGN_4
  // 2x8 tile, executed once when bit 1 of M is set.
  // INIT2x8, KERNEL2x8_SUB and SAVE2x8 are macros defined outside this section.
  2838. .L8_31:
  2839. #if !defined(TRMMKERNEL) || \
  2840. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2841. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2842. movq B, BO
  2843. addq $12 * SIZE, BO
  2844. #else
  // Remaining TRMM variants: start KK steps into both operands.
  2845. movq B, BO
  2846. addq $12 * SIZE, BO
  2847. movq KK, %rax
  2848. salq $3, %rax // rax * SIZE
  2849. leaq (BO,%rax,8), BO // add number of values in B
  2850. leaq (AO,%rax,2), AO // add number of values in A
  2851. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  2852. #ifndef TRMMKERNEL
  2853. movq K, %rax
  2854. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2855. movq K, %rax
  2856. subq KK, %rax
  2857. movq %rax, KKK
  2858. #else
  2859. movq KK, %rax
  2860. #ifdef LEFT
  2861. addq $2, %rax // number of values in AO
  2862. #else
  2863. addq $8, %rax // number of values in BO
  2864. #endif
  2865. movq %rax, KKK
  2866. #endif
  2867. INIT2x8
  2868. sarq $3, %rax // count / 8 unrolled groups
  2869. je .L8_36
  2870. ALIGN_4
  2871. .L8_32:
  2872. KERNEL2x8_SUB
  2873. KERNEL2x8_SUB
  2874. KERNEL2x8_SUB
  2875. KERNEL2x8_SUB
  2876. KERNEL2x8_SUB
  2877. KERNEL2x8_SUB
  2878. KERNEL2x8_SUB
  2879. KERNEL2x8_SUB
  2880. dec %rax
  2881. jne .L8_32
  2882. ALIGN_4
  2883. .L8_36:
  2884. movq KKK, %rax
  2885. andq $7, %rax # leftover steps: KKK & 7
  2886. je .L8_39
  2887. ALIGN_4
  2888. .L8_37:
  2889. KERNEL2x8_SUB
  2890. dec %rax
  2891. jne .L8_37
  2892. .L8_39:
  2893. SAVE2x8
  2894. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2895. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  2896. movq K, %rax
  2897. subq KKK, %rax
  2898. salq $3, %rax // rax * SIZE
  2899. leaq (BO, %rax, 8), BO // number of values in B
  2900. leaq (AO, %rax, 2), AO // number of values in A
  2901. #endif
  2902. #if defined(TRMMKERNEL) && defined(LEFT)
  2903. addq $2, KK // number of values in A
  2904. #endif
  // 1x8 tile, executed once when bit 0 of M is set.
  // INIT1x8, KERNEL1x8_SUB and SAVE1x8 are macros defined outside this section.
  2905. .L8_40:
  2906. testq $1, M
  2907. jz .L8_100 // no single row left: panel done
  2908. ALIGN_4
  2909. .L8_41:
  2910. #if !defined(TRMMKERNEL) || \
  2911. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2912. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2913. movq B, BO
  2914. addq $12 * SIZE, BO
  2915. #else
  // Remaining TRMM variants: start KK steps into both operands.
  2916. movq B, BO
  2917. addq $12 * SIZE, BO
  2918. movq KK, %rax
  2919. salq $3, %rax // rax * SIZE
  2920. leaq (BO,%rax,8), BO // add number of values in B
  2921. leaq (AO,%rax,1), AO // add number of values in A
  2922. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  2923. #ifndef TRMMKERNEL
  2924. movq K, %rax
  2925. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2926. movq K, %rax
  2927. subq KK, %rax
  2928. movq %rax, KKK
  2929. #else
  2930. movq KK, %rax
  2931. #ifdef LEFT
  2932. addq $1, %rax // number of values in AO
  2933. #else
  2934. addq $8, %rax // number of values in BO
  2935. #endif
  2936. movq %rax, KKK
  2937. #endif
  2938. INIT1x8
  2939. sarq $3,%rax
  2940. je .L8_46
  2941. ALIGN_4
  2942. .L8_42:
  2943. KERNEL1x8_SUB
  2944. KERNEL1x8_SUB
  2945. KERNEL1x8_SUB
  2946. KERNEL1x8_SUB
  2947. KERNEL1x8_SUB
  2948. KERNEL1x8_SUB
  2949. KERNEL1x8_SUB
  2950. KERNEL1x8_SUB
  2951. dec %rax
  2952. jne .L8_42
  2953. ALIGN_4
  2954. .L8_46:
  2955. movq KKK, %rax
  2956. andq $7, %rax # leftover steps: KKK & 7
  2957. je .L8_49
  2958. ALIGN_4
  2959. .L8_47:
  2960. KERNEL1x8_SUB
  2961. dec %rax
  2962. jne .L8_47
  2963. ALIGN_4
  2964. .L8_49:
  2965. SAVE1x8
  2966. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2967. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  2968. movq K, %rax
  2969. subq KKK, %rax
  2970. salq $3, %rax // rax * SIZE
  2971. leaq (BO, %rax, 8), BO // number of values in B
  2972. leaq (AO, %rax, 1), AO // number of values in A
  2973. #endif
  2974. #if defined(TRMMKERNEL) && defined(LEFT)
  2975. addq $1, KK // number of values in A
  2976. #endif
  // End of one 8-column panel of the J loop.
  // Every tile above reloads BO from B, so B has to step past the finished
  // panel before the next pass. The original code never advanced B here, so
  // each further pass, and the .L4 / .L2 / .L1 tails, read the first panel
  // again. The advance below is K * 8 elements and mirrors the K * 4 and
  // K * 2 steps at .L4_100 and .L2_100. rax is free scratch at this point,
  // and the flags consumed by jg are set by decq after the leaq.
  .L8_100:
  #if defined(TRMMKERNEL) && !defined(LEFT)
          addq $8, KK // number of values in B
  #endif
          movq K, %rax
          salq $3, %rax // * 8
          leaq (B , %rax, SIZE), B // B += 8 * K elements
          decq J // j --
          jg .L8_10
  2983. /*************************************************************************************************/
  // 4-column panel: taken once when bit 2 of Nmod12 (= N % 8) is set.
  2984. .L4_0:
  2985. movq Nmod12, J
  2986. testq $4, J
  2987. je .L2_0
  2988. ALIGN_4
  // Panel setup: CO1 = C, then C moves on by 4 * LDC.
  2989. .L4_10:
  2990. movq C, CO1
  2991. leaq (C, LDC, 4), C // c += 4 * ldc
  2992. #if defined(TRMMKERNEL) && defined(LEFT)
  // LEFT: KK restarts from OFFSET for this panel.
  2993. movq OFFSET, %rax
  2994. movq %rax, KK
  2995. #endif
  2996. movq A, AO // aoffset = a
  2997. addq $16 * SIZE, AO // AO carries a 16 * SIZE bias
  2998. movq M, I
  2999. sarq $2, I // i = m / 4
  3000. je .L4_20 // no full 4-row tile
  3001. ALIGN_4
  // 4x4 tile loop: runs I = M / 4 times for the 4-column panel.
  // KERNEL4x4_I/_M1/_M2/_E/_SUB, INIT4x4 and SAVE4x4 are macros defined
  // outside this section.
  3002. .L4_11:
  3003. #if !defined(TRMMKERNEL) || \
  3004. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3005. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3006. movq B, BO
  3007. addq $12 * SIZE, BO // BO carries a 12 * SIZE bias
  3008. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3009. movq B, BO
  3010. addq $12 * SIZE, BO
  3011. movq KK, %rax
  3012. salq $3, %rax // rax * SIZE
  3013. leaq (BO,%rax,4), BO // add number of values in B
  3014. leaq (AO,%rax,4), AO // add number of values in A
  3015. #endif
  // Step count in rax: K, K - KK, or KK + 4 (stored in KKK for TRMM).
  3016. #ifndef TRMMKERNEL
  3017. movq K, %rax
  3018. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3019. movq K, %rax
  3020. subq KK, %rax
  3021. movq %rax, KKK
  3022. #else
  3023. movq KK, %rax
  3024. #ifdef LEFT
  3025. addq $4, %rax // number of values in AO
  3026. #else
  3027. addq $4, %rax // number of values in BO
  3028. #endif
  3029. movq %rax, KKK
  3030. #endif
  // rax = count / 8 groups of 8 macro steps. With rax >= 2 the sequence opens
  // with KERNEL4x4_I, loops rax - 2 times at .L4_12 and closes with
  // KERNEL4x4_E at .L4_12a.
  3031. sarq $3, %rax // count / 8
  3032. cmpq $2, %rax
  3033. jl .L4_13
  3034. KERNEL4x4_I
  3035. KERNEL4x4_M2
  3036. KERNEL4x4_M1
  3037. KERNEL4x4_M2
  3038. KERNEL4x4_M1
  3039. KERNEL4x4_M2
  3040. KERNEL4x4_M1
  3041. KERNEL4x4_M2
  3042. subq $2, %rax
  3043. je .L4_12a
  3044. ALIGN_5
  3045. .L4_12:
  3046. KERNEL4x4_M1
  3047. KERNEL4x4_M2
  3048. KERNEL4x4_M1
  3049. KERNEL4x4_M2
  3050. KERNEL4x4_M1
  3051. KERNEL4x4_M2
  3052. KERNEL4x4_M1
  3053. KERNEL4x4_M2
  3054. dec %rax
  3055. jne .L4_12
  3056. .L4_12a:
  3057. KERNEL4x4_M1
  3058. KERNEL4x4_M2
  3059. KERNEL4x4_M1
  3060. KERNEL4x4_M2
  3061. KERNEL4x4_M1
  3062. KERNEL4x4_M2
  3063. KERNEL4x4_M1
  3064. KERNEL4x4_E
  3065. jmp .L4_16
  // Fewer than two groups: one I ... E group when bit 0 of rax is set,
  // otherwise only INIT4x4.
  3066. .L4_13:
  3067. test $1, %rax
  3068. jz .L4_14
  3069. KERNEL4x4_I
  3070. KERNEL4x4_M2
  3071. KERNEL4x4_M1
  3072. KERNEL4x4_M2
  3073. KERNEL4x4_M1
  3074. KERNEL4x4_M2
  3075. KERNEL4x4_M1
  3076. KERNEL4x4_E
  3077. jmp .L4_16
  3078. .L4_14:
  3079. INIT4x4
  // Leftover steps, one KERNEL4x4_SUB each.
  3080. .L4_16:
  3081. movq KKK, %rax
  3082. andq $7, %rax # leftover steps: KKK & 7
  3083. je .L4_19
  3084. ALIGN_4
  3085. .L4_17:
  3086. KERNEL4x4_SUB
  3087. dec %rax
  3088. jne .L4_17
  3089. ALIGN_4
  3090. .L4_19:
  3091. SAVE4x4
  3092. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3093. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3094. movq K, %rax
  3095. subq KKK, %rax
  3096. salq $3, %rax // rax * SIZE
  3097. leaq (BO, %rax, 4), BO // number of values in B
  3098. leaq (AO, %rax, 4), AO // number of values in A
  3099. #endif
  3100. #if defined(TRMMKERNEL) && defined(LEFT)
  3101. addq $4, KK // number of values in A
  3102. #endif
  3103. decq I # i --
  3104. jg .L4_11
  3105. ALIGN_4
  3106. /**************************************************************************
  3107. * Rest of M
  3108. ***************************************************************************/
  // M remainder for the 4-column panel: M & 3 rows are left after the 4-row tiles.
  3109. .L4_20:
  3110. // Test rest of M
  3111. testq $3, M
  3112. jz .L4_100 // no rows left: panel done
  3113. .L4_30:
  3114. testq $2, M
  3115. jz .L4_40
  3116. ALIGN_4
  // 2x4 tile, executed once when bit 1 of M is set.
  // INIT2x4, KERNEL2x4_SUB and SAVE2x4 are macros defined outside this section.
  3117. .L4_31:
  3118. #if !defined(TRMMKERNEL) || \
  3119. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3120. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3121. movq B, BO
  3122. addq $12 * SIZE, BO
  3123. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3124. movq B, BO
  3125. addq $12 * SIZE, BO
  3126. movq KK, %rax
  3127. salq $3, %rax // rax * SIZE
  3128. leaq (BO,%rax,4), BO // add number of values in B
  3129. leaq (AO,%rax,2), AO // add number of values in A
  3130. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  3131. #ifndef TRMMKERNEL
  3132. movq K, %rax
  3133. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3134. movq K, %rax
  3135. subq KK, %rax
  3136. movq %rax, KKK
  3137. #else
  3138. movq KK, %rax
  3139. #ifdef LEFT
  3140. addq $2, %rax // number of values in AO
  3141. #else
  3142. addq $4, %rax // number of values in BO
  3143. #endif
  3144. movq %rax, KKK
  3145. #endif
  3146. INIT2x4
  3147. sarq $3, %rax // count / 8 unrolled groups
  3148. je .L4_36
  3149. ALIGN_4
  3150. .L4_32:
  3151. KERNEL2x4_SUB
  3152. KERNEL2x4_SUB
  3153. KERNEL2x4_SUB
  3154. KERNEL2x4_SUB
  3155. KERNEL2x4_SUB
  3156. KERNEL2x4_SUB
  3157. KERNEL2x4_SUB
  3158. KERNEL2x4_SUB
  3159. dec %rax
  3160. jne .L4_32
  3161. ALIGN_4
  3162. .L4_36:
  3163. movq KKK, %rax
  3164. andq $7, %rax # leftover steps: KKK & 7
  3165. je .L4_39
  3166. ALIGN_4
  3167. .L4_37:
  3168. KERNEL2x4_SUB
  3169. dec %rax
  3170. jne .L4_37
  3171. .L4_39:
  3172. SAVE2x4
  3173. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3174. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3175. movq K, %rax
  3176. subq KKK, %rax
  3177. salq $3, %rax // rax * SIZE
  3178. leaq (BO, %rax, 4), BO // number of values in B
  3179. leaq (AO, %rax, 2), AO // number of values in A
  3180. #endif
  3181. #if defined(TRMMKERNEL) && defined(LEFT)
  3182. addq $2, KK // number of values in A
  3183. #endif
  // 1x4 tile, executed once when bit 0 of M is set, followed by the end of
  // the 4-column panel at .L4_100.
  // INIT1x4, KERNEL1x4_SUB and SAVE1x4 are macros defined outside this section.
  3184. .L4_40:
  3185. testq $1, M
  3186. jz .L4_100 // no single row left: panel done
  3187. ALIGN_4
  3188. .L4_41:
  3189. #if !defined(TRMMKERNEL) || \
  3190. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3191. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3192. movq B, BO
  3193. addq $12 * SIZE, BO
  3194. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3195. movq B, BO
  3196. addq $12 * SIZE, BO
  3197. movq KK, %rax
  3198. salq $3, %rax // rax * SIZE
  3199. leaq (BO,%rax,4), BO // add number of values in B
  3200. leaq (AO,%rax,1), AO // add number of values in A
  3201. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  3202. #ifndef TRMMKERNEL
  3203. movq K, %rax
  3204. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3205. movq K, %rax
  3206. subq KK, %rax
  3207. movq %rax, KKK
  3208. #else
  3209. movq KK, %rax
  3210. #ifdef LEFT
  3211. addq $1, %rax // number of values in AO
  3212. #else
  3213. addq $4, %rax // number of values in BO
  3214. #endif
  3215. movq %rax, KKK
  3216. #endif
  3217. INIT1x4
  3218. sarq $3,%rax
  3219. je .L4_46
  3220. ALIGN_4
  3221. .L4_42:
  3222. KERNEL1x4_SUB
  3223. KERNEL1x4_SUB
  3224. KERNEL1x4_SUB
  3225. KERNEL1x4_SUB
  3226. KERNEL1x4_SUB
  3227. KERNEL1x4_SUB
  3228. KERNEL1x4_SUB
  3229. KERNEL1x4_SUB
  3230. dec %rax
  3231. jne .L4_42
  3232. ALIGN_4
  3233. .L4_46:
  3234. movq KKK, %rax
  3235. andq $7, %rax # leftover steps: KKK & 7
  3236. je .L4_49
  3237. ALIGN_4
  3238. .L4_47:
  3239. KERNEL1x4_SUB
  3240. dec %rax
  3241. jne .L4_47
  3242. ALIGN_4
  3243. .L4_49:
  3244. SAVE1x4
  3245. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3246. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3247. movq K, %rax
  3248. subq KKK, %rax
  3249. salq $3, %rax // rax * SIZE
  3250. leaq (BO, %rax, 4), BO // number of values in B
  3251. leaq (AO, %rax, 1), AO // number of values in A
  3252. #endif
  3253. #if defined(TRMMKERNEL) && defined(LEFT)
  3254. addq $1, KK // number of values in A
  3255. #endif
  // End of the 4-column panel: B moves on by K * 4 elements.
  3256. .L4_100:
  3257. #if defined(TRMMKERNEL) && !defined(LEFT)
  3258. addq $4, KK // number of values in B
  3259. #endif
  3260. movq K, %rax
  3261. salq $2, %rax // * 4
  3262. leaq (B , %rax, SIZE), B
  3263. /***************************************************************************************************************/
  // 2-column panel: taken once when bit 1 of Nmod12 (= N % 8) is set.
  3264. .L2_0:
  3265. movq Nmod12, J
  3266. testq $2, J
  3267. je .L1_0
  // Panel setup: CO1 = C, then C moves on by 2 * LDC.
  3268. .L2_10:
  3269. movq C, CO1
  3270. leaq (C, LDC, 2), C // c += 2 * ldc
  3271. #if defined(TRMMKERNEL) && defined(LEFT)
  // LEFT: KK restarts from OFFSET for this panel.
  3272. movq OFFSET, %rax
  3273. movq %rax, KK
  3274. #endif
  3275. movq A, AO // aoffset = a
  3276. addq $16 * SIZE, AO // AO carries a 16 * SIZE bias
  3277. movq M, I
  3278. sarq $2, I // i = m / 4
  3279. je .L2_20 // no full 4-row tile
  3280. ALIGN_4
  // 4x2 tile loop: runs I = M / 4 times for the 2-column panel.
  // INIT4x2, KERNEL4x2_SUB and SAVE4x2 are macros defined outside this section.
  3281. .L2_11:
  3282. #if !defined(TRMMKERNEL) || \
  3283. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3284. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3285. movq B, BO
  3286. addq $12 * SIZE, BO // BO carries a 12 * SIZE bias
  3287. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3288. movq B, BO
  3289. addq $12 * SIZE, BO
  3290. movq KK, %rax
  3291. salq $3, %rax // rax * SIZE
  3292. leaq (BO,%rax,2), BO // add number of values in B
  3293. leaq (AO,%rax,4), AO // add number of values in A
  3294. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  3295. #ifndef TRMMKERNEL
  3296. movq K, %rax
  3297. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3298. movq K, %rax
  3299. subq KK, %rax
  3300. movq %rax, KKK
  3301. #else
  3302. movq KK, %rax
  3303. #ifdef LEFT
  3304. addq $4, %rax // number of values in AO
  3305. #else
  3306. addq $2, %rax // number of values in BO
  3307. #endif
  3308. movq %rax, KKK
  3309. #endif
  3310. INIT4x2
  3311. sarq $3, %rax // count / 8 unrolled groups
  3312. je .L2_16
  3313. ALIGN_5
  3314. .L2_12:
  3315. KERNEL4x2_SUB
  3316. KERNEL4x2_SUB
  3317. KERNEL4x2_SUB
  3318. KERNEL4x2_SUB
  3319. KERNEL4x2_SUB
  3320. KERNEL4x2_SUB
  3321. KERNEL4x2_SUB
  3322. KERNEL4x2_SUB
  3323. dec %rax
  3324. jne .L2_12
  3325. .L2_16:
  3326. movq KKK, %rax
  3327. andq $7, %rax # leftover steps: KKK & 7
  3328. je .L2_19
  3329. ALIGN_4
  3330. .L2_17:
  3331. KERNEL4x2_SUB
  3332. dec %rax
  3333. jne .L2_17
  3334. ALIGN_4
  3335. .L2_19:
  3336. SAVE4x2
  3337. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3338. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3339. movq K, %rax
  3340. subq KKK, %rax
  3341. salq $3, %rax // rax * SIZE
  3342. leaq (BO, %rax, 2), BO // number of values in B
  3343. leaq (AO, %rax, 4), AO // number of values in A
  3344. #endif
  3345. #if defined(TRMMKERNEL) && defined(LEFT)
  3346. addq $4, KK // number of values in A
  3347. #endif
  3348. decq I # i --
  3349. jg .L2_11
  3350. ALIGN_4
  3351. /**************************************************************************
  3352. * Rest of M
  3353. ***************************************************************************/
  // M remainder for the 2-column panel: M & 3 rows are left after the 4-row tiles.
  3354. .L2_20:
  3355. // Test rest of M
  3356. testq $3, M
  3357. jz .L2_100 // no rows left: panel done
  3358. .L2_30:
  3359. testq $2, M
  3360. jz .L2_40
  3361. ALIGN_4
  // 2x2 tile, executed once when bit 1 of M is set.
  // INIT2x2, KERNEL2x2_SUB and SAVE2x2 are macros defined outside this section.
  3362. .L2_31:
  3363. #if !defined(TRMMKERNEL) || \
  3364. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3365. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3366. movq B, BO
  3367. addq $12 * SIZE, BO
  3368. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3369. movq B, BO
  3370. addq $12 * SIZE, BO
  3371. movq KK, %rax
  3372. salq $3, %rax // rax * SIZE
  3373. leaq (BO,%rax,2), BO // add number of values in B
  3374. leaq (AO,%rax,2), AO // add number of values in A
  3375. #endif
  // Step count in rax: K, K - KK, or KK + 2 (stored in KKK for TRMM).
  3376. #ifndef TRMMKERNEL
  3377. movq K, %rax
  3378. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3379. movq K, %rax
  3380. subq KK, %rax
  3381. movq %rax, KKK
  3382. #else
  3383. movq KK, %rax
  3384. #ifdef LEFT
  3385. addq $2, %rax // number of values in AO
  3386. #else
  3387. addq $2, %rax // number of values in BO
  3388. #endif
  3389. movq %rax, KKK
  3390. #endif
  3391. INIT2x2
  3392. sarq $3, %rax // count / 8 unrolled groups
  3393. je .L2_36
  3394. ALIGN_4
  3395. .L2_32:
  3396. KERNEL2x2_SUB
  3397. KERNEL2x2_SUB
  3398. KERNEL2x2_SUB
  3399. KERNEL2x2_SUB
  3400. KERNEL2x2_SUB
  3401. KERNEL2x2_SUB
  3402. KERNEL2x2_SUB
  3403. KERNEL2x2_SUB
  3404. dec %rax
  3405. jne .L2_32
  3406. .L2_36:
  3407. movq KKK, %rax
  3408. andq $7, %rax # leftover steps: KKK & 7
  3409. je .L2_39
  3410. ALIGN_4
  3411. .L2_37:
  3412. KERNEL2x2_SUB
  3413. dec %rax
  3414. jne .L2_37
  3415. .L2_39:
  3416. SAVE2x2
  3417. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3418. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3419. movq K, %rax
  3420. subq KKK, %rax
  3421. salq $3, %rax // rax * SIZE
  3422. leaq (BO, %rax, 2), BO // number of values in B
  3423. leaq (AO, %rax, 2), AO // number of values in A
  3424. #endif
  3425. #if defined(TRMMKERNEL) && defined(LEFT)
  3426. addq $2, KK // number of values in A
  3427. #endif
  // 1x2 tile, executed once when bit 0 of M is set, followed by the end of
  // the 2-column panel at .L2_100.
  // INIT1x2, KERNEL1x2_SUB and SAVE1x2 are macros defined outside this section.
  3428. .L2_40:
  3429. testq $1, M
  3430. jz .L2_100 // no single row left: panel done
  3431. .L2_41:
  3432. #if !defined(TRMMKERNEL) || \
  3433. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3434. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3435. movq B, BO
  3436. addq $12 * SIZE, BO
  3437. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3438. movq B, BO
  3439. addq $12 * SIZE, BO
  3440. movq KK, %rax
  3441. salq $3, %rax // rax * SIZE
  3442. leaq (BO,%rax,2), BO // add number of values in B
  3443. leaq (AO,%rax,1), AO // add number of values in A
  3444. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  3445. #ifndef TRMMKERNEL
  3446. movq K, %rax
  3447. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3448. movq K, %rax
  3449. subq KK, %rax
  3450. movq %rax, KKK
  3451. #else
  3452. movq KK, %rax
  3453. #ifdef LEFT
  3454. addq $1, %rax // number of values in AO
  3455. #else
  3456. addq $2, %rax // number of values in BO
  3457. #endif
  3458. movq %rax, KKK
  3459. #endif
  3460. INIT1x2
  3461. sarq $3,%rax
  3462. je .L2_46
  3463. ALIGN_4
  3464. .L2_42:
  3465. KERNEL1x2_SUB
  3466. KERNEL1x2_SUB
  3467. KERNEL1x2_SUB
  3468. KERNEL1x2_SUB
  3469. KERNEL1x2_SUB
  3470. KERNEL1x2_SUB
  3471. KERNEL1x2_SUB
  3472. KERNEL1x2_SUB
  3473. dec %rax
  3474. jne .L2_42
  3475. .L2_46:
  3476. movq KKK, %rax
  3477. andq $7, %rax # leftover steps: KKK & 7
  3478. je .L2_49
  3479. ALIGN_4
  3480. .L2_47:
  3481. KERNEL1x2_SUB
  3482. dec %rax
  3483. jne .L2_47
  3484. .L2_49:
  3485. SAVE1x2
  3486. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3487. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3488. movq K, %rax
  3489. subq KKK, %rax
  3490. salq $3, %rax // rax * SIZE
  3491. leaq (BO, %rax, 2), BO // number of values in B
  3492. leaq (AO, %rax, 1), AO // number of values in A
  3493. #endif
  3494. #if defined(TRMMKERNEL) && defined(LEFT)
  3495. addq $1, KK // number of values in A
  3496. #endif
  // End of the 2-column panel: B moves on by K * 2 elements.
  3497. .L2_100:
  3498. #if defined(TRMMKERNEL) && !defined(LEFT)
  3499. addq $2, KK // number of values in B
  3500. #endif
  3501. movq K, %rax
  3502. salq $1, %rax // * 2
  3503. leaq (B , %rax, SIZE), B
  3504. /***************************************************************************************************************/
  // Last single column: taken once when bit 0 of Nmod12 (= N % 8) is set.
  3505. .L1_0:
  3506. movq Nmod12, J
  3507. testq $1, J
  3508. je .L999
  // Panel setup: CO1 = C, then C moves on by 1 * LDC.
  3509. .L1_10:
  3510. movq C, CO1
  3511. leaq (C, LDC, 1), C // c += 1 * ldc
  3512. #if defined(TRMMKERNEL) && defined(LEFT)
  // LEFT: KK restarts from OFFSET for this panel.
  3513. movq OFFSET, %rax
  3514. movq %rax, KK
  3515. #endif
  3516. movq A, AO // aoffset = a
  3517. addq $16 * SIZE, AO // AO carries a 16 * SIZE bias
  3518. movq M, I
  3519. sarq $2, I // i = m / 4
  3520. je .L1_20 // no full 4-row tile
  3521. ALIGN_4
  // 4x1 tile loop: runs I = M / 4 times for the single column.
  // INIT4x1, KERNEL4x1, KERNEL4x1_SUB and SAVE4x1 are macros defined outside
  // this section.
  3522. .L1_11:
  3523. #if !defined(TRMMKERNEL) || \
  3524. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3525. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3526. movq B, BO
  3527. addq $12 * SIZE, BO // BO carries a 12 * SIZE bias
  3528. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3529. movq B, BO
  3530. addq $12 * SIZE, BO
  3531. movq KK, %rax
  3532. salq $3, %rax // rax * SIZE
  3533. leaq (BO,%rax,1), BO // add number of values in B
  3534. leaq (AO,%rax,4), AO // add number of values in A
  3535. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  3536. #ifndef TRMMKERNEL
  3537. movq K, %rax
  3538. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3539. movq K, %rax
  3540. subq KK, %rax
  3541. movq %rax, KKK
  3542. #else
  3543. movq KK, %rax
  3544. #ifdef LEFT
  3545. addq $4, %rax // number of values in AO
  3546. #else
  3547. addq $1, %rax // number of values in BO
  3548. #endif
  3549. movq %rax, KKK
  3550. #endif
  3551. INIT4x1
  3552. sarq $3, %rax // count / 8
  3553. je .L1_16
  3554. ALIGN_5
  // One KERNEL4x1 per count / 8; presumably the macro itself covers 8 steps,
  // unlike the _SUB macro used for the leftover - confirm.
  3555. .L1_12:
  3556. KERNEL4x1
  3557. dec %rax
  3558. jne .L1_12
  3559. .L1_16:
  3560. movq KKK, %rax
  3561. andq $7, %rax # leftover steps: KKK & 7
  3562. je .L1_19
  3563. ALIGN_4
  3564. .L1_17:
  3565. KERNEL4x1_SUB
  3566. dec %rax
  3567. jne .L1_17
  3568. ALIGN_4
  3569. .L1_19:
  3570. SAVE4x1
  3571. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3572. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3573. movq K, %rax
  3574. subq KKK, %rax
  3575. salq $3, %rax // rax * SIZE
  3576. leaq (BO, %rax, 1), BO // number of values in B
  3577. leaq (AO, %rax, 4), AO // number of values in A
  3578. #endif
  3579. #if defined(TRMMKERNEL) && defined(LEFT)
  3580. addq $4, KK // number of values in A
  3581. #endif
  3582. decq I # i --
  3583. jg .L1_11
  3584. /**************************************************************************
  3585. * Rest of M
  3586. ***************************************************************************/
  // M remainder for the single column: M & 3 rows are left after the 4-row tiles.
  3587. .L1_20:
  3588. // Test rest of M
  3589. testq $3, M
  3590. jz .L1_100
  3591. .L1_30:
  3592. testq $2, M
  3593. jz .L1_40
  3594. ALIGN_4
  // 2x1 tile, executed once when bit 1 of M is set.
  // INIT2x1, KERNEL2x1_SUB and SAVE2x1 are macros defined outside this section.
  3595. .L1_31:
  3596. #if !defined(TRMMKERNEL) || \
  3597. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3598. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3599. movq B, BO
  3600. addq $12 * SIZE, BO
  3601. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3602. movq B, BO
  3603. addq $12 * SIZE, BO
  3604. movq KK, %rax
  3605. salq $3, %rax // rax * SIZE
  3606. leaq (BO,%rax,1), BO // add number of values in B
  3607. leaq (AO,%rax,2), AO // add number of values in A
  3608. #endif
  // Step count in rax: K, K - KK, or KK + tile size (stored in KKK for TRMM).
  3609. #ifndef TRMMKERNEL
  3610. movq K, %rax
  3611. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3612. movq K, %rax
  3613. subq KK, %rax
  3614. movq %rax, KKK
  3615. #else
  3616. movq KK, %rax
  3617. #ifdef LEFT
  3618. addq $2, %rax // number of values in AO
  3619. #else
  3620. addq $1, %rax // number of values in BO
  3621. #endif
  3622. movq %rax, KKK
  3623. #endif
  3624. INIT2x1
  3625. sarq $3, %rax // count / 8 unrolled groups
  3626. je .L1_36
  3627. ALIGN_4
  3628. .L1_32:
  3629. KERNEL2x1_SUB
  3630. KERNEL2x1_SUB
  3631. KERNEL2x1_SUB
  3632. KERNEL2x1_SUB
  3633. KERNEL2x1_SUB
  3634. KERNEL2x1_SUB
  3635. KERNEL2x1_SUB
  3636. KERNEL2x1_SUB
  3637. dec %rax
  3638. jne .L1_32
  3639. .L1_36:
  3640. movq KKK, %rax
  3641. andq $7, %rax # leftover steps: KKK & 7
  3642. je .L1_39
  3643. ALIGN_4
  3644. .L1_37:
  3645. KERNEL2x1_SUB
  3646. dec %rax
  3647. jne .L1_37
  3648. .L1_39:
  3649. SAVE2x1
  3650. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3651. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3652. movq K, %rax
  3653. subq KKK, %rax
  3654. salq $3, %rax // rax * SIZE
  3655. leaq (BO, %rax, 1), BO // number of values in B
  3656. leaq (AO, %rax, 2), AO // number of values in A
  3657. #endif
  3658. #if defined(TRMMKERNEL) && defined(LEFT)
  3659. addq $2, KK // number of values in A
  3660. #endif
  // 1x1 tile, executed once when bit 0 of M is set, followed by the end of
  // the single-column panel at .L1_100.
  // INIT1x1, KERNEL1x1_SUB and SAVE1x1 are macros defined outside this section.
  3661. .L1_40:
  3662. testq $1, M
  3663. jz .L1_100 // no single row left: panel done
  3664. .L1_41:
  3665. #if !defined(TRMMKERNEL) || \
  3666. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3667. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3668. movq B, BO
  3669. addq $12 * SIZE, BO
  3670. #else
  // Remaining TRMM variants: start KK steps into both operands.
  3671. movq B, BO
  3672. addq $12 * SIZE, BO
  3673. movq KK, %rax
  3674. salq $3, %rax // rax * SIZE
  3675. leaq (BO,%rax,1), BO // add number of values in B
  3676. leaq (AO,%rax,1), AO // add number of values in A
  3677. #endif
  // Step count in rax: K, K - KK, or KK + 1 (stored in KKK for TRMM).
  3678. #ifndef TRMMKERNEL
  3679. movq K, %rax
  3680. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3681. movq K, %rax
  3682. subq KK, %rax
  3683. movq %rax, KKK
  3684. #else
  3685. movq KK, %rax
  3686. #ifdef LEFT
  3687. addq $1, %rax // number of values in AO
  3688. #else
  3689. addq $1, %rax // number of values in BO
  3690. #endif
  3691. movq %rax, KKK
  3692. #endif
  3693. INIT1x1
  3694. sarq $3,%rax
  3695. je .L1_46
  3696. ALIGN_4
  3697. .L1_42:
  3698. KERNEL1x1_SUB
  3699. KERNEL1x1_SUB
  3700. KERNEL1x1_SUB
  3701. KERNEL1x1_SUB
  3702. KERNEL1x1_SUB
  3703. KERNEL1x1_SUB
  3704. KERNEL1x1_SUB
  3705. KERNEL1x1_SUB
  3706. dec %rax
  3707. jne .L1_42
  3708. .L1_46:
  3709. movq KKK, %rax
  3710. andq $7, %rax # leftover steps: KKK & 7
  3711. je .L1_49
  3712. ALIGN_4
  3713. .L1_47:
  3714. KERNEL1x1_SUB
  3715. dec %rax
  3716. jne .L1_47
  3717. .L1_49:
  3718. SAVE1x1
  3719. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3720. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  // Step BO and AO over the K - KKK steps that were not executed.
  3721. movq K, %rax
  3722. subq KKK, %rax
  3723. salq $3, %rax // rax * SIZE
  3724. leaq (BO, %rax, 1), BO // number of values in B
  3725. leaq (AO, %rax, 1), AO // number of values in A
  3726. #endif
  3727. #if defined(TRMMKERNEL) && defined(LEFT)
  3728. addq $1, KK // number of values in A
  3729. #endif
  // End of the single-column panel; execution continues into .L999.
  3730. .L1_100:
  3731. #if defined(TRMMKERNEL) && !defined(LEFT)
  3732. addq $1, KK // number of values in B
  3733. #endif
  // Common exit: restore rsp from SP (the value saved before the scratch area
  // was carved), reload the registers spilled at entry, release the STACKSIZE
  // frame and return. EPILOGUE is a macro defined outside this section.
  3734. .L999:
  3735. vzeroupper
  3736. movq SP, %rsp // back to the STACKSIZE frame
  3737. movq (%rsp), %rbx
  3738. movq 8(%rsp), %rbp
  3739. movq 16(%rsp), %r12
  3740. movq 24(%rsp), %r13
  3741. movq 32(%rsp), %r14
  3742. movq 40(%rsp), %r15
  3743. #ifdef WINDOWS_ABI
  // WINDOWS_ABI build: rdi, rsi and xmm6-xmm15 are reloaded as well.
  3744. movq 48(%rsp), %rdi
  3745. movq 56(%rsp), %rsi
  3746. vmovups 64(%rsp), %xmm6
  3747. vmovups 80(%rsp), %xmm7
  3748. vmovups 96(%rsp), %xmm8
  3749. vmovups 112(%rsp), %xmm9
  3750. vmovups 128(%rsp), %xmm10
  3751. vmovups 144(%rsp), %xmm11
  3752. vmovups 160(%rsp), %xmm12
  3753. vmovups 176(%rsp), %xmm13
  3754. vmovups 192(%rsp), %xmm14
  3755. vmovups 208(%rsp), %xmm15
  3756. #endif
  3757. addq $STACKSIZE, %rsp
  3758. ret
  3759. EPILOGUE
  3760. #endif