
sgemm_kernel_16x2_piledriver.S 121 kB

  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /*********************************************************************
  28. *
  29. * 2013/10/18 Saar
  30. * BLASTEST : OK
  31. * CTEST : OK
  32. * TEST : OK
  33. *
  34. *
  35. * 2013/10/29 Saar
  36. *
  37. * Parameter:
  38. * UNROLL_M 16
  39. * UNROLL_N 2
  40. * SGEMM_P 768
  41. * SGEMM_Q 192
  42. * SGEMM_R 12288
  43. * A_PR1 384
  44. * B_PR1 192
  45. *
  46. * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
  47. *
  48. * 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 )
  49. * 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 )
  50. * 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 )
  51. * 6144x6144 41.3 GFLOPS with 1 thread on 1 module (ACML: 41.1 ) (BULLDOZER: 40.4 )
  52. *
  53. * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
  54. *
  55. * 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 )
  56. * 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 )
  57. * 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 )
  58. * 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 )
  59. * 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 )
  60. * 6144x6144 35.6 GFLOPS with 1 thread on 1 module (ACML: 36.1 ) (BULLDOZER: 35.1 )
  61. *
  62. *********************************************************************/
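/*********************************************************************
* Note on the parameters listed above (inferred from the code below
* and from the usual OpenBLAS naming conventions): UNROLL_M x UNROLL_N
* (16 x 2) is the register blocking; the inner kernels update a
* 16 x 2 tile of C per iteration, held in xmm accumulators.
* SGEMM_P / SGEMM_Q / SGEMM_R are the cache blocking sizes for the
* M / K / N dimensions of the packed panels, as elsewhere in OpenBLAS.
* A_PR1 and B_PR1 are prefetch distances in bytes; A_PR1 reappears in
* the prefetcht0 instructions inside the kernel macros.
*********************************************************************/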
  63. #define ASSEMBLER
  64. #include "common.h"
  65. #define OLD_M %rdi
  66. #define OLD_N %rsi
  67. #define M %r13
  68. #define J %r14
  69. #define OLD_K %rdx
  70. #define A %rcx
  71. #define B %r8
  72. #define C %r9
  73. #define LDC %r10
  74. #define I %r11
  75. #define AO %rdi
  76. #define BO %rsi
  77. #define CO1 %r15
  78. #define K %r12
  79. #define BI %rbp
  80. #define SP %rbx
  81. #define BO1 %rdi
  82. #define BO2 %r15
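/* Note: AO and BO1 alias %rdi (OLD_M), BO aliases %rsi (OLD_N), and
   BO2 aliases %r15 (CO1). The OLD_* argument registers are therefore
   only valid until their values have been copied into M, N, K etc.;
   after that %rdi, %rsi and %r15 are reused as loop pointers. */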
  83. #ifndef WINDOWS_ABI
  84. #define STACKSIZE 96
  85. #else
  86. #define STACKSIZE 256
  87. #define OLD_A 40 + STACKSIZE(%rsp)
  88. #define OLD_B 48 + STACKSIZE(%rsp)
  89. #define OLD_C 56 + STACKSIZE(%rsp)
  90. #define OLD_LDC 64 + STACKSIZE(%rsp)
  91. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  92. #endif
  93. #define L_BUFFER_SIZE 8192
  94. #define LB2_OFFSET 4096
  95. #define Ndiv6 24(%rsp)
  96. #define Nmod6 32(%rsp)
  97. #define N 40(%rsp)
  98. #define ALPHA 48(%rsp)
  99. #define OFFSET 56(%rsp)
  100. #define KK 64(%rsp)
  101. #define KKK 72(%rsp)
  102. #define BUFFER1 128(%rsp)
  103. #define BUFFER2 LB2_OFFSET+128(%rsp)
  104. #if defined(OS_WINDOWS)
  105. #if L_BUFFER_SIZE > 16384
  106. #define STACK_TOUCH \
  107. movl $0, 4096 * 4(%rsp);\
  108. movl $0, 4096 * 3(%rsp);\
  109. movl $0, 4096 * 2(%rsp);\
  110. movl $0, 4096 * 1(%rsp);
  111. #elif L_BUFFER_SIZE > 12288
  112. #define STACK_TOUCH \
  113. movl $0, 4096 * 3(%rsp);\
  114. movl $0, 4096 * 2(%rsp);\
  115. movl $0, 4096 * 1(%rsp);
  116. #elif L_BUFFER_SIZE > 8192
  117. #define STACK_TOUCH \
  118. movl $0, 4096 * 2(%rsp);\
  119. movl $0, 4096 * 1(%rsp);
  120. #elif L_BUFFER_SIZE > 4096
  121. #define STACK_TOUCH \
  122. movl $0, 4096 * 1(%rsp);
  123. #else
  124. #define STACK_TOUCH
  125. #endif
  126. #else
  127. #define STACK_TOUCH
  128. #endif
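/* Note on STACK_TOUCH: on Windows the stack is committed one 4 KB
   guard page at a time, so before the large local buffer area
   (BUFFER1/BUFFER2, L_BUFFER_SIZE bytes) is addressed, the skipped
   pages have to be touched in order. The movl $0, 4096*n(%rsp) stores
   above do exactly that once %rsp has been moved below the buffer
   area; on non-Windows systems the macro expands to nothing. */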
  129. #define A_PR1 384
  130. #define B_PR1 192
  131. /*******************************************************************************************
  132. * 3 lines of N
  133. *******************************************************************************************/
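/* The KERNEL16x3_1..4 macros form a four-way unrolled inner loop body;
   KERNEL16x3_SUB is the single-step variant used for the K remainder.
   Per step: 16 floats of A are loaded in four xmm loads (xmm0, reused),
   three B values are broadcast into xmm1-xmm3, and the 16x3 tile of C
   is accumulated in xmm4-xmm15 with the 4-operand FMA (vfmaddps).
   The B broadcasts for the next step are issued early at the end of
   each step (software pipelining). KERNEL16x3_4 advances BI by 12
   (3 B values x 4 steps) and %rax by 64 (16 A values x 4 steps). */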
  134. #define KERNEL16x3_1(xx) \
  135. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  136. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  137. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  138. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  139. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  140. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  141. nop ;\
  142. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  143. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  144. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  145. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  146. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  147. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  148. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  149. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  150. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  151. vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
  152. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  153. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  154. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  155. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  156. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
  157. vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
  158. #define KERNEL16x3_2(xx) \
  159. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  160. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  161. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  162. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
  163. nop ;\
  164. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  165. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  166. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  167. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  168. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  169. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  170. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  171. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  172. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  173. vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
  174. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  175. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  176. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  177. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  178. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  179. vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
  180. #define KERNEL16x3_3(xx) \
  181. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  182. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  183. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  184. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
  185. nop ;\
  186. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  187. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  188. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  189. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  190. prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
  191. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  192. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  193. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  194. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  195. vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
  196. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  197. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  198. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  199. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  200. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
  201. vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
  202. #define KERNEL16x3_4(xx) \
  203. vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  204. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  205. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  206. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
  207. nop ;\
  208. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  209. vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  210. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  211. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  212. prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
  213. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  214. vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  215. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  216. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  217. addq $12, BI ;\
  218. vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
  219. vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  220. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  221. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  222. addq $64, %rax ;\
  223. vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
  224. #define KERNEL16x3_SUB(xx) \
  225. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  226. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  227. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  228. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  229. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  230. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  231. nop ;\
  232. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  233. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  234. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  235. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  236. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  237. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  238. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  239. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  240. vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\
  241. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  242. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  243. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  244. vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\
  245. /*******************************************************************************************/
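/* KERNEL8x3_*, KERNEL4x3_*, KERNEL2x3_* and KERNEL1x3_* below repeat
   the same scheme with 8, 4, 2 or 1 rows of A per step, covering the
   cases where M is not a multiple of 16; the 2-row and 1-row variants
   drop to scalar vmovss/vfmaddss. */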
  246. #define KERNEL8x3_1(xx) \
  247. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  248. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  249. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  250. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  251. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  252. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  253. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  254. nop ;\
  255. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  256. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  257. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  258. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  259. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  260. #define KERNEL8x3_2(xx) \
  261. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  262. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  263. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  264. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
  265. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  266. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
  267. nop ;\
  268. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  269. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  270. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  271. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  272. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  273. #define KERNEL8x3_3(xx) \
  274. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  275. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  276. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  277. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  278. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  279. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  280. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
  281. nop ;\
  282. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  283. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  284. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  285. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  286. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  287. #define KERNEL8x3_4(xx) \
  288. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  289. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  290. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  291. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
  292. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  293. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
  294. nop ;\
  295. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  296. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  297. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  298. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  299. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  300. addq $12, BI ;\
  301. addq $32, %rax ;\
  302. #define KERNEL8x3_SUB(xx) \
  303. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  304. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  305. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  306. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  307. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  308. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  309. nop ;\
  310. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  311. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  312. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  313. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  314. vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\
  315. /*******************************************************************************************/
  316. #define KERNEL4x3_1(xx) \
  317. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  318. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  319. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  320. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  321. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  322. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  323. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  324. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  325. #define KERNEL4x3_2(xx) \
  326. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  327. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  328. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  329. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
  330. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  331. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
  332. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  333. #define KERNEL4x3_3(xx) \
  334. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  335. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  336. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  337. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  338. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  339. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
  340. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  341. #define KERNEL4x3_4(xx) \
  342. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  343. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  344. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  345. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
  346. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  347. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
  348. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  349. addq $12, BI ;\
  350. addq $16, %rax ;\
  351. #define KERNEL4x3_SUB(xx) \
  352. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  353. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  354. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  355. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  356. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  357. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  358. vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\
  359. /*******************************************************************************************/
  360. #define KERNEL2x3_1(xx) \
  361. vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  362. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  363. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  364. vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  365. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  366. vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  367. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  368. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  369. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  370. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  371. vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\
  372. #define KERNEL2x3_2(xx) \
  373. vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  374. vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  375. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  376. vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
  377. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  378. vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
  379. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  380. vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  381. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  382. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  383. vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\
  384. #define KERNEL2x3_3(xx) \
  385. vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  386. vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  387. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  388. vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  389. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  390. vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
  391. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  392. vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  393. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  394. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  395. vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\
  396. #define KERNEL2x3_4(xx) \
  397. vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  398. vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  399. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  400. vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
  401. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  402. vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
  403. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  404. vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  405. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  406. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  407. vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\
  408. addq $12, BI ;\
  409. addq $8, %rax ;\
  410. #define KERNEL2x3_SUB(xx) \
  411. vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  412. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  413. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  414. vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  415. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  416. vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  417. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  418. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  419. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  420. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  421. vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\
  422. /*******************************************************************************************/
  423. #define KERNEL1x3_1(xx) \
  424. vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  425. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  426. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  427. vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  428. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  429. vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  430. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  431. #define KERNEL1x3_2(xx) \
  432. vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  433. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  434. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  435. vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
  436. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  437. vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\
  438. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  439. #define KERNEL1x3_3(xx) \
  440. vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  441. vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  442. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  443. vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  444. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  445. vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\
  446. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  447. #define KERNEL1x3_4(xx) \
  448. vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\
  449. vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  450. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  451. vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
  452. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  453. vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\
  454. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  455. addq $12, BI ;\
  456. addq $4, %rax ;\
  457. #define KERNEL1x3_SUB(xx) \
  458. vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
  459. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  460. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  461. vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
  462. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  463. vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\
  464. vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\
  465. /*******************************************************************************************/
  466. /*******************************************************************************************
  467. * 2 lines of N
  468. *******************************************************************************************/
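/* "2 lines of N" is the 16 x 2 register blocking that gives this file
   its name (UNROLL_N = 2 above). Per step: 16 floats of A, two B
   broadcasts (xmm1, xmm2), and the 16x2 tile of C accumulated in
   xmm4/xmm5, xmm7/xmm8, xmm10/xmm11 and xmm13/xmm14. KERNEL16x2_4
   advances BI by 8 and %rax by 64. */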
  469. #define KERNEL16x2_1(xx) \
  470. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  471. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  472. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  473. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  474. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  475. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  476. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  477. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  478. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  479. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  480. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  481. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  482. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  483. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  484. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  485. #define KERNEL16x2_2(xx) \
  486. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  487. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  488. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  489. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  490. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  491. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  492. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  493. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  494. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  495. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  496. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  497. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  498. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  499. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  500. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  501. #define KERNEL16x2_3(xx) \
  502. prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
  503. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  504. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  505. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  506. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  507. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  508. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  509. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  510. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  511. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  512. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  513. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  514. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  515. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  516. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  517. #define KERNEL16x2_4(xx) \
  518. prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
  519. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  520. vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  521. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  522. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  523. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  524. vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  525. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  526. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  527. vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  528. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  529. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  530. vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  531. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  532. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  533. addq $8, BI ;\
  534. addq $64, %rax ;\
  535. #define KERNEL16x2_SUB(xx) \
  536. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  537. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  538. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  539. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  540. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  541. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  542. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  543. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  544. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  545. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  546. vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\
  547. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  548. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  549. vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\
  550. /*******************************************************************************************/
  551. #define KERNEL8x2_1(xx) \
  552. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  553. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  554. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  555. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  556. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  557. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  558. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  559. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  560. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  561. #define KERNEL8x2_2(xx) \
  562. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  563. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  564. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  565. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  566. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  567. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  568. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  569. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  570. #define KERNEL8x2_3(xx) \
  571. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  572. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  573. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  574. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  575. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  576. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  577. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  578. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  579. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  580. #define KERNEL8x2_4(xx) \
  581. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  582. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  583. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  584. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  585. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  586. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  587. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  588. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  589. addq $8, BI ;\
  590. addq $32, %rax ;\
  591. #define KERNEL8x2_SUB(xx) \
  592. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  593. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  594. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  595. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  596. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  597. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  598. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  599. vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\
  600. /*******************************************************************************************/
  601. #define KERNEL4x2_1(xx) \
  602. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  603. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  604. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  605. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  606. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  607. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  608. #define KERNEL4x2_2(xx) \
  609. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  610. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  611. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  612. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  613. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  614. #define KERNEL4x2_3(xx) \
  615. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  616. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  617. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  618. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  619. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  620. #define KERNEL4x2_4(xx) \
  621. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  622. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  623. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  624. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  625. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  626. addq $8, BI ;\
  627. addq $16, %rax ;\
  628. #define KERNEL4x2_SUB(xx) \
  629. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  630. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  631. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  632. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  633. vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\
  634. /*******************************************************************************************/
  635. #define KERNEL2x2_1(xx) \
  636. vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  637. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  638. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  639. vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  640. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  641. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  642. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  643. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  644. #define KERNEL2x2_2(xx) \
  645. vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  646. vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  647. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  648. vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  649. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  650. vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  651. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  652. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  653. #define KERNEL2x2_3(xx) \
  654. vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  655. vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  656. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  657. vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  658. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  659. vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  660. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  661. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  662. #define KERNEL2x2_4(xx) \
  663. vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  664. vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  665. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  666. vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  667. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  668. vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  669. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  670. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  671. addq $8, BI ;\
  672. addq $8, %rax ;\
  673. #define KERNEL2x2_SUB(xx) \
  674. vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  675. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  676. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  677. vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  678. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  679. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  680. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  681. vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\
  682. /*******************************************************************************************/
  683. #define KERNEL1x2_1(xx) \
  684. vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  685. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  686. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  687. vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  688. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  689. #define KERNEL1x2_2(xx) \
  690. vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  691. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  692. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  693. vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  694. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  695. #define KERNEL1x2_3(xx) \
  696. vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  697. vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  698. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  699. vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
  700. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  701. #define KERNEL1x2_4(xx) \
  702. vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  703. vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  704. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  705. vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  706. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  707. addq $8, BI ;\
  708. addq $4, %rax ;\
  709. #define KERNEL1x2_SUB(xx) \
  710. vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\
  711. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  712. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  713. vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\
  714. vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\
  715. /*******************************************************************************************/
  716. /*******************************************************************************************
  717. * 1 line of N
  718. *******************************************************************************************/
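/* Single-column kernels for the N remainder: one B value is broadcast
   per step and one accumulator per 4-float group of A is live
   (xmm4, xmm7, xmm10, xmm13 in the 16-row case); KERNEL16x1_4
   advances BI by 4 and %rax by 64. */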
  719. #define KERNEL16x1_1(xx) \
  720. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  721. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  722. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  723. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  724. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  725. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  726. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  727. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  728. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  729. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  730. #define KERNEL16x1_2(xx) \
  731. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  732. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  733. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  734. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  735. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  736. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  737. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  738. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  739. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  740. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  741. #define KERNEL16x1_3(xx) \
  742. prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\
  743. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  744. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  745. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  746. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  747. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  748. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  749. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  750. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  751. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  752. #define KERNEL16x1_4(xx) \
  753. prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\
  754. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  755. vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  756. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  757. vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  758. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  759. vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  760. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  761. vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  762. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  763. addq $4, BI ;\
  764. addq $64, %rax ;\
  765. #define KERNEL16x1_SUB(xx) \
  766. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  767. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  768. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  769. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  770. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  771. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  772. vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
  773. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  774. vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\
  775. /*******************************************************************************************/
  776. #define KERNEL8x1_1(xx) \
  777. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  778. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  779. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  780. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  781. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  782. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  783. #define KERNEL8x1_2(xx) \
  784. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  785. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  786. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  787. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  788. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  789. #define KERNEL8x1_3(xx) \
  790. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  791. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  792. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  793. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  794. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  795. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  796. #define KERNEL8x1_4(xx) \
  797. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  798. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  799. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  800. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  801. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  802. addq $4, BI ;\
  803. addq $32, %rax ;\
  804. #define KERNEL8x1_SUB(xx) \
  805. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  806. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  807. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  808. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  809. vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
  810. /*******************************************************************************************/
  811. #define KERNEL4x1_1(xx) \
  812. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  813. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  814. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  815. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  816. #define KERNEL4x1_2(xx) \
  817. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  818. vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  819. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  820. #define KERNEL4x1_3(xx) \
  821. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  822. vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  823. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  824. #define KERNEL4x1_4(xx) \
  825. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  826. vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  827. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  828. addq $4, BI ;\
  829. addq $16, %rax ;\
  830. #define KERNEL4x1_SUB(xx) \
  831. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  832. vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  833. vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\
  834. /*******************************************************************************************/
  835. #define KERNEL2x1_1(xx) \
  836. vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  837. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  838. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  839. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  840. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  841. #define KERNEL2x1_2(xx) \
  842. vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  843. vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  844. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  845. vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  846. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  847. #define KERNEL2x1_3(xx) \
  848. vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  849. vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  850. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  851. vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  852. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  853. #define KERNEL2x1_4(xx) \
  854. vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  855. vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  856. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  857. vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  858. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  859. addq $4, BI ;\
  860. addq $8, %rax ;\
  861. #define KERNEL2x1_SUB(xx) \
  862. vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  863. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  864. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  865. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  866. vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\
  867. /*******************************************************************************************/
  868. #define KERNEL1x1_1(xx) \
  869. vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  870. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  871. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  872. #define KERNEL1x1_2(xx) \
  873. vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  874. vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  875. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  876. #define KERNEL1x1_3(xx) \
  877. vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\
  878. vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  879. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  880. #define KERNEL1x1_4(xx) \
  881. vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\
  882. vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  883. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  884. addq $4, BI ;\
  885. addq $4, %rax ;\
  886. #define KERNEL1x1_SUB(xx) \
  887. vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\
  888. vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  889. vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\
  890. /*******************************************************************************************/
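/* Rough C-like sketch (illustration only, not authoritative) of what one
   KERNEL16x1 step above computes: one B value is broadcast into xmm1 and
   multiplied into 16 consecutive A values, accumulating into xmm4/7/10/13:
       b = B[bi];                                 // vbroadcastss
       for (i = 0; i < 16; i += 4)
           acc[i/4] += A[ax+i .. ax+i+3] * b;     // vmovups + vfmaddps
   The _1.._4 variants unroll four consecutive k steps, _SUB handles a single
   left-over k, and the other KERNELmxn macros follow the same pattern with
   fewer rows of A and/or more columns of B. */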
  891. #if !defined(TRMMKERNEL)
  892. PROLOGUE
  893. PROFCODE
  894. subq $STACKSIZE, %rsp
  895. movq %rbx, (%rsp)
  896. movq %rbp, 8(%rsp)
  897. movq %r12, 16(%rsp)
  898. movq %r13, 24(%rsp)
  899. movq %r14, 32(%rsp)
  900. movq %r15, 40(%rsp)
  901. vzeroupper
  902. #ifdef WINDOWS_ABI
  903. movq %rdi, 48(%rsp)
  904. movq %rsi, 56(%rsp)
  905. movups %xmm6, 64(%rsp)
  906. movups %xmm7, 80(%rsp)
  907. movups %xmm8, 96(%rsp)
  908. movups %xmm9, 112(%rsp)
  909. movups %xmm10, 128(%rsp)
  910. movups %xmm11, 144(%rsp)
  911. movups %xmm12, 160(%rsp)
  912. movups %xmm13, 176(%rsp)
  913. movups %xmm14, 192(%rsp)
  914. movups %xmm15, 208(%rsp)
  915. movq ARG1, OLD_M
  916. movq ARG2, OLD_N
  917. movq ARG3, OLD_K
  918. movq OLD_A, A
  919. movq OLD_B, B
  920. movq OLD_C, C
  921. movq OLD_LDC, LDC
  922. vmovaps %xmm3, %xmm0
  923. #else
  924. movq STACKSIZE + 8(%rsp), LDC
  925. #endif
  926. movq %rsp, SP # save old stack
  927. subq $128 + L_BUFFER_SIZE, %rsp
  928. andq $-4096, %rsp # align stack
  929. STACK_TOUCH
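// The packed-B buffers live below the saved stack pointer: the subq/andq pair
// above reserves 128 + L_BUFFER_SIZE bytes and rounds %rsp down to a 4 KB
// boundary, presumably so that BUFFER1/BUFFER2 start page aligned.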
  930. cmpq $0, OLD_M
  931. je .L999
  932. cmpq $0, OLD_N
  933. je .L999
  934. cmpq $0, OLD_K
  935. je .L999
  936. movq OLD_M, M
  937. movq OLD_N, N
  938. movq OLD_K, K
  939. vmovsd %xmm0, ALPHA
  940. salq $BASE_SHIFT, LDC
  941. movq N, %rax
  942. xorq %rdx, %rdx
  943. movq $6, %rdi
  944. divq %rdi // N / 6
  945. movq %rax, Ndiv6 // N / 6
  946. movq %rdx, Nmod6 // N % 6
  947. movq Ndiv6, J
  948. cmpq $0, J
  949. je .L2_0
  950. ALIGN_4
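/* Column blocking (rough sketch): N is processed in groups of 6 columns, each
   group as two 3-column passes (.L6_xx on BUFFER1, then .L7_xx on BUFFER2),
   with the remainder handled in pairs and then singly:
       for (j = 0; j < N/6; j++)     { pack B; 3-col pass; 3-col pass; }
       for (j = 0; j < (N%6)/2; j++) { pack B; 2-col pass; }
       if (N%6 & 1)                  { pack B; 1-col pass; }
*/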
  951. .L6_01:
  952. // copy to sub buffer
  953. movq K, %rax
  954. salq $1,%rax // K * 2 ; read 2 values
  955. movq B, BO1
  956. leaq (B,%rax, SIZE), BO2 // next offset to BO2
  957. leaq BUFFER1, BO // first buffer to BO
  958. movq K, %rax
  959. sarq $3 , %rax // K / 8
  960. jz .L6_01a_2
  961. ALIGN_4
  962. .L6_01a_1:
  963. prefetcht0 512(BO1)
  964. prefetcht0 512(BO2)
  965. prefetchw 512(BO)
  966. vmovsd 0 * SIZE(BO1), %xmm0
  967. vmovsd 2 * SIZE(BO1), %xmm2
  968. vmovsd 4 * SIZE(BO1), %xmm4
  969. vmovsd 6 * SIZE(BO1), %xmm6
  970. vmovss 0 * SIZE(BO2), %xmm1
  971. vmovss 2 * SIZE(BO2), %xmm3
  972. vmovss 4 * SIZE(BO2), %xmm5
  973. vmovss 6 * SIZE(BO2), %xmm7
  974. vmovsd %xmm0, 0*SIZE(BO)
  975. vmovss %xmm1, 2*SIZE(BO)
  976. vmovsd %xmm2, 3*SIZE(BO)
  977. vmovss %xmm3, 5*SIZE(BO)
  978. vmovsd %xmm4, 6*SIZE(BO)
  979. vmovss %xmm5, 8*SIZE(BO)
  980. vmovsd %xmm6, 9*SIZE(BO)
  981. vmovss %xmm7,11*SIZE(BO)
  982. addq $8*SIZE,BO1
  983. addq $8*SIZE,BO2
  984. addq $12*SIZE,BO
  985. vmovsd 0 * SIZE(BO1), %xmm0
  986. vmovsd 2 * SIZE(BO1), %xmm2
  987. vmovsd 4 * SIZE(BO1), %xmm4
  988. vmovsd 6 * SIZE(BO1), %xmm6
  989. vmovss 0 * SIZE(BO2), %xmm1
  990. vmovss 2 * SIZE(BO2), %xmm3
  991. vmovss 4 * SIZE(BO2), %xmm5
  992. vmovss 6 * SIZE(BO2), %xmm7
  993. vmovsd %xmm0, 0*SIZE(BO)
  994. vmovss %xmm1, 2*SIZE(BO)
  995. vmovsd %xmm2, 3*SIZE(BO)
  996. vmovss %xmm3, 5*SIZE(BO)
  997. vmovsd %xmm4, 6*SIZE(BO)
  998. vmovss %xmm5, 8*SIZE(BO)
  999. vmovsd %xmm6, 9*SIZE(BO)
  1000. vmovss %xmm7,11*SIZE(BO)
  1001. addq $8*SIZE,BO1
  1002. addq $8*SIZE,BO2
  1003. addq $12*SIZE,BO
  1004. decq %rax
  1005. jnz .L6_01a_1
  1006. .L6_01a_2:
  1007. movq K, %rax
  1008. andq $7, %rax // K % 8
  1009. jz .L6_02c
  1010. ALIGN_4
  1011. .L6_02b:
  1012. vmovsd 0 * SIZE(BO1), %xmm0
  1013. vmovss 0 * SIZE(BO2), %xmm2
  1014. vmovsd %xmm0, 0*SIZE(BO)
  1015. vmovss %xmm2, 2*SIZE(BO)
  1016. addq $2*SIZE,BO1
  1017. addq $2*SIZE,BO2
  1018. addq $3*SIZE,BO
  1019. decq %rax
  1020. jnz .L6_02b
  1021. .L6_02c:
  1022. movq K, %rax
  1023. salq $1,%rax // K * 2
  1024. leaq (B,%rax, SIZE), BO1 // next offset to BO1
  1025. leaq (BO1,%rax, SIZE), BO2 // next offset to BO2
  1026. leaq BUFFER2, BO // second buffer to BO
  1027. movq K, %rax
  1028. sarq $3 , %rax // K / 8
  1029. jz .L6_02c_2
  1030. ALIGN_4
  1031. .L6_02c_1:
  1032. prefetcht0 512(BO2)
  1033. prefetchw 512(BO)
  1034. vmovsd 0 * SIZE(BO2), %xmm0
  1035. vmovsd 2 * SIZE(BO2), %xmm2
  1036. vmovsd 4 * SIZE(BO2), %xmm4
  1037. vmovsd 6 * SIZE(BO2), %xmm6
  1038. vmovss 1 * SIZE(BO1), %xmm1
  1039. vmovss 3 * SIZE(BO1), %xmm3
  1040. vmovss 5 * SIZE(BO1), %xmm5
  1041. vmovss 7 * SIZE(BO1), %xmm7
  1042. vmovss %xmm1, 0*SIZE(BO)
  1043. vmovsd %xmm0, 1*SIZE(BO)
  1044. vmovss %xmm3, 3*SIZE(BO)
  1045. vmovsd %xmm2, 4*SIZE(BO)
  1046. vmovss %xmm5, 6*SIZE(BO)
  1047. vmovsd %xmm4, 7*SIZE(BO)
  1048. vmovss %xmm7, 9*SIZE(BO)
  1049. vmovsd %xmm6,10*SIZE(BO)
  1050. addq $8*SIZE,BO1
  1051. addq $8*SIZE,BO2
  1052. addq $12*SIZE,BO
  1053. vmovsd 0 * SIZE(BO2), %xmm0
  1054. vmovsd 2 * SIZE(BO2), %xmm2
  1055. vmovsd 4 * SIZE(BO2), %xmm4
  1056. vmovsd 6 * SIZE(BO2), %xmm6
  1057. vmovss 1 * SIZE(BO1), %xmm1
  1058. vmovss 3 * SIZE(BO1), %xmm3
  1059. vmovss 5 * SIZE(BO1), %xmm5
  1060. vmovss 7 * SIZE(BO1), %xmm7
  1061. vmovss %xmm1, 0*SIZE(BO)
  1062. vmovsd %xmm0, 1*SIZE(BO)
  1063. vmovss %xmm3, 3*SIZE(BO)
  1064. vmovsd %xmm2, 4*SIZE(BO)
  1065. vmovss %xmm5, 6*SIZE(BO)
  1066. vmovsd %xmm4, 7*SIZE(BO)
  1067. vmovss %xmm7, 9*SIZE(BO)
  1068. vmovsd %xmm6,10*SIZE(BO)
  1069. addq $8*SIZE,BO1
  1070. addq $8*SIZE,BO2
  1071. addq $12*SIZE,BO
  1072. decq %rax
  1073. jnz .L6_02c_1
  1074. .L6_02c_2:
  1075. movq K, %rax
  1076. andq $7, %rax // K % 8
  1077. jz .L6_03c
  1078. ALIGN_4
  1079. .L6_03b:
  1080. vmovss 1*SIZE(BO1), %xmm0
  1081. vmovsd 0*SIZE(BO2), %xmm1
  1082. vmovss %xmm0, 0*SIZE(BO)
  1083. vmovsd %xmm1, 1*SIZE(BO)
  1084. addq $2*SIZE,BO1
  1085. addq $2*SIZE,BO2
  1086. addq $3*SIZE,BO
  1087. decq %rax
  1088. jnz .L6_03b
  1089. .L6_03c:
  1090. movq BO2, B // next offset of B
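/* B packing (rough sketch): the two copy loops above split the six packed
   columns of B for this block into two 3-wide panels, roughly
       for (k = 0; k < K; k++) {
           BUFFER1[3*k+0] = Bcol01[2*k+0];   // columns 0,1 read via BO1
           BUFFER1[3*k+1] = Bcol01[2*k+1];
           BUFFER1[3*k+2] = Bcol23[2*k+0];   // column 2 read via BO2
           BUFFER2[3*k+0] = Bcol23[2*k+1];   // column 3
           BUFFER2[3*k+1] = Bcol45[2*k+0];   // columns 4,5
           BUFFER2[3*k+2] = Bcol45[2*k+1];
       }
   Bcol01/Bcol23/Bcol45 are hypothetical names for the three 2-column packed
   panels of B, used here only for illustration. */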
  1091. .L6_10:
  1092. movq C, CO1
  1093. leaq (C, LDC, 2), C
  1094. leaq (C, LDC, 1), C // c += 3 * ldc
  1095. movq A, AO // aoffset = a
  1096. addq $32 * SIZE, AO
  1097. movq M, I
  1098. sarq $4, I // i = (m >> 4)
  1099. je .L6_20
  1100. ALIGN_4
  1101. .L6_11:
  1102. leaq BUFFER1, BO // first buffer to BO
  1103. addq $6 * SIZE, BO
  1104. vzeroall
  1105. movq K, %rax
  1106. andq $-8, %rax // K = K - ( K % 8 )
  1107. je .L6_16
  1108. movq %rax, BI // Index for BO
  1109. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1110. salq $4, %rax // rax = rax * 16 ; number of values
  1111. leaq (AO, %rax, SIZE), AO
  1112. leaq (BO, BI, SIZE), BO
  1113. negq BI
  1114. negq %rax
  1115. ALIGN_4
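// Index trick used by all the inner loops: AO and BO were advanced past the
// data to be consumed and %rax / BI were negated, so (AO,%rax,SIZE) and
// (BO,BI,SIZE) start at the first element and both counters count up towards
// zero.  The intermediate `je .L6_16` branches below test the zero flag left
// by the addq on %rax that ends KERNEL16x3_4 (cf. KERNEL16x1_4 above), so the
// 8-way unrolled block falls through as soon as the bulk iterations are done.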
  1116. .L6_12:
  1117. prefetcht0 B_PR1(BO,BI, SIZE)
  1118. KERNEL16x3_1(xxx)
  1119. KERNEL16x3_2(xxx)
  1120. KERNEL16x3_3(xxx)
  1121. KERNEL16x3_4(xxx)
  1122. KERNEL16x3_1(xxx)
  1123. prefetcht0 B_PR1+16(BO,BI, SIZE)
  1124. KERNEL16x3_2(xxx)
  1125. KERNEL16x3_3(xxx)
  1126. KERNEL16x3_4(xxx)
  1127. je .L6_16
  1128. KERNEL16x3_1(xxx)
  1129. KERNEL16x3_2(xxx)
  1130. prefetcht0 B_PR1+32(BO,BI, SIZE)
  1131. KERNEL16x3_3(xxx)
  1132. KERNEL16x3_4(xxx)
  1133. KERNEL16x3_1(xxx)
  1134. KERNEL16x3_2(xxx)
  1135. KERNEL16x3_3(xxx)
  1136. KERNEL16x3_4(xxx)
  1137. je .L6_16
  1138. jmp .L6_12
  1139. ALIGN_4
  1140. .L6_16:
  1141. movq K, %rax
1142. andq $7, %rax # if (k & 7)
  1143. je .L6_19
  1144. movq %rax, BI // Index for BO
  1145. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1146. salq $4, %rax // rax = rax * 16 ; number of values
  1147. leaq (AO, %rax, SIZE), AO
  1148. leaq (BO, BI, SIZE), BO
  1149. negq BI
  1150. negq %rax
  1151. ALIGN_4
  1152. .L6_17:
  1153. KERNEL16x3_SUB(xxx)
  1154. addq $3, BI
  1155. addq $16, %rax
  1156. jl .L6_17
  1157. ALIGN_4
  1158. .L6_19:
  1159. vbroadcastss ALPHA, %xmm0
  1160. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1161. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1162. vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  1163. vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  1164. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  1165. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1166. vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
  1167. vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
  1168. vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1169. vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
  1170. vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12
  1171. vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15
  1172. vmovups %xmm4 , (CO1)
  1173. vmovups %xmm7 , 4 * SIZE(CO1)
  1174. vmovups %xmm10, 8 * SIZE(CO1)
  1175. vmovups %xmm13,12 * SIZE(CO1)
  1176. vmovups %xmm5 , (CO1, LDC)
  1177. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  1178. vmovups %xmm11, 8 * SIZE(CO1, LDC)
  1179. vmovups %xmm14,12 * SIZE(CO1, LDC)
  1180. vmovups %xmm6 , (CO1, LDC, 2)
  1181. vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2)
  1182. vmovups %xmm12, 8 * SIZE(CO1, LDC, 2)
  1183. vmovups %xmm15,12 * SIZE(CO1, LDC, 2)
  1184. addq $16 * SIZE, CO1 # coffset += 16
  1185. decq I # i --
  1186. jg .L6_11
  1187. ALIGN_4
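/* Write-back of the 16x3 tile (rough sketch): with alpha broadcast in xmm0,
   each vfmaddps above folds the load of C, the scaling and the addition:
       for (j = 0; j < 3; j++)
           for (i = 0; i < 16; i++)
               C[i + j*ldc] = alpha * acc[i][j] + C[i + j*ldc];
   followed by plain vmovups stores of the updated values. */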
  1188. /**************************************************************************
  1189. * Rest of M
  1190. ***************************************************************************/
  1191. .L6_20:
  1192. // Test rest of M
  1193. testq $15, M
  1194. jz .L7_10 // to next 3 lines of N
  1195. testq $8, M
  1196. jz .L6_21pre
  1197. ALIGN_4
  1198. /**************************************************************************/
  1199. .L6_20_1:
  1200. leaq BUFFER1, BO // first buffer to BO
  1201. addq $6 * SIZE, BO
  1202. vzeroall
  1203. movq K, %rax
  1204. andq $-8, %rax
  1205. je .L6_20_6
  1206. movq %rax, BI // Index for BO
  1207. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1208. salq $3, %rax // rax = rax * 8 ; number of values
  1209. leaq (AO, %rax, SIZE), AO
  1210. leaq (BO, BI, SIZE), BO
  1211. negq BI
  1212. negq %rax
  1213. ALIGN_4
  1214. .L6_20_2:
  1215. prefetcht0 B_PR1(BO,BI, SIZE)
  1216. KERNEL8x3_1(xxx)
  1217. KERNEL8x3_2(xxx)
  1218. KERNEL8x3_3(xxx)
  1219. KERNEL8x3_4(xxx)
  1220. KERNEL8x3_1(xxx)
  1221. prefetcht0 B_PR1+16(BO,BI, SIZE)
  1222. KERNEL8x3_2(xxx)
  1223. KERNEL8x3_3(xxx)
  1224. KERNEL8x3_4(xxx)
  1225. je .L6_20_6
  1226. KERNEL8x3_1(xxx)
  1227. KERNEL8x3_2(xxx)
  1228. prefetcht0 B_PR1+32(BO,BI, SIZE)
  1229. KERNEL8x3_3(xxx)
  1230. KERNEL8x3_4(xxx)
  1231. KERNEL8x3_1(xxx)
  1232. KERNEL8x3_2(xxx)
  1233. KERNEL8x3_3(xxx)
  1234. KERNEL8x3_4(xxx)
  1235. je .L6_20_6
  1236. jmp .L6_20_2
  1237. ALIGN_4
  1238. .L6_20_6:
  1239. movq K, %rax
1240. andq $7, %rax # if (k & 7)
  1241. je .L6_20_9
  1242. movq %rax, BI // Index for BO
  1243. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1244. salq $3, %rax // rax = rax * 8 ; number of values
  1245. leaq (AO, %rax, SIZE), AO
  1246. leaq (BO, BI, SIZE), BO
  1247. negq BI
  1248. negq %rax
  1249. ALIGN_4
  1250. .L6_20_7:
  1251. KERNEL8x3_SUB(xxx)
  1252. addq $3, BI
  1253. addq $8, %rax
  1254. jl .L6_20_7
  1255. ALIGN_4
  1256. .L6_20_9:
  1257. vbroadcastss ALPHA, %xmm0
  1258. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1259. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1260. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  1261. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1262. vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1263. vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
  1264. vmovups %xmm4 , (CO1)
  1265. vmovups %xmm7 , 4 * SIZE(CO1)
  1266. vmovups %xmm5 , (CO1, LDC)
  1267. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  1268. vmovups %xmm6 , (CO1, LDC, 2)
  1269. vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2)
  1270. addq $8 * SIZE, CO1 # coffset += 8
  1271. ALIGN_4
  1272. /**************************************************************************/
  1273. .L6_21pre:
  1274. testq $4, M
  1275. jz .L6_30
  1276. ALIGN_4
  1277. .L6_21:
  1278. leaq BUFFER1, BO // first buffer to BO
  1279. addq $6 * SIZE, BO
  1280. vzeroall
  1281. movq K, %rax
  1282. andq $-8, %rax
  1283. je .L6_26
  1284. movq %rax, BI // Index for BO
  1285. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1286. salq $2, %rax // rax = rax * 4 ; number of values
  1287. leaq (AO, %rax, SIZE), AO
  1288. leaq (BO, BI, SIZE), BO
  1289. negq BI
  1290. negq %rax
  1291. ALIGN_4
  1292. .L6_22:
  1293. prefetcht0 B_PR1(BO,BI, SIZE)
  1294. KERNEL4x3_1(xxx)
  1295. KERNEL4x3_2(xxx)
  1296. KERNEL4x3_3(xxx)
  1297. KERNEL4x3_4(xxx)
  1298. KERNEL4x3_1(xxx)
  1299. prefetcht0 B_PR1+16(BO,BI, SIZE)
  1300. KERNEL4x3_2(xxx)
  1301. KERNEL4x3_3(xxx)
  1302. KERNEL4x3_4(xxx)
  1303. je .L6_26
  1304. KERNEL4x3_1(xxx)
  1305. KERNEL4x3_2(xxx)
  1306. prefetcht0 B_PR1+32(BO,BI, SIZE)
  1307. KERNEL4x3_3(xxx)
  1308. KERNEL4x3_4(xxx)
  1309. KERNEL4x3_1(xxx)
  1310. KERNEL4x3_2(xxx)
  1311. KERNEL4x3_3(xxx)
  1312. KERNEL4x3_4(xxx)
  1313. je .L6_26
  1314. jmp .L6_22
  1315. ALIGN_4
  1316. .L6_26:
  1317. movq K, %rax
1318. andq $7, %rax # if (k & 7)
  1319. je .L6_29
  1320. movq %rax, BI // Index for BO
  1321. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1322. salq $2, %rax // rax = rax * 4 ; number of values
  1323. leaq (AO, %rax, SIZE), AO
  1324. leaq (BO, BI, SIZE), BO
  1325. negq BI
  1326. negq %rax
  1327. ALIGN_4
  1328. .L6_27:
  1329. KERNEL4x3_SUB(xxx)
  1330. addq $3, BI
  1331. addq $4, %rax
  1332. jl .L6_27
  1333. ALIGN_4
  1334. .L6_29:
  1335. vbroadcastss ALPHA, %xmm0
  1336. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1337. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  1338. vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1339. vmovups %xmm4 , (CO1)
  1340. vmovups %xmm5 , (CO1, LDC)
  1341. vmovups %xmm6 , (CO1, LDC, 2)
  1342. addq $4 * SIZE, CO1 # coffset += 4
  1343. ALIGN_4
  1344. .L6_30:
  1345. testq $2, M
  1346. jz .L6_40
  1347. ALIGN_4
  1348. .L6_31:
  1349. leaq BUFFER1, BO // first buffer to BO
  1350. addq $6 * SIZE, BO
  1351. vzeroall
  1352. movq K, %rax
  1353. andq $-8, %rax
  1354. je .L6_36
  1355. movq %rax, BI // Index for BO
  1356. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1357. salq $1, %rax // rax = rax *2 ; number of values
  1358. leaq (AO, %rax, SIZE), AO
  1359. leaq (BO, BI, SIZE), BO
  1360. negq BI
  1361. negq %rax
  1362. ALIGN_4
  1363. .L6_32:
  1364. prefetcht0 B_PR1(BO,BI,SIZE)
  1365. KERNEL2x3_1(xxx)
  1366. KERNEL2x3_2(xxx)
  1367. KERNEL2x3_3(xxx)
  1368. KERNEL2x3_4(xxx)
  1369. KERNEL2x3_1(xxx)
  1370. prefetcht0 B_PR1+16(BO,BI,SIZE)
  1371. KERNEL2x3_2(xxx)
  1372. KERNEL2x3_3(xxx)
  1373. KERNEL2x3_4(xxx)
  1374. je .L6_36
  1375. KERNEL2x3_1(xxx)
  1376. KERNEL2x3_2(xxx)
  1377. prefetcht0 B_PR1+32(BO,BI,SIZE)
  1378. KERNEL2x3_3(xxx)
  1379. KERNEL2x3_4(xxx)
  1380. KERNEL2x3_1(xxx)
  1381. KERNEL2x3_2(xxx)
  1382. KERNEL2x3_3(xxx)
  1383. KERNEL2x3_4(xxx)
  1384. je .L6_36
  1385. jmp .L6_32
  1386. ALIGN_4
  1387. .L6_36:
  1388. movq K, %rax
1389. andq $7, %rax # if (k & 7)
  1390. je .L6_39
  1391. movq %rax, BI // Index for BO
  1392. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1393. salq $1, %rax // rax = rax *2 ; number of values
  1394. leaq (AO, %rax, SIZE), AO
  1395. leaq (BO, BI, SIZE), BO
  1396. negq BI
  1397. negq %rax
  1398. ALIGN_4
  1399. .L6_37:
  1400. KERNEL2x3_SUB(xxx)
  1401. addq $3, BI
  1402. addq $2, %rax
  1403. jl .L6_37
  1404. ALIGN_4
  1405. .L6_39:
  1406. vmovss ALPHA, %xmm0
  1407. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  1408. vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8
  1409. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  1410. vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10
  1411. vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1412. vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12
  1413. vmovss %xmm4 , (CO1)
  1414. vmovss %xmm8 , 1 * SIZE(CO1)
  1415. vmovss %xmm5 , (CO1, LDC)
  1416. vmovss %xmm10, 1 * SIZE(CO1, LDC)
  1417. vmovss %xmm6 , (CO1, LDC, 2)
  1418. vmovss %xmm12, 1 * SIZE(CO1, LDC, 2)
  1419. addq $2 * SIZE, CO1 # coffset += 2
  1420. ALIGN_4
  1421. .L6_40:
  1422. testq $1, M
  1423. jz .L7_10 // to next 3 lines of N
  1424. ALIGN_4
  1425. .L6_41:
  1426. leaq BUFFER1, BO // first buffer to BO
  1427. addq $6 * SIZE, BO
  1428. vzeroall
  1429. movq K, %rax
  1430. andq $-8, %rax
  1431. je .L6_46
  1432. movq %rax, BI // Index for BO
  1433. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1434. leaq (AO, %rax, SIZE), AO
  1435. leaq (BO, BI, SIZE), BO
  1436. negq BI
  1437. negq %rax
  1438. ALIGN_4
  1439. .L6_42:
  1440. KERNEL1x3_1(xxx)
  1441. KERNEL1x3_2(xxx)
  1442. KERNEL1x3_3(xxx)
  1443. KERNEL1x3_4(xxx)
  1444. KERNEL1x3_1(xxx)
  1445. KERNEL1x3_2(xxx)
  1446. KERNEL1x3_3(xxx)
  1447. KERNEL1x3_4(xxx)
  1448. je .L6_46
  1449. KERNEL1x3_1(xxx)
  1450. KERNEL1x3_2(xxx)
  1451. KERNEL1x3_3(xxx)
  1452. KERNEL1x3_4(xxx)
  1453. KERNEL1x3_1(xxx)
  1454. KERNEL1x3_2(xxx)
  1455. KERNEL1x3_3(xxx)
  1456. KERNEL1x3_4(xxx)
  1457. je .L6_46
  1458. jmp .L6_42
  1459. ALIGN_4
  1460. .L6_46:
  1461. movq K, %rax
1462. andq $7, %rax # if (k & 7)
  1463. je .L6_49
  1464. movq %rax, BI // Index for BO
  1465. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1466. leaq (AO, %rax, SIZE), AO
  1467. leaq (BO, BI, SIZE), BO
  1468. negq BI
  1469. negq %rax
  1470. ALIGN_4
  1471. .L6_47:
  1472. KERNEL1x3_SUB(xxx)
  1473. addq $3, BI
  1474. addq $1, %rax
  1475. jl .L6_47
  1476. ALIGN_4
  1477. .L6_49:
  1478. vmovss ALPHA, %xmm0
  1479. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  1480. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  1481. vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1482. vmovss %xmm4 , (CO1)
  1483. vmovss %xmm5 , (CO1, LDC)
  1484. vmovss %xmm6 , (CO1, LDC, 2)
  1485. addq $1 * SIZE, CO1 # coffset += 1
  1486. ALIGN_4
  1487. /***************************************************************************************************************/
  1488. .L7_10:
  1489. movq C, CO1
  1490. leaq (C, LDC, 2), C
  1491. leaq (C, LDC, 1), C // c += 3 * ldc
  1492. movq A, AO // aoffset = a
  1493. addq $32 * SIZE, AO
  1494. movq M, I
  1495. sarq $4, I // i = (m >> 4)
  1496. je .L7_20
  1497. ALIGN_4
  1498. .L7_11:
  1499. leaq BUFFER2, BO // second buffer to BO
  1500. addq $6 * SIZE, BO
  1501. vzeroall
  1502. movq K, %rax
  1503. andq $-8, %rax // K = K - ( K % 8 )
  1504. je .L7_16
  1505. movq %rax, BI // Index for BO
  1506. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1507. salq $4, %rax // rax = rax * 16 ; number of values
  1508. leaq (AO, %rax, SIZE), AO
  1509. leaq (BO, BI, SIZE), BO
  1510. negq BI
  1511. negq %rax
  1512. ALIGN_4
  1513. .L7_12:
  1514. prefetcht0 B_PR1(BO,BI, SIZE)
  1515. KERNEL16x3_1(xxx)
  1516. KERNEL16x3_2(xxx)
  1517. KERNEL16x3_3(xxx)
  1518. KERNEL16x3_4(xxx)
  1519. KERNEL16x3_1(xxx)
  1520. prefetcht0 B_PR1+16(BO,BI, SIZE)
  1521. KERNEL16x3_2(xxx)
  1522. KERNEL16x3_3(xxx)
  1523. KERNEL16x3_4(xxx)
  1524. je .L7_16
  1525. KERNEL16x3_1(xxx)
  1526. KERNEL16x3_2(xxx)
  1527. prefetcht0 B_PR1+32(BO,BI, SIZE)
  1528. KERNEL16x3_3(xxx)
  1529. KERNEL16x3_4(xxx)
  1530. KERNEL16x3_1(xxx)
  1531. KERNEL16x3_2(xxx)
  1532. KERNEL16x3_3(xxx)
  1533. KERNEL16x3_4(xxx)
  1534. je .L7_16
  1535. jmp .L7_12
  1536. ALIGN_4
  1537. .L7_16:
  1538. movq K, %rax
1539. andq $7, %rax # if (k & 7)
  1540. je .L7_19
  1541. movq %rax, BI // Index for BO
  1542. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1543. salq $4, %rax // rax = rax * 16 ; number of values
  1544. leaq (AO, %rax, SIZE), AO
  1545. leaq (BO, BI, SIZE), BO
  1546. negq BI
  1547. negq %rax
  1548. ALIGN_4
  1549. .L7_17:
  1550. KERNEL16x3_SUB(xxx)
  1551. addq $3, BI
  1552. addq $16, %rax
  1553. jl .L7_17
  1554. ALIGN_4
  1555. .L7_19:
  1556. vbroadcastss ALPHA, %xmm0
  1557. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1558. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1559. vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  1560. vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  1561. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  1562. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1563. vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
  1564. vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
  1565. vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1566. vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
  1567. vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12
  1568. vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15
  1569. vmovups %xmm4 , (CO1)
  1570. vmovups %xmm7 , 4 * SIZE(CO1)
  1571. vmovups %xmm10, 8 * SIZE(CO1)
  1572. vmovups %xmm13,12 * SIZE(CO1)
  1573. vmovups %xmm5 , (CO1, LDC)
  1574. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  1575. vmovups %xmm11, 8 * SIZE(CO1, LDC)
  1576. vmovups %xmm14,12 * SIZE(CO1, LDC)
  1577. vmovups %xmm6 , (CO1, LDC, 2)
  1578. vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2)
  1579. vmovups %xmm12, 8 * SIZE(CO1, LDC, 2)
  1580. vmovups %xmm15,12 * SIZE(CO1, LDC, 2)
  1581. addq $16 * SIZE, CO1 # coffset += 16
  1582. decq I # i --
  1583. jg .L7_11
  1584. ALIGN_4
  1585. /**************************************************************************
  1586. * Rest of M
  1587. ***************************************************************************/
  1588. .L7_20:
  1589. // Test rest of M
  1590. testq $15, M
  1591. jz .L7_60 // to next 3 lines of N
  1592. testq $8, M
  1593. jz .L7_21pre
  1594. ALIGN_4
  1595. /**************************************************************************/
  1596. .L7_20_1:
1597. leaq BUFFER2, BO // second buffer to BO
  1598. addq $6 * SIZE, BO
  1599. vzeroall
  1600. movq K, %rax
  1601. andq $-8, %rax
  1602. je .L7_20_6
  1603. movq %rax, BI // Index for BO
  1604. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1605. salq $3, %rax // rax = rax * 8 ; number of values
  1606. leaq (AO, %rax, SIZE), AO
  1607. leaq (BO, BI, SIZE), BO
  1608. negq BI
  1609. negq %rax
  1610. ALIGN_4
  1611. .L7_20_2:
  1612. prefetcht0 B_PR1(BO,BI, SIZE)
  1613. KERNEL8x3_1(xxx)
  1614. KERNEL8x3_2(xxx)
  1615. KERNEL8x3_3(xxx)
  1616. KERNEL8x3_4(xxx)
  1617. KERNEL8x3_1(xxx)
  1618. prefetcht0 B_PR1+16(BO,BI, SIZE)
  1619. KERNEL8x3_2(xxx)
  1620. KERNEL8x3_3(xxx)
  1621. KERNEL8x3_4(xxx)
  1622. je .L7_20_6
  1623. KERNEL8x3_1(xxx)
  1624. KERNEL8x3_2(xxx)
  1625. prefetcht0 B_PR1+32(BO,BI, SIZE)
  1626. KERNEL8x3_3(xxx)
  1627. KERNEL8x3_4(xxx)
  1628. KERNEL8x3_1(xxx)
  1629. KERNEL8x3_2(xxx)
  1630. KERNEL8x3_3(xxx)
  1631. KERNEL8x3_4(xxx)
  1632. je .L7_20_6
  1633. jmp .L7_20_2
  1634. ALIGN_4
  1635. .L7_20_6:
  1636. movq K, %rax
1637. andq $7, %rax # if (k & 7)
  1638. je .L7_20_9
  1639. movq %rax, BI // Index for BO
  1640. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1641. salq $3, %rax // rax = rax * 8 ; number of values
  1642. leaq (AO, %rax, SIZE), AO
  1643. leaq (BO, BI, SIZE), BO
  1644. negq BI
  1645. negq %rax
  1646. ALIGN_4
  1647. .L7_20_7:
  1648. KERNEL8x3_SUB(xxx)
  1649. addq $3, BI
  1650. addq $8, %rax
  1651. jl .L7_20_7
  1652. ALIGN_4
  1653. .L7_20_9:
  1654. vbroadcastss ALPHA, %xmm0
  1655. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1656. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1657. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  1658. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1659. vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1660. vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
  1661. vmovups %xmm4 , (CO1)
  1662. vmovups %xmm7 , 4 * SIZE(CO1)
  1663. vmovups %xmm5 , (CO1, LDC)
  1664. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  1665. vmovups %xmm6 , (CO1, LDC, 2)
  1666. vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2)
  1667. addq $8 * SIZE, CO1 # coffset += 8
  1668. ALIGN_4
  1669. /**************************************************************************/
  1670. .L7_21pre:
  1671. testq $4, M
  1672. jz .L7_30
  1673. ALIGN_4
  1674. .L7_21:
  1675. leaq BUFFER2, BO // second buffer to BO
  1676. addq $6 * SIZE, BO
  1677. vzeroall
  1678. movq K, %rax
  1679. andq $-8, %rax
  1680. je .L7_26
  1681. movq %rax, BI // Index for BO
  1682. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1683. salq $2, %rax // rax = rax * 4 ; number of values
  1684. leaq (AO, %rax, SIZE), AO
  1685. leaq (BO, BI, SIZE), BO
  1686. negq BI
  1687. negq %rax
  1688. ALIGN_4
  1689. .L7_22:
  1690. prefetcht0 B_PR1(BO,BI, SIZE)
  1691. KERNEL4x3_1(xxx)
  1692. KERNEL4x3_2(xxx)
  1693. KERNEL4x3_3(xxx)
  1694. KERNEL4x3_4(xxx)
  1695. KERNEL4x3_1(xxx)
  1696. prefetcht0 B_PR1+16(BO,BI, SIZE)
  1697. KERNEL4x3_2(xxx)
  1698. KERNEL4x3_3(xxx)
  1699. KERNEL4x3_4(xxx)
  1700. je .L7_26
  1701. KERNEL4x3_1(xxx)
  1702. KERNEL4x3_2(xxx)
  1703. prefetcht0 B_PR1+32(BO,BI, SIZE)
  1704. KERNEL4x3_3(xxx)
  1705. KERNEL4x3_4(xxx)
  1706. KERNEL4x3_1(xxx)
  1707. KERNEL4x3_2(xxx)
  1708. KERNEL4x3_3(xxx)
  1709. KERNEL4x3_4(xxx)
  1710. je .L7_26
  1711. jmp .L7_22
  1712. ALIGN_4
  1713. .L7_26:
  1714. movq K, %rax
1715. andq $7, %rax # if (k & 7)
  1716. je .L7_29
  1717. movq %rax, BI // Index for BO
  1718. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1719. salq $2, %rax // rax = rax * 4 ; number of values
  1720. leaq (AO, %rax, SIZE), AO
  1721. leaq (BO, BI, SIZE), BO
  1722. negq BI
  1723. negq %rax
  1724. ALIGN_4
  1725. .L7_27:
  1726. KERNEL4x3_SUB(xxx)
  1727. addq $3, BI
  1728. addq $4, %rax
  1729. jl .L7_27
  1730. ALIGN_4
  1731. .L7_29:
  1732. vbroadcastss ALPHA, %xmm0
  1733. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1734. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
1735. vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1736. vmovups %xmm4 , (CO1)
  1737. vmovups %xmm5 , (CO1, LDC)
  1738. vmovups %xmm6 , (CO1, LDC, 2)
  1739. addq $4 * SIZE, CO1 # coffset += 4
  1740. ALIGN_4
  1741. .L7_30:
  1742. testq $2, M
  1743. jz .L7_40
  1744. ALIGN_4
  1745. .L7_31:
  1746. leaq BUFFER2, BO // second buffer to BO
  1747. addq $6 * SIZE, BO
  1748. vzeroall
  1749. movq K, %rax
  1750. andq $-8, %rax
  1751. je .L7_36
  1752. movq %rax, BI // Index for BO
  1753. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1754. salq $1, %rax // rax = rax *2 ; number of values
  1755. leaq (AO, %rax, SIZE), AO
  1756. leaq (BO, BI, SIZE), BO
  1757. negq BI
  1758. negq %rax
  1759. ALIGN_4
  1760. .L7_32:
  1761. prefetcht0 B_PR1(BO,BI,SIZE)
  1762. KERNEL2x3_1(xxx)
  1763. KERNEL2x3_2(xxx)
  1764. KERNEL2x3_3(xxx)
  1765. KERNEL2x3_4(xxx)
  1766. KERNEL2x3_1(xxx)
  1767. prefetcht0 B_PR1+16(BO,BI,SIZE)
  1768. KERNEL2x3_2(xxx)
  1769. KERNEL2x3_3(xxx)
  1770. KERNEL2x3_4(xxx)
  1771. je .L7_36
  1772. KERNEL2x3_1(xxx)
  1773. KERNEL2x3_2(xxx)
  1774. prefetcht0 B_PR1+32(BO,BI,SIZE)
  1775. KERNEL2x3_3(xxx)
  1776. KERNEL2x3_4(xxx)
  1777. KERNEL2x3_1(xxx)
  1778. KERNEL2x3_2(xxx)
  1779. KERNEL2x3_3(xxx)
  1780. KERNEL2x3_4(xxx)
  1781. je .L7_36
  1782. jmp .L7_32
  1783. ALIGN_4
  1784. .L7_36:
  1785. movq K, %rax
1786. andq $7, %rax # if (k & 7)
  1787. je .L7_39
  1788. movq %rax, BI // Index for BO
  1789. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1790. salq $1, %rax // rax = rax *2 ; number of values
  1791. leaq (AO, %rax, SIZE), AO
  1792. leaq (BO, BI, SIZE), BO
  1793. negq BI
  1794. negq %rax
  1795. ALIGN_4
  1796. .L7_37:
  1797. KERNEL2x3_SUB(xxx)
  1798. addq $3, BI
  1799. addq $2, %rax
  1800. jl .L7_37
  1801. ALIGN_4
  1802. .L7_39:
  1803. vmovss ALPHA, %xmm0
  1804. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  1805. vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8
  1806. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  1807. vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10
  1808. vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1809. vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12
  1810. vmovss %xmm4 , (CO1)
  1811. vmovss %xmm8 , 1 * SIZE(CO1)
  1812. vmovss %xmm5 , (CO1, LDC)
  1813. vmovss %xmm10, 1 * SIZE(CO1, LDC)
  1814. vmovss %xmm6 , (CO1, LDC, 2)
  1815. vmovss %xmm12, 1 * SIZE(CO1, LDC, 2)
  1816. addq $2 * SIZE, CO1 # coffset += 2
  1817. ALIGN_4
  1818. .L7_40:
  1819. testq $1, M
  1820. jz .L7_60 // to next 3 lines of N
  1821. ALIGN_4
  1822. .L7_41:
  1823. leaq BUFFER2, BO // second buffer to BO
  1824. addq $6 * SIZE, BO
  1825. vzeroall
  1826. movq K, %rax
  1827. andq $-8, %rax
  1828. je .L7_46
  1829. movq %rax, BI // Index for BO
  1830. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1831. leaq (AO, %rax, SIZE), AO
  1832. leaq (BO, BI, SIZE), BO
  1833. negq BI
  1834. negq %rax
  1835. ALIGN_4
  1836. .L7_42:
  1837. KERNEL1x3_1(xxx)
  1838. KERNEL1x3_2(xxx)
  1839. KERNEL1x3_3(xxx)
  1840. KERNEL1x3_4(xxx)
  1841. KERNEL1x3_1(xxx)
  1842. KERNEL1x3_2(xxx)
  1843. KERNEL1x3_3(xxx)
  1844. KERNEL1x3_4(xxx)
  1845. je .L7_46
  1846. KERNEL1x3_1(xxx)
  1847. KERNEL1x3_2(xxx)
  1848. KERNEL1x3_3(xxx)
  1849. KERNEL1x3_4(xxx)
  1850. KERNEL1x3_1(xxx)
  1851. KERNEL1x3_2(xxx)
  1852. KERNEL1x3_3(xxx)
  1853. KERNEL1x3_4(xxx)
  1854. je .L7_46
  1855. jmp .L7_42
  1856. ALIGN_4
  1857. .L7_46:
  1858. movq K, %rax
1859. andq $7, %rax # if (k & 7)
  1860. je .L7_49
  1861. movq %rax, BI // Index for BO
  1862. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1863. leaq (AO, %rax, SIZE), AO
  1864. leaq (BO, BI, SIZE), BO
  1865. negq BI
  1866. negq %rax
  1867. ALIGN_4
  1868. .L7_47:
  1869. KERNEL1x3_SUB(xxx)
  1870. addq $3, BI
  1871. addq $1, %rax
  1872. jl .L7_47
  1873. ALIGN_4
  1874. .L7_49:
  1875. vmovss ALPHA, %xmm0
  1876. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  1877. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  1878. vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1879. vmovss %xmm4 , (CO1)
  1880. vmovss %xmm5 , (CO1, LDC)
  1881. vmovss %xmm6 , (CO1, LDC, 2)
  1882. addq $1 * SIZE, CO1 # coffset += 1
  1883. ALIGN_4
  1884. .L7_60:
  1885. decq J // j --
  1886. jg .L6_01
  1887. .L2_0:
  1888. cmpq $0, Nmod6 // N % 6 == 0
  1889. je .L999
  1890. /************************************************************************************************
  1891. * Loop for Nmod6 / 2 > 0
  1892. *************************************************************************************************/
  1893. movq Nmod6, J
  1894. sarq $1, J // j = j / 2
  1895. je .L1_0
  1896. ALIGN_4
  1897. .L2_01:
  1898. // copy to sub buffer
  1899. movq B, BO1
  1900. leaq BUFFER1, BO // first buffer to BO
  1901. movq K, %rax
  1902. ALIGN_4
  1903. .L2_02b:
  1904. vmovsd (BO1), %xmm0
  1905. vmovsd %xmm0, (BO)
  1906. addq $2*SIZE,BO1
  1907. addq $2*SIZE,BO
  1908. decq %rax
  1909. jnz .L2_02b
  1910. .L2_02c:
  1911. movq BO1, B // next offset of B
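// For the 2-column blocks no re-interleaving is needed; the loop above simply
// copies the packed 2-wide panel of B into BUFFER1, roughly
//     for (k = 0; k < K; k++) { BUFFER1[2*k] = B[2*k]; BUFFER1[2*k+1] = B[2*k+1]; }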
  1912. .L2_10:
  1913. movq C, CO1
  1914. leaq (C, LDC, 2), C // c += 2 * ldc
  1915. movq A, AO // aoffset = a
  1916. addq $32 * SIZE, AO
  1917. movq M, I
  1918. sarq $4, I // i = (m >> 4)
  1919. je .L2_20
  1920. ALIGN_4
  1921. .L2_11:
  1922. leaq BUFFER1, BO // first buffer to BO
  1923. addq $4 * SIZE, BO
  1924. vzeroall
  1925. movq K, %rax
  1926. andq $-8, %rax // K = K - ( K % 8 )
  1927. je .L2_16
  1928. movq %rax, BI // Index for BO
  1929. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1930. salq $4, %rax // rax = rax * 16 ; number of values
  1931. leaq (AO, %rax, SIZE), AO
  1932. leaq (BO, BI, SIZE), BO
  1933. negq BI
  1934. negq %rax
  1935. ALIGN_4
  1936. .L2_12:
  1937. prefetcht0 B_PR1(BO,BI, SIZE)
  1938. KERNEL16x2_1(xxx)
  1939. KERNEL16x2_2(xxx)
  1940. KERNEL16x2_3(xxx)
  1941. KERNEL16x2_4(xxx)
  1942. KERNEL16x2_1(xxx)
  1943. KERNEL16x2_2(xxx)
  1944. KERNEL16x2_3(xxx)
  1945. KERNEL16x2_4(xxx)
  1946. je .L2_16
  1947. prefetcht0 B_PR1(BO,BI, SIZE)
  1948. KERNEL16x2_1(xxx)
  1949. KERNEL16x2_2(xxx)
  1950. KERNEL16x2_3(xxx)
  1951. KERNEL16x2_4(xxx)
  1952. KERNEL16x2_1(xxx)
  1953. KERNEL16x2_2(xxx)
  1954. KERNEL16x2_3(xxx)
  1955. KERNEL16x2_4(xxx)
  1956. je .L2_16
  1957. jmp .L2_12
  1958. ALIGN_4
  1959. .L2_16:
  1960. movq K, %rax
1961. andq $7, %rax # if (k & 7)
  1962. je .L2_19
  1963. movq %rax, BI // Index for BO
  1964. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1965. salq $4, %rax // rax = rax * 16 ; number of values
  1966. leaq (AO, %rax, SIZE), AO
  1967. leaq (BO, BI, SIZE), BO
  1968. negq BI
  1969. negq %rax
  1970. ALIGN_4
  1971. .L2_17:
  1972. KERNEL16x2_SUB(xxx)
  1973. addq $2, BI
  1974. addq $16, %rax
  1975. jl .L2_17
  1976. ALIGN_4
  1977. .L2_19:
  1978. vbroadcastss ALPHA, %xmm0
  1979. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  1980. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1981. vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  1982. vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  1983. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  1984. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1985. vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
  1986. vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
  1987. vmovups %xmm4 , (CO1)
  1988. vmovups %xmm7 , 4 * SIZE(CO1)
  1989. vmovups %xmm10, 8 * SIZE(CO1)
  1990. vmovups %xmm13,12 * SIZE(CO1)
  1991. vmovups %xmm5 , (CO1, LDC)
  1992. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  1993. vmovups %xmm11, 8 * SIZE(CO1, LDC)
  1994. vmovups %xmm14,12 * SIZE(CO1, LDC)
  1995. addq $16 * SIZE, CO1 # coffset += 16
  1996. decq I # i --
  1997. jg .L2_11
  1998. ALIGN_4
  1999. /**************************************************************************
  2000. * Rest of M
  2001. ***************************************************************************/
  2002. .L2_20:
  2003. // Test rest of M
  2004. testq $15, M
2005. jz .L2_60 // to next 2 lines of N
  2006. testq $8, M
  2007. jz .L2_21pre
  2008. ALIGN_4
  2009. /**************************************************************************/
  2010. .L2_20_1:
  2011. leaq BUFFER1, BO // first buffer to BO
  2012. addq $4 * SIZE, BO
  2013. vzeroall
  2014. movq K, %rax
  2015. andq $-8, %rax
  2016. je .L2_20_6
  2017. movq %rax, BI // Index for BO
  2018. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2019. salq $3, %rax // rax = rax * 8 ; number of values
  2020. leaq (AO, %rax, SIZE), AO
  2021. leaq (BO, BI, SIZE), BO
  2022. negq BI
  2023. negq %rax
  2024. ALIGN_4
  2025. .L2_20_2:
  2026. prefetcht0 B_PR1(BO,BI, SIZE)
  2027. KERNEL8x2_1(xxx)
  2028. KERNEL8x2_2(xxx)
  2029. KERNEL8x2_3(xxx)
  2030. KERNEL8x2_4(xxx)
  2031. KERNEL8x2_1(xxx)
  2032. KERNEL8x2_2(xxx)
  2033. KERNEL8x2_3(xxx)
  2034. KERNEL8x2_4(xxx)
  2035. je .L2_20_6
  2036. prefetcht0 B_PR1(BO,BI, SIZE)
  2037. KERNEL8x2_1(xxx)
  2038. KERNEL8x2_2(xxx)
  2039. KERNEL8x2_3(xxx)
  2040. KERNEL8x2_4(xxx)
  2041. KERNEL8x2_1(xxx)
  2042. KERNEL8x2_2(xxx)
  2043. KERNEL8x2_3(xxx)
  2044. KERNEL8x2_4(xxx)
  2045. je .L2_20_6
  2046. jmp .L2_20_2
  2047. ALIGN_4
  2048. .L2_20_6:
  2049. movq K, %rax
2050. andq $7, %rax # if (k & 7)
  2051. je .L2_20_9
  2052. movq %rax, BI // Index for BO
  2053. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2054. salq $3, %rax // rax = rax * 8 ; number of values
  2055. leaq (AO, %rax, SIZE), AO
  2056. leaq (BO, BI, SIZE), BO
  2057. negq BI
  2058. negq %rax
  2059. ALIGN_4
  2060. .L2_20_7:
  2061. KERNEL8x2_SUB(xxx)
  2062. addq $2, BI
  2063. addq $8, %rax
  2064. jl .L2_20_7
  2065. ALIGN_4
  2066. .L2_20_9:
  2067. vbroadcastss ALPHA, %xmm0
  2068. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  2069. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2070. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  2071. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  2072. vmovups %xmm4 , (CO1)
  2073. vmovups %xmm7 , 4 * SIZE(CO1)
  2074. vmovups %xmm5 , (CO1, LDC)
  2075. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  2076. addq $8 * SIZE, CO1 # coffset += 8
  2077. ALIGN_4
  2078. /**************************************************************************/
  2079. .L2_21pre:
  2080. testq $4, M
  2081. jz .L2_30
  2082. ALIGN_4
  2083. .L2_21:
  2084. leaq BUFFER1, BO // first buffer to BO
  2085. addq $4 * SIZE, BO
  2086. vzeroall
  2087. movq K, %rax
  2088. andq $-8, %rax
  2089. je .L2_26
  2090. movq %rax, BI // Index for BO
2091. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2092. salq $2, %rax // rax = rax * 4 ; number of values
  2093. leaq (AO, %rax, SIZE), AO
  2094. leaq (BO, BI, SIZE), BO
  2095. negq BI
  2096. negq %rax
  2097. ALIGN_4
  2098. .L2_22:
  2099. prefetcht0 B_PR1(BO,BI, SIZE)
  2100. KERNEL4x2_1(xxx)
  2101. KERNEL4x2_2(xxx)
  2102. KERNEL4x2_3(xxx)
  2103. KERNEL4x2_4(xxx)
  2104. KERNEL4x2_1(xxx)
  2105. KERNEL4x2_2(xxx)
  2106. KERNEL4x2_3(xxx)
  2107. KERNEL4x2_4(xxx)
  2108. je .L2_26
  2109. prefetcht0 B_PR1(BO,BI, SIZE)
  2110. KERNEL4x2_1(xxx)
  2111. KERNEL4x2_2(xxx)
  2112. KERNEL4x2_3(xxx)
  2113. KERNEL4x2_4(xxx)
  2114. KERNEL4x2_1(xxx)
  2115. KERNEL4x2_2(xxx)
  2116. KERNEL4x2_3(xxx)
  2117. KERNEL4x2_4(xxx)
  2118. je .L2_26
  2119. jmp .L2_22
  2120. ALIGN_4
  2121. .L2_26:
  2122. movq K, %rax
2123. andq $7, %rax # if (k & 7)
  2124. je .L2_29
  2125. movq %rax, BI // Index for BO
  2126. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2127. salq $2, %rax // rax = rax * 4 ; number of values
  2128. leaq (AO, %rax, SIZE), AO
  2129. leaq (BO, BI, SIZE), BO
  2130. negq BI
  2131. negq %rax
  2132. ALIGN_4
  2133. .L2_27:
  2134. KERNEL4x2_SUB(xxx)
  2135. addq $2, BI
  2136. addq $4, %rax
  2137. jl .L2_27
  2138. ALIGN_4
  2139. .L2_29:
  2140. vbroadcastss ALPHA, %xmm0
  2141. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  2142. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  2143. vmovups %xmm4 , (CO1)
  2144. vmovups %xmm5 , (CO1, LDC)
  2145. addq $4 * SIZE, CO1 # coffset += 4
  2146. ALIGN_4
  2147. .L2_30:
  2148. testq $2, M
  2149. jz .L2_40
  2150. ALIGN_4
  2151. .L2_31:
  2152. leaq BUFFER1, BO // first buffer to BO
  2153. addq $4 * SIZE, BO
  2154. vzeroall
  2155. movq K, %rax
  2156. andq $-8, %rax
  2157. je .L2_36
  2158. movq %rax, BI // Index for BO
  2159. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2160. salq $1, %rax // rax = rax *2 ; number of values
  2161. leaq (AO, %rax, SIZE), AO
  2162. leaq (BO, BI, SIZE), BO
  2163. negq BI
  2164. negq %rax
  2165. ALIGN_4
  2166. .L2_32:
  2167. prefetcht0 B_PR1(BO,BI,SIZE)
  2168. KERNEL2x2_1(xxx)
  2169. KERNEL2x2_2(xxx)
  2170. KERNEL2x2_3(xxx)
  2171. KERNEL2x2_4(xxx)
  2172. KERNEL2x2_1(xxx)
  2173. KERNEL2x2_2(xxx)
  2174. KERNEL2x2_3(xxx)
  2175. KERNEL2x2_4(xxx)
  2176. je .L2_36
  2177. prefetcht0 B_PR1(BO,BI,SIZE)
  2178. KERNEL2x2_1(xxx)
  2179. KERNEL2x2_2(xxx)
  2180. KERNEL2x2_3(xxx)
  2181. KERNEL2x2_4(xxx)
  2182. KERNEL2x2_1(xxx)
  2183. KERNEL2x2_2(xxx)
  2184. KERNEL2x2_3(xxx)
  2185. KERNEL2x2_4(xxx)
  2186. je .L2_36
  2187. jmp .L2_32
  2188. ALIGN_4
  2189. .L2_36:
  2190. movq K, %rax
2191. andq $7, %rax # if (k & 7)
  2192. je .L2_39
  2193. movq %rax, BI // Index for BO
  2194. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2195. salq $1, %rax // rax = rax *2 ; number of values
  2196. leaq (AO, %rax, SIZE), AO
  2197. leaq (BO, BI, SIZE), BO
  2198. negq BI
  2199. negq %rax
  2200. ALIGN_4
  2201. .L2_37:
  2202. KERNEL2x2_SUB(xxx)
  2203. addq $2, BI
  2204. addq $2, %rax
  2205. jl .L2_37
  2206. ALIGN_4
  2207. .L2_39:
  2208. vmovss ALPHA, %xmm0
  2209. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  2210. vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8
  2211. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  2212. vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10
  2213. vmovss %xmm4 , (CO1)
  2214. vmovss %xmm8 , 1 * SIZE(CO1)
  2215. vmovss %xmm5 , (CO1, LDC)
  2216. vmovss %xmm10, 1 * SIZE(CO1, LDC)
  2217. addq $2 * SIZE, CO1 # coffset += 2
  2218. ALIGN_4
  2219. .L2_40:
  2220. testq $1, M
  2221. jz .L2_60 // to next 2 lines of N
  2222. ALIGN_4
  2223. .L2_41:
  2224. leaq BUFFER1, BO // first buffer to BO
  2225. addq $4 * SIZE, BO
  2226. vzeroall
  2227. movq K, %rax
  2228. andq $-8, %rax
  2229. je .L2_46
  2230. movq %rax, BI // Index for BO
  2231. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2232. leaq (AO, %rax, SIZE), AO
  2233. leaq (BO, BI, SIZE), BO
  2234. negq BI
  2235. negq %rax
  2236. ALIGN_4
  2237. .L2_42:
  2238. KERNEL1x2_1(xxx)
  2239. KERNEL1x2_2(xxx)
  2240. KERNEL1x2_3(xxx)
  2241. KERNEL1x2_4(xxx)
  2242. KERNEL1x2_1(xxx)
  2243. KERNEL1x2_2(xxx)
  2244. KERNEL1x2_3(xxx)
  2245. KERNEL1x2_4(xxx)
  2246. je .L2_46
  2247. KERNEL1x2_1(xxx)
  2248. KERNEL1x2_2(xxx)
  2249. KERNEL1x2_3(xxx)
  2250. KERNEL1x2_4(xxx)
  2251. KERNEL1x2_1(xxx)
  2252. KERNEL1x2_2(xxx)
  2253. KERNEL1x2_3(xxx)
  2254. KERNEL1x2_4(xxx)
  2255. je .L2_46
  2256. jmp .L2_42
  2257. ALIGN_4
  2258. .L2_46:
  2259. movq K, %rax
2260. andq $7, %rax # if (k & 7)
  2261. je .L2_49
  2262. movq %rax, BI // Index for BO
  2263. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2264. leaq (AO, %rax, SIZE), AO
  2265. leaq (BO, BI, SIZE), BO
  2266. negq BI
  2267. negq %rax
  2268. ALIGN_4
  2269. .L2_47:
  2270. KERNEL1x2_SUB(xxx)
  2271. addq $2, BI
  2272. addq $1, %rax
  2273. jl .L2_47
  2274. ALIGN_4
  2275. .L2_49:
  2276. vmovss ALPHA, %xmm0
  2277. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  2278. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  2279. vmovss %xmm4 , (CO1)
  2280. vmovss %xmm5 , (CO1, LDC)
  2281. addq $1 * SIZE, CO1 # coffset += 1
  2282. ALIGN_4
  2283. .L2_60:
  2284. decq J // j --
  2285. jg .L2_01 // next 2 lines of N
  2286. .L1_0:
  2287. /************************************************************************************************
  2288. * Loop for Nmod6 % 2 > 0
  2289. *************************************************************************************************/
  2290. movq Nmod6, J
  2291. andq $1, J // j % 2
  2292. je .L999
  2293. ALIGN_4
  2294. .L1_01:
  2295. // copy to sub buffer
  2296. movq B, BO1
  2297. leaq BUFFER1, BO // first buffer to BO
  2298. movq K, %rax
  2299. ALIGN_4
  2300. .L1_02b:
  2301. vmovss (BO1), %xmm0
  2302. vmovss %xmm0, (BO)
  2303. addq $1*SIZE,BO1
  2304. addq $1*SIZE,BO
  2305. decq %rax
  2306. jnz .L1_02b
  2307. .L1_02c:
  2308. movq BO1, B // next offset of B
  2309. .L1_10:
  2310. movq C, CO1
  2311. leaq (C, LDC, 1), C // c += 1 * ldc
  2312. movq A, AO // aoffset = a
  2313. addq $32 * SIZE, AO
  2314. movq M, I
  2315. sarq $4, I // i = (m >> 4)
  2316. je .L1_20
  2317. ALIGN_4
  2318. .L1_11:
  2319. leaq BUFFER1, BO // first buffer to BO
  2320. addq $2 * SIZE, BO
  2321. vzeroall
  2322. movq K, %rax
  2323. andq $-8, %rax // K = K - ( K % 8 )
  2324. je .L1_16
  2325. movq %rax, BI // Index for BO
  2326. salq $4, %rax // rax = rax * 16 ; number of values
  2327. leaq (AO, %rax, SIZE), AO
  2328. leaq (BO, BI, SIZE), BO
  2329. negq BI
  2330. negq %rax
  2331. ALIGN_4
  2332. .L1_12:
  2333. prefetcht0 B_PR1(BO,BI, SIZE)
  2334. KERNEL16x1_1(xxx)
  2335. KERNEL16x1_2(xxx)
  2336. KERNEL16x1_3(xxx)
  2337. KERNEL16x1_4(xxx)
  2338. KERNEL16x1_1(xxx)
  2339. KERNEL16x1_2(xxx)
  2340. KERNEL16x1_3(xxx)
  2341. KERNEL16x1_4(xxx)
  2342. je .L1_16
  2343. KERNEL16x1_1(xxx)
  2344. KERNEL16x1_2(xxx)
  2345. KERNEL16x1_3(xxx)
  2346. KERNEL16x1_4(xxx)
  2347. KERNEL16x1_1(xxx)
  2348. KERNEL16x1_2(xxx)
  2349. KERNEL16x1_3(xxx)
  2350. KERNEL16x1_4(xxx)
  2351. je .L1_16
  2352. jmp .L1_12
  2353. ALIGN_4
  2354. .L1_16:
  2355. movq K, %rax
2356. andq $7, %rax # if (k & 7)
  2357. je .L1_19
  2358. movq %rax, BI // Index for BO
  2359. salq $4, %rax // rax = rax * 16 ; number of values
  2360. leaq (AO, %rax, SIZE), AO
  2361. leaq (BO, BI, SIZE), BO
  2362. negq BI
  2363. negq %rax
  2364. ALIGN_4
  2365. .L1_17:
  2366. KERNEL16x1_SUB(xxx)
  2367. addq $1, BI
  2368. addq $16, %rax
  2369. jl .L1_17
  2370. ALIGN_4
  2371. .L1_19:
  2372. vbroadcastss ALPHA, %xmm0
  2373. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  2374. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2375. vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  2376. vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  2377. vmovups %xmm4 , (CO1)
  2378. vmovups %xmm7 , 4 * SIZE(CO1)
  2379. vmovups %xmm10, 8 * SIZE(CO1)
  2380. vmovups %xmm13,12 * SIZE(CO1)
  2381. addq $16 * SIZE, CO1 # coffset += 16
  2382. decq I # i --
  2383. jg .L1_11
  2384. ALIGN_4
  2385. /**************************************************************************
  2386. * Rest of M
  2387. ***************************************************************************/
  2388. .L1_20:
  2389. // Test rest of M
  2390. testq $15, M
  2391. jz .L999
  2392. testq $8, M
  2393. jz .L1_21pre
  2394. ALIGN_4
  2395. /**************************************************************************/
  2396. .L1_20_1:
  2397. leaq BUFFER1, BO // first buffer to BO
  2398. addq $2 * SIZE, BO
  2399. vzeroall
  2400. movq K, %rax
  2401. andq $-8, %rax
  2402. je .L1_20_6
  2403. movq %rax, BI // Index for BO
  2404. salq $3, %rax // rax = rax * 8 ; number of values
  2405. leaq (AO, %rax, SIZE), AO
  2406. leaq (BO, BI, SIZE), BO
  2407. negq BI
  2408. negq %rax
  2409. ALIGN_4
  2410. .L1_20_2:
  2411. prefetcht0 B_PR1(BO,BI, SIZE)
  2412. KERNEL8x1_1(xxx)
  2413. KERNEL8x1_2(xxx)
  2414. KERNEL8x1_3(xxx)
  2415. KERNEL8x1_4(xxx)
  2416. KERNEL8x1_1(xxx)
  2417. KERNEL8x1_2(xxx)
  2418. KERNEL8x1_3(xxx)
  2419. KERNEL8x1_4(xxx)
  2420. je .L1_20_6
  2421. KERNEL8x1_1(xxx)
  2422. KERNEL8x1_2(xxx)
  2423. KERNEL8x1_3(xxx)
  2424. KERNEL8x1_4(xxx)
  2425. KERNEL8x1_1(xxx)
  2426. KERNEL8x1_2(xxx)
  2427. KERNEL8x1_3(xxx)
  2428. KERNEL8x1_4(xxx)
  2429. je .L1_20_6
  2430. jmp .L1_20_2
  2431. ALIGN_4
  2432. .L1_20_6:
  2433. movq K, %rax
2434. andq $7, %rax # if (k & 7)
  2435. je .L1_20_9
  2436. movq %rax, BI // Index for BO
  2437. salq $3, %rax // rax = rax * 8 ; number of values
  2438. leaq (AO, %rax, SIZE), AO
  2439. leaq (BO, BI, SIZE), BO
  2440. negq BI
  2441. negq %rax
  2442. ALIGN_4
  2443. .L1_20_7:
  2444. KERNEL8x1_SUB(xxx)
  2445. addq $1, BI
  2446. addq $8, %rax
  2447. jl .L1_20_7
  2448. ALIGN_4
  2449. .L1_20_9:
  2450. vbroadcastss ALPHA, %xmm0
  2451. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  2452. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2453. vmovups %xmm4 , (CO1)
  2454. vmovups %xmm7 , 4 * SIZE(CO1)
  2455. addq $8 * SIZE, CO1 # coffset += 8
  2456. ALIGN_4
  2457. /**************************************************************************/
  2458. .L1_21pre:
  2459. testq $4, M
  2460. jz .L1_30
  2461. ALIGN_4
  2462. .L1_21:
  2463. leaq BUFFER1, BO // first buffer to BO
  2464. addq $2 * SIZE, BO
  2465. vzeroall
  2466. movq K, %rax
  2467. andq $-8, %rax
  2468. je .L1_26
  2469. movq %rax, BI // Index for BO
  2470. salq $2, %rax // rax = rax * 4 ; number of values
  2471. leaq (AO, %rax, SIZE), AO
  2472. leaq (BO, BI, SIZE), BO
  2473. negq BI
  2474. negq %rax
  2475. ALIGN_4
  2476. .L1_22:
  2477. prefetcht0 B_PR1(BO,BI, SIZE)
  2478. KERNEL4x1_1(xxx)
  2479. KERNEL4x1_2(xxx)
  2480. KERNEL4x1_3(xxx)
  2481. KERNEL4x1_4(xxx)
  2482. KERNEL4x1_1(xxx)
  2483. KERNEL4x1_2(xxx)
  2484. KERNEL4x1_3(xxx)
  2485. KERNEL4x1_4(xxx)
  2486. je .L1_26
  2487. KERNEL4x1_1(xxx)
  2488. KERNEL4x1_2(xxx)
  2489. KERNEL4x1_3(xxx)
  2490. KERNEL4x1_4(xxx)
  2491. KERNEL4x1_1(xxx)
  2492. KERNEL4x1_2(xxx)
  2493. KERNEL4x1_3(xxx)
  2494. KERNEL4x1_4(xxx)
  2495. je .L1_26
  2496. jmp .L1_22
  2497. ALIGN_4
  2498. .L1_26:
  2499. movq K, %rax
2500. andq $7, %rax # if (k & 7)
  2501. je .L1_29
  2502. movq %rax, BI // Index for BO
  2503. salq $2, %rax // rax = rax * 4 ; number of values
  2504. leaq (AO, %rax, SIZE), AO
  2505. leaq (BO, BI, SIZE), BO
  2506. negq BI
  2507. negq %rax
  2508. ALIGN_4
  2509. .L1_27:
  2510. KERNEL4x1_SUB(xxx)
  2511. addq $1, BI
  2512. addq $4, %rax
  2513. jl .L1_27
  2514. ALIGN_4
  2515. .L1_29:
  2516. vbroadcastss ALPHA, %xmm0
  2517. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  2518. vmovups %xmm4 , (CO1)
  2519. addq $4 * SIZE, CO1 # coffset += 4
  2520. ALIGN_4
  2521. .L1_30:
  2522. testq $2, M
  2523. jz .L1_40
  2524. ALIGN_4
  2525. .L1_31:
  2526. leaq BUFFER1, BO // first buffer to BO
  2527. addq $2 * SIZE, BO
  2528. vzeroall
  2529. movq K, %rax
  2530. andq $-8, %rax
  2531. je .L1_36
  2532. movq %rax, BI // Index for BO
  2533. salq $1, %rax // rax = rax *2 ; number of values
  2534. leaq (AO, %rax, SIZE), AO
  2535. leaq (BO, BI, SIZE), BO
  2536. negq BI
  2537. negq %rax
  2538. ALIGN_4
  2539. .L1_32:
  2540. prefetcht0 B_PR1(BO,BI,SIZE)
  2541. KERNEL2x1_1(xxx)
  2542. KERNEL2x1_2(xxx)
  2543. KERNEL2x1_3(xxx)
  2544. KERNEL2x1_4(xxx)
  2545. KERNEL2x1_1(xxx)
  2546. KERNEL2x1_2(xxx)
  2547. KERNEL2x1_3(xxx)
  2548. KERNEL2x1_4(xxx)
  2549. je .L1_36
  2550. KERNEL2x1_1(xxx)
  2551. KERNEL2x1_2(xxx)
  2552. KERNEL2x1_3(xxx)
  2553. KERNEL2x1_4(xxx)
  2554. KERNEL2x1_1(xxx)
  2555. KERNEL2x1_2(xxx)
  2556. KERNEL2x1_3(xxx)
  2557. KERNEL2x1_4(xxx)
  2558. je .L1_36
  2559. jmp .L1_32
  2560. ALIGN_4
  2561. .L1_36:
  2562. movq K, %rax
2563. andq $7, %rax # if (k & 7)
  2564. je .L1_39
  2565. movq %rax, BI // Index for BO
  2566. salq $1, %rax // rax = rax *2 ; number of values
  2567. leaq (AO, %rax, SIZE), AO
  2568. leaq (BO, BI, SIZE), BO
  2569. negq BI
  2570. negq %rax
  2571. ALIGN_4
  2572. .L1_37:
  2573. KERNEL2x1_SUB(xxx)
  2574. addq $1, BI
  2575. addq $2, %rax
  2576. jl .L1_37
  2577. ALIGN_4
  2578. .L1_39:
  2579. vmovss ALPHA, %xmm0
  2580. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  2581. vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8
  2582. vmovss %xmm4 , (CO1)
  2583. vmovss %xmm8 , 1 * SIZE(CO1)
  2584. addq $2 * SIZE, CO1 # coffset += 2
  2585. ALIGN_4
  2586. .L1_40:
  2587. testq $1, M
  2588. jz .L999
  2589. ALIGN_4
  2590. .L1_41:
  2591. leaq BUFFER1, BO // first buffer to BO
  2592. addq $2 * SIZE, BO
  2593. vzeroall
  2594. movq K, %rax
  2595. andq $-8, %rax
  2596. je .L1_46
  2597. movq %rax, BI // Index for BO
  2598. leaq (AO, %rax, SIZE), AO
  2599. leaq (BO, BI, SIZE), BO
  2600. negq BI
  2601. negq %rax
  2602. ALIGN_4
  2603. .L1_42:
  2604. KERNEL1x1_1(xxx)
  2605. KERNEL1x1_2(xxx)
  2606. KERNEL1x1_3(xxx)
  2607. KERNEL1x1_4(xxx)
  2608. KERNEL1x1_1(xxx)
  2609. KERNEL1x1_2(xxx)
  2610. KERNEL1x1_3(xxx)
  2611. KERNEL1x1_4(xxx)
  2612. je .L1_46
  2613. KERNEL1x1_1(xxx)
  2614. KERNEL1x1_2(xxx)
  2615. KERNEL1x1_3(xxx)
  2616. KERNEL1x1_4(xxx)
  2617. KERNEL1x1_1(xxx)
  2618. KERNEL1x1_2(xxx)
  2619. KERNEL1x1_3(xxx)
  2620. KERNEL1x1_4(xxx)
  2621. je .L1_46
  2622. jmp .L1_42
  2623. ALIGN_4
  2624. .L1_46:
  2625. movq K, %rax
2626. andq $7, %rax # if (k & 7)
  2627. je .L1_49
  2628. movq %rax, BI // Index for BO
  2629. leaq (AO, %rax, SIZE), AO
  2630. leaq (BO, BI, SIZE), BO
  2631. negq BI
  2632. negq %rax
  2633. ALIGN_4
  2634. .L1_47:
  2635. KERNEL1x1_SUB(xxx)
  2636. addq $1, BI
  2637. addq $1, %rax
  2638. jl .L1_47
  2639. ALIGN_4
  2640. .L1_49:
  2641. vmovss ALPHA, %xmm0
  2642. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  2643. vmovss %xmm4 , (CO1)
  2644. addq $1 * SIZE, CO1 # coffset += 1
  2645. ALIGN_4
  2646. .L999:
  2647. movq SP, %rsp
  2648. movq (%rsp), %rbx
  2649. movq 8(%rsp), %rbp
  2650. movq 16(%rsp), %r12
  2651. movq 24(%rsp), %r13
  2652. movq 32(%rsp), %r14
  2653. movq 40(%rsp), %r15
  2654. #ifdef WINDOWS_ABI
  2655. movq 48(%rsp), %rdi
  2656. movq 56(%rsp), %rsi
  2657. movups 64(%rsp), %xmm6
  2658. movups 80(%rsp), %xmm7
  2659. movups 96(%rsp), %xmm8
  2660. movups 112(%rsp), %xmm9
  2661. movups 128(%rsp), %xmm10
  2662. movups 144(%rsp), %xmm11
  2663. movups 160(%rsp), %xmm12
  2664. movups 176(%rsp), %xmm13
  2665. movups 192(%rsp), %xmm14
  2666. movups 208(%rsp), %xmm15
  2667. #endif
  2668. addq $STACKSIZE, %rsp
  2669. ret
  2670. EPILOGUE
  2671. #else
  2672. /*************************************************************************************
  2673. * TRMM Kernel
  2674. *************************************************************************************/
  2675. PROLOGUE
  2676. PROFCODE
  2677. subq $STACKSIZE, %rsp
  2678. movq %rbx, (%rsp)
  2679. movq %rbp, 8(%rsp)
  2680. movq %r12, 16(%rsp)
  2681. movq %r13, 24(%rsp)
  2682. movq %r14, 32(%rsp)
  2683. movq %r15, 40(%rsp)
  2684. vzeroupper
  2685. #ifdef WINDOWS_ABI
  2686. movq %rdi, 48(%rsp)
  2687. movq %rsi, 56(%rsp)
  2688. movups %xmm6, 64(%rsp)
  2689. movups %xmm7, 80(%rsp)
  2690. movups %xmm8, 96(%rsp)
  2691. movups %xmm9, 112(%rsp)
  2692. movups %xmm10, 128(%rsp)
  2693. movups %xmm11, 144(%rsp)
  2694. movups %xmm12, 160(%rsp)
  2695. movups %xmm13, 176(%rsp)
  2696. movups %xmm14, 192(%rsp)
  2697. movups %xmm15, 208(%rsp)
  2698. movq ARG1, OLD_M
  2699. movq ARG2, OLD_N
  2700. movq ARG3, OLD_K
  2701. movq OLD_A, A
  2702. movq OLD_B, B
  2703. movq OLD_C, C
  2704. movq OLD_LDC, LDC
  2705. #ifdef TRMMKERNEL
  2706. movsd OLD_OFFSET, %xmm12
  2707. #endif
  2708. vmovaps %xmm3, %xmm0
  2709. #else
  2710. movq STACKSIZE + 8(%rsp), LDC
  2711. #ifdef TRMMKERNEL
  2712. movsd STACKSIZE + 16(%rsp), %xmm12
  2713. #endif
  2714. #endif
  2715. movq %rsp, SP # save old stack
  2716. subq $128 + L_BUFFER_SIZE, %rsp
  2717. andq $-4096, %rsp # align stack
  2718. STACK_TOUCH
  2719. cmpq $0, OLD_M
  2720. je .L999
  2721. cmpq $0, OLD_N
  2722. je .L999
  2723. cmpq $0, OLD_K
  2724. je .L999
  2725. movq OLD_M, M
  2726. movq OLD_N, N
  2727. movq OLD_K, K
  2728. vmovsd %xmm0, ALPHA
  2729. salq $BASE_SHIFT, LDC
  2730. movq N, %rax
  2731. xorq %rdx, %rdx
  2732. movq $2, %rdi
2733. divq %rdi // N / 2
2734. movq %rax, Ndiv6 // N / 2
2735. movq %rdx, Nmod6 // N % 2
  2736. #ifdef TRMMKERNEL
  2737. vmovsd %xmm12, OFFSET
  2738. vmovsd %xmm12, KK
  2739. #ifndef LEFT
  2740. negq KK
  2741. #endif
  2742. #endif
  2743. movq Ndiv6, J
  2744. cmpq $0, J
  2745. je .L1_0
  2746. ALIGN_4
  2747. .L2_01:
  2748. // copy to sub buffer
  2749. movq B, BO1
  2750. leaq BUFFER1, BO // first buffer to BO
  2751. movq K, %rax
  2752. ALIGN_4
  2753. .L2_02b:
  2754. vmovsd (BO1), %xmm0
  2755. vmovsd %xmm0, (BO)
  2756. addq $2*SIZE,BO1
  2757. addq $2*SIZE,BO
  2758. decq %rax
  2759. jnz .L2_02b
  2760. .L2_02c:
  2761. movq BO1, B // next offset of B
  2762. .L2_10:
  2763. movq C, CO1
  2764. leaq (C, LDC, 2), C // c += 2 * ldc
  2765. #if defined(TRMMKERNEL) && defined(LEFT)
  2766. movq OFFSET, %rax
  2767. movq %rax, KK
  2768. #endif
  2769. movq A, AO // aoffset = a
  2770. addq $32 * SIZE, AO
  2771. movq M, I
  2772. sarq $4, I // i = (m >> 4)
  2773. je .L2_20
  2774. ALIGN_4
  2775. .L2_11:
  2776. #if !defined(TRMMKERNEL) || \
  2777. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2778. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2779. leaq BUFFER1, BO // first buffer to BO
  2780. addq $4 * SIZE, BO
  2781. #else
  2782. movq KK, %rax
  2783. leaq BUFFER1, BO // first buffer to BO
  2784. addq $4 * SIZE, BO
  2785. movq %rax, BI // Index for BO
  2786. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2787. leaq (BO, BI, SIZE), BO
  2788. salq $4, %rax // rax = rax * 16 ; number of values
  2789. leaq (AO, %rax, SIZE), AO
  2790. #endif
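// TRMM start offset, in element units (sketch): with KK values already
// consumed, the packed-B pointer advances by 2 floats per k and the A pointer
// by 16 floats per k, i.e. roughly BO += KK * 2 and AO += KK * 16; the
// leaq/salq pairs above apply the SIZE scaling.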
  2791. vzeroall
  2792. #ifndef TRMMKERNEL
  2793. movq K, %rax
  2794. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2795. movq K, %rax
  2796. subq KK, %rax
  2797. movq %rax, KKK
  2798. #else
  2799. movq KK, %rax
  2800. #ifdef LEFT
  2801. addq $16, %rax // number of values in AO
  2802. #else
  2803. addq $2, %rax // number of values in BO
  2804. #endif
  2805. movq %rax, KKK
  2806. #endif
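// Inner trip count (sketch): plain GEMM runs over the full K. For TRMM, KKK
// holds the per-tile depth: either K - KK, or KK plus the tile extent
// (16 rows of A / 2 columns of B), depending on the LEFT and TRANSA cases
// selected above.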
  2807. andq $-8, %rax // K = K - ( K % 8 )
  2808. je .L2_16
  2809. movq %rax, BI // Index for BO
  2810. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2811. salq $4, %rax // rax = rax * 16 ; number of values
  2812. leaq (AO, %rax, SIZE), AO
  2813. leaq (BO, BI, SIZE), BO
  2814. negq BI
  2815. negq %rax
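// Main loop shape (sketch): BI and %rax were just negated, so they count up
// towards zero while (BO,BI,SIZE) and (AO,%rax,SIZE) index the packed panels
// from their ends. Each pass through .L2_12 below issues eight unrolled
// k-steps (two groups of KERNEL16x2_1..4) with one B prefetch per group, and
// the embedded "je .L2_16" exits once the counter reaches zero.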
  2816. ALIGN_4
  2817. .L2_12:
  2818. prefetcht0 B_PR1(BO,BI, SIZE)
  2819. KERNEL16x2_1(xxx)
  2820. KERNEL16x2_2(xxx)
  2821. KERNEL16x2_3(xxx)
  2822. KERNEL16x2_4(xxx)
  2823. KERNEL16x2_1(xxx)
  2824. KERNEL16x2_2(xxx)
  2825. KERNEL16x2_3(xxx)
  2826. KERNEL16x2_4(xxx)
  2827. je .L2_16
  2828. prefetcht0 B_PR1(BO,BI, SIZE)
  2829. KERNEL16x2_1(xxx)
  2830. KERNEL16x2_2(xxx)
  2831. KERNEL16x2_3(xxx)
  2832. KERNEL16x2_4(xxx)
  2833. KERNEL16x2_1(xxx)
  2834. KERNEL16x2_2(xxx)
  2835. KERNEL16x2_3(xxx)
  2836. KERNEL16x2_4(xxx)
  2837. je .L2_16
  2838. jmp .L2_12
  2839. ALIGN_4
  2840. .L2_16:
  2841. #ifndef TRMMKERNEL
  2842. movq K, %rax
  2843. #else
  2844. movq KKK, %rax
  2845. #endif
  2846. andq $7, %rax # if (k & 7)
  2847. je .L2_19
  2848. movq %rax, BI // Index for BO
  2849. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2850. salq $4, %rax // rax = rax * 16 ; number of values
  2851. leaq (AO, %rax, SIZE), AO
  2852. leaq (BO, BI, SIZE), BO
  2853. negq BI
  2854. negq %rax
  2855. ALIGN_4
  2856. .L2_17:
  2857. KERNEL16x2_SUB(xxx)
  2858. addq $2, BI
  2859. addq $16, %rax
  2860. jl .L2_17
  2861. ALIGN_4
  2862. .L2_19:
  2863. vbroadcastss ALPHA, %xmm0
  2864. #ifndef TRMMKERNEL
  2865. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  2866. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2867. vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  2868. vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  2869. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  2870. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  2871. vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
  2872. vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
  2873. #else
  2874. vmulps %xmm0, %xmm4,%xmm4
  2875. vmulps %xmm0, %xmm7,%xmm7
  2876. vmulps %xmm0, %xmm10,%xmm10
  2877. vmulps %xmm0, %xmm13,%xmm13
  2878. vmulps %xmm0, %xmm5,%xmm5
  2879. vmulps %xmm0, %xmm8,%xmm8
  2880. vmulps %xmm0, %xmm11,%xmm11
  2881. vmulps %xmm0, %xmm14,%xmm14
  2882. #endif
  2883. vmovups %xmm4 , (CO1)
  2884. vmovups %xmm7 , 4 * SIZE(CO1)
  2885. vmovups %xmm10, 8 * SIZE(CO1)
  2886. vmovups %xmm13,12 * SIZE(CO1)
  2887. vmovups %xmm5 , (CO1, LDC)
  2888. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  2889. vmovups %xmm11, 8 * SIZE(CO1, LDC)
  2890. vmovups %xmm14,12 * SIZE(CO1, LDC)
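// Write-back of the 16x2 tile (sketch): for plain GEMM, vfmaddps folds the
// existing C values in, so the update is C = alpha*A*B + C; for TRMM only
// alpha*A*B is stored. Roughly, per element:
/*
    for (long i = 0; i < 16; i++)
        for (long j = 0; j < 2; j++)
            C[j * ldc + i] = alpha * acc[i][j] + (trmm ? 0.0f : C[j * ldc + i]);
*/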
  2891. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2892. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2893. movq K, %rax
  2894. subq KKK, %rax
  2895. movq %rax, BI // Index for BO
  2896. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2897. leaq (BO, BI, SIZE), BO
  2898. salq $4, %rax // rax = rax * 16 ; number of values
  2899. leaq (AO, %rax, SIZE), AO
  2900. #endif
  2901. #if defined(TRMMKERNEL) && defined(LEFT)
  2902. addq $16, KK
  2903. #endif
  2904. addq $16 * SIZE, CO1 # coffset += 16
  2905. decq I # i --
  2906. jg .L2_11
  2907. ALIGN_4
  2908. /**************************************************************************
  2909. * Rest of M
  2910. ***************************************************************************/
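// M tail (sketch): whatever is left after the 16-wide blocks is handled by
// progressively narrower kernels, one bit of M at a time:
//     if (m & 8) -> 8x2 block
//     if (m & 4) -> 4x2 block
//     if (m & 2) -> 2x2 block
//     if (m & 1) -> 1x2 block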
  2911. .L2_20:
  2912. // Test rest of M
  2913. testq $15, M
  2914. jz .L2_60 // to next 2 lines of N
  2915. testq $8, M
  2916. jz .L2_21pre
  2917. ALIGN_4
  2918. /**************************************************************************/
  2919. .L2_20_1:
  2920. #if !defined(TRMMKERNEL) || \
  2921. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2922. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2923. leaq BUFFER1, BO // first buffer to BO
  2924. addq $4 * SIZE, BO
  2925. #else
  2926. movq KK, %rax
  2927. leaq BUFFER1, BO // first buffer to BO
  2928. addq $4 * SIZE, BO
  2929. movq %rax, BI // Index for BO
  2930. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2931. leaq (BO, BI, SIZE), BO
  2932. salq $3, %rax // rax = rax * 8 ; number of values
  2933. leaq (AO, %rax, SIZE), AO
  2934. #endif
  2935. vzeroall
  2936. #ifndef TRMMKERNEL
  2937. movq K, %rax
  2938. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2939. movq K, %rax
  2940. subq KK, %rax
  2941. movq %rax, KKK
  2942. #else
  2943. movq KK, %rax
  2944. #ifdef LEFT
  2945. addq $8, %rax // number of values in A
  2946. #else
  2947. addq $2, %rax // number of values in BO
  2948. #endif
  2949. movq %rax, KKK
  2950. #endif
  2951. andq $-8, %rax
  2952. je .L2_20_6
  2953. movq %rax, BI // Index for BO
  2954. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2955. salq $3, %rax // rax = rax * 8 ; number of values
  2956. leaq (AO, %rax, SIZE), AO
  2957. leaq (BO, BI, SIZE), BO
  2958. negq BI
  2959. negq %rax
  2960. ALIGN_4
  2961. .L2_20_2:
  2962. prefetcht0 B_PR1(BO,BI, SIZE)
  2963. KERNEL8x2_1(xxx)
  2964. KERNEL8x2_2(xxx)
  2965. KERNEL8x2_3(xxx)
  2966. KERNEL8x2_4(xxx)
  2967. KERNEL8x2_1(xxx)
  2968. KERNEL8x2_2(xxx)
  2969. KERNEL8x2_3(xxx)
  2970. KERNEL8x2_4(xxx)
  2971. je .L2_20_6
  2972. prefetcht0 B_PR1(BO,BI, SIZE)
  2973. KERNEL8x2_1(xxx)
  2974. KERNEL8x2_2(xxx)
  2975. KERNEL8x2_3(xxx)
  2976. KERNEL8x2_4(xxx)
  2977. KERNEL8x2_1(xxx)
  2978. KERNEL8x2_2(xxx)
  2979. KERNEL8x2_3(xxx)
  2980. KERNEL8x2_4(xxx)
  2981. je .L2_20_6
  2982. jmp .L2_20_2
  2983. ALIGN_4
  2984. .L2_20_6:
  2985. #ifndef TRMMKERNEL
  2986. movq K, %rax
  2987. #else
  2988. movq KKK, %rax
  2989. #endif
  2990. andq $7, %rax # if (k & 7)
  2991. je .L2_20_9
  2992. movq %rax, BI // Index for BO
  2993. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2994. salq $3, %rax // rax = rax * 8 ; number of values
  2995. leaq (AO, %rax, SIZE), AO
  2996. leaq (BO, BI, SIZE), BO
  2997. negq BI
  2998. negq %rax
  2999. ALIGN_4
  3000. .L2_20_7:
  3001. KERNEL8x2_SUB(xxx)
  3002. addq $2, BI
  3003. addq $8, %rax
  3004. jl .L2_20_7
  3005. ALIGN_4
  3006. .L2_20_9:
  3007. vbroadcastss ALPHA, %xmm0
  3008. #ifndef TRMMKERNEL
  3009. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  3010. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  3011. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  3012. vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  3013. #else
  3014. vmulps %xmm0, %xmm4,%xmm4
  3015. vmulps %xmm0, %xmm7,%xmm7
  3016. vmulps %xmm0, %xmm5,%xmm5
  3017. vmulps %xmm0, %xmm8,%xmm8
  3018. #endif
  3019. vmovups %xmm4 , (CO1)
  3020. vmovups %xmm7 , 4 * SIZE(CO1)
  3021. vmovups %xmm5 , (CO1, LDC)
  3022. vmovups %xmm8 , 4 * SIZE(CO1, LDC)
  3023. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3024. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3025. movq K, %rax
  3026. subq KKK, %rax
  3027. movq %rax, BI // Index for BO
  3028. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3029. leaq (BO, BI, SIZE), BO
  3030. salq $3, %rax // rax = rax * 8 ; number of values
  3031. leaq (AO, %rax, SIZE), AO
  3032. #endif
  3033. #if defined(TRMMKERNEL) && defined(LEFT)
  3034. addq $8, KK
  3035. #endif
  3036. addq $8 * SIZE, CO1 # coffset += 8
  3037. ALIGN_4
  3038. /**************************************************************************/
  3039. .L2_21pre:
  3040. testq $4, M
  3041. jz .L2_30
  3042. ALIGN_4
  3043. .L2_21:
  3044. #if !defined(TRMMKERNEL) || \
  3045. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3046. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3047. leaq BUFFER1, BO // first buffer to BO
  3048. addq $4 * SIZE, BO
  3049. #else
  3050. movq KK, %rax
  3051. leaq BUFFER1, BO // first buffer to BO
  3052. addq $4 * SIZE, BO
  3053. movq %rax, BI // Index for BO
  3054. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3055. leaq (BO, BI, SIZE), BO
  3056. salq $2, %rax // rax = rax * 4 ; number of values
  3057. leaq (AO, %rax, SIZE), AO
  3058. #endif
  3059. vzeroall
  3060. #ifndef TRMMKERNEL
  3061. movq K, %rax
  3062. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3063. movq K, %rax
  3064. subq KK, %rax
  3065. movq %rax, KKK
  3066. #else
  3067. movq KK, %rax
  3068. #ifdef LEFT
  3069. addq $4, %rax // number of values in A
  3070. #else
  3071. addq $2, %rax // number of values in BO
  3072. #endif
  3073. movq %rax, KKK
  3074. #endif
  3075. andq $-8, %rax
  3076. je .L2_26
  3077. movq %rax, BI // Index for BO
  3078. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3079. salq $2, %rax // rax = rax * 4 ; number of values
  3080. leaq (AO, %rax, SIZE), AO
  3081. leaq (BO, BI, SIZE), BO
  3082. negq BI
  3083. negq %rax
  3084. ALIGN_4
  3085. .L2_22:
  3086. prefetcht0 B_PR1(BO,BI, SIZE)
  3087. KERNEL4x2_1(xxx)
  3088. KERNEL4x2_2(xxx)
  3089. KERNEL4x2_3(xxx)
  3090. KERNEL4x2_4(xxx)
  3091. KERNEL4x2_1(xxx)
  3092. KERNEL4x2_2(xxx)
  3093. KERNEL4x2_3(xxx)
  3094. KERNEL4x2_4(xxx)
  3095. je .L2_26
  3096. prefetcht0 B_PR1(BO,BI, SIZE)
  3097. KERNEL4x2_1(xxx)
  3098. KERNEL4x2_2(xxx)
  3099. KERNEL4x2_3(xxx)
  3100. KERNEL4x2_4(xxx)
  3101. KERNEL4x2_1(xxx)
  3102. KERNEL4x2_2(xxx)
  3103. KERNEL4x2_3(xxx)
  3104. KERNEL4x2_4(xxx)
  3105. je .L2_26
  3106. jmp .L2_22
  3107. ALIGN_4
  3108. .L2_26:
  3109. #ifndef TRMMKERNEL
  3110. movq K, %rax
  3111. #else
  3112. movq KKK, %rax
  3113. #endif
  3114. andq $7, %rax # if (k & 7)
  3115. je .L2_29
  3116. movq %rax, BI // Index for BO
  3117. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3118. salq $2, %rax // rax = rax * 4 ; number of values
  3119. leaq (AO, %rax, SIZE), AO
  3120. leaq (BO, BI, SIZE), BO
  3121. negq BI
  3122. negq %rax
  3123. ALIGN_4
  3124. .L2_27:
  3125. KERNEL4x2_SUB(xxx)
  3126. addq $2, BI
  3127. addq $4, %rax
  3128. jl .L2_27
  3129. ALIGN_4
  3130. .L2_29:
  3131. vbroadcastss ALPHA, %xmm0
  3132. #ifndef TRMMKERNEL
  3133. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  3134. vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5
  3135. #else
  3136. vmulps %xmm0, %xmm4,%xmm4
  3137. vmulps %xmm0, %xmm5,%xmm5
  3138. #endif
  3139. vmovups %xmm4 , (CO1)
  3140. vmovups %xmm5 , (CO1, LDC)
  3141. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3142. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3143. movq K, %rax
  3144. subq KKK, %rax
  3145. movq %rax, BI // Index for BO
  3146. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3147. leaq (BO, BI, SIZE), BO
  3148. salq $2, %rax // rax = rax * 4 ; number of values
  3149. leaq (AO, %rax, SIZE), AO
  3150. #endif
  3151. #if defined(TRMMKERNEL) && defined(LEFT)
  3152. addq $4, KK
  3153. #endif
  3154. addq $4 * SIZE, CO1 # coffset += 4
  3155. ALIGN_4
  3156. .L2_30:
  3157. testq $2, M
  3158. jz .L2_40
  3159. ALIGN_4
  3160. .L2_31:
  3161. #if !defined(TRMMKERNEL) || \
  3162. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3163. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3164. leaq BUFFER1, BO // first buffer to BO
  3165. addq $4 * SIZE, BO
  3166. #else
  3167. movq KK, %rax
  3168. leaq BUFFER1, BO // first buffer to BO
  3169. addq $4 * SIZE, BO
  3170. movq %rax, BI // Index for BO
  3171. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3172. leaq (BO, BI, SIZE), BO
  3173. salq $1, %rax // rax = rax * 2 ; number of values
  3174. leaq (AO, %rax, SIZE), AO
  3175. #endif
  3176. vzeroall
  3177. #ifndef TRMMKERNEL
  3178. movq K, %rax
  3179. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3180. movq K, %rax
  3181. subq KK, %rax
  3182. movq %rax, KKK
  3183. #else
  3184. movq KK, %rax
  3185. #ifdef LEFT
  3186. addq $2, %rax // number of values in AO
  3187. #else
  3188. addq $2, %rax // number of values in BO
  3189. #endif
  3190. movq %rax, KKK
  3191. #endif
  3192. andq $-8, %rax
  3193. je .L2_36
  3194. movq %rax, BI // Index for BO
  3195. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3196. salq $1, %rax // rax = rax * 2 ; number of values
  3197. leaq (AO, %rax, SIZE), AO
  3198. leaq (BO, BI, SIZE), BO
  3199. negq BI
  3200. negq %rax
  3201. ALIGN_4
  3202. .L2_32:
  3203. prefetcht0 B_PR1(BO,BI,SIZE)
  3204. KERNEL2x2_1(xxx)
  3205. KERNEL2x2_2(xxx)
  3206. KERNEL2x2_3(xxx)
  3207. KERNEL2x2_4(xxx)
  3208. KERNEL2x2_1(xxx)
  3209. KERNEL2x2_2(xxx)
  3210. KERNEL2x2_3(xxx)
  3211. KERNEL2x2_4(xxx)
  3212. je .L2_36
  3213. prefetcht0 B_PR1(BO,BI,SIZE)
  3214. KERNEL2x2_1(xxx)
  3215. KERNEL2x2_2(xxx)
  3216. KERNEL2x2_3(xxx)
  3217. KERNEL2x2_4(xxx)
  3218. KERNEL2x2_1(xxx)
  3219. KERNEL2x2_2(xxx)
  3220. KERNEL2x2_3(xxx)
  3221. KERNEL2x2_4(xxx)
  3222. je .L2_36
  3223. jmp .L2_32
  3224. ALIGN_4
  3225. .L2_36:
  3226. #ifndef TRMMKERNEL
  3227. movq K, %rax
  3228. #else
  3229. movq KKK, %rax
  3230. #endif
  3231. andq $7, %rax # if (k & 7)
  3232. je .L2_39
  3233. movq %rax, BI // Index for BO
  3234. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3235. salq $1, %rax // rax = rax * 2 ; number of values
  3236. leaq (AO, %rax, SIZE), AO
  3237. leaq (BO, BI, SIZE), BO
  3238. negq BI
  3239. negq %rax
  3240. ALIGN_4
  3241. .L2_37:
  3242. KERNEL2x2_SUB(xxx)
  3243. addq $2, BI
  3244. addq $2, %rax
  3245. jl .L2_37
  3246. ALIGN_4
  3247. .L2_39:
  3248. vmovss ALPHA, %xmm0
  3249. #ifndef TRMMKERNEL
  3250. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  3251. vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8
  3252. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  3253. vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10
  3254. #else
  3255. vmulss %xmm0, %xmm4,%xmm4
  3256. vmulss %xmm0, %xmm8,%xmm8
  3257. vmulss %xmm0, %xmm5,%xmm5
  3258. vmulss %xmm0, %xmm10,%xmm10
  3259. #endif
  3260. vmovss %xmm4 , (CO1)
  3261. vmovss %xmm8 , 1 * SIZE(CO1)
  3262. vmovss %xmm5 , (CO1, LDC)
  3263. vmovss %xmm10, 1 * SIZE(CO1, LDC)
  3264. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3265. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3266. movq K, %rax
  3267. subq KKK, %rax
  3268. movq %rax, BI // Index for BO
  3269. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3270. leaq (BO, BI, SIZE), BO
  3271. salq $1, %rax // rax = rax * 2 ; number of values
  3272. leaq (AO, %rax, SIZE), AO
  3273. #endif
  3274. #if defined(TRMMKERNEL) && defined(LEFT)
  3275. addq $2, KK
  3276. #endif
  3277. addq $2 * SIZE, CO1 # coffset += 2
  3278. ALIGN_4
  3279. .L2_40:
  3280. testq $1, M
  3281. jz .L2_60 // to next 2 lines of N
  3282. ALIGN_4
  3283. .L2_41:
  3284. #if !defined(TRMMKERNEL) || \
  3285. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3286. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3287. leaq BUFFER1, BO // first buffer to BO
  3288. addq $4 * SIZE, BO
  3289. #else
  3290. movq KK, %rax
  3291. leaq BUFFER1, BO // first buffer to BO
  3292. addq $4 * SIZE, BO
  3293. movq %rax, BI // Index for BO
  3294. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3295. leaq (BO, BI, SIZE), BO
  3296. leaq (AO, %rax, SIZE), AO
  3297. #endif
  3298. vzeroall
  3299. #ifndef TRMMKERNEL
  3300. movq K, %rax
  3301. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3302. movq K, %rax
  3303. subq KK, %rax
  3304. movq %rax, KKK
  3305. #else
  3306. movq KK, %rax
  3307. #ifdef LEFT
  3308. addq $1, %rax // number of values in AO
  3309. #else
  3310. addq $2, %rax // number of values in BO
  3311. #endif
  3312. movq %rax, KKK
  3313. #endif
  3314. andq $-8, %rax
  3315. je .L2_46
  3316. movq %rax, BI // Index for BO
  3317. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3318. leaq (AO, %rax, SIZE), AO
  3319. leaq (BO, BI, SIZE), BO
  3320. negq BI
  3321. negq %rax
  3322. ALIGN_4
  3323. .L2_42:
  3324. KERNEL1x2_1(xxx)
  3325. KERNEL1x2_2(xxx)
  3326. KERNEL1x2_3(xxx)
  3327. KERNEL1x2_4(xxx)
  3328. KERNEL1x2_1(xxx)
  3329. KERNEL1x2_2(xxx)
  3330. KERNEL1x2_3(xxx)
  3331. KERNEL1x2_4(xxx)
  3332. je .L2_46
  3333. KERNEL1x2_1(xxx)
  3334. KERNEL1x2_2(xxx)
  3335. KERNEL1x2_3(xxx)
  3336. KERNEL1x2_4(xxx)
  3337. KERNEL1x2_1(xxx)
  3338. KERNEL1x2_2(xxx)
  3339. KERNEL1x2_3(xxx)
  3340. KERNEL1x2_4(xxx)
  3341. je .L2_46
  3342. jmp .L2_42
  3343. ALIGN_4
  3344. .L2_46:
  3345. #ifndef TRMMKERNEL
  3346. movq K, %rax
  3347. #else
  3348. movq KKK, %rax
  3349. #endif
  3350. andq $7, %rax # if (k & 7)
  3351. je .L2_49
  3352. movq %rax, BI // Index for BO
  3353. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3354. leaq (AO, %rax, SIZE), AO
  3355. leaq (BO, BI, SIZE), BO
  3356. negq BI
  3357. negq %rax
  3358. ALIGN_4
  3359. .L2_47:
  3360. KERNEL1x2_SUB(xxx)
  3361. addq $2, BI
  3362. addq $1, %rax
  3363. jl .L2_47
  3364. ALIGN_4
  3365. .L2_49:
  3366. vmovss ALPHA, %xmm0
  3367. #ifndef TRMMKERNEL
  3368. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  3369. vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5
  3370. #else
  3371. vmulss %xmm0, %xmm4,%xmm4
  3372. vmulss %xmm0, %xmm5,%xmm5
  3373. #endif
  3374. vmovss %xmm4 , (CO1)
  3375. vmovss %xmm5 , (CO1, LDC)
  3376. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3377. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3378. movq K, %rax
  3379. subq KKK, %rax
  3380. movq %rax, BI // Index for BO
  3381. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3382. leaq (BO, BI, SIZE), BO
  3383. leaq (AO, %rax, SIZE), AO
  3384. #endif
  3385. #if defined(TRMMKERNEL) && defined(LEFT)
  3386. addq $1, KK
  3387. #endif
  3388. addq $1 * SIZE, CO1 # coffset += 1
  3389. ALIGN_4
  3390. .L2_60:
  3391. #if defined(TRMMKERNEL) && !defined(LEFT)
  3392. addq $2, KK
  3393. #endif
  3394. decq J // j --
  3395. jg .L2_01 // next 2 lines of N
  3396. .L1_0:
  3397. /************************************************************************************************
  3398. * Loop for Nmod6 % 2 > 0
  3399. *************************************************************************************************/
  3400. movq Nmod6, J
  3401. andq $1, J // j % 2
  3402. je .L999
  3403. ALIGN_4
  3404. .L1_01:
  3405. // copy to sub buffer
  3406. movq B, BO1
  3407. leaq BUFFER1, BO // first buffer to BO
  3408. movq K, %rax
  3409. ALIGN_4
  3410. .L1_02b:
  3411. vmovss (BO1), %xmm0
  3412. vmovss %xmm0, (BO)
  3413. addq $1*SIZE,BO1
  3414. addq $1*SIZE,BO
  3415. decq %rax
  3416. jnz .L1_02b
  3417. .L1_02c:
  3418. movq BO1, B // next offset of B
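// Same packing as .L2_02b above, but only one B value is copied per k since
// this path handles a single trailing column of C.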
  3419. .L1_10:
  3420. movq C, CO1
  3421. leaq (C, LDC, 1), C // c += 1 * ldc
  3422. #if defined(TRMMKERNEL) && defined(LEFT)
  3423. movq OFFSET, %rax
  3424. movq %rax, KK
  3425. #endif
  3426. movq A, AO // aoffset = a
  3427. addq $32 * SIZE, AO
  3428. movq M, I
  3429. sarq $4, I // i = (m >> 4)
  3430. je .L1_20
  3431. ALIGN_4
  3432. .L1_11:
  3433. #if !defined(TRMMKERNEL) || \
  3434. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3435. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3436. leaq BUFFER1, BO // first buffer to BO
  3437. addq $2 * SIZE, BO
  3438. #else
  3439. movq KK, %rax
  3440. leaq BUFFER1, BO // first buffer to BO
  3441. addq $2 * SIZE, BO
  3442. movq %rax, BI // Index for BO
  3443. leaq (BO, BI, SIZE), BO
  3444. salq $4, %rax // rax = rax * 16 ; number of values
  3445. leaq (AO, %rax, SIZE), AO
  3446. #endif
  3447. vzeroall
  3448. #ifndef TRMMKERNEL
  3449. movq K, %rax
  3450. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3451. movq K, %rax
  3452. subq KK, %rax
  3453. movq %rax, KKK
  3454. #else
  3455. movq KK, %rax
  3456. #ifdef LEFT
  3457. addq $16, %rax // number of values in AO
  3458. #else
  3459. addq $1, %rax // number of values in BO
  3460. #endif
  3461. movq %rax, KKK
  3462. #endif
  3463. andq $-8, %rax // K = K - ( K % 8 )
  3464. je .L1_16
  3465. movq %rax, BI // Index for BO
  3466. salq $4, %rax // rax = rax * 16 ; number of values
  3467. leaq (AO, %rax, SIZE), AO
  3468. leaq (BO, BI, SIZE), BO
  3469. negq BI
  3470. negq %rax
  3471. ALIGN_4
  3472. .L1_12:
  3473. prefetcht0 B_PR1(BO,BI, SIZE)
  3474. KERNEL16x1_1(xxx)
  3475. KERNEL16x1_2(xxx)
  3476. KERNEL16x1_3(xxx)
  3477. KERNEL16x1_4(xxx)
  3478. KERNEL16x1_1(xxx)
  3479. KERNEL16x1_2(xxx)
  3480. KERNEL16x1_3(xxx)
  3481. KERNEL16x1_4(xxx)
  3482. je .L1_16
  3483. KERNEL16x1_1(xxx)
  3484. KERNEL16x1_2(xxx)
  3485. KERNEL16x1_3(xxx)
  3486. KERNEL16x1_4(xxx)
  3487. KERNEL16x1_1(xxx)
  3488. KERNEL16x1_2(xxx)
  3489. KERNEL16x1_3(xxx)
  3490. KERNEL16x1_4(xxx)
  3491. je .L1_16
  3492. jmp .L1_12
  3493. ALIGN_4
  3494. .L1_16:
  3495. #ifndef TRMMKERNEL
  3496. movq K, %rax
  3497. #else
  3498. movq KKK, %rax
  3499. #endif
  3500. andq $7, %rax # if (k & 7)
  3501. je .L1_19
  3502. movq %rax, BI // Index for BO
  3503. salq $4, %rax // rax = rax * 16 ; number of values
  3504. leaq (AO, %rax, SIZE), AO
  3505. leaq (BO, BI, SIZE), BO
  3506. negq BI
  3507. negq %rax
  3508. ALIGN_4
  3509. .L1_17:
  3510. KERNEL16x1_SUB(xxx)
  3511. addq $1, BI
  3512. addq $16, %rax
  3513. jl .L1_17
  3514. ALIGN_4
  3515. .L1_19:
  3516. vbroadcastss ALPHA, %xmm0
  3517. #ifndef TRMMKERNEL
  3518. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  3519. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  3520. vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  3521. vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  3522. #else
  3523. vmulps %xmm0, %xmm4,%xmm4
  3524. vmulps %xmm0, %xmm7,%xmm7
  3525. vmulps %xmm0, %xmm10,%xmm10
  3526. vmulps %xmm0, %xmm13,%xmm13
  3527. #endif
  3528. vmovups %xmm4 , (CO1)
  3529. vmovups %xmm7 , 4 * SIZE(CO1)
  3530. vmovups %xmm10, 8 * SIZE(CO1)
  3531. vmovups %xmm13,12 * SIZE(CO1)
  3532. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3533. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3534. movq K, %rax
  3535. subq KKK, %rax
  3536. movq %rax, BI // Index for BO
  3537. leaq (BO, BI, SIZE), BO
  3538. salq $4, %rax // rax = rax * 16 ; number of values
  3539. leaq (AO, %rax, SIZE), AO
  3540. #endif
  3541. #if defined(TRMMKERNEL) && defined(LEFT)
  3542. addq $16, KK
  3543. #endif
  3544. addq $16 * SIZE, CO1 # coffset += 16
  3545. decq I # i --
  3546. jg .L1_11
  3547. ALIGN_4
  3548. /**************************************************************************
  3549. * Rest of M
  3550. ***************************************************************************/
  3551. .L1_20:
  3552. // Test rest of M
  3553. testq $15, M
  3554. jz .L999
  3555. testq $8, M
  3556. jz .L1_21pre
  3557. ALIGN_4
  3558. /**************************************************************************/
  3559. .L1_20_1:
  3560. #if !defined(TRMMKERNEL) || \
  3561. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3562. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3563. leaq BUFFER1, BO // first buffer to BO
  3564. addq $2 * SIZE, BO
  3565. #else
  3566. movq KK, %rax
  3567. leaq BUFFER1, BO // first buffer to BO
  3568. addq $2 * SIZE, BO
  3569. movq %rax, BI // Index for BO
  3570. leaq (BO, BI, SIZE), BO
  3571. salq $3, %rax // rax = rax * 8 ; number of values
  3572. leaq (AO, %rax, SIZE), AO
  3573. #endif
  3574. vzeroall
  3575. #ifndef TRMMKERNEL
  3576. movq K, %rax
  3577. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3578. movq K, %rax
  3579. subq KK, %rax
  3580. movq %rax, KKK
  3581. #else
  3582. movq KK, %rax
  3583. #ifdef LEFT
  3584. addq $8, %rax // number of values in A
  3585. #else
  3586. addq $1, %rax // number of values in BO
  3587. #endif
  3588. movq %rax, KKK
  3589. #endif
  3590. andq $-8, %rax
  3591. je .L1_20_6
  3592. movq %rax, BI // Index for BO
  3593. salq $3, %rax // rax = rax * 8 ; number of values
  3594. leaq (AO, %rax, SIZE), AO
  3595. leaq (BO, BI, SIZE), BO
  3596. negq BI
  3597. negq %rax
  3598. ALIGN_4
  3599. .L1_20_2:
  3600. prefetcht0 B_PR1(BO,BI, SIZE)
  3601. KERNEL8x1_1(xxx)
  3602. KERNEL8x1_2(xxx)
  3603. KERNEL8x1_3(xxx)
  3604. KERNEL8x1_4(xxx)
  3605. KERNEL8x1_1(xxx)
  3606. KERNEL8x1_2(xxx)
  3607. KERNEL8x1_3(xxx)
  3608. KERNEL8x1_4(xxx)
  3609. je .L1_20_6
  3610. KERNEL8x1_1(xxx)
  3611. KERNEL8x1_2(xxx)
  3612. KERNEL8x1_3(xxx)
  3613. KERNEL8x1_4(xxx)
  3614. KERNEL8x1_1(xxx)
  3615. KERNEL8x1_2(xxx)
  3616. KERNEL8x1_3(xxx)
  3617. KERNEL8x1_4(xxx)
  3618. je .L1_20_6
  3619. jmp .L1_20_2
  3620. ALIGN_4
  3621. .L1_20_6:
  3622. #ifndef TRMMKERNEL
  3623. movq K, %rax
  3624. #else
  3625. movq KKK, %rax
  3626. #endif
  3627. andq $7, %rax # if (k & 7)
  3628. je .L1_20_9
  3629. movq %rax, BI // Index for BO
  3630. salq $3, %rax // rax = rax * 8 ; number of values
  3631. leaq (AO, %rax, SIZE), AO
  3632. leaq (BO, BI, SIZE), BO
  3633. negq BI
  3634. negq %rax
  3635. ALIGN_4
  3636. .L1_20_7:
  3637. KERNEL8x1_SUB(xxx)
  3638. addq $1, BI
  3639. addq $8, %rax
  3640. jl .L1_20_7
  3641. ALIGN_4
  3642. .L1_20_9:
  3643. vbroadcastss ALPHA, %xmm0
  3644. #ifndef TRMMKERNEL
  3645. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  3646. vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  3647. #else
  3648. vmulps %xmm0, %xmm4,%xmm4
  3649. vmulps %xmm0, %xmm7,%xmm7
  3650. #endif
  3651. vmovups %xmm4 , (CO1)
  3652. vmovups %xmm7 , 4 * SIZE(CO1)
  3653. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3654. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3655. movq K, %rax
  3656. subq KKK, %rax
  3657. movq %rax, BI // Index for BO
  3658. leaq (BO, BI, SIZE), BO
  3659. salq $3, %rax // rax = rax * 8 ; number of values
  3660. leaq (AO, %rax, SIZE), AO
  3661. #endif
  3662. #if defined(TRMMKERNEL) && defined(LEFT)
  3663. addq $8, KK
  3664. #endif
  3665. addq $8 * SIZE, CO1 # coffset += 8
  3666. ALIGN_4
  3667. /**************************************************************************/
  3668. .L1_21pre:
  3669. testq $4, M
  3670. jz .L1_30
  3671. ALIGN_4
  3672. .L1_21:
  3673. #if !defined(TRMMKERNEL) || \
  3674. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3675. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3676. leaq BUFFER1, BO // first buffer to BO
  3677. addq $2 * SIZE, BO
  3678. #else
  3679. movq KK, %rax
  3680. leaq BUFFER1, BO // first buffer to BO
  3681. addq $2 * SIZE, BO
  3682. movq %rax, BI // Index for BO
  3683. leaq (BO, BI, SIZE), BO
  3684. salq $2, %rax // rax = rax * 4 ; number of values
  3685. leaq (AO, %rax, SIZE), AO
  3686. #endif
  3687. vzeroall
  3688. #ifndef TRMMKERNEL
  3689. movq K, %rax
  3690. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3691. movq K, %rax
  3692. subq KK, %rax
  3693. movq %rax, KKK
  3694. #else
  3695. movq KK, %rax
  3696. #ifdef LEFT
  3697. addq $4, %rax // number of values in A
  3698. #else
  3699. addq $1, %rax // number of values in BO
  3700. #endif
  3701. movq %rax, KKK
  3702. #endif
  3703. andq $-8, %rax
  3704. je .L1_26
  3705. movq %rax, BI // Index for BO
  3706. salq $2, %rax // rax = rax * 4 ; number of values
  3707. leaq (AO, %rax, SIZE), AO
  3708. leaq (BO, BI, SIZE), BO
  3709. negq BI
  3710. negq %rax
  3711. ALIGN_4
  3712. .L1_22:
  3713. prefetcht0 B_PR1(BO,BI, SIZE)
  3714. KERNEL4x1_1(xxx)
  3715. KERNEL4x1_2(xxx)
  3716. KERNEL4x1_3(xxx)
  3717. KERNEL4x1_4(xxx)
  3718. KERNEL4x1_1(xxx)
  3719. KERNEL4x1_2(xxx)
  3720. KERNEL4x1_3(xxx)
  3721. KERNEL4x1_4(xxx)
  3722. je .L1_26
  3723. KERNEL4x1_1(xxx)
  3724. KERNEL4x1_2(xxx)
  3725. KERNEL4x1_3(xxx)
  3726. KERNEL4x1_4(xxx)
  3727. KERNEL4x1_1(xxx)
  3728. KERNEL4x1_2(xxx)
  3729. KERNEL4x1_3(xxx)
  3730. KERNEL4x1_4(xxx)
  3731. je .L1_26
  3732. jmp .L1_22
  3733. ALIGN_4
  3734. .L1_26:
  3735. #ifndef TRMMKERNEL
  3736. movq K, %rax
  3737. #else
  3738. movq KKK, %rax
  3739. #endif
  3740. andq $7, %rax # if (k & 7)
  3741. je .L1_29
  3742. movq %rax, BI // Index for BO
  3743. salq $2, %rax // rax = rax * 4 ; number of values
  3744. leaq (AO, %rax, SIZE), AO
  3745. leaq (BO, BI, SIZE), BO
  3746. negq BI
  3747. negq %rax
  3748. ALIGN_4
  3749. .L1_27:
  3750. KERNEL4x1_SUB(xxx)
  3751. addq $1, BI
  3752. addq $4, %rax
  3753. jl .L1_27
  3754. ALIGN_4
  3755. .L1_29:
  3756. vbroadcastss ALPHA, %xmm0
  3757. #ifndef TRMMKERNEL
  3758. vfmaddps (CO1),%xmm0, %xmm4,%xmm4
  3759. #else
  3760. vmulps %xmm0, %xmm4,%xmm4
  3761. #endif
  3762. vmovups %xmm4 , (CO1)
  3763. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3764. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3765. movq K, %rax
  3766. subq KKK, %rax
  3767. movq %rax, BI // Index for BO
  3768. leaq (BO, BI, SIZE), BO
  3769. salq $2, %rax // rax = rax * 4 ; number of values
  3770. leaq (AO, %rax, SIZE), AO
  3771. #endif
  3772. #if defined(TRMMKERNEL) && defined(LEFT)
  3773. addq $4, KK
  3774. #endif
  3775. addq $4 * SIZE, CO1 # coffset += 4
  3776. ALIGN_4
  3777. .L1_30:
  3778. testq $2, M
  3779. jz .L1_40
  3780. ALIGN_4
  3781. .L1_31:
  3782. #if !defined(TRMMKERNEL) || \
  3783. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3784. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3785. leaq BUFFER1, BO // first buffer to BO
  3786. addq $2 * SIZE, BO
  3787. #else
  3788. movq KK, %rax
  3789. leaq BUFFER1, BO // first buffer to BO
  3790. addq $2 * SIZE, BO
  3791. movq %rax, BI // Index for BO
  3792. leaq (BO, BI, SIZE), BO
  3793. salq $1, %rax // rax = rax * 2 ; number of values
  3794. leaq (AO, %rax, SIZE), AO
  3795. #endif
  3796. vzeroall
  3797. #ifndef TRMMKERNEL
  3798. movq K, %rax
  3799. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3800. movq K, %rax
  3801. subq KK, %rax
  3802. movq %rax, KKK
  3803. #else
  3804. movq KK, %rax
  3805. #ifdef LEFT
  3806. addq $2, %rax // number of values in AO
  3807. #else
  3808. addq $1, %rax // number of values in BO
  3809. #endif
  3810. movq %rax, KKK
  3811. #endif
  3812. andq $-8, %rax
  3813. je .L1_36
  3814. movq %rax, BI // Index for BO
  3815. salq $1, %rax // rax = rax * 2 ; number of values
  3816. leaq (AO, %rax, SIZE), AO
  3817. leaq (BO, BI, SIZE), BO
  3818. negq BI
  3819. negq %rax
  3820. ALIGN_4
  3821. .L1_32:
  3822. prefetcht0 B_PR1(BO,BI,SIZE)
  3823. KERNEL2x1_1(xxx)
  3824. KERNEL2x1_2(xxx)
  3825. KERNEL2x1_3(xxx)
  3826. KERNEL2x1_4(xxx)
  3827. KERNEL2x1_1(xxx)
  3828. KERNEL2x1_2(xxx)
  3829. KERNEL2x1_3(xxx)
  3830. KERNEL2x1_4(xxx)
  3831. je .L1_36
  3832. KERNEL2x1_1(xxx)
  3833. KERNEL2x1_2(xxx)
  3834. KERNEL2x1_3(xxx)
  3835. KERNEL2x1_4(xxx)
  3836. KERNEL2x1_1(xxx)
  3837. KERNEL2x1_2(xxx)
  3838. KERNEL2x1_3(xxx)
  3839. KERNEL2x1_4(xxx)
  3840. je .L1_36
  3841. jmp .L1_32
  3842. ALIGN_4
  3843. .L1_36:
  3844. #ifndef TRMMKERNEL
  3845. movq K, %rax
  3846. #else
  3847. movq KKK, %rax
  3848. #endif
  3849. andq $7, %rax # if (k & 7)
  3850. je .L1_39
  3851. movq %rax, BI // Index for BO
  3852. salq $1, %rax // rax = rax * 2 ; number of values
  3853. leaq (AO, %rax, SIZE), AO
  3854. leaq (BO, BI, SIZE), BO
  3855. negq BI
  3856. negq %rax
  3857. ALIGN_4
  3858. .L1_37:
  3859. KERNEL2x1_SUB(xxx)
  3860. addq $1, BI
  3861. addq $2, %rax
  3862. jl .L1_37
  3863. ALIGN_4
  3864. .L1_39:
  3865. vmovss ALPHA, %xmm0
  3866. #ifndef TRMMKERNEL
  3867. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  3868. vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8
  3869. #else
  3870. vmulss %xmm0, %xmm4,%xmm4
  3871. vmulss %xmm0, %xmm8,%xmm8
  3872. #endif
  3873. vmovss %xmm4 , (CO1)
  3874. vmovss %xmm8 , 1 * SIZE(CO1)
  3875. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3876. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3877. movq K, %rax
  3878. subq KKK, %rax
  3879. movq %rax, BI // Index for BO
  3880. leaq (BO, BI, SIZE), BO
  3881. salq $1, %rax // rax = rax * 2 ; number of values
  3882. leaq (AO, %rax, SIZE), AO
  3883. #endif
  3884. #if defined(TRMMKERNEL) && defined(LEFT)
  3885. addq $2, KK
  3886. #endif
  3887. addq $2 * SIZE, CO1 # coffset += 2
  3888. ALIGN_4
  3889. .L1_40:
  3890. testq $1, M
  3891. jz .L999
  3892. ALIGN_4
  3893. .L1_41:
  3894. #if !defined(TRMMKERNEL) || \
  3895. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3896. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3897. leaq BUFFER1, BO // first buffer to BO
  3898. addq $2 * SIZE, BO
  3899. #else
  3900. movq KK, %rax
  3901. leaq BUFFER1, BO // first buffer to BO
  3902. addq $2 * SIZE, BO
  3903. movq %rax, BI // Index for BO
  3904. leaq (BO, BI, SIZE), BO
  3905. leaq (AO, %rax, SIZE), AO
  3906. #endif
  3907. vzeroall
  3908. #ifndef TRMMKERNEL
  3909. movq K, %rax
  3910. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3911. movq K, %rax
  3912. subq KK, %rax
  3913. movq %rax, KKK
  3914. #else
  3915. movq KK, %rax
  3916. #ifdef LEFT
  3917. addq $1, %rax // number of values in AO
  3918. #else
  3919. addq $1, %rax // number of values in BO
  3920. #endif
  3921. movq %rax, KKK
  3922. #endif
  3923. andq $-8, %rax
  3924. je .L1_46
  3925. movq %rax, BI // Index for BO
  3926. leaq (AO, %rax, SIZE), AO
  3927. leaq (BO, BI, SIZE), BO
  3928. negq BI
  3929. negq %rax
  3930. ALIGN_4
  3931. .L1_42:
  3932. KERNEL1x1_1(xxx)
  3933. KERNEL1x1_2(xxx)
  3934. KERNEL1x1_3(xxx)
  3935. KERNEL1x1_4(xxx)
  3936. KERNEL1x1_1(xxx)
  3937. KERNEL1x1_2(xxx)
  3938. KERNEL1x1_3(xxx)
  3939. KERNEL1x1_4(xxx)
  3940. je .L1_46
  3941. KERNEL1x1_1(xxx)
  3942. KERNEL1x1_2(xxx)
  3943. KERNEL1x1_3(xxx)
  3944. KERNEL1x1_4(xxx)
  3945. KERNEL1x1_1(xxx)
  3946. KERNEL1x1_2(xxx)
  3947. KERNEL1x1_3(xxx)
  3948. KERNEL1x1_4(xxx)
  3949. je .L1_46
  3950. jmp .L1_42
  3951. ALIGN_4
  3952. .L1_46:
  3953. #ifndef TRMMKERNEL
  3954. movq K, %rax
  3955. #else
  3956. movq KKK, %rax
  3957. #endif
  3958. andq $7, %rax # if (k & 7)
  3959. je .L1_49
  3960. movq %rax, BI // Index for BO
  3961. leaq (AO, %rax, SIZE), AO
  3962. leaq (BO, BI, SIZE), BO
  3963. negq BI
  3964. negq %rax
  3965. ALIGN_4
  3966. .L1_47:
  3967. KERNEL1x1_SUB(xxx)
  3968. addq $1, BI
  3969. addq $1, %rax
  3970. jl .L1_47
  3971. ALIGN_4
  3972. .L1_49:
  3973. vmovss ALPHA, %xmm0
  3974. #ifndef TRMMKERNEL
  3975. vfmaddss (CO1),%xmm0, %xmm4,%xmm4
  3976. #else
  3977. vmulss %xmm0, %xmm4,%xmm4
  3978. #endif
  3979. vmovss %xmm4 , (CO1)
  3980. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3981. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3982. movq K, %rax
  3983. subq KKK, %rax
  3984. movq %rax, BI // Index for BO
  3985. leaq (BO, BI, SIZE), BO
  3986. leaq (AO, %rax, SIZE), AO
  3987. #endif
  3988. #if defined(TRMMKERNEL) && defined(LEFT)
  3989. addq $1, KK
  3990. #endif
  3991. addq $1 * SIZE, CO1 # coffset += 1
  3992. ALIGN_4
  3993. .L999:
  3994. movq SP, %rsp
  3995. movq (%rsp), %rbx
  3996. movq 8(%rsp), %rbp
  3997. movq 16(%rsp), %r12
  3998. movq 24(%rsp), %r13
  3999. movq 32(%rsp), %r14
  4000. movq 40(%rsp), %r15
  4001. #ifdef WINDOWS_ABI
  4002. movq 48(%rsp), %rdi
  4003. movq 56(%rsp), %rsi
  4004. movups 64(%rsp), %xmm6
  4005. movups 80(%rsp), %xmm7
  4006. movups 96(%rsp), %xmm8
  4007. movups 112(%rsp), %xmm9
  4008. movups 128(%rsp), %xmm10
  4009. movups 144(%rsp), %xmm11
  4010. movups 160(%rsp), %xmm12
  4011. movups 176(%rsp), %xmm13
  4012. movups 192(%rsp), %xmm14
  4013. movups 208(%rsp), %xmm15
  4014. #endif
  4015. addq $STACKSIZE, %rsp
  4016. ret
  4017. EPILOGUE
  4018. #endif