/*********************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/********************************************************************************
* 2014/07/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
* 2013/10/28 Saar
* Parameter:
* ZGEMM_DEFAULT_UNROLL_N 2
* ZGEMM_DEFAULT_UNROLL_M 4
* ZGEMM_DEFAULT_P 256
* ZGEMM_DEFAULT_Q 128
* A_PR1 512
* B_PR1 512
*
* 2014/07/28 Saar
* Performance at 4608x4608x4608:
* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53)
* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100)
* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138)
* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172)
*
********************************************************************************/
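/********************************************************************************
* Outline (summarized from the macros and driver code below):
* AVX2/FMA double-complex GEMM micro-kernel for Haswell.  Each KERNELmxn_SUB
* macro performs one rank-1 update of an m x n tile of C, with m in {4,2,1}
* complex rows and n in {3,2,1} complex columns; the matching SAVEmxn macro
* combines the partial sums, scales them by ALPHA_R/ALPHA_I and writes (or
* accumulates into) C.  The driver walks N in groups of six columns (two
* three-column passes) plus a two/one-column tail, copying the active columns
* of B into the on-stack BUFFER1 first.
********************************************************************************/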
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 320
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define OFFSET 64(%rsp)
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $ 0, 4096 * 4(%rsp);\
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $ 0, 4096 * 3(%rsp);\
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $ 0, 4096 * 2(%rsp);\
movl $ 0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $ 0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
#if defined(BULLDOZER)
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#else
#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0
#endif
#else
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#else
#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0
#endif
#endif
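/*
* Note on the VFMADDPD macros above (roughly): for a complex product
*   (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br)
* the kernels accumulate br*(ar,ai) through VFMADDPD_R and bi*(ar,ai)
* through VFMADDPD_I into separate registers; the SAVE macros later swap
* the VFMADDPD_I accumulator and combine the two with vaddsubpd to obtain
* the real and imaginary parts.  The sign of each FMA (vfmadd231pd vs.
* vfnmadd231pd, or vfmaddpd vs. vfnmaddpd on Bulldozer) is selected per
* conjugation case (NN/NT/TN/TT, RN/RT/CN/CT, NR/NC/TR/TC, and the rest)
* so that the same swap-and-vaddsubpd sequence yields the conjugated variants.
*/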
#define A_PR1 512
#define B_PR1 512
/***************************************************************************************************/
.macro KERNEL4x3_SUB
vmovups (AO), %ymm0
vmovups 4 * SIZE(AO), %ymm1
prefetcht0 A_PR1(AO)
vbroadcastsd (BO), %ymm2
vbroadcastsd 1 * SIZE(BO), %ymm3
VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 )
VFMADDPD_R( %ymm12,%ymm2,%ymm1 )
VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 )
VFMADDPD_I( %ymm13,%ymm3,%ymm1 )
vbroadcastsd 2 * SIZE(BO), %ymm2
vbroadcastsd 3 * SIZE(BO), %ymm3
VFMADDPD_R( %ymm10,%ymm2,%ymm0 )
VFMADDPD_R( %ymm14,%ymm2,%ymm1 )
VFMADDPD_I( %ymm11,%ymm3,%ymm0 )
VFMADDPD_I( %ymm15,%ymm3,%ymm1 )
vbroadcastsd 4 * SIZE(BO), %ymm2
vbroadcastsd 5 * SIZE(BO), %ymm3
VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 )
VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 )
VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 )
VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 )
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
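/*
* KERNEL4x3_SUB register use: %ymm0/%ymm1 hold four complex elements of A
* (eight doubles), %ymm2/%ymm3 hold, in turn, the real and imaginary parts of
* each of the three current B elements, broadcast across the vector.  One
* accumulator pair is used per B column: (%ymm8,%ymm9), (%ymm10,%ymm11) and
* (%ymm4,%ymm5) for the low two rows, (%ymm12,%ymm13), (%ymm14,%ymm15) and
* (%ymm6,%ymm7) for the high two rows; the first register of each pair
* collects the br*a products, the second the bi*a products.
*/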
.macro SAVE4x3
vbroadcastsd ALPHA_R, %ymm0
vbroadcastsd ALPHA_I, %ymm1
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5
vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
vaddsubpd %ymm5 ,%ymm4 , %ymm4
vaddsubpd %ymm7 ,%ymm6 , %ymm6
vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9
vshufpd $ 0x05, %ymm10, %ymm10, %ymm11
vshufpd $ 0x05, %ymm12, %ymm12, %ymm13
vshufpd $ 0x05, %ymm14, %ymm14, %ymm15
vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5
vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7
#else
vaddsubpd %ymm8, %ymm9 ,%ymm9
vaddsubpd %ymm10, %ymm11,%ymm11
vaddsubpd %ymm12, %ymm13,%ymm13
vaddsubpd %ymm14, %ymm15,%ymm15
vaddsubpd %ymm4 , %ymm5 ,%ymm5
vaddsubpd %ymm6 , %ymm7 ,%ymm7
vmovapd %ymm9, %ymm8
vmovapd %ymm11, %ymm10
vmovapd %ymm13, %ymm12
vmovapd %ymm15, %ymm14
vmovapd %ymm5 , %ymm4
vmovapd %ymm7 , %ymm6
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5
vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7
#endif
// multiply with ALPHA_R
vmulpd %ymm8 , %ymm0, %ymm8
vmulpd %ymm10, %ymm0, %ymm10
vmulpd %ymm12, %ymm0, %ymm12
vmulpd %ymm14, %ymm0, %ymm14
vmulpd %ymm4 , %ymm0, %ymm4
vmulpd %ymm6 , %ymm0, %ymm6
// multiply with ALPHA_I
vmulpd %ymm9 , %ymm1, %ymm9
vmulpd %ymm11, %ymm1, %ymm11
vmulpd %ymm13, %ymm1, %ymm13
vmulpd %ymm15, %ymm1, %ymm15
vmulpd %ymm5 , %ymm1, %ymm5
vmulpd %ymm7 , %ymm1, %ymm7
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
vaddsubpd %ymm5 ,%ymm4 , %ymm4
vaddsubpd %ymm7 ,%ymm6 , %ymm6
#ifndef TRMMKERNEL
vaddpd (CO1), %ymm8 , %ymm8
vaddpd 4 * SIZE(CO1), %ymm12, %ymm12
vaddpd (CO1, LDC), %ymm10, %ymm10
vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14
vaddpd (CO1, LDC,2), %ymm4 , %ymm4
vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 4 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 4 * SIZE(CO1, LDC)
vmovups %ymm4 , (CO1, LDC, 2)
vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
.endm
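/*
* SAVE4x3 (and the other SAVE macros) follow the same pattern: swap the
* real/imaginary halves of the bi*a accumulators, combine them with the br*a
* accumulators via vaddsubpd to form the complex products, then apply alpha
* by multiplying with ALPHA_R and ALPHA_I and combining once more with the
* same shuffle/vaddsubpd step.  Unless this is a TRMM kernel the result is
* added to the existing contents of C before being stored.
*/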
/***************************************************************************************************/
.macro KERNEL2x3_SUB
vmovups (AO), %xmm0
vmovups 2 * SIZE(AO), %xmm1
vmovddup (BO), %xmm2
vmovddup 1 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 )
VFMADDPD_R( %xmm12,%xmm2,%xmm1 )
VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 )
VFMADDPD_I( %xmm13,%xmm3,%xmm1 )
vmovddup 2 * SIZE(BO), %xmm2
vmovddup 3 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm10,%xmm2,%xmm0 )
VFMADDPD_R( %xmm14,%xmm2,%xmm1 )
VFMADDPD_I( %xmm11,%xmm3,%xmm0 )
VFMADDPD_I( %xmm15,%xmm3,%xmm1 )
vmovddup 4 * SIZE(BO), %xmm2
vmovddup 5 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 )
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 )
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
.macro SAVE2x3
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5
vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vaddsubpd %xmm5, %xmm4 , %xmm4
vaddsubpd %xmm7, %xmm6 , %xmm6
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
vshufpd $ 0x01, %xmm12, %xmm12, %xmm13
vshufpd $ 0x01, %xmm14, %xmm14, %xmm15
vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5
vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7
#else
vaddsubpd %xmm8, %xmm9 ,%xmm9
vaddsubpd %xmm10, %xmm11,%xmm11
vaddsubpd %xmm12, %xmm13,%xmm13
vaddsubpd %xmm14, %xmm15,%xmm15
vaddsubpd %xmm4, %xmm5 ,%xmm5
vaddsubpd %xmm6, %xmm7 ,%xmm7
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm13, %xmm12
vmovapd %xmm15, %xmm14
vmovapd %xmm5, %xmm4
vmovapd %xmm7, %xmm6
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5
vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm12, %xmm0, %xmm12
vmulpd %xmm14, %xmm0, %xmm14
vmulpd %xmm4 , %xmm0, %xmm4
vmulpd %xmm6 , %xmm0, %xmm6
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm13, %xmm1, %xmm13
vmulpd %xmm15, %xmm1, %xmm15
vmulpd %xmm5 , %xmm1, %xmm5
vmulpd %xmm7 , %xmm1, %xmm7
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vaddsubpd %xmm5, %xmm4 , %xmm4
vaddsubpd %xmm7, %xmm6 , %xmm6
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
vaddpd (CO1, LDC), %xmm10, %xmm10
vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
vaddpd (CO1, LDC,2), %xmm4 , %xmm4
vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 2 * SIZE(CO1, LDC)
vmovups %xmm4 , (CO1, LDC,2)
vmovups %xmm6 , 2 * SIZE(CO1, LDC,2)
.endm
/************************************************************************************************/
.macro KERNEL1x3_SUB
vmovups (AO), %xmm0
vmovddup (BO), %xmm2
vmovddup 1 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm8,%xmm2,%xmm0 )
VFMADDPD_I( %xmm9,%xmm3,%xmm0 )
vmovddup 2 * SIZE(BO), %xmm2
vmovddup 3 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm10,%xmm2,%xmm0 )
VFMADDPD_I( %xmm11,%xmm3,%xmm0 )
vmovddup 4 * SIZE(BO), %xmm2
vmovddup 5 * SIZE(BO), %xmm3
VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 )
VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm
.macro SAVE1x3
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm5, %xmm4 , %xmm4
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vaddsubpd %xmm10,%xmm11, %xmm11
vaddsubpd %xmm4, %xmm5, %xmm5
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm5, %xmm4
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm4 , %xmm0, %xmm4
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm5 , %xmm1, %xmm5
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm5, %xmm4 , %xmm4
#ifndef TRMMKERNEL
vaddpd (CO1) , %xmm8 , %xmm8
vaddpd (CO1, LDC) , %xmm10, %xmm10
vaddpd (CO1, LDC,2) , %xmm4 , %xmm4
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm4 , (CO1, LDC,2)
.endm
/***************************************************************************************************/
.macro KERNEL4x2_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4
vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5
VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6
VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7
VFMADDPD_R( %ymm10,%ymm6,%ymm0 )
VFMADDPD_R( %ymm14,%ymm6,%ymm1 )
VFMADDPD_I( %ymm11,%ymm7,%ymm0 )
VFMADDPD_I( %ymm15,%ymm7,%ymm1 )
addq $ 4, BI
addq $ 8, %rax
.endm
.macro SAVE4x2
vbroadcastsd ALPHA_R, %ymm0
vbroadcastsd ALPHA_I, %ymm1
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9
vshufpd $ 0x05, %ymm10, %ymm10, %ymm11
vshufpd $ 0x05, %ymm12, %ymm12, %ymm13
vshufpd $ 0x05, %ymm14, %ymm14, %ymm15
#else
vaddsubpd %ymm8, %ymm9 ,%ymm9
vaddsubpd %ymm10, %ymm11,%ymm11
vaddsubpd %ymm12, %ymm13,%ymm13
vaddsubpd %ymm14, %ymm15,%ymm15
vmovapd %ymm9, %ymm8
vmovapd %ymm11, %ymm10
vmovapd %ymm13, %ymm12
vmovapd %ymm15, %ymm14
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm11, %ymm11, %ymm11
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
vshufpd $ 0x05, %ymm15, %ymm15, %ymm15
#endif
// multiply with ALPHA_R
vmulpd %ymm8 , %ymm0, %ymm8
vmulpd %ymm10, %ymm0, %ymm10
vmulpd %ymm12, %ymm0, %ymm12
vmulpd %ymm14, %ymm0, %ymm14
// multiply with ALPHA_I
vmulpd %ymm9 , %ymm1, %ymm9
vmulpd %ymm11, %ymm1, %ymm11
vmulpd %ymm13, %ymm1, %ymm13
vmulpd %ymm15, %ymm1, %ymm15
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm11,%ymm10, %ymm10
vaddsubpd %ymm13,%ymm12, %ymm12
vaddsubpd %ymm15,%ymm14, %ymm14
#ifndef TRMMKERNEL
vaddpd (CO1), %ymm8 , %ymm8
vaddpd 4 * SIZE(CO1), %ymm12, %ymm12
vaddpd (CO1, LDC), %ymm10, %ymm10
vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 , 4 * SIZE(CO1)
vmovups %ymm10 , (CO1, LDC)
vmovups %ymm14 , 4 * SIZE(CO1, LDC)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
.endm
/***************************************************************************************************/
.macro KERNEL2x2_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
VFMADDPD_R( %xmm14,%xmm6,%xmm1 )
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
VFMADDPD_I( %xmm15,%xmm7,%xmm1 )
addq $ 4, BI
addq $ 4, %rax
.endm
.macro SAVE2x2
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
vshufpd $ 0x01, %xmm12, %xmm12, %xmm13
vshufpd $ 0x01, %xmm14, %xmm14, %xmm15
#else
vaddsubpd %xmm8, %xmm9 ,%xmm9
vaddsubpd %xmm10, %xmm11,%xmm11
vaddsubpd %xmm12, %xmm13,%xmm13
vaddsubpd %xmm14, %xmm15,%xmm15
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
vmovapd %xmm13, %xmm12
vmovapd %xmm15, %xmm14
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
vshufpd $ 0x01, %xmm15, %xmm15, %xmm15
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
vmulpd %xmm12, %xmm0, %xmm12
vmulpd %xmm14, %xmm0, %xmm14
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vmulpd %xmm13, %xmm1, %xmm13
vmulpd %xmm15, %xmm1, %xmm15
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vaddsubpd %xmm13,%xmm12, %xmm12
vaddsubpd %xmm15,%xmm14, %xmm14
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
vaddpd (CO1, LDC), %xmm10, %xmm10
vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
vmovups %xmm10 , (CO1, LDC)
vmovups %xmm14 , 2 * SIZE(CO1, LDC)
.endm
/************************************************************************************************/
/************************************************************************************************/
.macro KERNEL1x2_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4
vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6
vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7
VFMADDPD_R( %xmm10,%xmm6,%xmm0 )
VFMADDPD_I( %xmm11,%xmm7,%xmm0 )
addq $ 4, BI
addq $ 2, %rax
.endm
.macro SAVE1x2
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm10, %xmm10, %xmm11
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vaddsubpd %xmm10,%xmm11, %xmm11
vmovapd %xmm9, %xmm8
vmovapd %xmm11, %xmm10
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm11, %xmm11, %xmm11
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm10, %xmm0, %xmm10
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm11, %xmm1, %xmm11
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm11,%xmm10, %xmm10
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd (CO1, LDC), %xmm10, %xmm10
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm10 , (CO1, LDC)
.endm
/************************************************************************************************/
.macro KERNEL4x1_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0
vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1
vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4
vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5
VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 )
VFMADDPD_R( %ymm12,%ymm4,%ymm1 )
VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 )
VFMADDPD_I( %ymm13,%ymm5,%ymm1 )
addq $ 2, BI
addq $ 8, %rax
.endm
.macro SAVE4x1
vbroadcastsd ALPHA_R, %ymm0
vbroadcastsd ALPHA_I, %ymm1
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm13,%ymm12 , %ymm12
vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9
vshufpd $ 0x05, %ymm12, %ymm12, %ymm13
#else
vaddsubpd %ymm8, %ymm9 , %ymm9
vaddsubpd %ymm12,%ymm13, %ymm13
vmovapd %ymm9, %ymm8
vmovapd %ymm13, %ymm12
// swap high and low 8 bytes
vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9
vshufpd $ 0x05, %ymm13, %ymm13, %ymm13
#endif
// multiply with ALPHA_R
vmulpd %ymm8 , %ymm0, %ymm8
vmulpd %ymm12, %ymm0, %ymm12
// multiply with ALPHA_I
vmulpd %ymm9 , %ymm1, %ymm9
vmulpd %ymm13, %ymm1, %ymm13
vaddsubpd %ymm9, %ymm8 , %ymm8
vaddsubpd %ymm13, %ymm12, %ymm12
#ifndef TRMMKERNEL
vaddpd (CO1), %ymm8 , %ymm8
vaddpd 4 * SIZE(CO1), %ymm12, %ymm12
#endif
vmovups %ymm8 , (CO1)
vmovups %ymm12 ,4 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL2x1_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1
VFMADDPD_R( %xmm12,%xmm4,%xmm1 )
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
VFMADDPD_I( %xmm13,%xmm5,%xmm1 )
addq $ 2, BI
addq $ 4, %rax
.endm
.macro SAVE2x1
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm13,%xmm12 , %xmm12
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
vshufpd $ 0x01, %xmm12, %xmm12, %xmm13
#else
vaddsubpd %xmm8, %xmm9 , %xmm9
vaddsubpd %xmm12,%xmm13, %xmm13
vmovapd %xmm9, %xmm8
vmovapd %xmm13, %xmm12
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
vshufpd $ 0x01, %xmm13, %xmm13, %xmm13
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
vmulpd %xmm12, %xmm0, %xmm12
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vmulpd %xmm13, %xmm1, %xmm13
vaddsubpd %xmm9, %xmm8 , %xmm8
vaddsubpd %xmm13, %xmm12, %xmm12
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
vaddpd 2 * SIZE(CO1), %xmm12, %xmm12
#endif
vmovups %xmm8 , (CO1)
vmovups %xmm12 , 2 * SIZE(CO1)
.endm
/************************************************************************************************/
.macro KERNEL1x1_SUB
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0
vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4
VFMADDPD_R( %xmm8,%xmm4,%xmm0 )
vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5
VFMADDPD_I( %xmm9,%xmm5,%xmm0 )
addq $ 2, BI
addq $ 2, %rax
.endm
.macro SAVE1x1
vmovddup ALPHA_R, %xmm0
vmovddup ALPHA_I, %xmm1
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
defined(NR) || defined(NC) || defined(TR) || defined(TC)
vaddsubpd %xmm9, %xmm8, %xmm8
vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9
#else
vaddsubpd %xmm8, %xmm9, %xmm9
vmovapd %xmm9, %xmm8
// swap high and low 8 bytes
vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9
#endif
// multiply with ALPHA_R
vmulpd %xmm8 , %xmm0, %xmm8
// multiply with ALPHA_I
vmulpd %xmm9 , %xmm1, %xmm9
vaddsubpd %xmm9 ,%xmm8, %xmm8
#ifndef TRMMKERNEL
vaddpd (CO1), %xmm8 , %xmm8
#endif
vmovups %xmm8 , (CO1)
.endm
/************************************************************************************************/
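/*
* Driver: N is split as N = 6*Ndiv6 + Nmod6.  Each group of six columns is
* handled by two three-column passes (labels .L6_* and .L7_*); each pass
* copies its three B columns into BUFFER1 and then walks M in steps of 4, 2
* and 1 using the KERNELx3_SUB/SAVEx3 macros.  The remaining Nmod6 columns
* fall through to the two-column (.L2_*) and one-column (.L1_*) paths, which
* use the KERNELx2/KERNELx1 macros with BI/%rax indexed addressing.
*/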
#if !defined(TRMMKERNEL)
PROLOGUE
PROFCODE
subq $ STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovsd %xmm0, ALPHA_R
vmovsd %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 6, %rdi
divq %rdi // N / 6
movq %rax, Ndiv6 // N / 6
movq %rdx, Nmod6 // N % 6
/************************************************************************************************/
.L6_00_0:
movq Ndiv6, J
cmpq $ 0, J
je .L2_00_0
ALIGN_4
.L6_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 2 * COMPSIZE
leaq (B, %rax,8), BO2
movq BO2, B // next offset of B
movq K, %rax
ALIGN_4
.L6_00_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups (BO2), %xmm2
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
vmovups %xmm2, 4 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L6_00_02b
.L6_00_02c:
.L6_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L6_2_10
ALIGN_4
/******************************************************************************************************************/
.L6_4_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_4_16
ALIGN_4
.L6_4_12:
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L6_4_16
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L6_4_16
jmp .L6_4_12
ALIGN_4
.L6_4_16:
movq K, %rax
andq $ 7, %rax # if (k & 1)
je .L6_4_19
ALIGN_4
.L6_4_17:
KERNEL4x3_SUB
jnz .L6_4_17
ALIGN_4
.L6_4_19:
SAVE4x3
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L6_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L6_2_10:
testq $ 2, M
jz .L6_2_40 // to next 2 lines of N
.L6_2_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_2_16
ALIGN_4
.L6_2_12:
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L6_2_16
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L6_2_16
jmp .L6_2_12
ALIGN_4
.L6_2_16:
movq K, %rax
andq $ 7, %rax # if (k & 1)
je .L6_2_19
ALIGN_4
.L6_2_17:
KERNEL2x3_SUB
jnz .L6_2_17
ALIGN_4
.L6_2_19:
SAVE2x3
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L6_2_40:
testq $ 1, M
jz .L6_2_60 // to next 2 lines of N
ALIGN_4
.L6_2_41:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L6_2_46
ALIGN_4
.L6_2_42:
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L6_2_46
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L6_2_46
jmp .L6_2_42
ALIGN_4
.L6_2_46:
movq K, %rax
andq $ 7, %rax # if (k & 1)
je .L6_2_49
ALIGN_4
.L6_2_47:
KERNEL1x3_SUB
jnz .L6_2_47
ALIGN_4
.L6_2_49:
SAVE1x3
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L6_2_41
ALIGN_4
.L6_2_60:
/************************************************************************************************/
/************************************************************************************************/
.L7_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
salq $2, %rax // 2 * COMPSIZE
leaq (B, %rax,8), BO2
movq K, %rax
ALIGN_4
.L7_00_02b:
vmovups 2 * SIZE(BO1), %xmm0
vmovups (BO2), %xmm1
vmovups 2 * SIZE(BO2), %xmm2
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
vmovups %xmm2, 4 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO2
addq $ 6*SIZE,BO
decq %rax
jnz .L7_00_02b
.L7_00_02c:
movq BO2, B // next offset of B
.L7_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L7_2_10
ALIGN_4
/******************************************************************************************************************/
.L7_4_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_4_16
ALIGN_4
.L7_4_12:
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L7_4_16
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
KERNEL4x3_SUB
je .L7_4_16
jmp .L7_4_12
ALIGN_4
.L7_4_16:
movq K, %rax
andq $ 7, %rax # if (k & 1)
je .L7_4_19
ALIGN_4
.L7_4_17:
KERNEL4x3_SUB
jnz .L7_4_17
ALIGN_4
.L7_4_19:
SAVE4x3
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L7_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L7_2_10:
testq $ 2, M
jz .L7_2_40 // to next 2 lines of N
.L7_2_11:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_2_16
ALIGN_4
.L7_2_12:
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L7_2_16
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
KERNEL2x3_SUB
je .L7_2_16
jmp .L7_2_12
ALIGN_4
.L7_2_16:
movq K, %rax
andq $ 7, %rax # if (k & 1)
je .L7_2_19
ALIGN_4
.L7_2_17:
KERNEL2x3_SUB
jnz .L7_2_17
ALIGN_4
.L7_2_19:
SAVE2x3
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L7_2_40:
testq $ 1, M
jz .L7_2_60 // to next 2 lines of N
ALIGN_4
.L7_2_41:
leaq BUFFER1, BO // first buffer to BO
vzeroall
movq K, %rax
andq $ -8, %rax // K = K - ( K % 8 )
je .L7_2_46
ALIGN_4
.L7_2_42:
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L7_2_46
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
KERNEL1x3_SUB
je .L7_2_46
jmp .L7_2_42
ALIGN_4
.L7_2_46:
movq K, %rax
andq $ 7, %rax # if (k & 1)
je .L7_2_49
ALIGN_4
.L7_2_47:
KERNEL1x3_SUB
jnz .L7_2_47
ALIGN_4
.L7_2_49:
SAVE1x3
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L7_2_41
ALIGN_4
.L7_2_60:
decq J // j --
jg .L6_00_01 // next 6 lines of N
/************************************************************************************************/
/************************************************************************************************/
.L2_00_0:
movq Nmod6, J
sarq $1, J // j = j / 2
cmpq $ 0, J
je .L1_2_0
ALIGN_4
.L2_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_00_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L2_00_02b
.L2_00_02c:
movq BO1, B // next offset of B
.L2_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L2_2_10
ALIGN_4
/******************************************************************************************************************/
.L2_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
jmp .L2_4_12
ALIGN_4
.L2_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L2_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_17:
KERNEL4x2_SUB
jl .L2_4_17
ALIGN_4
.L2_4_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L2_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L2_2_10:
testq $ 2, M
jz .L2_2_40 // to next 2 lines of N
.L2_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
jmp .L2_2_12
ALIGN_4
.L2_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L2_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_17:
KERNEL2x2_SUB
jl .L2_2_17
ALIGN_4
.L2_2_19:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_2_40:
testq $ 1, M
jz .L2_2_60 // to next 2 lines of N
ALIGN_4
.L2_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
jmp .L2_2_42
ALIGN_4
.L2_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L2_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_47:
KERNEL1x2_SUB
jl .L2_2_47
ALIGN_4
.L2_2_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_2_41
ALIGN_4
.L2_2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_00_01 // next 2 lines of N
.L1_2_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_00_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_00_02b
.L1_00_02c:
movq BO1, B // next offset of B
.L1_00_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L1_2_10
ALIGN_4
/*******************************************************************************************************/
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L1_4_11
ALIGN_4
/*******************************************************************************************************/
.L1_2_10:
testq $ 2, M
jz .L1_2_40
.L1_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
jmp .L1_2_12
ALIGN_4
.L1_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L1_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_17:
KERNEL2x1_SUB
jl .L1_2_17
ALIGN_4
.L1_2_19:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_2_40:
testq $ 1, M
jz .L999
ALIGN_4
.L1_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
jmp .L1_2_42
ALIGN_4
.L1_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L1_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_47:
KERNEL1x1_SUB
jl .L1_2_47
ALIGN_4
.L1_2_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L1_2_41
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#else
/************************************************************************************************
TRMM Kernel
************************************************************************************************/
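/* Summary of the variant below: it mirrors the GEMM path above, but in
 * addition maintains the per-tile K range in KK/KKK (initialised from
 * OFFSET and advanced by the tile size), as selected by the LEFT/TRANSA
 * preprocessor switches. */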
PROLOGUE
PROFCODE
subq $ STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
movsd OLD_OFFSET, %xmm12
#endif
vmovaps %xmm3, %xmm0
vmovsd OLD_ALPHA_I, %xmm1
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
movsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $ 128 + L_BUFFER_SIZE, %rsp
andq $ -4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovsd %xmm0, ALPHA_R
vmovsd %xmm1, ALPHA_I
salq $ ZBASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $ 2, %rdi
divq %rdi // N / 2
movq %rax, Ndiv6 // N / 2
movq %rdx, Nmod6 // N % 2
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
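/* Same N/M tiling as the GEMM kernel: pairs of columns first (.L2_00_xx,
 * .L2_4_xx, .L2_2_xx), then a possible single remaining column
 * (.L1_00_xx, .L1_4_xx, .L1_2_xx). */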
.L2_00_0:
movq Ndiv6, J
cmpq $ 0, J
je .L1_2_0
ALIGN_4
.L2_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L2_00_02b:
vmovups (BO1), %xmm0
vmovups 2 * SIZE(BO1), %xmm1
vmovups %xmm0, (BO)
vmovups %xmm1, 2 * SIZE(BO)
addq $ 4*SIZE,BO1
addq $ 4*SIZE,BO
decq %rax
jnz .L2_00_02b
.L2_00_02c:
movq BO1, B // next offset of B
.L2_00_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L2_2_10
ALIGN_4
/******************************************************************************************************************/
.L2_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI ,SIZE)
KERNEL4x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL4x2_SUB
je .L2_4_16
jmp .L2_4_12
ALIGN_4
.L2_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L2_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_4_17:
KERNEL4x2_SUB
jl .L2_4_17
ALIGN_4
.L2_4_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L2_4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
/******************************************************************************************************************/
.L2_2_10:
testq $ 2, M
jz .L2_2_40 // to next 2 lines of N
.L2_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x2_SUB
KERNEL2x2_SUB
je .L2_2_16
jmp .L2_2_12
ALIGN_4
.L2_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L2_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_17:
KERNEL2x2_SUB
jl .L2_2_17
ALIGN_4
.L2_2_19:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_2_40:
testq $ 1, M
jz .L2_2_60 // to next 2 lines of N
ALIGN_4
.L2_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 8 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L2_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x2_SUB
KERNEL1x2_SUB
je .L2_2_46
jmp .L2_2_42
ALIGN_4
.L2_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L2_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L2_2_47:
KERNEL1x2_SUB
jl .L2_2_47
ALIGN_4
.L2_2_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L2_2_41
ALIGN_4
.L2_2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $ 2, KK
#endif
decq J // j --
jg .L2_00_01 // next 2 lines of N
.L1_2_0:
/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/
movq Nmod6, J
andq $ 1, J // j % 2
je .L999
ALIGN_4
.L1_00_01:
// copy to sub buffer
movq B, BO1
leaq BUFFER1, BO // first buffer to BO
movq K, %rax
ALIGN_4
.L1_00_02b:
vmovups (BO1), %xmm0
vmovups %xmm0, (BO)
addq $ 2*SIZE,BO1
addq $ 2*SIZE,BO
decq %rax
jnz .L1_00_02b
.L1_00_02c:
movq BO1, B // next offset of B
.L1_00_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $ 8 * SIZE, AO
movq M, I
sarq $ 2, I // i = (m >> 2)
je .L1_2_10
ALIGN_4
/*******************************************************************************************************/
.L1_4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 4, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_4_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_12:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
je .L1_4_16
jmp .L1_4_12
ALIGN_4
.L1_4_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L1_4_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_4_17:
KERNEL4x1_SUB
jl .L1_4_17
ALIGN_4
.L1_4_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 3, %rax // rax = rax * 8 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 4, KK
#endif
addq $ 8 * SIZE, CO1 # coffset += 8
decq I # i --
jg .L1_4_11
ALIGN_4
/*******************************************************************************************************/
.L1_2_10:
testq $ 2, M
jz .L1_2_40
.L1_2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 2, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_16
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_12:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
KERNEL2x1_SUB
KERNEL2x1_SUB
je .L1_2_16
jmp .L1_2_12
ALIGN_4
.L1_2_16:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L1_2_19
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_17:
KERNEL2x1_SUB
jl .L1_2_17
ALIGN_4
.L1_2_19:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 2, %rax // rax = rax * 4 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 2, KK
#endif
addq $ 4 * SIZE, CO1 # coffset += 4
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_2_40:
testq $ 1, M
jz .L999
ALIGN_4
.L1_2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
#else
movq KK, %rax
leaq BUFFER1, BO // first buffer to BO
addq $ 4 * SIZE, BO
movq %rax, BI // Index for BO
leaq (,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
vzeroall
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $ 1, %rax // number of values in AO
#else
addq $ 1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
andq $ -8, %rax // K = K - ( K % 8 )
je .L1_2_46
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_42:
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
prefetcht0 A_PR1(AO,%rax,SIZE)
prefetcht0 B_PR1(BO,BI,SIZE)
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
je .L1_2_46
jmp .L1_2_42
ALIGN_4
.L1_2_46:
#ifndef TRMMKERNEL
movq K, %rax
#else
movq KKK, %rax
#endif
andq $ 7, %rax # if (k & 1)
je .L1_2_49
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
leaq (BO, BI, SIZE), BO
negq BI
negq %rax
ALIGN_4
.L1_2_47:
KERNEL1x1_SUB
jl .L1_2_47
ALIGN_4
.L1_2_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
movq %rax, BI // Index for BO
leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
leaq (BO, BI, SIZE), BO
salq $ 1, %rax // rax = rax * 2 ; number of values
leaq (AO, %rax, SIZE), AO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $ 1, KK
#endif
addq $ 2 * SIZE, CO1 # coffset += 2
decq I # i --
jg .L1_2_41
ALIGN_4
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $ STACKSIZE, %rsp
ret
EPILOGUE
#endif