
dgemm_kernel_4x4_haswell.S 67 kB

  1. /*********************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************
  28. * 2013/10/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. *
  34. * 2013/10/27 Saar
  35. * Parameter:
  36. * DGEMM_DEFAULT_UNROLL_N 4
  37. * DGEMM_DEFAULT_UNROLL_M 4
  38. * DGEMM_DEFAULT_P 512
  39. * DGEMM_DEFAULT_Q 256
  40. * A_PR1 512
  41. * B_PR1 512
  42. *
  43. *
  44. * Performance at 9216x9216x9216:
  45. * 1 thread: 53.3 GFLOPS (MKL: 54)
  46. * 2 threads: 100.0 GFLOPS (MKL: 97)
  47. * 3 threads: 147.0 GFLOPS (MKL: 133)
  48. * 4 threads: 184.0 GFLOPS (MKL: 170)
  49. *********************************************************************/
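/*********************************************************************
* Illustrative C sketch (not part of the build): roughly what the
* non-TRMM path of one 4x12 block computes, assuming A is packed in
* 4-row panels and B in groups of 12 columns as arranged by the driver
* below (packedA, packedB, ldc, alpha, K are illustrative names):
*
*   for (long j = 0; j < 12; j++)
*     for (long i = 0; i < 4; i++) {
*       double sum = 0.0;
*       for (long l = 0; l < K; l++)
*         sum += packedA[4*l + i] * packedB[12*l + j];
*       C[i + j*ldc] += alpha * sum;
*     }
*********************************************************************/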
  50. #define ASSEMBLER
  51. #include "common.h"
  52. #define OLD_M %rdi
  53. #define OLD_N %rsi
  54. #define M %r13
  55. #define J %r14
  56. #define OLD_K %rdx
  57. #define A %rcx
  58. #define B %r8
  59. #define C %r9
  60. #define LDC %r10
  61. #define I %r11
  62. #define AO %rdi
  63. #define BO %rsi
  64. #define CO1 %r15
  65. #define K %r12
  66. #define SP %rbx
  67. #define BO1 %rdi
  68. #define BO2 %r15
  69. #define BO3 %rbp
  70. #ifndef WINDOWS_ABI
  71. #define STACKSIZE 96
  72. #define L_BUFFER_SIZE 256*8*12+4096
  73. #else
  74. #define STACKSIZE 256
  75. #define L_BUFFER_SIZE 128*8*12+512
  76. #define OLD_A 40 + STACKSIZE(%rsp)
  77. #define OLD_B 48 + STACKSIZE(%rsp)
  78. #define OLD_C 56 + STACKSIZE(%rsp)
  79. #define OLD_LDC 64 + STACKSIZE(%rsp)
  80. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  81. #endif
  82. #define Ndiv12 24(%rsp)
  83. #define Nmod12 32(%rsp)
  84. #define N 40(%rsp)
  85. #define ALPHA 48(%rsp)
  86. #define OFFSET 56(%rsp)
  87. #define KK 64(%rsp)
  88. #define KKK 72(%rsp)
  89. #define BUFFER1 128(%rsp)
  90. #if defined(OS_WINDOWS)
  91. #if L_BUFFER_SIZE > 16384
  92. #define STACK_TOUCH \
  93. movl $ 0, 4096 * 4(%rsp);\
  94. movl $ 0, 4096 * 3(%rsp);\
  95. movl $ 0, 4096 * 2(%rsp);\
  96. movl $ 0, 4096 * 1(%rsp);
  97. #elif L_BUFFER_SIZE > 12288
  98. #define STACK_TOUCH \
  99. movl $ 0, 4096 * 3(%rsp);\
  100. movl $ 0, 4096 * 2(%rsp);\
  101. movl $ 0, 4096 * 1(%rsp);
  102. #elif L_BUFFER_SIZE > 8192
  103. #define STACK_TOUCH \
  104. movl $ 0, 4096 * 2(%rsp);\
  105. movl $ 0, 4096 * 1(%rsp);
  106. #elif L_BUFFER_SIZE > 4096
  107. #define STACK_TOUCH \
  108. movl $ 0, 4096 * 1(%rsp);
  109. #else
  110. #define STACK_TOUCH
  111. #endif
  112. #else
  113. #define STACK_TOUCH
  114. #endif
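/* STACK_TOUCH: Windows commits stack pages lazily through a guard page, so a
   local buffer larger than one page must be touched in steps of at most 4096
   bytes before use; on other platforms this is unnecessary and the macro
   expands to nothing. */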
  115. #define A_PR1 512
  116. #define B_PR1 512
  117. /*******************************************************************************************
  118. * Macro definitions
  119. *******************************************************************************************/
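/*******************************************************************************************
* Note on the 4xN kernels: rather than broadcasting single elements of A, the
* KERNEL4x12_* / KERNEL4x4_* macros keep four A values in one ymm register and
* obtain the remaining products of each 4x4 outer product by permuting it
* (vpermpd $0xb1 swaps each adjacent pair of doubles, vpermpd $0x1b reverses
* all four), so one 4x4 block of C costs 4 FMAs plus 3 permutes per k instead
* of 4 broadcasts plus 4 FMAs. The accumulators therefore hold C in a permuted
* order; SAVE4x12 / SAVE4x4 scale by alpha and then undo the permutation with
* vpermpd / vblendpd before adding the result to C.
*******************************************************************************************/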
  120. .macro INIT4x12
  121. vxorpd %ymm4 , %ymm4 , %ymm4
  122. vxorpd %ymm5 , %ymm5 , %ymm5
  123. vxorpd %ymm6 , %ymm6 , %ymm6
  124. vxorpd %ymm7 , %ymm7 , %ymm7
  125. vxorpd %ymm8 , %ymm8 , %ymm8
  126. vxorpd %ymm9 , %ymm9 , %ymm9
  127. vxorpd %ymm10, %ymm10, %ymm10
  128. vxorpd %ymm11, %ymm11, %ymm11
  129. vxorpd %ymm12, %ymm12, %ymm12
  130. vxorpd %ymm13, %ymm13, %ymm13
  131. vxorpd %ymm14, %ymm14, %ymm14
  132. vxorpd %ymm15, %ymm15, %ymm15
  133. .endm
  134. .macro KERNEL4x12_I
  135. prefetcht0 A_PR1(AO)
  136. vmovups -12 * SIZE(BO), %ymm1
  137. prefetcht0 B_PR1(BO)
  138. vmovups -16 * SIZE(AO), %ymm0
  139. prefetcht0 B_PR1+64(BO)
  140. vmovups -8 * SIZE(BO), %ymm2
  141. prefetcht0 B_PR1+128(BO)
  142. vmovups -4 * SIZE(BO), %ymm3
  143. vmulpd %ymm0 ,%ymm1 , %ymm4
  144. prefetcht0 B_PR1+192(BO)
  145. vmulpd %ymm0 ,%ymm2 , %ymm8
  146. vmulpd %ymm0 ,%ymm3 , %ymm12
  147. prefetcht0 B_PR1+256(BO)
  148. vpermpd $ 0xb1, %ymm0 , %ymm0
  149. vmulpd %ymm0 ,%ymm1 , %ymm5
  150. vmulpd %ymm0 ,%ymm2 , %ymm9
  151. vmulpd %ymm0 ,%ymm3 , %ymm13
  152. vpermpd $ 0x1b, %ymm0 , %ymm0
  153. vmulpd %ymm0 ,%ymm1 , %ymm6
  154. vmulpd %ymm0 ,%ymm2 , %ymm10
  155. addq $ 12*SIZE, BO
  156. vmulpd %ymm0 ,%ymm3 , %ymm14
  157. vpermpd $ 0xb1, %ymm0 , %ymm0
  158. vmulpd %ymm0 ,%ymm1 , %ymm7
  159. vmovups -12 * SIZE(BO), %ymm1
  160. vmulpd %ymm0 ,%ymm2 , %ymm11
  161. vmovups -8 * SIZE(BO), %ymm2
  162. vmulpd %ymm0 ,%ymm3 , %ymm15
  163. vmovups -4 * SIZE(BO), %ymm3
  164. .endm
  165. .macro KERNEL4x12_M1
  166. prefetcht0 A_PR1(AO)
  167. vmovups -16 * SIZE(AO), %ymm0
  168. prefetcht0 B_PR1(BO)
  169. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  170. prefetcht0 B_PR1+64(BO)
  171. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  172. prefetcht0 B_PR1+128(BO)
  173. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  174. vpermpd $ 0xb1, %ymm0 , %ymm0
  175. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  176. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  177. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  178. vpermpd $ 0x1b, %ymm0 , %ymm0
  179. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  180. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  181. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  182. vpermpd $ 0xb1, %ymm0 , %ymm0
  183. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  184. vmovups -12 * SIZE(BO), %ymm1
  185. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  186. vmovups -8 * SIZE(BO), %ymm2
  187. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  188. vmovups -4 * SIZE(BO), %ymm3
  189. .endm
  190. .macro KERNEL4x12_M2
  191. vmovups -12 * SIZE(AO), %ymm0
  192. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  193. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  194. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  195. vpermpd $ 0xb1, %ymm0 , %ymm0
  196. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  197. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  198. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  199. vpermpd $ 0x1b, %ymm0 , %ymm0
  200. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  201. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  202. addq $ 8*SIZE, AO
  203. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  204. vpermpd $ 0xb1, %ymm0 , %ymm0
  205. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  206. vmovups 0 * SIZE(BO), %ymm1
  207. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  208. vmovups 4 * SIZE(BO), %ymm2
  209. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  210. vmovups 8 * SIZE(BO), %ymm3
  211. addq $ 24*SIZE, BO
  212. .endm
  213. .macro KERNEL4x12_E
  214. vmovups -12 * SIZE(AO), %ymm0
  215. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  216. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  217. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  218. vpermpd $ 0xb1, %ymm0 , %ymm0
  219. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  220. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  221. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  222. vpermpd $ 0x1b, %ymm0 , %ymm0
  223. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  224. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  225. addq $ 8*SIZE, AO
  226. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  227. vpermpd $ 0xb1, %ymm0 , %ymm0
  228. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  229. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  230. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  231. addq $ 12*SIZE, BO
  232. .endm
  233. .macro KERNEL4x12_SUB
  234. vmovups -12 * SIZE(BO), %ymm1
  235. vmovups -16 * SIZE(AO), %ymm0
  236. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  237. vmovups -8 * SIZE(BO), %ymm2
  238. vfmadd231pd %ymm0 ,%ymm2 , %ymm8
  239. vmovups -4 * SIZE(BO), %ymm3
  240. vfmadd231pd %ymm0 ,%ymm3 , %ymm12
  241. vpermpd $ 0xb1, %ymm0 , %ymm0
  242. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  243. vfmadd231pd %ymm0 ,%ymm2 , %ymm9
  244. addq $ 12*SIZE, BO
  245. vfmadd231pd %ymm0 ,%ymm3 , %ymm13
  246. vpermpd $ 0x1b, %ymm0 , %ymm0
  247. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  248. vfmadd231pd %ymm0 ,%ymm2 , %ymm10
  249. addq $ 4*SIZE, AO
  250. vfmadd231pd %ymm0 ,%ymm3 , %ymm14
  251. vpermpd $ 0xb1, %ymm0 , %ymm0
  252. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  253. vfmadd231pd %ymm0 ,%ymm2 , %ymm11
  254. vfmadd231pd %ymm0 ,%ymm3 , %ymm15
  255. .endm
  256. .macro SAVE4x12
  257. vbroadcastsd ALPHA, %ymm0
  258. vmulpd %ymm0 , %ymm4 , %ymm4
  259. vmulpd %ymm0 , %ymm5 , %ymm5
  260. vmulpd %ymm0 , %ymm6 , %ymm6
  261. vmulpd %ymm0 , %ymm7 , %ymm7
  262. vmulpd %ymm0 , %ymm8 , %ymm8
  263. vmulpd %ymm0 , %ymm9 , %ymm9
  264. vmulpd %ymm0 , %ymm10, %ymm10
  265. vmulpd %ymm0 , %ymm11, %ymm11
  266. vmulpd %ymm0 , %ymm12, %ymm12
  267. vmulpd %ymm0 , %ymm13, %ymm13
  268. vmulpd %ymm0 , %ymm14, %ymm14
  269. vmulpd %ymm0 , %ymm15, %ymm15
  270. vpermpd $ 0xb1 , %ymm5, %ymm5
  271. vpermpd $ 0xb1 , %ymm7, %ymm7
  272. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  273. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  274. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  275. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  276. vpermpd $ 0x1b , %ymm2, %ymm2
  277. vpermpd $ 0x1b , %ymm3, %ymm3
  278. vpermpd $ 0xb1 , %ymm2, %ymm2
  279. vpermpd $ 0xb1 , %ymm3, %ymm3
  280. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  281. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  282. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  283. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  284. leaq (CO1, LDC, 2), %rax
  285. #if !defined(TRMMKERNEL)
  286. vaddpd (CO1), %ymm4, %ymm4
  287. vaddpd (CO1, LDC), %ymm5, %ymm5
  288. vaddpd (%rax), %ymm6, %ymm6
  289. vaddpd (%rax, LDC), %ymm7, %ymm7
  290. #endif
  291. vmovups %ymm4 , (CO1)
  292. vmovups %ymm5 , (CO1, LDC)
  293. vmovups %ymm6 , (%rax)
  294. vmovups %ymm7 , (%rax, LDC)
  295. prefetcht0 32(CO1)
  296. prefetcht0 32(CO1,LDC)
  297. prefetcht0 32(%rax)
  298. prefetcht0 32(%rax,LDC)
  299. vpermpd $ 0xb1 , %ymm9 , %ymm9
  300. vpermpd $ 0xb1 , %ymm11, %ymm11
  301. vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
  302. vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
  303. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
  304. vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
  305. vpermpd $ 0x1b , %ymm2, %ymm2
  306. vpermpd $ 0x1b , %ymm3, %ymm3
  307. vpermpd $ 0xb1 , %ymm2, %ymm2
  308. vpermpd $ 0xb1 , %ymm3, %ymm3
  309. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  310. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  311. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  312. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  313. leaq (%rax, LDC, 2), %rax
  314. leaq (%rax, LDC, 2), %rbp
  315. #if !defined(TRMMKERNEL)
  316. vaddpd (%rax), %ymm4, %ymm4
  317. vaddpd (%rax, LDC), %ymm5, %ymm5
  318. vaddpd (%rbp), %ymm6, %ymm6
  319. vaddpd (%rbp, LDC), %ymm7, %ymm7
  320. #endif
  321. vmovups %ymm4 , (%rax)
  322. vmovups %ymm5 , (%rax, LDC)
  323. vmovups %ymm6 , (%rbp)
  324. vmovups %ymm7 , (%rbp, LDC)
  325. prefetcht0 32(%rax)
  326. prefetcht0 32(%rax,LDC)
  327. prefetcht0 32(%rbp)
  328. prefetcht0 32(%rbp,LDC)
  329. vpermpd $ 0xb1 , %ymm13, %ymm13
  330. vpermpd $ 0xb1 , %ymm15, %ymm15
  331. vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0
  332. vblendpd $ 0x05, %ymm13, %ymm12, %ymm1
  333. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2
  334. vblendpd $ 0x05, %ymm15, %ymm14, %ymm3
  335. vpermpd $ 0x1b , %ymm2, %ymm2
  336. vpermpd $ 0x1b , %ymm3, %ymm3
  337. vpermpd $ 0xb1 , %ymm2, %ymm2
  338. vpermpd $ 0xb1 , %ymm3, %ymm3
  339. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  340. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  341. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  342. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  343. leaq (%rax, LDC, 4), %rax
  344. leaq (%rbp, LDC, 4), %rbp
  345. #if !defined(TRMMKERNEL)
  346. vaddpd (%rax), %ymm4, %ymm4
  347. vaddpd (%rax, LDC), %ymm5, %ymm5
  348. vaddpd (%rbp), %ymm6, %ymm6
  349. vaddpd (%rbp, LDC), %ymm7, %ymm7
  350. #endif
  351. vmovups %ymm4 , (%rax)
  352. vmovups %ymm5 , (%rax, LDC)
  353. vmovups %ymm6 , (%rbp)
  354. vmovups %ymm7 , (%rbp, LDC)
  355. prefetcht0 32(%rax)
  356. prefetcht0 32(%rax,LDC)
  357. prefetcht0 32(%rbp)
  358. prefetcht0 32(%rbp,LDC)
  359. addq $ 4*SIZE, CO1
  360. .endm
  361. /******************************************************************************************/
  362. .macro INIT2x12
  363. vxorpd %xmm4 , %xmm4 , %xmm4
  364. vxorpd %xmm5 , %xmm5 , %xmm5
  365. vxorpd %xmm6 , %xmm6 , %xmm6
  366. vxorpd %xmm7 , %xmm7 , %xmm7
  367. vxorpd %xmm8 , %xmm8 , %xmm8
  368. vxorpd %xmm9 , %xmm9 , %xmm9
  369. vxorpd %xmm10, %xmm10, %xmm10
  370. vxorpd %xmm11, %xmm11, %xmm11
  371. vxorpd %xmm12, %xmm12, %xmm12
  372. vxorpd %xmm13, %xmm13, %xmm13
  373. vxorpd %xmm14, %xmm14, %xmm14
  374. vxorpd %xmm15, %xmm15, %xmm15
  375. .endm
  376. .macro KERNEL2x12_SUB
  377. vmovups -16 * SIZE(AO), %xmm0
  378. vmovddup -12 * SIZE(BO), %xmm1
  379. vmovddup -11 * SIZE(BO), %xmm2
  380. vmovddup -10 * SIZE(BO), %xmm3
  381. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  382. vmovddup -9 * SIZE(BO), %xmm1
  383. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  384. vmovddup -8 * SIZE(BO), %xmm2
  385. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  386. vmovddup -7 * SIZE(BO), %xmm3
  387. vfmadd231pd %xmm0 ,%xmm1 , %xmm7
  388. vmovddup -6 * SIZE(BO), %xmm1
  389. vfmadd231pd %xmm0 ,%xmm2 , %xmm8
  390. vmovddup -5 * SIZE(BO), %xmm2
  391. vfmadd231pd %xmm0 ,%xmm3 , %xmm9
  392. vmovddup -4 * SIZE(BO), %xmm3
  393. vfmadd231pd %xmm0 ,%xmm1 , %xmm10
  394. vmovddup -3 * SIZE(BO), %xmm1
  395. vfmadd231pd %xmm0 ,%xmm2 , %xmm11
  396. vmovddup -2 * SIZE(BO), %xmm2
  397. vfmadd231pd %xmm0 ,%xmm3 , %xmm12
  398. vmovddup -1 * SIZE(BO), %xmm3
  399. vfmadd231pd %xmm0 ,%xmm1 , %xmm13
  400. addq $ 12*SIZE, BO
  401. vfmadd231pd %xmm0 ,%xmm2 , %xmm14
  402. addq $ 2*SIZE, AO
  403. vfmadd231pd %xmm0 ,%xmm3 , %xmm15
  404. .endm
  405. .macro SAVE2x12
  406. vmovddup ALPHA, %xmm0
  407. vmulpd %xmm0 , %xmm4 , %xmm4
  408. vmulpd %xmm0 , %xmm5 , %xmm5
  409. vmulpd %xmm0 , %xmm6 , %xmm6
  410. vmulpd %xmm0 , %xmm7 , %xmm7
  411. vmulpd %xmm0 , %xmm8 , %xmm8
  412. vmulpd %xmm0 , %xmm9 , %xmm9
  413. vmulpd %xmm0 , %xmm10, %xmm10
  414. vmulpd %xmm0 , %xmm11, %xmm11
  415. vmulpd %xmm0 , %xmm12, %xmm12
  416. vmulpd %xmm0 , %xmm13, %xmm13
  417. vmulpd %xmm0 , %xmm14, %xmm14
  418. vmulpd %xmm0 , %xmm15, %xmm15
  419. leaq (CO1, LDC, 2), %rax
  420. #if !defined(TRMMKERNEL)
  421. vaddpd (CO1), %xmm4, %xmm4
  422. vaddpd (CO1, LDC), %xmm5, %xmm5
  423. vaddpd (%rax), %xmm6, %xmm6
  424. vaddpd (%rax, LDC), %xmm7, %xmm7
  425. #endif
  426. vmovups %xmm4 , (CO1)
  427. vmovups %xmm5 , (CO1, LDC)
  428. vmovups %xmm6 , (%rax)
  429. vmovups %xmm7 , (%rax, LDC)
  430. leaq (%rax, LDC, 2), %rax
  431. leaq (%rax, LDC, 2), %rbp
  432. #if !defined(TRMMKERNEL)
  433. vaddpd (%rax), %xmm8 , %xmm4
  434. vaddpd (%rax, LDC), %xmm9 , %xmm5
  435. vaddpd (%rbp), %xmm10, %xmm6
  436. vaddpd (%rbp, LDC), %xmm11, %xmm7
  437. #endif
  438. vmovups %xmm4 , (%rax)
  439. vmovups %xmm5 , (%rax, LDC)
  440. vmovups %xmm6 , (%rbp)
  441. vmovups %xmm7 , (%rbp, LDC)
  442. leaq (%rax, LDC, 4), %rax
  443. leaq (%rbp, LDC, 4), %rbp
  444. #if !defined(TRMMKERNEL)
  445. vaddpd (%rax), %xmm12, %xmm4
  446. vaddpd (%rax, LDC), %xmm13, %xmm5
  447. vaddpd (%rbp), %xmm14, %xmm6
  448. vaddpd (%rbp, LDC), %xmm15, %xmm7
  449. #endif
  450. vmovups %xmm4 , (%rax)
  451. vmovups %xmm5 , (%rax, LDC)
  452. vmovups %xmm6 , (%rbp)
  453. vmovups %xmm7 , (%rbp, LDC)
  454. addq $ 2*SIZE, CO1
  455. .endm
  456. /******************************************************************************************/
  457. .macro INIT1x12
  458. vxorpd %xmm4 , %xmm4 , %xmm4
  459. vxorpd %xmm5 , %xmm5 , %xmm5
  460. vxorpd %xmm6 , %xmm6 , %xmm6
  461. vxorpd %xmm7 , %xmm7 , %xmm7
  462. vxorpd %xmm8 , %xmm8 , %xmm8
  463. vxorpd %xmm9 , %xmm9 , %xmm9
  464. vxorpd %xmm10, %xmm10, %xmm10
  465. vxorpd %xmm11, %xmm11, %xmm11
  466. vxorpd %xmm12, %xmm12, %xmm12
  467. vxorpd %xmm13, %xmm13, %xmm13
  468. vxorpd %xmm14, %xmm14, %xmm14
  469. vxorpd %xmm15, %xmm15, %xmm15
  470. .endm
  471. .macro KERNEL1x12_SUB
  472. vmovsd -16 * SIZE(AO), %xmm0
  473. vmovsd -12 * SIZE(BO), %xmm1
  474. vmovsd -11 * SIZE(BO), %xmm2
  475. vmovsd -10 * SIZE(BO), %xmm3
  476. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  477. vmovsd -9 * SIZE(BO), %xmm1
  478. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  479. vmovsd -8 * SIZE(BO), %xmm2
  480. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  481. vmovsd -7 * SIZE(BO), %xmm3
  482. vfmadd231sd %xmm0 ,%xmm1 , %xmm7
  483. vmovsd -6 * SIZE(BO), %xmm1
  484. vfmadd231sd %xmm0 ,%xmm2 , %xmm8
  485. vmovsd -5 * SIZE(BO), %xmm2
  486. vfmadd231sd %xmm0 ,%xmm3 , %xmm9
  487. vmovsd -4 * SIZE(BO), %xmm3
  488. vfmadd231sd %xmm0 ,%xmm1 , %xmm10
  489. vmovsd -3 * SIZE(BO), %xmm1
  490. vfmadd231sd %xmm0 ,%xmm2 , %xmm11
  491. vmovsd -2 * SIZE(BO), %xmm2
  492. vfmadd231sd %xmm0 ,%xmm3 , %xmm12
  493. vmovsd -1 * SIZE(BO), %xmm3
  494. vfmadd231sd %xmm0 ,%xmm1 , %xmm13
  495. addq $ 12*SIZE, BO
  496. vfmadd231sd %xmm0 ,%xmm2 , %xmm14
  497. addq $ 1*SIZE, AO
  498. vfmadd231sd %xmm0 ,%xmm3 , %xmm15
  499. .endm
  500. .macro SAVE1x12
  501. vmovsd ALPHA, %xmm0
  502. vmulsd %xmm0 , %xmm4 , %xmm4
  503. vmulsd %xmm0 , %xmm5 , %xmm5
  504. vmulsd %xmm0 , %xmm6 , %xmm6
  505. vmulsd %xmm0 , %xmm7 , %xmm7
  506. vmulsd %xmm0 , %xmm8 , %xmm8
  507. vmulsd %xmm0 , %xmm9 , %xmm9
  508. vmulsd %xmm0 , %xmm10, %xmm10
  509. vmulsd %xmm0 , %xmm11, %xmm11
  510. vmulsd %xmm0 , %xmm12, %xmm12
  511. vmulsd %xmm0 , %xmm13, %xmm13
  512. vmulsd %xmm0 , %xmm14, %xmm14
  513. vmulsd %xmm0 , %xmm15, %xmm15
  514. leaq (CO1, LDC, 2), %rax
  515. #if !defined(TRMMKERNEL)
  516. vaddsd (CO1), %xmm4, %xmm4
  517. vaddsd (CO1, LDC), %xmm5, %xmm5
  518. vaddsd (%rax), %xmm6, %xmm6
  519. vaddsd (%rax, LDC), %xmm7, %xmm7
  520. #endif
  521. vmovsd %xmm4 , (CO1)
  522. vmovsd %xmm5 , (CO1, LDC)
  523. vmovsd %xmm6 , (%rax)
  524. vmovsd %xmm7 , (%rax, LDC)
  525. leaq (%rax, LDC, 2), %rax
  526. leaq (%rax, LDC, 2), %rbp
  527. #if !defined(TRMMKERNEL)
  528. vaddsd (%rax), %xmm8 , %xmm4
  529. vaddsd (%rax, LDC), %xmm9 , %xmm5
  530. vaddsd (%rbp), %xmm10, %xmm6
  531. vaddsd (%rbp, LDC), %xmm11, %xmm7
  532. #endif
  533. vmovsd %xmm4 , (%rax)
  534. vmovsd %xmm5 , (%rax, LDC)
  535. vmovsd %xmm6 , (%rbp)
  536. vmovsd %xmm7 , (%rbp, LDC)
  537. leaq (%rax, LDC, 4), %rax
  538. leaq (%rbp, LDC, 4), %rbp
  539. #if !defined(TRMMKERNEL)
  540. vaddsd (%rax), %xmm12, %xmm4
  541. vaddsd (%rax, LDC), %xmm13, %xmm5
  542. vaddsd (%rbp), %xmm14, %xmm6
  543. vaddsd (%rbp, LDC), %xmm15, %xmm7
  544. #endif
  545. vmovsd %xmm4 , (%rax)
  546. vmovsd %xmm5 , (%rax, LDC)
  547. vmovsd %xmm6 , (%rbp)
  548. vmovsd %xmm7 , (%rbp, LDC)
  549. addq $ 1*SIZE, CO1
  550. .endm
  551. /******************************************************************************************/
  552. /******************************************************************************************/
  553. .macro INIT4x4
  554. vxorpd %ymm4 , %ymm4 , %ymm4
  555. vxorpd %ymm5 , %ymm5 , %ymm5
  556. vxorpd %ymm6 , %ymm6 , %ymm6
  557. vxorpd %ymm7 , %ymm7 , %ymm7
  558. .endm
  559. .macro KERNEL4x4_I
  560. prefetcht0 A_PR1(AO)
  561. vmovups -12 * SIZE(BO), %ymm1
  562. vmovups -16 * SIZE(AO), %ymm0
  563. vmulpd %ymm0 ,%ymm1 , %ymm4
  564. vpermpd $ 0xb1, %ymm0 , %ymm0
  565. vmulpd %ymm0 ,%ymm1 , %ymm5
  566. vpermpd $ 0x1b, %ymm0 , %ymm0
  567. vmulpd %ymm0 ,%ymm1 , %ymm6
  568. addq $ 4*SIZE, BO
  569. vpermpd $ 0xb1, %ymm0 , %ymm0
  570. vmulpd %ymm0 ,%ymm1 , %ymm7
  571. vmovups -12 * SIZE(BO), %ymm1
  572. .endm
  573. .macro KERNEL4x4_M1
  574. prefetcht0 A_PR1(AO)
  575. vmovups -16 * SIZE(AO), %ymm0
  576. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  577. vpermpd $ 0xb1, %ymm0 , %ymm0
  578. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  579. vpermpd $ 0x1b, %ymm0 , %ymm0
  580. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  581. vpermpd $ 0xb1, %ymm0 , %ymm0
  582. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  583. vmovups -12 * SIZE(BO), %ymm1
  584. .endm
  585. .macro KERNEL4x4_M2
  586. vmovups -12 * SIZE(AO), %ymm0
  587. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  588. vpermpd $ 0xb1, %ymm0 , %ymm0
  589. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  590. vpermpd $ 0x1b, %ymm0 , %ymm0
  591. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  592. addq $ 8*SIZE, AO
  593. vpermpd $ 0xb1, %ymm0 , %ymm0
  594. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  595. vmovups -8 * SIZE(BO), %ymm1
  596. addq $ 8*SIZE, BO
  597. .endm
  598. .macro KERNEL4x4_E
  599. vmovups -12 * SIZE(AO), %ymm0
  600. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  601. vpermpd $ 0xb1, %ymm0 , %ymm0
  602. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  603. vpermpd $ 0x1b, %ymm0 , %ymm0
  604. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  605. addq $ 8*SIZE, AO
  606. vpermpd $ 0xb1, %ymm0 , %ymm0
  607. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  608. addq $ 4*SIZE, BO
  609. .endm
  610. .macro KERNEL4x4_SUB
  611. vmovups -12 * SIZE(BO), %ymm1
  612. vmovups -16 * SIZE(AO), %ymm0
  613. vfmadd231pd %ymm0 ,%ymm1 , %ymm4
  614. vpermpd $ 0xb1, %ymm0 , %ymm0
  615. vfmadd231pd %ymm0 ,%ymm1 , %ymm5
  616. addq $ 4*SIZE, BO
  617. vpermpd $ 0x1b, %ymm0 , %ymm0
  618. vfmadd231pd %ymm0 ,%ymm1 , %ymm6
  619. addq $ 4*SIZE, AO
  620. vpermpd $ 0xb1, %ymm0 , %ymm0
  621. vfmadd231pd %ymm0 ,%ymm1 , %ymm7
  622. .endm
  623. .macro SAVE4x4
  624. vbroadcastsd ALPHA, %ymm0
  625. vmulpd %ymm0 , %ymm4 , %ymm4
  626. vmulpd %ymm0 , %ymm7 , %ymm7
  627. vmulpd %ymm0 , %ymm5 , %ymm5
  628. vmulpd %ymm0 , %ymm6 , %ymm6
  629. vpermpd $ 0xb1 , %ymm5, %ymm5
  630. vpermpd $ 0xb1 , %ymm7, %ymm7
  631. vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
  632. vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
  633. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
  634. vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
  635. vpermpd $ 0x1b , %ymm2, %ymm2
  636. vpermpd $ 0x1b , %ymm3, %ymm3
  637. vpermpd $ 0xb1 , %ymm2, %ymm2
  638. vpermpd $ 0xb1 , %ymm3, %ymm3
  639. vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
  640. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
  641. vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
  642. vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
  643. leaq (CO1, LDC, 2), %rax
  644. #if !defined(TRMMKERNEL)
  645. vaddpd (CO1), %ymm4, %ymm4
  646. vaddpd (CO1, LDC), %ymm5, %ymm5
  647. vaddpd (%rax), %ymm6, %ymm6
  648. vaddpd (%rax, LDC), %ymm7, %ymm7
  649. #endif
  650. vmovups %ymm4 , (CO1)
  651. vmovups %ymm5 , (CO1, LDC)
  652. vmovups %ymm6 , (%rax)
  653. vmovups %ymm7 , (%rax, LDC)
  654. addq $ 4*SIZE, CO1
  655. .endm
  656. /******************************************************************************************/
  657. /******************************************************************************************/
  658. .macro INIT2x4
  659. vxorpd %xmm4 , %xmm4 , %xmm4
  660. vxorpd %xmm5 , %xmm5 , %xmm5
  661. vxorpd %xmm6 , %xmm6 , %xmm6
  662. vxorpd %xmm7 , %xmm7 , %xmm7
  663. .endm
  664. .macro KERNEL2x4_SUB
  665. vmovddup -12 * SIZE(BO), %xmm1
  666. vmovups -16 * SIZE(AO), %xmm0
  667. vmovddup -11 * SIZE(BO), %xmm2
  668. vfmadd231pd %xmm0 ,%xmm1 , %xmm4
  669. vmovddup -10 * SIZE(BO), %xmm3
  670. vfmadd231pd %xmm0 ,%xmm2 , %xmm5
  671. vmovddup -9 * SIZE(BO), %xmm8
  672. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  673. addq $ 4*SIZE, BO
  674. vfmadd231pd %xmm0 ,%xmm8 , %xmm7
  675. addq $ 2*SIZE, AO
  676. .endm
  677. .macro SAVE2x4
  678. vmovddup ALPHA, %xmm0
  679. vmulpd %xmm0 , %xmm4 , %xmm4
  680. vmulpd %xmm0 , %xmm5 , %xmm5
  681. vmulpd %xmm0 , %xmm6 , %xmm6
  682. vmulpd %xmm0 , %xmm7 , %xmm7
  683. leaq (CO1, LDC, 2), %rax
  684. #if !defined(TRMMKERNEL)
  685. vaddpd (CO1), %xmm4, %xmm4
  686. vaddpd (CO1, LDC), %xmm5, %xmm5
  687. vaddpd (%rax), %xmm6, %xmm6
  688. vaddpd (%rax, LDC), %xmm7, %xmm7
  689. #endif
  690. vmovups %xmm4 , (CO1)
  691. vmovups %xmm5 , (CO1, LDC)
  692. vmovups %xmm6 , (%rax)
  693. vmovups %xmm7 , (%rax, LDC)
  694. addq $ 2*SIZE, CO1
  695. .endm
  696. /******************************************************************************************/
  697. /******************************************************************************************/
  698. .macro INIT1x4
  699. vxorpd %xmm4 , %xmm4 , %xmm4
  700. vxorpd %xmm5 , %xmm5 , %xmm5
  701. vxorpd %xmm6 , %xmm6 , %xmm6
  702. vxorpd %xmm7 , %xmm7 , %xmm7
  703. .endm
  704. .macro KERNEL1x4_SUB
  705. vmovsd -12 * SIZE(BO), %xmm1
  706. vmovsd -16 * SIZE(AO), %xmm0
  707. vmovsd -11 * SIZE(BO), %xmm2
  708. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  709. vmovsd -10 * SIZE(BO), %xmm3
  710. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  711. vmovsd -9 * SIZE(BO), %xmm8
  712. vfmadd231sd %xmm0 ,%xmm3 , %xmm6
  713. addq $ 4*SIZE, BO
  714. vfmadd231sd %xmm0 ,%xmm8 , %xmm7
  715. addq $ 1*SIZE, AO
  716. .endm
  717. .macro SAVE1x4
  718. vmovsd ALPHA, %xmm0
  719. vmulsd %xmm0 , %xmm4 , %xmm4
  720. vmulsd %xmm0 , %xmm5 , %xmm5
  721. vmulsd %xmm0 , %xmm6 , %xmm6
  722. vmulsd %xmm0 , %xmm7 , %xmm7
  723. leaq (CO1, LDC, 2), %rax
  724. #if !defined(TRMMKERNEL)
  725. vaddsd (CO1), %xmm4, %xmm4
  726. vaddsd (CO1, LDC), %xmm5, %xmm5
  727. vaddsd (%rax), %xmm6, %xmm6
  728. vaddsd (%rax, LDC), %xmm7, %xmm7
  729. #endif
  730. vmovsd %xmm4 , (CO1)
  731. vmovsd %xmm5 , (CO1, LDC)
  732. vmovsd %xmm6 , (%rax)
  733. vmovsd %xmm7 , (%rax, LDC)
  734. addq $ 1*SIZE, CO1
  735. .endm
  736. /******************************************************************************************/
  737. /******************************************************************************************/
  738. .macro INIT4x2
  739. vxorpd %xmm4 , %xmm4 , %xmm4
  740. vxorpd %xmm5 , %xmm5 , %xmm5
  741. vxorpd %xmm6 , %xmm6 , %xmm6
  742. vxorpd %xmm7 , %xmm7 , %xmm7
  743. .endm
  744. .macro KERNEL4x2_SUB
  745. vmovddup -12 * SIZE(BO), %xmm2
  746. vmovups -16 * SIZE(AO), %xmm0
  747. vmovups -14 * SIZE(AO), %xmm1
  748. vmovddup -11 * SIZE(BO), %xmm3
  749. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  750. vfmadd231pd %xmm1 ,%xmm2 , %xmm5
  751. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  752. vfmadd231pd %xmm1 ,%xmm3 , %xmm7
  753. addq $ 2*SIZE, BO
  754. addq $ 4*SIZE, AO
  755. .endm
  756. .macro SAVE4x2
  757. vmovddup ALPHA, %xmm0
  758. vmulpd %xmm0 , %xmm4 , %xmm4
  759. vmulpd %xmm0 , %xmm5 , %xmm5
  760. vmulpd %xmm0 , %xmm6 , %xmm6
  761. vmulpd %xmm0 , %xmm7 , %xmm7
  762. #if !defined(TRMMKERNEL)
  763. vaddpd (CO1) , %xmm4, %xmm4
  764. vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5
  765. vaddpd (CO1, LDC), %xmm6, %xmm6
  766. vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7
  767. #endif
  768. vmovups %xmm4 , (CO1)
  769. vmovups %xmm5 , 2 * SIZE(CO1)
  770. vmovups %xmm6 , (CO1, LDC)
  771. vmovups %xmm7 , 2 * SIZE(CO1, LDC)
  772. addq $ 4*SIZE, CO1
  773. .endm
  774. /******************************************************************************************/
  775. /******************************************************************************************/
  776. .macro INIT2x2
  777. vxorpd %xmm4 , %xmm4 , %xmm4
  778. vxorpd %xmm6 , %xmm6 , %xmm6
  779. .endm
  780. .macro KERNEL2x2_SUB
  781. vmovddup -12 * SIZE(BO), %xmm2
  782. vmovups -16 * SIZE(AO), %xmm0
  783. vmovddup -11 * SIZE(BO), %xmm3
  784. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  785. vfmadd231pd %xmm0 ,%xmm3 , %xmm6
  786. addq $ 2*SIZE, BO
  787. addq $ 2*SIZE, AO
  788. .endm
  789. .macro SAVE2x2
  790. vmovddup ALPHA, %xmm0
  791. vmulpd %xmm0 , %xmm4 , %xmm4
  792. vmulpd %xmm0 , %xmm6 , %xmm6
  793. #if !defined(TRMMKERNEL)
  794. vaddpd (CO1) , %xmm4, %xmm4
  795. vaddpd (CO1, LDC), %xmm6, %xmm6
  796. #endif
  797. vmovups %xmm4 , (CO1)
  798. vmovups %xmm6 , (CO1, LDC)
  799. addq $ 2*SIZE, CO1
  800. .endm
  801. /******************************************************************************************/
  802. /******************************************************************************************/
  803. .macro INIT1x2
  804. vxorpd %xmm4 , %xmm4 , %xmm4
  805. vxorpd %xmm5 , %xmm5 , %xmm5
  806. .endm
  807. .macro KERNEL1x2_SUB
  808. vmovsd -12 * SIZE(BO), %xmm1
  809. vmovsd -16 * SIZE(AO), %xmm0
  810. vmovsd -11 * SIZE(BO), %xmm2
  811. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  812. vfmadd231sd %xmm0 ,%xmm2 , %xmm5
  813. addq $ 2*SIZE, BO
  814. addq $ 1*SIZE, AO
  815. .endm
  816. .macro SAVE1x2
  817. vmovsd ALPHA, %xmm0
  818. vmulsd %xmm0 , %xmm4 , %xmm4
  819. vmulsd %xmm0 , %xmm5 , %xmm5
  820. #if !defined(TRMMKERNEL)
  821. vaddsd (CO1), %xmm4, %xmm4
  822. vaddsd (CO1, LDC), %xmm5, %xmm5
  823. #endif
  824. vmovsd %xmm4 , (CO1)
  825. vmovsd %xmm5 , (CO1, LDC)
  826. addq $ 1*SIZE, CO1
  827. .endm
  828. /******************************************************************************************/
  829. /******************************************************************************************/
  830. .macro INIT4x1
  831. vxorpd %ymm4 , %ymm4 , %ymm4
  832. vxorpd %ymm5 , %ymm5 , %ymm5
  833. vxorpd %ymm6 , %ymm6 , %ymm6
  834. vxorpd %ymm7 , %ymm7 , %ymm7
  835. .endm
  836. .macro KERNEL4x1
  837. vbroadcastsd -12 * SIZE(BO), %ymm0
  838. vbroadcastsd -11 * SIZE(BO), %ymm1
  839. vbroadcastsd -10 * SIZE(BO), %ymm2
  840. vbroadcastsd -9 * SIZE(BO), %ymm3
  841. vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4
  842. vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5
  843. vbroadcastsd -8 * SIZE(BO), %ymm0
  844. vbroadcastsd -7 * SIZE(BO), %ymm1
  845. vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6
  846. vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7
  847. vbroadcastsd -6 * SIZE(BO), %ymm2
  848. vbroadcastsd -5 * SIZE(BO), %ymm3
  849. vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4
  850. vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5
  851. vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6
  852. vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7
  853. addq $ 8 *SIZE, BO
  854. addq $ 32*SIZE, AO
  855. .endm
  856. .macro KERNEL4x1_SUB
  857. vbroadcastsd -12 * SIZE(BO), %ymm2
  858. vmovups -16 * SIZE(AO), %ymm0
  859. vfmadd231pd %ymm0 ,%ymm2 , %ymm4
  860. addq $ 1*SIZE, BO
  861. addq $ 4*SIZE, AO
  862. .endm
  863. .macro SAVE4x1
  864. vbroadcastsd ALPHA, %ymm0
  865. vaddpd %ymm4,%ymm5, %ymm4
  866. vaddpd %ymm6,%ymm7, %ymm6
  867. vaddpd %ymm4,%ymm6, %ymm4
  868. vmulpd %ymm0 , %ymm4 , %ymm4
  869. #if !defined(TRMMKERNEL)
  870. vaddpd (CO1) , %ymm4, %ymm4
  871. #endif
  872. vmovups %ymm4 , (CO1)
  873. addq $ 4*SIZE, CO1
  874. .endm
  875. /******************************************************************************************/
  876. /******************************************************************************************/
  877. .macro INIT2x1
  878. vxorpd %xmm4 , %xmm4 , %xmm4
  879. .endm
  880. .macro KERNEL2x1_SUB
  881. vmovddup -12 * SIZE(BO), %xmm2
  882. vmovups -16 * SIZE(AO), %xmm0
  883. vfmadd231pd %xmm0 ,%xmm2 , %xmm4
  884. addq $ 1*SIZE, BO
  885. addq $ 2*SIZE, AO
  886. .endm
  887. .macro SAVE2x1
  888. vmovddup ALPHA, %xmm0
  889. vmulpd %xmm0 , %xmm4 , %xmm4
  890. #if !defined(TRMMKERNEL)
  891. vaddpd (CO1) , %xmm4, %xmm4
  892. #endif
  893. vmovups %xmm4 , (CO1)
  894. addq $ 2*SIZE, CO1
  895. .endm
  896. /******************************************************************************************/
  897. /******************************************************************************************/
  898. .macro INIT1x1
  899. vxorpd %xmm4 , %xmm4 , %xmm4
  900. .endm
  901. .macro KERNEL1x1_SUB
  902. vmovsd -12 * SIZE(BO), %xmm1
  903. vmovsd -16 * SIZE(AO), %xmm0
  904. vfmadd231sd %xmm0 ,%xmm1 , %xmm4
  905. addq $ 1*SIZE, BO
  906. addq $ 1*SIZE, AO
  907. .endm
  908. .macro SAVE1x1
  909. vmovsd ALPHA, %xmm0
  910. vmulsd %xmm0 , %xmm4 , %xmm4
  911. #if !defined(TRMMKERNEL)
  912. vaddsd (CO1), %xmm4, %xmm4
  913. #endif
  914. vmovsd %xmm4 , (CO1)
  915. addq $ 1*SIZE, CO1
  916. .endm
  917. /*******************************************************************************************/
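/*******************************************************************************************
* Driver (non-TRMM build): N is processed in blocks of 12 columns. For each
* block, three consecutive 4-column panels of B are interleaved into the
* on-stack BUFFER1 so that 12 values per k are contiguous; M is then walked in
* blocks of 4 rows with the 4x12 kernels, followed by 2x12 / 1x12 tails. The
* N % 12 remainder is handled afterwards by the 4-, 2- and 1-column kernels,
* which read the packed B directly without the extra copy.
*******************************************************************************************/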
  918. #if !defined(TRMMKERNEL)
  919. PROLOGUE
  920. PROFCODE
  921. subq $STACKSIZE, %rsp
  922. movq %rbx, (%rsp)
  923. movq %rbp, 8(%rsp)
  924. movq %r12, 16(%rsp)
  925. movq %r13, 24(%rsp)
  926. movq %r14, 32(%rsp)
  927. movq %r15, 40(%rsp)
  928. vzeroupper
  929. #ifdef WINDOWS_ABI
  930. movq %rdi, 48(%rsp)
  931. movq %rsi, 56(%rsp)
  932. vmovups %xmm6, 64(%rsp)
  933. vmovups %xmm7, 80(%rsp)
  934. vmovups %xmm8, 96(%rsp)
  935. vmovups %xmm9, 112(%rsp)
  936. vmovups %xmm10, 128(%rsp)
  937. vmovups %xmm11, 144(%rsp)
  938. vmovups %xmm12, 160(%rsp)
  939. vmovups %xmm13, 176(%rsp)
  940. vmovups %xmm14, 192(%rsp)
  941. vmovups %xmm15, 208(%rsp)
  942. movq ARG1, OLD_M
  943. movq ARG2, OLD_N
  944. movq ARG3, OLD_K
  945. movq OLD_A, A
  946. movq OLD_B, B
  947. movq OLD_C, C
  948. movq OLD_LDC, LDC
  949. vmovups %xmm3, %xmm0
  950. #else
  951. movq STACKSIZE + 8(%rsp), LDC
  952. #endif
  953. movq %rsp, SP # save old stack
  954. subq $128 + L_BUFFER_SIZE, %rsp
  955. andq $-4096, %rsp # align stack
  956. STACK_TOUCH
  957. cmpq $ 0, OLD_M
  958. je .L999
  959. cmpq $ 0, OLD_N
  960. je .L999
  961. cmpq $ 0, OLD_K
  962. je .L999
  963. movq OLD_M, M
  964. movq OLD_N, N
  965. movq OLD_K, K
  966. vmovsd %xmm0, ALPHA
  967. salq $BASE_SHIFT, LDC
  968. movq N, %rax
  969. xorq %rdx, %rdx
  970. movq $12, %rdi
  971. divq %rdi // N / 12
  972. movq %rax, Ndiv12 // N / 12
  973. movq %rdx, Nmod12 // N % 12
  974. movq Ndiv12, J
  975. cmpq $ 0, J
  976. je .L4_0
  977. ALIGN_4
  978. .L12_01:
  979. // copy to sub buffer
  980. movq K, %rax
   981. salq $2,%rax // K * 4 : offset in doubles between two packed 4-column panels of B
  982. movq B, BO1
  983. leaq (B,%rax, SIZE), BO2 // next offset to BO2
   984. leaq (BO2,%rax, SIZE), BO3 // next offset to BO3
  985. leaq BUFFER1, BO // first buffer to BO
  986. movq K, %rax
  987. sarq $1 , %rax // K / 2
  988. jz .L12_01a_2
  989. ALIGN_4
  990. .L12_01a_1:
  991. prefetcht0 512(BO1)
  992. prefetcht0 512(BO2)
  993. prefetcht0 512(BO3)
  994. prefetchw 512(BO)
  995. vmovups 0 * SIZE(BO1), %ymm1
  996. vmovups 4 * SIZE(BO1), %ymm5
  997. vmovups 0 * SIZE(BO2), %ymm2
  998. vmovups 4 * SIZE(BO2), %ymm6
  999. vmovups 0 * SIZE(BO3), %ymm3
  1000. vmovups 4 * SIZE(BO3), %ymm7
  1001. vmovups %ymm1, 0 * SIZE(BO)
  1002. vmovups %ymm2, 4 * SIZE(BO)
  1003. vmovups %ymm3, 8 * SIZE(BO)
  1004. vmovups %ymm5, 12 * SIZE(BO)
  1005. vmovups %ymm6, 16 * SIZE(BO)
  1006. vmovups %ymm7, 20 * SIZE(BO)
  1007. addq $ 8 * SIZE ,BO1
  1008. addq $ 8 * SIZE ,BO2
  1009. addq $ 8 * SIZE ,BO3
  1010. addq $ 24 *SIZE ,BO
  1011. decq %rax
  1012. jnz .L12_01a_1
  1013. .L12_01a_2:
  1014. movq K, %rax
  1015. andq $1, %rax // K % 2
  1016. jz .L12_03c
  1017. ALIGN_4
  1018. .L12_02b:
  1019. vmovups 0 * SIZE(BO1), %ymm1
  1020. vmovups 0 * SIZE(BO2), %ymm2
  1021. vmovups 0 * SIZE(BO3), %ymm3
  1022. vmovups %ymm1, 0 * SIZE(BO)
  1023. vmovups %ymm2, 4 * SIZE(BO)
  1024. vmovups %ymm3, 8 * SIZE(BO)
  1025. addq $ 4*SIZE,BO1
  1026. addq $ 4*SIZE,BO2
  1027. addq $ 4*SIZE,BO3
  1028. addq $ 12*SIZE,BO
  1029. decq %rax
  1030. jnz .L12_02b
  1031. .L12_03c:
  1032. movq BO3, B // next offset of B
  1033. .L12_10:
  1034. movq C, CO1
  1035. leaq (C, LDC, 8), C
  1036. leaq (C, LDC, 4), C // c += 12 * ldc
  1037. movq A, AO // aoffset = a
  1038. addq $16 * SIZE, AO
  1039. movq M, I
  1040. sarq $2, I // i = m / 4
  1041. je .L12_20
  1042. ALIGN_4
  1043. .L12_11:
  1044. leaq BUFFER1, BO // first buffer to BO
  1045. addq $12 * SIZE, BO
  1046. movq K, %rax
  1047. sarq $3, %rax // K / 8
  1048. cmpq $2, %rax
  1049. jl .L12_13
  1050. KERNEL4x12_I
  1051. KERNEL4x12_M2
  1052. KERNEL4x12_M1
  1053. KERNEL4x12_M2
  1054. KERNEL4x12_M1
  1055. KERNEL4x12_M2
  1056. KERNEL4x12_M1
  1057. KERNEL4x12_M2
  1058. subq $2, %rax
  1059. je .L12_12a
  1060. ALIGN_5
  1061. .L12_12:
  1062. KERNEL4x12_M1
  1063. KERNEL4x12_M2
  1064. KERNEL4x12_M1
  1065. KERNEL4x12_M2
  1066. KERNEL4x12_M1
  1067. KERNEL4x12_M2
  1068. KERNEL4x12_M1
  1069. KERNEL4x12_M2
  1070. dec %rax
  1071. jne .L12_12
  1072. .L12_12a:
  1073. KERNEL4x12_M1
  1074. KERNEL4x12_M2
  1075. KERNEL4x12_M1
  1076. KERNEL4x12_M2
  1077. KERNEL4x12_M1
  1078. KERNEL4x12_M2
  1079. KERNEL4x12_M1
  1080. KERNEL4x12_E
  1081. jmp .L12_16
  1082. .L12_13:
  1083. test $1, %rax
  1084. jz .L12_14
  1085. KERNEL4x12_I
  1086. KERNEL4x12_M2
  1087. KERNEL4x12_M1
  1088. KERNEL4x12_M2
  1089. KERNEL4x12_M1
  1090. KERNEL4x12_M2
  1091. KERNEL4x12_M1
  1092. KERNEL4x12_E
  1093. jmp .L12_16
  1094. .L12_14:
  1095. INIT4x12
  1096. .L12_16:
  1097. movq K, %rax
  1098. andq $7, %rax # if (k & 7)
  1099. je .L12_19
  1100. ALIGN_4
  1101. .L12_17:
  1102. KERNEL4x12_SUB
  1103. dec %rax
  1104. jne .L12_17
  1105. ALIGN_4
  1106. .L12_19:
  1107. SAVE4x12
  1108. decq I # i --
  1109. jne .L12_11
  1110. ALIGN_4
  1111. /**************************************************************************
  1112. * Rest of M
  1113. ***************************************************************************/
  1114. .L12_20:
  1115. // Test rest of M
  1116. testq $3, M
  1117. jz .L12_100 // to next 12 lines of N
  1118. .L12_30:
  1119. testq $2, M
  1120. jz .L12_40
  1121. ALIGN_4
  1122. .L12_31:
  1123. leaq BUFFER1, BO // first buffer to BO
  1124. addq $12 * SIZE, BO
  1125. INIT2x12
  1126. movq K, %rax
  1127. sarq $3, %rax
  1128. je .L12_36
  1129. ALIGN_4
  1130. .L12_32:
  1131. KERNEL2x12_SUB
  1132. KERNEL2x12_SUB
  1133. KERNEL2x12_SUB
  1134. KERNEL2x12_SUB
  1135. KERNEL2x12_SUB
  1136. KERNEL2x12_SUB
  1137. KERNEL2x12_SUB
  1138. KERNEL2x12_SUB
  1139. dec %rax
  1140. jne .L12_32
  1141. ALIGN_4
  1142. .L12_36:
  1143. movq K, %rax
  1144. andq $7, %rax # if (k & 7)
  1145. je .L12_39
  1146. ALIGN_4
  1147. .L12_37:
  1148. KERNEL2x12_SUB
  1149. dec %rax
  1150. jne .L12_37
  1151. ALIGN_4
  1152. .L12_39:
  1153. SAVE2x12
  1154. ALIGN_4
  1155. .L12_40:
  1156. testq $1, M
  1157. jz .L12_100 // to next 12 lines of N
  1158. ALIGN_4
  1159. .L12_41:
  1160. leaq BUFFER1, BO // first buffer to BO
  1161. addq $12 * SIZE, BO
  1162. INIT1x12
  1163. movq K, %rax
  1164. sarq $3,%rax
  1165. je .L12_46
  1166. ALIGN_4
  1167. .L12_42:
  1168. KERNEL1x12_SUB
  1169. KERNEL1x12_SUB
  1170. KERNEL1x12_SUB
  1171. KERNEL1x12_SUB
  1172. KERNEL1x12_SUB
  1173. KERNEL1x12_SUB
  1174. KERNEL1x12_SUB
  1175. KERNEL1x12_SUB
  1176. dec %rax
  1177. jne .L12_42
  1178. ALIGN_4
  1179. .L12_46:
  1180. movq K, %rax
  1181. andq $7, %rax # if (k & 7)
  1182. je .L12_49
  1183. ALIGN_4
  1184. .L12_47:
  1185. KERNEL1x12_SUB
  1186. dec %rax
  1187. jne .L12_47
  1188. ALIGN_4
  1189. .L12_49:
  1190. SAVE1x12
  1191. ALIGN_4
  1192. .L12_100:
  1193. decq J // j --
  1194. jg .L12_01
  1195. .L4_0:
  1196. cmpq $ 0, Nmod12 // N % 12 == 0
  1197. je .L999
  1198. movq Nmod12, J
  1199. sarq $2, J // j = j / 4
  1200. je .L2_0
  1201. .L4_10:
  1202. movq C, CO1
  1203. leaq (C, LDC, 4), C // c += 4 * ldc
  1204. movq A, AO // aoffset = a
  1205. addq $16 * SIZE, AO
  1206. movq M, I
  1207. sarq $2, I // i = m / 4
  1208. je .L4_20
  1209. ALIGN_4
  1210. .L4_11:
  1211. movq B, BO
  1212. addq $12 * SIZE, BO
  1213. movq K, %rax
  1214. sarq $3, %rax // K / 8
  1215. cmpq $2, %rax
  1216. jl .L4_13
  1217. KERNEL4x4_I
  1218. KERNEL4x4_M2
  1219. KERNEL4x4_M1
  1220. KERNEL4x4_M2
  1221. KERNEL4x4_M1
  1222. KERNEL4x4_M2
  1223. KERNEL4x4_M1
  1224. KERNEL4x4_M2
  1225. subq $2, %rax
  1226. je .L4_12a
  1227. ALIGN_5
  1228. .L4_12:
  1229. KERNEL4x4_M1
  1230. KERNEL4x4_M2
  1231. KERNEL4x4_M1
  1232. KERNEL4x4_M2
  1233. KERNEL4x4_M1
  1234. KERNEL4x4_M2
  1235. KERNEL4x4_M1
  1236. KERNEL4x4_M2
  1237. dec %rax
  1238. jne .L4_12
  1239. .L4_12a:
  1240. KERNEL4x4_M1
  1241. KERNEL4x4_M2
  1242. KERNEL4x4_M1
  1243. KERNEL4x4_M2
  1244. KERNEL4x4_M1
  1245. KERNEL4x4_M2
  1246. KERNEL4x4_M1
  1247. KERNEL4x4_E
  1248. jmp .L4_16
  1249. .L4_13:
  1250. test $1, %rax
  1251. jz .L4_14
  1252. KERNEL4x4_I
  1253. KERNEL4x4_M2
  1254. KERNEL4x4_M1
  1255. KERNEL4x4_M2
  1256. KERNEL4x4_M1
  1257. KERNEL4x4_M2
  1258. KERNEL4x4_M1
  1259. KERNEL4x4_E
  1260. jmp .L4_16
  1261. .L4_14:
  1262. INIT4x4
  1263. .L4_16:
  1264. movq K, %rax
  1265. andq $7, %rax # if (k & 7)
  1266. je .L4_19
  1267. ALIGN_4
  1268. .L4_17:
  1269. KERNEL4x4_SUB
  1270. dec %rax
  1271. jne .L4_17
  1272. ALIGN_4
  1273. .L4_19:
  1274. SAVE4x4
  1275. decq I # i --
  1276. jg .L4_11
  1277. ALIGN_4
  1278. /**************************************************************************
  1279. * Rest of M
  1280. ***************************************************************************/
  1281. .L4_20:
  1282. // Test rest of M
  1283. testq $3, M
  1284. jz .L4_100 // to next 4 lines of N
  1285. .L4_30:
  1286. testq $2, M
  1287. jz .L4_40
  1288. ALIGN_4
  1289. .L4_31:
  1290. movq B, BO // first buffer to BO
  1291. addq $12 * SIZE, BO
  1292. INIT2x4
  1293. movq K, %rax
  1294. sarq $3, %rax
  1295. je .L4_36
  1296. ALIGN_4
  1297. .L4_32:
  1298. KERNEL2x4_SUB
  1299. KERNEL2x4_SUB
  1300. KERNEL2x4_SUB
  1301. KERNEL2x4_SUB
  1302. KERNEL2x4_SUB
  1303. KERNEL2x4_SUB
  1304. KERNEL2x4_SUB
  1305. KERNEL2x4_SUB
  1306. dec %rax
  1307. jne .L4_32
  1308. ALIGN_4
  1309. .L4_36:
  1310. movq K, %rax
  1311. andq $7, %rax # if (k & 7)
  1312. je .L4_39
  1313. ALIGN_4
  1314. .L4_37:
  1315. KERNEL2x4_SUB
  1316. dec %rax
  1317. jne .L4_37
  1318. .L4_39:
  1319. SAVE2x4
  1320. .L4_40:
  1321. testq $1, M
  1322. jz .L4_100 // to next 4 lines of N
  1323. ALIGN_4
  1324. .L4_41:
  1325. movq B, BO // first buffer to BO
  1326. addq $12 * SIZE, BO
  1327. INIT1x4
  1328. movq K, %rax
  1329. sarq $3,%rax
  1330. je .L4_46
  1331. ALIGN_4
  1332. .L4_42:
  1333. KERNEL1x4_SUB
  1334. KERNEL1x4_SUB
  1335. KERNEL1x4_SUB
  1336. KERNEL1x4_SUB
  1337. KERNEL1x4_SUB
  1338. KERNEL1x4_SUB
  1339. KERNEL1x4_SUB
  1340. KERNEL1x4_SUB
  1341. dec %rax
  1342. jne .L4_42
  1343. ALIGN_4
  1344. .L4_46:
  1345. movq K, %rax
  1346. andq $7, %rax # if (k & 7)
  1347. je .L4_49
  1348. ALIGN_4
  1349. .L4_47:
  1350. KERNEL1x4_SUB
  1351. dec %rax
  1352. jne .L4_47
  1353. ALIGN_4
  1354. .L4_49:
  1355. SAVE1x4
  1356. ALIGN_4
  1357. .L4_100:
  1358. movq K, %rax
  1359. salq $2, %rax // * 4
  1360. leaq (B , %rax, SIZE), B
  1361. decq J // j --
  1362. jg .L4_10
  1363. /***************************************************************************************************************/
.L2_0:
movq Nmod12, J
testq $2, J
je .L1_0
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = m / 4
je .L2_20
ALIGN_4
.L2_11:
movq B, BO
addq $12 * SIZE, BO
INIT4x2
movq K, %rax
sarq $3, %rax // K / 8
je .L2_16
ALIGN_5
.L2_12:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
dec %rax
jne .L2_12
.L2_16:
movq K, %rax
andq $7, %rax # if (k & 7)
je .L2_19
ALIGN_4
.L2_17:
KERNEL4x2_SUB
dec %rax
jne .L2_17
ALIGN_4
.L2_19:
SAVE4x2
decq I # i --
jg .L2_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_20:
// Test rest of M
testq $3, M
jz .L2_100 // finish this panel of N
.L2_30:
testq $2, M
jz .L2_40
ALIGN_4
.L2_31:
movq B, BO // first buffer to BO
addq $12 * SIZE, BO
INIT2x2
movq K, %rax
sarq $3, %rax
je .L2_36
ALIGN_4
.L2_32:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
dec %rax
jne .L2_32
.L2_36:
movq K, %rax
andq $7, %rax # if (k & 7)
je .L2_39
ALIGN_4
.L2_37:
KERNEL2x2_SUB
dec %rax
jne .L2_37
.L2_39:
SAVE2x2
.L2_40:
testq $1, M
jz .L2_100 // finish this panel of N
.L2_41:
movq B, BO // first buffer to BO
addq $12 * SIZE, BO
INIT1x2
movq K, %rax
sarq $3,%rax
je .L2_46
ALIGN_4
.L2_42:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
dec %rax
jne .L2_42
.L2_46:
movq K, %rax
andq $7, %rax # if (k & 7)
je .L2_49
ALIGN_4
.L2_47:
KERNEL1x2_SUB
dec %rax
jne .L2_47
.L2_49:
SAVE1x2
.L2_100:
movq K, %rax
salq $1, %rax // * 2
leaq (B , %rax, SIZE), B
/***************************************************************************************************************/
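/*
* B bookkeeping between panels: after each panel the packed B pointer is
* advanced past the K * (panel width) doubles just consumed, as done right
* above for the 2-column panel. Illustrative only (SIZE is the element
* size in bytes):
*
*   B = (double *)((char *)B + K * 2 * SIZE);   // .L2_100
*/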
.L1_0:
movq Nmod12, J
testq $1, J
je .L999
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = m / 4
je .L1_20
ALIGN_4
.L1_11:
movq B, BO
addq $12 * SIZE, BO
INIT4x1
movq K, %rax
sarq $3, %rax // K / 8
je .L1_16
ALIGN_5
.L1_12:
KERNEL4x1
dec %rax
jne .L1_12
.L1_16:
movq K, %rax
andq $7, %rax # if (k & 7)
je .L1_19
ALIGN_4
.L1_17:
KERNEL4x1_SUB
dec %rax
jne .L1_17
ALIGN_4
.L1_19:
SAVE4x1
decq I # i --
jg .L1_11
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_20:
// Test rest of M
testq $3, M
jz .L1_100
.L1_30:
testq $2, M
jz .L1_40
ALIGN_4
.L1_31:
movq B, BO // first buffer to BO
addq $12 * SIZE, BO
INIT2x1
movq K, %rax
sarq $3, %rax
je .L1_36
ALIGN_4
.L1_32:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
dec %rax
jne .L1_32
.L1_36:
movq K, %rax
andq $7, %rax # if (k & 7)
je .L1_39
ALIGN_4
.L1_37:
KERNEL2x1_SUB
dec %rax
jne .L1_37
.L1_39:
SAVE2x1
.L1_40:
testq $1, M
jz .L1_100 // finish this panel of N
.L1_41:
movq B, BO // first buffer to BO
addq $12 * SIZE, BO
INIT1x1
movq K, %rax
sarq $3,%rax
je .L1_46
ALIGN_4
.L1_42:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
dec %rax
jne .L1_42
.L1_46:
movq K, %rax
andq $7, %rax # if (k & 7)
je .L1_49
ALIGN_4
.L1_47:
KERNEL1x1_SUB
dec %rax
jne .L1_47
.L1_49:
SAVE1x1
.L1_100:
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE
#else
/*************************************************************************************
* TRMM Kernel
*************************************************************************************/
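/*
* TRMM build: the same blocking and micro-kernels as the GEMM path above,
* but each tile only accumulates over the K range that lies inside the
* triangular factor. OFFSET seeds KK, KK tracks the diagonal position as
* tiles are processed, and KKK is the clipped inner-loop length for the
* current tile. Loose per-tile sketch (illustrative C, not the macro
* expansion; alpha scaling happens in the SAVE macros):
*
*   for (k = 0; k < kkk; k++)            // kkk <= K, computed per tile
*       for (j = 0; j < 4; j++)
*           for (i = 0; i < 4; i++)
*               c[i + j * ldc] += a[4 * k + i] * b[4 * k + j];
*/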
PROLOGUE
PROFCODE
subq $STACKSIZE, %rsp
movq %rbx, (%rsp)
movq %rbp, 8(%rsp)
movq %r12, 16(%rsp)
movq %r13, 24(%rsp)
movq %r14, 32(%rsp)
movq %r15, 40(%rsp)
vzeroupper
#ifdef WINDOWS_ABI
movq %rdi, 48(%rsp)
movq %rsi, 56(%rsp)
vmovups %xmm6, 64(%rsp)
vmovups %xmm7, 80(%rsp)
vmovups %xmm8, 96(%rsp)
vmovups %xmm9, 112(%rsp)
vmovups %xmm10, 128(%rsp)
vmovups %xmm11, 144(%rsp)
vmovups %xmm12, 160(%rsp)
vmovups %xmm13, 176(%rsp)
vmovups %xmm14, 192(%rsp)
vmovups %xmm15, 208(%rsp)
movq ARG1, OLD_M
movq ARG2, OLD_N
movq ARG3, OLD_K
movq OLD_A, A
movq OLD_B, B
movq OLD_C, C
movq OLD_LDC, LDC
#ifdef TRMMKERNEL
vmovsd OLD_OFFSET, %xmm12
#endif
vmovups %xmm3, %xmm0
#else
movq STACKSIZE + 8(%rsp), LDC
#ifdef TRMMKERNEL
vmovsd STACKSIZE + 16(%rsp), %xmm12
#endif
#endif
movq %rsp, SP # save old stack
subq $128 + L_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack
STACK_TOUCH
cmpq $ 0, OLD_M
je .L999
cmpq $ 0, OLD_N
je .L999
cmpq $ 0, OLD_K
je .L999
movq OLD_M, M
movq OLD_N, N
movq OLD_K, K
vmovsd %xmm0, ALPHA
salq $BASE_SHIFT, LDC
movq N, %rax
xorq %rdx, %rdx
movq $4, %rdi
divq %rdi // N / 4
movq %rax, Ndiv12 // N / 4
movq %rdx, Nmod12 // N % 4
#ifdef TRMMKERNEL
vmovsd %xmm12, OFFSET
vmovsd %xmm12, KK
#ifndef LEFT
negq KK
#endif
#endif
movq Ndiv12, J
cmpq $ 0, J
je .L2_0
ALIGN_4
.L4_10:
movq C, CO1
leaq (C, LDC, 4), C // c += 4 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = m / 4
je .L4_20
ALIGN_4
.L4_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,4), BO // add number of values in B
leaq (AO,%rax,4), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
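/*
* Effective K for this 4x4 tile in the TRMM case (illustrative C-like
* sketch of the #if ladder above, assuming the usual OpenBLAS meaning of
* LEFT and TRANSA; not the literal code):
*
*   if ((LEFT && !TRANSA) || (!LEFT && TRANSA))
*       kkk = K - kk;      // tile sees the trailing part of K
*   else
*       kkk = kk + 4;      // tile sees the leading part; 4 = tile edge
*/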
sarq $3, %rax // K / 8
cmpq $2, %rax
jl .L4_13
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subq $2, %rax
je .L4_12a
ALIGN_5
.L4_12:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
dec %rax
jne .L4_12
.L4_12a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
jmp .L4_16
.L4_13:
test $1, %rax
jz .L4_14
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
jmp .L4_16
.L4_14:
INIT4x4
.L4_16:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L4_19
ALIGN_4
.L4_17:
KERNEL4x4_SUB
dec %rax
jne .L4_17
ALIGN_4
.L4_19:
SAVE4x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 4), BO // number of values in B
leaq (AO, %rax, 4), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK // number of values in A
#endif
decq I # i --
jg .L4_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L4_20:
// Test rest of M
testq $3, M
jz .L4_100 // finish this panel of N
.L4_30:
testq $2, M
jz .L4_40
ALIGN_4
.L4_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,4), BO // add number of values in B
leaq (AO,%rax,2), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT2x4
sarq $3, %rax
je .L4_36
ALIGN_4
.L4_32:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
dec %rax
jne .L4_32
ALIGN_4
.L4_36:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L4_39
ALIGN_4
.L4_37:
KERNEL2x4_SUB
dec %rax
jne .L4_37
.L4_39:
SAVE2x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 4), BO // number of values in B
leaq (AO, %rax, 2), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK // number of values in A
#endif
.L4_40:
testq $1, M
jz .L4_100 // finish this panel of N
ALIGN_4
.L4_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,4), BO // add number of values in B
leaq (AO,%rax,1), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $4, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT1x4
sarq $3,%rax
je .L4_46
ALIGN_4
.L4_42:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
dec %rax
jne .L4_42
ALIGN_4
.L4_46:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L4_49
ALIGN_4
.L4_47:
KERNEL1x4_SUB
dec %rax
jne .L4_47
ALIGN_4
.L4_49:
SAVE1x4
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 4), BO // number of values in B
leaq (AO, %rax, 1), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK // number of values in A
#endif
.L4_100:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $4, KK // number of values in B
#endif
movq K, %rax
salq $2, %rax // * 4
leaq (B , %rax, SIZE), B
decq J // j --
jg .L4_10
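/*
* End of one 4-column TRMM panel: when the triangular factor is on the
* right (LEFT not defined), KK just advanced by the panel width above,
* and B is stepped past the K*4 packed values of this panel. Sketch
* (illustrative only):
*
*   if (!left) kk += 4;                          // panel width in N
*   B = (double *)((char *)B + K * 4 * SIZE);    // .L4_100
*/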
/***************************************************************************************************************/
.L2_0:
movq Nmod12, J
testq $2, J
je .L1_0
.L2_10:
movq C, CO1
leaq (C, LDC, 2), C // c += 2 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = m / 4
je .L2_20
ALIGN_4
.L2_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,2), BO // add number of values in B
leaq (AO,%rax,4), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT4x2
sarq $3, %rax // K / 8
je .L2_16
ALIGN_5
.L2_12:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
dec %rax
jne .L2_12
.L2_16:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L2_19
ALIGN_4
.L2_17:
KERNEL4x2_SUB
dec %rax
jne .L2_17
ALIGN_4
.L2_19:
SAVE4x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 2), BO // number of values in B
leaq (AO, %rax, 4), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK // number of values in A
#endif
decq I # i --
jg .L2_11
ALIGN_4
/**************************************************************************
* Rest of M
***************************************************************************/
.L2_20:
// Test rest of M
testq $3, M
jz .L2_100 // finish this panel of N
.L2_30:
testq $2, M
jz .L2_40
ALIGN_4
.L2_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,2), BO // add number of values in B
leaq (AO,%rax,2), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT2x2
sarq $3, %rax
je .L2_36
ALIGN_4
.L2_32:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
dec %rax
jne .L2_32
.L2_36:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L2_39
ALIGN_4
.L2_37:
KERNEL2x2_SUB
dec %rax
jne .L2_37
.L2_39:
SAVE2x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 2), BO // number of values in B
leaq (AO, %rax, 2), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK // number of values in A
#endif
.L2_40:
testq $1, M
jz .L2_100 // finish this panel of N
.L2_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,2), BO // add number of values in B
leaq (AO,%rax,1), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $2, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT1x2
sarq $3,%rax
je .L2_46
ALIGN_4
.L2_42:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
dec %rax
jne .L2_42
.L2_46:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L2_49
ALIGN_4
.L2_47:
KERNEL1x2_SUB
dec %rax
jne .L2_47
.L2_49:
SAVE1x2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 2), BO // number of values in B
leaq (AO, %rax, 1), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK // number of values in A
#endif
.L2_100:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK // number of values in B
#endif
movq K, %rax
salq $1, %rax // * 2
leaq (B , %rax, SIZE), B
/***************************************************************************************************************/
.L1_0:
movq Nmod12, J
testq $1, J
je .L999
.L1_10:
movq C, CO1
leaq (C, LDC, 1), C // c += 1 * ldc
#if defined(TRMMKERNEL) && defined(LEFT)
movq OFFSET, %rax
movq %rax, KK
#endif
movq A, AO // aoffset = a
addq $16 * SIZE, AO
movq M, I
sarq $2, I // i = m / 4
je .L1_20
ALIGN_4
.L1_11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,1), BO // add number of values in B
leaq (AO,%rax,4), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $4, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT4x1
sarq $3, %rax // K / 8
je .L1_16
ALIGN_5
.L1_12:
KERNEL4x1
dec %rax
jne .L1_12
.L1_16:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L1_19
ALIGN_4
.L1_17:
KERNEL4x1_SUB
dec %rax
jne .L1_17
ALIGN_4
.L1_19:
SAVE4x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 1), BO // number of values in B
leaq (AO, %rax, 4), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $4, KK // number of values in A
#endif
decq I # i --
jg .L1_11
/**************************************************************************
* Rest of M
***************************************************************************/
.L1_20:
// Test rest of M
testq $3, M
jz .L1_100
.L1_30:
testq $2, M
jz .L1_40
ALIGN_4
.L1_31:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,1), BO // add number of values in B
leaq (AO,%rax,2), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $2, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT2x1
sarq $3, %rax
je .L1_36
ALIGN_4
.L1_32:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
dec %rax
jne .L1_32
.L1_36:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L1_39
ALIGN_4
.L1_37:
KERNEL2x1_SUB
dec %rax
jne .L1_37
.L1_39:
SAVE2x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 1), BO // number of values in B
leaq (AO, %rax, 2), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, KK // number of values in A
#endif
.L1_40:
testq $1, M
jz .L1_100 // finish this panel of N
.L1_41:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq B, BO
addq $12 * SIZE, BO
#else
movq B, BO
addq $12 * SIZE, BO
movq KK, %rax
salq $3, %rax // rax * SIZE
leaq (BO,%rax,1), BO // add number of values in B
leaq (AO,%rax,1), AO // add number of values in A
#endif
#ifndef TRMMKERNEL
movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movq K, %rax
subq KK, %rax
movq %rax, KKK
#else
movq KK, %rax
#ifdef LEFT
addq $1, %rax // number of values in AO
#else
addq $1, %rax // number of values in BO
#endif
movq %rax, KKK
#endif
INIT1x1
sarq $3,%rax
je .L1_46
ALIGN_4
.L1_42:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
dec %rax
jne .L1_42
.L1_46:
movq KKK, %rax
andq $7, %rax # if (k & 7)
je .L1_49
ALIGN_4
.L1_47:
KERNEL1x1_SUB
dec %rax
jne .L1_47
.L1_49:
SAVE1x1
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movq K, %rax
subq KKK, %rax
salq $3, %rax // rax * SIZE
leaq (BO, %rax, 1), BO // number of values in B
leaq (AO, %rax, 1), AO // number of values in A
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, KK // number of values in A
#endif
.L1_100:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $1, KK // number of values in B
#endif
.L999:
vzeroupper
movq SP, %rsp
movq (%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
vmovups 64(%rsp), %xmm6
vmovups 80(%rsp), %xmm7
vmovups 96(%rsp), %xmm8
vmovups 112(%rsp), %xmm9
vmovups 128(%rsp), %xmm10
vmovups 144(%rsp), %xmm11
vmovups 160(%rsp), %xmm12
vmovups 176(%rsp), %xmm13
vmovups 192(%rsp), %xmm14
vmovups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE
#endif