You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_4x4_barcelona.S 43 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /*
   * Register roles (x86-64, AT&T syntax, run through the C preprocessor).
   *
   * OLD_M/AO both name %rdi and OLD_N/BO both name %rsi: the entry code
   * copies OLD_M and OLD_N into M and N before AO or BO is first written.
   */
  #define OLD_M   %rdi            /* incoming M, copied to M at entry        */
  #define OLD_N   %rsi            /* incoming N, copied to N at entry        */

  #define M       %r13
  #define N       %r14
  #define K       %rdx

  #define A       %rcx            /* entry code biases A and B by 16 * SIZE  */
  #define B       %r8
  #define C       %r9
  #define LDC     %r10            /* shifted left by BASE_SHIFT at entry     */

  #define I       %r11            /* counter initialised from M >> 2         */
  #define AO      %rdi            /* running pointer into A (aliases OLD_M)  */
  #define BO      %rsi            /* running pointer into B (aliases OLD_N)  */
  #define CO1     %r15            /* set to C at the top of .L01             */
  #define CO2     %r12            /* set to C + 2 * LDC at the top of .L01   */
  #define BB      %rbp            /* B + (K << (BASE_SHIFT + 2)); prefetched */
  #define J       %rbx            /* counter initialised from N >> 2         */

  /*
   * Stack frame.  The first 48 bytes always hold the saved
   * rbx/rbp/r12/r13/r14/r15.  With WINDOWS_ABI the frame additionally holds
   * rdi/rsi at 48/56 and xmm6..xmm15 at 64..223, and the OLD_* arguments are
   * read from the caller's frame above STACKSIZE.
   * OFFSET, KK and KKK are only written under TRMMKERNEL.
   */
  #ifndef WINDOWS_ABI

  #define STACKSIZE 96

  #define ALPHA    48(%rsp)
  #define OFFSET   56(%rsp)
  #define KK       64(%rsp)
  #define KKK      72(%rsp)

  #else

  #define STACKSIZE 256

  #define OLD_A           40 + STACKSIZE(%rsp)
  #define OLD_B           48 + STACKSIZE(%rsp)
  #define OLD_C           56 + STACKSIZE(%rsp)
  #define OLD_LDC         64 + STACKSIZE(%rsp)
  #define OLD_OFFSET      72 + STACKSIZE(%rsp)

  #define ALPHA   224(%rsp)
  #define OFFSET  232(%rsp)
  #define KK      240(%rsp)
  #define KKK     248(%rsp)

  #endif

  /*
   * Every movapd/movupd written below is assembled as movaps/movups.
   * NOTE(review): presumably chosen for the shorter encoding - confirm.
   */
  #define movapd  movaps
  #define movupd  movups
  /*
   * KERNEL1(xx) - step 1 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm0 = xmm2 = packed A pair, xmm1 and xmm3 = two broadcast
   *            B values (loaded by the .L11 setup or by KERNEL6/KERNEL8).
   * Reads    : A pair at -14 * SIZE; B values at -14 and -13 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair)  * (four successive B values)
   *            xmm12..xmm15 += (A pair at -14) * (the same four B values)
   * On exit  : xmm0 = xmm2 = A pair at -12 * SIZE, xmm1 = B at -12 * SIZE,
   *            xmm3 = B at -11 * SIZE, i.e. the operands KERNEL2 starts with.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL1(xx) \
          mulpd   %xmm1, %xmm0 ;\
          addpd   %xmm0, %xmm8 ;\
          mulpd   -14 * SIZE(AO, %rax, 4), %xmm1 ;\
          movapd  %xmm2, %xmm0 ;\
          addpd   %xmm1, %xmm12 ;\
          movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -14 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm0, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm1, %xmm0 ;\
          mulpd   -14 * SIZE(AO, %rax, 4), %xmm1 ;\
          addpd   %xmm0, %xmm10 ;\
          movapd  -12 * SIZE(AO, %rax, 4), %xmm0 ;  /* A pair for KERNEL2 */ \
          addpd   %xmm1, %xmm14 ;\
          movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;  /* B value for KERNEL2 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -14 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm0, %xmm2
  /*
   * KERNEL2(xx) - step 2 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm0 = xmm2 = A pair, xmm1 and xmm3 = two broadcast B values
   *            (as left by KERNEL1); xmm4 must already hold the A pair that
   *            KERNEL3 will consume.
   * Reads    : A pair at -10 * SIZE; B values at -10 and -9 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair)  * (four successive B values)
   *            xmm12..xmm15 += (A pair at -10) * (the same four B values)
   * Preloads : xmm6 = A pair at offset 0 and xmm1 = B at offset 0; neither is
   *            consumed until KERNEL5.
   * On exit  : xmm2 = copy of xmm4, xmm3 = B at -7 * SIZE.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL2(xx) \
          mulpd   %xmm1, %xmm0 ;\
          addpd   %xmm0, %xmm8 ;\
          mulpd   -10 * SIZE(AO, %rax, 4), %xmm1 ;\
          movapd  %xmm2, %xmm0 ;\
          addpd   %xmm1, %xmm12 ;\
          movapd  (AO, %rax, 4), %xmm6 ;  /* early A load for KERNEL5 */ \
          movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -10 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm0, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm1, %xmm0 ;\
          mulpd   -10 * SIZE(AO, %rax, 4), %xmm1 ;\
          addpd   %xmm0, %xmm10 ;\
          addpd   %xmm1, %xmm14 ;\
          movddup (BO, %rax, 4), %xmm1 ;  /* early B load for KERNEL5 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -10 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm4, %xmm2
  /*
   * KERNEL3(xx) - step 3 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm4 = xmm2 = A pair, xmm5 and xmm3 = two broadcast B values
   *            (xmm4/xmm5 come from the .L11 setup or from KERNEL8, xmm2/xmm3
   *            from KERNEL2).
   * Reads    : A pair at -6 * SIZE; B values at -6 and -5 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at -6) * (the same four B values)
   * On exit  : xmm4 = xmm2 = A pair at -4 * SIZE, xmm5 = B at -4 * SIZE,
   *            xmm3 = B at -3 * SIZE, i.e. the operands KERNEL4 starts with.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL3(xx) \
          mulpd   %xmm5, %xmm4 ;\
          addpd   %xmm4, %xmm8 ;\
          mulpd   -6 * SIZE(AO, %rax, 4), %xmm5 ;\
          movapd  %xmm2, %xmm4 ;\
          addpd   %xmm5, %xmm12 ;\
          movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -6 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm4, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm5, %xmm4 ;\
          mulpd   -6 * SIZE(AO, %rax, 4), %xmm5 ;\
          addpd   %xmm4, %xmm10 ;\
          movapd  -4 * SIZE(AO, %rax, 4), %xmm4 ;  /* A pair for KERNEL4 */ \
          addpd   %xmm5, %xmm14 ;\
          movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;  /* B value for KERNEL4 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -6 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm4, %xmm2
  /*
   * KERNEL4(xx) - step 4 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm4 = xmm2 = A pair, xmm5 and xmm3 = two broadcast B values
   *            (as left by KERNEL3); xmm6 must already hold the A pair that
   *            KERNEL5 will consume (KERNEL2 loads it).
   * Reads    : A pair at -2 * SIZE; B values at -2 and -1 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at -2) * (the same four B values)
   * Preloads : xmm7 = A pair at 8 * SIZE and xmm5 = B at 8 * SIZE; neither is
   *            consumed until KERNEL7.
   * On exit  : xmm2 = copy of xmm6, xmm3 = B at 1 * SIZE.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL4(xx) \
          mulpd   %xmm5, %xmm4 ;\
          addpd   %xmm4, %xmm8 ;\
          mulpd   -2 * SIZE(AO, %rax, 4), %xmm5 ;\
          movapd  %xmm2, %xmm4 ;\
          addpd   %xmm5, %xmm12 ;\
          movapd  8 * SIZE(AO, %rax, 4), %xmm7 ;  /* early A load for KERNEL7 */ \
          movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -2 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm4, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm5, %xmm4 ;\
          mulpd   -2 * SIZE(AO, %rax, 4), %xmm5 ;\
          addpd   %xmm4, %xmm10 ;\
          addpd   %xmm5, %xmm14 ;\
          movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;  /* early B load for KERNEL7 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -2 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm6, %xmm2
  /*
   * KERNEL5(xx) - step 5 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm6 = xmm2 = A pair, xmm1 and xmm3 = two broadcast B values
   *            (xmm6/xmm1 are loaded by KERNEL2, xmm2/xmm3 are set by KERNEL4).
   * Reads    : A pair at 2 * SIZE; B values at 2 and 3 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at 2)  * (the same four B values)
   * On exit  : xmm6 = xmm2 = A pair at 4 * SIZE, xmm1 = B at 4 * SIZE,
   *            xmm3 = B at 5 * SIZE, i.e. the operands KERNEL6 starts with.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL5(xx) \
          mulpd   %xmm1, %xmm6 ;\
          addpd   %xmm6, %xmm8 ;\
          mulpd   2 * SIZE(AO, %rax, 4), %xmm1 ;\
          movapd  %xmm2, %xmm6 ;\
          addpd   %xmm1, %xmm12 ;\
          movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   2 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm6, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm1, %xmm6 ;\
          mulpd   2 * SIZE(AO, %rax, 4), %xmm1 ;\
          addpd   %xmm6, %xmm10 ;\
          movapd  4 * SIZE(AO, %rax, 4), %xmm6 ;  /* A pair for KERNEL6 */ \
          addpd   %xmm1, %xmm14 ;\
          movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;  /* B value for KERNEL6 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   2 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm6, %xmm2
  /*
   * KERNEL6(xx) - step 6 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm6 = xmm2 = A pair, xmm1 and xmm3 = two broadcast B values
   *            (as left by KERNEL5); xmm7 must already hold the A pair that
   *            KERNEL7 will consume (KERNEL4 loads it).
   * Reads    : A pair at 6 * SIZE; B values at 6 and 7 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at 6)  * (the same four B values)
   * Preloads : xmm0 = A pair at 16 * SIZE and xmm1 = B at 16 * SIZE.  Once
   *            KERNEL8 has advanced %rax these are the -16 * SIZE operands
   *            that the next KERNEL1 expects.
   * On exit  : xmm2 = copy of xmm7, xmm3 = B at 9 * SIZE.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL6(xx) \
          mulpd   %xmm1, %xmm6 ;\
          addpd   %xmm6, %xmm8 ;\
          mulpd   6 * SIZE(AO, %rax, 4), %xmm1 ;\
          movapd  %xmm2, %xmm6 ;\
          addpd   %xmm1, %xmm12 ;\
          movapd  16 * SIZE(AO, %rax, 4), %xmm0 ;  /* early A load for next KERNEL1 */ \
          movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   6 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm6, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm1, %xmm6 ;\
          mulpd   6 * SIZE(AO, %rax, 4), %xmm1 ;\
          addpd   %xmm6, %xmm10 ;\
          addpd   %xmm1, %xmm14 ;\
          movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;  /* early B load for next KERNEL1 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   6 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm7, %xmm2
  /*
   * KERNEL7(xx) - step 7 of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm7 = xmm2 = A pair, xmm5 and xmm3 = two broadcast B values
   *            (xmm7/xmm5 are loaded by KERNEL4, xmm2/xmm3 are set by KERNEL6).
   * Reads    : A pair at 10 * SIZE; B values at 10 and 11 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at 10) * (the same four B values)
   * On exit  : xmm7 = xmm2 = A pair at 12 * SIZE, xmm5 = B at 12 * SIZE,
   *            xmm3 = B at 13 * SIZE, i.e. the operands KERNEL8 starts with.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL7(xx) \
          mulpd   %xmm5, %xmm7 ;\
          addpd   %xmm7, %xmm8 ;\
          mulpd   10 * SIZE(AO, %rax, 4), %xmm5 ;\
          movapd  %xmm2, %xmm7 ;\
          addpd   %xmm5, %xmm12 ;\
          movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   10 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm7, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm5, %xmm7 ;\
          mulpd   10 * SIZE(AO, %rax, 4), %xmm5 ;\
          addpd   %xmm7, %xmm10 ;\
          movapd  12 * SIZE(AO, %rax, 4), %xmm7 ;  /* A pair for KERNEL8 */ \
          addpd   %xmm5, %xmm14 ;\
          movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;  /* B value for KERNEL8 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   10 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm7, %xmm2
  /*
   * KERNEL8(xx) - final step of the 8-step unrolled 4x4 update used in .L12.
   *
   * The argument xx is accepted but never referenced in the body.
   * All memory operands are addressed as disp(AO or BO, %rax, 4).
   *
   * On entry : xmm7 = xmm2 = A pair, xmm5 and xmm3 = two broadcast B values
   *            (as left by KERNEL7); xmm0 must already hold the A pair for
   *            the next KERNEL1 (KERNEL6 loads it).
   * Reads    : A pair at 14 * SIZE; B values at 14 and 15 * SIZE.
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at 14) * (the same four B values)
   * Preloads : xmm4 = A pair at 24 * SIZE, xmm5 = B at 24 * SIZE,
   *            xmm3 = B at 17 * SIZE, xmm2 = copy of xmm0.
   * Finally  : %rax += 8 * SIZE.  Because the index is scaled by 4, every
   *            displacement moves by 32 * SIZE, so the 17/24 preloads become
   *            the -15/-8 operands of the next pass.  The flags set by this
   *            addq are what the je/jl after each KERNEL8 in .L12 test, so
   *            nothing may be appended after it.
   *
   * The macro must not end with a line continuation: a trailing backslash
   * would splice the following source line into this definition.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL8(xx) \
          mulpd   %xmm5, %xmm7 ;\
          addpd   %xmm7, %xmm8 ;\
          mulpd   14 * SIZE(AO, %rax, 4), %xmm5 ;\
          movapd  %xmm2, %xmm7 ;\
          addpd   %xmm5, %xmm12 ;\
          movapd  24 * SIZE(AO, %rax, 4), %xmm4 ;  /* early A load for next KERNEL3 */ \
          movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   14 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm7, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\
          mulpd   %xmm5, %xmm7 ;\
          mulpd   14 * SIZE(AO, %rax, 4), %xmm5 ;\
          addpd   %xmm7, %xmm10 ;\
          addpd   %xmm5, %xmm14 ;\
          movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;  /* early B load for next KERNEL3 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   14 * SIZE(AO, %rax, 4), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\
          movapd  %xmm0, %xmm2 ;\
          addq    $8 * SIZE, %rax
  /*
   * KERNEL_SUB1(xx) - step 1 of the 4-step tail block run after .L15 when
   * bit 2 of the remaining k count is set.
   *
   * The argument xx is accepted but never referenced in the body.
   * Same arithmetic as KERNEL1, but memory operands are addressed directly
   * as disp(AO) / disp(BO) with no index register.
   *
   * On entry : xmm0 = xmm2 = A pair, xmm1 and xmm3 = two broadcast B values.
   * Reads    : A pair at -14 * SIZE(AO); B values at -14 and -13 * SIZE(BO).
   * Updates  : xmm8..xmm11  += (entry A pair)  * (four successive B values)
   *            xmm12..xmm15 += (A pair at -14) * (the same four B values)
   * On exit  : xmm0 = xmm2 = A pair at -12 * SIZE(AO), xmm1 = B at -12,
   *            xmm3 = B at -11, i.e. the operands KERNEL_SUB2 starts with.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL_SUB1(xx) \
          mulpd   %xmm1, %xmm0 ;\
          mulpd   -14 * SIZE(AO), %xmm1 ;\
          addpd   %xmm0, %xmm8 ;\
          movapd  %xmm2, %xmm0 ;\
          addpd   %xmm1, %xmm12 ;\
          movddup -14 * SIZE(BO), %xmm1 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -14 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm0, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -13 * SIZE(BO), %xmm3 ;\
          mulpd   %xmm1, %xmm0 ;\
          mulpd   -14 * SIZE(AO), %xmm1 ;\
          addpd   %xmm0, %xmm10 ;\
          movapd  -12 * SIZE(AO), %xmm0 ;  /* A pair for KERNEL_SUB2 */ \
          addpd   %xmm1, %xmm14 ;\
          movddup -12 * SIZE(BO), %xmm1 ;  /* B value for KERNEL_SUB2 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -14 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup -11 * SIZE(BO), %xmm3 ;\
          movapd  %xmm0, %xmm2
  /*
   * KERNEL_SUB2(xx) - step 2 of the 4-step tail block run after .L15.
   *
   * The argument xx is accepted but never referenced in the body.
   * Memory operands are addressed directly as disp(AO) / disp(BO).
   *
   * On entry : xmm0 = xmm2 = A pair, xmm1 and xmm3 = two broadcast B values
   *            (as left by KERNEL_SUB1); xmm4 must already hold the A pair
   *            that KERNEL_SUB3 will consume.
   * Reads    : A pair at -10 * SIZE(AO); B values at -10 and -9 * SIZE(BO).
   * Updates  : xmm8..xmm11  += (entry A pair)  * (four successive B values)
   *            xmm12..xmm15 += (A pair at -10) * (the same four B values)
   * Preloads : xmm0 = A pair at (AO) and xmm1 = B at (BO).  The tail block
   *            does not use them; after the caller advances AO and BO by
   *            16 * SIZE they are the -16 * SIZE operands for the .L17 loop.
   * On exit  : xmm2 = copy of xmm4, xmm3 = B at -7 * SIZE(BO).
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL_SUB2(xx) \
          mulpd   %xmm1, %xmm0 ;\
          mulpd   -10 * SIZE(AO), %xmm1 ;\
          addpd   %xmm0, %xmm8 ;\
          movapd  %xmm2, %xmm0 ;\
          addpd   %xmm1, %xmm12 ;\
          movddup -10 * SIZE(BO), %xmm1 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -10 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm0, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -9 * SIZE(BO), %xmm3 ;\
          mulpd   %xmm1, %xmm0 ;\
          mulpd   -10 * SIZE(AO), %xmm1 ;\
          addpd   %xmm0, %xmm10 ;\
          movapd  (AO), %xmm0 ;  /* A pair kept for the code after the tail block */ \
          addpd   %xmm1, %xmm14 ;\
          movddup (BO), %xmm1 ;  /* B value kept for the code after the tail block */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -10 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup -7 * SIZE(BO), %xmm3 ;\
          movapd  %xmm4, %xmm2
  /*
   * KERNEL_SUB3(xx) - step 3 of the 4-step tail block run after .L15.
   *
   * The argument xx is accepted but never referenced in the body.
   * Memory operands are addressed directly as disp(AO) / disp(BO).
   *
   * On entry : xmm4 = xmm2 = A pair, xmm5 and xmm3 = two broadcast B values
   *            (xmm2/xmm3 are set by KERNEL_SUB2; xmm4/xmm5 were loaded
   *            before the tail block was entered).
   * Reads    : A pair at -6 * SIZE(AO); B values at -6 and -5 * SIZE(BO).
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at -6) * (the same four B values)
   * On exit  : xmm4 = xmm2 = A pair at -4 * SIZE(AO), xmm5 = B at -4,
   *            xmm3 = B at -3, i.e. the operands KERNEL_SUB4 starts with.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL_SUB3(xx) \
          mulpd   %xmm5, %xmm4 ;\
          mulpd   -6 * SIZE(AO), %xmm5 ;\
          addpd   %xmm4, %xmm8 ;\
          movapd  %xmm2, %xmm4 ;\
          addpd   %xmm5, %xmm12 ;\
          movddup -6 * SIZE(BO), %xmm5 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -6 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm4, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -5 * SIZE(BO), %xmm3 ;\
          mulpd   %xmm5, %xmm4 ;\
          mulpd   -6 * SIZE(AO), %xmm5 ;\
          addpd   %xmm4, %xmm10 ;\
          movapd  -4 * SIZE(AO), %xmm4 ;  /* A pair for KERNEL_SUB4 */ \
          addpd   %xmm5, %xmm14 ;\
          movddup -4 * SIZE(BO), %xmm5 ;  /* B value for KERNEL_SUB4 */ \
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -6 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup -3 * SIZE(BO), %xmm3 ;\
          movapd  %xmm4, %xmm2
  /*
   * KERNEL_SUB4(xx) - last step of the 4-step tail block run after .L15.
   *
   * The argument xx is accepted but never referenced in the body.
   * Memory operands are addressed directly as disp(AO) / disp(BO).
   *
   * On entry : xmm4 = xmm2 = A pair, xmm5 and xmm3 = two broadcast B values
   *            (as left by KERNEL_SUB3); xmm0 holds the A pair that
   *            KERNEL_SUB2 loaded from (AO).
   * Reads    : A pair at -2 * SIZE(AO); B values at -2 and -1 * SIZE(BO).
   * Updates  : xmm8..xmm11  += (entry A pair) * (four successive B values)
   *            xmm12..xmm15 += (A pair at -2) * (the same four B values)
   * On exit  : xmm2 = copy of xmm0, xmm3 = B at 1 * SIZE(BO).  Unlike
   *            KERNEL4, xmm4 and xmm5 are NOT reloaded, so they hold
   *            consumed products on exit.
   *
   * The caller advances AO and BO by 16 * SIZE after this macro, which
   * turns the (AO), (BO) and 1 * SIZE(BO) preloads into the -16/-16/-15
   * operands that the .L17 remainder loop starts from.
   *
   * The instruction order is hand-scheduled; do not reorder.
   */
  #define KERNEL_SUB4(xx) \
          mulpd   %xmm5, %xmm4 ;\
          mulpd   -2 * SIZE(AO), %xmm5 ;\
          addpd   %xmm4, %xmm8 ;\
          movapd  %xmm2, %xmm4 ;\
          addpd   %xmm5, %xmm12 ;\
          movddup -2 * SIZE(BO), %xmm5 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -2 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm9 ;\
          movapd  %xmm4, %xmm2 ;\
          addpd   %xmm3, %xmm13 ;\
          movddup -1 * SIZE(BO), %xmm3 ;\
          mulpd   %xmm5, %xmm4 ;\
          mulpd   -2 * SIZE(AO), %xmm5 ;\
          addpd   %xmm4, %xmm10 ;\
          addpd   %xmm5, %xmm14 ;\
          mulpd   %xmm3, %xmm2 ;\
          mulpd   -2 * SIZE(AO), %xmm3 ;\
          addpd   %xmm2, %xmm11 ;\
          addpd   %xmm3, %xmm15 ;\
          movddup 1 * SIZE(BO), %xmm3 ;\
          movapd  %xmm0, %xmm2
  375. PROLOGUE
  376. PROFCODE
  377. subq $STACKSIZE, %rsp
  378. movq %rbx, (%rsp)
  379. movq %rbp, 8(%rsp)
  380. movq %r12, 16(%rsp)
  381. movq %r13, 24(%rsp)
  382. movq %r14, 32(%rsp)
  383. movq %r15, 40(%rsp)
  384. #ifdef WINDOWS_ABI
  385. movq %rdi, 48(%rsp)
  386. movq %rsi, 56(%rsp)
  387. movups %xmm6, 64(%rsp)
  388. movups %xmm7, 80(%rsp)
  389. movups %xmm8, 96(%rsp)
  390. movups %xmm9, 112(%rsp)
  391. movups %xmm10, 128(%rsp)
  392. movups %xmm11, 144(%rsp)
  393. movups %xmm12, 160(%rsp)
  394. movups %xmm13, 176(%rsp)
  395. movups %xmm14, 192(%rsp)
  396. movups %xmm15, 208(%rsp)
  397. movq ARG1, OLD_M
  398. movq ARG2, OLD_N
  399. movq ARG3, K
  400. movq OLD_A, A
  401. movq OLD_B, B
  402. movq OLD_C, C
  403. movq OLD_LDC, LDC
  404. #ifdef TRMMKERNEL
  405. movsd OLD_OFFSET, %xmm12
  406. #endif
  407. movaps %xmm3, %xmm0
  408. #else
  409. movq STACKSIZE + 8(%rsp), LDC
  410. #ifdef TRMMKERNEL
  411. movsd STACKSIZE + 16(%rsp), %xmm12
  412. #endif
  413. #endif
  414. movq OLD_M, M
  415. movq OLD_N, N
  416. subq $-16 * SIZE, A
  417. subq $-16 * SIZE, B
  418. movsd %xmm0, ALPHA
  419. salq $BASE_SHIFT, LDC
  420. #ifdef TRMMKERNEL
  421. movsd %xmm12, OFFSET
  422. movsd %xmm12, KK
  423. #ifndef LEFT
  424. negq KK
  425. #endif
  426. #endif
  427. movq N, J
  428. sarq $2, J # j = (n >> 2)
  429. jle .L40
  430. ALIGN_4
  431. .L01:
  432. movq C, CO1 # coffset1 = c
  433. leaq (C, LDC, 2), CO2 # coffset2 = c + ldc
  434. leaq (C, LDC, 4), C # c += 4 * ldc
  435. #if defined(TRMMKERNEL) && defined(LEFT)
  436. movq OFFSET, %rax
  437. movq %rax, KK
  438. #endif
  439. movq A, AO # aoffset = a
  440. movq K, %rax
  441. salq $BASE_SHIFT + 2, %rax
  442. leaq (B, %rax), BB
  443. movq M, I
  444. sarq $2, I # i = (m >> 2)
  445. jle .L20
  446. ALIGN_4
  447. .L11:
  448. #if !defined(TRMMKERNEL) || \
  449. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  450. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  451. movq B, BO
  452. #else
  453. movq KK, %rax
  454. leaq (, %rax, SIZE), %rax
  455. leaq (AO, %rax, 4), AO
  456. leaq (B, %rax, 4), BO
  457. #endif
  458. movapd -16 * SIZE(AO), %xmm0
  459. xorps %xmm8, %xmm8
  460. movddup -16 * SIZE(BO), %xmm1
  461. xorps %xmm9, %xmm9
  462. movddup -15 * SIZE(BO), %xmm3
  463. xorps %xmm10, %xmm10
  464. movapd -8 * SIZE(AO), %xmm4
  465. xorps %xmm11, %xmm11
  466. movddup -8 * SIZE(BO), %xmm5
  467. xorps %xmm12, %xmm12
  468. prefetchw 3 * SIZE(CO1)
  469. xorps %xmm13, %xmm13
  470. prefetchw 7 * SIZE(CO1, LDC)
  471. xorps %xmm14, %xmm14
  472. prefetchw 3 * SIZE(CO2)
  473. xorps %xmm15, %xmm15
  474. prefetchw 7 * SIZE(CO2, LDC)
  475. movapd %xmm0, %xmm2
  476. prefetch -16 * SIZE(BB)
  477. #ifndef TRMMKERNEL
  478. movq K, %rax
  479. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  480. movq K, %rax
  481. subq KK, %rax
  482. movq %rax, KKK
  483. #else
  484. movq KK, %rax
  485. #ifdef LEFT
  486. addq $4, %rax
  487. #else
  488. addq $4, %rax
  489. #endif
  490. movq %rax, KKK
  491. #endif
  492. andq $-8, %rax
  493. salq $BASE_SHIFT, %rax
  494. leaq (AO, %rax, 4), AO
  495. leaq (BO, %rax, 4), BO
  496. negq %rax
  497. NOBRANCH
  498. je .L15
  499. ALIGN_4
  500. .L12:
  501. KERNEL1(16 * 0)
  502. KERNEL2(16 * 0)
  503. KERNEL3(16 * 0)
  504. KERNEL4(16 * 0)
  505. KERNEL5(16 * 0)
  506. KERNEL6(16 * 0)
  507. KERNEL7(16 * 0)
  508. KERNEL8(16 * 0)
  509. NOBRANCH
  510. je .L15
  511. KERNEL1(16 * 0)
  512. KERNEL2(16 * 0)
  513. KERNEL3(16 * 0)
  514. KERNEL4(16 * 0)
  515. KERNEL5(16 * 0)
  516. KERNEL6(16 * 0)
  517. KERNEL7(16 * 0)
  518. KERNEL8(16 * 0)
  519. NOBRANCH
  520. je .L15
  521. KERNEL1(16 * 0)
  522. KERNEL2(16 * 0)
  523. KERNEL3(16 * 0)
  524. KERNEL4(16 * 0)
  525. KERNEL5(16 * 0)
  526. KERNEL6(16 * 0)
  527. KERNEL7(16 * 0)
  528. KERNEL8(16 * 0)
  529. NOBRANCH
  530. je .L15
  531. KERNEL1(16 * 0)
  532. KERNEL2(16 * 0)
  533. KERNEL3(16 * 0)
  534. KERNEL4(16 * 0)
  535. KERNEL5(16 * 0)
  536. KERNEL6(16 * 0)
  537. KERNEL7(16 * 0)
  538. KERNEL8(16 * 0)
  539. NOBRANCH
  540. je .L15
  541. KERNEL1(16 * 0)
  542. KERNEL2(16 * 0)
  543. KERNEL3(16 * 0)
  544. KERNEL4(16 * 0)
  545. KERNEL5(16 * 0)
  546. KERNEL6(16 * 0)
  547. KERNEL7(16 * 0)
  548. KERNEL8(16 * 0)
  549. NOBRANCH
  550. je .L15
  551. KERNEL1(16 * 0)
  552. KERNEL2(16 * 0)
  553. KERNEL3(16 * 0)
  554. KERNEL4(16 * 0)
  555. KERNEL5(16 * 0)
  556. KERNEL6(16 * 0)
  557. KERNEL7(16 * 0)
  558. KERNEL8(16 * 0)
  559. NOBRANCH
  560. je .L15
  561. KERNEL1(16 * 0)
  562. KERNEL2(16 * 0)
  563. KERNEL3(16 * 0)
  564. KERNEL4(16 * 0)
  565. KERNEL5(16 * 0)
  566. KERNEL6(16 * 0)
  567. KERNEL7(16 * 0)
  568. KERNEL8(16 * 0)
  569. NOBRANCH
  570. je .L15
  571. KERNEL1(16 * 0)
  572. KERNEL2(16 * 0)
  573. KERNEL3(16 * 0)
  574. KERNEL4(16 * 0)
  575. KERNEL5(16 * 0)
  576. KERNEL6(16 * 0)
  577. KERNEL7(16 * 0)
  578. KERNEL8(16 * 0)
  579. jl .L12
  580. ALIGN_4
  581. .L15:
  582. movddup ALPHA, %xmm7
  583. #ifndef TRMMKERNEL
  584. movq K, %rax
  585. #else
  586. movq KKK, %rax
  587. #endif
  588. testq $4, %rax
  589. je .L16
  590. ALIGN_4
  591. KERNEL_SUB1(16 * 0)
  592. KERNEL_SUB2(16 * 0)
  593. KERNEL_SUB3(16 * 0)
  594. KERNEL_SUB4(16 * 0)
  595. subq $-16 * SIZE, BO
  596. subq $-16 * SIZE, AO
  597. ALIGN_4
  598. .L16:
  599. #ifndef TRMMKERNEL
  600. movq K, %rax
  601. #else
  602. movq KKK, %rax
  603. #endif
  604. andq $3, %rax # if (k & 1)
  605. je .L19
  606. leaq (, %rax, SIZE), %rax
  607. leaq (AO, %rax, 4), AO
  608. leaq (BO, %rax, 4), BO
  609. negq %rax
  610. ALIGN_4
  611. .L17:
  612. mulpd %xmm1, %xmm0
  613. mulpd -14 * SIZE(AO, %rax, 4), %xmm1
  614. addpd %xmm0, %xmm8
  615. movapd %xmm2, %xmm0
  616. addpd %xmm1, %xmm12
  617. movddup -14 * SIZE(BO, %rax, 4), %xmm1
  618. mulpd %xmm3, %xmm2
  619. mulpd -14 * SIZE(AO, %rax, 4), %xmm3
  620. addpd %xmm2, %xmm9
  621. movapd %xmm0, %xmm2
  622. addpd %xmm3, %xmm13
  623. movddup -13 * SIZE(BO, %rax, 4), %xmm3
  624. mulpd %xmm1, %xmm0
  625. mulpd -14 * SIZE(AO, %rax, 4), %xmm1
  626. addpd %xmm0, %xmm10
  627. movapd -12 * SIZE(AO, %rax, 4), %xmm0
  628. addpd %xmm1, %xmm14
  629. movddup -12 * SIZE(BO, %rax, 4), %xmm1
  630. mulpd %xmm3, %xmm2
  631. mulpd -14 * SIZE(AO, %rax, 4), %xmm3
  632. addpd %xmm2, %xmm11
  633. addpd %xmm3, %xmm15
  634. movddup -11 * SIZE(BO, %rax, 4), %xmm3
  635. movapd %xmm0, %xmm2
  636. addq $SIZE, %rax
  637. jl .L17
  638. ALIGN_4
  639. .L19:
  640. prefetch -8 * SIZE(BB)
  641. subq $-16 * SIZE, BB
  642. #ifndef TRMMKERNEL
  643. movupd (CO1), %xmm0
  644. movupd 2 * SIZE(CO1), %xmm1
  645. #endif
  646. mulpd %xmm7, %xmm8
  647. mulpd %xmm7, %xmm12
  648. #ifndef TRMMKERNEL
  649. addpd %xmm0, %xmm8
  650. addpd %xmm1, %xmm12
  651. #endif
  652. movsd %xmm8, (CO1)
  653. movhps %xmm8, 1 * SIZE(CO1)
  654. movsd %xmm12, 2 * SIZE(CO1)
  655. movhps %xmm12, 3 * SIZE(CO1)
  656. #ifndef TRMMKERNEL
  657. movupd (CO1, LDC), %xmm2
  658. movupd 2 * SIZE(CO1, LDC), %xmm3
  659. #endif
  660. mulpd %xmm7, %xmm9
  661. mulpd %xmm7, %xmm13
  662. #ifndef TRMMKERNEL
  663. addpd %xmm2, %xmm9
  664. addpd %xmm3, %xmm13
  665. #endif
  666. movsd %xmm9, (CO1, LDC)
  667. movhps %xmm9, 1 * SIZE(CO1, LDC)
  668. movsd %xmm13, 2 * SIZE(CO1, LDC)
  669. movhps %xmm13, 3 * SIZE(CO1, LDC)
  670. #ifndef TRMMKERNEL
  671. movupd (CO2), %xmm0
  672. movupd 2 * SIZE(CO2), %xmm1
  673. #endif
  674. mulpd %xmm7, %xmm10
  675. mulpd %xmm7, %xmm14
  676. #ifndef TRMMKERNEL
  677. addpd %xmm0, %xmm10
  678. addpd %xmm1, %xmm14
  679. #endif
  680. movsd %xmm10, (CO2)
  681. movhps %xmm10, 1 * SIZE(CO2)
  682. movsd %xmm14, 2 * SIZE(CO2)
  683. movhps %xmm14, 3 * SIZE(CO2)
  684. #ifndef TRMMKERNEL
  685. movupd (CO2, LDC), %xmm2
  686. movupd 2 * SIZE(CO2, LDC), %xmm3
  687. #endif
  688. mulpd %xmm7, %xmm11
  689. mulpd %xmm7, %xmm15
  690. #ifndef TRMMKERNEL
  691. addpd %xmm2, %xmm11
  692. addpd %xmm3, %xmm15
  693. #endif
  694. movsd %xmm11, (CO2, LDC)
  695. movhps %xmm11, 1 * SIZE(CO2, LDC)
  696. movsd %xmm15, 2 * SIZE(CO2, LDC)
  697. movhps %xmm15, 3 * SIZE(CO2, LDC)
  698. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  699. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  700. movq K, %rax
  701. subq KKK, %rax
  702. leaq (,%rax, SIZE), %rax
  703. leaq (AO, %rax, 4), AO
  704. leaq (BO, %rax, 4), BO
  705. #endif
  706. #if defined(TRMMKERNEL) && defined(LEFT)
  707. addq $4, KK
  708. #endif
  709. addq $4 * SIZE, CO1 # coffset += 4
  710. addq $4 * SIZE, CO2 # coffset += 4
  711. decq I # i --
  712. BRANCH
  713. jg .L11
  714. ALIGN_4
  715. .L20:
  716. testq $3, M
  717. je .L39
  718. testq $2, M
  719. je .L30
  720. ALIGN_4
  721. .L21:
  722. #if !defined(TRMMKERNEL) || \
  723. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  724. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  725. movq B, BO
  726. #else
  727. movq KK, %rax
  728. leaq (, %rax, SIZE), %rax
  729. leaq (AO, %rax, 2), AO
  730. leaq (B, %rax, 4), BO
  731. #endif
  732. movapd -16 * SIZE(AO), %xmm0
  733. xorps %xmm8, %xmm8
  734. movapd -12 * SIZE(AO), %xmm2
  735. xorps %xmm9, %xmm9
  736. movddup -16 * SIZE(BO), %xmm1
  737. xorps %xmm10, %xmm10
  738. movddup -15 * SIZE(BO), %xmm5
  739. xorps %xmm11, %xmm11
  740. movddup -8 * SIZE(BO), %xmm3
  741. #ifndef TRMMKERNEL
  742. movq K, %rax
  743. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  744. movq K, %rax
  745. subq KK, %rax
  746. movq %rax, KKK
  747. #else
  748. movq KK, %rax
  749. #ifdef LEFT
  750. addq $2, %rax
  751. #else
  752. addq $4, %rax
  753. #endif
  754. movq %rax, KKK
  755. #endif
  756. andq $-4, %rax
  757. leaq (, %rax, SIZE), %rax
  758. leaq (AO, %rax, 2), AO
  759. leaq (BO, %rax, 4), BO
  760. negq %rax
  761. NOBRANCH
  762. je .L26
  763. ALIGN_4
  764. .L22: # main loop of the 2 x 4 tile, unrolled 4x: per pass rax advances 4 * SIZE, i.e. 8 * SIZE bytes of AO (scale 2) and 16 * SIZE bytes of BO (scale 4)
  765. mulpd %xmm0, %xmm1 # step 0: packed-double pair from AO times one broadcast value from BO
  766. addpd %xmm1, %xmm8 # xmm8, xmm9, xmm10, xmm11 accumulate the products for BO values 0, 1, 2, 3 of every step
  767. movddup -14 * SIZE(BO, %rax, 4), %xmm1
  768. mulpd %xmm0, %xmm5
  769. addpd %xmm5, %xmm9
  770. movddup -13 * SIZE(BO, %rax, 4), %xmm5
  771. mulpd %xmm0, %xmm1
  772. addpd %xmm1, %xmm10
  773. movddup -12 * SIZE(BO, %rax, 4), %xmm1
  774. mulpd %xmm0, %xmm5
  775. movapd -14 * SIZE(AO, %rax, 2), %xmm0 # AO pair for step 1
  776. addpd %xmm5, %xmm11
  777. movddup -11 * SIZE(BO, %rax, 4), %xmm5
  778. mulpd %xmm0, %xmm1 # step 1
  779. addpd %xmm1, %xmm8
  780. movddup -10 * SIZE(BO, %rax, 4), %xmm1
  781. mulpd %xmm0, %xmm5
  782. addpd %xmm5, %xmm9
  783. movddup -9 * SIZE(BO, %rax, 4), %xmm5
  784. mulpd %xmm0, %xmm1
  785. addpd %xmm1, %xmm10
  786. movddup (BO, %rax, 4), %xmm1 # preload for the next pass: this is offset -16 * SIZE once rax has advanced
  787. mulpd %xmm0, %xmm5
  788. movapd -8 * SIZE(AO, %rax, 2), %xmm0 # preload for the next pass: offset -16 * SIZE after the advance
  789. addpd %xmm5, %xmm11
  790. movddup -7 * SIZE(BO, %rax, 4), %xmm5
  791. mulpd %xmm2, %xmm3 # step 2: xmm2 and xmm3 were loaded one pass ahead (or before the loop)
  792. addpd %xmm3, %xmm8
  793. movddup -6 * SIZE(BO, %rax, 4), %xmm3
  794. mulpd %xmm2, %xmm5
  795. addpd %xmm5, %xmm9
  796. movddup -5 * SIZE(BO, %rax, 4), %xmm5
  797. mulpd %xmm2, %xmm3
  798. addpd %xmm3, %xmm10
  799. movddup -4 * SIZE(BO, %rax, 4), %xmm3
  800. mulpd %xmm2, %xmm5
  801. movapd -10 * SIZE(AO, %rax, 2), %xmm2 # AO pair for step 3
  802. addpd %xmm5, %xmm11
  803. movddup -3 * SIZE(BO, %rax, 4), %xmm5
  804. mulpd %xmm2, %xmm3 # step 3
  805. addpd %xmm3, %xmm8
  806. movddup -2 * SIZE(BO, %rax, 4), %xmm3
  807. mulpd %xmm2, %xmm5
  808. addpd %xmm5, %xmm9
  809. movddup -1 * SIZE(BO, %rax, 4), %xmm5
  810. mulpd %xmm2, %xmm3
  811. addpd %xmm3, %xmm10
  812. movddup 8 * SIZE(BO, %rax, 4), %xmm3 # preload for the next pass: offset -8 * SIZE after the advance
  813. mulpd %xmm2, %xmm5
  814. movapd -4 * SIZE(AO, %rax, 2), %xmm2 # preload for the next pass: offset -12 * SIZE after the advance
  815. addpd %xmm5, %xmm11
  816. movddup 1 * SIZE(BO, %rax, 4), %xmm5 # preload for the next pass: offset -15 * SIZE after the advance
  817. addq $4 * SIZE, %rax # rax is a negative offset counting up toward zero
  818. BRANCH
  819. jl .L22 # repeat while rax is still negative
  820. ALIGN_4
  821. .L26: # after the unrolled loop of the 2 x 4 tile: load alpha, then run the steps the 4x loop did not cover
  822. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  823. #ifndef TRMMKERNEL
  824. movq K, %rax
  825. #else
  826. movq KKK, %rax
  827. #endif
  828. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  829. je .L29 # no leftover steps
  830. leaq (, %rax, SIZE), %rax # rax = leftover steps * SIZE
  831. leaq (AO, %rax, 2), AO # move AO and BO past the leftover data ...
  832. leaq (BO, %rax, 4), BO
  833. negq %rax # ... and index back into it with a negative offset
  834. ALIGN_4
  835. .L27: # one step per pass; same multiply-accumulate pattern as step 0 of .L22
  836. mulpd %xmm0, %xmm1
  837. addpd %xmm1, %xmm8
  838. movddup -14 * SIZE(BO, %rax, 4), %xmm1
  839. mulpd %xmm0, %xmm5
  840. addpd %xmm5, %xmm9
  841. movddup -13 * SIZE(BO, %rax, 4), %xmm5
  842. mulpd %xmm0, %xmm1
  843. addpd %xmm1, %xmm10
  844. movddup -12 * SIZE(BO, %rax, 4), %xmm1 # first BO value of the following step
  845. mulpd %xmm0, %xmm5
  846. movapd -14 * SIZE(AO, %rax, 2), %xmm0 # AO pair of the following step
  847. addpd %xmm5, %xmm11
  848. movddup -11 * SIZE(BO, %rax, 4), %xmm5 # second BO value of the following step
  849. addq $SIZE, %rax # advance by one step
  850. jl .L27 # repeat while rax is still negative
  851. ALIGN_4
  852. .L29: # write-back of the 2 x 4 tile: scale the accumulators by alpha, add the old C values unless TRMMKERNEL, store
  853. #ifndef TRMMKERNEL
  854. movupd (CO1), %xmm0 # current C pairs at the four destinations (unaligned loads)
  855. movupd (CO1, LDC), %xmm2
  856. movupd (CO2), %xmm4
  857. movupd (CO2, LDC), %xmm6
  858. #endif
  859. mulpd %xmm7, %xmm8 # xmm7 holds the broadcast alpha
  860. mulpd %xmm7, %xmm9
  861. mulpd %xmm7, %xmm10
  862. mulpd %xmm7, %xmm11
  863. #ifndef TRMMKERNEL
  864. addpd %xmm0, %xmm8 # accumulate onto the C values loaded above
  865. addpd %xmm2, %xmm9
  866. addpd %xmm4, %xmm10
  867. addpd %xmm6, %xmm11
  868. #endif
  869. movsd %xmm8, (CO1) # each result pair is stored as two separate 64-bit halves
  870. movhps %xmm8, 1 * SIZE(CO1)
  871. movsd %xmm9, (CO1, LDC)
  872. movhps %xmm9, 1 * SIZE(CO1, LDC)
  873. movsd %xmm10, (CO2)
  874. movhps %xmm10, 1 * SIZE(CO2)
  875. movsd %xmm11, (CO2, LDC)
  876. movhps %xmm11, 1 * SIZE(CO2, LDC)
  877. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  878. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  879. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  880. subq KKK, %rax
  881. leaq (,%rax, SIZE), %rax
  882. leaq (AO, %rax, 2), AO
  883. leaq (BO, %rax, 4), BO
  884. #endif
  885. #if defined(TRMMKERNEL) && defined(LEFT)
  886. addq $2, KK # KK grows by the 2 elements handled along AO
  887. #endif
  888. addq $2 * SIZE, CO1 # both output pointers move past the 2 elements just written
  889. addq $2 * SIZE, CO2
  890. ALIGN_4
  891. .L30: # 1 x 4 tile, run only when bit 0 of M is set: one AO value per step against four BO values
  892. testq $1, M
  893. je .L39 # bit 0 of M clear: nothing to do
  894. #if !defined(TRMMKERNEL) || \
  895. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  896. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  897. movq B, BO
  898. #else
  899. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  900. leaq (, %rax, SIZE), %rax
  901. leaq (AO, %rax, 1), AO
  902. leaq (B, %rax, 4), BO
  903. #endif
  904. movddup -16 * SIZE(AO), %xmm0 # broadcast AO value of step 0
  905. xorps %xmm8, %xmm8 # clear the four accumulators xmm8..xmm11
  906. movddup -14 * SIZE(AO), %xmm2 # broadcast AO value of step 2
  907. xorps %xmm9, %xmm9
  908. movddup -15 * SIZE(AO), %xmm4 # broadcast AO value of step 1
  909. xorps %xmm10, %xmm10
  910. movapd -16 * SIZE(BO), %xmm1 # first BO pair of step 0
  911. xorps %xmm11, %xmm11
  912. movapd -8 * SIZE(BO), %xmm3 # first BO pair of step 2
  913. #ifndef TRMMKERNEL
  914. movq K, %rax
  915. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  916. movq K, %rax
  917. subq KK, %rax # step count = K - KK
  918. movq %rax, KKK
  919. #else
  920. movq KK, %rax
  921. #ifdef LEFT
  922. addq $1, %rax # step count = KK + 1 (tile extent along AO)
  923. #else
  924. addq $4, %rax # step count = KK + 4 (tile extent along BO)
  925. #endif
  926. movq %rax, KKK
  927. #endif
  928. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  929. leaq (, %rax, SIZE), %rax
  930. leaq (AO, %rax, 1), AO # move AO and BO past the data of the unrolled part ...
  931. leaq (BO, %rax, 4), BO
  932. negq %rax # ... and index back with a negative offset; also sets ZF when the count is zero
  933. NOBRANCH
  934. je .L36 # fewer than 4 steps: skip the unrolled loop
  935. ALIGN_4
  936. .L32: # 4x unrolled loop: steps 0 and 2 feed xmm8 / xmm9, steps 1 and 3 feed xmm10 / xmm11
  937. mulpd %xmm0, %xmm1 # step 0
  938. mulpd -14 * SIZE(BO, %rax, 4), %xmm0
  939. addpd %xmm1, %xmm8
  940. movapd -12 * SIZE(BO, %rax, 4), %xmm1
  941. addpd %xmm0, %xmm9
  942. movddup -12 * SIZE(AO, %rax, 1), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  943. mulpd %xmm4, %xmm1 # step 1
  944. mulpd -10 * SIZE(BO, %rax, 4), %xmm4
  945. addpd %xmm1, %xmm10
  946. movapd (BO, %rax, 4), %xmm1 # preload for the next pass: offset -16 * SIZE after rax advances
  947. addpd %xmm4, %xmm11
  948. movddup -11 * SIZE(AO, %rax, 1), %xmm4 # preload for the next pass: offset -15 * SIZE after rax advances
  949. mulpd %xmm2, %xmm3 # step 2
  950. mulpd -6 * SIZE(BO, %rax, 4), %xmm2
  951. addpd %xmm3, %xmm8
  952. movapd -4 * SIZE(BO, %rax, 4), %xmm3
  953. addpd %xmm2, %xmm9
  954. movddup -13 * SIZE(AO, %rax, 1), %xmm2 # AO value of step 3
  955. mulpd %xmm2, %xmm3 # step 3
  956. mulpd -2 * SIZE(BO, %rax, 4), %xmm2
  957. addpd %xmm3, %xmm10
  958. movapd 8 * SIZE(BO, %rax, 4), %xmm3 # preload for the next pass: offset -8 * SIZE after rax advances
  959. addpd %xmm2, %xmm11
  960. movddup -10 * SIZE(AO, %rax, 1), %xmm2 # preload for the next pass: offset -14 * SIZE after rax advances
  961. addq $4 * SIZE, %rax
  962. BRANCH
  963. jl .L32 # repeat while rax is still negative
  964. ALIGN_4
  965. .L36:
  966. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  967. #ifndef TRMMKERNEL
  968. movq K, %rax
  969. #else
  970. movq KKK, %rax
  971. #endif
  972. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  973. je .L38
  974. leaq (, %rax, SIZE), %rax
  975. leaq (AO, %rax, 1), AO
  976. leaq (BO, %rax, 4), BO
  977. negq %rax
  978. ALIGN_4
  979. .L37: # one step per pass, accumulating into xmm8 / xmm9 only
  980. mulpd %xmm0, %xmm1
  981. mulpd -14 * SIZE(BO, %rax, 4), %xmm0
  982. addpd %xmm1, %xmm8
  983. movapd -12 * SIZE(BO, %rax, 4), %xmm1 # first BO pair of the following step
  984. addpd %xmm0, %xmm9
  985. movddup -15 * SIZE(AO, %rax, 1), %xmm0 # AO value of the following step
  986. addq $SIZE, %rax
  987. jl .L37
  988. ALIGN_4
  989. .L38: # write-back of the 1 x 4 tile
  990. addpd %xmm10, %xmm8 # fold the second accumulator pair into the first
  991. addpd %xmm11, %xmm9
  992. #ifndef TRMMKERNEL
  993. movsd (CO1), %xmm0 # gather the four current C elements, two per register
  994. movhps (CO1, LDC), %xmm0
  995. movsd (CO2), %xmm1
  996. movhps (CO2, LDC), %xmm1
  997. #endif
  998. mulpd %xmm7, %xmm8 # scale by alpha
  999. mulpd %xmm7, %xmm9
  1000. #ifndef TRMMKERNEL
  1001. addpd %xmm0, %xmm8
  1002. addpd %xmm1, %xmm9
  1003. #endif
  1004. movsd %xmm8, (CO1) # scatter: one element to each of the four destinations
  1005. movhps %xmm8, (CO1, LDC)
  1006. movsd %xmm9, (CO2)
  1007. movhps %xmm9, (CO2, LDC)
  1008. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1009. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1010. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  1011. subq KKK, %rax
  1012. leaq (,%rax, SIZE), %rax
  1013. leaq (AO, %rax, 1), AO
  1014. leaq (BO, %rax, 4), BO
  1015. #endif
  1016. #if defined(TRMMKERNEL) && defined(LEFT)
  1017. addq $1, KK
  1018. #endif
  1019. ALIGN_4
  1020. .L39: # end of one 4-wide panel: update KK for the non-LEFT TRMM case, carry BO over to B, loop on J
  1021. #if defined(TRMMKERNEL) && !defined(LEFT)
  1022. addq $4, KK
  1023. #endif
  1024. movq BO, B # the next panel starts in B where BO stopped
  1025. decq J # j --
  1026. jg .L01 # back to .L01 while j is still positive
  1027. ALIGN_4
  1028. .L40: # dispatch on the low two bits of N
  1029. testq $3, N
  1030. je .L999 # both bits clear: finished
  1031. testq $2, N
  1032. je .L80 # bit 1 clear: skip the 2-wide panel
  1033. ALIGN_4
  1034. .L41: # setup for the 2-wide panel
  1035. #if defined(TRMMKERNEL) && defined(LEFT)
  1036. movq OFFSET, %rax # restart KK from OFFSET for this panel
  1037. movq %rax, KK
  1038. #endif
  1039. movq C, CO1 # coffset1 = c
  1040. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  1041. movq A, AO # aoffset = a
  1042. movq K, %rax
  1043. salq $BASE_SHIFT + 1, %rax # rax = k << (BASE_SHIFT + 1)
  1044. leaq (B, %rax), BB # bb = b + rax; used below as a prefetch address
  1045. movq M, I
  1046. sarq $2, I # i = (m >> 2)
  1047. jle .L60 # no group of 4 along m: go to the m & 2 case
  1048. ALIGN_4
  1049. .L51: # 4 x 2 tile, repeated I times: two AO pairs per step against two broadcast BO values
  1050. #if !defined(TRMMKERNEL) || \
  1051. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1052. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1053. movq B, BO
  1054. #else
  1055. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  1056. leaq (, %rax, SIZE), %rax
  1057. leaq (AO, %rax, 4), AO
  1058. leaq (B, %rax, 2), BO
  1059. #endif
  1060. movddup -16 * SIZE(BO), %xmm1 # step 0, first BO value
  1061. movddup -15 * SIZE(BO), %xmm5 # step 0, second BO value
  1062. xorps %xmm8, %xmm8 # clear accumulators xmm8, xmm9, xmm12, xmm13
  1063. movddup -12 * SIZE(BO), %xmm3 # step 2, first BO value
  1064. xorps %xmm9, %xmm9
  1065. movapd -16 * SIZE(AO), %xmm0 # step 0, first AO pair
  1066. xorps %xmm12, %xmm12
  1067. movapd -8 * SIZE(AO), %xmm4 # step 2, first AO pair
  1068. xorps %xmm13, %xmm13
  1069. prefetchw 3 * SIZE(CO1) # prefetch both output locations for writing
  1070. movapd %xmm0, %xmm2 # second copy of the AO pair, consumed by the second BO value
  1071. prefetchw 3 * SIZE(CO2)
  1072. prefetch -16 * SIZE(BB)
  1073. subq $-8 * SIZE, BB # bb += 8 * SIZE
  1074. #ifndef TRMMKERNEL
  1075. movq K, %rax
  1076. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1077. movq K, %rax
  1078. subq KK, %rax # step count = K - KK
  1079. movq %rax, KKK
  1080. #else
  1081. movq KK, %rax
  1082. #ifdef LEFT
  1083. addq $4, %rax # step count = KK + 4 (tile extent along AO)
  1084. #else
  1085. addq $2, %rax # step count = KK + 2 (tile extent along BO)
  1086. #endif
  1087. movq %rax, KKK
  1088. #endif
  1089. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  1090. leaq (, %rax, SIZE), %rax
  1091. leaq (AO, %rax, 4), AO # move AO and BO past the data of the unrolled part ...
  1092. leaq (BO, %rax, 2), BO
  1093. negq %rax # ... and index back with a negative offset; also sets ZF when the count is zero
  1094. NOBRANCH
  1095. je .L56 # fewer than 4 steps: skip the unrolled loop
  1096. ALIGN_4
  1097. .L52: # 4x unrolled loop: xmm8 / xmm12 take the first BO value, xmm9 / xmm13 the second
  1098. mulpd %xmm1, %xmm0 # step 0
  1099. mulpd -14 * SIZE(AO, %rax, 4), %xmm1
  1100. addpd %xmm0, %xmm8
  1101. movapd -12 * SIZE(AO, %rax, 4), %xmm0 # step 1, first AO pair
  1102. addpd %xmm1, %xmm12
  1103. movddup -14 * SIZE(BO, %rax, 2), %xmm1
  1104. mulpd %xmm5, %xmm2
  1105. mulpd -14 * SIZE(AO, %rax, 4), %xmm5
  1106. addpd %xmm2, %xmm9
  1107. addpd %xmm5, %xmm13
  1108. movddup -13 * SIZE(BO, %rax, 2), %xmm5
  1109. movapd %xmm0, %xmm2
  1110. mulpd %xmm1, %xmm0 # step 1
  1111. mulpd -10 * SIZE(AO, %rax, 4), %xmm1
  1112. addpd %xmm0, %xmm8
  1113. movapd (AO, %rax, 4), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  1114. addpd %xmm1, %xmm12
  1115. movddup -8 * SIZE(BO, %rax, 2), %xmm1 # preload for the next pass: offset -16 * SIZE after rax advances
  1116. mulpd %xmm5, %xmm2
  1117. mulpd -10 * SIZE(AO, %rax, 4), %xmm5
  1118. addpd %xmm2, %xmm9
  1119. addpd %xmm5, %xmm13
  1120. movddup -11 * SIZE(BO, %rax, 2), %xmm5
  1121. movapd %xmm4, %xmm2
  1122. mulpd %xmm3, %xmm4 # step 2
  1123. mulpd -6 * SIZE(AO, %rax, 4), %xmm3
  1124. addpd %xmm4, %xmm8
  1125. movapd -4 * SIZE(AO, %rax, 4), %xmm4 # step 3, first AO pair
  1126. addpd %xmm3, %xmm12
  1127. movddup -10 * SIZE(BO, %rax, 2), %xmm3
  1128. mulpd %xmm5, %xmm2
  1129. mulpd -6 * SIZE(AO, %rax, 4), %xmm5
  1130. addpd %xmm2, %xmm9
  1131. addpd %xmm5, %xmm13
  1132. movddup -9 * SIZE(BO, %rax, 2), %xmm5
  1133. movapd %xmm4, %xmm2
  1134. mulpd %xmm3, %xmm4 # step 3
  1135. mulpd -2 * SIZE(AO, %rax, 4), %xmm3
  1136. addpd %xmm4, %xmm8
  1137. movapd 8 * SIZE(AO, %rax, 4), %xmm4 # preload for the next pass: offset -8 * SIZE after rax advances
  1138. addpd %xmm3, %xmm12
  1139. movddup -4 * SIZE(BO, %rax, 2), %xmm3 # preload for the next pass: offset -12 * SIZE after rax advances
  1140. mulpd %xmm5, %xmm2
  1141. mulpd -2 * SIZE(AO, %rax, 4), %xmm5
  1142. addpd %xmm2, %xmm9
  1143. addpd %xmm5, %xmm13
  1144. movddup -7 * SIZE(BO, %rax, 2), %xmm5 # preload for the next pass: offset -15 * SIZE after rax advances
  1145. movapd %xmm0, %xmm2
  1146. addq $4 * SIZE, %rax
  1147. BRANCH
  1148. jl .L52 # repeat while rax is still negative
  1149. ALIGN_4
  1150. .L56:
  1151. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  1152. #ifndef TRMMKERNEL
  1153. movq K, %rax
  1154. #else
  1155. movq KKK, %rax
  1156. #endif
  1157. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  1158. je .L59
  1159. leaq (, %rax, SIZE), %rax
  1160. leaq (AO, %rax, 4), AO
  1161. leaq (BO, %rax, 2), BO
  1162. negq %rax
  1163. ALIGN_4
  1164. .L57: # one step per pass
  1165. mulpd %xmm1, %xmm0
  1166. mulpd -14 * SIZE(AO, %rax, 4), %xmm1
  1167. addpd %xmm0, %xmm8
  1168. movapd -12 * SIZE(AO, %rax, 4), %xmm0 # first AO pair of the following step
  1169. addpd %xmm1, %xmm12
  1170. movddup -14 * SIZE(BO, %rax, 2), %xmm1 # first BO value of the following step
  1171. mulpd %xmm5, %xmm2
  1172. mulpd -14 * SIZE(AO, %rax, 4), %xmm5
  1173. addpd %xmm2, %xmm9
  1174. addpd %xmm5, %xmm13
  1175. movddup -13 * SIZE(BO, %rax, 2), %xmm5 # second BO value of the following step
  1176. movapd %xmm0, %xmm2
  1177. addq $SIZE, %rax
  1178. jl .L57
  1179. ALIGN_4
  1180. .L59: # write-back of the 4 x 2 tile
  1181. #ifndef TRMMKERNEL
  1182. movupd (CO1), %xmm0 # current C values: four elements at CO1 and four at CO2
  1183. movupd 2 * SIZE(CO1), %xmm1
  1184. movupd (CO2), %xmm2
  1185. movupd 2 * SIZE(CO2), %xmm3
  1186. #endif
  1187. mulpd %xmm7, %xmm8 # scale by alpha
  1188. mulpd %xmm7, %xmm9
  1189. mulpd %xmm7, %xmm12
  1190. mulpd %xmm7, %xmm13
  1191. #ifndef TRMMKERNEL
  1192. addpd %xmm0, %xmm8
  1193. addpd %xmm1, %xmm12
  1194. addpd %xmm2, %xmm9
  1195. addpd %xmm3, %xmm13
  1196. #endif
  1197. movsd %xmm8, (CO1) # each result pair is stored as two separate 64-bit halves
  1198. movhps %xmm8, 1 * SIZE(CO1)
  1199. movsd %xmm12, 2 * SIZE(CO1)
  1200. movhps %xmm12, 3 * SIZE(CO1)
  1201. movsd %xmm9, (CO2)
  1202. movhps %xmm9, 1 * SIZE(CO2)
  1203. movsd %xmm13, 2 * SIZE(CO2)
  1204. movhps %xmm13, 3 * SIZE(CO2)
  1205. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1206. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1207. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  1208. subq KKK, %rax
  1209. leaq (,%rax, SIZE), %rax
  1210. leaq (AO, %rax, 4), AO
  1211. leaq (BO, %rax, 2), BO
  1212. #endif
  1213. #if defined(TRMMKERNEL) && defined(LEFT)
  1214. addq $4, KK
  1215. #endif
  1216. addq $4 * SIZE, CO1 # coffset += 4
  1217. addq $4 * SIZE, CO2 # coffset += 4
  1218. decq I # i --
  1219. jg .L51
  1220. ALIGN_4
  1221. .L60: # 2 x 2 tile, run only when bit 1 of M is set
  1222. testq $2, M
  1223. je .L70
  1224. ALIGN_4
  1225. .L61:
  1226. #if !defined(TRMMKERNEL) || \
  1227. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1228. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1229. movq B, BO
  1230. #else
  1231. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  1232. leaq (, %rax, SIZE), %rax
  1233. leaq (AO, %rax, 2), AO
  1234. leaq (B, %rax, 2), BO
  1235. #endif
  1236. movapd -16 * SIZE(AO), %xmm0 # step 0 AO pair
  1237. xorps %xmm8, %xmm8 # clear the four accumulators xmm8..xmm11
  1238. movapd -12 * SIZE(AO), %xmm2 # step 2 AO pair
  1239. xorps %xmm9, %xmm9
  1240. movddup -16 * SIZE(BO), %xmm1 # step 0, first BO value
  1241. xorps %xmm10, %xmm10
  1242. movddup -15 * SIZE(BO), %xmm3 # step 0, second BO value
  1243. xorps %xmm11, %xmm11
  1244. #ifndef TRMMKERNEL
  1245. movq K, %rax
  1246. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1247. movq K, %rax
  1248. subq KK, %rax # step count = K - KK
  1249. movq %rax, KKK
  1250. #else
  1251. movq KK, %rax
  1252. #ifdef LEFT
  1253. addq $2, %rax # step count = KK + 2; both branches add 2 because the tile is 2 wide in each direction
  1254. #else
  1255. addq $2, %rax
  1256. #endif
  1257. movq %rax, KKK
  1258. #endif
  1259. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  1260. leaq (, %rax, SIZE), %rax
  1261. leaq (AO, %rax, 2), AO
  1262. leaq (BO, %rax, 2), BO
  1263. negq %rax # negative offset counting up toward zero; also sets ZF when the count is zero
  1264. NOBRANCH
  1265. je .L66 # fewer than 4 steps: skip the unrolled loop
  1266. ALIGN_4
  1267. .L62: # 4x unrolled loop: steps 0 and 2 feed xmm8 / xmm9, steps 1 and 3 feed xmm10 / xmm11
  1268. mulpd %xmm0, %xmm1 # step 0
  1269. addpd %xmm1, %xmm8
  1270. movddup -14 * SIZE(BO, %rax, 2), %xmm1
  1271. mulpd %xmm0, %xmm3
  1272. movapd -14 * SIZE(AO, %rax, 2), %xmm0 # step 1 AO pair
  1273. addpd %xmm3, %xmm9
  1274. movddup -13 * SIZE(BO, %rax, 2), %xmm3
  1275. mulpd %xmm0, %xmm1 # step 1
  1276. addpd %xmm1, %xmm10
  1277. movddup -12 * SIZE(BO, %rax, 2), %xmm1
  1278. mulpd %xmm0, %xmm3
  1279. movapd -8 * SIZE(AO, %rax, 2), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  1280. addpd %xmm3, %xmm11
  1281. movddup -11 * SIZE(BO, %rax, 2), %xmm3
  1282. mulpd %xmm2, %xmm1 # step 2
  1283. addpd %xmm1, %xmm8
  1284. movddup -10 * SIZE(BO, %rax, 2), %xmm1
  1285. mulpd %xmm2, %xmm3
  1286. movapd -10 * SIZE(AO, %rax, 2), %xmm2 # step 3 AO pair
  1287. addpd %xmm3, %xmm9
  1288. movddup -9 * SIZE(BO, %rax, 2), %xmm3
  1289. mulpd %xmm2, %xmm1 # step 3
  1290. addpd %xmm1, %xmm10
  1291. movddup -8 * SIZE(BO, %rax, 2), %xmm1 # preload for the next pass: offset -16 * SIZE after rax advances
  1292. mulpd %xmm2, %xmm3
  1293. movapd -4 * SIZE(AO, %rax, 2), %xmm2 # preload for the next pass: offset -12 * SIZE after rax advances
  1294. addpd %xmm3, %xmm11
  1295. movddup -7 * SIZE(BO, %rax, 2), %xmm3 # preload for the next pass: offset -15 * SIZE after rax advances
  1296. addq $4 * SIZE, %rax
  1297. BRANCH
  1298. jl .L62 # repeat while rax is still negative
  1299. ALIGN_4
  1300. .L66:
  1301. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  1302. #ifndef TRMMKERNEL
  1303. movq K, %rax
  1304. #else
  1305. movq KKK, %rax
  1306. #endif
  1307. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  1308. je .L69
  1309. leaq (, %rax, SIZE), %rax
  1310. leaq (AO, %rax, 2), AO
  1311. leaq (BO, %rax, 2), BO
  1312. negq %rax
  1313. ALIGN_4
  1314. .L67: # one step per pass, accumulating into xmm8 / xmm9 only
  1315. mulpd %xmm0, %xmm1
  1316. addpd %xmm1, %xmm8
  1317. movddup -14 * SIZE(BO, %rax, 2), %xmm1 # first BO value of the following step
  1318. mulpd %xmm0, %xmm3
  1319. movapd -14 * SIZE(AO, %rax, 2), %xmm0 # AO pair of the following step
  1320. addpd %xmm3, %xmm9
  1321. movddup -13 * SIZE(BO, %rax, 2), %xmm3 # second BO value of the following step
  1322. addq $SIZE, %rax
  1323. jl .L67
  1324. ALIGN_4
  1325. .L69: # write-back of the 2 x 2 tile
  1326. addpd %xmm10, %xmm8 # fold the second accumulator pair into the first
  1327. addpd %xmm11, %xmm9
  1328. #ifndef TRMMKERNEL
  1329. movupd (CO1), %xmm0 # current C pairs at both destinations
  1330. movupd (CO2), %xmm2
  1331. #endif
  1332. mulpd %xmm7, %xmm8 # scale by alpha
  1333. mulpd %xmm7, %xmm9
  1334. #ifndef TRMMKERNEL
  1335. addpd %xmm0, %xmm8
  1336. addpd %xmm2, %xmm9
  1337. #endif
  1338. movsd %xmm8, (CO1)
  1339. movhps %xmm8, 1 * SIZE(CO1)
  1340. movsd %xmm9, (CO2)
  1341. movhps %xmm9, 1 * SIZE(CO2)
  1342. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1343. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1344. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  1345. subq KKK, %rax
  1346. leaq (,%rax, SIZE), %rax
  1347. leaq (AO, %rax, 2), AO
  1348. leaq (BO, %rax, 2), BO
  1349. #endif
  1350. #if defined(TRMMKERNEL) && defined(LEFT)
  1351. addq $2, KK
  1352. #endif
  1353. addq $2 * SIZE, CO1 # coffset += 2
  1354. addq $2 * SIZE, CO2 # coffset += 2
  1355. ALIGN_4
  1356. .L70: # 1 x 2 tile, run only when bit 0 of M is set
  1357. testq $1, M
  1358. je .L79
  1359. ALIGN_4
  1360. .L71:
  1361. #if !defined(TRMMKERNEL) || \
  1362. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1363. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1364. movq B, BO
  1365. #else
  1366. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  1367. leaq (, %rax, SIZE), %rax
  1368. leaq (AO, %rax, 1), AO
  1369. leaq (B, %rax, 2), BO
  1370. #endif
  1371. movddup -16 * SIZE(AO), %xmm0 # broadcast AO values of steps 0..3 into xmm0..xmm3
  1372. xorps %xmm8, %xmm8 # clear the four accumulators xmm8..xmm11, one per step of the unrolled loop
  1373. movddup -15 * SIZE(AO), %xmm1
  1374. xorps %xmm9, %xmm9
  1375. movddup -14 * SIZE(AO), %xmm2
  1376. xorps %xmm10, %xmm10
  1377. movddup -13 * SIZE(AO), %xmm3
  1378. xorps %xmm11, %xmm11
  1379. #ifndef TRMMKERNEL
  1380. movq K, %rax
  1381. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1382. movq K, %rax
  1383. subq KK, %rax # step count = K - KK
  1384. movq %rax, KKK
  1385. #else
  1386. movq KK, %rax
  1387. #ifdef LEFT
  1388. addq $1, %rax # step count = KK + 1 (tile extent along AO)
  1389. #else
  1390. addq $2, %rax # step count = KK + 2 (tile extent along BO)
  1391. #endif
  1392. movq %rax, KKK
  1393. #endif
  1394. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  1395. leaq (, %rax, SIZE), %rax
  1396. leaq (AO, %rax, 1), AO
  1397. leaq (BO, %rax, 2), BO
  1398. negq %rax # negative offset counting up toward zero; also sets ZF when the count is zero
  1399. NOBRANCH
  1400. je .L76 # fewer than 4 steps: skip the unrolled loop
  1401. ALIGN_4
  1402. .L72: # 4x unrolled loop: step n multiplies the BO pair in memory by the broadcast AO value and adds into xmm(8 + n)
  1403. mulpd -16 * SIZE(BO, %rax, 2), %xmm0
  1404. addpd %xmm0, %xmm8
  1405. movddup -12 * SIZE(AO, %rax, 1), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  1406. mulpd -14 * SIZE(BO, %rax, 2), %xmm1
  1407. addpd %xmm1, %xmm9
  1408. movddup -11 * SIZE(AO, %rax, 1), %xmm1
  1409. mulpd -12 * SIZE(BO, %rax, 2), %xmm2
  1410. addpd %xmm2, %xmm10
  1411. movddup -10 * SIZE(AO, %rax, 1), %xmm2
  1412. mulpd -10 * SIZE(BO, %rax, 2), %xmm3
  1413. addpd %xmm3, %xmm11
  1414. movddup -9 * SIZE(AO, %rax, 1), %xmm3
  1415. addq $4 * SIZE, %rax
  1416. BRANCH
  1417. jl .L72 # repeat while rax is still negative
  1418. ALIGN_4
  1419. .L76:
  1420. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  1421. #ifndef TRMMKERNEL
  1422. movq K, %rax
  1423. #else
  1424. movq KKK, %rax
  1425. #endif
  1426. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  1427. je .L78
  1428. leaq (, %rax, SIZE), %rax
  1429. leaq (AO, %rax, 1), AO
  1430. leaq (BO, %rax, 2), BO
  1431. negq %rax
  1432. ALIGN_4
  1433. .L77: # one step per pass, accumulating into xmm8 only
  1434. mulpd -16 * SIZE(BO, %rax, 2), %xmm0
  1435. addpd %xmm0, %xmm8
  1436. movddup -15 * SIZE(AO, %rax, 1), %xmm0 # AO value of the following step
  1437. addq $SIZE, %rax
  1438. jl .L77
  1439. ALIGN_4
  1440. .L78: # write-back of the 1 x 2 tile
  1441. addpd %xmm9, %xmm8 # reduce the four per-step accumulators into xmm8
  1442. addpd %xmm11, %xmm10
  1443. addpd %xmm10, %xmm8
  1444. #ifndef TRMMKERNEL
  1445. movsd (CO1), %xmm0 # gather the current C element from each destination
  1446. movhps (CO2), %xmm0
  1447. #endif
  1448. mulpd %xmm7, %xmm8 # scale by alpha
  1449. #ifndef TRMMKERNEL
  1450. addpd %xmm0, %xmm8
  1451. #endif
  1452. movsd %xmm8, (CO1) # low lane goes to CO1, high lane to CO2
  1453. movhps %xmm8, (CO2)
  1454. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1455. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1456. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  1457. subq KKK, %rax
  1458. leaq (,%rax, SIZE), %rax
  1459. leaq (AO, %rax, 1), AO
  1460. leaq (BO, %rax, 2), BO
  1461. #endif
  1462. #if defined(TRMMKERNEL) && defined(LEFT)
  1463. addq $1, KK
  1464. #endif
  1465. ALIGN_4
  1466. .L79: # end of the 2-wide panel: update KK for the non-LEFT TRMM case, carry BO over to B, advance C
  1467. #if defined(TRMMKERNEL) && !defined(LEFT)
  1468. addq $2, KK
  1469. #endif
  1470. movq BO, B # the next panel starts in B where BO stopped
  1471. leaq (C, LDC, 2), C # c += 2 * ldc
  1472. ALIGN_4
  1473. .L80: # 1-wide panel, run only when bit 0 of N is set
  1474. testq $1, N
  1475. je .L999
  1476. ALIGN_4
  1477. .L81:
  1478. #if defined(TRMMKERNEL) && defined(LEFT)
  1479. movq OFFSET, %rax # restart KK from OFFSET for this panel
  1480. movq %rax, KK
  1481. #endif
  1482. movq C, CO1 # coffset1 = c
  1483. movq A, AO # aoffset = a
  1484. movq M, I
  1485. sarq $2, I # i = (m >> 2)
  1486. jle .L100 # no group of 4 along m: go to the m & 2 case
  1487. ALIGN_4
  1488. .L91: # 4 x 1 tile, repeated I times: two AO pairs per step against one broadcast BO value
  1489. #if !defined(TRMMKERNEL) || \
  1490. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1491. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1492. movq B, BO
  1493. #else
  1494. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  1495. leaq (, %rax, SIZE), %rax
  1496. leaq (AO, %rax, 4), AO
  1497. leaq (B, %rax, 1), BO
  1498. #endif
  1499. movapd -8 * SIZE(AO), %xmm2 # step 2, first AO pair
  1500. xorps %xmm8, %xmm8 # clear accumulators xmm8, xmm9, xmm12, xmm13
  1501. movapd -16 * SIZE(AO), %xmm0 # step 0, first AO pair
  1502. xorps %xmm9, %xmm9
  1503. movddup -16 * SIZE(BO), %xmm1 # BO value of step 0
  1504. xorps %xmm12, %xmm12
  1505. movddup -14 * SIZE(BO), %xmm3 # BO value of step 2
  1506. xorps %xmm13, %xmm13
  1507. movddup -15 * SIZE(BO), %xmm5 # BO value of step 1
  1508. prefetchw 3 * SIZE(CO1) # prefetch the output location for writing
  1509. #ifndef TRMMKERNEL
  1510. movq K, %rax
  1511. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1512. movq K, %rax
  1513. subq KK, %rax # step count = K - KK
  1514. movq %rax, KKK
  1515. #else
  1516. movq KK, %rax
  1517. #ifdef LEFT
  1518. addq $4, %rax # step count = KK + 4 (tile extent along AO)
  1519. #else
  1520. addq $1, %rax # step count = KK + 1 (tile extent along BO)
  1521. #endif
  1522. movq %rax, KKK
  1523. #endif
  1524. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  1525. leaq (, %rax, SIZE), %rax
  1526. leaq (AO, %rax, 4), AO
  1527. leaq (BO, %rax, 1), BO
  1528. negq %rax # negative offset counting up toward zero; also sets ZF when the count is zero
  1529. NOBRANCH
  1530. je .L96 # fewer than 4 steps: skip the unrolled loop
  1531. ALIGN_4
  1532. .L92: # 4x unrolled loop: steps 0 and 2 feed xmm8 / xmm12, steps 1 and 3 feed xmm9 / xmm13
  1533. mulpd %xmm1, %xmm0 # step 0
  1534. mulpd -14 * SIZE(AO, %rax, 4), %xmm1
  1535. addpd %xmm0, %xmm8
  1536. movapd -12 * SIZE(AO, %rax, 4), %xmm0 # step 1, first AO pair
  1537. addpd %xmm1, %xmm12
  1538. movddup -12 * SIZE(BO, %rax, 1), %xmm1 # preload for the next pass: offset -16 * SIZE after rax advances
  1539. mulpd %xmm5, %xmm0 # step 1
  1540. mulpd -10 * SIZE(AO, %rax, 4), %xmm5
  1541. addpd %xmm0, %xmm9
  1542. movapd (AO, %rax, 4), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  1543. addpd %xmm5, %xmm13
  1544. movddup -13 * SIZE(BO, %rax, 1), %xmm5 # BO value of step 3
  1545. mulpd %xmm3, %xmm2 # step 2
  1546. mulpd -6 * SIZE(AO, %rax, 4), %xmm3
  1547. addpd %xmm2, %xmm8
  1548. movapd -4 * SIZE(AO, %rax, 4), %xmm2 # step 3, first AO pair
  1549. addpd %xmm3, %xmm12
  1550. movddup -10 * SIZE(BO, %rax, 1), %xmm3 # preload for the next pass: offset -14 * SIZE after rax advances
  1551. mulpd %xmm5, %xmm2 # step 3
  1552. mulpd -2 * SIZE(AO, %rax, 4), %xmm5
  1553. addpd %xmm2, %xmm9
  1554. movapd 8 * SIZE(AO, %rax, 4), %xmm2 # preload for the next pass: offset -8 * SIZE after rax advances
  1555. addpd %xmm5, %xmm13
  1556. movddup -11 * SIZE(BO, %rax, 1), %xmm5 # preload for the next pass: offset -15 * SIZE after rax advances
  1557. addq $4 * SIZE, %rax
  1558. BRANCH
  1559. jl .L92 # repeat while rax is still negative
  1560. ALIGN_4
  1561. .L96:
  1562. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  1563. #ifndef TRMMKERNEL
  1564. movq K, %rax
  1565. #else
  1566. movq KKK, %rax
  1567. #endif
  1568. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  1569. je .L99
  1570. leaq (, %rax, SIZE), %rax
  1571. leaq (AO, %rax, 4), AO
  1572. leaq (BO, %rax, 1), BO
  1573. negq %rax
  1574. ALIGN_4
  1575. .L97: # one step per pass, accumulating into xmm8 / xmm12 only
  1576. mulpd %xmm1, %xmm0
  1577. mulpd -14 * SIZE(AO, %rax, 4), %xmm1
  1578. addpd %xmm0, %xmm8
  1579. movapd -12 * SIZE(AO, %rax, 4), %xmm0 # first AO pair of the following step
  1580. addpd %xmm1, %xmm12
  1581. movddup -15 * SIZE(BO, %rax, 1), %xmm1 # BO value of the following step
  1582. addq $SIZE, %rax
  1583. jl .L97
  1584. ALIGN_4
  1585. .L99: # write-back of the 4 x 1 tile
  1586. addpd %xmm9, %xmm8 # fold the odd-step accumulators into the even-step ones
  1587. addpd %xmm13, %xmm12
  1588. #ifndef TRMMKERNEL
  1589. movupd (CO1), %xmm0 # current four C elements at CO1
  1590. movupd 2 * SIZE(CO1), %xmm1
  1591. #endif
  1592. mulpd %xmm7, %xmm8 # scale by alpha
  1593. mulpd %xmm7, %xmm12
  1594. #ifndef TRMMKERNEL
  1595. addpd %xmm0, %xmm8
  1596. addpd %xmm1, %xmm12
  1597. #endif
  1598. movsd %xmm8, (CO1)
  1599. movhps %xmm8, 1 * SIZE(CO1)
  1600. movsd %xmm12, 2 * SIZE(CO1)
  1601. movhps %xmm12, 3 * SIZE(CO1)
  1602. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1603. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1604. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  1605. subq KKK, %rax
  1606. leaq (,%rax, SIZE), %rax
  1607. leaq (AO, %rax, 4), AO
  1608. leaq (BO, %rax, 1), BO
  1609. #endif
  1610. #if defined(TRMMKERNEL) && defined(LEFT)
  1611. addq $4, KK
  1612. #endif
  1613. addq $4 * SIZE, CO1 # coffset += 4
  1614. decq I # i --
  1615. jg .L91
  1616. ALIGN_4
  1617. .L100: # 2 x 1 tile, run only when bit 1 of M is set
  1618. testq $2, M
  1619. je .L110
  1620. ALIGN_4
  1621. .L101:
  1622. #if !defined(TRMMKERNEL) || \
  1623. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1624. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1625. movq B, BO
  1626. #else
  1627. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  1628. leaq (, %rax, SIZE), %rax
  1629. leaq (AO, %rax, 2), AO
  1630. leaq (B, %rax, 1), BO
  1631. #endif
  1632. movddup -16 * SIZE(BO), %xmm0 # broadcast BO values of steps 0..3 into xmm0..xmm3
  1633. xorps %xmm8, %xmm8 # clear the four accumulators xmm8..xmm11, one per step of the unrolled loop
  1634. movddup -15 * SIZE(BO), %xmm1
  1635. xorps %xmm9, %xmm9
  1636. movddup -14 * SIZE(BO), %xmm2
  1637. xorps %xmm10, %xmm10
  1638. movddup -13 * SIZE(BO), %xmm3
  1639. xorps %xmm11, %xmm11
  1640. #ifndef TRMMKERNEL
  1641. movq K, %rax
  1642. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1643. movq K, %rax
  1644. subq KK, %rax # step count = K - KK
  1645. movq %rax, KKK
  1646. #else
  1647. movq KK, %rax
  1648. #ifdef LEFT
  1649. addq $2, %rax # step count = KK + 2 (tile extent along AO)
  1650. #else
  1651. addq $1, %rax # step count = KK + 1 (tile extent along BO)
  1652. #endif
  1653. movq %rax, KKK
  1654. #endif
  1655. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  1656. leaq (, %rax, SIZE), %rax
  1657. leaq (AO, %rax, 2), AO
  1658. leaq (BO, %rax, 1), BO
  1659. negq %rax # negative offset counting up toward zero; also sets ZF when the count is zero
  1660. NOBRANCH
  1661. je .L106 # fewer than 4 steps: skip the unrolled loop
  1662. ALIGN_4
  1663. .L102: # 4x unrolled loop: step n multiplies the AO pair in memory by the broadcast BO value and adds into xmm(8 + n)
  1664. mulpd -16 * SIZE(AO, %rax, 2), %xmm0
  1665. addpd %xmm0, %xmm8
  1666. movddup -12 * SIZE(BO, %rax, 1), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  1667. mulpd -14 * SIZE(AO, %rax, 2), %xmm1
  1668. addpd %xmm1, %xmm9
  1669. movddup -11 * SIZE(BO, %rax, 1), %xmm1
  1670. mulpd -12 * SIZE(AO, %rax, 2), %xmm2
  1671. addpd %xmm2, %xmm10
  1672. movddup -10 * SIZE(BO, %rax, 1), %xmm2
  1673. mulpd -10 * SIZE(AO, %rax, 2), %xmm3
  1674. addpd %xmm3, %xmm11
  1675. movddup -9 * SIZE(BO, %rax, 1), %xmm3
  1676. addq $4 * SIZE, %rax
  1677. BRANCH
  1678. jl .L102 # repeat while rax is still negative
  1679. ALIGN_4
  1680. .L106:
  1681. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7
  1682. #ifndef TRMMKERNEL
  1683. movq K, %rax
  1684. #else
  1685. movq KKK, %rax
  1686. #endif
  1687. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  1688. je .L109
  1689. leaq (, %rax, SIZE), %rax
  1690. leaq (AO, %rax, 2), AO
  1691. leaq (BO, %rax, 1), BO
  1692. negq %rax
  1693. ALIGN_4
  1694. .L107: # one step per pass; unlike the other tail loops, the BO value is reloaded at the top of each pass
  1695. movddup -16 * SIZE(BO, %rax, 1), %xmm0
  1696. mulpd -16 * SIZE(AO, %rax, 2), %xmm0
  1697. addpd %xmm0, %xmm8
  1698. addq $SIZE, %rax
  1699. jl .L107
  1700. ALIGN_4
  1701. .L109: # write-back of the 2 x 1 tile
  1702. addpd %xmm9, %xmm8 # reduce the four per-step accumulators into xmm8
  1703. addpd %xmm11, %xmm10
  1704. addpd %xmm10, %xmm8
  1705. #ifndef TRMMKERNEL
  1706. movupd (CO1), %xmm0 # current C pair at CO1
  1707. #endif
  1708. mulpd %xmm7, %xmm8 # scale by alpha
  1709. #ifndef TRMMKERNEL
  1710. addpd %xmm0, %xmm8
  1711. #endif
  1712. movsd %xmm8, (CO1)
  1713. movhps %xmm8, 1 * SIZE(CO1)
  1714. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1715. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1716. movq K, %rax # these TRMM variants: skip the K - KKK steps this tile did not consume
  1717. subq KKK, %rax
  1718. leaq (,%rax, SIZE), %rax
  1719. leaq (AO, %rax, 2), AO
  1720. leaq (BO, %rax, 1), BO
  1721. #endif
  1722. #if defined(TRMMKERNEL) && defined(LEFT)
  1723. addq $2, KK
  1724. #endif
  1725. addq $2 * SIZE, CO1 # coffset += 2
  1726. ALIGN_4
  1727. .L110: # 1 x 1 tile, run only when bit 0 of M is set: a plain dot product of AO and BO
  1728. testq $1, M
  1729. je .L999
  1730. ALIGN_4
  1731. .L111:
  1732. #if !defined(TRMMKERNEL) || \
  1733. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1734. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1735. movq B, BO
  1736. #else
  1737. movq KK, %rax # remaining TRMM variants: start KK steps into both operands
  1738. leaq (, %rax, SIZE), %rax
  1739. leaq (AO, %rax, 1), AO
  1740. leaq (B, %rax, 1), BO
  1741. #endif
  1742. movapd -16 * SIZE(AO), %xmm0 # AO values of steps 0 and 1 as one packed pair
  1743. xorps %xmm8, %xmm8 # clear accumulators xmm8 and xmm9
  1744. movapd -14 * SIZE(AO), %xmm1 # AO values of steps 2 and 3
  1745. xorps %xmm9, %xmm9
  1746. #ifndef TRMMKERNEL
  1747. movq K, %rax
  1748. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1749. movq K, %rax
  1750. subq KK, %rax # step count = K - KK
  1751. movq %rax, KKK
  1752. #else
  1753. movq KK, %rax
  1754. #ifdef LEFT
  1755. addq $1, %rax # step count = KK + 1; both branches add 1 because the tile is 1 wide in each direction
  1756. #else
  1757. addq $1, %rax
  1758. #endif
  1759. movq %rax, KKK
  1760. #endif
  1761. andq $-4, %rax # round the step count down to a multiple of 4 for the unrolled loop
  1762. leaq (, %rax, SIZE), %rax
  1763. leaq (AO, %rax, 1), AO
  1764. leaq (BO, %rax, 1), BO
  1765. negq %rax # negative offset counting up toward zero; also sets ZF when the count is zero
  1766. NOBRANCH
  1767. je .L116 # fewer than 4 steps: skip the unrolled loop
  1768. ALIGN_4
  1769. .L112: # four steps per pass: each packed multiply covers two consecutive steps, one per lane
  1770. mulpd -16 * SIZE(BO, %rax, 1), %xmm0
  1771. addpd %xmm0, %xmm8
  1772. movapd -12 * SIZE(AO, %rax, 1), %xmm0 # preload for the next pass: offset -16 * SIZE after rax advances
  1773. mulpd -14 * SIZE(BO, %rax, 1), %xmm1
  1774. addpd %xmm1, %xmm9
  1775. movapd -10 * SIZE(AO, %rax, 1), %xmm1 # preload for the next pass: offset -14 * SIZE after rax advances
  1776. addq $4 * SIZE, %rax
  1777. BRANCH
  1778. jl .L112 # repeat while rax is still negative
  1779. ALIGN_4
  1780. .L116:
  1781. movddup ALPHA, %xmm7 # broadcast ALPHA into both lanes of xmm7; only the low lane is used below
  1782. #ifndef TRMMKERNEL
  1783. movq K, %rax
  1784. #else
  1785. movq KKK, %rax
  1786. #endif
  1787. andq $3, %rax # rax = count & 3: steps left over after the 4x unrolled loop
  1788. je .L118
  1789. leaq (, %rax, SIZE), %rax
  1790. leaq (AO, %rax, 1), AO
  1791. leaq (BO, %rax, 1), BO
  1792. negq %rax
  1793. ALIGN_4
  1794. .L117: # one step per pass using scalar ops: only the low lanes of xmm0 and xmm8 take part
  1795. mulsd -16 * SIZE(BO, %rax, 1), %xmm0
  1796. addsd %xmm0, %xmm8
  1797. movsd -15 * SIZE(AO, %rax, 1), %xmm0 # AO value of the following step
  1798. addq $SIZE, %rax
  1799. jl .L117
  1800. ALIGN_4
  1801. .L118: # write-back of the single element
  1802. addpd %xmm9, %xmm8 # merge the two packed accumulators
  1803. haddpd %xmm8, %xmm8 # sum the two lanes so the low lane holds the full dot product
  1804. #ifndef TRMMKERNEL
  1805. movsd (CO1), %xmm0 # current C element
  1806. #endif
  1807. mulsd %xmm7, %xmm8 # scale by alpha
  1808. #ifndef TRMMKERNEL
  1809. addsd %xmm0, %xmm8
  1810. #endif
  1811. movsd %xmm8, (CO1)
  1812. ALIGN_4
  1813. .L999: # common exit: reload registers from the stack frame, release the frame, return
  1814. movq (%rsp), %rbx # general-purpose registers from the first six 8-byte slots (presumably stored there by the prologue)
  1815. movq 8(%rsp), %rbp
  1816. movq 16(%rsp), %r12
  1817. movq 24(%rsp), %r13
  1818. movq 32(%rsp), %r14
  1819. movq 40(%rsp), %r15
  1820. #ifdef WINDOWS_ABI
  1821. movq 48(%rsp), %rdi # under WINDOWS_ABI, rdi, rsi and xmm6..xmm15 are reloaded as well
  1822. movq 56(%rsp), %rsi
  1823. movups 64(%rsp), %xmm6 # unaligned loads, 16 bytes apart
  1824. movups 80(%rsp), %xmm7
  1825. movups 96(%rsp), %xmm8
  1826. movups 112(%rsp), %xmm9
  1827. movups 128(%rsp), %xmm10
  1828. movups 144(%rsp), %xmm11
  1829. movups 160(%rsp), %xmm12
  1830. movups 176(%rsp), %xmm13
  1831. movups 192(%rsp), %xmm14
  1832. movups 208(%rsp), %xmm15
  1833. #endif
  1834. addq $STACKSIZE, %rsp # release the STACKSIZE-byte frame
  1835. ret
  1836. EPILOGUE