
zgemm_kernel_2x2_sse2.S

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
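/* ZGEMM 2x2 micro-kernel for x86-64 using SSE2 packed doubles.
   Each iteration of the main loops updates a 2x2 block of complex
   elements of C; the TRMMKERNEL conditionals reuse the same code
   for the triangular-matrix (TRMM) case, with KK/OFFSET tracking
   how much of K participates in each block. */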
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define N %r14
  44. #define K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define AO %rdi
  51. #define BO %rsi
  52. #define CO1 %r15
  53. #define CO2 %rbp
  54. #define BB %r12
  55. #ifndef WINDOWS_ABI
  56. #define STACKSIZE 64
  57. #else
  58. #define STACKSIZE 256
  59. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  60. #define OLD_A 48 + STACKSIZE(%rsp)
  61. #define OLD_B 56 + STACKSIZE(%rsp)
  62. #define OLD_C 64 + STACKSIZE(%rsp)
  63. #define OLD_LDC 72 + STACKSIZE(%rsp)
  64. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  65. #endif
  66. #define POSINV 0(%rsp)
  67. #define ALPHA_R 16(%rsp)
  68. #define ALPHA_I 32(%rsp)
  69. #define J 48(%rsp)
  70. #define OFFSET 56(%rsp)
  71. #define KK 64(%rsp)
  72. #define KKK 72(%rsp)
  73. #define BUFFER 256(%rsp)
  74. #ifdef OPTERON
  75. #define PREFETCH prefetch
  76. #define PREFETCHW prefetchw
  77. #define PREFETCHSIZE (8 * 9 + 4)
  78. #define RPREFETCHSIZE (8 * 7 + 4)
  79. #define WPREFETCHSIZE (8 * 8 + 4)
  80. #endif
  81. #ifdef GENERIC
  82. #define PREFETCH prefetcht0
  83. #define PREFETCHW prefetcht0
  84. #define PREFETCHSIZE (8 * 5 + 4)
  85. #define RPREFETCHSIZE (8 * 7 + 4)
  86. #define WPREFETCHSIZE (8 * 8 + 4)
  87. #endif
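/* KERNEL1..KERNEL8 below are the eight unrolled steps of the inner
   product loop.  A values live in %xmm0/%xmm2/%xmm4/%xmm6, duplicated
   B values in %xmm1/%xmm3/%xmm5/%xmm7 (plus memory operands), and the
   partial sums accumulate in %xmm8-%xmm15.  The first set of macros
   indexes AO/BO through %rax; the GENERIC fallback uses plain
   addressing and relies on explicit pointer updates in the loop. */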
  88. #ifndef GENERIC
  89. #define KERNEL1(xx) \
  90. mulpd %xmm0, %xmm1 ;\
  91. addpd %xmm1, %xmm8 ;\
  92. movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  93. mulpd %xmm0, %xmm3 ;\
  94. addpd %xmm3, %xmm9 ;\
  95. movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  96. mulpd %xmm0, %xmm5 ;\
  97. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
  98. mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
  99. addpd %xmm5, %xmm10 ;\
  100. movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  101. addpd %xmm0, %xmm11 ;\
  102. movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
  103. #define KERNEL2(xx) \
  104. mulpd %xmm2, %xmm1 ;\
  105. addpd %xmm1, %xmm12 ;\
  106. movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  107. mulpd %xmm2, %xmm3 ;\
  108. addpd %xmm3, %xmm13 ;\
  109. movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  110. mulpd %xmm2, %xmm5 ;\
  111. mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
  112. addpd %xmm5, %xmm14 ;\
  113. movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  114. addpd %xmm2, %xmm15 ;\
  115. movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
  116. #define KERNEL3(xx) \
  117. mulpd %xmm4, %xmm7 ;\
  118. addpd %xmm7, %xmm8 ;\
  119. movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  120. mulpd %xmm4, %xmm3 ;\
  121. addpd %xmm3, %xmm9 ;\
  122. movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  123. mulpd %xmm4, %xmm5 ;\
  124. mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
  125. addpd %xmm5, %xmm10 ;\
  126. movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  127. addpd %xmm4, %xmm11 ;\
  128. movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
  129. #define KERNEL4(xx) \
  130. mulpd %xmm6, %xmm7 ;\
  131. addpd %xmm7, %xmm12 ;\
  132. movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  133. mulpd %xmm6, %xmm3 ;\
  134. addpd %xmm3, %xmm13 ;\
  135. movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  136. mulpd %xmm6, %xmm5 ;\
  137. mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
  138. addpd %xmm5, %xmm14 ;\
  139. movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  140. PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
  141. addpd %xmm6, %xmm15 ;\
  142. movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
  143. #define KERNEL5(xx) \
  144. mulpd %xmm0, %xmm1 ;\
  145. addpd %xmm1, %xmm8 ;\
  146. movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  147. mulpd %xmm0, %xmm3 ;\
  148. addpd %xmm3, %xmm9 ;\
  149. movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  150. mulpd %xmm0, %xmm5 ;\
  151. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
  152. addpd %xmm5, %xmm10 ;\
  153. movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  154. addpd %xmm0, %xmm11 ;\
  155. movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
  156. #define KERNEL6(xx) \
  157. mulpd %xmm2, %xmm1 ;\
  158. addpd %xmm1, %xmm12 ;\
  159. movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  160. mulpd %xmm2, %xmm3 ;\
  161. addpd %xmm3, %xmm13 ;\
  162. movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  163. mulpd %xmm2, %xmm5 ;\
  164. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
  165. addpd %xmm5, %xmm14 ;\
  166. movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  167. addpd %xmm2, %xmm15 ;\
  168. movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
  169. #define KERNEL7(xx) \
  170. mulpd %xmm4, %xmm7 ;\
  171. addpd %xmm7, %xmm8 ;\
  172. movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  173. mulpd %xmm4, %xmm3 ;\
  174. addpd %xmm3, %xmm9 ;\
  175. movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  176. mulpd %xmm4, %xmm5 ;\
  177. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
  178. addpd %xmm5, %xmm10 ;\
  179. movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  180. addpd %xmm4, %xmm11 ;\
  181. movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
  182. #define KERNEL8(xx) \
  183. mulpd %xmm6, %xmm7 ;\
  184. addpd %xmm7, %xmm12 ;\
  185. movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  186. mulpd %xmm6, %xmm3 ;\
  187. addpd %xmm3, %xmm13 ;\
  188. movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  189. mulpd %xmm6, %xmm5 ;\
  190. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
  191. addpd %xmm5, %xmm14 ;\
  192. movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  193. addpd %xmm6, %xmm15 ;\
  194. movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
  195. #else
  196. #define KERNEL1(xx) \
  197. mulpd %xmm0, %xmm1 ;\
  198. addpd %xmm1, %xmm8 ;\
  199. movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  200. mulpd %xmm0, %xmm3 ;\
  201. addpd %xmm3, %xmm9 ;\
  202. movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  203. mulpd %xmm0, %xmm5 ;\
  204. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
  205. mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
  206. addpd %xmm5, %xmm10 ;\
  207. movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  208. addpd %xmm0, %xmm11 ;\
  209. movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
  210. #define KERNEL2(xx) \
  211. mulpd %xmm2, %xmm1 ;\
  212. addpd %xmm1, %xmm12 ;\
  213. movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  214. mulpd %xmm2, %xmm3 ;\
  215. addpd %xmm3, %xmm13 ;\
  216. movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  217. mulpd %xmm2, %xmm5 ;\
  218. mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
  219. addpd %xmm5, %xmm14 ;\
  220. movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  221. addpd %xmm2, %xmm15 ;\
  222. movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
  223. #define KERNEL3(xx) \
  224. mulpd %xmm4, %xmm7 ;\
  225. addpd %xmm7, %xmm8 ;\
  226. movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  227. mulpd %xmm4, %xmm3 ;\
  228. addpd %xmm3, %xmm9 ;\
  229. movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  230. mulpd %xmm4, %xmm5 ;\
  231. mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
  232. addpd %xmm5, %xmm10 ;\
  233. movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  234. addpd %xmm4, %xmm11 ;\
  235. movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
  236. #define KERNEL4(xx) \
  237. mulpd %xmm6, %xmm7 ;\
  238. addpd %xmm7, %xmm12 ;\
  239. movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  240. mulpd %xmm6, %xmm3 ;\
  241. addpd %xmm3, %xmm13 ;\
  242. movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  243. mulpd %xmm6, %xmm5 ;\
  244. mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
  245. addpd %xmm5, %xmm14 ;\
  246. movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  247. PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
  248. addpd %xmm6, %xmm15 ;\
  249. movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
  250. #define KERNEL5(xx) \
  251. mulpd %xmm0, %xmm1 ;\
  252. addpd %xmm1, %xmm8 ;\
  253. movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  254. mulpd %xmm0, %xmm3 ;\
  255. addpd %xmm3, %xmm9 ;\
  256. movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  257. mulpd %xmm0, %xmm5 ;\
  258. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
  259. addpd %xmm5, %xmm10 ;\
  260. movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  261. addpd %xmm0, %xmm11 ;\
  262. movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
  263. #define KERNEL6(xx) \
  264. mulpd %xmm2, %xmm1 ;\
  265. addpd %xmm1, %xmm12 ;\
  266. movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  267. mulpd %xmm2, %xmm3 ;\
  268. addpd %xmm3, %xmm13 ;\
  269. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  270. mulpd %xmm2, %xmm5 ;\
  271. mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
  272. addpd %xmm5, %xmm14 ;\
  273. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  274. addpd %xmm2, %xmm15 ;\
  275. movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
  276. #define KERNEL7(xx) \
  277. mulpd %xmm4, %xmm7 ;\
  278. addpd %xmm7, %xmm8 ;\
  279. movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  280. mulpd %xmm4, %xmm3 ;\
  281. addpd %xmm3, %xmm9 ;\
  282. movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  283. mulpd %xmm4, %xmm5 ;\
  284. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
  285. addpd %xmm5, %xmm10 ;\
  286. movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  287. addpd %xmm4, %xmm11 ;\
  288. movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
  289. #define KERNEL8(xx) \
  290. mulpd %xmm6, %xmm7 ;\
  291. addpd %xmm7, %xmm12 ;\
  292. movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  293. mulpd %xmm6, %xmm3 ;\
  294. addpd %xmm3, %xmm13 ;\
  295. movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  296. mulpd %xmm6, %xmm5 ;\
  297. mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
  298. addpd %xmm5, %xmm14 ;\
  299. movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  300. addpd %xmm6, %xmm15 ;\
  301. movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
  302. #endif
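/* Function entry: save the callee-saved GPRs (and, for WINDOWS_ABI,
   %rdi/%rsi and %xmm6-%xmm15, plus the stack-passed arguments), then
   carve out a 4096-byte-aligned local frame whose BUFFER area holds
   the packed copy of B. */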
  303. PROLOGUE
  304. PROFCODE
  305. subq $STACKSIZE, %rsp
  306. movq %rbx, 0(%rsp)
  307. movq %rbp, 8(%rsp)
  308. movq %r12, 16(%rsp)
  309. movq %r13, 24(%rsp)
  310. movq %r14, 32(%rsp)
  311. movq %r15, 40(%rsp)
  312. #ifdef WINDOWS_ABI
  313. movq %rdi, 48(%rsp)
  314. movq %rsi, 56(%rsp)
  315. movups %xmm6, 64(%rsp)
  316. movups %xmm7, 80(%rsp)
  317. movups %xmm8, 96(%rsp)
  318. movups %xmm9, 112(%rsp)
  319. movups %xmm10, 128(%rsp)
  320. movups %xmm11, 144(%rsp)
  321. movups %xmm12, 160(%rsp)
  322. movups %xmm13, 176(%rsp)
  323. movups %xmm14, 192(%rsp)
  324. movups %xmm15, 208(%rsp)
  325. movq ARG1, OLD_M
  326. movq ARG2, OLD_N
  327. movq ARG3, K
  328. movq OLD_A, A
  329. movq OLD_B, B
  330. movq OLD_C, C
  331. movq OLD_LDC, LDC
  332. #ifdef TRMMKERNEL
  333. movsd OLD_OFFSET, %xmm12
  334. #endif
  335. movaps %xmm3, %xmm0
  336. movsd OLD_ALPHA_I, %xmm1
  337. #else
  338. movq 72(%rsp), LDC
  339. #ifdef TRMMKERNEL
  340. movsd 80(%rsp), %xmm12
  341. #endif
  342. #endif
  343. EMMS
  344. movq %rsp, %rbx # save old stack
  345. subq $256 + LOCAL_BUFFER_SIZE, %rsp
  346. andq $-4096, %rsp # align stack
  347. STACK_TOUCHING
  348. movq OLD_M, M
  349. movq OLD_N, N
  350. pcmpeqb %xmm7, %xmm7
  351. psllq $63, %xmm7 # Generate mask
  352. pxor %xmm10, %xmm10
  353. movlpd %xmm0, 0 + ALPHA_R
  354. movlpd %xmm0, 8 + ALPHA_R
  355. movlpd %xmm1, 8 + ALPHA_I
  356. xorpd %xmm7, %xmm1
  357. movlpd %xmm1, 0 + ALPHA_I
  358. movlpd %xmm10, 0 + POSINV
  359. movlpd %xmm7, 8 + POSINV
  360. #ifdef TRMMKERNEL
  361. movlpd %xmm12, OFFSET
  362. movlpd %xmm12, KK
  363. #ifndef LEFT
  364. negq KK
  365. #endif
  366. #endif
  367. subq $-16 * SIZE, A
  368. salq $ZBASE_SHIFT, LDC
  369. movq N, J
  370. sarq $1, J # j = (n >> 1)
  371. jle .L100
  372. ALIGN_4
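/* .L01: outer loop over pairs of columns of C (J = N >> 1).  The
   .L02/.L04 loops copy K rows of B into BUFFER, duplicating every
   double into both halves of a 16-byte slot so the complex multiply
   can use aligned movapd loads. */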
  373. .L01:
  374. movq C, CO1 # coffset1 = c
  375. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  376. #if defined(TRMMKERNEL) && defined(LEFT)
  377. movq OFFSET, %rax
  378. movq %rax, KK
  379. #endif
  380. leaq 16 * SIZE + BUFFER, BO
  381. movq K, %rax
  382. sarq $2, %rax
  383. jle .L03
  384. ALIGN_4
  385. .L02:
  386. PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
  387. movq 0 * SIZE(B), %mm0
  388. movq %mm0, -16 * SIZE(BO)
  389. movq %mm0, -15 * SIZE(BO)
  390. movq 1 * SIZE(B), %mm1
  391. movq %mm1, -14 * SIZE(BO)
  392. movq %mm1, -13 * SIZE(BO)
  393. movq 2 * SIZE(B), %mm2
  394. movq %mm2, -12 * SIZE(BO)
  395. movq %mm2, -11 * SIZE(BO)
  396. movq 3 * SIZE(B), %mm3
  397. movq %mm3, -10 * SIZE(BO)
  398. movq %mm3, -9 * SIZE(BO)
  399. PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO)
  400. movq 4 * SIZE(B), %mm4
  401. movq %mm4, -8 * SIZE(BO)
  402. movq %mm4, -7 * SIZE(BO)
  403. movq 5 * SIZE(B), %mm5
  404. movq %mm5, -6 * SIZE(BO)
  405. movq %mm5, -5 * SIZE(BO)
  406. PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO)
  407. movq 6 * SIZE(B), %mm6
  408. movq %mm6, -4 * SIZE(BO)
  409. movq %mm6, -3 * SIZE(BO)
  410. movq 7 * SIZE(B), %mm7
  411. movq %mm7, -2 * SIZE(BO)
  412. movq %mm7, -1 * SIZE(BO)
  413. PREFETCH (RPREFETCHSIZE + 8) * SIZE(B)
  414. movq 8 * SIZE(B), %mm0
  415. movq %mm0, 0 * SIZE(BO)
  416. movq %mm0, 1 * SIZE(BO)
  417. movq 9 * SIZE(B), %mm1
  418. movq %mm1, 2 * SIZE(BO)
  419. movq %mm1, 3 * SIZE(BO)
  420. movq 10 * SIZE(B), %mm2
  421. movq %mm2, 4 * SIZE(BO)
  422. movq %mm2, 5 * SIZE(BO)
  423. movq 11 * SIZE(B), %mm3
  424. movq %mm3, 6 * SIZE(BO)
  425. movq %mm3, 7 * SIZE(BO)
  426. PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO)
  427. movq 12 * SIZE(B), %mm4
  428. movq %mm4, 8 * SIZE(BO)
  429. movq %mm4, 9 * SIZE(BO)
  430. movq 13 * SIZE(B), %mm5
  431. movq %mm5, 10 * SIZE(BO)
  432. movq %mm5, 11 * SIZE(BO)
  433. PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO)
  434. movq 14 * SIZE(B), %mm6
  435. movq %mm6, 12 * SIZE(BO)
  436. movq %mm6, 13 * SIZE(BO)
  437. movq 15 * SIZE(B), %mm7
  438. movq %mm7, 14 * SIZE(BO)
  439. movq %mm7, 15 * SIZE(BO)
  440. addq $ 32 * SIZE, BO
  441. subq $-16 * SIZE, B
  442. decq %rax
  443. jne .L02
  444. ALIGN_4
  445. .L03:
  446. movq K, %rax
  447. andq $3, %rax
  448. BRANCH
  449. jle .L05
  450. ALIGN_4
  451. .L04:
  452. movq 0 * SIZE(B), %mm0
  453. movq %mm0, -16 * SIZE(BO)
  454. movq %mm0, -15 * SIZE(BO)
  455. movq 1 * SIZE(B), %mm1
  456. movq %mm1, -14 * SIZE(BO)
  457. movq %mm1, -13 * SIZE(BO)
  458. movq 2 * SIZE(B), %mm2
  459. movq %mm2, -12 * SIZE(BO)
  460. movq %mm2, -11 * SIZE(BO)
  461. movq 3 * SIZE(B), %mm3
  462. movq %mm3, -10 * SIZE(BO)
  463. movq %mm3, -9 * SIZE(BO)
  464. addq $ 4 * SIZE, B
  465. addq $ 8 * SIZE, BO
  466. decq %rax
  467. jne .L04
  468. ALIGN_4
  469. .L05:
  470. movq A, AO # aoffset = a
  471. leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
  472. movq M, I
  473. sarq $1, I # i = (m >> 1)
  474. jle .L30
  475. ALIGN_4
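/* .L10: main 2x2 block.  Accumulators %xmm8-%xmm15 hold the partial
   products for the four complex results; at .L19 they are combined
   with the POSINV sign mask and ALPHA_R/ALPHA_I and, unless
   TRMMKERNEL is defined, added to the existing C values. */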
  476. .L10:
  477. #if !defined(TRMMKERNEL) || \
  478. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  479. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  480. leaq 16 * SIZE + BUFFER, BO
  481. #else
  482. leaq 16 * SIZE + BUFFER, BO
  483. movq KK, %rax
  484. leaq (, %rax, SIZE), %rax
  485. leaq (AO, %rax, 4), AO
  486. leaq (BO, %rax, 8), BO
  487. #endif
  488. movapd -16 * SIZE(AO), %xmm0
  489. movapd -16 * SIZE(BO), %xmm1
  490. pxor %xmm8, %xmm8
  491. PREFETCH 0 * SIZE(BB)
  492. movapd -14 * SIZE(AO), %xmm2
  493. movapd -14 * SIZE(BO), %xmm3
  494. pxor %xmm9, %xmm9
  495. movapd -12 * SIZE(AO), %xmm4
  496. movapd -12 * SIZE(BO), %xmm5
  497. pxor %xmm10, %xmm10
  498. movapd -10 * SIZE(AO), %xmm6
  499. movapd -8 * SIZE(BO), %xmm7
  500. pxor %xmm11, %xmm11
  501. pxor %xmm12, %xmm12
  502. PREFETCHW 3 * SIZE(CO1)
  503. pxor %xmm13, %xmm13
  504. PREFETCHW 3 * SIZE(CO2)
  505. pxor %xmm14, %xmm14
  506. pxor %xmm15, %xmm15
  507. #ifndef TRMMKERNEL
  508. movq K, %rax
  509. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  510. movq K, %rax
  511. subq KK, %rax
  512. movq %rax, KKK
  513. #else
  514. movq KK, %rax
  515. #ifdef LEFT
  516. addq $2, %rax
  517. #else
  518. addq $2, %rax
  519. #endif
  520. movq %rax, KKK
  521. #endif
  522. #ifndef GENERIC
  523. andq $-8, %rax
  524. leaq (, %rax, SIZE), %rax
  525. leaq (AO, %rax, 4), AO
  526. leaq (BO, %rax, 8), BO
  527. negq %rax
  528. NOBRANCH
  529. je .L15
  530. ALIGN_3
  531. .L12:
  532. KERNEL1(16 * 0)
  533. KERNEL2(16 * 0)
  534. KERNEL3(16 * 0)
  535. KERNEL4(16 * 0)
  536. KERNEL5(16 * 0)
  537. KERNEL6(16 * 0)
  538. KERNEL7(16 * 0)
  539. KERNEL8(16 * 0)
  540. KERNEL1(16 * 1)
  541. KERNEL2(16 * 1)
  542. KERNEL3(16 * 1)
  543. KERNEL4(16 * 1)
  544. KERNEL5(16 * 1)
  545. KERNEL6(16 * 1)
  546. KERNEL7(16 * 1)
  547. KERNEL8(16 * 1)
  548. addq $8 * SIZE, %rax
  549. NOBRANCH
  550. je .L15
  551. KERNEL1(16 * 0)
  552. KERNEL2(16 * 0)
  553. KERNEL3(16 * 0)
  554. KERNEL4(16 * 0)
  555. KERNEL5(16 * 0)
  556. KERNEL6(16 * 0)
  557. KERNEL7(16 * 0)
  558. KERNEL8(16 * 0)
  559. KERNEL1(16 * 1)
  560. KERNEL2(16 * 1)
  561. KERNEL3(16 * 1)
  562. KERNEL4(16 * 1)
  563. KERNEL5(16 * 1)
  564. KERNEL6(16 * 1)
  565. KERNEL7(16 * 1)
  566. KERNEL8(16 * 1)
  567. addq $8 * SIZE, %rax
  568. NOBRANCH
  569. je .L15
  570. KERNEL1(16 * 0)
  571. KERNEL2(16 * 0)
  572. KERNEL3(16 * 0)
  573. KERNEL4(16 * 0)
  574. KERNEL5(16 * 0)
  575. KERNEL6(16 * 0)
  576. KERNEL7(16 * 0)
  577. KERNEL8(16 * 0)
  578. KERNEL1(16 * 1)
  579. KERNEL2(16 * 1)
  580. KERNEL3(16 * 1)
  581. KERNEL4(16 * 1)
  582. KERNEL5(16 * 1)
  583. KERNEL6(16 * 1)
  584. KERNEL7(16 * 1)
  585. KERNEL8(16 * 1)
  586. addq $8 * SIZE, %rax
  587. NOBRANCH
  588. je .L15
  589. KERNEL1(16 * 0)
  590. KERNEL2(16 * 0)
  591. KERNEL3(16 * 0)
  592. KERNEL4(16 * 0)
  593. KERNEL5(16 * 0)
  594. KERNEL6(16 * 0)
  595. KERNEL7(16 * 0)
  596. KERNEL8(16 * 0)
  597. KERNEL1(16 * 1)
  598. KERNEL2(16 * 1)
  599. KERNEL3(16 * 1)
  600. KERNEL4(16 * 1)
  601. KERNEL5(16 * 1)
  602. KERNEL6(16 * 1)
  603. KERNEL7(16 * 1)
  604. KERNEL8(16 * 1)
  605. addq $8 * SIZE, %rax
  606. NOBRANCH
  607. je .L15
  608. KERNEL1(16 * 0)
  609. KERNEL2(16 * 0)
  610. KERNEL3(16 * 0)
  611. KERNEL4(16 * 0)
  612. KERNEL5(16 * 0)
  613. KERNEL6(16 * 0)
  614. KERNEL7(16 * 0)
  615. KERNEL8(16 * 0)
  616. KERNEL1(16 * 1)
  617. KERNEL2(16 * 1)
  618. KERNEL3(16 * 1)
  619. KERNEL4(16 * 1)
  620. KERNEL5(16 * 1)
  621. KERNEL6(16 * 1)
  622. KERNEL7(16 * 1)
  623. KERNEL8(16 * 1)
  624. addq $8 * SIZE, %rax
  625. NOBRANCH
  626. je .L15
  627. KERNEL1(16 * 0)
  628. KERNEL2(16 * 0)
  629. KERNEL3(16 * 0)
  630. KERNEL4(16 * 0)
  631. KERNEL5(16 * 0)
  632. KERNEL6(16 * 0)
  633. KERNEL7(16 * 0)
  634. KERNEL8(16 * 0)
  635. KERNEL1(16 * 1)
  636. KERNEL2(16 * 1)
  637. KERNEL3(16 * 1)
  638. KERNEL4(16 * 1)
  639. KERNEL5(16 * 1)
  640. KERNEL6(16 * 1)
  641. KERNEL7(16 * 1)
  642. KERNEL8(16 * 1)
  643. addq $8 * SIZE, %rax
  644. NOBRANCH
  645. je .L15
  646. KERNEL1(16 * 0)
  647. KERNEL2(16 * 0)
  648. KERNEL3(16 * 0)
  649. KERNEL4(16 * 0)
  650. KERNEL5(16 * 0)
  651. KERNEL6(16 * 0)
  652. KERNEL7(16 * 0)
  653. KERNEL8(16 * 0)
  654. KERNEL1(16 * 1)
  655. KERNEL2(16 * 1)
  656. KERNEL3(16 * 1)
  657. KERNEL4(16 * 1)
  658. KERNEL5(16 * 1)
  659. KERNEL6(16 * 1)
  660. KERNEL7(16 * 1)
  661. KERNEL8(16 * 1)
  662. addq $8 * SIZE, %rax
  663. NOBRANCH
  664. je .L15
  665. KERNEL1(16 * 0)
  666. KERNEL2(16 * 0)
  667. KERNEL3(16 * 0)
  668. KERNEL4(16 * 0)
  669. KERNEL5(16 * 0)
  670. KERNEL6(16 * 0)
  671. KERNEL7(16 * 0)
  672. KERNEL8(16 * 0)
  673. KERNEL1(16 * 1)
  674. KERNEL2(16 * 1)
  675. KERNEL3(16 * 1)
  676. KERNEL4(16 * 1)
  677. KERNEL5(16 * 1)
  678. KERNEL6(16 * 1)
  679. KERNEL7(16 * 1)
  680. KERNEL8(16 * 1)
  681. addq $8 * SIZE, %rax
  682. BRANCH
  683. jl .L12
  684. ALIGN_3
  685. .L15:
  686. PREFETCH 8 * SIZE(BB)
  687. subq $-16 * SIZE, BB
  688. #ifndef TRMMKERNEL
  689. movq K, %rax
  690. #else
  691. movq KKK, %rax
  692. #endif
  693. testq $4, %rax
  694. je .L16
  695. xorq %rax, %rax
  696. ALIGN_3
  697. KERNEL1(16 * 0)
  698. KERNEL2(16 * 0)
  699. KERNEL3(16 * 0)
  700. KERNEL4(16 * 0)
  701. KERNEL5(16 * 0)
  702. KERNEL6(16 * 0)
  703. KERNEL7(16 * 0)
  704. KERNEL8(16 * 0)
  705. addq $32 * SIZE, BO
  706. addq $16 * SIZE, AO
  707. ALIGN_3
  708. #else
  709. sarq $2, %rax
  710. NOBRANCH
  711. jle .L16
  712. ALIGN_3
  713. .L12:
  714. KERNEL1(16 * 0)
  715. KERNEL2(16 * 0)
  716. KERNEL3(16 * 0)
  717. KERNEL4(16 * 0)
  718. KERNEL5(16 * 0)
  719. KERNEL6(16 * 0)
  720. KERNEL7(16 * 0)
  721. KERNEL8(16 * 0)
  722. addq $ 32 * SIZE, BO
  723. subq $-16 * SIZE, AO
  724. decq %rax
  725. BRANCH
  726. jg .L12
  727. #endif
  728. .L16:
  729. movapd POSINV, %xmm5
  730. movapd ALPHA_R, %xmm6
  731. movapd ALPHA_I, %xmm7
  732. #ifndef TRMMKERNEL
  733. movq K, %rax
  734. #else
  735. movq KKK, %rax
  736. #endif
  737. andq $3, %rax # if (k & 3)
  738. je .L19
  739. leaq (, %rax, SIZE), %rax
  740. leaq (AO, %rax, 4), AO
  741. leaq (BO, %rax, 8), BO
  742. negq %rax
  743. ALIGN_3
  744. .L17:
  745. mulpd %xmm0, %xmm1
  746. addpd %xmm1, %xmm8
  747. movapd -14 * SIZE(BO, %rax, 8), %xmm1
  748. mulpd %xmm0, %xmm1
  749. addpd %xmm1, %xmm9
  750. movapd -12 * SIZE(BO, %rax, 8), %xmm1
  751. mulpd %xmm0, %xmm1
  752. mulpd -10 * SIZE(BO, %rax, 8), %xmm0
  753. addpd %xmm1, %xmm10
  754. movapd -16 * SIZE(BO, %rax, 8), %xmm1
  755. addpd %xmm0, %xmm11
  756. movapd -12 * SIZE(AO, %rax, 4), %xmm0
  757. mulpd %xmm2, %xmm1
  758. addpd %xmm1, %xmm12
  759. movapd -14 * SIZE(BO, %rax, 8), %xmm1
  760. mulpd %xmm2, %xmm1
  761. addpd %xmm1, %xmm13
  762. movapd -12 * SIZE(BO, %rax, 8), %xmm1
  763. mulpd %xmm2, %xmm1
  764. mulpd -10 * SIZE(BO, %rax, 8), %xmm2
  765. addpd %xmm1, %xmm14
  766. movapd -8 * SIZE(BO, %rax, 8), %xmm1
  767. addpd %xmm2, %xmm15
  768. movapd -10 * SIZE(AO, %rax, 4), %xmm2
  769. addq $SIZE, %rax
  770. jl .L17
  771. ALIGN_3
  772. .L19:
  773. #ifndef TRMMKERNEL
  774. movlpd 0 * SIZE(CO1), %xmm0
  775. movhpd 1 * SIZE(CO1), %xmm0
  776. movlpd 2 * SIZE(CO1), %xmm2
  777. movhpd 3 * SIZE(CO1), %xmm2
  778. movlpd 0 * SIZE(CO2), %xmm1
  779. movhpd 1 * SIZE(CO2), %xmm1
  780. movlpd 2 * SIZE(CO2), %xmm3
  781. movhpd 3 * SIZE(CO2), %xmm3
  782. #endif
  783. SHUFPD_1 %xmm9, %xmm9
  784. SHUFPD_1 %xmm11, %xmm11
  785. SHUFPD_1 %xmm13, %xmm13
  786. SHUFPD_1 %xmm15, %xmm15
  787. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  788. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  789. xorpd %xmm5, %xmm9
  790. xorpd %xmm5, %xmm11
  791. xorpd %xmm5, %xmm13
  792. xorpd %xmm5, %xmm15
  793. #else
  794. xorpd %xmm5, %xmm8
  795. xorpd %xmm5, %xmm10
  796. xorpd %xmm5, %xmm12
  797. xorpd %xmm5, %xmm14
  798. #endif
  799. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  800. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  801. subpd %xmm9, %xmm8
  802. subpd %xmm11, %xmm10
  803. subpd %xmm13, %xmm12
  804. subpd %xmm15, %xmm14
  805. #else
  806. addpd %xmm9, %xmm8
  807. addpd %xmm11, %xmm10
  808. addpd %xmm13, %xmm12
  809. addpd %xmm15, %xmm14
  810. #endif
  811. pshufd $0x4e, %xmm8, %xmm9
  812. pshufd $0x4e, %xmm10, %xmm11
  813. pshufd $0x4e, %xmm12, %xmm13
  814. pshufd $0x4e, %xmm14, %xmm15
  815. mulpd %xmm6, %xmm8
  816. mulpd %xmm7, %xmm9
  817. mulpd %xmm6, %xmm10
  818. mulpd %xmm7, %xmm11
  819. mulpd %xmm6, %xmm12
  820. mulpd %xmm7, %xmm13
  821. mulpd %xmm6, %xmm14
  822. mulpd %xmm7, %xmm15
  823. addpd %xmm9, %xmm8
  824. addpd %xmm11, %xmm10
  825. addpd %xmm13, %xmm12
  826. addpd %xmm15, %xmm14
  827. #ifndef TRMMKERNEL
  828. addpd %xmm0, %xmm8
  829. addpd %xmm2, %xmm12
  830. addpd %xmm1, %xmm10
  831. addpd %xmm3, %xmm14
  832. #endif
  833. movlpd %xmm8, 0 * SIZE(CO1)
  834. movhpd %xmm8, 1 * SIZE(CO1)
  835. movlpd %xmm12, 2 * SIZE(CO1)
  836. movhpd %xmm12, 3 * SIZE(CO1)
  837. movlpd %xmm10, 0 * SIZE(CO2)
  838. movhpd %xmm10, 1 * SIZE(CO2)
  839. movlpd %xmm14, 2 * SIZE(CO2)
  840. movhpd %xmm14, 3 * SIZE(CO2)
  841. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  842. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  843. movq K, %rax
  844. subq KKK, %rax
  845. leaq (,%rax, SIZE), %rax
  846. leaq (AO, %rax, 4), AO
  847. leaq (BO, %rax, 8), BO
  848. #endif
  849. #if defined(TRMMKERNEL) && defined(LEFT)
  850. addq $2, KK
  851. #endif
  852. addq $4 * SIZE, CO1 # coffset += 4
  853. addq $4 * SIZE, CO2 # coffset += 4
  854. decq I # i --
  855. jg .L10
  856. ALIGN_4
  857. .L30:
  858. testq $1, M
  859. jle .L99
  860. #if !defined(TRMMKERNEL) || \
  861. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  862. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  863. leaq 16 * SIZE + BUFFER, BO
  864. #else
  865. leaq 16 * SIZE + BUFFER, BO
  866. movq KK, %rax
  867. leaq (, %rax, SIZE), %rax
  868. leaq (AO, %rax, 2), AO
  869. leaq (BO, %rax, 8), BO
  870. #endif
  871. movapd -16 * SIZE(AO), %xmm0
  872. pxor %xmm8, %xmm8
  873. movapd -8 * SIZE(AO), %xmm2
  874. pxor %xmm9, %xmm9
  875. movapd -16 * SIZE(BO), %xmm1
  876. pxor %xmm10, %xmm10
  877. movapd -8 * SIZE(BO), %xmm3
  878. pxor %xmm11, %xmm11
  879. #ifndef TRMMKERNEL
  880. movq K, %rax
  881. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  882. movq K, %rax
  883. subq KK, %rax
  884. movq %rax, KKK
  885. #else
  886. movq KK, %rax
  887. #ifdef LEFT
  888. addq $1, %rax
  889. #else
  890. addq $2, %rax
  891. #endif
  892. movq %rax, KKK
  893. #endif
  894. sarq $3, %rax
  895. je .L44
  896. ALIGN_4
  897. .L41:
  898. mulpd %xmm0, %xmm1
  899. addpd %xmm1, %xmm8
  900. movapd -14 * SIZE(BO), %xmm1
  901. mulpd %xmm0, %xmm1
  902. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  903. addpd %xmm1, %xmm9
  904. movapd -12 * SIZE(BO), %xmm1
  905. mulpd %xmm0, %xmm1
  906. mulpd -10 * SIZE(BO), %xmm0
  907. addpd %xmm1, %xmm10
  908. movapd 0 * SIZE(BO), %xmm1
  909. addpd %xmm0, %xmm11
  910. movapd -14 * SIZE(AO), %xmm0
  911. mulpd %xmm0, %xmm3
  912. addpd %xmm3, %xmm8
  913. movapd -6 * SIZE(BO), %xmm3
  914. mulpd %xmm0, %xmm3
  915. addpd %xmm3, %xmm9
  916. movapd -4 * SIZE(BO), %xmm3
  917. mulpd %xmm0, %xmm3
  918. mulpd -2 * SIZE(BO), %xmm0
  919. addpd %xmm3, %xmm10
  920. movapd 8 * SIZE(BO), %xmm3
  921. addpd %xmm0, %xmm11
  922. movapd -12 * SIZE(AO), %xmm0
  923. mulpd %xmm0, %xmm1
  924. addpd %xmm1, %xmm8
  925. movapd 2 * SIZE(BO), %xmm1
  926. mulpd %xmm0, %xmm1
  927. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  928. addpd %xmm1, %xmm9
  929. movapd 4 * SIZE(BO), %xmm1
  930. mulpd %xmm0, %xmm1
  931. mulpd 6 * SIZE(BO), %xmm0
  932. addpd %xmm1, %xmm10
  933. movapd 16 * SIZE(BO), %xmm1
  934. addpd %xmm0, %xmm11
  935. movapd -10 * SIZE(AO), %xmm0
  936. mulpd %xmm0, %xmm3
  937. addpd %xmm3, %xmm8
  938. movapd 10 * SIZE(BO), %xmm3
  939. mulpd %xmm0, %xmm3
  940. addpd %xmm3, %xmm9
  941. movapd 12 * SIZE(BO), %xmm3
  942. mulpd %xmm0, %xmm3
  943. mulpd 14 * SIZE(BO), %xmm0
  944. addpd %xmm3, %xmm10
  945. movapd 24 * SIZE(BO), %xmm3
  946. addpd %xmm0, %xmm11
  947. movapd 0 * SIZE(AO), %xmm0
  948. mulpd %xmm2, %xmm1
  949. addpd %xmm1, %xmm8
  950. movapd 18 * SIZE(BO), %xmm1
  951. mulpd %xmm2, %xmm1
  952. addpd %xmm1, %xmm9
  953. movapd 20 * SIZE(BO), %xmm1
  954. mulpd %xmm2, %xmm1
  955. mulpd 22 * SIZE(BO), %xmm2
  956. addpd %xmm1, %xmm10
  957. movapd 32 * SIZE(BO), %xmm1
  958. addpd %xmm2, %xmm11
  959. movapd -6 * SIZE(AO), %xmm2
  960. mulpd %xmm2, %xmm3
  961. addpd %xmm3, %xmm8
  962. movapd 26 * SIZE(BO), %xmm3
  963. mulpd %xmm2, %xmm3
  964. addpd %xmm3, %xmm9
  965. movapd 28 * SIZE(BO), %xmm3
  966. mulpd %xmm2, %xmm3
  967. mulpd 30 * SIZE(BO), %xmm2
  968. addpd %xmm3, %xmm10
  969. movapd 40 * SIZE(BO), %xmm3
  970. addpd %xmm2, %xmm11
  971. movapd -4 * SIZE(AO), %xmm2
  972. mulpd %xmm2, %xmm1
  973. addpd %xmm1, %xmm8
  974. movapd 34 * SIZE(BO), %xmm1
  975. mulpd %xmm2, %xmm1
  976. addpd %xmm1, %xmm9
  977. movapd 36 * SIZE(BO), %xmm1
  978. mulpd %xmm2, %xmm1
  979. mulpd 38 * SIZE(BO), %xmm2
  980. addpd %xmm1, %xmm10
  981. movapd 48 * SIZE(BO), %xmm1
  982. addpd %xmm2, %xmm11
  983. movapd -2 * SIZE(AO), %xmm2
  984. mulpd %xmm2, %xmm3
  985. addpd %xmm3, %xmm8
  986. movapd 42 * SIZE(BO), %xmm3
  987. mulpd %xmm2, %xmm3
  988. addpd %xmm3, %xmm9
  989. movapd 44 * SIZE(BO), %xmm3
  990. mulpd %xmm2, %xmm3
  991. mulpd 46 * SIZE(BO), %xmm2
  992. addpd %xmm3, %xmm10
  993. movapd 56 * SIZE(BO), %xmm3
  994. addpd %xmm2, %xmm11
  995. movapd 8 * SIZE(AO), %xmm2
  996. subq $-16 * SIZE, AO
  997. addq $64 * SIZE, BO
  998. decq %rax
  999. jne .L41
  1000. ALIGN_4
  1001. .L44:
  1002. #ifndef TRMMKERNEL
  1003. movq K, %rax
  1004. #else
  1005. movq KKK, %rax
  1006. #endif
  1007. andq $4, %rax
  1008. BRANCH
  1009. jle .L45
  1010. mulpd %xmm0, %xmm1
  1011. addpd %xmm1, %xmm8
  1012. movapd -14 * SIZE(BO), %xmm1
  1013. mulpd %xmm0, %xmm1
  1014. addpd %xmm1, %xmm9
  1015. movapd -12 * SIZE(BO), %xmm1
  1016. mulpd %xmm0, %xmm1
  1017. mulpd -10 * SIZE(BO), %xmm0
  1018. addpd %xmm1, %xmm10
  1019. movapd 0 * SIZE(BO), %xmm1
  1020. addpd %xmm0, %xmm11
  1021. movapd -14 * SIZE(AO), %xmm0
  1022. mulpd %xmm0, %xmm3
  1023. addpd %xmm3, %xmm8
  1024. movapd -6 * SIZE(BO), %xmm3
  1025. mulpd %xmm0, %xmm3
  1026. addpd %xmm3, %xmm9
  1027. movapd -4 * SIZE(BO), %xmm3
  1028. mulpd %xmm0, %xmm3
  1029. mulpd -2 * SIZE(BO), %xmm0
  1030. addpd %xmm3, %xmm10
  1031. movapd 8 * SIZE(BO), %xmm3
  1032. addpd %xmm0, %xmm11
  1033. movapd -12 * SIZE(AO), %xmm0
  1034. mulpd %xmm0, %xmm1
  1035. addpd %xmm1, %xmm8
  1036. movapd 2 * SIZE(BO), %xmm1
  1037. mulpd %xmm0, %xmm1
  1038. addpd %xmm1, %xmm9
  1039. movapd 4 * SIZE(BO), %xmm1
  1040. mulpd %xmm0, %xmm1
  1041. mulpd 6 * SIZE(BO), %xmm0
  1042. addpd %xmm1, %xmm10
  1043. movapd 16 * SIZE(BO), %xmm1
  1044. addpd %xmm0, %xmm11
  1045. movapd -10 * SIZE(AO), %xmm0
  1046. mulpd %xmm0, %xmm3
  1047. addpd %xmm3, %xmm8
  1048. movapd 10 * SIZE(BO), %xmm3
  1049. mulpd %xmm0, %xmm3
  1050. addpd %xmm3, %xmm9
  1051. movapd 12 * SIZE(BO), %xmm3
  1052. mulpd %xmm0, %xmm3
  1053. mulpd 14 * SIZE(BO), %xmm0
  1054. addpd %xmm3, %xmm10
  1055. movapd 24 * SIZE(BO), %xmm3
  1056. addpd %xmm0, %xmm11
  1057. movapd -8 * SIZE(AO), %xmm0
  1058. addq $ 8 * SIZE, AO
  1059. addq $32 * SIZE, BO
  1060. ALIGN_4
  1061. .L45:
  1062. #ifndef TRMMKERNEL
  1063. movq K, %rax
  1064. #else
  1065. movq KKK, %rax
  1066. #endif
  1067. movapd POSINV, %xmm5
  1068. movapd ALPHA_R, %xmm6
  1069. movapd ALPHA_I, %xmm7
  1070. andq $3, %rax # if (k & 3)
  1071. BRANCH
  1072. jle .L47
  1073. ALIGN_4
  1074. .L46:
  1075. mulpd %xmm0, %xmm1
  1076. addpd %xmm1, %xmm8
  1077. movapd -14 * SIZE(BO), %xmm1
  1078. mulpd %xmm0, %xmm1
  1079. addpd %xmm1, %xmm9
  1080. movapd -12 * SIZE(BO), %xmm1
  1081. mulpd %xmm0, %xmm1
  1082. mulpd -10 * SIZE(BO), %xmm0
  1083. addpd %xmm1, %xmm10
  1084. movapd -8 * SIZE(BO), %xmm1
  1085. addpd %xmm0, %xmm11
  1086. movapd -14 * SIZE(AO), %xmm0
  1087. addq $2 * SIZE, AO
  1088. addq $8 * SIZE, BO
  1089. decq %rax
  1090. jg .L46
  1091. ALIGN_4
  1092. .L47:
  1093. #ifndef TRMMKERNEL
  1094. movlpd 0 * SIZE(CO1), %xmm0
  1095. movhpd 1 * SIZE(CO1), %xmm0
  1096. movlpd 0 * SIZE(CO2), %xmm1
  1097. movhpd 1 * SIZE(CO2), %xmm1
  1098. #endif
  1099. SHUFPD_1 %xmm9, %xmm9
  1100. SHUFPD_1 %xmm11, %xmm11
  1101. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1102. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1103. xorpd %xmm5, %xmm9
  1104. xorpd %xmm5, %xmm11
  1105. #else
  1106. xorpd %xmm5, %xmm8
  1107. xorpd %xmm5, %xmm10
  1108. #endif
  1109. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1110. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1111. subpd %xmm9, %xmm8
  1112. subpd %xmm11, %xmm10
  1113. #else
  1114. addpd %xmm9, %xmm8
  1115. addpd %xmm11, %xmm10
  1116. #endif
  1117. pshufd $0x4e, %xmm8, %xmm9
  1118. pshufd $0x4e, %xmm10, %xmm11
  1119. mulpd %xmm6, %xmm8
  1120. mulpd %xmm7, %xmm9
  1121. mulpd %xmm6, %xmm10
  1122. mulpd %xmm7, %xmm11
  1123. addpd %xmm9, %xmm8
  1124. addpd %xmm11, %xmm10
  1125. #ifndef TRMMKERNEL
  1126. addpd %xmm0, %xmm8
  1127. addpd %xmm1, %xmm10
  1128. #endif
  1129. movlpd %xmm8, 0 * SIZE(CO1)
  1130. movhpd %xmm8, 1 * SIZE(CO1)
  1131. movlpd %xmm10, 0 * SIZE(CO2)
  1132. movhpd %xmm10, 1 * SIZE(CO2)
  1133. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1134. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1135. movq K, %rax
  1136. subq KKK, %rax
  1137. leaq (,%rax, SIZE), %rax
  1138. leaq (AO, %rax, 2), AO
  1139. leaq (BO, %rax, 8), BO
  1140. #endif
  1141. #if defined(TRMMKERNEL) && defined(LEFT)
  1142. addq $1, KK
  1143. #endif
  1144. ALIGN_4
  1145. .L99:
  1146. #if defined(TRMMKERNEL) && !defined(LEFT)
  1147. addl $2, KK
  1148. #endif
  1149. leaq (C, LDC, 2), C # c += 2 * ldc
  1150. decq J # j --
  1151. jg .L01
  1152. .L100:
  1153. testq $1, N
  1154. jle .L999
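/* .L101: remaining single column when N is odd.  B is repacked into
   BUFFER (two copies of each double), then .L110 processes two rows
   of A at a time and .L130 handles a final odd row. */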
  1155. .L101:
  1156. #if defined(TRMMKERNEL) && defined(LEFT)
  1157. movq OFFSET, %rax
  1158. movq %rax, KK
  1159. #endif
  1160. /* Copying to Sub Buffer */
  1161. leaq BUFFER, BO
  1162. movq K, %rax
  1163. sarq $2, %rax
  1164. jle .L103
  1165. ALIGN_4
  1166. .L102:
  1167. movlpd 0 * SIZE(B), %xmm8
  1168. movlpd 1 * SIZE(B), %xmm9
  1169. movlpd 2 * SIZE(B), %xmm10
  1170. movlpd 3 * SIZE(B), %xmm11
  1171. movlpd 4 * SIZE(B), %xmm12
  1172. movlpd 5 * SIZE(B), %xmm13
  1173. movlpd 6 * SIZE(B), %xmm14
  1174. movlpd 7 * SIZE(B), %xmm15
  1175. movlpd %xmm8, 0 * SIZE(BO)
  1176. movlpd %xmm8, 1 * SIZE(BO)
  1177. movlpd %xmm9, 2 * SIZE(BO)
  1178. movlpd %xmm9, 3 * SIZE(BO)
  1179. movlpd %xmm10, 4 * SIZE(BO)
  1180. movlpd %xmm10, 5 * SIZE(BO)
  1181. movlpd %xmm11, 6 * SIZE(BO)
  1182. movlpd %xmm11, 7 * SIZE(BO)
  1183. movlpd %xmm12, 8 * SIZE(BO)
  1184. movlpd %xmm12, 9 * SIZE(BO)
  1185. movlpd %xmm13, 10 * SIZE(BO)
  1186. movlpd %xmm13, 11 * SIZE(BO)
  1187. movlpd %xmm14, 12 * SIZE(BO)
  1188. movlpd %xmm14, 13 * SIZE(BO)
  1189. movlpd %xmm15, 14 * SIZE(BO)
  1190. movlpd %xmm15, 15 * SIZE(BO)
  1191. subq $-16 * SIZE, BO
  1192. addq $ 8 * SIZE, B
  1193. decq %rax
  1194. jne .L102
  1195. ALIGN_4
  1196. .L103:
  1197. movq K, %rax
  1198. andq $3, %rax
  1199. BRANCH
  1200. jle .L105
  1201. ALIGN_4
  1202. .L104:
  1203. movlpd 0 * SIZE(B), %xmm8
  1204. movlpd 1 * SIZE(B), %xmm9
  1205. movlpd %xmm8, 0 * SIZE(BO)
  1206. movlpd %xmm8, 1 * SIZE(BO)
  1207. movlpd %xmm9, 2 * SIZE(BO)
  1208. movlpd %xmm9, 3 * SIZE(BO)
  1209. addq $4 * SIZE, BO
  1210. addq $2 * SIZE, B
  1211. decq %rax
  1212. jne .L104
  1213. ALIGN_4
  1214. .L105:
  1215. movq C, CO1 # coffset1 = c
  1216. movq A, AO # aoffset = a
  1217. movq M, I
  1218. sarq $1, I # i = (m >> 1)
  1219. jle .L130
  1220. ALIGN_4
  1221. .L110:
  1222. #if !defined(TRMMKERNEL) || \
  1223. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1224. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1225. leaq 16 * SIZE + BUFFER, BO
  1226. #else
  1227. leaq 16 * SIZE + BUFFER, BO
  1228. movq KK, %rax
  1229. leaq (, %rax, SIZE), %rax
  1230. leaq (AO, %rax, 4), AO
  1231. leaq (BO, %rax, 4), BO
  1232. #endif
  1233. movapd -16 * SIZE(AO), %xmm0
  1234. pxor %xmm8, %xmm8
  1235. movapd -16 * SIZE(BO), %xmm1
  1236. pxor %xmm9, %xmm9
  1237. movapd -8 * SIZE(AO), %xmm2
  1238. pxor %xmm12, %xmm12
  1239. movapd -8 * SIZE(BO), %xmm3
  1240. pxor %xmm13, %xmm13
  1241. PREFETCHW 3 * SIZE(CO1)
  1242. #ifndef TRMMKERNEL
  1243. movq K, %rax
  1244. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1245. movq K, %rax
  1246. subq KK, %rax
  1247. movq %rax, KKK
  1248. #else
  1249. movq KK, %rax
  1250. #ifdef LEFT
  1251. addq $2, %rax
  1252. #else
  1253. addq $1, %rax
  1254. #endif
  1255. movq %rax, KKK
  1256. #endif
  1257. sarq $2, %rax
  1258. je .L112
  1259. .L111:
  1260. mulpd %xmm0, %xmm1
  1261. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1262. mulpd -14 * SIZE(BO), %xmm0
  1263. addpd %xmm1, %xmm8
  1264. movapd -16 * SIZE(BO), %xmm1
  1265. addpd %xmm0, %xmm9
  1266. movapd -14 * SIZE(AO), %xmm0
  1267. mulpd %xmm0, %xmm1
  1268. mulpd -14 * SIZE(BO), %xmm0
  1269. addpd %xmm1, %xmm12
  1270. movapd -12 * SIZE(BO), %xmm1
  1271. addpd %xmm0, %xmm13
  1272. movapd -12 * SIZE(AO), %xmm0
  1273. mulpd %xmm0, %xmm1
  1274. mulpd -10 * SIZE(BO), %xmm0
  1275. addpd %xmm1, %xmm8
  1276. movapd -12 * SIZE(BO), %xmm1
  1277. addpd %xmm0, %xmm9
  1278. movapd -10 * SIZE(AO), %xmm0
  1279. mulpd %xmm0, %xmm1
  1280. mulpd -10 * SIZE(BO), %xmm0
  1281. addpd %xmm1, %xmm12
  1282. movapd 0 * SIZE(BO), %xmm1
  1283. addpd %xmm0, %xmm13
  1284. movapd 0 * SIZE(AO), %xmm0
  1285. mulpd %xmm2, %xmm3
  1286. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  1287. mulpd -6 * SIZE(BO), %xmm2
  1288. addpd %xmm3, %xmm8
  1289. movapd -8 * SIZE(BO), %xmm3
  1290. addpd %xmm2, %xmm9
  1291. movapd -6 * SIZE(AO), %xmm2
  1292. mulpd %xmm2, %xmm3
  1293. mulpd -6 * SIZE(BO), %xmm2
  1294. addpd %xmm3, %xmm12
  1295. movapd -4 * SIZE(BO), %xmm3
  1296. addpd %xmm2, %xmm13
  1297. movapd -4 * SIZE(AO), %xmm2
  1298. mulpd %xmm2, %xmm3
  1299. mulpd -2 * SIZE(BO), %xmm2
  1300. addpd %xmm3, %xmm8
  1301. movapd -4 * SIZE(BO), %xmm3
  1302. addpd %xmm2, %xmm9
  1303. movapd -2 * SIZE(AO), %xmm2
  1304. mulpd %xmm2, %xmm3
  1305. mulpd -2 * SIZE(BO), %xmm2
  1306. addpd %xmm3, %xmm12
  1307. movapd 8 * SIZE(BO), %xmm3
  1308. addpd %xmm2, %xmm13
  1309. movapd 8 * SIZE(AO), %xmm2
  1310. subq $-16 * SIZE, AO
  1311. subq $-16 * SIZE, BO
  1312. decq %rax
  1313. jne .L111
  1314. ALIGN_4
  1315. .L112:
  1316. #ifndef TRMMKERNEL
  1317. movq K, %rax
  1318. #else
  1319. movq KKK, %rax
  1320. #endif
  1321. movapd POSINV, %xmm5
  1322. movapd ALPHA_R, %xmm6
  1323. movapd ALPHA_I, %xmm7
  1324. andq $3, %rax # if (k & 3)
  1325. BRANCH
  1326. jle .L114
  1327. .L113:
  1328. mulpd %xmm0, %xmm1
  1329. mulpd -14 * SIZE(BO), %xmm0
  1330. addpd %xmm1, %xmm8
  1331. movapd -16 * SIZE(BO), %xmm1
  1332. addpd %xmm0, %xmm9
  1333. movapd -14 * SIZE(AO), %xmm0
  1334. mulpd %xmm0, %xmm1
  1335. mulpd -14 * SIZE(BO), %xmm0
  1336. addpd %xmm1, %xmm12
  1337. movapd -12 * SIZE(BO), %xmm1
  1338. addpd %xmm0, %xmm13
  1339. movapd -12 * SIZE(AO), %xmm0
  1340. addq $4 * SIZE, AO # aoffset += 4
  1341. addq $4 * SIZE, BO # boffset1 += 4
  1342. decq %rax
  1343. jg .L113
  1344. ALIGN_4
  1345. .L114:
  1346. #ifndef TRMMKERNEL
  1347. movlpd 0 * SIZE(CO1), %xmm0
  1348. movhpd 1 * SIZE(CO1), %xmm0
  1349. movlpd 2 * SIZE(CO1), %xmm2
  1350. movhpd 3 * SIZE(CO1), %xmm2
  1351. #endif
  1352. SHUFPD_1 %xmm9, %xmm9
  1353. SHUFPD_1 %xmm13, %xmm13
  1354. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1355. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1356. xorpd %xmm5, %xmm9
  1357. xorpd %xmm5, %xmm13
  1358. #else
  1359. xorpd %xmm5, %xmm8
  1360. xorpd %xmm5, %xmm12
  1361. #endif
  1362. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1363. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1364. subpd %xmm9, %xmm8
  1365. subpd %xmm13, %xmm12
  1366. #else
  1367. addpd %xmm9, %xmm8
  1368. addpd %xmm13, %xmm12
  1369. #endif
  1370. pshufd $0x4e, %xmm8, %xmm9
  1371. pshufd $0x4e, %xmm12, %xmm13
  1372. mulpd %xmm6, %xmm8
  1373. mulpd %xmm7, %xmm9
  1374. mulpd %xmm6, %xmm12
  1375. mulpd %xmm7, %xmm13
  1376. addpd %xmm9, %xmm8
  1377. addpd %xmm13, %xmm12
  1378. #ifndef TRMMKERNEL
  1379. addpd %xmm0, %xmm8
  1380. addpd %xmm2, %xmm12
  1381. #endif
  1382. movlpd %xmm8, 0 * SIZE(CO1)
  1383. movhpd %xmm8, 1 * SIZE(CO1)
  1384. movlpd %xmm12, 2 * SIZE(CO1)
  1385. movhpd %xmm12, 3 * SIZE(CO1)
  1386. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1387. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1388. movq K, %rax
  1389. subq KKK, %rax
  1390. leaq (,%rax, SIZE), %rax
  1391. leaq (AO, %rax, 4), AO
  1392. leaq (BO, %rax, 4), BO
  1393. #endif
  1394. #if defined(TRMMKERNEL) && defined(LEFT)
  1395. addq $2, KK
  1396. #endif
  1397. addq $4 * SIZE, CO1 # coffset += 4
  1398. decq I # i --
  1399. jg .L110
  1400. ALIGN_4
  1401. .L130:
  1402. testq $1, M
  1403. jle .L999
  1404. #if !defined(TRMMKERNEL) || \
  1405. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1406. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1407. leaq 16 * SIZE + BUFFER, BO
  1408. #else
  1409. leaq 16 * SIZE + BUFFER, BO
  1410. movq KK, %rax
  1411. leaq (, %rax, SIZE), %rax
  1412. leaq (AO, %rax, 2), AO
  1413. leaq (BO, %rax, 4), BO
  1414. #endif
  1415. movapd -16 * SIZE(AO), %xmm0
  1416. movapd -16 * SIZE(BO), %xmm1
  1417. movapd -8 * SIZE(AO), %xmm2
  1418. movapd -8 * SIZE(BO), %xmm3
  1419. pxor %xmm8, %xmm8
  1420. pxor %xmm9, %xmm9
  1421. pxor %xmm10, %xmm10
  1422. pxor %xmm11, %xmm11
  1423. #ifndef TRMMKERNEL
  1424. movq K, %rax
  1425. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1426. movq K, %rax
  1427. subq KK, %rax
  1428. movq %rax, KKK
  1429. #else
  1430. movq KK, %rax
  1431. #ifdef LEFT
  1432. addq $1, %rax
  1433. #else
  1434. addq $1, %rax
  1435. #endif
  1436. movq %rax, KKK
  1437. #endif
  1438. sarq $3, %rax
  1439. je .L144
  1440. ALIGN_4
  1441. .L141:
  1442. mulpd %xmm0, %xmm1
  1443. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1444. mulpd -14 * SIZE(BO), %xmm0
  1445. addpd %xmm1, %xmm8
  1446. movapd -12 * SIZE(BO), %xmm1
  1447. addpd %xmm0, %xmm9
  1448. movapd -14 * SIZE(AO), %xmm0
  1449. mulpd %xmm0, %xmm1
  1450. mulpd -10 * SIZE(BO), %xmm0
  1451. addpd %xmm1, %xmm10
  1452. movapd 0 * SIZE(BO), %xmm1
  1453. addpd %xmm0, %xmm11
  1454. movapd -12 * SIZE(AO), %xmm0
  1455. mulpd %xmm0, %xmm3
  1456. mulpd -6 * SIZE(BO), %xmm0
  1457. addpd %xmm3, %xmm8
  1458. movapd -4 * SIZE(BO), %xmm3
  1459. addpd %xmm0, %xmm9
  1460. movapd -10 * SIZE(AO), %xmm0
  1461. mulpd %xmm0, %xmm3
  1462. mulpd -2 * SIZE(BO), %xmm0
  1463. addpd %xmm3, %xmm10
  1464. movapd 8 * SIZE(BO), %xmm3
  1465. addpd %xmm0, %xmm11
  1466. movapd 0 * SIZE(AO), %xmm0
  1467. mulpd %xmm2, %xmm1
  1468. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  1469. mulpd 2 * SIZE(BO), %xmm2
  1470. addpd %xmm1, %xmm8
  1471. movapd 4 * SIZE(BO), %xmm1
  1472. addpd %xmm2, %xmm9
  1473. movapd -6 * SIZE(AO), %xmm2
  1474. mulpd %xmm2, %xmm1
  1475. mulpd 6 * SIZE(BO), %xmm2
  1476. addpd %xmm1, %xmm10
  1477. movapd 16 * SIZE(BO), %xmm1
  1478. addpd %xmm2, %xmm11
  1479. movapd -4 * SIZE(AO), %xmm2
  1480. mulpd %xmm2, %xmm3
  1481. mulpd 10 * SIZE(BO), %xmm2
  1482. addpd %xmm3, %xmm8
  1483. movapd 12 * SIZE(BO), %xmm3
  1484. addpd %xmm2, %xmm9
  1485. movapd -2 * SIZE(AO), %xmm2
  1486. mulpd %xmm2, %xmm3
  1487. mulpd 14 * SIZE(BO), %xmm2
  1488. addpd %xmm3, %xmm10
  1489. movapd 24 * SIZE(BO), %xmm3
  1490. addpd %xmm2, %xmm11
  1491. movapd 8 * SIZE(AO), %xmm2
  1492. subq $-16 * SIZE, AO
  1493. subq $-32 * SIZE, BO
  1494. decq %rax
  1495. jne .L141
  1496. ALIGN_4
  1497. .L144:
  1498. #ifndef TRMMKERNEL
  1499. movq K, %rax
  1500. #else
  1501. movq KKK, %rax
  1502. #endif
  1503. andq $4, %rax # if (k & 4)
  1504. BRANCH
  1505. jle .L145
  1506. mulpd %xmm0, %xmm1
  1507. mulpd -14 * SIZE(BO), %xmm0
  1508. addpd %xmm1, %xmm8
  1509. movapd -12 * SIZE(BO), %xmm1
  1510. addpd %xmm0, %xmm9
  1511. movapd -14 * SIZE(AO), %xmm0
  1512. mulpd %xmm0, %xmm1
  1513. mulpd -10 * SIZE(BO), %xmm0
  1514. addpd %xmm1, %xmm10
  1515. movapd 0 * SIZE(BO), %xmm1
  1516. addpd %xmm0, %xmm11
  1517. movapd -12 * SIZE(AO), %xmm0
  1518. mulpd %xmm0, %xmm3
  1519. mulpd -6 * SIZE(BO), %xmm0
  1520. addpd %xmm3, %xmm8
  1521. movapd -4 * SIZE(BO), %xmm3
  1522. addpd %xmm0, %xmm9
  1523. movapd -10 * SIZE(AO), %xmm0
  1524. mulpd %xmm0, %xmm3
  1525. mulpd -2 * SIZE(BO), %xmm0
  1526. addpd %xmm3, %xmm10
  1527. addpd %xmm0, %xmm11
  1528. movapd -8 * SIZE(AO), %xmm0
  1529. addq $8 * SIZE, AO
  1530. subq $-16 * SIZE, BO
  1531. ALIGN_4
  1532. .L145:
  1533. movapd POSINV, %xmm5
  1534. movapd ALPHA_R, %xmm6
  1535. movapd ALPHA_I, %xmm7
  1536. #ifndef TRMMKERNEL
  1537. movq K, %rax
  1538. #else
  1539. movq KKK, %rax
  1540. #endif
  1541. andq $3, %rax # if (k & 3)
  1542. BRANCH
  1543. jle .L148
  1544. ALIGN_4
  1545. .L146:
  1546. mulpd %xmm0, %xmm1
  1547. mulpd -14 * SIZE(BO), %xmm0
  1548. addpd %xmm1, %xmm8
  1549. movapd -12 * SIZE(BO), %xmm1
  1550. addpd %xmm0, %xmm9
  1551. movapd -14 * SIZE(AO), %xmm0
  1552. addq $2 * SIZE, AO # aoffset += 2
  1553. addq $4 * SIZE, BO # boffset1 += 4
  1554. decq %rax
  1555. jg .L146
  1556. ALIGN_4
  1557. .L148:
  1558. addpd %xmm10, %xmm8
  1559. addpd %xmm11, %xmm9
  1560. #ifndef TRMMKERNEL
  1561. movlpd 0 * SIZE(CO1), %xmm0
  1562. movhpd 1 * SIZE(CO1), %xmm0
  1563. #endif
  1564. SHUFPD_1 %xmm9, %xmm9
  1565. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1566. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1567. xorpd %xmm5, %xmm9
  1568. #else
  1569. xorpd %xmm5, %xmm8
  1570. #endif
  1571. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1572. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1573. subpd %xmm9, %xmm8
  1574. #else
  1575. addpd %xmm9, %xmm8
  1576. #endif
  1577. pshufd $0x4e, %xmm8, %xmm9
  1578. mulpd %xmm6, %xmm8
  1579. mulpd %xmm7, %xmm9
  1580. addpd %xmm9, %xmm8
  1581. #ifndef TRMMKERNEL
  1582. addpd %xmm0, %xmm8
  1583. #endif
  1584. movlpd %xmm8, 0 * SIZE(CO1)
  1585. movhpd %xmm8, 1 * SIZE(CO1)
  1586. ALIGN_4
  1587. .L999:
  1588. movq %rbx, %rsp
  1589. EMMS
  1590. movq 0(%rsp), %rbx
  1591. movq 8(%rsp), %rbp
  1592. movq 16(%rsp), %r12
  1593. movq 24(%rsp), %r13
  1594. movq 32(%rsp), %r14
  1595. movq 40(%rsp), %r15
  1596. #ifdef WINDOWS_ABI
  1597. movq 48(%rsp), %rdi
  1598. movq 56(%rsp), %rsi
  1599. movups 64(%rsp), %xmm6
  1600. movups 80(%rsp), %xmm7
  1601. movups 96(%rsp), %xmm8
  1602. movups 112(%rsp), %xmm9
  1603. movups 128(%rsp), %xmm10
  1604. movups 144(%rsp), %xmm11
  1605. movups 160(%rsp), %xmm12
  1606. movups 176(%rsp), %xmm13
  1607. movups 192(%rsp), %xmm14
  1608. movups 208(%rsp), %xmm15
  1609. #endif
  1610. addq $STACKSIZE, %rsp
  1611. ret
  1612. EPILOGUE