
zgemm_kernel_4x2_sse.S 49 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define N %r14
  44. #define K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define AO %rdi
  51. #define BO %rsi
  52. #define CO1 %r15
  53. #define CO2 %rbp
  54. #define BB %r12
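/* Register aliases (a reading aid, inferred from the code below): M, N, K hold
   the matrix dimensions, A, B, C and LDC the operand pointers and leading
   dimension of C; AO/BO walk the current A panel and the packed copy of B,
   CO1/CO2 address the two C columns of the tile being computed, and BB is a
   prefetch pointer that runs ahead in B. */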
  55. #ifndef WINDOWS_ABI
  56. #define STACKSIZE 64
  57. #else
  58. #define STACKSIZE 256
  59. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  60. #define OLD_A 48 + STACKSIZE(%rsp)
  61. #define OLD_B 56 + STACKSIZE(%rsp)
  62. #define OLD_C 64 + STACKSIZE(%rsp)
  63. #define OLD_LDC 72 + STACKSIZE(%rsp)
  64. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  65. #endif
  66. #define POSINV 0(%rsp)
  67. #define ALPHA_R 16(%rsp)
  68. #define ALPHA_I 32(%rsp)
  69. #define J 48(%rsp)
  70. #define OFFSET 56(%rsp)
  71. #define KK 64(%rsp)
  72. #define KKK 72(%rsp)
  73. #define BUFFER 256(%rsp)
  74. #ifdef OPTERON
  75. #define movsd movlps
  76. #endif
  77. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
  78. #define PREFETCH prefetch
  79. #define PREFETCHW prefetchw
  80. #define PREFETCHSIZE (16 * 5 + 8)
  81. #endif
  82. #if defined(PENTIUM4) || defined(GENERIC)
  83. #define PREFETCH prefetcht0
  84. #define PREFETCHW prefetcht0
  85. #define PREFETCHSIZE 160
  86. #endif
  87. #define RPREFETCHSIZE (8 * 7 + 4)
  88. #define WPREFETCHSIZE (8 * 8 + 4)
  89. #ifdef PREFETCH
  90. #define PREFETCH_KERNEL1(xx) PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;
  91. #define PREFETCH_KERNEL4(xx) PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;
  92. #else
  93. #define PREFETCH_KERNEL1(xx)
  94. #define PREFETCH_KERNEL4(xx)
  95. #endif
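/* KERNEL1..KERNEL8 below are fully unrolled steps of the 4x2 micro-kernel:
   each multiplies a 4-float vector of packed A (xmm0/xmm2/xmm4/xmm6) by
   broadcast B values and accumulates into xmm8-xmm15, the eight registers
   holding the partial real/imaginary products of the 4x2 complex tile of C.
   The GENERIC variant further down addresses AO/BO directly instead of
   indexing through %rax. */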
  96. #ifndef GENERIC
  97. #define KERNEL1(xx) \
  98. mulps %xmm0, %xmm1 ;\
  99. addps %xmm1, %xmm8 ;\
  100. movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  101. mulps %xmm0, %xmm3 ;\
  102. addps %xmm3, %xmm9 ;\
  103. movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  104. mulps %xmm0, %xmm5 ;\
  105. PREFETCH_KERNEL1(xx) \
  106. mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
  107. addps %xmm5, %xmm10 ;\
  108. movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  109. addps %xmm0, %xmm11 ;\
  110. movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
  111. #define KERNEL2(xx) \
  112. mulps %xmm2, %xmm1 ;\
  113. addps %xmm1, %xmm12 ;\
  114. movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  115. mulps %xmm2, %xmm3 ;\
  116. addps %xmm3, %xmm13 ;\
  117. movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  118. mulps %xmm2, %xmm5 ;\
  119. mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
  120. addps %xmm5, %xmm14 ;\
  121. movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  122. addps %xmm2, %xmm15 ;\
  123. movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
  124. #define KERNEL3(xx) \
  125. mulps %xmm4, %xmm7 ;\
  126. addps %xmm7, %xmm8 ;\
  127. movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  128. mulps %xmm4, %xmm3 ;\
  129. addps %xmm3, %xmm9 ;\
  130. movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  131. mulps %xmm4, %xmm5 ;\
  132. mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
  133. addps %xmm5, %xmm10 ;\
  134. movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  135. addps %xmm4, %xmm11 ;\
  136. movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
  137. #define KERNEL4(xx) \
  138. mulps %xmm6, %xmm7 ;\
  139. addps %xmm7, %xmm12 ;\
  140. movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  141. mulps %xmm6, %xmm3 ;\
  142. addps %xmm3, %xmm13 ;\
  143. movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  144. mulps %xmm6, %xmm5 ;\
  145. mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
  146. addps %xmm5, %xmm14 ;\
  147. movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  148. PREFETCH_KERNEL4(xx) \
  149. addps %xmm6, %xmm15 ;\
  150. movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
  151. #define KERNEL5(xx) \
  152. mulps %xmm0, %xmm1 ;\
  153. addps %xmm1, %xmm8 ;\
  154. movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  155. mulps %xmm0, %xmm3 ;\
  156. addps %xmm3, %xmm9 ;\
  157. movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  158. mulps %xmm0, %xmm5 ;\
  159. mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
  160. addps %xmm5, %xmm10 ;\
  161. movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  162. addps %xmm0, %xmm11 ;\
  163. movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
  164. #define KERNEL6(xx) \
  165. mulps %xmm2, %xmm1 ;\
  166. addps %xmm1, %xmm12 ;\
  167. movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
  168. mulps %xmm2, %xmm3 ;\
  169. addps %xmm3, %xmm13 ;\
  170. movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  171. mulps %xmm2, %xmm5 ;\
  172. mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
  173. addps %xmm5, %xmm14 ;\
  174. movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  175. addps %xmm2, %xmm15 ;\
  176. movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
  177. #define KERNEL7(xx) \
  178. mulps %xmm4, %xmm7 ;\
  179. addps %xmm7, %xmm8 ;\
  180. movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  181. mulps %xmm4, %xmm3 ;\
  182. addps %xmm3, %xmm9 ;\
  183. movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  184. mulps %xmm4, %xmm5 ;\
  185. mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
  186. addps %xmm5, %xmm10 ;\
  187. movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  188. addps %xmm4, %xmm11 ;\
  189. movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
  190. #define KERNEL8(xx) \
  191. mulps %xmm6, %xmm7 ;\
  192. addps %xmm7, %xmm12 ;\
  193. movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
  194. mulps %xmm6, %xmm3 ;\
  195. addps %xmm3, %xmm13 ;\
  196. movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
  197. mulps %xmm6, %xmm5 ;\
  198. mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
  199. addps %xmm5, %xmm14 ;\
  200. movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
  201. addps %xmm6, %xmm15 ;\
  202. movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
  203. #else
  204. #define KERNEL1(xx) \
  205. mulps %xmm0, %xmm1 ;\
  206. addps %xmm1, %xmm8 ;\
  207. movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  208. mulps %xmm0, %xmm3 ;\
  209. addps %xmm3, %xmm9 ;\
  210. movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  211. mulps %xmm0, %xmm5 ;\
  212. PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
  213. mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
  214. addps %xmm5, %xmm10 ;\
  215. movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  216. addps %xmm0, %xmm11 ;\
  217. movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
  218. #define KERNEL2(xx) \
  219. mulps %xmm2, %xmm1 ;\
  220. addps %xmm1, %xmm12 ;\
  221. movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  222. mulps %xmm2, %xmm3 ;\
  223. addps %xmm3, %xmm13 ;\
  224. movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  225. mulps %xmm2, %xmm5 ;\
  226. mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
  227. addps %xmm5, %xmm14 ;\
  228. movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  229. addps %xmm2, %xmm15 ;\
  230. movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
  231. #define KERNEL3(xx) \
  232. mulps %xmm4, %xmm7 ;\
  233. addps %xmm7, %xmm8 ;\
  234. movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  235. mulps %xmm4, %xmm3 ;\
  236. addps %xmm3, %xmm9 ;\
  237. movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  238. mulps %xmm4, %xmm5 ;\
  239. mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
  240. addps %xmm5, %xmm10 ;\
  241. movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  242. addps %xmm4, %xmm11 ;\
  243. movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
  244. #define KERNEL4(xx) \
  245. mulps %xmm6, %xmm7 ;\
  246. addps %xmm7, %xmm12 ;\
  247. movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  248. mulps %xmm6, %xmm3 ;\
  249. addps %xmm3, %xmm13 ;\
  250. movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  251. mulps %xmm6, %xmm5 ;\
  252. mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
  253. addps %xmm5, %xmm14 ;\
  254. movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  255. addps %xmm6, %xmm15 ;\
  256. movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
  257. #define KERNEL5(xx) \
  258. mulps %xmm0, %xmm1 ;\
  259. PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\
  260. addps %xmm1, %xmm8 ;\
  261. movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  262. mulps %xmm0, %xmm3 ;\
  263. addps %xmm3, %xmm9 ;\
  264. movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  265. mulps %xmm0, %xmm5 ;\
  266. mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
  267. addps %xmm5, %xmm10 ;\
  268. movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  269. addps %xmm0, %xmm11 ;\
  270. movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
  271. #define KERNEL6(xx) \
  272. mulps %xmm2, %xmm1 ;\
  273. addps %xmm1, %xmm12 ;\
  274. movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
  275. mulps %xmm2, %xmm3 ;\
  276. addps %xmm3, %xmm13 ;\
  277. movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  278. mulps %xmm2, %xmm5 ;\
  279. mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
  280. addps %xmm5, %xmm14 ;\
  281. movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  282. addps %xmm2, %xmm15 ;\
  283. movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
  284. #define KERNEL7(xx) \
  285. mulps %xmm4, %xmm7 ;\
  286. addps %xmm7, %xmm8 ;\
  287. movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  288. mulps %xmm4, %xmm3 ;\
  289. addps %xmm3, %xmm9 ;\
  290. movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  291. mulps %xmm4, %xmm5 ;\
  292. mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
  293. addps %xmm5, %xmm10 ;\
  294. movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  295. addps %xmm4, %xmm11 ;\
  296. movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
  297. #define KERNEL8(xx) \
  298. mulps %xmm6, %xmm7 ;\
  299. addps %xmm7, %xmm12 ;\
  300. movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
  301. mulps %xmm6, %xmm3 ;\
  302. addps %xmm3, %xmm13 ;\
  303. movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
  304. mulps %xmm6, %xmm5 ;\
  305. mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
  306. addps %xmm5, %xmm14 ;\
  307. movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
  308. addps %xmm6, %xmm15 ;\
  309. movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
  310. #endif
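/* Function entry: save the callee-saved GPRs (and, under WINDOWS_ABI, the
   register arguments plus xmm6-xmm15), carve out a 4096-byte-aligned local
   area below the stack pointer, and materialise the broadcast ALPHA_R/ALPHA_I
   values and the POSINV sign mask that is applied while packing B to
   implement the conjugation variants. */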
  311. PROLOGUE
  312. PROFCODE
  313. subq $STACKSIZE, %rsp
  314. movq %rbx, 0(%rsp)
  315. movq %rbp, 8(%rsp)
  316. movq %r12, 16(%rsp)
  317. movq %r13, 24(%rsp)
  318. movq %r14, 32(%rsp)
  319. movq %r15, 40(%rsp)
  320. #ifdef WINDOWS_ABI
  321. movq %rdi, 48(%rsp)
  322. movq %rsi, 56(%rsp)
  323. movups %xmm6, 64(%rsp)
  324. movups %xmm7, 80(%rsp)
  325. movups %xmm8, 96(%rsp)
  326. movups %xmm9, 112(%rsp)
  327. movups %xmm10, 128(%rsp)
  328. movups %xmm11, 144(%rsp)
  329. movups %xmm12, 160(%rsp)
  330. movups %xmm13, 176(%rsp)
  331. movups %xmm14, 192(%rsp)
  332. movups %xmm15, 208(%rsp)
  333. movq ARG1, OLD_M
  334. movq ARG2, OLD_N
  335. movq ARG3, K
  336. movq OLD_A, A
  337. movq OLD_B, B
  338. movq OLD_C, C
  339. movq OLD_LDC, LDC
  340. #ifdef TRMMKERNEL
  341. movsd OLD_OFFSET, %xmm12
  342. #endif
  343. movaps %xmm3, %xmm0
  344. movsd OLD_ALPHA_I, %xmm1
  345. #else
  346. movq 72(%rsp), LDC
  347. #ifdef TRMMKERNEL
  348. movsd 80(%rsp), %xmm12
  349. #endif
  350. #endif
  351. movq %rsp, %rbx # save old stack
  352. subq $256 + LOCAL_BUFFER_SIZE, %rsp
  353. andq $-4096, %rsp # align stack
  354. STACK_TOUCHING
  355. movq OLD_M, M
  356. movq OLD_N, N
  357. pxor %xmm7, %xmm7
  358. cmpeqps %xmm7, %xmm7
  359. pslld $31, %xmm7 # Generate mask
  360. pxor %xmm10, %xmm10
  361. shufps $0, %xmm0, %xmm0
  362. movaps %xmm0, 0 + ALPHA_R
  363. movss %xmm1, 4 + ALPHA_I
  364. movss %xmm1, 12 + ALPHA_I
  365. xorps %xmm7, %xmm1
  366. movss %xmm1, 0 + ALPHA_I
  367. movss %xmm1, 8 + ALPHA_I
  368. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  369. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  370. movss %xmm7, 0 + POSINV
  371. movss %xmm10, 4 + POSINV
  372. movss %xmm7, 8 + POSINV
  373. movss %xmm10,12 + POSINV
  374. #else
  375. movss %xmm10, 0 + POSINV
  376. movss %xmm7, 4 + POSINV
  377. movss %xmm10, 8 + POSINV
  378. movss %xmm7, 12 + POSINV
  379. #endif
  380. addq $32 * SIZE, A
  381. #ifdef TRMMKERNEL
  382. movsd %xmm12, OFFSET
  383. movsd %xmm12, KK
  384. #ifndef LEFT
  385. negq KK
  386. #endif
  387. #endif
  388. salq $ZBASE_SHIFT, LDC
  389. movq N, J
  390. sarq $1, J # j = (n >> 1)
  391. jle .L40
  392. ALIGN_4
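/* Outer loop over pairs of B columns (J = N >> 1). Each iteration first packs
   the two columns of B into BUFFER: every scalar is broadcast to a full
   4-lane vector, and the POSINV mask flips the sign of either the imaginary
   or the real broadcasts, depending on the conjugation mode. */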
  393. .L01:
  394. #if defined(TRMMKERNEL) && defined(LEFT)
  395. movq OFFSET, %rax
  396. movq %rax, KK
  397. #endif
  398. /* Copying to Sub Buffer */
  399. leaq BUFFER, BO
  400. movaps POSINV, %xmm7
  401. movq K, %rax
  402. sarq $2, %rax
  403. jle .L03
  404. addq %rax, %rax
  405. ALIGN_4
  406. .L02:
  407. PREFETCH (RPREFETCHSIZE + 0) * SIZE(B)
  408. movss 0 * SIZE(B), %xmm8
  409. movss 1 * SIZE(B), %xmm9
  410. movss 2 * SIZE(B), %xmm10
  411. movss 3 * SIZE(B), %xmm11
  412. movss 4 * SIZE(B), %xmm12
  413. movss 5 * SIZE(B), %xmm13
  414. movss 6 * SIZE(B), %xmm14
  415. movss 7 * SIZE(B), %xmm15
  416. PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO)
  417. shufps $0, %xmm8, %xmm8
  418. shufps $0, %xmm9, %xmm9
  419. shufps $0, %xmm10, %xmm10
  420. shufps $0, %xmm11, %xmm11
  421. shufps $0, %xmm12, %xmm12
  422. shufps $0, %xmm13, %xmm13
  423. shufps $0, %xmm14, %xmm14
  424. shufps $0, %xmm15, %xmm15
  425. PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO)
  426. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  427. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  428. xorps %xmm7, %xmm9
  429. xorps %xmm7, %xmm11
  430. xorps %xmm7, %xmm13
  431. xorps %xmm7, %xmm15
  432. #else
  433. xorps %xmm7, %xmm8
  434. xorps %xmm7, %xmm10
  435. xorps %xmm7, %xmm12
  436. xorps %xmm7, %xmm14
  437. #endif
  438. movaps %xmm8, 0 * SIZE(BO)
  439. movaps %xmm9, 4 * SIZE(BO)
  440. movaps %xmm10, 8 * SIZE(BO)
  441. movaps %xmm11, 12 * SIZE(BO)
  442. movaps %xmm12, 16 * SIZE(BO)
  443. movaps %xmm13, 20 * SIZE(BO)
  444. movaps %xmm14, 24 * SIZE(BO)
  445. movaps %xmm15, 28 * SIZE(BO)
  446. addq $32 * SIZE, BO
  447. addq $ 8 * SIZE, B
  448. decq %rax
  449. jne .L02
  450. ALIGN_4
  451. .L03:
  452. movq K, %rax
  453. andq $3, %rax
  454. BRANCH
  455. jle .L10
  456. ALIGN_4
  457. .L04:
  458. movss 0 * SIZE(B), %xmm8
  459. movss 1 * SIZE(B), %xmm9
  460. movss 2 * SIZE(B), %xmm10
  461. movss 3 * SIZE(B), %xmm11
  462. shufps $0, %xmm8, %xmm8
  463. shufps $0, %xmm9, %xmm9
  464. shufps $0, %xmm10, %xmm10
  465. shufps $0, %xmm11, %xmm11
  466. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  467. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  468. xorps %xmm7, %xmm9
  469. xorps %xmm7, %xmm11
  470. #else
  471. xorps %xmm7, %xmm8
  472. xorps %xmm7, %xmm10
  473. #endif
  474. movaps %xmm8, 0 * SIZE(BO)
  475. movaps %xmm9, 4 * SIZE(BO)
  476. movaps %xmm10, 8 * SIZE(BO)
  477. movaps %xmm11, 12 * SIZE(BO)
  478. addq $ 4 * SIZE, B
  479. addq $16 * SIZE, BO
  480. decq %rax
  481. jne .L04
  482. ALIGN_4
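/* Main loop over 4-row blocks of A (I = M >> 2): CO1/CO2 point at the two
   destination columns, BB prefetches the upcoming B data, and the unrolled
   KERNEL1..KERNEL8 sequence accumulates the 4x2 complex tile. */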
  483. .L10:
  484. movq C, CO1 # coffset1 = c
  485. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  486. movq A, AO # aoffset = a
  487. leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
  488. movq M, I
  489. sarq $2, I # i = (m >> 2)
  490. jle .L20
  491. ALIGN_4
  492. .L11:
  493. #if !defined(TRMMKERNEL) || \
  494. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  495. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  496. leaq 32 * SIZE + BUFFER, BO
  497. #else
  498. leaq 32 * SIZE + BUFFER, BO
  499. movq KK, %rax
  500. leaq (, %rax, 8), %rax
  501. leaq (AO, %rax, 4), AO
  502. leaq (BO, %rax, 8), BO
  503. #endif
  504. movaps -32 * SIZE(AO), %xmm0
  505. movaps -32 * SIZE(BO), %xmm1
  506. pxor %xmm8, %xmm8
  507. movaps -28 * SIZE(AO), %xmm2
  508. movaps -28 * SIZE(BO), %xmm3
  509. pxor %xmm9, %xmm9
  510. movaps -24 * SIZE(AO), %xmm4
  511. movaps -24 * SIZE(BO), %xmm5
  512. pxor %xmm10, %xmm10
  513. movaps -20 * SIZE(AO), %xmm6
  514. movaps -16 * SIZE(BO), %xmm7
  515. pxor %xmm11, %xmm11
  516. PREFETCHW 7 * SIZE(CO1)
  517. pxor %xmm12, %xmm12
  518. PREFETCHW 7 * SIZE(CO2)
  519. pxor %xmm13, %xmm13
  520. PREFETCH -32 * SIZE(BB)
  521. pxor %xmm14, %xmm14
  522. PREFETCH -16 * SIZE(BB)
  523. pxor %xmm15, %xmm15
  524. subq $-16 * SIZE, BB
  525. #ifndef TRMMKERNEL
  526. movq K, %rax
  527. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  528. movq K, %rax
  529. subq KK, %rax
  530. movq %rax, KKK
  531. #else
  532. movq KK, %rax
  533. #ifdef LEFT
  534. addq $4, %rax
  535. #else
  536. addq $2, %rax
  537. #endif
  538. movq %rax, KKK
  539. #endif
  540. #ifndef GENERIC
  541. andq $-8, %rax
  542. leaq (, %rax, 8), %rax
  543. leaq (AO, %rax, 4), AO
  544. leaq (BO, %rax, 8), BO
  545. negq %rax
  546. NOBRANCH
  547. je .L15
  548. ALIGN_3
  549. .L12:
  550. KERNEL1(16 * 0)
  551. KERNEL2(16 * 0)
  552. KERNEL3(16 * 0)
  553. KERNEL4(16 * 0)
  554. KERNEL5(16 * 0)
  555. KERNEL6(16 * 0)
  556. KERNEL7(16 * 0)
  557. KERNEL8(16 * 0)
  558. KERNEL1(16 * 2)
  559. KERNEL2(16 * 2)
  560. KERNEL3(16 * 2)
  561. KERNEL4(16 * 2)
  562. KERNEL5(16 * 2)
  563. KERNEL6(16 * 2)
  564. KERNEL7(16 * 2)
  565. KERNEL8(16 * 2)
  566. addq $16 * SIZE, %rax
  567. NOBRANCH
  568. je .L15
  569. KERNEL1(16 * 0)
  570. KERNEL2(16 * 0)
  571. KERNEL3(16 * 0)
  572. KERNEL4(16 * 0)
  573. KERNEL5(16 * 0)
  574. KERNEL6(16 * 0)
  575. KERNEL7(16 * 0)
  576. KERNEL8(16 * 0)
  577. KERNEL1(16 * 2)
  578. KERNEL2(16 * 2)
  579. KERNEL3(16 * 2)
  580. KERNEL4(16 * 2)
  581. KERNEL5(16 * 2)
  582. KERNEL6(16 * 2)
  583. KERNEL7(16 * 2)
  584. KERNEL8(16 * 2)
  585. addq $16 * SIZE, %rax
  586. NOBRANCH
  587. je .L15
  588. KERNEL1(16 * 0)
  589. KERNEL2(16 * 0)
  590. KERNEL3(16 * 0)
  591. KERNEL4(16 * 0)
  592. KERNEL5(16 * 0)
  593. KERNEL6(16 * 0)
  594. KERNEL7(16 * 0)
  595. KERNEL8(16 * 0)
  596. KERNEL1(16 * 2)
  597. KERNEL2(16 * 2)
  598. KERNEL3(16 * 2)
  599. KERNEL4(16 * 2)
  600. KERNEL5(16 * 2)
  601. KERNEL6(16 * 2)
  602. KERNEL7(16 * 2)
  603. KERNEL8(16 * 2)
  604. addq $16 * SIZE, %rax
  605. NOBRANCH
  606. je .L15
  607. KERNEL1(16 * 0)
  608. KERNEL2(16 * 0)
  609. KERNEL3(16 * 0)
  610. KERNEL4(16 * 0)
  611. KERNEL5(16 * 0)
  612. KERNEL6(16 * 0)
  613. KERNEL7(16 * 0)
  614. KERNEL8(16 * 0)
  615. KERNEL1(16 * 2)
  616. KERNEL2(16 * 2)
  617. KERNEL3(16 * 2)
  618. KERNEL4(16 * 2)
  619. KERNEL5(16 * 2)
  620. KERNEL6(16 * 2)
  621. KERNEL7(16 * 2)
  622. KERNEL8(16 * 2)
  623. addq $16 * SIZE, %rax
  624. NOBRANCH
  625. je .L15
  626. KERNEL1(16 * 0)
  627. KERNEL2(16 * 0)
  628. KERNEL3(16 * 0)
  629. KERNEL4(16 * 0)
  630. KERNEL5(16 * 0)
  631. KERNEL6(16 * 0)
  632. KERNEL7(16 * 0)
  633. KERNEL8(16 * 0)
  634. KERNEL1(16 * 2)
  635. KERNEL2(16 * 2)
  636. KERNEL3(16 * 2)
  637. KERNEL4(16 * 2)
  638. KERNEL5(16 * 2)
  639. KERNEL6(16 * 2)
  640. KERNEL7(16 * 2)
  641. KERNEL8(16 * 2)
  642. addq $16 * SIZE, %rax
  643. NOBRANCH
  644. je .L15
  645. KERNEL1(16 * 0)
  646. KERNEL2(16 * 0)
  647. KERNEL3(16 * 0)
  648. KERNEL4(16 * 0)
  649. KERNEL5(16 * 0)
  650. KERNEL6(16 * 0)
  651. KERNEL7(16 * 0)
  652. KERNEL8(16 * 0)
  653. KERNEL1(16 * 2)
  654. KERNEL2(16 * 2)
  655. KERNEL3(16 * 2)
  656. KERNEL4(16 * 2)
  657. KERNEL5(16 * 2)
  658. KERNEL6(16 * 2)
  659. KERNEL7(16 * 2)
  660. KERNEL8(16 * 2)
  661. addq $16 * SIZE, %rax
  662. NOBRANCH
  663. je .L15
  664. KERNEL1(16 * 0)
  665. KERNEL2(16 * 0)
  666. KERNEL3(16 * 0)
  667. KERNEL4(16 * 0)
  668. KERNEL5(16 * 0)
  669. KERNEL6(16 * 0)
  670. KERNEL7(16 * 0)
  671. KERNEL8(16 * 0)
  672. KERNEL1(16 * 2)
  673. KERNEL2(16 * 2)
  674. KERNEL3(16 * 2)
  675. KERNEL4(16 * 2)
  676. KERNEL5(16 * 2)
  677. KERNEL6(16 * 2)
  678. KERNEL7(16 * 2)
  679. KERNEL8(16 * 2)
  680. addq $16 * SIZE, %rax
  681. NOBRANCH
  682. je .L15
  683. KERNEL1(16 * 0)
  684. KERNEL2(16 * 0)
  685. KERNEL3(16 * 0)
  686. KERNEL4(16 * 0)
  687. KERNEL5(16 * 0)
  688. KERNEL6(16 * 0)
  689. KERNEL7(16 * 0)
  690. KERNEL8(16 * 0)
  691. KERNEL1(16 * 2)
  692. KERNEL2(16 * 2)
  693. KERNEL3(16 * 2)
  694. KERNEL4(16 * 2)
  695. KERNEL5(16 * 2)
  696. KERNEL6(16 * 2)
  697. KERNEL7(16 * 2)
  698. KERNEL8(16 * 2)
  699. addq $16 * SIZE, %rax
  700. BRANCH
  701. jl .L12
  702. ALIGN_3
  703. .L15:
  704. #ifndef TRMMKERNEL
  705. movq K, %rax
  706. #else
  707. movq KKK, %rax
  708. #endif
  709. testq $4, %rax
  710. je .L16
  711. xorq %rax, %rax
  712. ALIGN_3
  713. KERNEL1(16 * 0)
  714. KERNEL2(16 * 0)
  715. KERNEL3(16 * 0)
  716. KERNEL4(16 * 0)
  717. KERNEL5(16 * 0)
  718. KERNEL6(16 * 0)
  719. KERNEL7(16 * 0)
  720. KERNEL8(16 * 0)
  721. addq $64 * SIZE, BO
  722. addq $32 * SIZE, AO
  723. ALIGN_3
  724. #else
  725. sarq $2, %rax
  726. NOBRANCH
  727. jle .L16
  728. ALIGN_3
  729. .L12:
  730. KERNEL1(16 * 0)
  731. KERNEL2(16 * 0)
  732. KERNEL3(16 * 0)
  733. KERNEL4(16 * 0)
  734. KERNEL5(16 * 0)
  735. KERNEL6(16 * 0)
  736. KERNEL7(16 * 0)
  737. KERNEL8(16 * 0)
  738. addq $ 64 * SIZE, BO
  739. subq $-32 * SIZE, AO
  740. decq %rax
  741. BRANCH
  742. jg .L12
  743. #endif
  744. .L16:
  745. #ifndef TRMMKERNEL
  746. movq K, %rax
  747. #else
  748. movq KKK, %rax
  749. #endif
  750. movaps ALPHA_R, %xmm6
  751. movaps ALPHA_I, %xmm7
  752. andq $3, %rax # if (k & 3)
  753. BRANCH
  754. je .L18
  755. leaq (, %rax, 8), %rax
  756. leaq (AO, %rax, 4), AO
  757. leaq (BO, %rax, 8), BO
  758. negq %rax
  759. ALIGN_4
  760. .L17:
  761. mulps %xmm0, %xmm1
  762. addps %xmm1, %xmm8
  763. movaps -28 * SIZE(BO, %rax, 8), %xmm1
  764. mulps %xmm0, %xmm1
  765. addps %xmm1, %xmm9
  766. movaps -24 * SIZE(BO, %rax, 8), %xmm1
  767. mulps %xmm0, %xmm1
  768. mulps -20 * SIZE(BO, %rax, 8), %xmm0
  769. addps %xmm1, %xmm10
  770. movaps -32 * SIZE(BO, %rax, 8), %xmm1
  771. addps %xmm0, %xmm11
  772. movaps -24 * SIZE(AO, %rax, 4), %xmm0
  773. mulps %xmm2, %xmm1
  774. addps %xmm1, %xmm12
  775. movaps -28 * SIZE(BO, %rax, 8), %xmm1
  776. mulps %xmm2, %xmm1
  777. addps %xmm1, %xmm13
  778. movaps -24 * SIZE(BO, %rax, 8), %xmm1
  779. mulps %xmm2, %xmm1
  780. mulps -20 * SIZE(BO, %rax, 8), %xmm2
  781. addps %xmm1, %xmm14
  782. movaps -16 * SIZE(BO, %rax, 8), %xmm1
  783. addps %xmm2, %xmm15
  784. movaps -20 * SIZE(AO, %rax, 4), %xmm2
  785. addq $SIZE * 2, %rax
  786. jl .L17
  787. ALIGN_4
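/* Tile epilogue: shufps $0xb1 swaps the real/imaginary lanes of the odd
   accumulators so that, depending on the conjugation variant, they are added
   to or subtracted from their partners to form the complex dot products.
   The results are then scaled by alpha via ALPHA_R/ALPHA_I and, unless this
   is a TRMM kernel, added to the existing contents of C. */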
  788. .L18:
  789. #ifndef TRMMKERNEL
  790. movsd 0 * SIZE(CO1), %xmm0
  791. movhps 2 * SIZE(CO1), %xmm0
  792. movsd 4 * SIZE(CO1), %xmm2
  793. movhps 6 * SIZE(CO1), %xmm2
  794. movsd 0 * SIZE(CO2), %xmm1
  795. movhps 2 * SIZE(CO2), %xmm1
  796. movsd 4 * SIZE(CO2), %xmm3
  797. movhps 6 * SIZE(CO2), %xmm3
  798. #endif
  799. shufps $0xb1, %xmm9, %xmm9
  800. shufps $0xb1, %xmm11, %xmm11
  801. shufps $0xb1, %xmm13, %xmm13
  802. shufps $0xb1, %xmm15, %xmm15
  803. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  804. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  805. subps %xmm9, %xmm8
  806. subps %xmm11, %xmm10
  807. subps %xmm13, %xmm12
  808. subps %xmm15, %xmm14
  809. #else
  810. addps %xmm9, %xmm8
  811. addps %xmm11, %xmm10
  812. addps %xmm13, %xmm12
  813. addps %xmm15, %xmm14
  814. #endif
  815. movaps %xmm8, %xmm9
  816. movaps %xmm10, %xmm11
  817. movaps %xmm12, %xmm13
  818. movaps %xmm14, %xmm15
  819. shufps $0xb1, %xmm8, %xmm8
  820. shufps $0xb1, %xmm10, %xmm10
  821. shufps $0xb1, %xmm12, %xmm12
  822. shufps $0xb1, %xmm14, %xmm14
  823. mulps %xmm6, %xmm9
  824. mulps %xmm7, %xmm8
  825. mulps %xmm6, %xmm11
  826. mulps %xmm7, %xmm10
  827. mulps %xmm6, %xmm13
  828. mulps %xmm7, %xmm12
  829. mulps %xmm6, %xmm15
  830. mulps %xmm7, %xmm14
  831. addps %xmm9, %xmm8
  832. addps %xmm11, %xmm10
  833. addps %xmm13, %xmm12
  834. addps %xmm15, %xmm14
  835. #ifndef TRMMKERNEL
  836. addps %xmm0, %xmm8
  837. addps %xmm1, %xmm10
  838. addps %xmm2, %xmm12
  839. addps %xmm3, %xmm14
  840. #endif
  841. movsd %xmm8, 0 * SIZE(CO1)
  842. movhps %xmm8, 2 * SIZE(CO1)
  843. movsd %xmm12, 4 * SIZE(CO1)
  844. movhps %xmm12, 6 * SIZE(CO1)
  845. movsd %xmm10, 0 * SIZE(CO2)
  846. movhps %xmm10, 2 * SIZE(CO2)
  847. movsd %xmm14, 4 * SIZE(CO2)
  848. movhps %xmm14, 6 * SIZE(CO2)
  849. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  850. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  851. movq K, %rax
  852. subq KKK, %rax
  853. leaq (,%rax, 8), %rax
  854. leaq (AO, %rax, 4), AO
  855. leaq (BO, %rax, 8), BO
  856. #endif
  857. #if defined(TRMMKERNEL) && defined(LEFT)
  858. addq $4, KK
  859. #endif
  860. addq $8 * SIZE, CO1 # coffset += 4
  861. addq $8 * SIZE, CO2 # coffset += 4
  862. decq I # i --
  863. jg .L11
  864. ALIGN_4
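/* Remainder path for two leftover rows (M & 2): the same two-column update,
   using only xmm8-xmm11 as accumulators. */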
  865. .L20:
  866. testq $2, M
  867. je .L30
  868. #if !defined(TRMMKERNEL) || \
  869. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  870. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  871. leaq 32 * SIZE + BUFFER, BO
  872. #else
  873. leaq 32 * SIZE + BUFFER, BO
  874. movq KK, %rax
  875. leaq (, %rax, 8), %rax
  876. leaq (AO, %rax, 2), AO
  877. leaq (BO, %rax, 8), BO
  878. #endif
  879. movaps -32 * SIZE(AO), %xmm0
  880. movaps -16 * SIZE(AO), %xmm2
  881. movaps 0 * SIZE(AO), %xmm4
  882. movaps 16 * SIZE(AO), %xmm6
  883. movaps -32 * SIZE(BO), %xmm1
  884. movaps -16 * SIZE(BO), %xmm3
  885. movaps 0 * SIZE(BO), %xmm5
  886. movaps 16 * SIZE(BO), %xmm7
  887. pxor %xmm8, %xmm8
  888. pxor %xmm9, %xmm9
  889. pxor %xmm10, %xmm10
  890. pxor %xmm11, %xmm11
  891. #ifndef TRMMKERNEL
  892. movq K, %rax
  893. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  894. movq K, %rax
  895. subq KK, %rax
  896. movq %rax, KKK
  897. #else
  898. movq KK, %rax
  899. #ifdef LEFT
  900. addq $2, %rax
  901. #else
  902. addq $2, %rax
  903. #endif
  904. movq %rax, KKK
  905. #endif
  906. sarq $3, %rax
  907. je .L25
  908. ALIGN_4
  909. .L22:
  910. mulps %xmm0, %xmm1
  911. #ifdef PREFETCH
  912. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  913. #endif
  914. addps %xmm1, %xmm8
  915. movaps -28 * SIZE(BO), %xmm1
  916. mulps %xmm0, %xmm1
  917. addps %xmm1, %xmm9
  918. movaps -24 * SIZE(BO), %xmm1
  919. mulps %xmm0, %xmm1
  920. mulps -20 * SIZE(BO), %xmm0
  921. addps %xmm1, %xmm10
  922. movaps 32 * SIZE(BO), %xmm1
  923. addps %xmm0, %xmm11
  924. movaps -28 * SIZE(AO), %xmm0
  925. mulps %xmm0, %xmm3
  926. addps %xmm3, %xmm8
  927. movaps -12 * SIZE(BO), %xmm3
  928. mulps %xmm0, %xmm3
  929. addps %xmm3, %xmm9
  930. movaps -8 * SIZE(BO), %xmm3
  931. mulps %xmm0, %xmm3
  932. mulps -4 * SIZE(BO), %xmm0
  933. addps %xmm3, %xmm10
  934. movaps 48 * SIZE(BO), %xmm3
  935. addps %xmm0, %xmm11
  936. movaps -24 * SIZE(AO), %xmm0
  937. mulps %xmm0, %xmm5
  938. addps %xmm5, %xmm8
  939. movaps 4 * SIZE(BO), %xmm5
  940. mulps %xmm0, %xmm5
  941. addps %xmm5, %xmm9
  942. movaps 8 * SIZE(BO), %xmm5
  943. mulps %xmm0, %xmm5
  944. mulps 12 * SIZE(BO), %xmm0
  945. addps %xmm5, %xmm10
  946. movaps 64 * SIZE(BO), %xmm5
  947. addps %xmm0, %xmm11
  948. movaps -20 * SIZE(AO), %xmm0
  949. mulps %xmm0, %xmm7
  950. addps %xmm7, %xmm8
  951. movaps 20 * SIZE(BO), %xmm7
  952. mulps %xmm0, %xmm7
  953. addps %xmm7, %xmm9
  954. movaps 24 * SIZE(BO), %xmm7
  955. mulps %xmm0, %xmm7
  956. mulps 28 * SIZE(BO), %xmm0
  957. addps %xmm7, %xmm10
  958. movaps 80 * SIZE(BO), %xmm7
  959. addps %xmm0, %xmm11
  960. movaps 0 * SIZE(AO), %xmm0
  961. mulps %xmm2, %xmm1
  962. #ifdef PREFETCH
  963. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  964. #endif
  965. addps %xmm1, %xmm8
  966. movaps 36 * SIZE(BO), %xmm1
  967. mulps %xmm2, %xmm1
  968. addps %xmm1, %xmm9
  969. movaps 40 * SIZE(BO), %xmm1
  970. mulps %xmm2, %xmm1
  971. mulps 44 * SIZE(BO), %xmm2
  972. addps %xmm1, %xmm10
  973. movaps 96 * SIZE(BO), %xmm1
  974. addps %xmm2, %xmm11
  975. movaps -12 * SIZE(AO), %xmm2
  976. mulps %xmm2, %xmm3
  977. addps %xmm3, %xmm8
  978. movaps 52 * SIZE(BO), %xmm3
  979. mulps %xmm2, %xmm3
  980. addps %xmm3, %xmm9
  981. movaps 56 * SIZE(BO), %xmm3
  982. mulps %xmm2, %xmm3
  983. mulps 60 * SIZE(BO), %xmm2
  984. addps %xmm3, %xmm10
  985. movaps 112 * SIZE(BO), %xmm3
  986. addps %xmm2, %xmm11
  987. movaps -8 * SIZE(AO), %xmm2
  988. mulps %xmm2, %xmm5
  989. addps %xmm5, %xmm8
  990. movaps 68 * SIZE(BO), %xmm5
  991. mulps %xmm2, %xmm5
  992. addps %xmm5, %xmm9
  993. movaps 72 * SIZE(BO), %xmm5
  994. mulps %xmm2, %xmm5
  995. mulps 76 * SIZE(BO), %xmm2
  996. addps %xmm5, %xmm10
  997. movaps 128 * SIZE(BO), %xmm5
  998. addps %xmm2, %xmm11
  999. movaps -4 * SIZE(AO), %xmm2
  1000. mulps %xmm2, %xmm7
  1001. addps %xmm7, %xmm8
  1002. movaps 84 * SIZE(BO), %xmm7
  1003. mulps %xmm2, %xmm7
  1004. addps %xmm7, %xmm9
  1005. movaps 88 * SIZE(BO), %xmm7
  1006. mulps %xmm2, %xmm7
  1007. mulps 92 * SIZE(BO), %xmm2
  1008. addps %xmm7, %xmm10
  1009. movaps 144 * SIZE(BO), %xmm7
  1010. addps %xmm2, %xmm11
  1011. movaps 16 * SIZE(AO), %xmm2
  1012. subq $ -32 * SIZE, AO
  1013. subq $-128 * SIZE, BO
  1014. decq %rax
  1015. jne .L22
  1016. ALIGN_4
  1017. .L25:
  1018. #ifndef TRMMKERNEL
  1019. movq K, %rax
  1020. #else
  1021. movq KKK, %rax
  1022. #endif
  1023. movaps ALPHA_R, %xmm6
  1024. movaps ALPHA_I, %xmm7
  1025. andq $7, %rax # if (k & 7)
  1026. BRANCH
  1027. je .L28
  1028. ALIGN_4
  1029. .L26:
  1030. mulps %xmm0, %xmm1
  1031. addps %xmm1, %xmm8
  1032. movaps -28 * SIZE(BO), %xmm1
  1033. mulps %xmm0, %xmm1
  1034. addps %xmm1, %xmm9
  1035. movaps -24 * SIZE(BO), %xmm1
  1036. mulps %xmm0, %xmm1
  1037. mulps -20 * SIZE(BO), %xmm0
  1038. addps %xmm1, %xmm10
  1039. movaps -16 * SIZE(BO), %xmm1
  1040. addps %xmm0, %xmm11
  1041. movaps -28 * SIZE(AO), %xmm0
  1042. subq $- 4 * SIZE, AO
  1043. subq $-16 * SIZE, BO
  1044. decq %rax
  1045. jg .L26
  1046. ALIGN_4
  1047. .L28:
  1048. #ifndef TRMMKERNEL
  1049. movsd 0 * SIZE(CO1), %xmm0
  1050. movhps 2 * SIZE(CO1), %xmm0
  1051. movsd 0 * SIZE(CO2), %xmm1
  1052. movhps 2 * SIZE(CO2), %xmm1
  1053. #endif
  1054. shufps $0xb1, %xmm9, %xmm9
  1055. shufps $0xb1, %xmm11, %xmm11
  1056. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1057. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1058. subps %xmm9, %xmm8
  1059. subps %xmm11, %xmm10
  1060. #else
  1061. addps %xmm9, %xmm8
  1062. addps %xmm11, %xmm10
  1063. #endif
  1064. movaps %xmm8, %xmm9
  1065. movaps %xmm10, %xmm11
  1066. shufps $0xb1, %xmm8, %xmm8
  1067. shufps $0xb1, %xmm10, %xmm10
  1068. mulps %xmm6, %xmm9
  1069. mulps %xmm7, %xmm8
  1070. mulps %xmm6, %xmm11
  1071. mulps %xmm7, %xmm10
  1072. addps %xmm9, %xmm8
  1073. addps %xmm11, %xmm10
  1074. #ifndef TRMMKERNEL
  1075. addps %xmm0, %xmm8
  1076. addps %xmm1, %xmm10
  1077. #endif
  1078. movsd %xmm8, 0 * SIZE(CO1)
  1079. movhps %xmm8, 2 * SIZE(CO1)
  1080. movsd %xmm10, 0 * SIZE(CO2)
  1081. movhps %xmm10, 2 * SIZE(CO2)
  1082. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1083. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1084. movq K, %rax
  1085. subq KKK, %rax
  1086. leaq (,%rax, 8), %rax
  1087. leaq (AO, %rax, 2), AO
  1088. leaq (BO, %rax, 8), BO
  1089. #endif
  1090. #if defined(TRMMKERNEL) && defined(LEFT)
  1091. addq $2, KK
  1092. #endif
  1093. addq $4 * SIZE, CO1 # coffset += 2
  1094. addq $4 * SIZE, CO2 # coffset += 2
  1095. ALIGN_4
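/* Remainder path for a single leftover row (M & 1). */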
  1096. .L30:
  1097. testq $1, M
  1098. je .L39
  1099. #if !defined(TRMMKERNEL) || \
  1100. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1101. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1102. leaq 32 * SIZE + BUFFER, BO
  1103. #else
  1104. leaq 32 * SIZE + BUFFER, BO
  1105. movq KK, %rax
  1106. leaq (, %rax, 8), %rax
  1107. leaq (AO, %rax, 1), AO
  1108. leaq (BO, %rax, 8), BO
  1109. #endif
  1110. movaps -32 * SIZE(AO), %xmm0
  1111. movaps -24 * SIZE(AO), %xmm2
  1112. movaps -32 * SIZE(BO), %xmm1
  1113. movaps -16 * SIZE(BO), %xmm3
  1114. movaps 0 * SIZE(BO), %xmm5
  1115. movaps 16 * SIZE(BO), %xmm7
  1116. pxor %xmm8, %xmm8
  1117. pxor %xmm9, %xmm9
  1118. pxor %xmm10, %xmm10
  1119. pxor %xmm11, %xmm11
  1120. #ifndef TRMMKERNEL
  1121. movq K, %rax
  1122. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1123. movq K, %rax
  1124. subq KK, %rax
  1125. movq %rax, KKK
  1126. #else
  1127. movq KK, %rax
  1128. #ifdef LEFT
  1129. addq $1, %rax
  1130. #else
  1131. addq $2, %rax
  1132. #endif
  1133. movq %rax, KKK
  1134. #endif
  1135. sarq $3, %rax
  1136. je .L35
  1137. ALIGN_4
  1138. .L32:
  1139. mulps %xmm0, %xmm1
  1140. #ifdef PREFETCH
  1141. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1142. #endif
  1143. addps %xmm1, %xmm8
  1144. movaps -28 * SIZE(BO), %xmm1
  1145. mulps %xmm0, %xmm1
  1146. addps %xmm1, %xmm9
  1147. movaps -24 * SIZE(BO), %xmm1
  1148. mulps %xmm0, %xmm1
  1149. addps %xmm1, %xmm10
  1150. movaps -20 * SIZE(BO), %xmm1
  1151. mulps %xmm0, %xmm1
  1152. movsd -30 * SIZE(AO), %xmm0
  1153. addps %xmm1, %xmm11
  1154. movaps 32 * SIZE(BO), %xmm1
  1155. mulps %xmm0, %xmm3
  1156. addps %xmm3, %xmm8
  1157. movaps -12 * SIZE(BO), %xmm3
  1158. mulps %xmm0, %xmm3
  1159. addps %xmm3, %xmm9
  1160. movaps -8 * SIZE(BO), %xmm3
  1161. mulps %xmm0, %xmm3
  1162. addps %xmm3, %xmm10
  1163. movaps -4 * SIZE(BO), %xmm3
  1164. mulps %xmm0, %xmm3
  1165. movsd -28 * SIZE(AO), %xmm0
  1166. addps %xmm3, %xmm11
  1167. movaps 48 * SIZE(BO), %xmm3
  1168. mulps %xmm0, %xmm5
  1169. addps %xmm5, %xmm8
  1170. movaps 4 * SIZE(BO), %xmm5
  1171. mulps %xmm0, %xmm5
  1172. addps %xmm5, %xmm9
  1173. movaps 8 * SIZE(BO), %xmm5
  1174. mulps %xmm0, %xmm5
  1175. addps %xmm5, %xmm10
  1176. movaps 12 * SIZE(BO), %xmm5
  1177. mulps %xmm0, %xmm5
  1178. movsd -26 * SIZE(AO), %xmm0
  1179. addps %xmm5, %xmm11
  1180. movaps 64 * SIZE(BO), %xmm5
  1181. mulps %xmm0, %xmm7
  1182. addps %xmm7, %xmm8
  1183. movaps 20 * SIZE(BO), %xmm7
  1184. mulps %xmm0, %xmm7
  1185. addps %xmm7, %xmm9
  1186. movaps 24 * SIZE(BO), %xmm7
  1187. mulps %xmm0, %xmm7
  1188. addps %xmm7, %xmm10
  1189. movaps 28 * SIZE(BO), %xmm7
  1190. mulps %xmm0, %xmm7
  1191. movsd -16 * SIZE(AO), %xmm0
  1192. addps %xmm7, %xmm11
  1193. movaps 80 * SIZE(BO), %xmm7
  1194. mulps %xmm2, %xmm1
  1195. addps %xmm1, %xmm8
  1196. movaps 36 * SIZE(BO), %xmm1
  1197. mulps %xmm2, %xmm1
  1198. addps %xmm1, %xmm9
  1199. movaps 40 * SIZE(BO), %xmm1
  1200. mulps %xmm2, %xmm1
  1201. addps %xmm1, %xmm10
  1202. movaps 44 * SIZE(BO), %xmm1
  1203. mulps %xmm2, %xmm1
  1204. movsd -22 * SIZE(AO), %xmm2
  1205. addps %xmm1, %xmm11
  1206. movaps 96 * SIZE(BO), %xmm1
  1207. mulps %xmm2, %xmm3
  1208. addps %xmm3, %xmm8
  1209. movaps 52 * SIZE(BO), %xmm3
  1210. mulps %xmm2, %xmm3
  1211. addps %xmm3, %xmm9
  1212. movaps 56 * SIZE(BO), %xmm3
  1213. mulps %xmm2, %xmm3
  1214. addps %xmm3, %xmm10
  1215. movaps 60 * SIZE(BO), %xmm3
  1216. mulps %xmm2, %xmm3
  1217. movsd -20 * SIZE(AO), %xmm2
  1218. addps %xmm3, %xmm11
  1219. movaps 112 * SIZE(BO), %xmm3
  1220. mulps %xmm2, %xmm5
  1221. addps %xmm5, %xmm8
  1222. movaps 68 * SIZE(BO), %xmm5
  1223. mulps %xmm2, %xmm5
  1224. addps %xmm5, %xmm9
  1225. movaps 72 * SIZE(BO), %xmm5
  1226. mulps %xmm2, %xmm5
  1227. addps %xmm5, %xmm10
  1228. movaps 76 * SIZE(BO), %xmm5
  1229. mulps %xmm2, %xmm5
  1230. movsd -18 * SIZE(AO), %xmm2
  1231. addps %xmm5, %xmm11
  1232. movaps 128 * SIZE(BO), %xmm5
  1233. mulps %xmm2, %xmm7
  1234. addps %xmm7, %xmm8
  1235. movaps 84 * SIZE(BO), %xmm7
  1236. mulps %xmm2, %xmm7
  1237. addps %xmm7, %xmm9
  1238. movaps 88 * SIZE(BO), %xmm7
  1239. mulps %xmm2, %xmm7
  1240. addps %xmm7, %xmm10
  1241. movaps 92 * SIZE(BO), %xmm7
  1242. mulps %xmm2, %xmm7
  1243. movsd -8 * SIZE(AO), %xmm2
  1244. addps %xmm7, %xmm11
  1245. movaps 144 * SIZE(BO), %xmm7
  1246. subq $ -16 * SIZE, AO
  1247. subq $-128 * SIZE, BO
  1248. decq %rax
  1249. jne .L32
  1250. ALIGN_4
  1251. .L35:
  1252. #ifndef TRMMKERNEL
  1253. movq K, %rax
  1254. #else
  1255. movq KKK, %rax
  1256. #endif
  1257. movaps ALPHA_R, %xmm6
  1258. movaps ALPHA_I, %xmm7
  1259. andq $7, %rax # if (k & 7)
  1260. BRANCH
  1261. je .L38
  1262. ALIGN_4
  1263. .L36:
  1264. mulps %xmm0, %xmm1
  1265. addps %xmm1, %xmm8
  1266. movaps -28 * SIZE(BO), %xmm1
  1267. mulps %xmm0, %xmm1
  1268. addps %xmm1, %xmm9
  1269. movaps -24 * SIZE(BO), %xmm1
  1270. mulps %xmm0, %xmm1
  1271. addps %xmm1, %xmm10
  1272. movaps -20 * SIZE(BO), %xmm1
  1273. mulps %xmm0, %xmm1
  1274. movsd -30 * SIZE(AO), %xmm0
  1275. addps %xmm1, %xmm11
  1276. movaps -16 * SIZE(BO), %xmm1
  1277. subq $ -2 * SIZE, AO
  1278. subq $-16 * SIZE, BO
  1279. decq %rax
  1280. jg .L36
  1281. ALIGN_4
  1282. .L38:
  1283. #ifndef TRMMKERNEL
  1284. #ifdef movsd
  1285. xorps %xmm0, %xmm0
  1286. #endif
  1287. movsd 0 * SIZE(CO1), %xmm0
  1288. #ifdef movsd
  1289. xorps %xmm1, %xmm1
  1290. #endif
  1291. movsd 0 * SIZE(CO2), %xmm1
  1292. #endif
  1293. shufps $0xb1, %xmm9, %xmm9
  1294. shufps $0xb1, %xmm11, %xmm11
  1295. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1296. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1297. subps %xmm9, %xmm8
  1298. subps %xmm11, %xmm10
  1299. #else
  1300. addps %xmm9, %xmm8
  1301. addps %xmm11, %xmm10
  1302. #endif
  1303. movaps %xmm8, %xmm9
  1304. movaps %xmm10, %xmm11
  1305. shufps $0xb1, %xmm8, %xmm8
  1306. shufps $0xb1, %xmm10, %xmm10
  1307. mulps %xmm6, %xmm9
  1308. mulps %xmm7, %xmm8
  1309. mulps %xmm6, %xmm11
  1310. mulps %xmm7, %xmm10
  1311. addps %xmm9, %xmm8
  1312. addps %xmm11, %xmm10
  1313. #ifndef TRMMKERNEL
  1314. addps %xmm0, %xmm8
  1315. addps %xmm1, %xmm10
  1316. #endif
  1317. movlps %xmm8, 0 * SIZE(CO1)
  1318. movlps %xmm10, 0 * SIZE(CO2)
  1319. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1320. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1321. movq K, %rax
  1322. subq KKK, %rax
  1323. leaq (,%rax, 8), %rax
  1324. leaq (AO, %rax, 1), AO
  1325. leaq (BO, %rax, 8), BO
  1326. #endif
  1327. #if defined(TRMMKERNEL) && defined(LEFT)
  1328. addq $1, KK
  1329. #endif
  1330. ALIGN_4
  1331. .L39:
  1332. #if defined(TRMMKERNEL) && !defined(LEFT)
  1333. addl $2, KK
  1334. #endif
  1335. leaq (C, LDC, 2), C # c += 2 * ldc
  1336. decq J # j --
  1337. jg .L01
  1338. ALIGN_4
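/* Handle the last single column when N is odd: B is packed one column at a
   time, and the row loops below are repeated for a single output column. */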
  1339. .L40:
  1340. testq $1, N
  1341. je .L999
  1342. ALIGN_4
  1343. .L41:
  1344. #if defined(TRMMKERNEL) && defined(LEFT)
  1345. movq OFFSET, %rax
  1346. movq %rax, KK
  1347. #endif
  1348. /* Copying to Sub Buffer */
  1349. leaq BUFFER, BO
  1350. movaps POSINV, %xmm7
  1351. movq K, %rax
  1352. sarq $2, %rax
  1353. jle .L43
  1354. ALIGN_4
  1355. .L42:
  1356. movss 0 * SIZE(B), %xmm8
  1357. movss 1 * SIZE(B), %xmm9
  1358. movss 2 * SIZE(B), %xmm10
  1359. movss 3 * SIZE(B), %xmm11
  1360. movss 4 * SIZE(B), %xmm12
  1361. movss 5 * SIZE(B), %xmm13
  1362. movss 6 * SIZE(B), %xmm14
  1363. movss 7 * SIZE(B), %xmm15
  1364. shufps $0, %xmm8, %xmm8
  1365. shufps $0, %xmm9, %xmm9
  1366. shufps $0, %xmm10, %xmm10
  1367. shufps $0, %xmm11, %xmm11
  1368. shufps $0, %xmm12, %xmm12
  1369. shufps $0, %xmm13, %xmm13
  1370. shufps $0, %xmm14, %xmm14
  1371. shufps $0, %xmm15, %xmm15
  1372. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  1373. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  1374. xorps %xmm7, %xmm9
  1375. xorps %xmm7, %xmm11
  1376. xorps %xmm7, %xmm13
  1377. xorps %xmm7, %xmm15
  1378. #else
  1379. xorps %xmm7, %xmm8
  1380. xorps %xmm7, %xmm10
  1381. xorps %xmm7, %xmm12
  1382. xorps %xmm7, %xmm14
  1383. #endif
  1384. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
  1385. prefetchnta 56 * SIZE(B)
  1386. #endif
  1387. movaps %xmm8, 0 * SIZE(BO)
  1388. movaps %xmm9, 4 * SIZE(BO)
  1389. movaps %xmm10, 8 * SIZE(BO)
  1390. movaps %xmm11, 12 * SIZE(BO)
  1391. movaps %xmm12, 16 * SIZE(BO)
  1392. movaps %xmm13, 20 * SIZE(BO)
  1393. movaps %xmm14, 24 * SIZE(BO)
  1394. movaps %xmm15, 28 * SIZE(BO)
  1395. #if defined(PENTIUM4) || defined(GENERIC)
  1396. PREFETCHW 128 * SIZE(BO)
  1397. PREFETCH 112 * SIZE(B)
  1398. #endif
  1399. addq $32 * SIZE, BO
  1400. addq $ 8 * SIZE, B
  1401. decq %rax
  1402. jne .L42
  1403. ALIGN_4
  1404. .L43:
  1405. movq K, %rax
  1406. andq $3, %rax
  1407. BRANCH
  1408. jle .L50
  1409. ALIGN_4
  1410. .L44:
  1411. movss 0 * SIZE(B), %xmm8
  1412. movss 1 * SIZE(B), %xmm9
  1413. shufps $0, %xmm8, %xmm8
  1414. shufps $0, %xmm9, %xmm9
  1415. #if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
  1416. defined(TN) || defined(TT) || defined(TR) || defined(TC)
  1417. xorps %xmm7, %xmm9
  1418. #else
  1419. xorps %xmm7, %xmm8
  1420. #endif
  1421. movaps %xmm8, 0 * SIZE(BO)
  1422. movaps %xmm9, 4 * SIZE(BO)
  1423. addq $2 * SIZE, B
  1424. addq $8 * SIZE, BO
  1425. decq %rax
  1426. jne .L44
  1427. ALIGN_4
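/* Row loop for the single-column case: 4-row blocks of A against one packed
   B column, accumulating into xmm8/xmm9 and xmm12/xmm13. */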
  1428. .L50:
  1429. movq C, CO1 # coffset1 = c
  1430. movq A, AO # aoffset = a
  1431. movq M, I
  1432. sarq $2, I # i = (m >> 2)
  1433. jle .L60
  1434. ALIGN_4
  1435. .L51:
  1436. #if !defined(TRMMKERNEL) || \
  1437. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1438. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1439. leaq 32 * SIZE + BUFFER, BO
  1440. #else
  1441. leaq 32 * SIZE + BUFFER, BO
  1442. movq KK, %rax
  1443. leaq (, %rax, 8), %rax
  1444. leaq (AO, %rax, 4), AO
  1445. leaq (BO, %rax, 4), BO
  1446. #endif
  1447. movaps -32 * SIZE(AO), %xmm0
  1448. pxor %xmm8, %xmm8
  1449. movaps -16 * SIZE(AO), %xmm2
  1450. pxor %xmm9, %xmm9
  1451. movaps 0 * SIZE(AO), %xmm4
  1452. pxor %xmm10, %xmm10
  1453. movaps 16 * SIZE(AO), %xmm6
  1454. pxor %xmm11, %xmm11
  1455. movaps -32 * SIZE(BO), %xmm1
  1456. pxor %xmm12, %xmm12
  1457. movaps -16 * SIZE(BO), %xmm3
  1458. pxor %xmm13, %xmm13
  1459. movaps 0 * SIZE(BO), %xmm5
  1460. pxor %xmm14, %xmm14
  1461. movaps 16 * SIZE(BO), %xmm7
  1462. pxor %xmm15, %xmm15
  1463. PREFETCHW 7 * SIZE(CO1)
  1464. #ifndef TRMMKERNEL
  1465. movq K, %rax
  1466. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1467. movq K, %rax
  1468. subq KK, %rax
  1469. movq %rax, KKK
  1470. #else
  1471. movq KK, %rax
  1472. #ifdef LEFT
  1473. addq $4, %rax
  1474. #else
  1475. addq $1, %rax
  1476. #endif
  1477. movq %rax, KKK
  1478. #endif
  1479. sarq $3, %rax
  1480. je .L55
  1481. ALIGN_4
.L52:
	mulps	%xmm0, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-32 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-28 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm12
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm13
	movaps	-24 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-20 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-20 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-20 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm12
	movaps	32 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm13
	movaps	32 * SIZE(AO), %xmm0
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
	mulps	%xmm2, %xmm3
	mulps	-12 * SIZE(BO), %xmm2
	addps	%xmm3, %xmm8
	movaps	-16 * SIZE(BO), %xmm3
	addps	%xmm2, %xmm9
	movaps	-12 * SIZE(AO), %xmm2
	mulps	%xmm2, %xmm3
	mulps	-12 * SIZE(BO), %xmm2
	addps	%xmm3, %xmm12
	movaps	-8 * SIZE(BO), %xmm3
	addps	%xmm2, %xmm13
	movaps	-8 * SIZE(AO), %xmm2
	mulps	%xmm2, %xmm3
	mulps	-4 * SIZE(BO), %xmm2
	addps	%xmm3, %xmm8
	movaps	-8 * SIZE(BO), %xmm3
	addps	%xmm2, %xmm9
	movaps	-4 * SIZE(AO), %xmm2
	mulps	%xmm2, %xmm3
	mulps	-4 * SIZE(BO), %xmm2
	addps	%xmm3, %xmm12
	movaps	48 * SIZE(BO), %xmm3
	addps	%xmm2, %xmm13
	movaps	48 * SIZE(AO), %xmm2
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif
	mulps	%xmm4, %xmm5
	mulps	4 * SIZE(BO), %xmm4
	addps	%xmm5, %xmm8
	movaps	0 * SIZE(BO), %xmm5
	addps	%xmm4, %xmm9
	movaps	4 * SIZE(AO), %xmm4
	mulps	%xmm4, %xmm5
	mulps	4 * SIZE(BO), %xmm4
	addps	%xmm5, %xmm12
	movaps	8 * SIZE(BO), %xmm5
	addps	%xmm4, %xmm13
	movaps	8 * SIZE(AO), %xmm4
	mulps	%xmm4, %xmm5
	mulps	12 * SIZE(BO), %xmm4
	addps	%xmm5, %xmm8
	movaps	8 * SIZE(BO), %xmm5
	addps	%xmm4, %xmm9
	movaps	12 * SIZE(AO), %xmm4
	mulps	%xmm4, %xmm5
	mulps	12 * SIZE(BO), %xmm4
	addps	%xmm5, %xmm12
	movaps	64 * SIZE(BO), %xmm5
	addps	%xmm4, %xmm13
	movaps	64 * SIZE(AO), %xmm4
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif
	mulps	%xmm6, %xmm7
	mulps	20 * SIZE(BO), %xmm6
	addps	%xmm7, %xmm8
	movaps	16 * SIZE(BO), %xmm7
	addps	%xmm6, %xmm9
	movaps	20 * SIZE(AO), %xmm6
	mulps	%xmm6, %xmm7
	mulps	20 * SIZE(BO), %xmm6
	addps	%xmm7, %xmm12
	movaps	24 * SIZE(BO), %xmm7
	addps	%xmm6, %xmm13
	movaps	24 * SIZE(AO), %xmm6
	mulps	%xmm6, %xmm7
	mulps	28 * SIZE(BO), %xmm6
	addps	%xmm7, %xmm8
	movaps	24 * SIZE(BO), %xmm7
	addps	%xmm6, %xmm9
	movaps	28 * SIZE(AO), %xmm6
	mulps	%xmm6, %xmm7
	mulps	28 * SIZE(BO), %xmm6
	addps	%xmm7, %xmm12
	movaps	80 * SIZE(BO), %xmm7
	addps	%xmm6, %xmm13
	movaps	80 * SIZE(AO), %xmm6
	subq	$-64 * SIZE, AO
	subq	$-64 * SIZE, BO
	decq	%rax
	jne	.L52
	ALIGN_4
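/* .L55/.L56: process the remaining (K & 7) iterations one step at a time,
   after loading alpha into xmm6 (ALPHA_R) and xmm7 (ALPHA_I). */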
.L55:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm6
	movaps	ALPHA_I, %xmm7
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je	.L58
	ALIGN_4
.L56:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-32 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-28 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm12
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm13
	movaps	-24 * SIZE(AO), %xmm0
	addq	$ 8 * SIZE, AO
	addq	$ 8 * SIZE, BO
	decq	%rax
	jg	.L56
	ALIGN_4
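/* .L58: combine the partial sums (subtract or add the swapped halves,
   depending on which NN/NT/.../CC variant is compiled), scale by alpha,
   add the existing C values unless this is the TRMM kernel, and store
   four complex results at CO1. */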
.L58:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	2 * SIZE(CO1), %xmm0
	movsd	4 * SIZE(CO1), %xmm2
	movhps	6 * SIZE(CO1), %xmm2
#endif
	shufps	$0xb1, %xmm9, %xmm9
	shufps	$0xb1, %xmm13, %xmm13
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm9, %xmm8
	subps	%xmm13, %xmm12
#else
	addps	%xmm9, %xmm8
	addps	%xmm13, %xmm12
#endif
	movaps	%xmm8, %xmm9
	movaps	%xmm12, %xmm13
	shufps	$0xb1, %xmm8, %xmm8
	shufps	$0xb1, %xmm12, %xmm12
	mulps	%xmm6, %xmm9
	mulps	%xmm7, %xmm8
	mulps	%xmm6, %xmm13
	mulps	%xmm7, %xmm12
	addps	%xmm9, %xmm8
	addps	%xmm13, %xmm12
#ifndef TRMMKERNEL
	addps	%xmm0, %xmm8
	addps	%xmm2, %xmm12
#endif
	movlps	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 2 * SIZE(CO1)
	movlps	%xmm12, 4 * SIZE(CO1)
	movhps	%xmm12, 6 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif
	addq	$8 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L51
	ALIGN_4
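/* .L60: M & 2 remainder, a two-complex-wide tile; same structure as above
   with accumulators xmm8-xmm11. */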
.L60:
	testq	$2, M
	je	.L70
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	32 * SIZE + BUFFER, BO
#else
	leaq	32 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif
	movaps	-32 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movaps	-16 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movaps	-32 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movaps	-16 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movaps	0 * SIZE(BO), %xmm5
	movaps	16 * SIZE(BO), %xmm7
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L65
	ALIGN_4
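/* .L62: K loop for the two-complex tile, unrolled 8 times. */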
.L62:
	mulps	%xmm0, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-28 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm1
	mulps	-20 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm10
	movaps	32 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm11
	movaps	-24 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm3
	mulps	-12 * SIZE(BO), %xmm0
	addps	%xmm3, %xmm8
	movaps	-8 * SIZE(BO), %xmm3
	addps	%xmm0, %xmm9
	movaps	-20 * SIZE(AO), %xmm0
	mulps	%xmm0, %xmm3
	mulps	-4 * SIZE(BO), %xmm0
	addps	%xmm3, %xmm10
	movaps	48 * SIZE(BO), %xmm3
	addps	%xmm0, %xmm11
	movaps	0 * SIZE(AO), %xmm0
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
	mulps	%xmm2, %xmm5
	mulps	4 * SIZE(BO), %xmm2
	addps	%xmm5, %xmm8
	movaps	8 * SIZE(BO), %xmm5
	addps	%xmm2, %xmm9
	movaps	-12 * SIZE(AO), %xmm2
	mulps	%xmm2, %xmm5
	mulps	12 * SIZE(BO), %xmm2
	addps	%xmm5, %xmm10
	movaps	64 * SIZE(BO), %xmm5
	addps	%xmm2, %xmm11
	movaps	-8 * SIZE(AO), %xmm2
	mulps	%xmm2, %xmm7
	mulps	20 * SIZE(BO), %xmm2
	addps	%xmm7, %xmm8
	movaps	24 * SIZE(BO), %xmm7
	addps	%xmm2, %xmm9
	movaps	-4 * SIZE(AO), %xmm2
	mulps	%xmm2, %xmm7
	mulps	28 * SIZE(BO), %xmm2
	addps	%xmm7, %xmm10
	movaps	80 * SIZE(BO), %xmm7
	addps	%xmm2, %xmm11
	movaps	16 * SIZE(AO), %xmm2
	subq	$-32 * SIZE, AO
	subq	$-64 * SIZE, BO
	decq	%rax
	jne	.L62
	ALIGN_4
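/* .L65/.L66: K remainder (K & 7) for the two-complex tile; alpha is loaded
   into xmm6/xmm7. */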
.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm6
	movaps	ALPHA_I, %xmm7
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je	.L68
	ALIGN_4
.L66:
	mulps	%xmm0, %xmm1
	mulps	-28 * SIZE(BO), %xmm0
	addps	%xmm1, %xmm8
	movaps	-24 * SIZE(BO), %xmm1
	addps	%xmm0, %xmm9
	movaps	-28 * SIZE(AO), %xmm0
	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L66
	ALIGN_4
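/* .L68: merge the accumulators, apply the conjugation sign, scale by alpha,
   add C unless TRMM, and store two complex results. */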
.L68:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	2 * SIZE(CO1), %xmm0
#endif
	addps	%xmm10, %xmm8
	addps	%xmm11, %xmm9
	shufps	$0xb1, %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm9, %xmm8
#else
	addps	%xmm9, %xmm8
#endif
	movaps	%xmm8, %xmm9
	shufps	$0xb1, %xmm8, %xmm8
	mulps	%xmm6, %xmm9
	mulps	%xmm7, %xmm8
	addps	%xmm9, %xmm8
#ifndef TRMMKERNEL
	addps	%xmm0, %xmm8
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 2 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4
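/* .L70: final M & 1 row, a single-complex tile. */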
.L70:
	testq	$1, M
	je	.L999
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	32 * SIZE + BUFFER, BO
#else
	leaq	32 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif
	movaps	-32 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movaps	-24 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movaps	-32 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movaps	-16 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11
	movaps	0 * SIZE(BO), %xmm5
	movaps	16 * SIZE(BO), %xmm7
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L75
	ALIGN_4
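/* .L72: K loop for the single-complex tile, unrolled 8 times. */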
.L72:
	mulps	%xmm0, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
	addps	%xmm1, %xmm8
	movaps	-28 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AO), %xmm0
	addps	%xmm1, %xmm9
	movaps	-24 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm10
	movaps	-20 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-28 * SIZE(AO), %xmm0
	addps	%xmm1, %xmm11
	movaps	32 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm8
	movaps	-12 * SIZE(BO), %xmm3
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AO), %xmm0
	addps	%xmm3, %xmm9
	movaps	-8 * SIZE(BO), %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm10
	movaps	-4 * SIZE(BO), %xmm3
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AO), %xmm0
	addps	%xmm3, %xmm11
	movaps	48 * SIZE(BO), %xmm3
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm8
	movaps	4 * SIZE(BO), %xmm5
	mulps	%xmm2, %xmm5
	movsd	-22 * SIZE(AO), %xmm2
	addps	%xmm5, %xmm9
	movaps	8 * SIZE(BO), %xmm5
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm10
	movaps	12 * SIZE(BO), %xmm5
	mulps	%xmm2, %xmm5
	movsd	-20 * SIZE(AO), %xmm2
	addps	%xmm5, %xmm11
	movaps	64 * SIZE(BO), %xmm5
	mulps	%xmm2, %xmm7
	addps	%xmm7, %xmm8
	movaps	20 * SIZE(BO), %xmm7
	mulps	%xmm2, %xmm7
	movsd	-18 * SIZE(AO), %xmm2
	addps	%xmm7, %xmm9
	movaps	24 * SIZE(BO), %xmm7
	mulps	%xmm2, %xmm7
	addps	%xmm7, %xmm10
	movaps	28 * SIZE(BO), %xmm7
	mulps	%xmm2, %xmm7
	movsd	-8 * SIZE(AO), %xmm2
	addps	%xmm7, %xmm11
	movaps	80 * SIZE(BO), %xmm7
	subq	$-16 * SIZE, AO
	subq	$-64 * SIZE, BO
	decq	%rax
	jne	.L72
	ALIGN_4
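/* .L75/.L76: K remainder (K & 7) for the single-complex tile. */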
.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm6
	movaps	ALPHA_I, %xmm7
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je	.L78
	ALIGN_4
.L76:
	mulps	%xmm0, %xmm1
	addps	%xmm1, %xmm8
	movaps	-28 * SIZE(BO), %xmm1
	mulps	%xmm0, %xmm1
	movsd	-30 * SIZE(AO), %xmm0
	addps	%xmm1, %xmm9
	movaps	-24 * SIZE(BO), %xmm1
	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jg	.L76
	ALIGN_4
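/* .L78: combine, scale by alpha, add C unless TRMM, and store one complex
   value. */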
.L78:
#ifndef TRMMKERNEL
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(CO1), %xmm0
#endif
	addps	%xmm10, %xmm8
	addps	%xmm11, %xmm9
	shufps	$0xb1, %xmm9, %xmm9
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm9, %xmm8
#else
	addps	%xmm9, %xmm8
#endif
	movaps	%xmm8, %xmm9
	shufps	$0xb1, %xmm8, %xmm8
	mulps	%xmm6, %xmm9
	mulps	%xmm7, %xmm8
	addps	%xmm9, %xmm8
#ifndef TRMMKERNEL
	addps	%xmm0, %xmm8
#endif
	movlps	%xmm8, 0 * SIZE(CO1)
	ALIGN_4
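/* .L999: restore callee-saved registers (plus rdi/rsi and xmm6-xmm15 under
   WINDOWS_ABI), release the stack frame, and return. */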
.L999:
	movq	%rbx, %rsp
	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE