
gemm_kernel_4x4_sse3.S (57 kB)

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
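
/* Register roles: M, N, K hold the matrix dimensions; A, B, C point at */
/* the packed operands; LDC is the leading dimension of C (scaled from  */
/* elements to bytes in the prologue). I and J are the m/n loop         */
/* counters, AO and BO walk the current A/B panels, CO1 and CO2 point   */
/* into the C panel being written, and BB prefetches ahead in B.        */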
#define M %rdi
#define N %rsi
#define K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define J %r12
#define AO %r13
#define BO %r14
#define CO1 %r15
#define CO2 %rbx
#define BB %rbp

#ifndef WINDOWS_ABI
#define STACKSIZE 128
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#define ALPHA 48(%rsp)
#define OFFSET 56(%rsp)
#define KKK 64(%rsp)
#define KK 72(%rsp)
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#define ALPHA 224(%rsp)
#define OFFSET 232(%rsp)
#define KK 240(%rsp)
#define KKK 248(%rsp)
#endif

#define PREFETCH prefetcht1
#define PREFETCHSIZE (16 * 12 + 3)
#define PREFETCH_R (4 * 4 + 0)
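
/* The sixteen KERNEL macros below form the fully unrolled inner loop.   */
/* Accumulators: xmm0-xmm3 hold rows 0-1 of the 4x4 C tile (one column   */
/* each), xmm4-xmm7 hold rows 2-3. Each odd/even macro pair consumes one */
/* k-iteration: an aligned pair of A values (movapd) is multiplied by    */
/* four broadcast B values (movddup), so KERNEL1..KERNEL16 together      */
/* cover eight k-iterations.                                             */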
#define KERNEL1(address) \
    mulpd %xmm8, %xmm9 ;\
    PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\
    addpd %xmm9, %xmm0;\
    movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm1;\
    movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm2;\
    movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
    addpd %xmm9, %xmm3;\
    movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL2(address) \
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm4;\
    movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm5;\
    movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm6;\
    movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
    addpd %xmm9, %xmm7;\
    movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL3(address) \
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm0;\
    movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm1;\
    movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm2;\
    movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
    addpd %xmm9, %xmm3;\
    movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL4(address) \
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm4;\
    movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm5;\
    movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    addpd %xmm9, %xmm6;\
    movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
    mulpd %xmm8, %xmm9;\
    movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
    addpd %xmm9, %xmm7;\
    movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL5(address) \
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm0;\
    movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm1;\
    movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm2;\
    movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
    addpd %xmm11, %xmm3;\
    movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL6(address) \
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm4;\
    movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm5;\
    movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm6;\
    movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
    addpd %xmm11, %xmm7;\
    movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL7(address) \
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm0;\
    movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm1;\
    movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm2;\
    movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
    addpd %xmm11, %xmm3;\
    movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL8(address) \
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm4;\
    movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm5;\
    movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    addpd %xmm11, %xmm6;\
    movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
    mulpd %xmm10, %xmm11;\
    movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
    addpd %xmm11, %xmm7;\
    movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL9(address) \
    mulpd %xmm12, %xmm13;\
    PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
    addpd %xmm13, %xmm0;\
    movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm1;\
    movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm2;\
    movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
    addpd %xmm13, %xmm3;\
    movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL10(address) \
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm4;\
    movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm5;\
    movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm6;\
    movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
    addpd %xmm13, %xmm7;\
    movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL11(address) \
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm0;\
    movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm1;\
    movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm2;\
    movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
    addpd %xmm13, %xmm3;\
    movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL12(address) \
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm4;\
    movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm5;\
    movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    addpd %xmm13, %xmm6;\
    movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
    mulpd %xmm12, %xmm13;\
    movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
    addpd %xmm13, %xmm7;\
    movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL13(address) \
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm0;\
    movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm1;\
    movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm2;\
    movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
    addpd %xmm15, %xmm3;\
    movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL14(address) \
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm4;\
    movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm5;\
    movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm6;\
    movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
    addpd %xmm15, %xmm7;\
    movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL15(address) \
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm0;\
    movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm1;\
    movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm2;\
    movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
    addpd %xmm15, %xmm3;\
    movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL16(address) \
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm4;\
    movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm5;\
    movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    addpd %xmm15, %xmm6;\
    movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
    mulpd %xmm14, %xmm15;\
    movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
    addpd %xmm15, %xmm7;\
    movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
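
/* Function prologue: reserve the stack frame, save the callee-saved    */
/* GPRs (and, under the Windows ABI, xmm6-xmm15 plus the argument       */
/* registers), then stash alpha and the TRMM offset into the frame and  */
/* scale LDC from elements to bytes.                                    */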
    PROLOGUE
    PROFCODE

    subq $STACKSIZE, %rsp
    movq %rbx, 0(%rsp)
    movq %rbp, 8(%rsp)
    movq %r12, 16(%rsp)
    movq %r13, 24(%rsp)
    movq %r14, 32(%rsp)
    movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
    movq %rdi, 48(%rsp)
    movq %rsi, 56(%rsp)
    movups %xmm6, 64(%rsp)
    movups %xmm7, 80(%rsp)
    movups %xmm8, 96(%rsp)
    movups %xmm9, 112(%rsp)
    movups %xmm10, 128(%rsp)
    movups %xmm11, 144(%rsp)
    movups %xmm12, 160(%rsp)
    movups %xmm13, 176(%rsp)
    movups %xmm14, 192(%rsp)
    movups %xmm15, 208(%rsp)

    movq ARG1, M
    movq ARG2, N
    movq ARG3, K
    movq OLD_A, A
    movq OLD_B, B
    movq OLD_C, C
    movq OLD_LDC, LDC
#ifdef TRMMKERNEL
    movsd OLD_OFFSET, %xmm4
#endif
    movaps %xmm3, %xmm0
#else
    movq OLD_LDC, LDC
#ifdef TRMMKERNEL
    movsd OLD_OFFSET, %xmm4
#endif
#endif

    movsd %xmm0, ALPHA
#ifdef TRMMKERNEL
    movsd %xmm4, OFFSET
    movsd %xmm4, KK
#ifndef LEFT
    negq KK
#endif
#endif
    leaq (, LDC, SIZE), LDC

    movq N, J
    sarq $2, J # j = (n >> 2)
    jle .L40
    ALIGN_4
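
/* Outer loop over column panels: each .L10 iteration produces four     */
/* columns of C (j = n >> 2), split into 4x4, 2x4 and 1x4 row blocks.   */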
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
    movq OFFSET, %rax
    movq %rax, KK
#endif

    movq C, CO1 # coffset1 = c
    leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
    movq A, AO # aoffset = a

    movq K, %rax
    salq $BASE_SHIFT + 2, %rax
    leaq (B, %rax), BB

    movq M, I
    sarq $2, I # i = (m >> 2)
    jle .L20
    ALIGN_4

.L11:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq B, BO
#else
    movq KK, %rax
    leaq (, %rax, SIZE), %rax
    leaq (AO, %rax, 4), AO
    leaq (B, %rax, 4), BO
#endif

    movapd 0 * SIZE(AO), %xmm8
    pxor %xmm0, %xmm0
    movddup 0 * SIZE(BO), %xmm9
    pxor %xmm1, %xmm1
    movapd 8 * SIZE(AO), %xmm10
    pxor %xmm2, %xmm2
    movddup 8 * SIZE(BO), %xmm11
    pxor %xmm3, %xmm3
    movapd 16 * SIZE(AO), %xmm12
    pxor %xmm4, %xmm4
    movddup 16 * SIZE(BO), %xmm13
    pxor %xmm5, %xmm5
    movapd 24 * SIZE(AO), %xmm14
    pxor %xmm6, %xmm6
    movddup 24 * SIZE(BO), %xmm15
    pxor %xmm7, %xmm7

    prefetchnta 3 * SIZE(CO1)
    prefetchnta 3 * SIZE(CO2)
    prefetchnta 3 * SIZE(CO1, LDC, 2)
    prefetchnta 3 * SIZE(CO2, LDC, 2)
    prefetcht0 0 * SIZE(BB)
    subq $-8 * SIZE, BB

#ifndef TRMMKERNEL
    movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    movq K, %rax
    subq KK, %rax
    movq %rax, KKK
#else
    movq KK, %rax
#ifdef LEFT
    addq $4, %rax
#else
    addq $4, %rax
#endif
    movq %rax, KKK
#endif
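
/* Unrolled main loop: %rax = (k & ~7) * 16 counts the remaining work.  */
/* Each group of sixteen KERNEL macros covers eight k-iterations; the   */
/* cmpq/jle chain falls out early to .L12, which converts the residual  */
/* in %rax back into AO/BO pointer adjustments.                         */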
#if 1
    andq $-8, %rax
    salq $4, %rax
    NOBRANCH
    je .L15
.L1X:
    KERNEL1 (16 * 0)
    KERNEL2 (16 * 0)
    KERNEL3 (16 * 0)
    KERNEL4 (16 * 0)
    KERNEL5 (16 * 0)
    KERNEL6 (16 * 0)
    KERNEL7 (16 * 0)
    KERNEL8 (16 * 0)
    KERNEL9 (16 * 0)
    KERNEL10(16 * 0)
    KERNEL11(16 * 0)
    KERNEL12(16 * 0)
    KERNEL13(16 * 0)
    KERNEL14(16 * 0)
    KERNEL15(16 * 0)
    KERNEL16(16 * 0)
    cmpq $128 * 1, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 1)
    KERNEL2 (16 * 1)
    KERNEL3 (16 * 1)
    KERNEL4 (16 * 1)
    KERNEL5 (16 * 1)
    KERNEL6 (16 * 1)
    KERNEL7 (16 * 1)
    KERNEL8 (16 * 1)
    KERNEL9 (16 * 1)
    KERNEL10(16 * 1)
    KERNEL11(16 * 1)
    KERNEL12(16 * 1)
    KERNEL13(16 * 1)
    KERNEL14(16 * 1)
    KERNEL15(16 * 1)
    KERNEL16(16 * 1)
    cmpq $128 * 2, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 2)
    KERNEL2 (16 * 2)
    KERNEL3 (16 * 2)
    KERNEL4 (16 * 2)
    KERNEL5 (16 * 2)
    KERNEL6 (16 * 2)
    KERNEL7 (16 * 2)
    KERNEL8 (16 * 2)
    KERNEL9 (16 * 2)
    KERNEL10(16 * 2)
    KERNEL11(16 * 2)
    KERNEL12(16 * 2)
    KERNEL13(16 * 2)
    KERNEL14(16 * 2)
    KERNEL15(16 * 2)
    KERNEL16(16 * 2)
    cmpq $128 * 3, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 3)
    KERNEL2 (16 * 3)
    KERNEL3 (16 * 3)
    KERNEL4 (16 * 3)
    KERNEL5 (16 * 3)
    KERNEL6 (16 * 3)
    KERNEL7 (16 * 3)
    KERNEL8 (16 * 3)
    KERNEL9 (16 * 3)
    KERNEL10(16 * 3)
    KERNEL11(16 * 3)
    KERNEL12(16 * 3)
    KERNEL13(16 * 3)
    KERNEL14(16 * 3)
    KERNEL15(16 * 3)
    KERNEL16(16 * 3)
    cmpq $128 * 4, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 4)
    KERNEL2 (16 * 4)
    KERNEL3 (16 * 4)
    KERNEL4 (16 * 4)
    KERNEL5 (16 * 4)
    KERNEL6 (16 * 4)
    KERNEL7 (16 * 4)
    KERNEL8 (16 * 4)
    KERNEL9 (16 * 4)
    KERNEL10(16 * 4)
    KERNEL11(16 * 4)
    KERNEL12(16 * 4)
    KERNEL13(16 * 4)
    KERNEL14(16 * 4)
    KERNEL15(16 * 4)
    KERNEL16(16 * 4)
    cmpq $128 * 5, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 5)
    KERNEL2 (16 * 5)
    KERNEL3 (16 * 5)
    KERNEL4 (16 * 5)
    KERNEL5 (16 * 5)
    KERNEL6 (16 * 5)
    KERNEL7 (16 * 5)
    KERNEL8 (16 * 5)
    KERNEL9 (16 * 5)
    KERNEL10(16 * 5)
    KERNEL11(16 * 5)
    KERNEL12(16 * 5)
    KERNEL13(16 * 5)
    KERNEL14(16 * 5)
    KERNEL15(16 * 5)
    KERNEL16(16 * 5)
    cmpq $128 * 6, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 6)
    KERNEL2 (16 * 6)
    KERNEL3 (16 * 6)
    KERNEL4 (16 * 6)
    KERNEL5 (16 * 6)
    KERNEL6 (16 * 6)
    KERNEL7 (16 * 6)
    KERNEL8 (16 * 6)
    KERNEL9 (16 * 6)
    KERNEL10(16 * 6)
    KERNEL11(16 * 6)
    KERNEL12(16 * 6)
    KERNEL13(16 * 6)
    KERNEL14(16 * 6)
    KERNEL15(16 * 6)
    KERNEL16(16 * 6)
    cmpq $128 * 7, %rax
    NOBRANCH
    jle .L12
    KERNEL1 (16 * 7)
    KERNEL2 (16 * 7)
    KERNEL3 (16 * 7)
    KERNEL4 (16 * 7)
    KERNEL5 (16 * 7)
    KERNEL6 (16 * 7)
    KERNEL7 (16 * 7)
    KERNEL8 (16 * 7)
    KERNEL9 (16 * 7)
    KERNEL10(16 * 7)
    KERNEL11(16 * 7)
    KERNEL12(16 * 7)
    KERNEL13(16 * 7)
    KERNEL14(16 * 7)
    KERNEL15(16 * 7)
    KERNEL16(16 * 7)

    addq $32 * 8 * SIZE, AO
    addq $32 * 8 * SIZE, BO
    subq $128 * 8, %rax
    BRANCH
    jg .L1X

.L12:
    leaq (AO, %rax, 2), AO # * 16
    leaq (BO, %rax, 2), BO # * 64
#else
    sarq $3, %rax
    je .L15
    ALIGN_4

.L12:
    mulpd %xmm8, %xmm9
    PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
    addpd %xmm9, %xmm0
    movddup 1 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm1
    movddup 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm2
    movddup 3 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 2 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm3
    movddup 0 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm4
    movddup 1 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm5
    movddup 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm6
    movddup 3 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 4 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm7
    movddup 4 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm0
    movddup 5 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm1
    movddup 6 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm2
    movddup 7 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 6 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm3
    movddup 4 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm4
    movddup 5 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm5
    movddup 6 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm6
    movddup 7 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 32 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm7
    movddup 32 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm0
    movddup 9 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm1
    movddup 10 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm2
    movddup 11 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movapd 10 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm3
    movddup 8 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm4
    movddup 9 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm5
    movddup 10 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm6
    movddup 11 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movapd 12 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm7
    movddup 12 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm0
    movddup 13 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm1
    movddup 14 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm2
    movddup 15 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movapd 14 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm3
    movddup 12 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm4
    movddup 13 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm5
    movddup 14 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm6
    movddup 15 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movapd 40 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm7
    movddup 40 * SIZE(BO), %xmm11
    mulpd %xmm12, %xmm13
    PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
    addpd %xmm13, %xmm0
    movddup 17 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm1
    movddup 18 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm2
    movddup 19 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    movapd 18 * SIZE(AO), %xmm12
    addpd %xmm13, %xmm3
    movddup 16 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm4
    movddup 17 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm5
    movddup 18 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm6
    movddup 19 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    movapd 20 * SIZE(AO), %xmm12
    addpd %xmm13, %xmm7
    movddup 20 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm0
    movddup 21 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm1
    movddup 22 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm2
    movddup 23 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    movapd 22 * SIZE(AO), %xmm12
    addpd %xmm13, %xmm3
    movddup 20 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm4
    movddup 21 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm5
    movddup 22 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    addpd %xmm13, %xmm6
    movddup 23 * SIZE(BO), %xmm13
    mulpd %xmm12, %xmm13
    movapd 48 * SIZE(AO), %xmm12
    addpd %xmm13, %xmm7
    movddup 48 * SIZE(BO), %xmm13
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm0
    movddup 25 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm1
    movddup 26 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm2
    movddup 27 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    movapd 26 * SIZE(AO), %xmm14
    addpd %xmm15, %xmm3
    movddup 24 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm4
    movddup 25 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm5
    movddup 26 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm6
    movddup 27 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    movapd 28 * SIZE(AO), %xmm14
    addpd %xmm15, %xmm7
    movddup 28 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm0
    movddup 29 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm1
    movddup 30 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm2
    movddup 31 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    movapd 30 * SIZE(AO), %xmm14
    addpd %xmm15, %xmm3
    movddup 28 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm4
    movddup 29 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm5
    movddup 30 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    addpd %xmm15, %xmm6
    movddup 31 * SIZE(BO), %xmm15
    mulpd %xmm14, %xmm15
    movapd 56 * SIZE(AO), %xmm14
    addpd %xmm15, %xmm7
    movddup 56 * SIZE(BO), %xmm15

    addq $32 * SIZE, BO
    addq $32 * SIZE, AO
    decq %rax
    BRANCH
    jne .L12
#endif
    ALIGN_4
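
/* .L15: tail loop handling the remaining k & 7 iterations one at a    */
/* time, then scale the accumulators by alpha (broadcast in xmm15).    */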
.L15:
#ifndef TRMMKERNEL
    movq K, %rax
#else
    movq KKK, %rax
#endif
    movddup ALPHA, %xmm15
    andq $7, %rax # k & 7
    BRANCH
    BRANCH
    je .L19
    ALIGN_4

.L16:
    mulpd %xmm8, %xmm9
    movapd 2 * SIZE(AO), %xmm10
    addpd %xmm9, %xmm0
    movddup 1 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movddup 0 * SIZE(BO), %xmm11
    addpd %xmm9, %xmm1
    movddup 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm2
    movddup 3 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 4 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm3
    movddup 4 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm4
    movddup 1 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm5
    movddup 2 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm6
    movddup 3 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm7

    addq $4 * SIZE, AO # aoffset += 4
    addq $4 * SIZE, BO # boffset += 4
    decq %rax
    BRANCH
    jg .L16
    ALIGN_4
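
/* .L19: write back the 4x4 tile. If CO1 and LDC are both 16-byte      */
/* aligned, use movapd stores; otherwise fall through to .L19x, which  */
/* assembles each pair with movsd/movhpd.                              */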
.L19:
    mulpd %xmm15, %xmm0
    mulpd %xmm15, %xmm4
    mulpd %xmm15, %xmm1
    mulpd %xmm15, %xmm5

    testq $15, CO1
    NOBRANCH
    jne .L19x
    testq $15, LDC
    NOBRANCH
    jne .L19x

    mulpd %xmm15, %xmm2
    mulpd %xmm15, %xmm3
    mulpd %xmm15, %xmm6
    mulpd %xmm15, %xmm7

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    addpd 0 * SIZE(CO1), %xmm0
    addpd 2 * SIZE(CO1), %xmm4
    addpd 0 * SIZE(CO2), %xmm1
    addpd 2 * SIZE(CO2), %xmm5
    addpd 0 * SIZE(CO1, LDC, 2), %xmm2
    addpd 2 * SIZE(CO1, LDC, 2), %xmm6
    addpd 0 * SIZE(CO2, LDC, 2), %xmm3
    addpd 2 * SIZE(CO2, LDC, 2), %xmm7
#endif

    movapd %xmm0, 0 * SIZE(CO1)
    movapd %xmm4, 2 * SIZE(CO1)
    movapd %xmm1, 0 * SIZE(CO2)
    movapd %xmm5, 2 * SIZE(CO2)
    movapd %xmm2, 0 * SIZE(CO1, LDC, 2)
    movapd %xmm6, 2 * SIZE(CO1, LDC, 2)
    movapd %xmm3, 0 * SIZE(CO2, LDC, 2)
    movapd %xmm7, 2 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq K, %rax
    subq KKK, %rax
    leaq (,%rax, SIZE), %rax
    leaq (AO, %rax, 4), AO
    leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
    addq $4, KK
#endif

    addq $4 * SIZE, CO1 # coffset += 4
    addq $4 * SIZE, CO2 # coffset += 4
    decq I # i --
    jg .L11
    jmp .L20
    ALIGN_4
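
/* .L19x: unaligned write-back of the same 4x4 tile, using scalar      */
/* movsd/movhpd loads and stores instead of movapd.                    */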
.L19x:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    movsd 0 * SIZE(CO1), %xmm8
    movhpd 1 * SIZE(CO1), %xmm8
    movsd 2 * SIZE(CO1), %xmm9
    movhpd 3 * SIZE(CO1), %xmm9
    movsd 0 * SIZE(CO2), %xmm10
    movhpd 1 * SIZE(CO2), %xmm10
    movsd 2 * SIZE(CO2), %xmm11
    movhpd 3 * SIZE(CO2), %xmm11

    addpd %xmm8, %xmm0
    addpd %xmm9, %xmm4
    addpd %xmm10, %xmm1
    addpd %xmm11, %xmm5
#endif

    mulpd %xmm15, %xmm2
    mulpd %xmm15, %xmm3
    mulpd %xmm15, %xmm6
    mulpd %xmm15, %xmm7

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    movsd 0 * SIZE(CO1, LDC, 2), %xmm12
    movhpd 1 * SIZE(CO1, LDC, 2), %xmm12
    movsd 2 * SIZE(CO1, LDC, 2), %xmm13
    movhpd 3 * SIZE(CO1, LDC, 2), %xmm13
    movsd 0 * SIZE(CO2, LDC, 2), %xmm14
    movhpd 1 * SIZE(CO2, LDC, 2), %xmm14
    movsd 2 * SIZE(CO2, LDC, 2), %xmm15
    movhpd 3 * SIZE(CO2, LDC, 2), %xmm15

    addpd %xmm12, %xmm2
    addpd %xmm13, %xmm6
    addpd %xmm14, %xmm3
    addpd %xmm15, %xmm7
#endif

    movsd %xmm0, 0 * SIZE(CO1)
    movhpd %xmm0, 1 * SIZE(CO1)
    movsd %xmm4, 2 * SIZE(CO1)
    movhpd %xmm4, 3 * SIZE(CO1)
    movsd %xmm1, 0 * SIZE(CO2)
    movhpd %xmm1, 1 * SIZE(CO2)
    movsd %xmm5, 2 * SIZE(CO2)
    movhpd %xmm5, 3 * SIZE(CO2)
    movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
    movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
    movsd %xmm6, 2 * SIZE(CO1, LDC, 2)
    movhpd %xmm6, 3 * SIZE(CO1, LDC, 2)
    movsd %xmm3, 0 * SIZE(CO2, LDC, 2)
    movhpd %xmm3, 1 * SIZE(CO2, LDC, 2)
    movsd %xmm7, 2 * SIZE(CO2, LDC, 2)
    movhpd %xmm7, 3 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq K, %rax
    subq KKK, %rax
    leaq (,%rax, SIZE), %rax
    leaq (AO, %rax, 4), AO
    leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
    addq $4, KK
#endif

    addq $4 * SIZE, CO1 # coffset += 4
    addq $4 * SIZE, CO2 # coffset += 4
    decq I # i --
    jg .L11
    ALIGN_4
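
/* .L20: handle the case m % 4 >= 2 with a 2x4 micro-kernel.           */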
.L20:
    testq $2, M
    BRANCH
    je .L30
    ALIGN_4

.L21:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq B, BO
#else
    movq KK, %rax
    leaq (, %rax, SIZE), %rax
    leaq (AO, %rax, 2), AO
    leaq (B, %rax, 4), BO
#endif

    movapd 0 * SIZE(AO), %xmm8
    pxor %xmm0, %xmm0
    movddup 0 * SIZE(BO), %xmm9
    pxor %xmm1, %xmm1
    movapd 8 * SIZE(AO), %xmm10
    pxor %xmm2, %xmm2
    movddup 8 * SIZE(BO), %xmm11
    pxor %xmm3, %xmm3

#ifndef TRMMKERNEL
    movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    movq K, %rax
    subq KK, %rax
    movq %rax, KKK
#else
    movq KK, %rax
#ifdef LEFT
    addq $2, %rax
#else
    addq $4, %rax
#endif
    movq %rax, KKK
#endif
    sarq $3, %rax
    je .L25
    ALIGN_4
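
/* .L22: 2x4 inner loop, unrolled eight k-iterations deep; xmm0-xmm3   */
/* each accumulate one 2-element column of the C tile.                 */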
.L22:
    mulpd %xmm8, %xmm9
    PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
    addpd %xmm9, %xmm0
    movddup 1 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm1
    movddup 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm2
    movddup 3 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 2 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm3
    movddup 4 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm0
    movddup 5 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm1
    movddup 6 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm2
    movddup 7 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 4 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm3
    movddup 16 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm0
    movddup 9 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm1
    movddup 10 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm2
    movddup 11 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    movapd 6 * SIZE(AO), %xmm8
    addpd %xmm11, %xmm3
    movddup 12 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm0
    movddup 13 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm1
    movddup 14 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm2
    movddup 15 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    movapd 16 * SIZE(AO), %xmm8
    addpd %xmm11, %xmm3
    movddup 24 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm0
    movddup 17 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm1
    movddup 18 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm2
    movddup 19 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    movapd 10 * SIZE(AO), %xmm10
    addpd %xmm9, %xmm3
    movddup 20 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm0
    movddup 21 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm1
    movddup 22 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm2
    movddup 23 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    movapd 12 * SIZE(AO), %xmm10
    addpd %xmm9, %xmm3
    movddup 32 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm0
    movddup 25 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm1
    movddup 26 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm2
    movddup 27 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movapd 14 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm3
    movddup 28 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm0
    movddup 29 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm1
    movddup 30 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm2
    movddup 31 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movapd 24 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm3
    movddup 40 * SIZE(BO), %xmm11

    addq $16 * SIZE, AO
    addq $32 * SIZE, BO
    decq %rax
    jne .L22
    ALIGN_4
.L25:
#ifndef TRMMKERNEL
    movq K, %rax
#else
    movq KKK, %rax
#endif
    movddup ALPHA, %xmm15
    andq $7, %rax # k & 7
    BRANCH
    je .L29
    ALIGN_4

.L26:
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm0
    movddup 1 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm1
    movddup 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm2
    movddup 3 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movapd 2 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm3
    movddup 4 * SIZE(BO), %xmm9

    addq $2 * SIZE, AO # aoffset += 2
    addq $4 * SIZE, BO # boffset += 4
    decq %rax
    jg .L26
    ALIGN_4
.L29:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    movsd 0 * SIZE(CO1), %xmm8
    movhpd 1 * SIZE(CO1), %xmm8
    movsd 0 * SIZE(CO2), %xmm10
    movhpd 1 * SIZE(CO2), %xmm10
    movsd 0 * SIZE(CO1, LDC, 2), %xmm12
    movhpd 1 * SIZE(CO1, LDC, 2), %xmm12
    movsd 0 * SIZE(CO2, LDC, 2), %xmm14
    movhpd 1 * SIZE(CO2, LDC, 2), %xmm14
#endif

    mulpd %xmm15, %xmm0
    mulpd %xmm15, %xmm1
    mulpd %xmm15, %xmm2
    mulpd %xmm15, %xmm3

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    addpd %xmm8, %xmm0
    addpd %xmm10, %xmm1
    addpd %xmm12, %xmm2
    addpd %xmm14, %xmm3
#endif

    movsd %xmm0, 0 * SIZE(CO1)
    movhpd %xmm0, 1 * SIZE(CO1)
    movsd %xmm1, 0 * SIZE(CO2)
    movhpd %xmm1, 1 * SIZE(CO2)
    movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
    movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
    movsd %xmm3, 0 * SIZE(CO2, LDC, 2)
    movhpd %xmm3, 1 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq K, %rax
    subq KKK, %rax
    leaq (,%rax, SIZE), %rax
    leaq (AO, %rax, 2), AO
    leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
    addq $2, KK
#endif

    addq $2 * SIZE, CO1 # coffset += 2
    addq $2 * SIZE, CO2 # coffset += 2
    ALIGN_4
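
/* .L30: handle the last row when m is odd with a 1x4 micro-kernel.    */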
.L30:
    testq $1, M
    je .L39
    ALIGN_4

.L31:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq B, BO
#else
    movq KK, %rax
    leaq (, %rax, SIZE), %rax
    leaq (AO, %rax, 1), AO
    leaq (B, %rax, 4), BO
#endif

    movddup 0 * SIZE(AO), %xmm8
    pxor %xmm0, %xmm0
    movapd 0 * SIZE(BO), %xmm9
    pxor %xmm1, %xmm1
    movddup 4 * SIZE(AO), %xmm10
    pxor %xmm2, %xmm2
    movapd 8 * SIZE(BO), %xmm11
    pxor %xmm3, %xmm3

#ifndef TRMMKERNEL
    movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    movq K, %rax
    subq KK, %rax
    movq %rax, KKK
#else
    movq KK, %rax
#ifdef LEFT
    addq $1, %rax
#else
    addq $4, %rax
#endif
    movq %rax, KKK
#endif
    sarq $3, %rax
    je .L35
    ALIGN_4
.L32:
    mulpd %xmm8, %xmm9
    PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
    addpd %xmm9, %xmm0
    movapd 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movddup 1 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm1
    movapd 4 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm0
    movapd 6 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movddup 2 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm1
    movapd 16 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm0
    movapd 10 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    movddup 3 * SIZE(AO), %xmm8
    addpd %xmm11, %xmm1
    movapd 12 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    addpd %xmm11, %xmm0
    movapd 14 * SIZE(BO), %xmm11
    mulpd %xmm8, %xmm11
    movddup 8 * SIZE(AO), %xmm8
    addpd %xmm11, %xmm1
    movapd 24 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm0
    movapd 18 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    movddup 5 * SIZE(AO), %xmm10
    addpd %xmm9, %xmm1
    movapd 20 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    addpd %xmm9, %xmm0
    movapd 22 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm9
    movddup 6 * SIZE(AO), %xmm10
    addpd %xmm9, %xmm1
    movapd 32 * SIZE(BO), %xmm9
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm0
    movapd 26 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movddup 7 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm1
    movapd 28 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    addpd %xmm11, %xmm0
    movapd 30 * SIZE(BO), %xmm11
    mulpd %xmm10, %xmm11
    movddup 12 * SIZE(AO), %xmm10
    addpd %xmm11, %xmm1
    movapd 40 * SIZE(BO), %xmm11

    addq $ 8 * SIZE, AO
    addq $32 * SIZE, BO
    decq %rax
    jne .L32
    ALIGN_4
.L35:
#ifndef TRMMKERNEL
    movq K, %rax
#else
    movq KKK, %rax
#endif
    movddup ALPHA, %xmm15
    andq $7, %rax # k & 7
    BRANCH
    je .L38
    ALIGN_4

.L36:
    mulpd %xmm8, %xmm9
    addpd %xmm9, %xmm0
    movapd 2 * SIZE(BO), %xmm9
    mulpd %xmm8, %xmm9
    movddup 1 * SIZE(AO), %xmm8
    addpd %xmm9, %xmm1
    movapd 4 * SIZE(BO), %xmm9

    addq $1 * SIZE, AO # aoffset += 1
    addq $4 * SIZE, BO # boffset += 4
    decq %rax
    jg .L36
    ALIGN_4
.L38:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    movsd 0 * SIZE(CO1), %xmm8
    movhpd 0 * SIZE(CO2), %xmm8
    movsd 0 * SIZE(CO1, LDC, 2), %xmm9
    movhpd 0 * SIZE(CO2, LDC, 2), %xmm9
#endif

    mulpd %xmm15, %xmm0
    mulpd %xmm15, %xmm1

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
    addpd %xmm8, %xmm0
    addpd %xmm9, %xmm1
#endif

    movsd %xmm0, 0 * SIZE(CO1)
    movhpd %xmm0, 0 * SIZE(CO2)
    movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
    movhpd %xmm1, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq K, %rax
    subq KKK, %rax
    leaq (,%rax, SIZE), %rax
    leaq (AO, %rax, 1), AO
    leaq (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
    addq $1, KK
#endif
    ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
    addq $4, KK
#endif

    leaq (C, LDC, 4), C # c += 4 * ldc
    movq BO, B
    decq J # j --
    jg .L10
    ALIGN_4
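
/* .L40: after all groups of four columns, handle n % 4 >= 2 as a pair */
/* of columns, again split into 4x2 and narrower row blocks below.     */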
.L40:
    testq $2, N
    je .L80
    ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
    movq OFFSET, %rax
    movq %rax, KK
#endif

    movq C, CO1 # coffset1 = c
    leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
    movq A, AO # aoffset = a

    movq K, %rax
    salq $BASE_SHIFT + 1, %rax
    leaq (B, %rax), BB

    movq M, I
    sarq $2, I # i = (m >> 2)
    jle .L60
    ALIGN_4
.L51:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
    movq B, BO
#else
    movq KK, %rax
    leaq (, %rax, SIZE), %rax
    leaq (AO, %rax, 4), AO
    leaq (B, %rax, 2), BO
#endif

    prefetcht0 0 * SIZE(BB)
    subq $-4 * SIZE, BB

    movapd 0 * SIZE(AO), %xmm8
    pxor %xmm0, %xmm0
    movddup 0 * SIZE(BO), %xmm9
    pxor %xmm1, %xmm1
    movapd 8 * SIZE(AO), %xmm10
    pxor %xmm4, %xmm4
    movddup 8 * SIZE(BO), %xmm11
    pxor %xmm5, %xmm5

#ifdef HAVE_3DNOW
    prefetchw 4 * SIZE(CO1)
    prefetchw 4 * SIZE(CO2)
#else
    prefetchnta 4 * SIZE(CO1)
    prefetchnta 4 * SIZE(CO2)
#endif

#ifndef TRMMKERNEL
    movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    movq K, %rax
    subq KK, %rax
    movq %rax, KKK
#else
    movq KK, %rax
#ifdef LEFT
    addq $4, %rax
#else
    addq $2, %rax
#endif
    movq %rax, KKK
#endif
    sarq $3, %rax
    je .L55
    ALIGN_4
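
/* .L52: 4x2 inner loop, unrolled eight k-iterations deep; xmm0/xmm1   */
/* accumulate rows 0-1 and xmm4/xmm5 rows 2-3 of the two C columns.    */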
.L52:
	mulpd %xmm8, %xmm9
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addpd %xmm9, %xmm0
	movddup 1 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 2 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 0 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm4
	movddup 1 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 4 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm5
	movddup 2 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm0
	movddup 3 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 6 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 2 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm4
	movddup 3 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 16 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm5
	movddup 4 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	addpd %xmm9, %xmm0
	movddup 5 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	movapd 10 * SIZE(AO), %xmm10
	addpd %xmm9, %xmm1
	movddup 4 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	addpd %xmm9, %xmm4
	movddup 5 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	movapd 12 * SIZE(AO), %xmm10
	addpd %xmm9, %xmm5
	movddup 6 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	addpd %xmm9, %xmm0
	movddup 7 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	movapd 14 * SIZE(AO), %xmm10
	addpd %xmm9, %xmm1
	movddup 6 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	addpd %xmm9, %xmm4
	movddup 7 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm9
	movapd 40 * SIZE(AO), %xmm10
	addpd %xmm9, %xmm5
	movddup 16 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm11
	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
	addpd %xmm11, %xmm0
	movddup 9 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 18 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm1
	movddup 8 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm4
	movddup 9 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 20 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm5
	movddup 10 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm0
	movddup 11 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 22 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm1
	movddup 10 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm4
	movddup 11 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 24 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm5
	movddup 12 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm0
	movddup 13 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 26 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm1
	movddup 12 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm4
	movddup 13 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 28 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm5
	movddup 14 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm0
	movddup 15 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 30 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm1
	movddup 14 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	addpd %xmm11, %xmm4
	movddup 15 * SIZE(BO), %xmm11
	mulpd %xmm8, %xmm11
	movapd 32 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm5
	movddup 24 * SIZE(BO), %xmm11
	addq $32 * SIZE, AO
	addq $16 * SIZE, BO
	decq %rax
	jne .L52
	ALIGN_4
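/* .L55/.L56: handle the K % 8 leftover iterations one k step at a time. */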
.L55:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	movddup ALPHA, %xmm15
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L59
	ALIGN_4
.L56:
	mulpd %xmm8, %xmm9
	movapd 2 * SIZE(AO), %xmm10
	addpd %xmm9, %xmm0
	movddup 1 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movddup 0 * SIZE(BO), %xmm11
	addpd %xmm9, %xmm1
	movddup 2 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm11
	movapd 4 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm4
	movddup 1 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	addpd %xmm11, %xmm5
	addq $4 * SIZE, AO # aoffset += 4
	addq $2 * SIZE, BO # boffset1 += 2
	decq %rax
	jg .L56
	ALIGN_4
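/* .L59: write back the 4x2 tile: scale by alpha and, unless TRMMKERNEL or
   BETAZERO is defined, accumulate into the existing C values. */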
.L59:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd 0 * SIZE(CO1), %xmm8
	movhpd 1 * SIZE(CO1), %xmm8
	movsd 2 * SIZE(CO1), %xmm9
	movhpd 3 * SIZE(CO1), %xmm9
	movsd 0 * SIZE(CO2), %xmm10
	movhpd 1 * SIZE(CO2), %xmm10
	movsd 2 * SIZE(CO2), %xmm11
	movhpd 3 * SIZE(CO2), %xmm11
#endif
	mulpd %xmm15, %xmm0
	mulpd %xmm15, %xmm1
	mulpd %xmm15, %xmm4
	mulpd %xmm15, %xmm5
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd %xmm8, %xmm0
	addpd %xmm9, %xmm4
	addpd %xmm10, %xmm1
	addpd %xmm11, %xmm5
#endif
	movsd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
	movsd %xmm4, 2 * SIZE(CO1)
	movhpd %xmm4, 3 * SIZE(CO1)
	movsd %xmm1, 0 * SIZE(CO2)
	movhpd %xmm1, 1 * SIZE(CO2)
	movsd %xmm5, 2 * SIZE(CO2)
	movhpd %xmm5, 3 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif
	addq $4 * SIZE, CO1 # coffset += 4
	addq $4 * SIZE, CO2 # coffset += 4
	decq I # i --
	jg .L51
	ALIGN_4
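/* .L60: if M has bit 1 set, compute one 2x2 tile for this two-column panel. */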
.L60:
	testq $2, M
	je .L70
	ALIGN_4
.L61:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (B, %rax, 2), BO
#endif
	movapd 0 * SIZE(AO), %xmm8
	pxor %xmm0, %xmm0
	movddup 0 * SIZE(BO), %xmm9
	pxor %xmm1, %xmm1
	movapd 8 * SIZE(AO), %xmm10
	pxor %xmm2, %xmm2
	movddup 8 * SIZE(BO), %xmm11
	pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $3, %rax
	je .L65
	ALIGN_4
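/* .L62: K loop for the 2x2 tile, unrolled 8x; xmm0-xmm3 hold partial sums
   that are combined pairwise before the write-back. */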
.L62:
	mulpd %xmm8, %xmm9
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addpd %xmm9, %xmm0
	movddup 1 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 2 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 2 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm2
	movddup 3 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 4 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm3
	movddup 4 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm0
	movddup 5 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 6 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 6 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm2
	movddup 7 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 16 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm3
	movddup 16 * SIZE(BO), %xmm9
	mulpd %xmm10, %xmm11
	addpd %xmm11, %xmm0
	movddup 9 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	movapd 10 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm1
	movddup 10 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	addpd %xmm11, %xmm2
	movddup 11 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	movapd 12 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm3
	movddup 12 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	addpd %xmm11, %xmm0
	movddup 13 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	movapd 14 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm1
	movddup 14 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	addpd %xmm11, %xmm2
	movddup 15 * SIZE(BO), %xmm11
	mulpd %xmm10, %xmm11
	movapd 24 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm3
	movddup 24 * SIZE(BO), %xmm11
	addq $16 * SIZE, AO
	addq $16 * SIZE, BO
	decq %rax
	jne .L62
	ALIGN_4
.L65:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	movddup ALPHA, %xmm15
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L69
	ALIGN_4
.L66:
	mulpd %xmm8, %xmm9
	addpd %xmm9, %xmm0
	movddup 1 * SIZE(BO), %xmm9
	mulpd %xmm8, %xmm9
	movapd 2 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 2 * SIZE(BO), %xmm9
	addq $2 * SIZE, AO # aoffset += 2
	addq $2 * SIZE, BO # boffset1 += 2
	decq %rax
	jg .L66
	ALIGN_4
.L69:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd 0 * SIZE(CO1), %xmm8
	movhpd 1 * SIZE(CO1), %xmm8
	movsd 0 * SIZE(CO2), %xmm10
	movhpd 1 * SIZE(CO2), %xmm10
#endif
	addpd %xmm2, %xmm0
	addpd %xmm3, %xmm1
	mulpd %xmm15, %xmm0
	mulpd %xmm15, %xmm1
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd %xmm8, %xmm0
	addpd %xmm10, %xmm1
#endif
	movsd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
	movsd %xmm1, 0 * SIZE(CO2)
	movhpd %xmm1, 1 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif
	addq $2 * SIZE, CO1 # coffset += 2
	addq $2 * SIZE, CO2 # coffset += 2
	ALIGN_4
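/* .L70: if M has bit 0 set, compute one 1x2 row for this two-column panel. */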
.L70:
	testq $1, M
	je .L79
	ALIGN_4
.L71:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (B, %rax, 2), BO
#endif
	movddup 0 * SIZE(AO), %xmm8
	pxor %xmm0, %xmm0
	movapd 0 * SIZE(BO), %xmm9
	pxor %xmm1, %xmm1
	movddup 4 * SIZE(AO), %xmm10
	pxor %xmm2, %xmm2
	movapd 8 * SIZE(BO), %xmm11
	pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $2, %rax
#endif
	movq %rax, KKK
#endif
	sarq $3, %rax
	je .L75
	ALIGN_4
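/* .L72: K loop for the 1x2 row, unrolled 8x; each A element is broadcast
   (movddup) and multiplied by a packed pair of B values, one per column. */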
.L72:
	mulpd %xmm8, %xmm9
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	movddup 1 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm0
	mulpd 2 * SIZE(BO), %xmm8
	movapd 16 * SIZE(BO), %xmm9
	addpd %xmm8, %xmm1
	movddup 2 * SIZE(AO), %xmm8
	mulpd 4 * SIZE(BO), %xmm8
	addpd %xmm8, %xmm2
	movddup 3 * SIZE(AO), %xmm8
	mulpd 6 * SIZE(BO), %xmm8
	addpd %xmm8, %xmm3
	movddup 8 * SIZE(AO), %xmm8
	mulpd %xmm10, %xmm11
	movddup 5 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm0
	mulpd 10 * SIZE(BO), %xmm10
	movapd 24 * SIZE(BO), %xmm11
	addpd %xmm10, %xmm1
	movddup 6 * SIZE(AO), %xmm10
	mulpd 12 * SIZE(BO), %xmm10
	addpd %xmm10, %xmm2
	movddup 7 * SIZE(AO), %xmm10
	mulpd 14 * SIZE(BO), %xmm10
	addpd %xmm10, %xmm3
	movddup 12 * SIZE(AO), %xmm10
	addq $8 * SIZE, AO
	addq $16 * SIZE, BO
	decq %rax
	jne .L72
	ALIGN_4
.L75:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	movddup ALPHA, %xmm15
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L78
	ALIGN_4
.L76:
	mulpd %xmm8, %xmm9
	movddup 1 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm0
	movapd 2 * SIZE(BO), %xmm9
	addq $1 * SIZE, AO # aoffset += 1
	addq $2 * SIZE, BO # boffset1 += 2
	decq %rax
	jg .L76
	ALIGN_4
.L78:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd 0 * SIZE(CO1), %xmm8
	movhpd 0 * SIZE(CO2), %xmm8
#endif
	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2
	addpd %xmm2, %xmm0
	mulpd %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd %xmm8, %xmm0
#endif
	movsd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 0 * SIZE(CO2)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 2), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq $1, KK
#endif
	ALIGN_4
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq $2, KK
#endif
	leaq (C, LDC, 2), C
	movq BO, B
	ALIGN_4
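/* .L80: final single-column panel, taken when N is odd.  Same 4/2/1 M
   blocking as above, but B advances one value per k step. */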
.L80:
	testq $1, N
	je .L999
	ALIGN_4
#if defined(TRMMKERNEL) && defined(LEFT)
	movq OFFSET, %rax
	movq %rax, KK
#endif
	movq C, CO1
	movq A, AO
	movq M, I
	sarq $2, I # i = (m >> 2)
	jle .L100
	ALIGN_4
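/* .L91: 4x1 tile: four C values in a single column, accumulated in xmm0-xmm3. */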
.L91:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (B, %rax, 1), BO
#endif
	movapd 0 * SIZE(AO), %xmm8
	pxor %xmm0, %xmm0
	movddup 0 * SIZE(BO), %xmm9
	pxor %xmm1, %xmm1
	movapd 8 * SIZE(AO), %xmm10
	pxor %xmm2, %xmm2
	movddup 4 * SIZE(BO), %xmm11
	pxor %xmm3, %xmm3
#ifdef HAVE_3DNOW
	prefetchw 4 * SIZE(CO1)
#else
	prefetchnta 4 * SIZE(CO1)
#endif
#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $4, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $3, %rax
	je .L95
	ALIGN_4
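/* .L92: K loop for the 4x1 tile, unrolled 8x; each B value is broadcast
   (movddup) and multiplied against packed pairs of A values. */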
.L92:
	mulpd %xmm9, %xmm8
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	mulpd 2 * SIZE(AO), %xmm9
	addpd %xmm8, %xmm0
	movapd 4 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 1 * SIZE(BO), %xmm9
	mulpd %xmm9, %xmm8
	mulpd 6 * SIZE(AO), %xmm9
	addpd %xmm8, %xmm2
	movapd 16 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm3
	movddup 2 * SIZE(BO), %xmm9
	mulpd %xmm9, %xmm10
	mulpd 10 * SIZE(AO), %xmm9
	addpd %xmm10, %xmm0
	movapd 12 * SIZE(AO), %xmm10
	addpd %xmm9, %xmm1
	movddup 3 * SIZE(BO), %xmm9
	mulpd %xmm9, %xmm10
	mulpd 14 * SIZE(AO), %xmm9
	addpd %xmm10, %xmm2
	movapd 24 * SIZE(AO), %xmm10
	PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
	addpd %xmm9, %xmm3
	movddup 8 * SIZE(BO), %xmm9
	mulpd %xmm11, %xmm8
	mulpd 18 * SIZE(AO), %xmm11
	addpd %xmm8, %xmm0
	movapd 20 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm1
	movddup 5 * SIZE(BO), %xmm11
	mulpd %xmm11, %xmm8
	mulpd 22 * SIZE(AO), %xmm11
	addpd %xmm8, %xmm2
	movapd 32 * SIZE(AO), %xmm8
	addpd %xmm11, %xmm3
	movddup 6 * SIZE(BO), %xmm11
	mulpd %xmm11, %xmm10
	mulpd 26 * SIZE(AO), %xmm11
	addpd %xmm10, %xmm0
	movapd 28 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm1
	movddup 7 * SIZE(BO), %xmm11
	mulpd %xmm11, %xmm10
	mulpd 30 * SIZE(AO), %xmm11
	addpd %xmm10, %xmm2
	movapd 40 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm3
	movddup 12 * SIZE(BO), %xmm11
	addq $32 * SIZE, AO
	addq $8 * SIZE, BO
	decq %rax
	jne .L92
	ALIGN_4
.L95:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	movddup ALPHA, %xmm15
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L99
	ALIGN_4
.L96:
	mulpd %xmm9, %xmm8
	mulpd 2 * SIZE(AO), %xmm9
	addpd %xmm8, %xmm0
	movapd 4 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 1 * SIZE(BO), %xmm9
	addq $4 * SIZE, AO # aoffset += 4
	addq $1 * SIZE, BO # boffset1 += 1
	decq %rax
	jg .L96
	ALIGN_4
.L99:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd 0 * SIZE(CO1), %xmm8
	movhpd 1 * SIZE(CO1), %xmm8
	movsd 2 * SIZE(CO1), %xmm9
	movhpd 3 * SIZE(CO1), %xmm9
#endif
	addpd %xmm2, %xmm0
	addpd %xmm3, %xmm1
	mulpd %xmm15, %xmm0
	mulpd %xmm15, %xmm1
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd %xmm8, %xmm0
	addpd %xmm9, %xmm1
#endif
	movsd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
	movsd %xmm1, 2 * SIZE(CO1)
	movhpd %xmm1, 3 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 1), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq $4, KK
#endif
	addq $4 * SIZE, CO1 # coffset += 4
	decq I # i --
	jg .L91
	ALIGN_4
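/* .L100: if M has bit 1 set, compute one 2x1 tile for the single column. */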
.L100:
	testq $2, M
	je .L110
	ALIGN_4
.L101:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (B, %rax, 1), BO
#endif
	movapd 0 * SIZE(AO), %xmm8
	pxor %xmm0, %xmm0
	movddup 0 * SIZE(BO), %xmm9
	pxor %xmm1, %xmm1
	movapd 8 * SIZE(AO), %xmm10
	pxor %xmm2, %xmm2
	movddup 4 * SIZE(BO), %xmm11
	pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $2, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $3, %rax
	je .L105
	ALIGN_4
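/* .L102: K loop for the 2x1 tile, unrolled 8x. */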
.L102:
	mulpd %xmm9, %xmm8
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	movddup 1 * SIZE(BO), %xmm9
	addpd %xmm8, %xmm0
	mulpd 2 * SIZE(AO), %xmm9
	movapd 16 * SIZE(AO), %xmm8
	addpd %xmm9, %xmm1
	movddup 2 * SIZE(BO), %xmm9
	mulpd 4 * SIZE(AO), %xmm9
	addpd %xmm9, %xmm2
	movddup 3 * SIZE(BO), %xmm9
	mulpd 6 * SIZE(AO), %xmm9
	addpd %xmm9, %xmm3
	movddup 8 * SIZE(BO), %xmm9
	mulpd %xmm11, %xmm10
	movddup 5 * SIZE(BO), %xmm11
	addpd %xmm10, %xmm0
	mulpd 10 * SIZE(AO), %xmm11
	movapd 24 * SIZE(AO), %xmm10
	addpd %xmm11, %xmm1
	movddup 6 * SIZE(BO), %xmm11
	mulpd 12 * SIZE(AO), %xmm11
	addpd %xmm11, %xmm2
	movddup 7 * SIZE(BO), %xmm11
	mulpd 14 * SIZE(AO), %xmm11
	addpd %xmm11, %xmm3
	movddup 12 * SIZE(BO), %xmm11
	addq $16 * SIZE, AO
	addq $8 * SIZE, BO
	decq %rax
	jne .L102
	ALIGN_4
.L105:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	movddup ALPHA, %xmm15
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L109
	ALIGN_4
.L106:
	mulpd %xmm9, %xmm8
	movddup 1 * SIZE(BO), %xmm9
	addpd %xmm8, %xmm0
	movapd 2 * SIZE(AO), %xmm8
	addq $2 * SIZE, AO # aoffset += 2
	addq $1 * SIZE, BO # boffset1 += 1
	decq %rax
	jg .L106
	ALIGN_4
.L109:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd 0 * SIZE(CO1), %xmm8
	movhpd 1 * SIZE(CO1), %xmm8
#endif
	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2
	addpd %xmm2, %xmm0
	mulpd %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd %xmm8, %xmm0
#endif
	movsd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq K, %rax
	subq KKK, %rax
	leaq (,%rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 1), BO
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addq $2, KK
#endif
	addq $2 * SIZE, CO1 # coffset += 2
	ALIGN_4
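/* .L110: if M has bit 0 set, the last element is a plain dot product of one
   row of A with the remaining column of B. */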
.L110:
	testq $1, M
	je .L999
	ALIGN_4
.L111:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq B, BO
#else
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (B, %rax, 1), BO
#endif
	movapd 0 * SIZE(AO), %xmm9 # the movapd loads below are what the loop uses;
	pxor %xmm0, %xmm0          # the earlier scalar movsd preloads of xmm8-xmm11
	movapd 0 * SIZE(BO), %xmm8 # were dead (overwritten before use) and dropped
	pxor %xmm1, %xmm1
	movapd 4 * SIZE(AO), %xmm11
	pxor %xmm2, %xmm2
	movapd 4 * SIZE(BO), %xmm10
	pxor %xmm3, %xmm3
#ifndef TRMMKERNEL
	movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq K, %rax
	subq KK, %rax
	movq %rax, KKK
#else
	movq KK, %rax
#ifdef LEFT
	addq $1, %rax
#else
	addq $1, %rax
#endif
	movq %rax, KKK
#endif
	sarq $3, %rax
	je .L115
	ALIGN_4
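/* .L112: dot-product loop, four packed multiplies per pass covering eight
   k values; the two partial sums are reduced with haddpd at .L118. */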
.L112:
	mulpd %xmm9, %xmm8
	movapd 2 * SIZE(AO), %xmm9
	addpd %xmm8, %xmm0
	mulpd 2 * SIZE(BO), %xmm9
	movapd 8 * SIZE(BO), %xmm8
	addpd %xmm9, %xmm1
	movapd 8 * SIZE(AO), %xmm9
	mulpd %xmm11, %xmm10
	movapd 6 * SIZE(AO), %xmm11
	addpd %xmm10, %xmm0
	mulpd 6 * SIZE(BO), %xmm11
	movapd 12 * SIZE(BO), %xmm10
	addpd %xmm11, %xmm1
	movapd 12 * SIZE(AO), %xmm11
	addq $8 * SIZE, AO
	addq $8 * SIZE, BO
	decq %rax
	jne .L112
	ALIGN_4
.L115:
#ifndef TRMMKERNEL
	movq K, %rax
#else
	movq KKK, %rax
#endif
	movddup ALPHA, %xmm15
	andq $7, %rax # if (k & 7)
	BRANCH
	je .L118
	ALIGN_4
.L116:
	mulsd 0 * SIZE(BO), %xmm9
	addsd %xmm9, %xmm0
	movsd 1 * SIZE(AO), %xmm9
	addq $1 * SIZE, AO # aoffset += 1
	addq $1 * SIZE, BO # boffset1 += 1
	decq %rax
	jg .L116
	ALIGN_4
.L118:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd 0 * SIZE(CO1), %xmm8
#endif
	addpd %xmm1, %xmm0
	haddpd %xmm0, %xmm0
	mulsd %xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addsd %xmm8, %xmm0
#endif
	movsd %xmm0, 0 * SIZE(CO1)
	ALIGN_4
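/* .L999: restore callee-saved registers from the stack and return. */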
.L999:
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif
	addq $STACKSIZE, %rsp
	ret
EPILOGUE