
zgemm3m_kernel_2x4_opteron.S

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
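/* Overview (inferred from the file name and the code below): double-complex */
/* GEMM3M kernel with 2x4 register blocking for AMD Opteron/Barcelona        */
/* (32-bit x86, SSE2/MMX). B is first expanded into the on-stack BUFFER with */
/* every value duplicated, and the inner loops accumulate a 2x4 block of C   */
/* in %xmm4-%xmm7 before scaling by alpha and writing back.                  */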
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define OLD_M 4 + STACK + ARGS(%esi)
  43. #define OLD_N 8 + STACK + ARGS(%esi)
  44. #define OLD_K 12 + STACK + ARGS(%esi)
  45. #define OLD_ALPHA_R 16 + STACK + ARGS(%esi)
  46. #define OLD_ALPHA_I 24 + STACK + ARGS(%esi)
  47. #define OLD_A 32 + STACK + ARGS(%esi)
  48. #define OLD_B 36 + STACK + ARGS(%esi)
  49. #define OLD_C 40 + STACK + ARGS(%esi)
  50. #define OLD_LDC 44 + STACK + ARGS(%esi)
  51. #define ALPHA 0(%esp)
  52. #define K 16(%esp)
  53. #define N 20(%esp)
  54. #define M 24(%esp)
  55. #define A 28(%esp)
  56. #define C 32(%esp)
  57. #define J 36(%esp)
  58. #define BX 40(%esp)
  59. #define OLD_STACK 44(%esp)
  60. #define OFFSET 48(%esp)
  61. #define KK 52(%esp)
  62. #define KKK 56(%esp)
  63. #define BUFFER 128(%esp)
  64. #if defined(OPTERON) || defined(BARCELONA)
  65. #define movsd movlpd
  66. #endif
  67. #if defined(OPTERON) || defined(BARCELONA)
  68. #define PREFETCH prefetch
  69. #define PREFETCHSIZE (8 * 10 + 4)
  70. #endif
  71. #define AA %edx
  72. #define BB %ecx
  73. #define LDC %ebp
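/* KERNEL1..KERNEL8 each perform one unrolled k-iteration of the 2x4 block:  */
/* a pair of A values (%xmm0 or %xmm1) is multiplied by four duplicated B    */
/* values from BUFFER and accumulated into %xmm4-%xmm7. The (address)        */
/* argument selects the unroll step within the AA/BB streams.                */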
  74. #define KERNEL1(address) \
  75. mulpd %xmm0, %xmm2; \
  76. addpd %xmm2, %xmm4; \
  77. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  78. movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  79. mulpd %xmm0, %xmm2; \
  80. addpd %xmm2, %xmm5; \
  81. movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  82. mulpd %xmm0, %xmm2; \
  83. mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  84. addpd %xmm2, %xmm6; \
  85. movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  86. addpd %xmm0, %xmm7; \
  87. movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  88. #define KERNEL2(address) \
  89. mulpd %xmm0, %xmm3; \
  90. addpd %xmm3, %xmm4; \
  91. movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  92. mulpd %xmm0, %xmm3; \
  93. addpd %xmm3, %xmm5; \
  94. movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  95. mulpd %xmm0, %xmm3; \
  96. mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  97. addpd %xmm3, %xmm6; \
  98. movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  99. addpd %xmm0, %xmm7; \
  100. movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  101. #define KERNEL3(address) \
  102. mulpd %xmm0, %xmm2; \
  103. addpd %xmm2, %xmm4; \
  104. movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  105. mulpd %xmm0, %xmm2; \
  106. addpd %xmm2, %xmm5; \
  107. movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  108. mulpd %xmm0, %xmm2; \
  109. mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  110. addpd %xmm2, %xmm6; \
  111. movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  112. addpd %xmm0, %xmm7; \
  113. movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  114. #define KERNEL4(address) \
  115. mulpd %xmm0, %xmm3; \
  116. addpd %xmm3, %xmm4; \
  117. movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  118. mulpd %xmm0, %xmm3; \
  119. addpd %xmm3, %xmm5; \
  120. movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  121. mulpd %xmm0, %xmm3; \
  122. mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  123. addpd %xmm3, %xmm6; \
  124. movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  125. addpd %xmm0, %xmm7; \
  126. movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  127. #define KERNEL5(address) \
  128. PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
  129. mulpd %xmm1, %xmm2; \
  130. addpd %xmm2, %xmm4; \
  131. movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  132. mulpd %xmm1, %xmm2; \
  133. addpd %xmm2, %xmm5; \
  134. movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  135. mulpd %xmm1, %xmm2; \
  136. mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  137. addpd %xmm2, %xmm6; \
  138. movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  139. addpd %xmm1, %xmm7; \
  140. movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  141. #define KERNEL6(address) \
  142. mulpd %xmm1, %xmm3; \
  143. addpd %xmm3, %xmm4; \
  144. movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  145. mulpd %xmm1, %xmm3; \
  146. addpd %xmm3, %xmm5; \
  147. movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  148. mulpd %xmm1, %xmm3; \
  149. mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  150. addpd %xmm3, %xmm6; \
  151. movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  152. addpd %xmm1, %xmm7; \
  153. movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  154. #define KERNEL7(address) \
  155. mulpd %xmm1, %xmm2; \
  156. addpd %xmm2, %xmm4; \
  157. movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  158. mulpd %xmm1, %xmm2; \
  159. addpd %xmm2, %xmm5; \
  160. movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  161. mulpd %xmm1, %xmm2; \
  162. mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  163. addpd %xmm2, %xmm6; \
  164. movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  165. addpd %xmm1, %xmm7; \
  166. movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  167. #define KERNEL8(address) \
  168. mulpd %xmm1, %xmm3; \
  169. addpd %xmm3, %xmm4; \
  170. movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  171. mulpd %xmm1, %xmm3; \
  172. addpd %xmm3, %xmm5; \
  173. movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  174. mulpd %xmm1, %xmm3; \
  175. mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  176. addpd %xmm3, %xmm6; \
  177. movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  178. addpd %xmm1, %xmm7; \
  179. movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  180. PROLOGUE
  181. pushl %ebp
  182. pushl %edi
  183. pushl %esi
  184. pushl %ebx
  185. PROFCODE
  186. EMMS
  187. movl %esp, %esi # save old stack
  188. subl $128 + LOCAL_BUFFER_SIZE, %esp
  189. andl $-1024, %esp # align stack
  190. STACK_TOUCHING
  191. movl OLD_M, %ebx
  192. movl OLD_N, %eax
  193. movl OLD_K, %ecx
  194. movl OLD_A, %edx
  195. movsd OLD_ALPHA_R, %xmm0
  196. movhps OLD_ALPHA_I, %xmm0
  197. movl %ebx, M
  198. movl %eax, N
  199. movl %ecx, K
  200. movl %edx, A
  201. movl %esi, OLD_STACK
  202. #ifdef TRMMKERNEL
  203. movss OLD_OFFT, %xmm4
  204. #endif
  205. movl OLD_B, %edi
  206. movl OLD_C, %ebx
  207. movapd %xmm0, ALPHA
  208. movl %ebx, C
  209. movl OLD_LDC, LDC
  210. #ifdef TRMMKERNEL
  211. movss %xmm4, OFFSET
  212. movss %xmm4, KK
  213. #ifndef LEFT
  214. negl KK
  215. #endif
  216. #endif
  217. sall $ZBASE_SHIFT, LDC
  218. sarl $2, %eax
  219. movl %eax, J
  220. jle .L30
  221. ALIGN_2
  222. .L01:
  223. #if defined(TRMMKERNEL) && defined(LEFT)
  224. movl OFFSET, %eax
  225. movl %eax, KK
  226. #endif
  227. /* Copying to Sub Buffer */
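/* Each B value is stored twice in consecutive BUFFER slots so the compute   */
/* loops can load it with movapd as an already-duplicated pair.              */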
  228. movl K, %eax
  229. leal BUFFER, %ecx
  230. sarl $1, %eax
  231. jle .L05
  232. ALIGN_4
  233. .L02:
  234. #define COPYPREFETCH 40
  235. prefetchnta (COPYPREFETCH) * SIZE(%edi)
  236. movq 0 * SIZE(%edi), %mm0
  237. movq 1 * SIZE(%edi), %mm1
  238. movq 2 * SIZE(%edi), %mm2
  239. movq 3 * SIZE(%edi), %mm3
  240. movq 4 * SIZE(%edi), %mm4
  241. movq 5 * SIZE(%edi), %mm5
  242. movq 6 * SIZE(%edi), %mm6
  243. movq 7 * SIZE(%edi), %mm7
  244. movq %mm0, 0 * SIZE(%ecx)
  245. movq %mm0, 1 * SIZE(%ecx)
  246. movq %mm1, 2 * SIZE(%ecx)
  247. movq %mm1, 3 * SIZE(%ecx)
  248. movq %mm2, 4 * SIZE(%ecx)
  249. movq %mm2, 5 * SIZE(%ecx)
  250. movq %mm3, 6 * SIZE(%ecx)
  251. movq %mm3, 7 * SIZE(%ecx)
  252. movq %mm4, 8 * SIZE(%ecx)
  253. movq %mm4, 9 * SIZE(%ecx)
  254. movq %mm5, 10 * SIZE(%ecx)
  255. movq %mm5, 11 * SIZE(%ecx)
  256. movq %mm6, 12 * SIZE(%ecx)
  257. movq %mm6, 13 * SIZE(%ecx)
  258. movq %mm7, 14 * SIZE(%ecx)
  259. movq %mm7, 15 * SIZE(%ecx)
  260. addl $ 8 * SIZE, %edi
  261. addl $16 * SIZE, %ecx
  262. decl %eax
  263. jne .L02
  264. ALIGN_2
  265. .L05:
  266. movl K, %eax
  267. andl $1, %eax
  268. BRANCH
  269. jle .L10
  270. movq 0 * SIZE(%edi), %mm0
  271. movq 1 * SIZE(%edi), %mm1
  272. movq 2 * SIZE(%edi), %mm2
  273. movq 3 * SIZE(%edi), %mm3
  274. movq %mm0, 0 * SIZE(%ecx)
  275. movq %mm0, 1 * SIZE(%ecx)
  276. movq %mm1, 2 * SIZE(%ecx)
  277. movq %mm1, 3 * SIZE(%ecx)
  278. movq %mm2, 4 * SIZE(%ecx)
  279. movq %mm2, 5 * SIZE(%ecx)
  280. movq %mm3, 6 * SIZE(%ecx)
  281. movq %mm3, 7 * SIZE(%ecx)
  282. addl $4 * SIZE, %edi
  283. ALIGN_4
  284. .L10:
  285. movl %edi, BX
  286. movl C, %esi # coffset = c
  287. movl A, AA # aoffset = a
  288. movl M, %ebx
  290. sarl $1, %ebx # i = (m >> 1)
  290. jle .L20
  291. ALIGN_4
  292. .L11:
  293. #if !defined(TRMMKERNEL) || \
  294. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  295. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  296. leal BUFFER, BB
  297. #else
  298. leal BUFFER, BB
  299. movl KK, %eax
  300. leal (, %eax, SIZE), %eax
  301. leal (AA, %eax, 2), AA
  302. leal (BB, %eax, 8), BB
  303. #endif
  304. movl BX, %eax
  305. prefetchnta 0 * SIZE(%eax)
  306. prefetchnta 8 * SIZE(%eax)
  307. subl $-8 * SIZE, BX
  308. pxor %xmm4, %xmm4
  309. pxor %xmm5, %xmm5
  310. pxor %xmm6, %xmm6
  311. pxor %xmm7, %xmm7
  312. movapd 0 * SIZE(AA), %xmm0
  313. movapd 8 * SIZE(AA), %xmm1
  314. movapd 0 * SIZE(BB), %xmm2
  315. movapd 8 * SIZE(BB), %xmm3
  316. leal (LDC, LDC, 2), %eax
  317. prefetchw 1 * SIZE(%esi)
  318. prefetchw 1 * SIZE(%esi, LDC)
  319. prefetchw 1 * SIZE(%esi, LDC, 2)
  320. prefetchw 1 * SIZE(%esi, %eax)
  321. #ifndef TRMMKERNEL
  322. movl K, %eax
  323. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  324. movl K, %eax
  325. subl KK, %eax
  326. movl %eax, KKK
  327. #else
  328. movl KK, %eax
  329. #ifdef LEFT
  330. addl $2, %eax
  331. #else
  332. addl $4, %eax
  333. #endif
  334. movl %eax, KKK
  335. #endif
  336. #if 1
  337. andl $-8, %eax
  338. sall $4, %eax
  339. je .L15
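/* Fully unrolled path: %eax = (k & ~7) << 4, so each block of eight KERNEL  */
/* macros accounts for 128; the cmpl/jle pairs exit to .L12 once the count   */
/* is exhausted, and .L12 then advances AA/BB past the iterations just       */
/* processed (the KERNEL macros address via offsets instead of moving the    */
/* pointers).                                                                */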
  340. .L1X:
  341. KERNEL1(16 * 0)
  342. KERNEL2(16 * 0)
  343. KERNEL3(16 * 0)
  344. KERNEL4(16 * 0)
  345. KERNEL5(16 * 0)
  346. KERNEL6(16 * 0)
  347. KERNEL7(16 * 0)
  348. KERNEL8(16 * 0)
  349. cmpl $128 * 1, %eax
  350. jle .L12
  351. KERNEL1(16 * 1)
  352. KERNEL2(16 * 1)
  353. KERNEL3(16 * 1)
  354. KERNEL4(16 * 1)
  355. KERNEL5(16 * 1)
  356. KERNEL6(16 * 1)
  357. KERNEL7(16 * 1)
  358. KERNEL8(16 * 1)
  359. cmpl $128 * 2, %eax
  360. jle .L12
  361. KERNEL1(16 * 2)
  362. KERNEL2(16 * 2)
  363. KERNEL3(16 * 2)
  364. KERNEL4(16 * 2)
  365. KERNEL5(16 * 2)
  366. KERNEL6(16 * 2)
  367. KERNEL7(16 * 2)
  368. KERNEL8(16 * 2)
  369. cmpl $128 * 3, %eax
  370. jle .L12
  371. KERNEL1(16 * 3)
  372. KERNEL2(16 * 3)
  373. KERNEL3(16 * 3)
  374. KERNEL4(16 * 3)
  375. KERNEL5(16 * 3)
  376. KERNEL6(16 * 3)
  377. KERNEL7(16 * 3)
  378. KERNEL8(16 * 3)
  379. cmpl $128 * 4, %eax
  380. jle .L12
  381. KERNEL1(16 * 4)
  382. KERNEL2(16 * 4)
  383. KERNEL3(16 * 4)
  384. KERNEL4(16 * 4)
  385. KERNEL5(16 * 4)
  386. KERNEL6(16 * 4)
  387. KERNEL7(16 * 4)
  388. KERNEL8(16 * 4)
  389. cmpl $128 * 5, %eax
  390. jle .L12
  391. KERNEL1(16 * 5)
  392. KERNEL2(16 * 5)
  393. KERNEL3(16 * 5)
  394. KERNEL4(16 * 5)
  395. KERNEL5(16 * 5)
  396. KERNEL6(16 * 5)
  397. KERNEL7(16 * 5)
  398. KERNEL8(16 * 5)
  399. cmpl $128 * 6, %eax
  400. jle .L12
  401. KERNEL1(16 * 6)
  402. KERNEL2(16 * 6)
  403. KERNEL3(16 * 6)
  404. KERNEL4(16 * 6)
  405. KERNEL5(16 * 6)
  406. KERNEL6(16 * 6)
  407. KERNEL7(16 * 6)
  408. KERNEL8(16 * 6)
  409. cmpl $128 * 7, %eax
  410. jle .L12
  411. KERNEL1(16 * 7)
  412. KERNEL2(16 * 7)
  413. KERNEL3(16 * 7)
  414. KERNEL4(16 * 7)
  415. KERNEL5(16 * 7)
  416. KERNEL6(16 * 7)
  417. KERNEL7(16 * 7)
  418. KERNEL8(16 * 7)
  419. addl $128 * 4 * SIZE, BB
  420. addl $128 * 1 * SIZE, AA
  421. subl $128 * 8, %eax
  422. jg .L1X
  423. jmp .L15
  424. .L12:
  425. leal (AA, %eax, 1), AA
  426. leal (BB, %eax, 4), BB
  427. ALIGN_4
  428. #else
  429. sarl $3, %eax
  430. je .L15
  431. ALIGN_4
  432. .L12:
  433. KERNEL1(16 * 0)
  434. KERNEL2(16 * 0)
  435. KERNEL3(16 * 0)
  436. KERNEL4(16 * 0)
  437. KERNEL5(16 * 0)
  438. KERNEL6(16 * 0)
  439. KERNEL7(16 * 0)
  440. KERNEL8(16 * 0)
  441. addl $64 * SIZE, BB
  442. addl $16 * SIZE, AA
  443. decl %eax
  444. jne .L12
  445. ALIGN_4
  446. #endif
  447. .L15:
  448. #ifndef TRMMKERNEL
  449. movl K, %eax
  450. #else
  451. movl KKK, %eax
  452. #endif
  453. movaps ALPHA, %xmm3
  454. andl $7, %eax # if (k & 7)
  455. BRANCH
  456. je .L18
  457. ALIGN_3
  458. .L16:
  459. mulpd %xmm0, %xmm2
  460. addpd %xmm2, %xmm4
  461. movapd 2 * SIZE(BB), %xmm2
  462. mulpd %xmm0, %xmm2
  463. addpd %xmm2, %xmm5
  464. movapd 4 * SIZE(BB), %xmm2
  465. mulpd %xmm0, %xmm2
  466. mulpd 6 * SIZE(BB), %xmm0
  467. addpd %xmm2, %xmm6
  468. movapd 8 * SIZE(BB), %xmm2
  469. addpd %xmm0, %xmm7
  470. movapd 2 * SIZE(AA), %xmm0
  471. addl $2 * SIZE, AA
  472. addl $8 * SIZE, BB
  473. decl %eax
  474. jg .L16
  475. ALIGN_4
  476. .L18:
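/* Write-back of the 2x4 block: each accumulated real product is duplicated  */
/* (pshufd $0x44 / unpckhpd), scaled by the (alpha_r, alpha_i) pair held in  */
/* ALPHA, and added to the interleaved re/im elements of C.                  */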
  477. leal (LDC, LDC, 2), %eax
  478. movsd 0 * SIZE(%esi), %xmm0
  479. movhps 1 * SIZE(%esi), %xmm0
  480. movsd 2 * SIZE(%esi), %xmm1
  481. movhps 3 * SIZE(%esi), %xmm1
  482. pshufd $0x44, %xmm4, %xmm2
  483. unpckhpd %xmm4, %xmm4
  484. mulpd %xmm3, %xmm2
  485. addpd %xmm2, %xmm0
  486. mulpd %xmm3, %xmm4
  487. addpd %xmm4, %xmm1
  488. movlps %xmm0, 0 * SIZE(%esi)
  489. movhps %xmm0, 1 * SIZE(%esi)
  490. movlps %xmm1, 2 * SIZE(%esi)
  491. movhps %xmm1, 3 * SIZE(%esi)
  492. movsd 0 * SIZE(%esi, LDC), %xmm0
  493. movhps 1 * SIZE(%esi, LDC), %xmm0
  494. movsd 2 * SIZE(%esi, LDC), %xmm1
  495. movhps 3 * SIZE(%esi, LDC), %xmm1
  496. pshufd $0x44, %xmm5, %xmm2
  497. unpckhpd %xmm5, %xmm5
  498. mulpd %xmm3, %xmm2
  499. addpd %xmm2, %xmm0
  500. mulpd %xmm3, %xmm5
  501. addpd %xmm5, %xmm1
  502. movlps %xmm0, 0 * SIZE(%esi, LDC)
  503. movhps %xmm0, 1 * SIZE(%esi, LDC)
  504. movlps %xmm1, 2 * SIZE(%esi, LDC)
  505. movhps %xmm1, 3 * SIZE(%esi, LDC)
  506. movsd 0 * SIZE(%esi, LDC, 2), %xmm0
  507. movhps 1 * SIZE(%esi, LDC, 2), %xmm0
  508. movsd 2 * SIZE(%esi, LDC, 2), %xmm1
  509. movhps 3 * SIZE(%esi, LDC, 2), %xmm1
  510. pshufd $0x44, %xmm6, %xmm2
  511. unpckhpd %xmm6, %xmm6
  512. mulpd %xmm3, %xmm2
  513. addpd %xmm2, %xmm0
  514. mulpd %xmm3, %xmm6
  515. addpd %xmm6, %xmm1
  516. movlps %xmm0, 0 * SIZE(%esi, LDC, 2)
  517. movhps %xmm0, 1 * SIZE(%esi, LDC, 2)
  518. movlps %xmm1, 2 * SIZE(%esi, LDC, 2)
  519. movhps %xmm1, 3 * SIZE(%esi, LDC, 2)
  520. movsd 0 * SIZE(%esi, %eax), %xmm0
  521. movhps 1 * SIZE(%esi, %eax), %xmm0
  522. movsd 2 * SIZE(%esi, %eax), %xmm1
  523. movhps 3 * SIZE(%esi, %eax), %xmm1
  524. pshufd $0x44, %xmm7, %xmm2
  525. unpckhpd %xmm7, %xmm7
  526. mulpd %xmm3, %xmm2
  527. addpd %xmm2, %xmm0
  528. mulpd %xmm3, %xmm7
  529. addpd %xmm7, %xmm1
  530. movlps %xmm0, 0 * SIZE(%esi, %eax)
  531. movhps %xmm0, 1 * SIZE(%esi, %eax)
  532. movlps %xmm1, 2 * SIZE(%esi, %eax)
  533. movhps %xmm1, 3 * SIZE(%esi, %eax)
  534. addl $4 * SIZE, %esi # coffset += 2
  535. decl %ebx # i --
  536. jg .L11
  537. ALIGN_4
  538. .L20:
  539. movl M, %ebx
  540. testl $1, %ebx # if (m & 1)
  541. jle .L29
  542. #if !defined(TRMMKERNEL) || \
  543. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  544. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  545. leal BUFFER, BB
  546. #else
  547. leal BUFFER, BB
  548. movl KK, %eax
  549. leal (, %eax, SIZE), %eax
  550. leal (AA, %eax, 1), AA
  551. leal (BB, %eax, 8), BB
  552. #endif
  553. pxor %xmm4, %xmm4
  554. pxor %xmm5, %xmm5
  555. pxor %xmm6, %xmm6
  556. pxor %xmm7, %xmm7
  557. leal (LDC, LDC, 2), %eax
  558. movsd 0 * SIZE(AA), %xmm0
  559. movsd 4 * SIZE(AA), %xmm1
  560. movsd 0 * SIZE(BB), %xmm2
  561. movsd 8 * SIZE(BB), %xmm3
  562. #ifndef TRMMKERNEL
  563. movl K, %eax
  564. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  565. movl K, %eax
  566. subl KK, %eax
  567. movl %eax, KKK
  568. #else
  569. movl KK, %eax
  570. #ifdef LEFT
  571. addl $1, %eax
  572. #else
  573. addl $4, %eax
  574. #endif
  575. movl %eax, KKK
  576. #endif
  577. sarl $3, %eax
  578. je .L25
  579. ALIGN_4
  580. .L22:
  581. mulsd %xmm0, %xmm2
  582. addsd %xmm2, %xmm4
  583. #if defined(OPTERON) || defined(BARCELONA)
  584. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  585. #endif
  586. movsd 2 * SIZE(BB), %xmm2
  587. mulsd %xmm0, %xmm2
  588. addsd %xmm2, %xmm5
  589. movsd 4 * SIZE(BB), %xmm2
  590. mulsd %xmm0, %xmm2
  591. mulsd 6 * SIZE(BB), %xmm0
  592. addsd %xmm2, %xmm6
  593. movsd 16 * SIZE(BB), %xmm2
  594. addsd %xmm0, %xmm7
  595. movsd 1 * SIZE(AA), %xmm0
  596. mulsd %xmm0, %xmm3
  597. addsd %xmm3, %xmm4
  598. movsd 10 * SIZE(BB), %xmm3
  599. mulsd %xmm0, %xmm3
  600. addsd %xmm3, %xmm5
  601. movsd 12 * SIZE(BB), %xmm3
  602. mulsd %xmm0, %xmm3
  603. mulsd 14 * SIZE(BB), %xmm0
  604. addsd %xmm3, %xmm6
  605. movsd 24 * SIZE(BB), %xmm3
  606. addsd %xmm0, %xmm7
  607. movsd 2 * SIZE(AA), %xmm0
  608. mulsd %xmm0, %xmm2
  609. addsd %xmm2, %xmm4
  610. movsd 18 * SIZE(BB), %xmm2
  611. mulsd %xmm0, %xmm2
  612. addsd %xmm2, %xmm5
  613. movsd 20 * SIZE(BB), %xmm2
  614. mulsd %xmm0, %xmm2
  615. mulsd 22 * SIZE(BB), %xmm0
  616. addsd %xmm2, %xmm6
  617. movsd 32 * SIZE(BB), %xmm2
  618. addsd %xmm0, %xmm7
  619. movsd 3 * SIZE(AA), %xmm0
  620. mulsd %xmm0, %xmm3
  621. addsd %xmm3, %xmm4
  622. movsd 26 * SIZE(BB), %xmm3
  623. mulsd %xmm0, %xmm3
  624. addsd %xmm3, %xmm5
  625. movsd 28 * SIZE(BB), %xmm3
  626. mulsd %xmm0, %xmm3
  627. mulsd 30 * SIZE(BB), %xmm0
  628. addsd %xmm3, %xmm6
  629. movsd 40 * SIZE(BB), %xmm3
  630. addsd %xmm0, %xmm7
  631. movsd 8 * SIZE(AA), %xmm0
  632. #if defined(OPTERON) || defined(BARCELONA)
  633. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  634. #endif
  635. mulsd %xmm1, %xmm2
  636. addsd %xmm2, %xmm4
  637. movsd 34 * SIZE(BB), %xmm2
  638. mulsd %xmm1, %xmm2
  639. addsd %xmm2, %xmm5
  640. movsd 36 * SIZE(BB), %xmm2
  641. mulsd %xmm1, %xmm2
  642. mulsd 38 * SIZE(BB), %xmm1
  643. addsd %xmm2, %xmm6
  644. movsd 48 * SIZE(BB), %xmm2
  645. addsd %xmm1, %xmm7
  646. movsd 5 * SIZE(AA), %xmm1
  647. mulsd %xmm1, %xmm3
  648. addsd %xmm3, %xmm4
  649. movsd 42 * SIZE(BB), %xmm3
  650. mulsd %xmm1, %xmm3
  651. addsd %xmm3, %xmm5
  652. movsd 44 * SIZE(BB), %xmm3
  653. mulsd %xmm1, %xmm3
  654. mulsd 46 * SIZE(BB), %xmm1
  655. addsd %xmm3, %xmm6
  656. movsd 56 * SIZE(BB), %xmm3
  657. addsd %xmm1, %xmm7
  658. movsd 6 * SIZE(AA), %xmm1
  659. mulsd %xmm1, %xmm2
  660. addsd %xmm2, %xmm4
  661. movsd 50 * SIZE(BB), %xmm2
  662. mulsd %xmm1, %xmm2
  663. addsd %xmm2, %xmm5
  664. movsd 52 * SIZE(BB), %xmm2
  665. mulsd %xmm1, %xmm2
  666. mulsd 54 * SIZE(BB), %xmm1
  667. addsd %xmm2, %xmm6
  668. movsd 64 * SIZE(BB), %xmm2
  669. addsd %xmm1, %xmm7
  670. movsd 7 * SIZE(AA), %xmm1
  671. mulsd %xmm1, %xmm3
  672. addsd %xmm3, %xmm4
  673. movsd 58 * SIZE(BB), %xmm3
  674. mulsd %xmm1, %xmm3
  675. addsd %xmm3, %xmm5
  676. movsd 60 * SIZE(BB), %xmm3
  677. mulsd %xmm1, %xmm3
  678. mulsd 62 * SIZE(BB), %xmm1
  679. addsd %xmm3, %xmm6
  680. movsd 72 * SIZE(BB), %xmm3
  681. addl $64 * SIZE, BB
  682. addsd %xmm1, %xmm7
  683. movsd 12 * SIZE(AA), %xmm1
  684. addl $8 * SIZE, AA
  685. decl %eax
  686. jne .L22
  687. ALIGN_4
  688. .L25:
  689. #ifndef TRMMKERNEL
  690. movl K, %eax
  691. #else
  692. movl KKK, %eax
  693. #endif
  694. movaps ALPHA, %xmm3
  695. andl $7, %eax # if (k & 7)
  696. BRANCH
  697. je .L28
  698. .L26:
  699. mulsd %xmm0, %xmm2
  700. addsd %xmm2, %xmm4
  701. movsd 2 * SIZE(BB), %xmm2
  702. mulsd %xmm0, %xmm2
  703. addsd %xmm2, %xmm5
  704. movsd 4 * SIZE(BB), %xmm2
  705. mulsd %xmm0, %xmm2
  706. mulsd 6 * SIZE(BB), %xmm0
  707. addsd %xmm2, %xmm6
  708. movsd 8 * SIZE(BB), %xmm2
  709. addsd %xmm0, %xmm7
  710. movsd 1 * SIZE(AA), %xmm0
  711. addl $1 * SIZE, AA
  712. addl $8 * SIZE, BB
  713. decl %eax
  714. jg .L26
  715. ALIGN_4
  716. .L28:
  717. leal (LDC, LDC, 2), %eax
  718. movsd 0 * SIZE(%esi), %xmm0
  719. movhps 1 * SIZE(%esi), %xmm0
  720. movsd 0 * SIZE(%esi, LDC), %xmm1
  721. movhps 1 * SIZE(%esi, LDC), %xmm1
  722. unpcklpd %xmm4, %xmm4
  723. unpcklpd %xmm5, %xmm5
  724. mulpd %xmm3, %xmm4
  725. addpd %xmm4, %xmm0
  726. mulpd %xmm3, %xmm5
  727. addpd %xmm5, %xmm1
  728. movlps %xmm0, 0 * SIZE(%esi)
  729. movhps %xmm0, 1 * SIZE(%esi)
  730. movlps %xmm1, 0 * SIZE(%esi, LDC)
  731. movhps %xmm1, 1 * SIZE(%esi, LDC)
  732. movsd 0 * SIZE(%esi, LDC, 2), %xmm0
  733. movhps 1 * SIZE(%esi, LDC, 2), %xmm0
  734. movsd 0 * SIZE(%esi, %eax), %xmm1
  735. movhps 1 * SIZE(%esi, %eax), %xmm1
  736. unpcklpd %xmm6, %xmm6
  737. unpcklpd %xmm7, %xmm7
  738. mulpd %xmm3, %xmm6
  739. addpd %xmm6, %xmm0
  740. mulpd %xmm3, %xmm7
  741. addpd %xmm7, %xmm1
  742. movlps %xmm0, 0 * SIZE(%esi, LDC, 2)
  743. movhps %xmm0, 1 * SIZE(%esi, LDC, 2)
  744. movlps %xmm1, 0 * SIZE(%esi, %eax)
  745. movhps %xmm1, 1 * SIZE(%esi, %eax)
  746. ALIGN_4
  747. .L29:
  748. #if defined(TRMMKERNEL) && !defined(LEFT)
  749. addl $4, KK
  750. #endif
  751. leal (, LDC, 4), %eax
  752. addl %eax, C # c += 4 * ldc
  753. decl J # j --
  754. jg .L01
  755. ALIGN_4
  756. .L30:
  757. testl $2, N
  758. je .L60
  759. ALIGN_2
  760. .L31:
  761. #if defined(TRMMKERNEL) && defined(LEFT)
  762. movl OFFSET, %eax
  763. movl %eax, KK
  764. #endif
  765. /* Copying to Sub Buffer */
  766. movl K, %eax
  767. leal BUFFER, %ecx
  768. sarl $2, %eax
  769. jle .L35
  770. ALIGN_4
  771. .L32:
  772. #ifdef PENTIUM4
  773. #ifdef HAVE_SSE3
  774. movddup 0 * SIZE(%edi), %xmm0
  775. movddup 1 * SIZE(%edi), %xmm1
  776. movddup 2 * SIZE(%edi), %xmm2
  777. movddup 3 * SIZE(%edi), %xmm3
  778. movddup 4 * SIZE(%edi), %xmm4
  779. movddup 5 * SIZE(%edi), %xmm5
  780. movddup 6 * SIZE(%edi), %xmm6
  781. movddup 7 * SIZE(%edi), %xmm7
  782. movapd %xmm0, 0 * SIZE(%ecx)
  783. movapd %xmm1, 2 * SIZE(%ecx)
  784. movapd %xmm2, 4 * SIZE(%ecx)
  785. movapd %xmm3, 6 * SIZE(%ecx)
  786. movapd %xmm4, 8 * SIZE(%ecx)
  787. movapd %xmm5, 10 * SIZE(%ecx)
  788. movapd %xmm6, 12 * SIZE(%ecx)
  789. movapd %xmm7, 14 * SIZE(%ecx)
  790. #else
  791. movsd 0 * SIZE(%edi), %xmm0
  792. movsd 1 * SIZE(%edi), %xmm1
  793. movsd 2 * SIZE(%edi), %xmm2
  794. movsd 3 * SIZE(%edi), %xmm3
  795. movsd 4 * SIZE(%edi), %xmm4
  796. movsd 5 * SIZE(%edi), %xmm5
  797. movsd 6 * SIZE(%edi), %xmm6
  798. movsd 7 * SIZE(%edi), %xmm7
  799. unpcklpd %xmm0, %xmm0
  800. unpckhpd %xmm1, %xmm1
  801. unpcklpd %xmm2, %xmm2
  802. unpckhpd %xmm3, %xmm3
  803. unpcklpd %xmm4, %xmm4
  804. unpckhpd %xmm5, %xmm5
  805. unpcklpd %xmm6, %xmm6
  806. unpckhpd %xmm7, %xmm7
  807. movapd %xmm0, 0 * SIZE(%ecx)
  808. movapd %xmm1, 2 * SIZE(%ecx)
  809. movapd %xmm2, 4 * SIZE(%ecx)
  810. movapd %xmm3, 6 * SIZE(%ecx)
  811. movapd %xmm4, 8 * SIZE(%ecx)
  812. movapd %xmm5, 10 * SIZE(%ecx)
  813. movapd %xmm6, 12 * SIZE(%ecx)
  814. movapd %xmm7, 14 * SIZE(%ecx)
  815. #endif
  816. prefetcht0 80 * SIZE(%edi)
  817. prefetcht1 112 * SIZE(%ecx)
  818. #endif
  819. #if defined(OPTERON) || defined(BARCELONA)
  820. #define COPYPREFETCH 40
  821. prefetchnta (COPYPREFETCH) * SIZE(%edi)
  822. movq 0 * SIZE(%edi), %mm0
  823. movq 1 * SIZE(%edi), %mm1
  824. movq 2 * SIZE(%edi), %mm2
  825. movq 3 * SIZE(%edi), %mm3
  826. movq 4 * SIZE(%edi), %mm4
  827. movq 5 * SIZE(%edi), %mm5
  828. movq 6 * SIZE(%edi), %mm6
  829. movq 7 * SIZE(%edi), %mm7
  830. movq %mm0, 0 * SIZE(%ecx)
  831. movq %mm0, 1 * SIZE(%ecx)
  832. movq %mm1, 2 * SIZE(%ecx)
  833. movq %mm1, 3 * SIZE(%ecx)
  834. movq %mm2, 4 * SIZE(%ecx)
  835. movq %mm2, 5 * SIZE(%ecx)
  836. movq %mm3, 6 * SIZE(%ecx)
  837. movq %mm3, 7 * SIZE(%ecx)
  838. movq %mm4, 8 * SIZE(%ecx)
  839. movq %mm4, 9 * SIZE(%ecx)
  840. movq %mm5, 10 * SIZE(%ecx)
  841. movq %mm5, 11 * SIZE(%ecx)
  842. movq %mm6, 12 * SIZE(%ecx)
  843. movq %mm6, 13 * SIZE(%ecx)
  844. movq %mm7, 14 * SIZE(%ecx)
  845. movq %mm7, 15 * SIZE(%ecx)
  846. #endif
  847. addl $ 8 * SIZE, %edi
  848. addl $16 * SIZE, %ecx
  849. decl %eax
  850. jne .L32
  851. ALIGN_2
  852. .L35:
  853. movl K, %eax
  854. andl $3, %eax
  855. BRANCH
  856. jle .L40
  857. ALIGN_2
  858. .L36:
  859. #ifdef PENTIUM4
  860. #ifdef HAVE_SSE3
  861. movddup 0 * SIZE(%edi), %xmm0
  862. movddup 1 * SIZE(%edi), %xmm1
  863. movapd %xmm0, 0 * SIZE(%ecx)
  864. movapd %xmm1, 2 * SIZE(%ecx)
  865. #else
  866. movsd 0 * SIZE(%edi), %xmm0
  867. movsd 1 * SIZE(%edi), %xmm1
  868. unpcklpd %xmm0, %xmm0
  869. unpckhpd %xmm1, %xmm1
  870. movapd %xmm0, 0 * SIZE(%ecx)
  871. movapd %xmm1, 2 * SIZE(%ecx)
  872. #endif
  873. #endif
  874. #if defined(OPTERON) || defined(BARCELONA)
  875. movq 0 * SIZE(%edi), %mm0
  876. movq 1 * SIZE(%edi), %mm1
  877. movq %mm0, 0 * SIZE(%ecx)
  878. movq %mm0, 1 * SIZE(%ecx)
  879. movq %mm1, 2 * SIZE(%ecx)
  880. movq %mm1, 3 * SIZE(%ecx)
  881. #endif
  882. addl $2 * SIZE, %edi
  883. addl $4 * SIZE, %ecx
  884. decl %eax
  885. jne .L36
  886. ALIGN_4
  887. .L40:
  888. movl C, %esi # coffset = c
  889. movl A, AA # aoffset = a
  890. movl M, %ebx
  891. sarl $1, %ebx # i = (m >> 1)
  892. jle .L50
  893. ALIGN_4
  894. .L41:
  895. #if !defined(TRMMKERNEL) || \
  896. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  897. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  898. leal BUFFER, BB
  899. #else
  900. leal BUFFER, BB
  901. movl KK, %eax
  902. leal (, %eax, SIZE), %eax
  903. leal (AA, %eax, 2), AA
  904. leal (BB, %eax, 4), BB
  905. #endif
  906. pxor %xmm4, %xmm4
  907. pxor %xmm5, %xmm5
  908. pxor %xmm6, %xmm6
  909. pxor %xmm7, %xmm7
  910. movapd 0 * SIZE(AA), %xmm0
  911. movapd 8 * SIZE(AA), %xmm1
  912. movapd 0 * SIZE(BB), %xmm2
  913. movapd 8 * SIZE(BB), %xmm3
  914. #ifdef HAVE_3DNOW
  915. prefetchw 2 * SIZE(%esi)
  916. prefetchw 2 * SIZE(%esi, LDC)
  917. #endif
  918. #ifdef PENTIUM4
  919. prefetchnta 4 * SIZE(%esi)
  920. prefetchnta 4 * SIZE(%esi, LDC)
  921. #endif
  922. #ifndef TRMMKERNEL
  923. movl K, %eax
  924. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  925. movl K, %eax
  926. subl KK, %eax
  927. movl %eax, KKK
  928. #else
  929. movl KK, %eax
  930. #ifdef LEFT
  931. addl $2, %eax
  932. #else
  933. addl $2, %eax
  934. #endif
  935. movl %eax, KKK
  936. #endif
  937. sarl $3, %eax
  938. je .L45
  939. ALIGN_4
  940. .L42:
  941. mulpd %xmm0, %xmm2
  942. #if defined(OPTERON) || defined(BARCELONA)
  943. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  944. #endif
  945. mulpd 2 * SIZE(BB), %xmm0
  946. addpd %xmm2, %xmm4
  947. movapd 4 * SIZE(BB), %xmm2
  948. addpd %xmm0, %xmm5
  949. movapd 2 * SIZE(AA), %xmm0
  950. mulpd %xmm0, %xmm2
  951. mulpd 6 * SIZE(BB), %xmm0
  952. addpd %xmm2, %xmm6
  953. movapd 16 * SIZE(BB), %xmm2
  954. addpd %xmm0, %xmm7
  955. movapd 4 * SIZE(AA), %xmm0
  956. mulpd %xmm0, %xmm3
  957. mulpd 10 * SIZE(BB), %xmm0
  958. addpd %xmm3, %xmm4
  959. movapd 12 * SIZE(BB), %xmm3
  960. addpd %xmm0, %xmm5
  961. movapd 6 * SIZE(AA), %xmm0
  962. mulpd %xmm0, %xmm3
  963. mulpd 14 * SIZE(BB), %xmm0
  964. addpd %xmm3, %xmm6
  965. movapd 24 * SIZE(BB), %xmm3
  966. addpd %xmm0, %xmm7
  967. movapd 16 * SIZE(AA), %xmm0
  968. #if defined(OPTERON) || defined(BARCELONA)
  969. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  970. #endif
  971. mulpd %xmm1, %xmm2
  972. mulpd 18 * SIZE(BB), %xmm1
  973. addpd %xmm2, %xmm4
  974. movapd 20 * SIZE(BB), %xmm2
  975. addpd %xmm1, %xmm5
  976. movapd 10 * SIZE(AA), %xmm1
  977. mulpd %xmm1, %xmm2
  978. mulpd 22 * SIZE(BB), %xmm1
  979. addpd %xmm2, %xmm6
  980. movapd 32 * SIZE(BB), %xmm2
  981. addpd %xmm1, %xmm7
  982. movapd 12 * SIZE(AA), %xmm1
  983. mulpd %xmm1, %xmm3
  984. mulpd 26 * SIZE(BB), %xmm1
  985. addpd %xmm3, %xmm4
  986. movapd 28 * SIZE(BB), %xmm3
  987. addpd %xmm1, %xmm5
  988. movapd 14 * SIZE(AA), %xmm1
  989. mulpd %xmm1, %xmm3
  990. mulpd 30 * SIZE(BB), %xmm1
  991. addpd %xmm3, %xmm6
  992. movapd 40 * SIZE(BB), %xmm3
  993. addpd %xmm1, %xmm7
  994. movapd 24 * SIZE(AA), %xmm1
  995. addl $16 * SIZE, AA
  996. addl $32 * SIZE, BB
  997. decl %eax
  998. jne .L42
  999. ALIGN_4
  1000. .L45:
  1001. #ifndef TRMMKERNEL
  1002. movl K, %eax
  1003. #else
  1004. movl KKK, %eax
  1005. #endif
  1006. movaps ALPHA, %xmm3
  1007. andl $7, %eax # if (k & 7)
  1008. BRANCH
  1009. je .L48
  1010. ALIGN_3
  1011. .L46:
  1012. mulpd %xmm0, %xmm2
  1013. mulpd 2 * SIZE(BB), %xmm0
  1014. addpd %xmm2, %xmm4
  1015. movapd 4 * SIZE(BB), %xmm2
  1016. addpd %xmm0, %xmm5
  1017. movapd 2 * SIZE(AA), %xmm0
  1018. addl $2 * SIZE, AA
  1019. addl $4 * SIZE, BB
  1020. decl %eax
  1021. jg .L46
  1022. ALIGN_4
  1023. .L48:
  1024. addpd %xmm6, %xmm4
  1025. addpd %xmm7, %xmm5
  1026. movsd 0 * SIZE(%esi), %xmm0
  1027. movhps 1 * SIZE(%esi), %xmm0
  1028. movsd 2 * SIZE(%esi), %xmm1
  1029. movhps 3 * SIZE(%esi), %xmm1
  1030. pshufd $0x44, %xmm4, %xmm2
  1031. unpckhpd %xmm4, %xmm4
  1032. mulpd %xmm3, %xmm2
  1033. addpd %xmm2, %xmm0
  1034. mulpd %xmm3, %xmm4
  1035. addpd %xmm4, %xmm1
  1036. movlps %xmm0, 0 * SIZE(%esi)
  1037. movhps %xmm0, 1 * SIZE(%esi)
  1038. movlps %xmm1, 2 * SIZE(%esi)
  1039. movhps %xmm1, 3 * SIZE(%esi)
  1040. movsd 0 * SIZE(%esi, LDC), %xmm0
  1041. movhps 1 * SIZE(%esi, LDC), %xmm0
  1042. movsd 2 * SIZE(%esi, LDC), %xmm1
  1043. movhps 3 * SIZE(%esi, LDC), %xmm1
  1044. pshufd $0x44, %xmm5, %xmm2
  1045. unpckhpd %xmm5, %xmm5
  1046. mulpd %xmm3, %xmm2
  1047. addpd %xmm2, %xmm0
  1048. mulpd %xmm3, %xmm5
  1049. addpd %xmm5, %xmm1
  1050. movlps %xmm0, 0 * SIZE(%esi, LDC)
  1051. movhps %xmm0, 1 * SIZE(%esi, LDC)
  1052. movlps %xmm1, 2 * SIZE(%esi, LDC)
  1053. movhps %xmm1, 3 * SIZE(%esi, LDC)
  1054. addl $4 * SIZE, %esi # coffset += 2
  1055. decl %ebx # i --
  1056. jg .L41
  1057. ALIGN_4
  1058. .L50:
  1059. movl M, %ebx
  1060. testl $1, %ebx # if (m & 1)
  1061. jle .L59
  1062. #if !defined(TRMMKERNEL) || \
  1063. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1064. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1065. leal BUFFER, BB
  1066. #else
  1067. leal BUFFER, BB
  1068. movl KK, %eax
  1069. leal (, %eax, SIZE), %eax
  1070. leal (AA, %eax, 1), AA
  1071. leal (BB, %eax, 4), BB
  1072. #endif
  1073. pxor %xmm4, %xmm4
  1074. pxor %xmm5, %xmm5
  1075. pxor %xmm6, %xmm6
  1076. pxor %xmm7, %xmm7
  1077. leal (LDC, LDC, 2), %eax
  1078. movsd 0 * SIZE(AA), %xmm0
  1079. movsd 4 * SIZE(AA), %xmm1
  1080. movsd 0 * SIZE(BB), %xmm2
  1081. movsd 8 * SIZE(BB), %xmm3
  1082. #ifndef TRMMKERNEL
  1083. movl K, %eax
  1084. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1085. movl K, %eax
  1086. subl KK, %eax
  1087. movl %eax, KKK
  1088. #else
  1089. movl KK, %eax
  1090. #ifdef LEFT
  1091. addl $1, %eax
  1092. #else
  1093. addl $2, %eax
  1094. #endif
  1095. movl %eax, KKK
  1096. #endif
  1097. sarl $3, %eax
  1098. je .L55
  1099. ALIGN_4
  1100. .L52:
  1101. mulsd %xmm0, %xmm2
  1102. #if defined(OPTERON) || defined(BARCELONA)
  1103. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1104. #endif
  1105. mulsd 2 * SIZE(BB), %xmm0
  1106. addsd %xmm2, %xmm4
  1107. movsd 4 * SIZE(BB), %xmm2
  1108. addsd %xmm0, %xmm5
  1109. movsd 1 * SIZE(AA), %xmm0
  1110. mulsd %xmm0, %xmm2
  1111. mulsd 6 * SIZE(BB), %xmm0
  1112. addsd %xmm2, %xmm6
  1113. movsd 16 * SIZE(BB), %xmm2
  1114. addsd %xmm0, %xmm7
  1115. movsd 2 * SIZE(AA), %xmm0
  1116. mulsd %xmm0, %xmm3
  1117. mulsd 10 * SIZE(BB), %xmm0
  1118. addsd %xmm3, %xmm4
  1119. movsd 12 * SIZE(BB), %xmm3
  1120. addsd %xmm0, %xmm5
  1121. movsd 3 * SIZE(AA), %xmm0
  1122. mulsd %xmm0, %xmm3
  1123. mulsd 14 * SIZE(BB), %xmm0
  1124. addsd %xmm3, %xmm6
  1125. movsd 24 * SIZE(BB), %xmm3
  1126. addsd %xmm0, %xmm7
  1127. movsd 8 * SIZE(AA), %xmm0
  1128. mulsd %xmm1, %xmm2
  1129. mulsd 18 * SIZE(BB), %xmm1
  1130. addsd %xmm2, %xmm4
  1131. movsd 20 * SIZE(BB), %xmm2
  1132. addsd %xmm1, %xmm5
  1133. movsd 5 * SIZE(AA), %xmm1
  1134. mulsd %xmm1, %xmm2
  1135. mulsd 22 * SIZE(BB), %xmm1
  1136. addsd %xmm2, %xmm6
  1137. movsd 32 * SIZE(BB), %xmm2
  1138. addsd %xmm1, %xmm7
  1139. movsd 6 * SIZE(AA), %xmm1
  1140. mulsd %xmm1, %xmm3
  1141. mulsd 26 * SIZE(BB), %xmm1
  1142. addsd %xmm3, %xmm4
  1143. movsd 28 * SIZE(BB), %xmm3
  1144. addsd %xmm1, %xmm5
  1145. movsd 7 * SIZE(AA), %xmm1
  1146. mulsd %xmm1, %xmm3
  1147. mulsd 30 * SIZE(BB), %xmm1
  1148. addsd %xmm3, %xmm6
  1149. movsd 40 * SIZE(BB), %xmm3
  1150. addsd %xmm1, %xmm7
  1151. movsd 12 * SIZE(AA), %xmm1
  1152. addl $ 8 * SIZE, AA
  1153. addl $32 * SIZE, BB
  1154. decl %eax
  1155. jne .L52
  1156. ALIGN_4
  1157. .L55:
  1158. #ifndef TRMMKERNEL
  1159. movl K, %eax
  1160. #else
  1161. movl KKK, %eax
  1162. #endif
  1163. movaps ALPHA, %xmm3
  1164. andl $7, %eax # if (k & 7)
  1165. BRANCH
  1166. je .L58
  1167. .L56:
  1168. mulsd %xmm0, %xmm2
  1169. mulsd 2 * SIZE(BB), %xmm0
  1170. addsd %xmm2, %xmm4
  1171. movsd 4 * SIZE(BB), %xmm2
  1172. addsd %xmm0, %xmm5
  1173. movsd 1 * SIZE(AA), %xmm0
  1174. addl $1 * SIZE, AA
  1175. addl $4 * SIZE, BB
  1176. decl %eax
  1177. jg .L56
  1178. ALIGN_4
  1179. .L58:
  1180. addsd %xmm6, %xmm4
  1181. addsd %xmm7, %xmm5
  1182. movsd 0 * SIZE(%esi), %xmm0
  1183. movhps 1 * SIZE(%esi), %xmm0
  1184. movsd 0 * SIZE(%esi, LDC), %xmm1
  1185. movhps 1 * SIZE(%esi, LDC), %xmm1
  1186. unpcklpd %xmm4, %xmm4
  1187. unpcklpd %xmm5, %xmm5
  1188. mulpd %xmm3, %xmm4
  1189. addpd %xmm4, %xmm0
  1190. mulpd %xmm3, %xmm5
  1191. addpd %xmm5, %xmm1
  1192. movlps %xmm0, 0 * SIZE(%esi)
  1193. movhps %xmm0, 1 * SIZE(%esi)
  1194. movlps %xmm1, 0 * SIZE(%esi, LDC)
  1195. movhps %xmm1, 1 * SIZE(%esi, LDC)
  1196. ALIGN_4
  1197. .L59:
  1198. #if defined(TRMMKERNEL) && !defined(LEFT)
  1199. addl $2, KK
  1200. #endif
  1201. leal (, LDC, 2), %eax
  1202. addl %eax, C # c += 2 * ldc
  1203. ALIGN_4
  1204. .L60:
  1205. testl $1, N
  1206. je .L999
  1207. #if defined(TRMMKERNEL) && defined(LEFT)
  1208. movl OFFSET, %eax
  1209. movl %eax, KK
  1210. #endif
  1211. movl K, %eax
  1212. leal BUFFER, %ecx
  1213. sarl $3, %eax
  1214. jle .L65
  1215. ALIGN_4
  1216. .L62:
  1217. #ifdef PENTIUM4
  1218. #ifdef HAVE_SSE3
  1219. movddup 0 * SIZE(%edi), %xmm0
  1220. movddup 1 * SIZE(%edi), %xmm1
  1221. movddup 2 * SIZE(%edi), %xmm2
  1222. movddup 3 * SIZE(%edi), %xmm3
  1223. movddup 4 * SIZE(%edi), %xmm4
  1224. movddup 5 * SIZE(%edi), %xmm5
  1225. movddup 6 * SIZE(%edi), %xmm6
  1226. movddup 7 * SIZE(%edi), %xmm7
  1227. movapd %xmm0, 0 * SIZE(%ecx)
  1228. movapd %xmm1, 2 * SIZE(%ecx)
  1229. movapd %xmm2, 4 * SIZE(%ecx)
  1230. movapd %xmm3, 6 * SIZE(%ecx)
  1231. movapd %xmm4, 8 * SIZE(%ecx)
  1232. movapd %xmm5, 10 * SIZE(%ecx)
  1233. movapd %xmm6, 12 * SIZE(%ecx)
  1234. movapd %xmm7, 14 * SIZE(%ecx)
  1235. #else
  1236. movsd 0 * SIZE(%edi), %xmm0
  1237. movsd 1 * SIZE(%edi), %xmm1
  1238. movsd 2 * SIZE(%edi), %xmm2
  1239. movsd 3 * SIZE(%edi), %xmm3
  1240. movsd 4 * SIZE(%edi), %xmm4
  1241. movsd 5 * SIZE(%edi), %xmm5
  1242. movsd 6 * SIZE(%edi), %xmm6
  1243. movsd 7 * SIZE(%edi), %xmm7
  1244. unpcklpd %xmm0, %xmm0
  1245. unpckhpd %xmm1, %xmm1
  1246. unpcklpd %xmm2, %xmm2
  1247. unpckhpd %xmm3, %xmm3
  1248. unpcklpd %xmm4, %xmm4
  1249. unpckhpd %xmm5, %xmm5
  1250. unpcklpd %xmm6, %xmm6
  1251. unpckhpd %xmm7, %xmm7
  1252. movapd %xmm0, 0 * SIZE(%ecx)
  1253. movapd %xmm1, 2 * SIZE(%ecx)
  1254. movapd %xmm2, 4 * SIZE(%ecx)
  1255. movapd %xmm3, 6 * SIZE(%ecx)
  1256. movapd %xmm4, 8 * SIZE(%ecx)
  1257. movapd %xmm5, 10 * SIZE(%ecx)
  1258. movapd %xmm6, 12 * SIZE(%ecx)
  1259. movapd %xmm7, 14 * SIZE(%ecx)
  1260. #endif
  1261. prefetcht1 80 * SIZE(%edi)
  1262. prefetcht0 112 * SIZE(%ecx)
  1263. #endif
  1264. #if defined(OPTERON) || defined(BARCELONA)
  1265. #define COPYPREFETCH 40
  1266. prefetchnta (COPYPREFETCH) * SIZE(%edi)
  1267. movq 0 * SIZE(%edi), %mm0
  1268. movq 1 * SIZE(%edi), %mm1
  1269. movq 2 * SIZE(%edi), %mm2
  1270. movq 3 * SIZE(%edi), %mm3
  1271. movq 4 * SIZE(%edi), %mm4
  1272. movq 5 * SIZE(%edi), %mm5
  1273. movq 6 * SIZE(%edi), %mm6
  1274. movq 7 * SIZE(%edi), %mm7
  1275. movq %mm0, 0 * SIZE(%ecx)
  1276. movq %mm0, 1 * SIZE(%ecx)
  1277. movq %mm1, 2 * SIZE(%ecx)
  1278. movq %mm1, 3 * SIZE(%ecx)
  1279. movq %mm2, 4 * SIZE(%ecx)
  1280. movq %mm2, 5 * SIZE(%ecx)
  1281. movq %mm3, 6 * SIZE(%ecx)
  1282. movq %mm3, 7 * SIZE(%ecx)
  1283. movq %mm4, 8 * SIZE(%ecx)
  1284. movq %mm4, 9 * SIZE(%ecx)
  1285. movq %mm5, 10 * SIZE(%ecx)
  1286. movq %mm5, 11 * SIZE(%ecx)
  1287. movq %mm6, 12 * SIZE(%ecx)
  1288. movq %mm6, 13 * SIZE(%ecx)
  1289. movq %mm7, 14 * SIZE(%ecx)
  1290. movq %mm7, 15 * SIZE(%ecx)
  1291. #endif
  1292. addl $ 8 * SIZE, %edi
  1293. addl $16 * SIZE, %ecx
  1294. decl %eax
  1295. jne .L62
  1296. ALIGN_2
  1297. .L65:
  1298. movl K, %eax
  1299. andl $7, %eax
  1300. BRANCH
  1301. jle .L70
  1302. ALIGN_2
  1303. .L66:
  1304. #ifdef PENTIUM4
  1305. #ifdef HAVE_SSE3
  1306. movddup 0 * SIZE(%edi), %xmm0
  1307. movapd %xmm0, 0 * SIZE(%ecx)
  1308. #else
  1309. movsd 0 * SIZE(%edi), %xmm0
  1310. unpcklpd %xmm0, %xmm0
  1311. movapd %xmm0, 0 * SIZE(%ecx)
  1312. #endif
  1313. #endif
  1314. #if defined(OPTERON) || defined(BARCELONA)
  1315. movq 0 * SIZE(%edi), %mm0
  1316. movq %mm0, 0 * SIZE(%ecx)
  1317. movq %mm0, 1 * SIZE(%ecx)
  1318. #endif
  1319. addl $1 * SIZE, %edi
  1320. addl $2 * SIZE, %ecx
  1321. decl %eax
  1322. jne .L66
  1323. ALIGN_4
  1324. .L70:
  1325. movl C, %esi # coffset = c
  1326. movl A, AA # aoffset = a
  1327. movl M, %ebx
  1328. sarl $1, %ebx # i = (m >> 1)
  1329. jle .L80
  1330. ALIGN_4
  1331. .L71:
  1332. #if !defined(TRMMKERNEL) || \
  1333. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1334. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1335. leal BUFFER, BB
  1336. #else
  1337. leal BUFFER, BB
  1338. movl KK, %eax
  1339. leal (, %eax, SIZE), %eax
  1340. leal (AA, %eax, 2), AA
  1341. leal (BB, %eax, 2), BB
  1342. #endif
  1343. pxor %xmm4, %xmm4
  1344. pxor %xmm5, %xmm5
  1345. pxor %xmm6, %xmm6
  1346. pxor %xmm7, %xmm7
  1347. movapd 0 * SIZE(AA), %xmm0
  1348. movapd 8 * SIZE(AA), %xmm1
  1349. movapd 0 * SIZE(BB), %xmm2
  1350. movapd 8 * SIZE(BB), %xmm3
  1351. #ifdef HAVE_3DNOW
  1352. prefetchw 2 * SIZE(%esi)
  1353. #endif
  1354. #ifdef PENTIUM4
  1355. prefetchnta 2 * SIZE(%esi)
  1356. #endif
  1357. #ifndef TRMMKERNEL
  1358. movl K, %eax
  1359. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1360. movl K, %eax
  1361. subl KK, %eax
  1362. movl %eax, KKK
  1363. #else
  1364. movl KK, %eax
  1365. #ifdef LEFT
  1366. addl $2, %eax
  1367. #else
  1368. addl $1, %eax
  1369. #endif
  1370. movl %eax, KKK
  1371. #endif
  1372. sarl $3, %eax
  1373. je .L75
  1374. ALIGN_4
  1375. .L72:
  1376. mulpd %xmm0, %xmm2
  1377. addpd %xmm2, %xmm4
  1378. #if defined(OPTERON) || defined(BARCELONA)
  1379. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1380. #endif
  1381. movapd 16 * SIZE(BB), %xmm2
  1382. movapd 2 * SIZE(AA), %xmm0
  1383. mulpd 2 * SIZE(BB), %xmm0
  1384. addpd %xmm0, %xmm4
  1385. movapd 4 * SIZE(AA), %xmm0
  1386. mulpd 4 * SIZE(BB), %xmm0
  1387. addpd %xmm0, %xmm4
  1388. movapd 6 * SIZE(AA), %xmm0
  1389. mulpd 6 * SIZE(BB), %xmm0
  1390. addpd %xmm0, %xmm4
  1391. movapd 16 * SIZE(AA), %xmm0
  1392. #if defined(OPTERON) || defined(BARCELONA)
  1393. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  1394. #endif
  1395. mulpd %xmm1, %xmm3
  1396. addpd %xmm3, %xmm4
  1397. movapd 24 * SIZE(BB), %xmm3
  1398. movapd 10 * SIZE(AA), %xmm1
  1399. mulpd 10 * SIZE(BB), %xmm1
  1400. addpd %xmm1, %xmm4
  1401. movapd 12 * SIZE(AA), %xmm1
  1402. mulpd 12 * SIZE(BB), %xmm1
  1403. addpd %xmm1, %xmm4
  1404. movapd 14 * SIZE(AA), %xmm1
  1405. mulpd 14 * SIZE(BB), %xmm1
  1406. addpd %xmm1, %xmm4
  1407. movapd 24 * SIZE(AA), %xmm1
  1408. addl $16 * SIZE, AA
  1409. addl $16 * SIZE, BB
  1410. decl %eax
  1411. jne .L72
  1412. ALIGN_4
  1413. .L75:
  1414. #ifndef TRMMKERNEL
  1415. movl K, %eax
  1416. #else
  1417. movl KKK, %eax
  1418. #endif
  1419. movaps ALPHA, %xmm3
  1420. andl $7, %eax # if (k & 7)
  1421. BRANCH
  1422. je .L78
  1423. ALIGN_3
  1424. .L76:
  1425. mulpd %xmm0, %xmm2
  1426. addpd %xmm2, %xmm4
  1427. movapd 2 * SIZE(AA), %xmm0
  1428. movapd 2 * SIZE(BB), %xmm2
  1429. addl $2 * SIZE, AA
  1430. addl $2 * SIZE, BB
  1431. decl %eax
  1432. jg .L76
  1433. ALIGN_4
  1434. .L78:
  1435. movsd 0 * SIZE(%esi), %xmm0
  1436. movhps 1 * SIZE(%esi), %xmm0
  1437. movsd 2 * SIZE(%esi), %xmm1
  1438. movhps 3 * SIZE(%esi), %xmm1
  1439. pshufd $0x44, %xmm4, %xmm2
  1440. unpckhpd %xmm4, %xmm4
  1441. mulpd %xmm3, %xmm2
  1442. addpd %xmm2, %xmm0
  1443. mulpd %xmm3, %xmm4
  1444. addpd %xmm4, %xmm1
  1445. movlps %xmm0, 0 * SIZE(%esi)
  1446. movhps %xmm0, 1 * SIZE(%esi)
  1447. movlps %xmm1, 2 * SIZE(%esi)
  1448. movhps %xmm1, 3 * SIZE(%esi)
  1449. addl $4 * SIZE, %esi # coffset += 2
  1450. decl %ebx # i --
  1451. jg .L71
  1452. ALIGN_4
  1453. .L80:
  1454. movl M, %ebx
  1455. testl $1, %ebx # if (m & 1)
  1456. jle .L999
  1457. #if !defined(TRMMKERNEL) || \
  1458. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1459. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1460. leal BUFFER, BB
  1461. #else
  1462. leal BUFFER, BB
  1463. movl KK, %eax
  1464. leal (, %eax, SIZE), %eax
  1465. leal (AA, %eax, 1), AA
  1466. leal (BB, %eax, 2), BB
  1467. #endif
  1468. pxor %xmm4, %xmm4
  1469. pxor %xmm5, %xmm5
  1470. pxor %xmm6, %xmm6
  1471. pxor %xmm7, %xmm7
  1472. leal (LDC, LDC, 2), %eax
  1473. movsd 0 * SIZE(AA), %xmm0
  1474. movsd 4 * SIZE(AA), %xmm1
  1475. movsd 0 * SIZE(BB), %xmm2
  1476. movsd 8 * SIZE(BB), %xmm3
  1477. #ifndef TRMMKERNEL
  1478. movl K, %eax
  1479. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1480. movl K, %eax
  1481. subl KK, %eax
  1482. movl %eax, KKK
  1483. #else
  1484. movl KK, %eax
  1485. #ifdef LEFT
  1486. addl $1, %eax
  1487. #else
  1488. addl $1, %eax
  1489. #endif
  1490. movl %eax, KKK
  1491. #endif
  1492. sarl $3, %eax
  1493. je .L85
  1494. ALIGN_4
  1495. .L82:
  1496. mulsd %xmm0, %xmm2
  1497. #if defined(OPTERON) || defined(BARCELONA)
  1498. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1499. #endif
  1500. movsd 1 * SIZE(AA), %xmm0
  1501. mulsd 2 * SIZE(BB), %xmm0
  1502. addsd %xmm2, %xmm4
  1503. movsd 16 * SIZE(BB), %xmm2
  1504. addsd %xmm0, %xmm5
  1505. movsd 2 * SIZE(AA), %xmm0
  1506. mulsd 4 * SIZE(BB), %xmm0
  1507. addsd %xmm0, %xmm6
  1508. movsd 3 * SIZE(AA), %xmm0
  1509. mulsd 6 * SIZE(BB), %xmm0
  1510. addsd %xmm0, %xmm7
  1511. movsd 8 * SIZE(AA), %xmm0
  1512. mulsd %xmm1, %xmm3
  1513. movsd 5 * SIZE(AA), %xmm1
  1514. mulsd 10 * SIZE(BB), %xmm1
  1515. addsd %xmm3, %xmm4
  1516. movsd 24 * SIZE(BB), %xmm3
  1517. addsd %xmm1, %xmm5
  1518. movsd 6 * SIZE(AA), %xmm1
  1519. mulsd 12 * SIZE(BB), %xmm1
  1520. addsd %xmm1, %xmm6
  1521. movsd 7 * SIZE(AA), %xmm1
  1522. mulsd 14 * SIZE(BB), %xmm1
  1523. addsd %xmm1, %xmm7
  1524. movsd 12 * SIZE(AA), %xmm1
  1525. addl $ 8 * SIZE, AA
  1526. addl $16 * SIZE, BB
  1527. decl %eax
  1528. jne .L82
  1529. ALIGN_4
  1530. .L85:
  1531. #ifndef TRMMKERNEL
  1532. movl K, %eax
  1533. #else
  1534. movl KKK, %eax
  1535. #endif
  1536. movaps ALPHA, %xmm3
  1537. andl $7, %eax # if (k & 7)
  1538. BRANCH
  1539. je .L88
  1540. .L86:
  1541. mulsd %xmm0, %xmm2
  1542. addsd %xmm2, %xmm4
  1543. movsd 2 * SIZE(BB), %xmm2
  1544. movsd 1 * SIZE(AA), %xmm0
  1545. addl $1 * SIZE, AA
  1546. addl $2 * SIZE, BB
  1547. decl %eax
  1548. jg .L86
  1549. ALIGN_4
  1550. .L88:
  1551. addsd %xmm5, %xmm4
  1552. addsd %xmm7, %xmm6
  1553. addsd %xmm6, %xmm4
  1554. movsd 0 * SIZE(%esi), %xmm0
  1555. movhps 1 * SIZE(%esi), %xmm0
  1556. pshufd $0x44, %xmm4, %xmm2
  1557. unpckhpd %xmm4, %xmm4
  1558. mulpd %xmm3, %xmm2
  1559. addpd %xmm2, %xmm0
  1560. movlps %xmm0, 0 * SIZE(%esi)
  1561. movhps %xmm0, 1 * SIZE(%esi)
  1562. ALIGN_4
  1563. .L999:
  1564. movl OLD_STACK, %esp
  1565. EMMS
  1566. popl %ebx
  1567. popl %esi
  1568. popl %edi
  1569. popl %ebp
  1570. ret
  1571. EPILOGUE