zgemm_kernel_2x2_penryn.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define OLD_M %rdi
#define OLD_N %rsi
#define OLD_K %rdx

#define M %r13
#define N %r14
#define K %r15
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10

#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %rbx
#define CO2 %rbp
#define BB %r12
#define PREA %rdx
#ifndef WINDOWS_ABI

#define STACKSIZE 128

#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)

#define ALPHA_R 48(%rsp)
#define ALPHA_I 56(%rsp)
#define J 64(%rsp)
#define OFFSET 72(%rsp)
#define KK 80(%rsp)
#define KKK 88(%rsp)

#else

#define STACKSIZE 512

#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)

#define ALPHA_R 224(%rsp)
#define ALPHA_I 232(%rsp)
#define J 240(%rsp)
#define OFFSET 248(%rsp)
#define KK 256(%rsp)
#define KKK 264(%rsp)

#endif
#ifdef NANO
#define PREFETCHSIZE (8 * 2 + 4)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifdef DUNNINGTON
#define PREFETCHSIZE (8 * 81 + 4)
#endif

#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif

#ifndef PREFETCHW
#define PREFETCHW prefetcht2
#endif

#ifndef PREFETCHB
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCHSIZE
#define PREFETCHSIZE (8 * 17 + 4)
#endif
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1 addpd
#define ADD2 addpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1 addpd
#define ADD2 addpd
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1 addpd
#define ADD2 addpd
#else
#define ADD1 addpd
#define ADD2 subpd
#endif
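
/* ADD1/ADD2 choose the accumulate instruction per conjugation variant:
   all variants accumulate with addpd except RR/RC/CR/CC, where the
   second partial product is subtracted.  Together with the sign-flip
   masks applied before the final reduction below, this appears to
   realize the four real/imaginary sign combinations of conjugation. */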
        PROLOGUE
        PROFCODE

        subq $STACKSIZE, %rsp

        movq %rbx, 0(%rsp)
        movq %rbp, 8(%rsp)
        movq %r12, 16(%rsp)
        movq %r13, 24(%rsp)
        movq %r14, 32(%rsp)
        movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
        movq %rdi, 48(%rsp)
        movq %rsi, 56(%rsp)
        movups %xmm6, 64(%rsp)
        movups %xmm7, 80(%rsp)
        movups %xmm8, 96(%rsp)
        movups %xmm9, 112(%rsp)
        movups %xmm10, 128(%rsp)
        movups %xmm11, 144(%rsp)
        movups %xmm12, 160(%rsp)
        movups %xmm13, 176(%rsp)
        movups %xmm14, 192(%rsp)
        movups %xmm15, 208(%rsp)
        movq ARG1, OLD_M
        movq ARG2, OLD_N
        movq ARG3, OLD_K
        movq OLD_A, A
        movq OLD_B, B
        movq OLD_C, C
        movq OLD_LDC, LDC
#ifdef TRMMKERNEL
        movq OLD_OFFSET, %r11
#endif
        movaps %xmm3, %xmm0
        movsd OLD_ALPHA_I, %xmm1
#else
        movq OLD_LDC, LDC
#ifdef TRMMKERNEL
        movq OLD_OFFSET, %r11
#endif
#endif
        movlps %xmm0, ALPHA_R
        movlps %xmm1, ALPHA_I

        subq $-16 * SIZE, A
        subq $-17 * SIZE, B

        movq OLD_M, M
        movq OLD_N, N
        movq OLD_K, K

        salq $ZBASE_SHIFT, LDC

#ifdef TRMMKERNEL
        movq %r11, OFFSET
#ifndef LEFT
        negq %r11
#endif
        movq %r11, KK
#endif

        movq N, J
        sarq $1, J
        NOBRANCH
        jle .L40
        ALIGN_4
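
/* .L01: outer loop over column pairs of C (J = N >> 1).  CO1/CO2 point
   at the two output columns; BB runs ahead of B for prefetching. */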
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movq OFFSET, %rax
        movq %rax, KK
#endif

        movq C, CO1
        leaq (C, LDC, 1), CO2
        movq A, AO

        movq K, %rax
        salq $ZBASE_SHIFT + 1, %rax
        leaq (B, %rax), BB

        movq M, I
        sarq $1, I
        NOBRANCH
        jle .L20
        ALIGN_4
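
/* .L11: 2x2 micro-kernel.  Each iteration computes a 2x2 tile of C,
   accumulating the real/imaginary partial sums in %xmm8..%xmm15. */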
.L11:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        movq B, BO
#else
        movq B, BO

        movq KK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 2), BO
#endif

        movaps -16 * SIZE(AO), %xmm0
        xorpd %xmm3, %xmm3
        movaps -14 * SIZE(AO), %xmm1
        xorpd %xmm4, %xmm4
        movaps -17 * SIZE(BO), %xmm2

        PREFETCHB -16 * SIZE(BB)

        xorps %xmm5, %xmm5
        xorps %xmm6, %xmm6

        movaps %xmm4, %xmm8
        movaps %xmm4, %xmm9
        PREFETCHW 3 * SIZE(CO1)
        movaps %xmm4, %xmm10
        movaps %xmm4, %xmm11
        movaps %xmm4, %xmm12
        movaps %xmm4, %xmm13
        PREFETCHW 3 * SIZE(CO2)
        movaps %xmm4, %xmm14
        movaps %xmm4, %xmm15

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $2, %rax
#else
        addq $2, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L15
        ALIGN_3
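
/* .L12: main inner loop over k, unrolled four times.  Each step loads
   two complex elements from A (%xmm0/%xmm1) and one complex row entry
   of B, swaps real/imaginary halves with pshufd $0x4e to form the
   cross terms, and accumulates the four partial products via ADD1/ADD2. */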
.L12:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

        ADD1 %xmm3, %xmm12
        movaps -15 * SIZE(BO), %xmm3
        ADD1 %xmm4, %xmm14
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4

        ADD2 %xmm5, %xmm13
        ADD2 %xmm6, %xmm15
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        mulpd %xmm1, %xmm6

        ADD1 %xmm2, %xmm8
        movaps -13 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm10
        movaps %xmm3, %xmm4
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        mulpd %xmm1, %xmm4

        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm11
        movaps %xmm5, %xmm6
        mulpd %xmm0, %xmm5
        movaps -12 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -10 * SIZE(AO), %xmm1

        ADD1 %xmm3, %xmm12
        movaps -11 * SIZE(BO), %xmm3
        ADD1 %xmm4, %xmm14
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4

        ADD2 %xmm5, %xmm13
        ADD2 %xmm6, %xmm15
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        mulpd %xmm1, %xmm6

        ADD1 %xmm2, %xmm8
        movaps -9 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm10
        movaps %xmm3, %xmm4
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        mulpd %xmm1, %xmm4

        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm11
        movaps %xmm5, %xmm6
        mulpd %xmm0, %xmm5
        movaps -8 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -6 * SIZE(AO), %xmm1

        ADD1 %xmm3, %xmm12
        movaps -7 * SIZE(BO), %xmm3
        ADD1 %xmm4, %xmm14
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4

        ADD2 %xmm5, %xmm13
        ADD2 %xmm6, %xmm15
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        mulpd %xmm1, %xmm6

        PADDING
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)

        ADD1 %xmm2, %xmm8
        movaps -5 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm10
        movaps %xmm3, %xmm4
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        mulpd %xmm1, %xmm4

        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm11
        movaps %xmm5, %xmm6
        mulpd %xmm0, %xmm5
        movaps -4 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -2 * SIZE(AO), %xmm1

        ADD1 %xmm3, %xmm12
        subq $-16 * SIZE, AO
        movaps -3 * SIZE(BO), %xmm3
        ADD1 %xmm4, %xmm14
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4

        ADD2 %xmm5, %xmm13
        ADD2 %xmm6, %xmm15
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        mulpd %xmm1, %xmm6

        ADD1 %xmm2, %xmm8
        movaps -1 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm10
        movaps %xmm3, %xmm4
        pshufd $0x4e, %xmm3, %xmm5
        subq $-16 * SIZE, BO
        mulpd %xmm0, %xmm3
        mulpd %xmm1, %xmm4

        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm11
        movaps %xmm5, %xmm6
        mulpd %xmm0, %xmm5
        movaps -16 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -14 * SIZE(AO), %xmm1

        subq $1, %rax
        BRANCH
        jg .L12
        ALIGN_3
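
/* .L15/.L16: handle the k % 4 leftover iterations one k at a time. */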
.L15:
        PREFETCHB -8 * SIZE(BB)
#ifdef DUNNINGTON
        PREFETCHB 0 * SIZE(BB)
        PREFETCHB 8 * SIZE(BB)
#endif

#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax # remainder count (k & 3)
        BRANCH
        je .L18
        ALIGN_3

.L16:
        ADD1 %xmm3, %xmm12
        movaps -15 * SIZE(BO), %xmm3
        ADD1 %xmm4, %xmm14
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4

        ADD2 %xmm5, %xmm13
        ADD2 %xmm6, %xmm15
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        mulpd %xmm1, %xmm6

        ADD1 %xmm2, %xmm8
        movaps -13 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm10
        movaps %xmm3, %xmm4
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        mulpd %xmm1, %xmm4

        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm11
        movaps %xmm5, %xmm6
        mulpd %xmm0, %xmm5
        movaps -12 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -10 * SIZE(AO), %xmm1

        addq $4 * SIZE, AO
        addq $4 * SIZE, BO
        subq $1, %rax
        BRANCH
        jg .L16
        ALIGN_3
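
/* .L18: reduction and write-back of the 2x2 tile.  The accumulators
   are sign-corrected for the conjugation variant, pairwise-summed with
   haddpd, scaled by alpha (ALPHA_R/ALPHA_I via addsubpd), and, unless
   this is a TRMM kernel, added to the existing contents of C. */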
.L18:
#ifndef DUNNINGTON
        subq $-16 * SIZE, BB
#else
        subq $-32 * SIZE, BB
#endif

        ADD1 %xmm3, %xmm12
        pcmpeqb %xmm0, %xmm0
        ADD1 %xmm4, %xmm14
        psllq $63, %xmm0
        ADD2 %xmm5, %xmm13
        movddup ALPHA_R, %xmm2
        ADD2 %xmm6, %xmm15
        movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm8
        xorps %xmm0, %xmm10
        xorps %xmm0, %xmm12
        xorps %xmm0, %xmm14
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        shufps $0x04, %xmm0, %xmm0

        xorps %xmm0, %xmm9
        xorps %xmm0, %xmm11
        xorps %xmm0, %xmm13
        xorps %xmm0, %xmm15
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm9
        xorps %xmm0, %xmm11
        xorps %xmm0, %xmm13
        xorps %xmm0, %xmm15
#endif

        haddpd %xmm9, %xmm8
        haddpd %xmm11, %xmm10
        haddpd %xmm13, %xmm12
        haddpd %xmm15, %xmm14

        pshufd $0x4e, %xmm8, %xmm9
        pshufd $0x4e, %xmm10, %xmm11
        pshufd $0x4e, %xmm12, %xmm13
        pshufd $0x4e, %xmm14, %xmm15

        mulpd %xmm2, %xmm8
        mulpd %xmm3, %xmm9
        mulpd %xmm2, %xmm10
        mulpd %xmm3, %xmm11
        mulpd %xmm2, %xmm12
        mulpd %xmm3, %xmm13
        mulpd %xmm2, %xmm14
        mulpd %xmm3, %xmm15

        addsubpd %xmm9, %xmm8
        addsubpd %xmm11, %xmm10
        addsubpd %xmm13, %xmm12
        addsubpd %xmm15, %xmm14

#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhpd 1 * SIZE(CO1), %xmm0
        movsd 2 * SIZE(CO1), %xmm1
        movhpd 3 * SIZE(CO1), %xmm1
        movsd 0 * SIZE(CO2), %xmm2
        movhpd 1 * SIZE(CO2), %xmm2
        movsd 2 * SIZE(CO2), %xmm3
        movhpd 3 * SIZE(CO2), %xmm3

        addpd %xmm0, %xmm8
        addpd %xmm1, %xmm10
        addpd %xmm2, %xmm12
        addpd %xmm3, %xmm14
#endif

        movsd %xmm8, 0 * SIZE(CO1)
        movhpd %xmm8, 1 * SIZE(CO1)
        movsd %xmm10, 2 * SIZE(CO1)
        movhpd %xmm10, 3 * SIZE(CO1)
        movsd %xmm12, 0 * SIZE(CO2)
        movhpd %xmm12, 1 * SIZE(CO2)
        movsd %xmm14, 2 * SIZE(CO2)
        movhpd %xmm14, 3 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $2, KK
#endif

        addq $4 * SIZE, CO1 # coffset += 4
        addq $4 * SIZE, CO2 # coffset += 4
        decq I # i --
        BRANCH
        jg .L11
        ALIGN_4
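
/* .L20: remainder row -- when M is odd, compute the last 1x2 tile
   (one row of C against the current column pair of B). */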
.L20:
        testq $1, M
        BRANCH
        jle .L39
        ALIGN_4

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        movq B, BO
#else
        movq B, BO

        movq KK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 2), BO
#endif

        movaps -16 * SIZE(AO), %xmm0
        movaps -17 * SIZE(BO), %xmm2
        movaps -15 * SIZE(BO), %xmm3

        xorps %xmm3, %xmm3
        xorps %xmm5, %xmm5
        movaps %xmm3, %xmm8
        movaps %xmm3, %xmm9
        movaps %xmm3, %xmm12
        movaps %xmm3, %xmm13

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $1, %rax
#else
        addq $2, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L25
        ALIGN_4
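
/* .L22: inner k-loop for the 1x2 tile, unrolled four times. */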
.L22:
        ADD1 %xmm3, %xmm12
        movaps -15 * SIZE(BO), %xmm3
        pshufd $0x4e, %xmm2, %xmm7
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
        mulpd %xmm0, %xmm2
        ADD2 %xmm5, %xmm13
        mulpd %xmm0, %xmm7

        ADD1 %xmm2, %xmm8
        movaps -13 * SIZE(BO), %xmm2
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        ADD2 %xmm7, %xmm9
        mulpd %xmm0, %xmm5
        movaps -14 * SIZE(AO), %xmm0

        ADD1 %xmm3, %xmm12
        movaps -11 * SIZE(BO), %xmm3
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        ADD2 %xmm5, %xmm13
        mulpd %xmm0, %xmm7

        ADD1 %xmm2, %xmm8
        movaps -9 * SIZE(BO), %xmm2
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        ADD2 %xmm7, %xmm9
        mulpd %xmm0, %xmm5
        movaps -12 * SIZE(AO), %xmm0

        ADD1 %xmm3, %xmm12
        movaps -7 * SIZE(BO), %xmm3
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        ADD2 %xmm5, %xmm13
        mulpd %xmm0, %xmm7

        ADD1 %xmm2, %xmm8
        movaps -5 * SIZE(BO), %xmm2
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        ADD2 %xmm7, %xmm9
        mulpd %xmm0, %xmm5
        movaps -10 * SIZE(AO), %xmm0

        ADD1 %xmm3, %xmm12
        movaps -3 * SIZE(BO), %xmm3
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        ADD2 %xmm5, %xmm13
        mulpd %xmm0, %xmm7
        subq $-8 * SIZE, AO

        ADD1 %xmm2, %xmm8
        movaps -1 * SIZE(BO), %xmm2
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        ADD2 %xmm7, %xmm9
        mulpd %xmm0, %xmm5
        movaps -16 * SIZE(AO), %xmm0
        subq $-16 * SIZE, BO

        subq $1, %rax
        BRANCH
        jg .L22
        ALIGN_4
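
/* .L25/.L26: k % 4 leftover iterations for the 1x2 tile. */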
.L25:
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax # remainder count (k & 3)
        BRANCH
        je .L28
        ALIGN_4

.L26:
        ADD1 %xmm3, %xmm12
        movaps -15 * SIZE(BO), %xmm3
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        ADD2 %xmm5, %xmm13
        mulpd %xmm0, %xmm7

        ADD1 %xmm2, %xmm8
        movaps -13 * SIZE(BO), %xmm2
        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm0, %xmm3
        ADD2 %xmm7, %xmm9
        mulpd %xmm0, %xmm5
        movaps -14 * SIZE(AO), %xmm0

        addq $2 * SIZE, AO
        addq $4 * SIZE, BO
        subq $1, %rax
        BRANCH
        jg .L26
        ALIGN_4
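
/* .L28: reduction, alpha scaling, and write-back for the 1x2 tile. */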
.L28:
        ADD1 %xmm3, %xmm12
        pcmpeqb %xmm0, %xmm0
        ADD2 %xmm5, %xmm13
        psllq $63, %xmm0
        movddup ALPHA_R, %xmm2
        movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm8
        xorps %xmm0, %xmm12
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        shufps $0x04, %xmm0, %xmm0

        xorps %xmm0, %xmm9
        xorps %xmm0, %xmm13
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm9
        xorps %xmm0, %xmm13
#endif

        haddpd %xmm9, %xmm8
        haddpd %xmm13, %xmm12

        pshufd $0x4e, %xmm8, %xmm9
        pshufd $0x4e, %xmm12, %xmm13

        mulpd %xmm2, %xmm8
        mulpd %xmm3, %xmm9
        mulpd %xmm2, %xmm12
        mulpd %xmm3, %xmm13

        addsubpd %xmm9, %xmm8
        addsubpd %xmm13, %xmm12

#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhpd 1 * SIZE(CO1), %xmm0
        movsd 0 * SIZE(CO2), %xmm2
        movhpd 1 * SIZE(CO2), %xmm2

        addpd %xmm0, %xmm8
        addpd %xmm2, %xmm12
#endif

        movsd %xmm8, 0 * SIZE(CO1)
        movhpd %xmm8, 1 * SIZE(CO1)
        movsd %xmm12, 0 * SIZE(CO2)
        movhpd %xmm12, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif

        addq $2 * SIZE, CO1 # coffset += 2
        addq $2 * SIZE, CO2 # coffset += 2
        ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq $2, KK
#endif

        leaq (C, LDC, 2), C
        movq BO, B
        subq $1, J
        BRANCH
        jg .L01
        ALIGN_4
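
/* .L40: remainder column -- entered when N is odd; computes the last
   single column of C. */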
.L40:
        testq $1, N
        BRANCH
        jle .L999

        movq C, CO1
        leaq (C, LDC, 1), CO2
        movq A, AO

#if defined(TRMMKERNEL) && defined(LEFT)
        movq OFFSET, %rax
        movq %rax, KK
#endif

        movq M, I
        sarq $1, I # i = (m >> 1)
        NOBRANCH
        jle .L60
        ALIGN_4
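
/* .L51: 2x1 tiles -- two rows of C against the single remaining
   column of B. */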
.L51:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        movq B, BO
#else
        movq B, BO

        movq KK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 1), BO
#endif

        movaps -16 * SIZE(AO), %xmm0
        movaps -14 * SIZE(AO), %xmm1
        movaps -17 * SIZE(BO), %xmm2

        PREFETCHW 3 * SIZE(CO1)
        xorps %xmm8, %xmm8
        xorps %xmm9, %xmm9
        xorps %xmm12, %xmm12
        xorps %xmm13, %xmm13

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $2, %rax
#else
        addq $1, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L55
        ALIGN_4
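
/* .L52: inner k-loop for the 2x1 tile, unrolled four times. */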
.L52:
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
        mulpd %xmm1, %xmm4
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        movaps -12 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -10 * SIZE(AO), %xmm1

        ADD1 %xmm2, %xmm8
        movaps -15 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm12
        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm13

        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
        mulpd %xmm1, %xmm4
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        movaps -8 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -6 * SIZE(AO), %xmm1

        ADD1 %xmm2, %xmm8
        movaps -13 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm12
        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm13

        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        movaps -4 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -2 * SIZE(AO), %xmm1

        ADD1 %xmm2, %xmm8
        movaps -11 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm12
        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm13

        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        movaps 0 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps 2 * SIZE(AO), %xmm1

        ADD1 %xmm2, %xmm8
        movaps -9 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm12
        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm13

        subq $-16 * SIZE, AO
        subq $-8 * SIZE, BO
        subq $1, %rax
        BRANCH
        jg .L52
        ALIGN_4
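
/* .L55/.L56: k % 4 leftover iterations for the 2x1 tile. */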
.L55:
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax # remainder count (k & 3)
        BRANCH
        je .L58
        ALIGN_4

.L56:
        movaps %xmm2, %xmm4
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm1, %xmm4
        movaps %xmm7, %xmm6
        mulpd %xmm0, %xmm7
        movaps -12 * SIZE(AO), %xmm0
        mulpd %xmm1, %xmm6
        movaps -10 * SIZE(AO), %xmm1

        ADD1 %xmm2, %xmm8
        movaps -15 * SIZE(BO), %xmm2
        ADD1 %xmm4, %xmm12
        ADD2 %xmm7, %xmm9
        ADD2 %xmm6, %xmm13

        addq $4 * SIZE, AO
        addq $2 * SIZE, BO
        subq $1, %rax
        BRANCH
        jg .L56
        ALIGN_4
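
/* .L58: reduction, alpha scaling, and write-back for the 2x1 tile. */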
.L58:
        pcmpeqb %xmm0, %xmm0
        movddup ALPHA_R, %xmm2
        psllq $63, %xmm0
        movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm8
        xorps %xmm0, %xmm12
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        shufps $0x04, %xmm0, %xmm0

        xorps %xmm0, %xmm9
        xorps %xmm0, %xmm13
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm9
        xorps %xmm0, %xmm13
#endif

        haddpd %xmm9, %xmm8
        haddpd %xmm13, %xmm12

        pshufd $0x4e, %xmm8, %xmm9
        pshufd $0x4e, %xmm12, %xmm13

        mulpd %xmm2, %xmm8
        mulpd %xmm3, %xmm9
        mulpd %xmm2, %xmm12
        mulpd %xmm3, %xmm13

        addsubpd %xmm9, %xmm8
        addsubpd %xmm13, %xmm12

#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhpd 1 * SIZE(CO1), %xmm0
        movsd 2 * SIZE(CO1), %xmm1
        movhpd 3 * SIZE(CO1), %xmm1

        addpd %xmm0, %xmm8
        addpd %xmm1, %xmm12
#endif

        movsd %xmm8, 0 * SIZE(CO1)
        movhpd %xmm8, 1 * SIZE(CO1)
        movsd %xmm12, 2 * SIZE(CO1)
        movhpd %xmm12, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 2), AO
        leaq (BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $2, KK
#endif

        addq $4 * SIZE, CO1
        addq $4 * SIZE, CO2
        decq I
        BRANCH
        jg .L51
        ALIGN_4
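
/* .L60: final 1x1 element when both M and N are odd. */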
.L60:
        testq $1, M
        BRANCH
        jle .L79
        ALIGN_4

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        movq B, BO
#else
        movq B, BO

        movq KK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 1), BO
#endif

        movaps -16 * SIZE(AO), %xmm0
        xorps %xmm8, %xmm8
        xorps %xmm9, %xmm9
        movaps -17 * SIZE(BO), %xmm2
        xorps %xmm10, %xmm10
        xorps %xmm11, %xmm11

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $1, %rax
#else
        addq $1, %rax
#endif
        movq %rax, KKK
#endif
        sarq $2, %rax
        NOBRANCH
        jle .L65
        ALIGN_4
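
/* .L62: inner k-loop for the single element, unrolled four times into
   two accumulator pairs (%xmm8/%xmm9 and %xmm10/%xmm11) that are folded
   together at .L68. */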
.L62:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm0, %xmm7
        movaps -14 * SIZE(AO), %xmm0
        ADD1 %xmm2, %xmm8
        ADD2 %xmm7, %xmm9
        movaps -15 * SIZE(BO), %xmm2

        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm0, %xmm7
        movaps -12 * SIZE(AO), %xmm0
        ADD1 %xmm2, %xmm10
        ADD2 %xmm7, %xmm11
        movaps -13 * SIZE(BO), %xmm2

        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm0, %xmm7
        movaps -10 * SIZE(AO), %xmm0
        ADD1 %xmm2, %xmm8
        ADD2 %xmm7, %xmm9
        movaps -11 * SIZE(BO), %xmm2

        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm0, %xmm7
        movaps -8 * SIZE(AO), %xmm0
        ADD1 %xmm2, %xmm10
        ADD2 %xmm7, %xmm11
        movaps -9 * SIZE(BO), %xmm2

        subq $-8 * SIZE, AO
        subq $-8 * SIZE, BO
        subq $1, %rax
        BRANCH
        jg .L62
        ALIGN_4
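
/* .L65/.L66: k % 4 leftover iterations for the single element. */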
.L65:
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif
        andq $3, %rax # remainder count (k & 3)
        BRANCH
        je .L68
        ALIGN_4

.L66:
        pshufd $0x4e, %xmm2, %xmm7
        mulpd %xmm0, %xmm2
        mulpd %xmm0, %xmm7
        movaps -14 * SIZE(AO), %xmm0
        ADD1 %xmm2, %xmm8
        ADD2 %xmm7, %xmm9
        movaps -15 * SIZE(BO), %xmm2

        addq $2 * SIZE, AO
        addq $2 * SIZE, BO
        subq $1, %rax
        BRANCH
        jg .L66
        ALIGN_4
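
/* .L68: fold the two accumulator pairs together, apply the sign mask
   and alpha, and write the single complex result back to C. */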
.L68:
        addpd %xmm10, %xmm8
        addpd %xmm11, %xmm9

        pcmpeqb %xmm0, %xmm0
        movddup ALPHA_R, %xmm2
        psllq $63, %xmm0
        movddup ALPHA_I, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm8
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        shufps $0x04, %xmm0, %xmm0

        xorps %xmm0, %xmm9
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        shufps $0x40, %xmm0, %xmm0

        xorps %xmm0, %xmm9
#endif

        haddpd %xmm9, %xmm8
        pshufd $0x4e, %xmm8, %xmm9

        mulpd %xmm2, %xmm8
        mulpd %xmm3, %xmm9

        addsubpd %xmm9, %xmm8

#ifndef TRMMKERNEL
        movsd 0 * SIZE(CO1), %xmm0
        movhpd 1 * SIZE(CO1), %xmm0

        addpd %xmm0, %xmm8
#endif

        movsd %xmm8, 0 * SIZE(CO1)
        movhpd %xmm8, 1 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        salq $ZBASE_SHIFT, %rax
        leaq (AO, %rax, 1), AO
        leaq (BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif

        addq $2 * SIZE, CO1
        addq $2 * SIZE, CO2
        ALIGN_4

.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq $1, KK
#endif

        addq LDC, C
        movq BO, B
        ALIGN_4
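
/* .L999: epilogue -- restore callee-saved registers (plus %rdi/%rsi
   and %xmm6-%xmm15 under WINDOWS_ABI) and return. */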
.L999:
        movq 0(%rsp), %rbx
        movq 8(%rsp), %rbp
        movq 16(%rsp), %r12
        movq 24(%rsp), %r13
        movq 32(%rsp), %r14
        movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
        movq 48(%rsp), %rdi
        movq 56(%rsp), %rsi
        movups 64(%rsp), %xmm6
        movups 80(%rsp), %xmm7
        movups 96(%rsp), %xmm8
        movups 112(%rsp), %xmm9
        movups 128(%rsp), %xmm10
        movups 144(%rsp), %xmm11
        movups 160(%rsp), %xmm12
        movups 176(%rsp), %xmm13
        movups 192(%rsp), %xmm14
        movups 208(%rsp), %xmm15
#endif

        addq $STACKSIZE, %rsp
        ret

        EPILOGUE