You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qtrsm_kernel_RT_2x2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build flag, shared header, and the register / stack-slot aliases used by the */
  /* kernel below. ARG1..ARG6, SIZE, BASE_SHIFT, ALIGN_4, FLD, FST, PROLOGUE, */
  /* PROFCODE and EPILOGUE are not defined in this file; presumably they come */
  /* from common.h (confirm there). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Incoming arguments: M and N bound the I/J loops, K bounds the inner product */
  /* loops, A and B are the two packed input panels, C is the output pointer. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  43. #define A ARG4
  44. #define B ARG5
  45. #define C ARG6
  /* LDC is loaded from the stack at entry and then shifted left by BASE_SHIFT. */
  46. #define LDC %r10
  /* I counts M/2 tiles, J counts N/2 column pairs. */
  47. #define I %r12
  48. #define J %r13
  /* AO / BO are the running read pointers into the A and B panels; CO is the */
  /* running write pointer into C. */
  49. #define AO %r14
  50. #define BO %r15
  51. #define CO %rbp
  /* KK is the running offset that selects how many k-steps each tile accumulates */
  /* (KK itself for LT/RN, K - KK for LN/RT). */
  52. #define KK %r11
  /* AORIG is a slot inside the local frame holding the A base used by LN/RT. */
  53. #define AORIG 48(%rsp)
  /* Local frame size in bytes: six 8-byte register saves at 0..40 plus AORIG at 48. */
  54. #define STACKSIZE 64
  /* Caller-side stack slots, addressed past the local frame. ALPHA is defined but */
  /* not referenced anywhere in the code visible in this file. */
  55. #define ALPHA 8 + STACKSIZE(%rsp)
  56. #define OFFSET 32 + STACKSIZE(%rsp)
  /* Prefetch mnemonic selection. PREFETCHW is defined here but the code below */
  /* spells out prefetchw / prefetchnta directly instead of using it. */
  57. #ifdef OPTERON
  58. #define PREFETCH prefetch
  59. #define PREFETCHW prefetchw
  60. #else
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #endif
  /* Prefetch distance ahead of AO, expressed in elements (it is multiplied by SIZE */
  /* at each use): 5 + 4 * 10 = 45. */
  64. #define PREFETCHSIZE (5 + 4 * 10)
  /* Kernel entry: build the local frame, save callee-saved registers, bias the */
  /* panel pointers, initialise KK for the selected variant (LN/LT/RN/RT), and */
  /* set up the single-column pass that runs only when N is odd. */
  65. PROLOGUE
  66. PROFCODE
  /* Reserve STACKSIZE bytes and save the registers this kernel restores at .L999. */
  /* %rbx is saved and restored although no visible instruction writes it. */
  67. subq $STACKSIZE, %rsp
  68. movq %rbx, 0(%rsp)
  69. movq %rbp, 8(%rsp)
  70. movq %r12, 16(%rsp)
  71. movq %r13, 24(%rsp)
  72. movq %r14, 32(%rsp)
  73. movq %r15, 40(%rsp)
  /* LDC is a stack argument located just past the local frame. */
  74. movq 24 + STACKSIZE(%rsp), LDC
  75. #if defined(TRMMKERNEL) && !defined(LEFT)
  76. movq OFFSET, %rax
  77. negq %rax
  78. movq %rax, KK
  79. #endif
  /* Bias A and B by 8 elements; every panel access below uses offsets starting */
  /* at -8 * SIZE to compensate. */
  80. addq $8 * SIZE, A
  81. addq $8 * SIZE, B
  /* Scale LDC by the element size (presumably elements -> bytes; BASE_SHIFT is */
  /* defined outside this file). */
  82. salq $BASE_SHIFT, LDC
  /* LN: advance C by M elements and A by M * K elements. */
  83. #ifdef LN
  84. movq M, %rax
  85. salq $BASE_SHIFT, %rax
  86. addq %rax, C
  87. imulq K, %rax
  88. addq %rax, A
  89. #endif
  /* RT: advance B by N * K elements and C by N * LDC; the passes below then step */
  /* both pointers backwards before use. */
  90. #ifdef RT
  91. movq N, %rax
  92. salq $BASE_SHIFT, %rax
  93. imulq K, %rax
  94. addq %rax, B
  95. movq N, %rax
  96. imulq LDC, %rax
  97. addq %rax, C
  98. #endif
  /* RN: KK = -OFFSET. */
  99. #ifdef RN
  100. movq OFFSET, %rax
  101. negq %rax
  102. movq %rax, KK
  103. #endif
  /* RT: KK = N - OFFSET. */
  104. #ifdef RT
  105. movq N, %rax
  106. subq OFFSET, %rax
  107. movq %rax, KK
  108. #endif
  /* Run the single-column pass only when N is odd; otherwise skip to .L30. */
  109. movq N, %rax
  110. testq $1, %rax
  111. je .L30
  /* Single-column pass setup. LT/RN read A sequentially through AO; LN/RT */
  /* remember the A base in AORIG and recompute AO for every tile. */
  112. #if defined(LT) || defined(RN)
  113. movq A, AO
  114. #else
  115. movq A, %rax
  116. movq %rax, AORIG
  117. #endif
  /* RT: step B back by K elements (one column's worth for this pass). */
  118. #ifdef RT
  119. movq K, %rax
  120. salq $0 + BASE_SHIFT, %rax
  121. subq %rax, B
  122. #endif
  /* RT steps C back by LDC before taking CO; the other variants take CO first */
  /* and then advance C by LDC. */
  123. #ifdef RT
  124. subq LDC, C
  125. #endif
  126. movq C, CO
  127. #ifndef RT
  128. addq LDC, C
  129. #endif
  /* LN: KK = OFFSET + M. */
  130. #ifdef LN
  131. movq OFFSET, %rax
  132. addq M, %rax
  133. movq %rax, KK
  134. #endif
  /* LT: KK = OFFSET. */
  135. #ifdef LT
  136. movq OFFSET, %rax
  137. movq %rax, KK
  138. #endif
  /* I = M / 2; the je tests the zero flag left by sarq, so M < 2 skips to .L40. */
  139. movq M, I
  140. sarq $1, I
  141. je .L40
  142. ALIGN_4
  /* .L31: one tile of 2 (M direction) x 1 (N direction) in the single-column */
  /* pass. Accumulates products of A and B on the x87 stack, combines them with */
  /* the packed right-hand side, applies the variant-specific solve, and writes */
  /* the result to both the packed buffer and C. Repeats I = M / 2 times. */
  143. .L31:
  /* LN: step AORIG back by 2 * K elements for this tile. */
  144. #ifdef LN
  145. movq K, %rax
  146. salq $1 + BASE_SHIFT, %rax
  147. subq %rax, AORIG
  148. #endif
  /* LN/RT: start at element offset KK (A stride 2 per k, B stride 1 per k); */
  /* LT/RN: keep the running AO and restart BO at B. */
  149. #if defined(LN) || defined(RT)
  150. movq KK, %rax
  151. salq $BASE_SHIFT, %rax
  152. movq AORIG, AO
  153. leaq (AO, %rax, 2), AO
  154. leaq (B, %rax, 1), BO
  155. #else
  156. movq B, BO
  157. #endif
  /* Two zeroed accumulators on the x87 stack, one per output element. */
  158. fldz
  159. fldz
  160. #if defined(HAVE_3DNOW)
  161. prefetchw 2 * SIZE(CO)
  162. #elif defined(HAVE_SSE)
  163. prefetchnta 2 * SIZE(CO)
  164. #endif
  /* Inner-product length: KK for LT/RN, K - KK for LN/RT. */
  165. #if defined(LT) || defined(RN)
  166. movq KK, %rax
  167. #else
  168. movq K, %rax
  169. subq KK, %rax
  170. #endif
  /* Unrolled loop count = length / 4; skip to the remainder when it is zero. */
  171. sarq $2, %rax
  172. je .L35
  173. ALIGN_4
  /* .L32: four k-steps per iteration. Each step loads one B value and two A */
  /* values; st(0) accumulates a0 * b and st(1) accumulates a1 * b. */
  174. .L32:
  175. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  176. FLD -8 * SIZE(BO)
  177. FLD -8 * SIZE(AO)
  178. fmul %st(1), %st
  179. faddp %st, %st(2)
  180. FLD -7 * SIZE(AO)
  181. fmulp %st, %st(1)
  182. faddp %st, %st(2)
  183. FLD -7 * SIZE(BO)
  184. FLD -6 * SIZE(AO)
  185. fmul %st(1), %st
  186. faddp %st, %st(2)
  187. FLD -5 * SIZE(AO)
  188. fmulp %st, %st(1)
  189. faddp %st, %st(2)
  190. FLD -6 * SIZE(BO)
  191. FLD -4 * SIZE(AO)
  192. fmul %st(1), %st
  193. faddp %st, %st(2)
  194. FLD -3 * SIZE(AO)
  195. fmulp %st, %st(1)
  196. faddp %st, %st(2)
  197. FLD -5 * SIZE(BO)
  198. FLD -2 * SIZE(AO)
  199. fmul %st(1), %st
  200. faddp %st, %st(2)
  201. FLD -1 * SIZE(AO)
  202. fmulp %st, %st(1)
  203. faddp %st, %st(2)
  /* Consumed 8 A elements and 4 B elements. */
  204. addq $8 * SIZE,AO
  205. addq $4 * SIZE,BO
  206. decq %rax
  207. jne .L32
  208. ALIGN_4
  /* .L35: recompute the inner-product length and keep only the remainder mod 4. */
  209. .L35:
  210. #if defined(LT) || defined(RN)
  211. movq KK, %rax
  212. #else
  213. movq K, %rax
  214. subq KK, %rax
  215. #endif
  216. and $3, %rax
  217. je .L38
  218. ALIGN_4
  /* .L36: one k-step per iteration, same accumulation pattern as .L32. */
  219. .L36:
  220. FLD -8 * SIZE(BO)
  221. FLD -8 * SIZE(AO)
  222. fmul %st(1), %st
  223. faddp %st, %st(2)
  224. FLD -7 * SIZE(AO)
  225. fmulp %st, %st(1)
  226. faddp %st, %st(2)
  227. addq $2 * SIZE,AO
  228. addq $1 * SIZE,BO
  229. decq %rax
  230. jne .L36
  231. ALIGN_4
  /* .L38: solve step. LN/RT first re-point AO/BO at element offset KK - 2 (LN) */
  /* or KK - 1 (RT), using the same strides as at the top of the tile. */
  232. .L38:
  233. #if defined(LN) || defined(RT)
  234. movq KK, %rax
  235. #ifdef LN
  236. subq $2, %rax
  237. #else
  238. subq $1, %rax
  239. #endif
  240. salq $BASE_SHIFT, %rax
  241. movq AORIG, AO
  242. leaq (AO, %rax, 2), AO
  243. leaq (B, %rax, 1), BO
  244. #endif
  /* Combine the packed right-hand side (from BO for LN/LT, from AO for RN/RT) */
  /* with the two accumulators. NOTE(review): GNU as has historical operand-order */
  /* quirks for fsubp/fsubrp in AT&T syntax; verify direction before editing. */
  245. #if defined(LN) || defined(LT)
  246. FLD -8 * SIZE(BO)
  247. fsubp %st, %st(1)
  248. FLD -7 * SIZE(BO)
  249. fsubp %st, %st(2)
  250. #else
  251. FLD -8 * SIZE(AO)
  252. fsubp %st, %st(1)
  253. FLD -7 * SIZE(AO)
  254. fsubp %st, %st(2)
  255. #endif
  /* LN: uses the A entries at -5, -6 and -8 (in that order). */
  256. #ifdef LN
  257. FLD -5 * SIZE(AO)
  258. fmulp %st, %st(2)
  259. FLD -6 * SIZE(AO)
  260. fmul %st(2), %st
  261. fsubrp %st, %st(1)
  262. FLD -8 * SIZE(AO)
  263. fmulp %st, %st(1)
  264. #endif
  /* LT: uses the A entries at -8, -7 and -5 (in that order). */
  265. #ifdef LT
  266. FLD -8 * SIZE(AO)
  267. fmulp %st, %st(1)
  268. FLD -7 * SIZE(AO)
  269. fmul %st(1), %st
  270. fsubrp %st, %st(2)
  271. FLD -5 * SIZE(AO)
  272. fmulp %st, %st(2)
  273. #endif
  /* RN and RT: both values are scaled by the single B entry at -8. */
  274. #ifdef RN
  275. FLD -8 * SIZE(BO)
  276. fmul %st, %st(1)
  277. fmulp %st, %st(2)
  278. #endif
  279. #ifdef RT
  280. FLD -8 * SIZE(BO)
  281. fmul %st, %st(1)
  282. fmulp %st, %st(2)
  283. #endif
  /* LN walks C backwards: move CO down before storing. */
  284. #ifdef LN
  285. subq $2 * SIZE, CO
  286. #endif
  /* Write each result back to the packed buffer. Each value is duplicated with */
  /* fld %st first, so a copy stays on the stack for the C store (FST presumably */
  /* pops; it is defined outside this file). */
  287. #if defined(LN) || defined(LT)
  288. fld %st
  289. FST -8 * SIZE(BO)
  290. fxch %st(1)
  291. fld %st
  292. FST -7 * SIZE(BO)
  293. #else
  294. fld %st
  295. FST -8 * SIZE(AO)
  296. fxch %st(1)
  297. fld %st
  298. FST -7 * SIZE(AO)
  299. #endif
  /* Store the two results to C; the fxch above left them in reverse order, */
  /* hence element 1 is written before element 0. */
  300. FST 1 * SIZE(CO)
  301. FST 0 * SIZE(CO)
  302. #ifndef LN
  303. addq $2 * SIZE, CO
  304. #endif
  /* LT/RN: skip the K - KK elements not read by this tile so AO/BO land on the */
  /* next tile's data. */
  305. #if defined(LT) || defined(RN)
  306. movq K, %rax
  307. subq KK, %rax
  308. salq $BASE_SHIFT, %rax
  309. leaq (AO, %rax, 2), AO
  310. leaq (BO, %rax, 1), BO
  311. #endif
  /* KK moves by the tile height (2) for the left-side variants. */
  312. #ifdef LN
  313. subq $2, KK
  314. #endif
  315. #ifdef LT
  316. addq $2, KK
  317. #endif
  /* RT: advance AORIG by 2 * K elements to the next tile's A data. */
  318. #ifdef RT
  319. movq K, %rax
  320. salq $1 + BASE_SHIFT, %rax
  321. addq %rax, AORIG
  322. #endif
  323. decq I
  324. jne .L31
  325. ALIGN_4
  /* .L40: leftover tile of 1 x 1 in the single-column pass, executed only when */
  /* M is odd; then .L49 finishes the pass by updating B and KK. */
  326. .L40:
  327. movq M, %rax
  328. andq $1, %rax
  329. je .L49
  330. ALIGN_4
  331. .L41:
  /* LN: step AORIG back by K elements for this tile. */
  332. #ifdef LN
  333. movq K, %rax
  334. salq $0 + BASE_SHIFT, %rax
  335. subq %rax, AORIG
  336. #endif
  /* LN/RT: start both panels at element offset KK (stride 1 per k for each). */
  337. #if defined(LN) || defined(RT)
  338. movq KK, %rax
  339. salq $BASE_SHIFT, %rax
  340. movq AORIG, AO
  341. leaq (AO, %rax, 1), AO
  342. leaq (B, %rax, 1), BO
  343. #else
  344. movq B, BO
  345. #endif
  /* Single zeroed accumulator. */
  346. fldz
  /* Inner-product length: KK for LT/RN, K - KK for LN/RT. */
  347. #if defined(LT) || defined(RN)
  348. movq KK, %rax
  349. #else
  350. movq K, %rax
  351. subq KK, %rax
  352. #endif
  353. sarq $2, %rax
  354. je .L45
  355. ALIGN_4
  /* .L42: four k-steps per iteration, each adding a * b to the accumulator. */
  356. .L42:
  357. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  358. FLD -8 * SIZE(AO)
  359. FLD -8 * SIZE(BO)
  360. fmulp %st, %st(1)
  361. faddp %st, %st(1)
  362. FLD -7 * SIZE(AO)
  363. FLD -7 * SIZE(BO)
  364. fmulp %st, %st(1)
  365. faddp %st, %st(1)
  366. FLD -6 * SIZE(AO)
  367. FLD -6 * SIZE(BO)
  368. fmulp %st, %st(1)
  369. faddp %st, %st(1)
  370. FLD -5 * SIZE(AO)
  371. FLD -5 * SIZE(BO)
  372. fmulp %st, %st(1)
  373. faddp %st, %st(1)
  374. addq $4 * SIZE,AO
  375. addq $4 * SIZE,BO
  376. decq %rax
  377. jne .L42
  378. ALIGN_4
  /* .L45: remainder (length mod 4) handled one k-step at a time in .L46. */
  379. .L45:
  380. #if defined(LT) || defined(RN)
  381. movq KK, %rax
  382. #else
  383. movq K, %rax
  384. subq KK, %rax
  385. #endif
  386. and $3, %rax
  387. je .L48
  388. ALIGN_4
  389. .L46:
  390. FLD -8 * SIZE(AO)
  391. FLD -8 * SIZE(BO)
  392. fmulp %st, %st(1)
  393. faddp %st, %st(1)
  394. addq $1 * SIZE,AO
  395. addq $1 * SIZE,BO
  396. decq %rax
  397. jne .L46
  398. ALIGN_4
  /* .L48: solve step. LN/RT re-point AO/BO at element offset KK - 1; both */
  /* preprocessor branches subtract 1 here. */
  399. .L48:
  400. #if defined(LN) || defined(RT)
  401. movq KK, %rax
  402. #ifdef LN
  403. subq $1, %rax
  404. #else
  405. subq $1, %rax
  406. #endif
  407. salq $BASE_SHIFT, %rax
  408. movq AORIG, AO
  409. leaq (AO, %rax, 1), AO
  410. leaq (B, %rax, 1), BO
  411. #endif
  /* Combine the packed right-hand side value with the accumulator. */
  412. #if defined(LN) || defined(LT)
  413. FLD -8 * SIZE(BO)
  414. fsubp %st, %st(1)
  415. #else
  416. FLD -8 * SIZE(AO)
  417. fsubp %st, %st(1)
  418. #endif
  /* Scale by one panel entry: from AO for LN/LT, from BO for RN/RT. */
  419. #ifdef LN
  420. FLD -8 * SIZE(AO)
  421. fmulp %st, %st(1)
  422. #endif
  423. #ifdef LT
  424. FLD -8 * SIZE(AO)
  425. fmulp %st, %st(1)
  426. #endif
  427. #ifdef RN
  428. FLD -8 * SIZE(BO)
  429. fmulp %st, %st(1)
  430. #endif
  431. #ifdef RT
  432. FLD -8 * SIZE(BO)
  433. fmulp %st, %st(1)
  434. #endif
  435. #ifdef LN
  436. subq $1 * SIZE, CO
  437. #endif
  /* Write the result back to the packed buffer (a duplicate is kept for C). */
  438. #if defined(LN) || defined(LT)
  439. fld %st
  440. FST -8 * SIZE(BO)
  441. #else
  442. fld %st
  443. FST -8 * SIZE(AO)
  444. #endif
  445. FST 0 * SIZE(CO)
  446. #ifndef LN
  447. addq $1 * SIZE, CO
  448. #endif
  /* LT/RN: skip the K - KK elements this tile did not read. */
  449. #if defined(LT) || defined(RN)
  450. movq K, %rax
  451. subq KK, %rax
  452. salq $BASE_SHIFT, %rax
  453. leaq (AO, %rax, 1), AO
  454. leaq (BO, %rax, 1), BO
  455. #endif
  456. #ifdef LN
  457. subq $1, KK
  458. #endif
  459. #ifdef LT
  460. addq $1, KK
  461. #endif
  462. #ifdef RT
  463. movq K, %rax
  464. salq $0 + BASE_SHIFT, %rax
  465. addq %rax, AORIG
  466. #endif
  467. ALIGN_4
  /* .L49: end of the single-column pass. LN advances B by K elements; LT/RN */
  /* take B from the final BO; RN/RT move KK by the pass width (1). */
  468. .L49:
  469. #ifdef LN
  470. movq K, %rax
  471. salq $BASE_SHIFT, %rax
  472. leaq (B, %rax, 1), B
  473. #endif
  474. #if defined(LT) || defined(RN)
  475. movq BO, B
  476. #endif
  477. #ifdef RN
  478. addq $1, KK
  479. #endif
  480. #ifdef RT
  481. subq $1, KK
  482. #endif
  483. ALIGN_4
  /* .L30: outer loop over pairs of columns. J = N / 2; .L01 is the per-pair */
  /* setup, mirroring the single-column setup but with a width of 2. */
  484. .L30:
  485. movq N, %rax
  486. sarq $1, %rax
  /* movq does not modify flags, so the je below still tests the sarq result. */
  487. movq %rax, J
  488. je .L999
  489. ALIGN_4
  490. .L01:
  /* LT/RN read A sequentially through AO; LN/RT keep the A base in AORIG. */
  491. #if defined(LT) || defined(RN)
  492. movq A, AO
  493. #else
  494. movq A, %rax
  495. movq %rax, AORIG
  496. #endif
  /* RT: step B back by 2 * K elements. */
  497. #ifdef RT
  498. movq K, %rax
  499. salq $1 + BASE_SHIFT, %rax
  500. subq %rax, B
  501. #endif
  /* %rax = 2 * LDC. RT steps C back before taking CO; the other variants take */
  /* CO first and then advance C. */
  502. lea (, LDC, 2), %rax
  503. #ifdef RT
  504. subq %rax, C
  505. #endif
  506. movq C, CO
  507. #ifndef RT
  508. addq %rax, C
  509. #endif
  /* LN: KK = OFFSET + M. */
  510. #ifdef LN
  511. movq OFFSET, %rax
  512. addq M, %rax
  513. movq %rax, KK
  514. #endif
  /* LT: KK = OFFSET. */
  515. #ifdef LT
  516. movq OFFSET, %rax
  517. movq %rax, KK
  518. #endif
  /* I = M / 2; the je tests the zero flag left by sarq, so M < 2 skips to .L20. */
  519. movq M, I
  520. sarq $1, I
  521. je .L20
  522. ALIGN_4
  /* .L11: one tile of 2 (M direction) x 2 (N direction). Four accumulators live */
  /* on the x87 stack; the inner loops peak at all eight x87 registers, so no */
  /* additional value may be pushed inside them. Repeats I = M / 2 times. */
  523. .L11:
  /* LN: step AORIG back by 2 * K elements for this tile. */
  524. #ifdef LN
  525. movq K, %rax
  526. salq $1 + BASE_SHIFT, %rax
  527. subq %rax, AORIG
  528. #endif
  /* LN/RT: start both panels at element offset KK (stride 2 per k for each). */
  529. #if defined(LN) || defined(RT)
  530. movq KK, %rax
  531. salq $BASE_SHIFT, %rax
  532. movq AORIG, AO
  533. leaq (AO, %rax, 2), AO
  534. leaq (B, %rax, 2), BO
  535. #else
  536. movq B, BO
  537. #endif
  /* Four zeroed accumulators. */
  538. fldz
  539. fldz
  540. fldz
  541. fldz
  /* Prefetch the two C locations this tile writes (CO and CO + LDC). */
  542. #if defined(HAVE_3DNOW)
  543. prefetchw 2 * SIZE(CO)
  544. prefetchw 2 * SIZE(CO, LDC, 1)
  545. #elif defined(HAVE_SSE)
  546. prefetchnta 2 * SIZE(CO)
  547. prefetchnta 2 * SIZE(CO, LDC, 1)
  548. #endif
  /* Inner-product length: KK for LT/RN, K - KK for LN/RT. */
  549. #if defined(LT) || defined(RN)
  550. movq KK, %rax
  551. #else
  552. movq K, %rax
  553. subq KK, %rax
  554. #endif
  555. sarq $2, %rax
  556. je .L15
  557. ALIGN_4
  /* .L12: four k-steps per iteration. With a0,a1 from AO and b0,b1 from BO, */
  /* each step leaves st(0) += a0*b0, st(1) += a0*b1, st(2) += a1*b0, */
  /* st(3) += a1*b1. */
  558. .L12:
  559. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  560. FLD -8 * SIZE(AO)
  561. FLD -8 * SIZE(BO)
  562. fld %st(1)
  563. fmul %st(1), %st
  564. faddp %st, %st(3)
  565. FLD -7 * SIZE(BO)
  566. fmul %st, %st(2)
  567. FLD -7 * SIZE(AO)
  568. fmul %st, %st(2)
  569. fmulp %st, %st(1)
  570. faddp %st, %st(6)
  571. faddp %st, %st(4)
  572. faddp %st, %st(2)
  573. FLD -6 * SIZE(AO)
  574. FLD -6 * SIZE(BO)
  575. fld %st(1)
  576. fmul %st(1), %st
  577. faddp %st, %st(3)
  578. FLD -5 * SIZE(BO)
  579. fmul %st, %st(2)
  580. FLD -5 * SIZE(AO)
  581. fmul %st, %st(2)
  582. fmulp %st, %st(1)
  583. faddp %st, %st(6)
  584. faddp %st, %st(4)
  585. faddp %st, %st(2)
  586. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  587. FLD -4 * SIZE(AO)
  588. FLD -4 * SIZE(BO)
  589. fld %st(1)
  590. fmul %st(1), %st
  591. faddp %st, %st(3)
  592. FLD -3 * SIZE(BO)
  593. fmul %st, %st(2)
  594. FLD -3 * SIZE(AO)
  595. fmul %st, %st(2)
  596. fmulp %st, %st(1)
  597. faddp %st, %st(6)
  598. faddp %st, %st(4)
  599. faddp %st, %st(2)
  600. FLD -2 * SIZE(AO)
  601. FLD -2 * SIZE(BO)
  602. fld %st(1)
  603. fmul %st(1), %st
  604. faddp %st, %st(3)
  605. FLD -1 * SIZE(BO)
  606. fmul %st, %st(2)
  607. FLD -1 * SIZE(AO)
  608. fmul %st, %st(2)
  609. fmulp %st, %st(1)
  610. faddp %st, %st(6)
  611. faddp %st, %st(4)
  612. faddp %st, %st(2)
  /* Consumed 8 elements from each panel. */
  613. addq $8 * SIZE,AO
  614. addq $8 * SIZE,BO
  615. decq %rax
  616. jne .L12
  617. ALIGN_4
  /* .L15: remainder (length mod 4) handled one k-step at a time in .L16. */
  618. .L15:
  619. #if defined(LT) || defined(RN)
  620. movq KK, %rax
  621. #else
  622. movq K, %rax
  623. subq KK, %rax
  624. #endif
  625. and $3, %rax
  626. je .L18
  627. ALIGN_4
  628. .L16:
  629. FLD -8 * SIZE(AO)
  630. FLD -8 * SIZE(BO)
  631. fld %st(1)
  632. fmul %st(1), %st
  633. faddp %st, %st(3)
  634. FLD -7 * SIZE(BO)
  635. fmul %st, %st(2)
  636. FLD -7 * SIZE(AO)
  637. fmul %st, %st(2)
  638. fmulp %st, %st(1)
  639. faddp %st, %st(6)
  640. faddp %st, %st(4)
  641. faddp %st, %st(2)
  642. addq $2 * SIZE,AO
  643. addq $2 * SIZE,BO
  644. decq %rax
  645. jne .L16
  646. ALIGN_4
  /* .L18: solve step. LN/RT re-point AO/BO at element offset KK - 2; both */
  /* preprocessor branches subtract 2 here. */
  647. .L18:
  648. #if defined(LN) || defined(RT)
  649. movq KK, %rax
  650. #ifdef LN
  651. subq $2, %rax
  652. #else
  653. subq $2, %rax
  654. #endif
  655. salq $BASE_SHIFT, %rax
  656. movq AORIG, AO
  657. leaq (AO, %rax, 2), AO
  658. leaq (B, %rax, 2), BO
  659. #endif
  /* Combine the four packed right-hand side values with the accumulators. The */
  /* two variants pair memory slots -7 and -6 with different stack positions */
  /* (st(2)/st(3) versus st(3)/st(2)). NOTE(review): GNU as has historical */
  /* operand-order quirks for fsubp/fsubrp; verify direction before editing. */
  660. #if defined(LN) || defined(LT)
  661. FLD -8 * SIZE(BO)
  662. fsubp %st, %st(1)
  663. FLD -7 * SIZE(BO)
  664. fsubp %st, %st(2)
  665. FLD -6 * SIZE(BO)
  666. fsubp %st, %st(3)
  667. FLD -5 * SIZE(BO)
  668. fsubp %st, %st(4)
  669. #else
  670. FLD -8 * SIZE(AO)
  671. fsubp %st, %st(1)
  672. FLD -7 * SIZE(AO)
  673. fsubp %st, %st(3)
  674. FLD -6 * SIZE(AO)
  675. fsubp %st, %st(2)
  676. FLD -5 * SIZE(AO)
  677. fsubp %st, %st(4)
  678. #endif
  /* LN: uses the A entries at -5, -6 and -8 (in that order). */
  679. #ifdef LN
  680. FLD -5 * SIZE(AO)
  681. fmul %st, %st(3)
  682. fmulp %st, %st(4)
  683. FLD -6 * SIZE(AO)
  684. fmul %st(3), %st
  685. FLD -6 * SIZE(AO)
  686. fmul %st(5), %st
  687. fsubrp %st, %st(3)
  688. fsubrp %st, %st(1)
  689. FLD -8 * SIZE(AO)
  690. fmul %st, %st(1)
  691. fmulp %st, %st(2)
  692. #endif
  /* LT: uses the A entries at -8, -7 and -5 (in that order). */
  693. #ifdef LT
  694. FLD -8 * SIZE(AO)
  695. fmul %st, %st(1)
  696. fmulp %st, %st(2)
  697. FLD -7 * SIZE(AO)
  698. fmul %st(1), %st
  699. FLD -7 * SIZE(AO)
  700. fmul %st(3), %st
  701. fsubrp %st, %st(5)
  702. fsubrp %st, %st(3)
  703. FLD -5 * SIZE(AO)
  704. fmul %st, %st(3)
  705. fmulp %st, %st(4)
  706. #endif
  /* RN: uses the B entries at -8, -7 and -5 (in that order). */
  707. #ifdef RN
  708. FLD -8 * SIZE(BO)
  709. fmul %st, %st(1)
  710. fmulp %st, %st(3)
  711. FLD -7 * SIZE(BO)
  712. fmul %st(1), %st
  713. FLD -7 * SIZE(BO)
  714. fmul %st(4), %st
  715. fsubrp %st, %st(5)
  716. fsubrp %st, %st(2)
  717. FLD -5 * SIZE(BO)
  718. fmul %st, %st(2)
  719. fmulp %st, %st(4)
  720. #endif
  /* RT: uses the B entries at -5, -6 and -8 (in that order). */
  721. #ifdef RT
  722. FLD -5 * SIZE(BO)
  723. fmul %st, %st(2)
  724. fmulp %st, %st(4)
  725. FLD -6 * SIZE(BO)
  726. fmul %st(2), %st
  727. FLD -6 * SIZE(BO)
  728. fmul %st(5), %st
  729. fsubrp %st, %st(4)
  730. fsubrp %st, %st(1)
  731. FLD -8 * SIZE(BO)
  732. fmul %st, %st(1)
  733. fmulp %st, %st(3)
  734. #endif
  /* LN walks C backwards: move CO down before storing. */
  735. #ifdef LN
  736. subq $2 * SIZE, CO
  737. #endif
  /* Write the four results back to the packed buffer and then to C. The fxch */
  /* sequence differs between the two variants, so the order of the C stores */
  /* differs as well; each store order matches its own fxch sequence. */
  738. #if defined(LN) || defined(LT)
  739. fld %st
  740. FST -8 * SIZE(BO)
  741. fxch %st(1)
  742. fld %st
  743. FST -7 * SIZE(BO)
  744. fxch %st(2)
  745. fld %st
  746. FST -6 * SIZE(BO)
  747. fxch %st(3)
  748. fld %st
  749. FST -5 * SIZE(BO)
  750. FST 1 * SIZE(CO, LDC)
  751. FST 0 * SIZE(CO)
  752. FST 0 * SIZE(CO, LDC)
  753. FST 1 * SIZE(CO)
  754. #else
  755. fld %st
  756. FST -8 * SIZE(AO)
  757. fxch %st(2)
  758. fld %st
  759. FST -7 * SIZE(AO)
  760. fxch %st(1)
  761. fld %st
  762. FST -6 * SIZE(AO)
  763. fxch %st(3)
  764. fld %st
  765. FST -5 * SIZE(AO)
  766. FST 1 * SIZE(CO, LDC)
  767. FST 1 * SIZE(CO)
  768. FST 0 * SIZE(CO)
  769. FST 0 * SIZE(CO, LDC)
  770. #endif
  771. #ifndef LN
  772. addq $2 * SIZE, CO
  773. #endif
  /* LT/RN: skip the K - KK elements this tile did not read. */
  774. #if defined(LT) || defined(RN)
  775. movq K, %rax
  776. subq KK, %rax
  777. salq $BASE_SHIFT, %rax
  778. leaq (AO, %rax, 2), AO
  779. leaq (BO, %rax, 2), BO
  780. #endif
  /* KK moves by the tile height (2) for the left-side variants. */
  781. #ifdef LN
  782. subq $2, KK
  783. #endif
  784. #ifdef LT
  785. addq $2, KK
  786. #endif
  /* RT: advance AORIG by 2 * K elements to the next tile's A data. */
  787. #ifdef RT
  788. movq K, %rax
  789. salq $1 + BASE_SHIFT, %rax
  790. addq %rax, AORIG
  791. #endif
  792. decq I
  793. jne .L11
  794. ALIGN_4
  /* .L20: leftover tile of 1 (M direction) x 2 (N direction), executed only */
  /* when M is odd; then .L29 finishes the column pair and loops on J. */
  795. .L20:
  796. movq M, %rax
  797. andq $1, %rax
  798. je .L29
  799. ALIGN_4
  800. .L21:
  /* LN: step AORIG back by K elements for this tile. */
  801. #ifdef LN
  802. movq K, %rax
  803. salq $0 + BASE_SHIFT, %rax
  804. subq %rax, AORIG
  805. #endif
  /* LN/RT: start at element offset KK (A stride 1 per k, B stride 2 per k). */
  806. #if defined(LN) || defined(RT)
  807. movq KK, %rax
  808. salq $BASE_SHIFT, %rax
  809. movq AORIG, AO
  810. leaq (AO, %rax, 1), AO
  811. leaq (B, %rax, 2), BO
  812. #else
  813. movq B, BO
  814. #endif
  /* Two zeroed accumulators. */
  815. fldz
  816. fldz
  /* Inner-product length: KK for LT/RN, K - KK for LN/RT. */
  817. #if defined(LT) || defined(RN)
  818. movq KK, %rax
  819. #else
  820. movq K, %rax
  821. subq KK, %rax
  822. #endif
  823. sarq $2, %rax
  824. je .L25
  825. ALIGN_4
  /* .L22: four k-steps per iteration. Each step loads one A value and two B */
  /* values; st(0) accumulates a * b0 and st(1) accumulates a * b1. */
  826. .L22:
  827. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  828. FLD -8 * SIZE(AO)
  829. FLD -8 * SIZE(BO)
  830. fmul %st(1), %st
  831. faddp %st, %st(2)
  832. FLD -7 * SIZE(BO)
  833. fmulp %st, %st(1)
  834. faddp %st, %st(2)
  835. FLD -7 * SIZE(AO)
  836. FLD -6 * SIZE(BO)
  837. fmul %st(1), %st
  838. faddp %st, %st(2)
  839. FLD -5 * SIZE(BO)
  840. fmulp %st, %st(1)
  841. faddp %st, %st(2)
  842. FLD -6 * SIZE(AO)
  843. FLD -4 * SIZE(BO)
  844. fmul %st(1), %st
  845. faddp %st, %st(2)
  846. FLD -3 * SIZE(BO)
  847. fmulp %st, %st(1)
  848. faddp %st, %st(2)
  849. FLD -5 * SIZE(AO)
  850. FLD -2 * SIZE(BO)
  851. fmul %st(1), %st
  852. faddp %st, %st(2)
  853. FLD -1 * SIZE(BO)
  854. fmulp %st, %st(1)
  855. faddp %st, %st(2)
  /* Consumed 4 A elements and 8 B elements. */
  856. addq $4 * SIZE,AO
  857. addq $8 * SIZE,BO
  858. decq %rax
  859. jne .L22
  860. ALIGN_4
  /* .L25: remainder (length mod 4) handled one k-step at a time in .L26. */
  861. .L25:
  862. #if defined(LT) || defined(RN)
  863. movq KK, %rax
  864. #else
  865. movq K, %rax
  866. subq KK, %rax
  867. #endif
  868. and $3, %rax
  869. je .L28
  870. ALIGN_4
  871. .L26:
  872. FLD -8 * SIZE(AO)
  873. FLD -8 * SIZE(BO)
  874. fmul %st(1), %st
  875. faddp %st, %st(2)
  876. FLD -7 * SIZE(BO)
  877. fmulp %st, %st(1)
  878. faddp %st, %st(2)
  879. addq $1 * SIZE,AO
  880. addq $2 * SIZE,BO
  881. decq %rax
  882. jne .L26
  883. ALIGN_4
  /* .L28: solve step. LN/RT re-point AO/BO at element offset KK - 1 (LN) or */
  /* KK - 2 (RT), using the same strides as at the top of the tile. */
  884. .L28:
  885. #if defined(LN) || defined(RT)
  886. movq KK, %rax
  887. #ifdef LN
  888. subq $1, %rax
  889. #else
  890. subq $2, %rax
  891. #endif
  892. salq $BASE_SHIFT, %rax
  893. movq AORIG, AO
  894. leaq (AO, %rax, 1), AO
  895. leaq (B, %rax, 2), BO
  896. #endif
  /* Combine the two packed right-hand side values with the accumulators. */
  897. #if defined(LN) || defined(LT)
  898. FLD -8 * SIZE(BO)
  899. fsubp %st, %st(1)
  900. FLD -7 * SIZE(BO)
  901. fsubp %st, %st(2)
  902. #else
  903. FLD -8 * SIZE(AO)
  904. fsubp %st, %st(1)
  905. FLD -7 * SIZE(AO)
  906. fsubp %st, %st(2)
  907. #endif
  /* LN and LT: both values are scaled by the single A entry at -8. */
  908. #if defined(LN) || defined(LT)
  909. FLD -8 * SIZE(AO)
  910. fmul %st, %st(1)
  911. fmulp %st, %st(2)
  912. #endif
  /* RN: uses the B entries at -8, -7 and -5 (in that order). */
  913. #ifdef RN
  914. FLD -8 * SIZE(BO)
  915. fmulp %st, %st(1)
  916. FLD -7 * SIZE(BO)
  917. fmul %st(1), %st
  918. fsubrp %st, %st(2)
  919. FLD -5 * SIZE(BO)
  920. fmulp %st, %st(2)
  921. #endif
  /* RT: uses the B entries at -5, -6 and -8 (in that order). */
  922. #ifdef RT
  923. FLD -5 * SIZE(BO)
  924. fmulp %st, %st(2)
  925. FLD -6 * SIZE(BO)
  926. fmul %st(2), %st
  927. fsubrp %st, %st(1)
  928. FLD -8 * SIZE(BO)
  929. fmulp %st, %st(1)
  930. #endif
  931. #ifdef LN
  932. subq $1 * SIZE, CO
  933. #endif
  /* Write both results back to the packed buffer (duplicates are kept for C). */
  934. #if defined(LN) || defined(LT)
  935. fld %st
  936. FST -8 * SIZE(BO)
  937. fxch %st(1)
  938. fld %st
  939. FST -7 * SIZE(BO)
  940. #else
  941. fld %st
  942. FST -8 * SIZE(AO)
  943. fxch %st(1)
  944. fld %st
  945. FST -7 * SIZE(AO)
  946. #endif
  /* Store to C; the fxch above left the values in reverse order, hence the */
  /* CO + LDC element is written before the CO element. */
  947. FST 0 * SIZE(CO, LDC)
  948. FST 0 * SIZE(CO)
  949. #ifndef LN
  950. addq $1 * SIZE, CO
  951. #endif
  /* LT/RN: skip the K - KK elements this tile did not read. */
  952. #if defined(LT) || defined(RN)
  953. movq K, %rax
  954. subq KK, %rax
  955. salq $BASE_SHIFT, %rax
  956. leaq (AO, %rax, 1), AO
  957. leaq (BO, %rax, 2), BO
  958. #endif
  959. #ifdef LN
  960. subq $1, KK
  961. #endif
  962. #ifdef LT
  963. addq $1, KK
  964. #endif
  965. #ifdef RT
  966. movq K, %rax
  967. salq $0 + BASE_SHIFT, %rax
  968. addq %rax, AORIG
  969. #endif
  970. ALIGN_4
  /* .L29: end of this column pair. LN advances B by 2 * K elements; LT/RN take */
  /* B from the final BO; RN/RT move KK by the pair width (2); then loop on J. */
  971. .L29:
  972. #ifdef LN
  973. movq K, %rax
  974. salq $BASE_SHIFT, %rax
  975. leaq (B, %rax, 2), B
  976. #endif
  977. #if defined(LT) || defined(RN)
  978. movq BO, B
  979. #endif
  980. #ifdef RN
  981. addq $2, KK
  982. #endif
  983. #ifdef RT
  984. subq $2, KK
  985. #endif
  986. decq J
  987. jne .L01
  988. ALIGN_4
  /* .L999: common exit. Restore the six registers saved at entry from their */
  /* frame slots, release the STACKSIZE-byte frame, and return. */
  989. .L999:
  990. movq 0(%rsp), %rbx
  991. movq 8(%rsp), %rbp
  992. movq 16(%rsp), %r12
  993. movq 24(%rsp), %r13
  994. movq 32(%rsp), %r14
  995. movq 40(%rsp), %r15
  996. addq $STACKSIZE, %rsp
  997. ret
  998. EPILOGUE