You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qtrsm_kernel_LT_2x2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Symbolic names for the kernel's arguments, working registers and stack slots. */
  /* ARGn, SIZE, BASE_SHIFT, FLD, FST, ALIGN_4, PROLOGUE, PROFCODE and EPILOGUE are not defined in this file (presumably they come from common.h). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Kernel arguments, mapped onto the ARGn names. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  43. #define A ARG4
  44. #define B ARG5
  45. #define C ARG6
  /* Working registers. */
  /* LDC: loaded from the stack at entry and scaled by 2^BASE_SHIFT. */
  46. #define LDC %r10
  /* I: tile counter, initialised from M >> 1. */
  47. #define I %r12
  /* J: panel counter, initialised from N >> 1. */
  48. #define J %r13
  /* AO / BO: running pointers into the A and B buffers. */
  49. #define AO %r14
  50. #define BO %r15
  /* CO: pointer to the current tile of C. */
  51. #define CO %rbp
  /* KK: running offset that selects the loop trip count and the buffer positions used by the solve step. */
  52. #define KK %r11
  /* AORIG: stack slot inside the STACKSIZE-byte frame (offsets 0-40 hold the saved registers). */
  53. #define AORIG 48(%rsp)
  54. #define STACKSIZE 64
  /* Stack-passed values, addressed relative to %rsp after the frame has been allocated. */
  /* ALPHA is defined here but not referenced by the visible code. */
  55. #define ALPHA 8 + STACKSIZE(%rsp)
  56. #define OFFSET 32 + STACKSIZE(%rsp)
  /* Prefetch instruction selection; the PREFETCHW name is not used by the visible code, which spells out prefetchw / prefetchnta directly. */
  57. #ifdef OPTERON
  58. #define PREFETCH prefetch
  59. #define PREFETCHW prefetchw
  60. #else
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #endif
  /* Prefetch distance, in elements, ahead of AO inside the unrolled loops. */
  64. #define PREFETCHSIZE (5 + 4 * 10)
  /* Kernel entry: allocate the frame, save registers, and derive the starting pointers and KK for the variant selected at build time (LN, LT, RN or RT). */
  65. PROLOGUE
  66. PROFCODE
  /* Reserve STACKSIZE bytes and save %rbx, %rbp and %r12-%r15 at offsets 0-40; .L999 restores them. */
  /* %rbx is saved and restored although the visible code never modifies it. */
  67. subq $STACKSIZE, %rsp
  68. movq %rbx, 0(%rsp)
  69. movq %rbp, 8(%rsp)
  70. movq %r12, 16(%rsp)
  71. movq %r13, 24(%rsp)
  72. movq %r14, 32(%rsp)
  73. movq %r15, 40(%rsp)
  /* LDC is read from the caller's stack area, at offset 24 from the stack pointer value on entry. */
  74. movq 24 + STACKSIZE(%rsp), LDC
  75. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* KK = -OFFSET */
  76. movq OFFSET, %rax
  77. negq %rax
  78. movq %rax, KK
  79. #endif
  /* Bias A and B by 8 elements; the loads in the tile code compensate by addressing from -8 * SIZE. */
  80. addq $8 * SIZE, A
  81. addq $8 * SIZE, B
  /* Scale LDC by 2^BASE_SHIFT (presumably elements to bytes; confirm against the BASE_SHIFT definition). */
  82. salq $BASE_SHIFT, LDC
  83. #ifdef LN
  /* LN: advance C by M << BASE_SHIFT and A by (M << BASE_SHIFT) * K, so the tile code can walk backwards. */
  84. movq M, %rax
  85. salq $BASE_SHIFT, %rax
  86. addq %rax, C
  87. imulq K, %rax
  88. addq %rax, A
  89. #endif
  90. #ifdef RT
  /* RT: advance B by (N << BASE_SHIFT) * K and C by N * LDC, so the panel code can walk backwards. */
  91. movq N, %rax
  92. salq $BASE_SHIFT, %rax
  93. imulq K, %rax
  94. addq %rax, B
  95. movq N, %rax
  96. imulq LDC, %rax
  97. addq %rax, C
  98. #endif
  99. #ifdef RN
  /* RN: KK = -OFFSET */
  100. movq OFFSET, %rax
  101. negq %rax
  102. movq %rax, KK
  103. #endif
  104. #ifdef RT
  /* RT: KK = N - OFFSET */
  105. movq N, %rax
  106. subq OFFSET, %rax
  107. movq %rax, KK
  108. #endif
  /* J = N >> 1 (number of two-column panels). movq does not alter flags, so je tests the sarq result and skips to .L30 when there are none. */
  109. movq N, %rax
  110. sarq $1, %rax
  111. movq %rax, J
  112. je .L30
  113. ALIGN_4
  /* Head of the loop over two-column panels (J iterations): set up the A, B and C pointers and KK for one panel. */
  114. .L01:
  115. #if defined(LT) || defined(RN)
  116. movq A, AO
  117. #else
  /* LN/RT: keep the A base in the AORIG stack slot; AO is recomputed from it for every tile. */
  118. movq A, %rax
  119. movq %rax, AORIG
  120. #endif
  121. #ifdef RT
  /* RT: step B back by K << (1 + BASE_SHIFT), i.e. two columns of K elements. */
  122. movq K, %rax
  123. salq $1 + BASE_SHIFT, %rax
  124. subq %rax, B
  125. #endif
  /* %rax = 2 * LDC, the stride of a two-column step through C. */
  126. lea (, LDC, 2), %rax
  127. #ifdef RT
  /* RT: move C back before taking CO. */
  128. subq %rax, C
  129. #endif
  130. movq C, CO
  131. #ifndef RT
  /* Otherwise CO takes the current C and C moves forward for the next panel. */
  132. addq %rax, C
  133. #endif
  134. #ifdef LN
  /* LN: KK = OFFSET + M */
  135. movq OFFSET, %rax
  136. addq M, %rax
  137. movq %rax, KK
  138. #endif
  139. #ifdef LT
  /* LT: KK = OFFSET */
  140. movq OFFSET, %rax
  141. movq %rax, KK
  142. #endif
  /* I = M >> 1 (number of two-row tiles); skip to .L20 when there are none. */
  143. movq M, I
  144. sarq $1, I
  145. je .L20
  146. ALIGN_4
  /* 2x2 tile: accumulate products over the k dimension on the x87 stack, apply the triangular solve for the selected variant, then store to the packed buffer and to C. Repeats I times. */
  147. .L11:
  148. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT), i.e. two rows of K elements. */
  149. movq K, %rax
  150. salq $1 + BASE_SHIFT, %rax
  151. subq %rax, AORIG
  152. #endif
  153. #if defined(LN) || defined(RT)
  /* LN/RT: start both streams KK steps in (two elements per step). */
  154. movq KK, %rax
  155. salq $BASE_SHIFT, %rax
  156. movq AORIG, AO
  157. leaq (AO, %rax, 2), AO
  158. leaq (B, %rax, 2), BO
  159. #else
  160. movq B, BO
  161. #endif
  /* Four zeroed accumulators in st(0)..st(3). */
  162. fldz
  163. fldz
  164. fldz
  165. fldz
  /* Prefetch the two C columns touched by this tile. */
  166. #if defined(HAVE_3DNOW)
  167. prefetchw 2 * SIZE(CO)
  168. prefetchw 2 * SIZE(CO, LDC, 1)
  169. #elif defined(HAVE_SSE)
  170. prefetchnta 2 * SIZE(CO)
  171. prefetchnta 2 * SIZE(CO, LDC, 1)
  172. #endif
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  173. #if defined(LT) || defined(RN)
  174. movq KK, %rax
  175. #else
  176. movq K, %rax
  177. subq KK, %rax
  178. #endif
  /* Number of passes through the 4x unrolled loop; skip it when zero. */
  179. sarq $2, %rax
  180. je .L15
  181. ALIGN_4
  /* Unrolled loop: four k-steps per pass. Each step loads a0, a1 from AO and b0, b1 from BO and performs */
  /* st(0) += a0*b0, st(1) += a0*b1, st(2) += a1*b0, st(3) += a1*b1. */
  /* At its deepest point a step occupies all eight x87 registers (4 accumulators + 4 temporaries). */
  182. .L12:
  183. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* k-step 0 */
  184. FLD -8 * SIZE(AO)
  185. FLD -8 * SIZE(BO)
  186. fld %st(1)
  187. fmul %st(1), %st
  188. faddp %st, %st(3)
  189. FLD -7 * SIZE(BO)
  190. fmul %st, %st(2)
  191. FLD -7 * SIZE(AO)
  192. fmul %st, %st(2)
  193. fmulp %st, %st(1)
  194. faddp %st, %st(6)
  195. faddp %st, %st(4)
  196. faddp %st, %st(2)
  /* k-step 1 */
  197. FLD -6 * SIZE(AO)
  198. FLD -6 * SIZE(BO)
  199. fld %st(1)
  200. fmul %st(1), %st
  201. faddp %st, %st(3)
  202. FLD -5 * SIZE(BO)
  203. fmul %st, %st(2)
  204. FLD -5 * SIZE(AO)
  205. fmul %st, %st(2)
  206. fmulp %st, %st(1)
  207. faddp %st, %st(6)
  208. faddp %st, %st(4)
  209. faddp %st, %st(2)
  210. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  /* k-step 2 */
  211. FLD -4 * SIZE(AO)
  212. FLD -4 * SIZE(BO)
  213. fld %st(1)
  214. fmul %st(1), %st
  215. faddp %st, %st(3)
  216. FLD -3 * SIZE(BO)
  217. fmul %st, %st(2)
  218. FLD -3 * SIZE(AO)
  219. fmul %st, %st(2)
  220. fmulp %st, %st(1)
  221. faddp %st, %st(6)
  222. faddp %st, %st(4)
  223. faddp %st, %st(2)
  /* k-step 3 */
  224. FLD -2 * SIZE(AO)
  225. FLD -2 * SIZE(BO)
  226. fld %st(1)
  227. fmul %st(1), %st
  228. faddp %st, %st(3)
  229. FLD -1 * SIZE(BO)
  230. fmul %st, %st(2)
  231. FLD -1 * SIZE(AO)
  232. fmul %st, %st(2)
  233. fmulp %st, %st(1)
  234. faddp %st, %st(6)
  235. faddp %st, %st(4)
  236. faddp %st, %st(2)
  /* Four steps consumed eight elements from each stream. */
  237. addq $8 * SIZE,AO
  238. addq $8 * SIZE,BO
  239. decq %rax
  240. jne .L12
  241. ALIGN_4
  /* Remainder: recompute the trip count and keep its low two bits. */
  242. .L15:
  243. #if defined(LT) || defined(RN)
  244. movq KK, %rax
  245. #else
  246. movq K, %rax
  247. subq KK, %rax
  248. #endif
  249. and $3, %rax
  250. je .L18
  251. ALIGN_4
  /* One k-step per pass, same accumulator update as in .L12. */
  252. .L16:
  253. FLD -8 * SIZE(AO)
  254. FLD -8 * SIZE(BO)
  255. fld %st(1)
  256. fmul %st(1), %st
  257. faddp %st, %st(3)
  258. FLD -7 * SIZE(BO)
  259. fmul %st, %st(2)
  260. FLD -7 * SIZE(AO)
  261. fmul %st, %st(2)
  262. fmulp %st, %st(1)
  263. faddp %st, %st(6)
  264. faddp %st, %st(4)
  265. faddp %st, %st(2)
  266. addq $2 * SIZE,AO
  267. addq $2 * SIZE,BO
  268. decq %rax
  269. jne .L16
  270. ALIGN_4
  /* Solve step. For LN/RT, first re-point AO and BO at position KK - 2 of their streams. */
  271. .L18:
  272. #if defined(LN) || defined(RT)
  273. movq KK, %rax
  /* Both arms of this conditional subtract 2 (the tile is 2x2). */
  274. #ifdef LN
  275. subq $2, %rax
  276. #else
  277. subq $2, %rax
  278. #endif
  279. salq $BASE_SHIFT, %rax
  280. movq AORIG, AO
  281. leaq (AO, %rax, 2), AO
  282. leaq (B, %rax, 2), BO
  283. #endif
  /* Combine each accumulator with the matching stored value by subtraction: LN/LT read from BO, RN/RT read from AO. */
  /* Note the RN/RT arm pairs the loads with st(1), st(3), st(2), st(4), a different order from the LN/LT arm. */
  284. #if defined(LN) || defined(LT)
  285. FLD -8 * SIZE(BO)
  286. fsubp %st, %st(1)
  287. FLD -7 * SIZE(BO)
  288. fsubp %st, %st(2)
  289. FLD -6 * SIZE(BO)
  290. fsubp %st, %st(3)
  291. FLD -5 * SIZE(BO)
  292. fsubp %st, %st(4)
  293. #else
  294. FLD -8 * SIZE(AO)
  295. fsubp %st, %st(1)
  296. FLD -7 * SIZE(AO)
  297. fsubp %st, %st(3)
  298. FLD -6 * SIZE(AO)
  299. fsubp %st, %st(2)
  300. FLD -5 * SIZE(AO)
  301. fsubp %st, %st(4)
  302. #endif
  /* 2x2 triangular solve. Each variant scales by two coefficients and eliminates with one off-diagonal coefficient. */
  /* No division is issued, so the scaling coefficients are presumably stored pre-inverted - confirm against the packing routine. */
  303. #ifdef LN
  /* LN: coefficients at AO -5, -6, -8 (last row first). */
  304. FLD -5 * SIZE(AO)
  305. fmul %st, %st(3)
  306. fmulp %st, %st(4)
  307. FLD -6 * SIZE(AO)
  308. fmul %st(3), %st
  309. FLD -6 * SIZE(AO)
  310. fmul %st(5), %st
  311. fsubrp %st, %st(3)
  312. fsubrp %st, %st(1)
  313. FLD -8 * SIZE(AO)
  314. fmul %st, %st(1)
  315. fmulp %st, %st(2)
  316. #endif
  317. #ifdef LT
  /* LT: coefficients at AO -8, -7, -5 (first row first). */
  318. FLD -8 * SIZE(AO)
  319. fmul %st, %st(1)
  320. fmulp %st, %st(2)
  321. FLD -7 * SIZE(AO)
  322. fmul %st(1), %st
  323. FLD -7 * SIZE(AO)
  324. fmul %st(3), %st
  325. fsubrp %st, %st(5)
  326. fsubrp %st, %st(3)
  327. FLD -5 * SIZE(AO)
  328. fmul %st, %st(3)
  329. fmulp %st, %st(4)
  330. #endif
  331. #ifdef RN
  /* RN: coefficients at BO -8, -7, -5. */
  332. FLD -8 * SIZE(BO)
  333. fmul %st, %st(1)
  334. fmulp %st, %st(3)
  335. FLD -7 * SIZE(BO)
  336. fmul %st(1), %st
  337. FLD -7 * SIZE(BO)
  338. fmul %st(4), %st
  339. fsubrp %st, %st(5)
  340. fsubrp %st, %st(2)
  341. FLD -5 * SIZE(BO)
  342. fmul %st, %st(2)
  343. fmulp %st, %st(4)
  344. #endif
  345. #ifdef RT
  /* RT: coefficients at BO -5, -6, -8. */
  346. FLD -5 * SIZE(BO)
  347. fmul %st, %st(2)
  348. fmulp %st, %st(4)
  349. FLD -6 * SIZE(BO)
  350. fmul %st(2), %st
  351. FLD -6 * SIZE(BO)
  352. fmul %st(5), %st
  353. fsubrp %st, %st(4)
  354. fsubrp %st, %st(1)
  355. FLD -8 * SIZE(BO)
  356. fmul %st, %st(1)
  357. fmulp %st, %st(3)
  358. #endif
  359. #ifdef LN
  /* LN walks C backwards: move CO back by two elements before storing. */
  360. subq $2 * SIZE, CO
  361. #endif
  /* Store phase. Each result is duplicated with fld %st before being written to the packed buffer, so a copy stays on the stack */
  /* for the C stores that follow (FST is presumably a popping store - confirm in common.h). */
  /* LN/LT write back to BO; RN/RT write back to AO. The four final FSTs fill the 2x2 tile at CO and CO + LDC. */
  362. #if defined(LN) || defined(LT)
  363. fld %st
  364. FST -8 * SIZE(BO)
  365. fxch %st(1)
  366. fld %st
  367. FST -7 * SIZE(BO)
  368. fxch %st(2)
  369. fld %st
  370. FST -6 * SIZE(BO)
  371. fxch %st(3)
  372. fld %st
  373. FST -5 * SIZE(BO)
  374. FST 1 * SIZE(CO, LDC)
  375. FST 0 * SIZE(CO)
  376. FST 0 * SIZE(CO, LDC)
  377. FST 1 * SIZE(CO)
  378. #else
  379. fld %st
  380. FST -8 * SIZE(AO)
  381. fxch %st(2)
  382. fld %st
  383. FST -7 * SIZE(AO)
  384. fxch %st(1)
  385. fld %st
  386. FST -6 * SIZE(AO)
  387. fxch %st(3)
  388. fld %st
  389. FST -5 * SIZE(AO)
  390. FST 1 * SIZE(CO, LDC)
  391. FST 1 * SIZE(CO)
  392. FST 0 * SIZE(CO)
  393. FST 0 * SIZE(CO, LDC)
  394. #endif
  395. #ifndef LN
  /* All other variants walk C forwards. */
  396. addq $2 * SIZE, CO
  397. #endif
  398. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps of both streams to reach the next tile's data. */
  399. movq K, %rax
  400. subq KK, %rax
  401. salq $BASE_SHIFT, %rax
  402. leaq (AO, %rax, 2), AO
  403. leaq (BO, %rax, 2), BO
  404. #endif
  /* KK moves by the tile height for the left-side variants. */
  405. #ifdef LN
  406. subq $2, KK
  407. #endif
  408. #ifdef LT
  409. addq $2, KK
  410. #endif
  411. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) to the next two-row block of A. */
  412. movq K, %rax
  413. salq $1 + BASE_SHIFT, %rax
  414. addq %rax, AORIG
  415. #endif
  416. decq I
  417. jne .L11
  418. ALIGN_4
  /* 1x2 tile: handles the leftover row when M is odd, for the current two-column panel. Runs at most once per panel. */
  419. .L20:
  420. movq M, %rax
  421. andq $1, %rax
  422. je .L29
  423. ALIGN_4
  424. .L21:
  425. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT, i.e. one row of K elements. */
  426. movq K, %rax
  427. salq $0 + BASE_SHIFT, %rax
  428. subq %rax, AORIG
  429. #endif
  430. #if defined(LN) || defined(RT)
  /* LN/RT: start KK steps in; the A stream has one element per step, the B stream two. */
  431. movq KK, %rax
  432. salq $BASE_SHIFT, %rax
  433. movq AORIG, AO
  434. leaq (AO, %rax, 1), AO
  435. leaq (B, %rax, 2), BO
  436. #else
  437. movq B, BO
  438. #endif
  /* Two zeroed accumulators in st(0) and st(1). */
  439. fldz
  440. fldz
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  441. #if defined(LT) || defined(RN)
  442. movq KK, %rax
  443. #else
  444. movq K, %rax
  445. subq KK, %rax
  446. #endif
  447. sarq $2, %rax
  448. je .L25
  449. ALIGN_4
  /* Unrolled loop: four k-steps per pass. Each step loads one a from AO and b0, b1 from BO and performs st(0) += a*b0, st(1) += a*b1. */
  450. .L22:
  451. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  452. FLD -8 * SIZE(AO)
  453. FLD -8 * SIZE(BO)
  454. fmul %st(1), %st
  455. faddp %st, %st(2)
  456. FLD -7 * SIZE(BO)
  457. fmulp %st, %st(1)
  458. faddp %st, %st(2)
  459. FLD -7 * SIZE(AO)
  460. FLD -6 * SIZE(BO)
  461. fmul %st(1), %st
  462. faddp %st, %st(2)
  463. FLD -5 * SIZE(BO)
  464. fmulp %st, %st(1)
  465. faddp %st, %st(2)
  466. FLD -6 * SIZE(AO)
  467. FLD -4 * SIZE(BO)
  468. fmul %st(1), %st
  469. faddp %st, %st(2)
  470. FLD -3 * SIZE(BO)
  471. fmulp %st, %st(1)
  472. faddp %st, %st(2)
  473. FLD -5 * SIZE(AO)
  474. FLD -2 * SIZE(BO)
  475. fmul %st(1), %st
  476. faddp %st, %st(2)
  477. FLD -1 * SIZE(BO)
  478. fmulp %st, %st(1)
  479. faddp %st, %st(2)
  /* Four steps consumed four A elements and eight B elements. */
  480. addq $4 * SIZE,AO
  481. addq $8 * SIZE,BO
  482. decq %rax
  483. jne .L22
  484. ALIGN_4
  /* Remainder: low two bits of the trip count. */
  485. .L25:
  486. #if defined(LT) || defined(RN)
  487. movq KK, %rax
  488. #else
  489. movq K, %rax
  490. subq KK, %rax
  491. #endif
  492. and $3, %rax
  493. je .L28
  494. ALIGN_4
  /* One k-step per pass. */
  495. .L26:
  496. FLD -8 * SIZE(AO)
  497. FLD -8 * SIZE(BO)
  498. fmul %st(1), %st
  499. faddp %st, %st(2)
  500. FLD -7 * SIZE(BO)
  501. fmulp %st, %st(1)
  502. faddp %st, %st(2)
  503. addq $1 * SIZE,AO
  504. addq $2 * SIZE,BO
  505. decq %rax
  506. jne .L26
  507. ALIGN_4
  /* Solve step. For LN/RT, re-point AO and BO: LN steps back by the tile height (1), RT by the tile width (2). */
  508. .L28:
  509. #if defined(LN) || defined(RT)
  510. movq KK, %rax
  511. #ifdef LN
  512. subq $1, %rax
  513. #else
  514. subq $2, %rax
  515. #endif
  516. salq $BASE_SHIFT, %rax
  517. movq AORIG, AO
  518. leaq (AO, %rax, 1), AO
  519. leaq (B, %rax, 2), BO
  520. #endif
  /* Combine both accumulators with the stored values by subtraction: LN/LT read from BO, RN/RT read from AO. */
  521. #if defined(LN) || defined(LT)
  522. FLD -8 * SIZE(BO)
  523. fsubp %st, %st(1)
  524. FLD -7 * SIZE(BO)
  525. fsubp %st, %st(2)
  526. #else
  527. FLD -8 * SIZE(AO)
  528. fsubp %st, %st(1)
  529. FLD -7 * SIZE(AO)
  530. fsubp %st, %st(2)
  531. #endif
  /* Left-side variants: the A block is 1x1, so both values are scaled by the single coefficient at AO -8. */
  /* No division is issued, so that coefficient is presumably stored pre-inverted - confirm against the packing routine. */
  532. #if defined(LN) || defined(LT)
  533. FLD -8 * SIZE(AO)
  534. fmul %st, %st(1)
  535. fmulp %st, %st(2)
  536. #endif
  537. #ifdef RN
  /* RN: 2x2 solve against B using the coefficients at BO -8, -7, -5. */
  538. FLD -8 * SIZE(BO)
  539. fmulp %st, %st(1)
  540. FLD -7 * SIZE(BO)
  541. fmul %st(1), %st
  542. fsubrp %st, %st(2)
  543. FLD -5 * SIZE(BO)
  544. fmulp %st, %st(2)
  545. #endif
  546. #ifdef RT
  /* RT: 2x2 solve against B using the coefficients at BO -5, -6, -8. */
  547. FLD -5 * SIZE(BO)
  548. fmulp %st, %st(2)
  549. FLD -6 * SIZE(BO)
  550. fmul %st(2), %st
  551. fsubrp %st, %st(1)
  552. FLD -8 * SIZE(BO)
  553. fmulp %st, %st(1)
  554. #endif
  555. #ifdef LN
  /* LN walks C backwards: move CO back by one element before storing. */
  556. subq $1 * SIZE, CO
  557. #endif
  /* Store phase: duplicate each result, write one copy to the packed buffer (BO for LN/LT, AO for RN/RT), */
  /* then write the remaining copies to C at CO + LDC and CO (FST is presumably a popping store - confirm in common.h). */
  558. #if defined(LN) || defined(LT)
  559. fld %st
  560. FST -8 * SIZE(BO)
  561. fxch %st(1)
  562. fld %st
  563. FST -7 * SIZE(BO)
  564. #else
  565. fld %st
  566. FST -8 * SIZE(AO)
  567. fxch %st(1)
  568. fld %st
  569. FST -7 * SIZE(AO)
  570. #endif
  571. FST 0 * SIZE(CO, LDC)
  572. FST 0 * SIZE(CO)
  573. #ifndef LN
  574. addq $1 * SIZE, CO
  575. #endif
  576. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps of both streams. */
  577. movq K, %rax
  578. subq KK, %rax
  579. salq $BASE_SHIFT, %rax
  580. leaq (AO, %rax, 1), AO
  581. leaq (BO, %rax, 2), BO
  582. #endif
  /* KK moves by the tile height (1) for the left-side variants. */
  583. #ifdef LN
  584. subq $1, KK
  585. #endif
  586. #ifdef LT
  587. addq $1, KK
  588. #endif
  589. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT, one row of A. */
  590. movq K, %rax
  591. salq $0 + BASE_SHIFT, %rax
  592. addq %rax, AORIG
  593. #endif
  594. ALIGN_4
  /* End of one two-column panel: move B and KK on to the next panel and loop while J is non-zero. */
  595. .L29:
  596. #ifdef LN
  /* LN: advance B by two columns of K elements. */
  597. movq K, %rax
  598. salq $BASE_SHIFT, %rax
  599. leaq (B, %rax, 2), B
  600. #endif
  601. #if defined(LT) || defined(RN)
  /* LT/RN: BO was advanced past this panel's data by the tile code, so it becomes the new B. */
  602. movq BO, B
  603. #endif
  /* KK moves by the panel width (2) for the right-side variants. */
  604. #ifdef RN
  605. addq $2, KK
  606. #endif
  607. #ifdef RT
  608. subq $2, KK
  609. #endif
  610. decq J
  611. jne .L01
  612. ALIGN_4
  /* Leftover single column when N is odd: set up the A, B and C pointers and KK for a one-column panel, otherwise go straight to the epilogue. */
  613. .L30:
  614. movq N, %rax
  615. testq $1, %rax
  616. je .L999
  617. #if defined(LT) || defined(RN)
  618. movq A, AO
  619. #else
  /* LN/RT: keep the A base in the AORIG stack slot; AO is recomputed from it for every tile. */
  620. movq A, %rax
  621. movq %rax, AORIG
  622. #endif
  623. #ifdef RT
  /* RT: step B back by K << BASE_SHIFT, i.e. one column of K elements. */
  624. movq K, %rax
  625. salq $0 + BASE_SHIFT, %rax
  626. subq %rax, B
  627. #endif
  628. #ifdef RT
  /* RT: move C back by one column before taking CO. */
  629. subq LDC, C
  630. #endif
  631. movq C, CO
  632. #ifndef RT
  /* Otherwise CO takes the current C and C moves forward by one column. */
  633. addq LDC, C
  634. #endif
  635. #ifdef LN
  /* LN: KK = OFFSET + M */
  636. movq OFFSET, %rax
  637. addq M, %rax
  638. movq %rax, KK
  639. #endif
  640. #ifdef LT
  /* LT: KK = OFFSET */
  641. movq OFFSET, %rax
  642. movq %rax, KK
  643. #endif
  /* I = M >> 1 (number of two-row tiles); skip to .L40 when there are none. */
  644. movq M, I
  645. sarq $1, I
  646. je .L40
  647. ALIGN_4
  /* 2x1 tile of the single-column panel: accumulate, solve, store. Repeats I times. */
  648. .L31:
  649. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT), i.e. two rows of K elements. */
  650. movq K, %rax
  651. salq $1 + BASE_SHIFT, %rax
  652. subq %rax, AORIG
  653. #endif
  654. #if defined(LN) || defined(RT)
  /* LN/RT: start KK steps in; the A stream has two elements per step, the B stream one. */
  655. movq KK, %rax
  656. salq $BASE_SHIFT, %rax
  657. movq AORIG, AO
  658. leaq (AO, %rax, 2), AO
  659. leaq (B, %rax, 1), BO
  660. #else
  661. movq B, BO
  662. #endif
  /* Two zeroed accumulators in st(0) and st(1). */
  663. fldz
  664. fldz
  665. #if defined(HAVE_3DNOW)
  666. prefetchw 2 * SIZE(CO)
  667. #elif defined(HAVE_SSE)
  668. prefetchnta 2 * SIZE(CO)
  669. #endif
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  670. #if defined(LT) || defined(RN)
  671. movq KK, %rax
  672. #else
  673. movq K, %rax
  674. subq KK, %rax
  675. #endif
  676. sarq $2, %rax
  677. je .L35
  678. ALIGN_4
  /* Unrolled loop: four k-steps per pass. Each step loads one b from BO and a0, a1 from AO and performs st(0) += a0*b, st(1) += a1*b. */
  679. .L32:
  680. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  681. FLD -8 * SIZE(BO)
  682. FLD -8 * SIZE(AO)
  683. fmul %st(1), %st
  684. faddp %st, %st(2)
  685. FLD -7 * SIZE(AO)
  686. fmulp %st, %st(1)
  687. faddp %st, %st(2)
  688. FLD -7 * SIZE(BO)
  689. FLD -6 * SIZE(AO)
  690. fmul %st(1), %st
  691. faddp %st, %st(2)
  692. FLD -5 * SIZE(AO)
  693. fmulp %st, %st(1)
  694. faddp %st, %st(2)
  695. FLD -6 * SIZE(BO)
  696. FLD -4 * SIZE(AO)
  697. fmul %st(1), %st
  698. faddp %st, %st(2)
  699. FLD -3 * SIZE(AO)
  700. fmulp %st, %st(1)
  701. faddp %st, %st(2)
  702. FLD -5 * SIZE(BO)
  703. FLD -2 * SIZE(AO)
  704. fmul %st(1), %st
  705. faddp %st, %st(2)
  706. FLD -1 * SIZE(AO)
  707. fmulp %st, %st(1)
  708. faddp %st, %st(2)
  /* Four steps consumed eight A elements and four B elements. */
  709. addq $8 * SIZE,AO
  710. addq $4 * SIZE,BO
  711. decq %rax
  712. jne .L32
  713. ALIGN_4
  /* Remainder: low two bits of the trip count. */
  714. .L35:
  715. #if defined(LT) || defined(RN)
  716. movq KK, %rax
  717. #else
  718. movq K, %rax
  719. subq KK, %rax
  720. #endif
  721. and $3, %rax
  722. je .L38
  723. ALIGN_4
  /* One k-step per pass. */
  724. .L36:
  725. FLD -8 * SIZE(BO)
  726. FLD -8 * SIZE(AO)
  727. fmul %st(1), %st
  728. faddp %st, %st(2)
  729. FLD -7 * SIZE(AO)
  730. fmulp %st, %st(1)
  731. faddp %st, %st(2)
  732. addq $2 * SIZE,AO
  733. addq $1 * SIZE,BO
  734. decq %rax
  735. jne .L36
  736. ALIGN_4
  /* Solve step. For LN/RT, re-point AO and BO: LN steps back by the tile height (2), RT by the tile width (1). */
  737. .L38:
  738. #if defined(LN) || defined(RT)
  739. movq KK, %rax
  740. #ifdef LN
  741. subq $2, %rax
  742. #else
  743. subq $1, %rax
  744. #endif
  745. salq $BASE_SHIFT, %rax
  746. movq AORIG, AO
  747. leaq (AO, %rax, 2), AO
  748. leaq (B, %rax, 1), BO
  749. #endif
  /* Combine both accumulators with the stored values by subtraction: LN/LT read from BO, RN/RT read from AO. */
  750. #if defined(LN) || defined(LT)
  751. FLD -8 * SIZE(BO)
  752. fsubp %st, %st(1)
  753. FLD -7 * SIZE(BO)
  754. fsubp %st, %st(2)
  755. #else
  756. FLD -8 * SIZE(AO)
  757. fsubp %st, %st(1)
  758. FLD -7 * SIZE(AO)
  759. fsubp %st, %st(2)
  760. #endif
  /* No division is issued below, so the scaling coefficients are presumably stored pre-inverted - confirm against the packing routine. */
  761. #ifdef LN
  /* LN: 2x2 solve against A using the coefficients at AO -5, -6, -8. */
  762. FLD -5 * SIZE(AO)
  763. fmulp %st, %st(2)
  764. FLD -6 * SIZE(AO)
  765. fmul %st(2), %st
  766. fsubrp %st, %st(1)
  767. FLD -8 * SIZE(AO)
  768. fmulp %st, %st(1)
  769. #endif
  770. #ifdef LT
  /* LT: 2x2 solve against A using the coefficients at AO -8, -7, -5. */
  771. FLD -8 * SIZE(AO)
  772. fmulp %st, %st(1)
  773. FLD -7 * SIZE(AO)
  774. fmul %st(1), %st
  775. fsubrp %st, %st(2)
  776. FLD -5 * SIZE(AO)
  777. fmulp %st, %st(2)
  778. #endif
  /* Right-side variants: the B block is 1x1, so both values are scaled by the single coefficient at BO -8. */
  779. #ifdef RN
  780. FLD -8 * SIZE(BO)
  781. fmul %st, %st(1)
  782. fmulp %st, %st(2)
  783. #endif
  784. #ifdef RT
  785. FLD -8 * SIZE(BO)
  786. fmul %st, %st(1)
  787. fmulp %st, %st(2)
  788. #endif
  789. #ifdef LN
  /* LN walks C backwards: move CO back by two elements before storing. */
  790. subq $2 * SIZE, CO
  791. #endif
  /* Store phase: duplicate each result, write one copy to the packed buffer (BO for LN/LT, AO for RN/RT), */
  /* then write the remaining copies to C at CO + 1 and CO (FST is presumably a popping store - confirm in common.h). */
  792. #if defined(LN) || defined(LT)
  793. fld %st
  794. FST -8 * SIZE(BO)
  795. fxch %st(1)
  796. fld %st
  797. FST -7 * SIZE(BO)
  798. #else
  799. fld %st
  800. FST -8 * SIZE(AO)
  801. fxch %st(1)
  802. fld %st
  803. FST -7 * SIZE(AO)
  804. #endif
  805. FST 1 * SIZE(CO)
  806. FST 0 * SIZE(CO)
  807. #ifndef LN
  808. addq $2 * SIZE, CO
  809. #endif
  810. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps of both streams. */
  811. movq K, %rax
  812. subq KK, %rax
  813. salq $BASE_SHIFT, %rax
  814. leaq (AO, %rax, 2), AO
  815. leaq (BO, %rax, 1), BO
  816. #endif
  /* KK moves by the tile height (2) for the left-side variants. */
  817. #ifdef LN
  818. subq $2, KK
  819. #endif
  820. #ifdef LT
  821. addq $2, KK
  822. #endif
  823. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) to the next two-row block of A. */
  824. movq K, %rax
  825. salq $1 + BASE_SHIFT, %rax
  826. addq %rax, AORIG
  827. #endif
  828. decq I
  829. jne .L31
  830. ALIGN_4
  /* 1x1 tile: handles the leftover row when M is odd, for the single-column panel. Runs at most once. */
  831. .L40:
  832. movq M, %rax
  833. andq $1, %rax
  834. je .L49
  835. ALIGN_4
  836. .L41:
  837. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT, i.e. one row of K elements. */
  838. movq K, %rax
  839. salq $0 + BASE_SHIFT, %rax
  840. subq %rax, AORIG
  841. #endif
  842. #if defined(LN) || defined(RT)
  /* LN/RT: start KK steps in; both streams have one element per step. */
  843. movq KK, %rax
  844. salq $BASE_SHIFT, %rax
  845. movq AORIG, AO
  846. leaq (AO, %rax, 1), AO
  847. leaq (B, %rax, 1), BO
  848. #else
  849. movq B, BO
  850. #endif
  /* Single zeroed accumulator in st(0). */
  851. fldz
  /* Trip count: KK for LT/RN, K - KK for LN/RT. */
  852. #if defined(LT) || defined(RN)
  853. movq KK, %rax
  854. #else
  855. movq K, %rax
  856. subq KK, %rax
  857. #endif
  858. sarq $2, %rax
  859. je .L45
  860. ALIGN_4
  /* Unrolled loop: four k-steps per pass, each performing st(0) += a*b (a dot product of the two streams). */
  861. .L42:
  862. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  863. FLD -8 * SIZE(AO)
  864. FLD -8 * SIZE(BO)
  865. fmulp %st, %st(1)
  866. faddp %st, %st(1)
  867. FLD -7 * SIZE(AO)
  868. FLD -7 * SIZE(BO)
  869. fmulp %st, %st(1)
  870. faddp %st, %st(1)
  871. FLD -6 * SIZE(AO)
  872. FLD -6 * SIZE(BO)
  873. fmulp %st, %st(1)
  874. faddp %st, %st(1)
  875. FLD -5 * SIZE(AO)
  876. FLD -5 * SIZE(BO)
  877. fmulp %st, %st(1)
  878. faddp %st, %st(1)
  879. addq $4 * SIZE,AO
  880. addq $4 * SIZE,BO
  881. decq %rax
  882. jne .L42
  883. ALIGN_4
  /* Remainder: low two bits of the trip count. */
  884. .L45:
  885. #if defined(LT) || defined(RN)
  886. movq KK, %rax
  887. #else
  888. movq K, %rax
  889. subq KK, %rax
  890. #endif
  891. and $3, %rax
  892. je .L48
  893. ALIGN_4
  /* One k-step per pass. */
  894. .L46:
  895. FLD -8 * SIZE(AO)
  896. FLD -8 * SIZE(BO)
  897. fmulp %st, %st(1)
  898. faddp %st, %st(1)
  899. addq $1 * SIZE,AO
  900. addq $1 * SIZE,BO
  901. decq %rax
  902. jne .L46
  903. ALIGN_4
  /* Solve step. For LN/RT, re-point AO and BO at position KK - 1 of their streams. */
  904. .L48:
  905. #if defined(LN) || defined(RT)
  906. movq KK, %rax
  /* Both arms of this conditional subtract 1 (the tile is 1x1). */
  907. #ifdef LN
  908. subq $1, %rax
  909. #else
  910. subq $1, %rax
  911. #endif
  912. salq $BASE_SHIFT, %rax
  913. movq AORIG, AO
  914. leaq (AO, %rax, 1), AO
  915. leaq (B, %rax, 1), BO
  916. #endif
  /* Combine the accumulator with the stored value by subtraction: LN/LT read from BO, RN/RT read from AO. */
  917. #if defined(LN) || defined(LT)
  918. FLD -8 * SIZE(BO)
  919. fsubp %st, %st(1)
  920. #else
  921. FLD -8 * SIZE(AO)
  922. fsubp %st, %st(1)
  923. #endif
  /* 1x1 solve: scale by the single coefficient at AO -8 (LN/LT) or BO -8 (RN/RT). */
  /* No division is issued, so that coefficient is presumably stored pre-inverted - confirm against the packing routine. */
  924. #ifdef LN
  925. FLD -8 * SIZE(AO)
  926. fmulp %st, %st(1)
  927. #endif
  928. #ifdef LT
  929. FLD -8 * SIZE(AO)
  930. fmulp %st, %st(1)
  931. #endif
  932. #ifdef RN
  933. FLD -8 * SIZE(BO)
  934. fmulp %st, %st(1)
  935. #endif
  936. #ifdef RT
  937. FLD -8 * SIZE(BO)
  938. fmulp %st, %st(1)
  939. #endif
  940. #ifdef LN
  /* LN walks C backwards: move CO back by one element before storing. */
  941. subq $1 * SIZE, CO
  942. #endif
  /* Store phase: duplicate the result, write one copy to the packed buffer (BO for LN/LT, AO for RN/RT) and the other to C at CO */
  /* (FST is presumably a popping store - confirm in common.h). */
  943. #if defined(LN) || defined(LT)
  944. fld %st
  945. FST -8 * SIZE(BO)
  946. #else
  947. fld %st
  948. FST -8 * SIZE(AO)
  949. #endif
  950. FST 0 * SIZE(CO)
  951. #ifndef LN
  952. addq $1 * SIZE, CO
  953. #endif
  954. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps of both streams. */
  955. movq K, %rax
  956. subq KK, %rax
  957. salq $BASE_SHIFT, %rax
  958. leaq (AO, %rax, 1), AO
  959. leaq (BO, %rax, 1), BO
  960. #endif
  /* KK moves by the tile height (1) for the left-side variants. */
  961. #ifdef LN
  962. subq $1, KK
  963. #endif
  964. #ifdef LT
  965. addq $1, KK
  966. #endif
  967. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT, one row of A. */
  968. movq K, %rax
  969. salq $0 + BASE_SHIFT, %rax
  970. addq %rax, AORIG
  971. #endif
  972. ALIGN_4
  /* End of the single-column panel: final B and KK bookkeeping, then fall through to the common exit. */
  973. .L49:
  974. #ifdef LN
  /* LN: advance B by one column of K elements. */
  975. movq K, %rax
  976. salq $BASE_SHIFT, %rax
  977. leaq (B, %rax, 1), B
  978. #endif
  979. #if defined(LT) || defined(RN)
  /* LT/RN: BO was advanced past this panel's data by the tile code, so it becomes the new B. */
  980. movq BO, B
  981. #endif
  /* KK moves by the panel width (1) for the right-side variants. */
  982. #ifdef RN
  983. addq $1, KK
  984. #endif
  985. #ifdef RT
  986. subq $1, KK
  987. #endif
  988. ALIGN_4
  /* Common exit: restore the registers saved at entry from offsets 0-40, release the STACKSIZE-byte frame and return. */
  /* No return value is set explicitly; %rax holds whatever the last computation left in it. */
  989. .L999:
  990. movq 0(%rsp), %rbx
  991. movq 8(%rsp), %rbp
  992. movq 16(%rsp), %r12
  993. movq 24(%rsp), %r13
  994. movq 32(%rsp), %r14
  995. movq 40(%rsp), %r15
  996. addq $STACKSIZE, %rsp
  997. ret
  998. EPILOGUE