You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qtrsm_kernel_LN_2x2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Symbolic names for the registers and stack slots used by the kernel below.
   * ARG1..ARG6, SIZE, BASE_SHIFT, FLD, FST, PROLOGUE, PROFCODE, EPILOGUE and
   * ALIGN_4 are not defined in this file; presumably they come from common.h.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Problem sizes: M is tiled by I (pairs plus an odd remainder), N by J, and K bounds the inner-product loops. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  /* Base pointers of the two input panels (A, B) and of the output (C). */
  43. #define A ARG4
  44. #define B ARG5
  45. #define C ARG6
  /* Stride between columns of C; loaded from the caller's stack and shifted by BASE_SHIFT in the prologue. */
  46. #define LDC %r10
  /* Loop counters: I counts row pairs, J counts column pairs. */
  47. #define I %r12
  48. #define J %r13
  /* Running pointers into the A panel, the B panel and C for the current tile. */
  49. #define AO %r14
  50. #define BO %r15
  51. #define CO %rbp
  /* Running offset counter that selects how many k-steps each tile accumulates. */
  52. #define KK %r11
  /* Local stack slot holding the base of the current A panel (used by the non-LT/RN variants). */
  53. #define AORIG 48(%rsp)
  /* Bytes reserved by the prologue: slots 0..40 hold saved registers, 48 holds AORIG. */
  54. #define STACKSIZE 64
  /* Arguments read from the caller's side of the frame. ALPHA is defined but not referenced in this file. */
  55. #define ALPHA 8 + STACKSIZE(%rsp)
  56. #define OFFSET 32 + STACKSIZE(%rsp)
  /* Prefetch mnemonic selection. Only PREFETCH is used below; PREFETCHW is defined but unused (the code spells prefetchw literally). */
  57. #ifdef OPTERON
  58. #define PREFETCH prefetch
  59. #define PREFETCHW prefetchw
  60. #else
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #endif
  /* Prefetch distance in elements; multiplied by SIZE at each use. */
  64. #define PREFETCHSIZE (5 + 4 * 10)
  /*
   * Kernel entry: build the frame, save callee-saved registers, fetch the
   * stack-passed arguments and position A/B/C/KK for the selected variant
   * (LN, LT, RN or RT, chosen at build time), then compute the column-pair count.
   */
  65. PROLOGUE
  66. PROFCODE
  /* Reserve STACKSIZE bytes and save rbx, rbp, r12-r15 in slots 0..40 (restored at .L999). */
  67. subq $STACKSIZE, %rsp
  68. movq %rbx, 0(%rsp)
  69. movq %rbp, 8(%rsp)
  70. movq %r12, 16(%rsp)
  71. movq %r13, 24(%rsp)
  72. movq %r14, 32(%rsp)
  73. movq %r15, 40(%rsp)
  /* LDC is passed on the caller's stack, 24 bytes above the pre-adjustment %rsp. */
  74. movq 24 + STACKSIZE(%rsp), LDC
  /* TRMM build without LEFT: KK starts at -OFFSET. */
  75. #if defined(TRMMKERNEL) && !defined(LEFT)
  76. movq OFFSET, %rax
  77. negq %rax
  78. movq %rax, KK
  79. #endif
  /* Bias both panel pointers by 8 elements; all later panel accesses use displacements from -8 * SIZE upward. */
  80. addq $8 * SIZE, A
  81. addq $8 * SIZE, B
  /* Scale LDC by BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT is defined outside this file). */
  82. salq $BASE_SHIFT, LDC
  /* LN: advance C by (M << BASE_SHIFT) and A by (M << BASE_SHIFT) * K; the row loops then walk backwards. */
  83. #ifdef LN
  84. movq M, %rax
  85. salq $BASE_SHIFT, %rax
  86. addq %rax, C
  87. imulq K, %rax
  88. addq %rax, A
  89. #endif
  /* RT: advance B by (N << BASE_SHIFT) * K and C by N * LDC; the column loops then walk backwards. */
  90. #ifdef RT
  91. movq N, %rax
  92. salq $BASE_SHIFT, %rax
  93. imulq K, %rax
  94. addq %rax, B
  95. movq N, %rax
  96. imulq LDC, %rax
  97. addq %rax, C
  98. #endif
  /* RN: KK = -OFFSET. */
  99. #ifdef RN
  100. movq OFFSET, %rax
  101. negq %rax
  102. movq %rax, KK
  103. #endif
  /* RT: KK = N - OFFSET. */
  104. #ifdef RT
  105. movq N, %rax
  106. subq OFFSET, %rax
  107. movq %rax, KK
  108. #endif
  /* J = N / 2 column pairs. The je tests ZF from sarq (movq leaves flags untouched); skip to the odd-column path when J == 0. */
  109. movq N, %rax
  110. sarq $1, %rax
  111. movq %rax, J
  112. je .L30
  /*
   * Top of the column-pair loop (J iterations). Sets the A cursor, steps B and
   * C for the current pair of columns, initialises KK for the left-side
   * variants, and dispatches on whether M is odd.
   */
  113. ALIGN_4
  114. .L01:
  /* LT/RN read A forwards through AO; the other variants keep the panel base in the AORIG stack slot. */
  115. #if defined(LT) || defined(RN)
  116. movq A, AO
  117. #else
  118. movq A, %rax
  119. movq %rax, AORIG
  120. #endif
  /* RT: step B back by 2 * K elements (K << (1 + BASE_SHIFT)). */
  121. #ifdef RT
  122. movq K, %rax
  123. salq $1 + BASE_SHIFT, %rax
  124. subq %rax, B
  125. #endif
  /* rax = 2 * LDC. RT moves C back before taking CO; every other variant takes CO first and then advances C. */
  126. lea (, LDC, 2), %rax
  127. #ifdef RT
  128. subq %rax, C
  129. #endif
  130. movq C, CO
  131. #ifndef RT
  132. addq %rax, C
  133. #endif
  /* LN: KK = OFFSET + M. */
  134. #ifdef LN
  135. movq OFFSET, %rax
  136. addq M, %rax
  137. movq %rax, KK
  138. #endif
  /* LT: KK = OFFSET. */
  139. #ifdef LT
  140. movq OFFSET, %rax
  141. movq %rax, KK
  142. #endif
  /* If M is even there is no single leftover row: go straight to the 2x2 tiles. */
  143. movq M, %rax
  144. andq $1, %rax
  145. je .L20
  /*
   * 1x2 tile: the single leftover row (M odd) against the current two columns.
   * Accumulates two dot products on the x87 stack, combines them with the
   * stored right-hand side, applies the variant-specific solve step, and
   * writes the two results to the packed panel and to C.
   * NOTE(review): FLD/FST are defined outside this file; the stack discipline
   * below only balances if FST is a popping store -- confirm in common.h.
   */
  146. ALIGN_4
  147. .L21:
  /* LN: step the A panel base back by K elements (one row of the panel). */
  148. #ifdef LN
  149. movq K, %rax
  150. salq $0 + BASE_SHIFT, %rax
  151. subq %rax, AORIG
  152. #endif
  /* LN/RT: start at element offset KK: AO = AORIG + KK * 1, BO = B + KK * 2 (scaled by BASE_SHIFT). Otherwise start at B. */
  153. #if defined(LN) || defined(RT)
  154. movq KK, %rax
  155. salq $BASE_SHIFT, %rax
  156. movq AORIG, AO
  157. leaq (AO, %rax, 1), AO
  158. leaq (B, %rax, 2), BO
  159. #else
  160. movq B, BO
  161. #endif
  /* Two zeroed accumulators, one per column of the tile. */
  162. fldz
  163. fldz
  /* Trip count: KK for LT/RN, K - KK otherwise; the main loop is unrolled by 4. */
  164. #if defined(LT) || defined(RN)
  165. movq KK, %rax
  166. #else
  167. movq K, %rax
  168. subq KK, %rax
  169. #endif
  170. sarq $2, %rax
  171. je .L25
  172. ALIGN_4
  /* Unrolled x4: each step loads one A value and two B values and adds a*b0 / a*b1 into the two accumulators. */
  173. .L22:
  174. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  175. FLD -8 * SIZE(AO)
  176. FLD -8 * SIZE(BO)
  177. fmul %st(1), %st
  178. faddp %st, %st(2)
  179. FLD -7 * SIZE(BO)
  180. fmulp %st, %st(1)
  181. faddp %st, %st(2)
  182. FLD -7 * SIZE(AO)
  183. FLD -6 * SIZE(BO)
  184. fmul %st(1), %st
  185. faddp %st, %st(2)
  186. FLD -5 * SIZE(BO)
  187. fmulp %st, %st(1)
  188. faddp %st, %st(2)
  189. FLD -6 * SIZE(AO)
  190. FLD -4 * SIZE(BO)
  191. fmul %st(1), %st
  192. faddp %st, %st(2)
  193. FLD -3 * SIZE(BO)
  194. fmulp %st, %st(1)
  195. faddp %st, %st(2)
  196. FLD -5 * SIZE(AO)
  197. FLD -2 * SIZE(BO)
  198. fmul %st(1), %st
  199. faddp %st, %st(2)
  200. FLD -1 * SIZE(BO)
  201. fmulp %st, %st(1)
  202. faddp %st, %st(2)
  203. addq $4 * SIZE,AO
  204. addq $8 * SIZE,BO
  205. decq %rax
  206. jne .L22
  207. ALIGN_4
  /* Remainder: recompute the trip count and run the leftover (count & 3) single steps. */
  208. .L25:
  209. #if defined(LT) || defined(RN)
  210. movq KK, %rax
  211. #else
  212. movq K, %rax
  213. subq KK, %rax
  214. #endif
  215. and $3, %rax
  216. je .L28
  217. ALIGN_4
  218. .L26:
  219. FLD -8 * SIZE(AO)
  220. FLD -8 * SIZE(BO)
  221. fmul %st(1), %st
  222. faddp %st, %st(2)
  223. FLD -7 * SIZE(BO)
  224. fmulp %st, %st(1)
  225. faddp %st, %st(2)
  226. addq $1 * SIZE,AO
  227. addq $2 * SIZE,BO
  228. decq %rax
  229. jne .L26
  230. ALIGN_4
  /* Solve phase. LN/RT: re-point AO/BO at element offset KK - 1 (LN, tile height) or KK - 2 (RT, tile width). */
  231. .L28:
  232. #if defined(LN) || defined(RT)
  233. movq KK, %rax
  234. #ifdef LN
  235. subq $1, %rax
  236. #else
  237. subq $2, %rax
  238. #endif
  239. salq $BASE_SHIFT, %rax
  240. movq AORIG, AO
  241. leaq (AO, %rax, 1), AO
  242. leaq (B, %rax, 2), BO
  243. #endif
  /*
   * Combine the stored right-hand side (from BO for LN/LT, from AO for RN/RT)
   * with the accumulators. NOTE(review): GAS documents that AT&T
   * "fsubp %st, %st(i)" yields %st - %st(i), i.e. loaded value minus
   * accumulator -- confirm against the assembler manual.
   */
  244. #if defined(LN) || defined(LT)
  245. FLD -8 * SIZE(BO)
  246. fsubp %st, %st(1)
  247. FLD -7 * SIZE(BO)
  248. fsubp %st, %st(2)
  249. #else
  250. FLD -8 * SIZE(AO)
  251. fsubp %st, %st(1)
  252. FLD -7 * SIZE(AO)
  253. fsubp %st, %st(2)
  254. #endif
  /* LN/LT: scale both values by the A entry at -8 * SIZE(AO) (a multiply, so presumably a pre-inverted diagonal -- TODO confirm). */
  255. #if defined(LN) || defined(LT)
  256. FLD -8 * SIZE(AO)
  257. fmul %st, %st(1)
  258. fmulp %st, %st(2)
  259. #endif
  /* RN: uses the B entries at -8, -7, -5 in that order (scale first value, eliminate into second, scale second). */
  260. #ifdef RN
  261. FLD -8 * SIZE(BO)
  262. fmulp %st, %st(1)
  263. FLD -7 * SIZE(BO)
  264. fmul %st(1), %st
  265. fsubrp %st, %st(2)
  266. FLD -5 * SIZE(BO)
  267. fmulp %st, %st(2)
  268. #endif
  /* RT: uses the B entries at -5, -6, -8 in that order (scale second value, eliminate into first, scale first). */
  269. #ifdef RT
  270. FLD -5 * SIZE(BO)
  271. fmulp %st, %st(2)
  272. FLD -6 * SIZE(BO)
  273. fmul %st(2), %st
  274. fsubrp %st, %st(1)
  275. FLD -8 * SIZE(BO)
  276. fmulp %st, %st(1)
  277. #endif
  /* LN walks C backwards: step CO back one element before storing. */
  278. #ifdef LN
  279. subq $1 * SIZE, CO
  280. #endif
  /* Write both results back into the packed panel (BO for LN/LT, AO otherwise); each is duplicated first so a copy stays on the stack. */
  281. #if defined(LN) || defined(LT)
  282. fld %st
  283. FST -8 * SIZE(BO)
  284. fxch %st(1)
  285. fld %st
  286. FST -7 * SIZE(BO)
  287. #else
  288. fld %st
  289. FST -8 * SIZE(AO)
  290. fxch %st(1)
  291. fld %st
  292. FST -7 * SIZE(AO)
  293. #endif
  /* Store the remaining copies to C: second column (CO + LDC) first, then first column. */
  294. FST 0 * SIZE(CO, LDC)
  295. FST 0 * SIZE(CO)
  296. #ifndef LN
  297. addq $1 * SIZE, CO
  298. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels so AO/BO land on the next tile. */
  299. #if defined(LT) || defined(RN)
  300. movq K, %rax
  301. subq KK, %rax
  302. salq $BASE_SHIFT, %rax
  303. leaq (AO, %rax, 1), AO
  304. leaq (BO, %rax, 2), BO
  305. #endif
  /* KK moves by the tile height (1) for the left-side variants. */
  306. #ifdef LN
  307. subq $1, KK
  308. #endif
  309. #ifdef LT
  310. addq $1, KK
  311. #endif
  /* RT: advance the A panel base by K elements. */
  312. #ifdef RT
  313. movq K, %rax
  314. salq $0 + BASE_SHIFT, %rax
  315. addq %rax, AORIG
  316. #endif
  /*
   * 2x2 tiles: loop over I = M / 2 row pairs for the current two columns.
   * Each iteration accumulates four dot products on the x87 stack, combines
   * them with the stored right-hand side, applies the variant-specific 2x2
   * solve, and writes the four results to the packed panel and to C.
   * NOTE(review): FLD/FST are defined outside this file; the stack discipline
   * below only balances if FST is a popping store -- confirm in common.h.
   */
  317. ALIGN_4
  318. .L20:
  /* I = M / 2; the je tests ZF from sarq. No row pairs -> end of this column pair. */
  319. movq M, I
  320. sarq $1, I
  321. je .L29
  322. ALIGN_4
  323. .L11:
  /* LN: step the A panel base back by 2 * K elements (two rows of the panel). */
  324. #ifdef LN
  325. movq K, %rax
  326. salq $1 + BASE_SHIFT, %rax
  327. subq %rax, AORIG
  328. #endif
  /* LN/RT: start at element offset KK: AO = AORIG + KK * 2, BO = B + KK * 2 (scaled by BASE_SHIFT). Otherwise start at B. */
  329. #if defined(LN) || defined(RT)
  330. movq KK, %rax
  331. salq $BASE_SHIFT, %rax
  332. movq AORIG, AO
  333. leaq (AO, %rax, 2), AO
  334. leaq (B, %rax, 2), BO
  335. #else
  336. movq B, BO
  337. #endif
  /* Four zeroed accumulators, one per element of the 2x2 tile. */
  338. fldz
  339. fldz
  340. fldz
  341. fldz
  /* Prefetch the two C columns that will be written. */
  342. #if defined(HAVE_3DNOW)
  343. prefetchw 2 * SIZE(CO)
  344. prefetchw 2 * SIZE(CO, LDC, 1)
  345. #elif defined(HAVE_SSE)
  346. prefetchnta 2 * SIZE(CO)
  347. prefetchnta 2 * SIZE(CO, LDC, 1)
  348. #endif
  /* Trip count: KK for LT/RN, K - KK otherwise; the main loop is unrolled by 4. */
  349. #if defined(LT) || defined(RN)
  350. movq KK, %rax
  351. #else
  352. movq K, %rax
  353. subq KK, %rax
  354. #endif
  355. sarq $2, %rax
  356. je .L15
  357. ALIGN_4
  /* Unrolled x4: each step loads two A values and two B values and adds the four products into the four accumulators. */
  358. .L12:
  359. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  360. FLD -8 * SIZE(AO)
  361. FLD -8 * SIZE(BO)
  362. fld %st(1)
  363. fmul %st(1), %st
  364. faddp %st, %st(3)
  365. FLD -7 * SIZE(BO)
  366. fmul %st, %st(2)
  367. FLD -7 * SIZE(AO)
  368. fmul %st, %st(2)
  369. fmulp %st, %st(1)
  370. faddp %st, %st(6)
  371. faddp %st, %st(4)
  372. faddp %st, %st(2)
  373. FLD -6 * SIZE(AO)
  374. FLD -6 * SIZE(BO)
  375. fld %st(1)
  376. fmul %st(1), %st
  377. faddp %st, %st(3)
  378. FLD -5 * SIZE(BO)
  379. fmul %st, %st(2)
  380. FLD -5 * SIZE(AO)
  381. fmul %st, %st(2)
  382. fmulp %st, %st(1)
  383. faddp %st, %st(6)
  384. faddp %st, %st(4)
  385. faddp %st, %st(2)
  386. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  387. FLD -4 * SIZE(AO)
  388. FLD -4 * SIZE(BO)
  389. fld %st(1)
  390. fmul %st(1), %st
  391. faddp %st, %st(3)
  392. FLD -3 * SIZE(BO)
  393. fmul %st, %st(2)
  394. FLD -3 * SIZE(AO)
  395. fmul %st, %st(2)
  396. fmulp %st, %st(1)
  397. faddp %st, %st(6)
  398. faddp %st, %st(4)
  399. faddp %st, %st(2)
  400. FLD -2 * SIZE(AO)
  401. FLD -2 * SIZE(BO)
  402. fld %st(1)
  403. fmul %st(1), %st
  404. faddp %st, %st(3)
  405. FLD -1 * SIZE(BO)
  406. fmul %st, %st(2)
  407. FLD -1 * SIZE(AO)
  408. fmul %st, %st(2)
  409. fmulp %st, %st(1)
  410. faddp %st, %st(6)
  411. faddp %st, %st(4)
  412. faddp %st, %st(2)
  413. addq $8 * SIZE,AO
  414. addq $8 * SIZE,BO
  415. decq %rax
  416. jne .L12
  417. ALIGN_4
  /* Remainder: recompute the trip count and run the leftover (count & 3) single steps. */
  418. .L15:
  419. #if defined(LT) || defined(RN)
  420. movq KK, %rax
  421. #else
  422. movq K, %rax
  423. subq KK, %rax
  424. #endif
  425. and $3, %rax
  426. je .L18
  427. ALIGN_4
  428. .L16:
  429. FLD -8 * SIZE(AO)
  430. FLD -8 * SIZE(BO)
  431. fld %st(1)
  432. fmul %st(1), %st
  433. faddp %st, %st(3)
  434. FLD -7 * SIZE(BO)
  435. fmul %st, %st(2)
  436. FLD -7 * SIZE(AO)
  437. fmul %st, %st(2)
  438. fmulp %st, %st(1)
  439. faddp %st, %st(6)
  440. faddp %st, %st(4)
  441. faddp %st, %st(2)
  442. addq $2 * SIZE,AO
  443. addq $2 * SIZE,BO
  444. decq %rax
  445. jne .L16
  446. ALIGN_4
  /* Solve phase. LN/RT: re-point AO/BO at element offset KK - 2; both preprocessor branches subtract 2 because the tile is 2 in each dimension. */
  447. .L18:
  448. #if defined(LN) || defined(RT)
  449. movq KK, %rax
  450. #ifdef LN
  451. subq $2, %rax
  452. #else
  453. subq $2, %rax
  454. #endif
  455. salq $BASE_SHIFT, %rax
  456. movq AORIG, AO
  457. leaq (AO, %rax, 2), AO
  458. leaq (B, %rax, 2), BO
  459. #endif
  /*
   * Combine the four stored right-hand-side values (from BO for LN/LT, from AO
   * for RN/RT) with the four accumulators. The two branches pair loads with
   * stack slots in a different order (1,2,3,4 versus 1,3,2,4).
   * NOTE(review): GAS documents that AT&T "fsubp %st, %st(i)" yields
   * %st - %st(i), i.e. loaded value minus accumulator -- confirm.
   */
  460. #if defined(LN) || defined(LT)
  461. FLD -8 * SIZE(BO)
  462. fsubp %st, %st(1)
  463. FLD -7 * SIZE(BO)
  464. fsubp %st, %st(2)
  465. FLD -6 * SIZE(BO)
  466. fsubp %st, %st(3)
  467. FLD -5 * SIZE(BO)
  468. fsubp %st, %st(4)
  469. #else
  470. FLD -8 * SIZE(AO)
  471. fsubp %st, %st(1)
  472. FLD -7 * SIZE(AO)
  473. fsubp %st, %st(3)
  474. FLD -6 * SIZE(AO)
  475. fsubp %st, %st(2)
  476. FLD -5 * SIZE(AO)
  477. fsubp %st, %st(4)
  478. #endif
  /* LN: uses the A entries at -5, -6, -8 in that order (scale, eliminate, scale); multiplies suggest pre-inverted diagonals -- TODO confirm. */
  479. #ifdef LN
  480. FLD -5 * SIZE(AO)
  481. fmul %st, %st(3)
  482. fmulp %st, %st(4)
  483. FLD -6 * SIZE(AO)
  484. fmul %st(3), %st
  485. FLD -6 * SIZE(AO)
  486. fmul %st(5), %st
  487. fsubrp %st, %st(3)
  488. fsubrp %st, %st(1)
  489. FLD -8 * SIZE(AO)
  490. fmul %st, %st(1)
  491. fmulp %st, %st(2)
  492. #endif
  /* LT: uses the A entries at -8, -7, -5 in that order. */
  493. #ifdef LT
  494. FLD -8 * SIZE(AO)
  495. fmul %st, %st(1)
  496. fmulp %st, %st(2)
  497. FLD -7 * SIZE(AO)
  498. fmul %st(1), %st
  499. FLD -7 * SIZE(AO)
  500. fmul %st(3), %st
  501. fsubrp %st, %st(5)
  502. fsubrp %st, %st(3)
  503. FLD -5 * SIZE(AO)
  504. fmul %st, %st(3)
  505. fmulp %st, %st(4)
  506. #endif
  /* RN: uses the B entries at -8, -7, -5 in that order. */
  507. #ifdef RN
  508. FLD -8 * SIZE(BO)
  509. fmul %st, %st(1)
  510. fmulp %st, %st(3)
  511. FLD -7 * SIZE(BO)
  512. fmul %st(1), %st
  513. FLD -7 * SIZE(BO)
  514. fmul %st(4), %st
  515. fsubrp %st, %st(5)
  516. fsubrp %st, %st(2)
  517. FLD -5 * SIZE(BO)
  518. fmul %st, %st(2)
  519. fmulp %st, %st(4)
  520. #endif
  /* RT: uses the B entries at -5, -6, -8 in that order. */
  521. #ifdef RT
  522. FLD -5 * SIZE(BO)
  523. fmul %st, %st(2)
  524. fmulp %st, %st(4)
  525. FLD -6 * SIZE(BO)
  526. fmul %st(2), %st
  527. FLD -6 * SIZE(BO)
  528. fmul %st(5), %st
  529. fsubrp %st, %st(4)
  530. fsubrp %st, %st(1)
  531. FLD -8 * SIZE(BO)
  532. fmul %st, %st(1)
  533. fmulp %st, %st(3)
  534. #endif
  /* LN walks C backwards: step CO back two elements before storing. */
  535. #ifdef LN
  536. subq $2 * SIZE, CO
  537. #endif
  /*
   * Write the four results back into the packed panel (BO for LN/LT, AO
   * otherwise), duplicating each first, then store the remaining copies to the
   * 2x2 block of C. The fxch pattern differs between the branches, so the
   * order of the C stores differs too -- do not reorder.
   */
  538. #if defined(LN) || defined(LT)
  539. fld %st
  540. FST -8 * SIZE(BO)
  541. fxch %st(1)
  542. fld %st
  543. FST -7 * SIZE(BO)
  544. fxch %st(2)
  545. fld %st
  546. FST -6 * SIZE(BO)
  547. fxch %st(3)
  548. fld %st
  549. FST -5 * SIZE(BO)
  550. FST 1 * SIZE(CO, LDC)
  551. FST 0 * SIZE(CO)
  552. FST 0 * SIZE(CO, LDC)
  553. FST 1 * SIZE(CO)
  554. #else
  555. fld %st
  556. FST -8 * SIZE(AO)
  557. fxch %st(2)
  558. fld %st
  559. FST -7 * SIZE(AO)
  560. fxch %st(1)
  561. fld %st
  562. FST -6 * SIZE(AO)
  563. fxch %st(3)
  564. fld %st
  565. FST -5 * SIZE(AO)
  566. FST 1 * SIZE(CO, LDC)
  567. FST 1 * SIZE(CO)
  568. FST 0 * SIZE(CO)
  569. FST 0 * SIZE(CO, LDC)
  570. #endif
  571. #ifndef LN
  572. addq $2 * SIZE, CO
  573. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels so AO/BO land on the next tile. */
  574. #if defined(LT) || defined(RN)
  575. movq K, %rax
  576. subq KK, %rax
  577. salq $BASE_SHIFT, %rax
  578. leaq (AO, %rax, 2), AO
  579. leaq (BO, %rax, 2), BO
  580. #endif
  /* KK moves by the tile height (2) for the left-side variants. */
  581. #ifdef LN
  582. subq $2, KK
  583. #endif
  584. #ifdef LT
  585. addq $2, KK
  586. #endif
  /* RT: advance the A panel base by 2 * K elements. */
  587. #ifdef RT
  588. movq K, %rax
  589. salq $1 + BASE_SHIFT, %rax
  590. addq %rax, AORIG
  591. #endif
  /* Next row pair. */
  592. decq I
  593. jne .L11
  /* End of one column pair: move B to the next panel, adjust KK for the right-side variants, and loop while J != 0. */
  594. ALIGN_4
  595. .L29:
  /* LN: advance B by 2 * K elements. */
  596. #ifdef LN
  597. movq K, %rax
  598. salq $BASE_SHIFT, %rax
  599. leaq (B, %rax, 2), B
  600. #endif
  /* LT/RN: BO already points past the consumed panel, so take it as the new B. */
  601. #if defined(LT) || defined(RN)
  602. movq BO, B
  603. #endif
  /* KK moves by the tile width (2) for the right-side variants. */
  604. #ifdef RN
  605. addq $2, KK
  606. #endif
  607. #ifdef RT
  608. subq $2, KK
  609. #endif
  610. decq J
  611. jne .L01
  /*
   * Odd last column: runs only when N & 1 is set. Same setup as the
   * column-pair loop head, but B and C step by one column instead of two.
   */
  612. ALIGN_4
  613. .L30:
  /* Nothing left to do when N is even. */
  614. movq N, %rax
  615. testq $1, %rax
  616. je .L999
  /* LT/RN read A forwards through AO; the other variants keep the panel base in the AORIG stack slot. */
  617. #if defined(LT) || defined(RN)
  618. movq A, AO
  619. #else
  620. movq A, %rax
  621. movq %rax, AORIG
  622. #endif
  /* RT: step B back by K elements. */
  623. #ifdef RT
  624. movq K, %rax
  625. salq $0 + BASE_SHIFT, %rax
  626. subq %rax, B
  627. #endif
  /* RT moves C back one column before taking CO; every other variant takes CO first and then advances C. */
  628. #ifdef RT
  629. subq LDC, C
  630. #endif
  631. movq C, CO
  632. #ifndef RT
  633. addq LDC, C
  634. #endif
  /* LN: KK = OFFSET + M. */
  635. #ifdef LN
  636. movq OFFSET, %rax
  637. addq M, %rax
  638. movq %rax, KK
  639. #endif
  /* LT: KK = OFFSET. */
  640. #ifdef LT
  641. movq OFFSET, %rax
  642. movq %rax, KK
  643. #endif
  /* If M is even there is no single leftover row: go straight to the 2x1 tiles. */
  644. movq M, %rax
  645. andq $1, %rax
  646. je .L40
  /*
   * 1x1 tile: the single leftover row (M odd) against the single leftover
   * column. One accumulator, one scale factor, one result.
   * NOTE(review): FLD/FST are defined outside this file; the stack discipline
   * below only balances if FST is a popping store -- confirm in common.h.
   */
  647. ALIGN_4
  648. .L41:
  /* LN: step the A panel base back by K elements. */
  649. #ifdef LN
  650. movq K, %rax
  651. salq $0 + BASE_SHIFT, %rax
  652. subq %rax, AORIG
  653. #endif
  /* LN/RT: start at element offset KK in both panels (AO = AORIG + KK, BO = B + KK, scaled by BASE_SHIFT). Otherwise start at B. */
  654. #if defined(LN) || defined(RT)
  655. movq KK, %rax
  656. salq $BASE_SHIFT, %rax
  657. movq AORIG, AO
  658. leaq (AO, %rax, 1), AO
  659. leaq (B, %rax, 1), BO
  660. #else
  661. movq B, BO
  662. #endif
  /* Single zeroed accumulator. */
  663. fldz
  /* Trip count: KK for LT/RN, K - KK otherwise; the main loop is unrolled by 4. */
  664. #if defined(LT) || defined(RN)
  665. movq KK, %rax
  666. #else
  667. movq K, %rax
  668. subq KK, %rax
  669. #endif
  670. sarq $2, %rax
  671. je .L45
  672. ALIGN_4
  /* Unrolled x4: each step adds a * b into the accumulator. */
  673. .L42:
  674. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  675. FLD -8 * SIZE(AO)
  676. FLD -8 * SIZE(BO)
  677. fmulp %st, %st(1)
  678. faddp %st, %st(1)
  679. FLD -7 * SIZE(AO)
  680. FLD -7 * SIZE(BO)
  681. fmulp %st, %st(1)
  682. faddp %st, %st(1)
  683. FLD -6 * SIZE(AO)
  684. FLD -6 * SIZE(BO)
  685. fmulp %st, %st(1)
  686. faddp %st, %st(1)
  687. FLD -5 * SIZE(AO)
  688. FLD -5 * SIZE(BO)
  689. fmulp %st, %st(1)
  690. faddp %st, %st(1)
  691. addq $4 * SIZE,AO
  692. addq $4 * SIZE,BO
  693. decq %rax
  694. jne .L42
  695. ALIGN_4
  /* Remainder: recompute the trip count and run the leftover (count & 3) single steps. */
  696. .L45:
  697. #if defined(LT) || defined(RN)
  698. movq KK, %rax
  699. #else
  700. movq K, %rax
  701. subq KK, %rax
  702. #endif
  703. and $3, %rax
  704. je .L48
  705. ALIGN_4
  706. .L46:
  707. FLD -8 * SIZE(AO)
  708. FLD -8 * SIZE(BO)
  709. fmulp %st, %st(1)
  710. faddp %st, %st(1)
  711. addq $1 * SIZE,AO
  712. addq $1 * SIZE,BO
  713. decq %rax
  714. jne .L46
  715. ALIGN_4
  /* Solve phase. LN/RT: re-point AO/BO at element offset KK - 1; both preprocessor branches subtract 1 because the tile is 1x1. */
  716. .L48:
  717. #if defined(LN) || defined(RT)
  718. movq KK, %rax
  719. #ifdef LN
  720. subq $1, %rax
  721. #else
  722. subq $1, %rax
  723. #endif
  724. salq $BASE_SHIFT, %rax
  725. movq AORIG, AO
  726. leaq (AO, %rax, 1), AO
  727. leaq (B, %rax, 1), BO
  728. #endif
  /*
   * Combine the stored right-hand side (from BO for LN/LT, from AO for RN/RT)
   * with the accumulator. NOTE(review): GAS documents that AT&T
   * "fsubp %st, %st(i)" yields %st - %st(i), i.e. loaded value minus
   * accumulator -- confirm against the assembler manual.
   */
  729. #if defined(LN) || defined(LT)
  730. FLD -8 * SIZE(BO)
  731. fsubp %st, %st(1)
  732. #else
  733. FLD -8 * SIZE(AO)
  734. fsubp %st, %st(1)
  735. #endif
  /* Scale by the single triangular entry: from AO for LN/LT, from BO for RN/RT (a multiply, so presumably pre-inverted -- TODO confirm). */
  736. #ifdef LN
  737. FLD -8 * SIZE(AO)
  738. fmulp %st, %st(1)
  739. #endif
  740. #ifdef LT
  741. FLD -8 * SIZE(AO)
  742. fmulp %st, %st(1)
  743. #endif
  744. #ifdef RN
  745. FLD -8 * SIZE(BO)
  746. fmulp %st, %st(1)
  747. #endif
  748. #ifdef RT
  749. FLD -8 * SIZE(BO)
  750. fmulp %st, %st(1)
  751. #endif
  /* LN walks C backwards: step CO back one element before storing. */
  752. #ifdef LN
  753. subq $1 * SIZE, CO
  754. #endif
  /* Write the result back into the packed panel (BO for LN/LT, AO otherwise), then to C. */
  755. #if defined(LN) || defined(LT)
  756. fld %st
  757. FST -8 * SIZE(BO)
  758. #else
  759. fld %st
  760. FST -8 * SIZE(AO)
  761. #endif
  762. FST 0 * SIZE(CO)
  763. #ifndef LN
  764. addq $1 * SIZE, CO
  765. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels so AO/BO land on the next tile. */
  766. #if defined(LT) || defined(RN)
  767. movq K, %rax
  768. subq KK, %rax
  769. salq $BASE_SHIFT, %rax
  770. leaq (AO, %rax, 1), AO
  771. leaq (BO, %rax, 1), BO
  772. #endif
  /* KK moves by the tile height (1) for the left-side variants. */
  773. #ifdef LN
  774. subq $1, KK
  775. #endif
  776. #ifdef LT
  777. addq $1, KK
  778. #endif
  /* RT: advance the A panel base by K elements. */
  779. #ifdef RT
  780. movq K, %rax
  781. salq $0 + BASE_SHIFT, %rax
  782. addq %rax, AORIG
  783. #endif
  /*
   * 2x1 tiles: loop over I = M / 2 row pairs for the single leftover column.
   * Each iteration accumulates two dot products on the x87 stack, combines
   * them with the stored right-hand side, applies the variant-specific solve,
   * and writes the two results to the packed panel and to C.
   * NOTE(review): FLD/FST are defined outside this file; the stack discipline
   * below only balances if FST is a popping store -- confirm in common.h.
   */
  784. ALIGN_4
  785. .L40:
  /* I = M / 2; the je tests ZF from sarq. No row pairs -> finish up. */
  786. movq M, I
  787. sarq $1, I
  788. je .L49
  789. ALIGN_4
  790. .L31:
  /* LN: step the A panel base back by 2 * K elements. */
  791. #ifdef LN
  792. movq K, %rax
  793. salq $1 + BASE_SHIFT, %rax
  794. subq %rax, AORIG
  795. #endif
  /* LN/RT: start at element offset KK: AO = AORIG + KK * 2, BO = B + KK * 1 (scaled by BASE_SHIFT). Otherwise start at B. */
  796. #if defined(LN) || defined(RT)
  797. movq KK, %rax
  798. salq $BASE_SHIFT, %rax
  799. movq AORIG, AO
  800. leaq (AO, %rax, 2), AO
  801. leaq (B, %rax, 1), BO
  802. #else
  803. movq B, BO
  804. #endif
  /* Two zeroed accumulators, one per row of the tile. */
  805. fldz
  806. fldz
  /* Prefetch the C column that will be written. */
  807. #if defined(HAVE_3DNOW)
  808. prefetchw 2 * SIZE(CO)
  809. #elif defined(HAVE_SSE)
  810. prefetchnta 2 * SIZE(CO)
  811. #endif
  /* Trip count: KK for LT/RN, K - KK otherwise; the main loop is unrolled by 4. */
  812. #if defined(LT) || defined(RN)
  813. movq KK, %rax
  814. #else
  815. movq K, %rax
  816. subq KK, %rax
  817. #endif
  818. sarq $2, %rax
  819. je .L35
  820. ALIGN_4
  /* Unrolled x4: each step loads one B value and two A values and adds b*a0 / b*a1 into the two accumulators. */
  821. .L32:
  822. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  823. FLD -8 * SIZE(BO)
  824. FLD -8 * SIZE(AO)
  825. fmul %st(1), %st
  826. faddp %st, %st(2)
  827. FLD -7 * SIZE(AO)
  828. fmulp %st, %st(1)
  829. faddp %st, %st(2)
  830. FLD -7 * SIZE(BO)
  831. FLD -6 * SIZE(AO)
  832. fmul %st(1), %st
  833. faddp %st, %st(2)
  834. FLD -5 * SIZE(AO)
  835. fmulp %st, %st(1)
  836. faddp %st, %st(2)
  837. FLD -6 * SIZE(BO)
  838. FLD -4 * SIZE(AO)
  839. fmul %st(1), %st
  840. faddp %st, %st(2)
  841. FLD -3 * SIZE(AO)
  842. fmulp %st, %st(1)
  843. faddp %st, %st(2)
  844. FLD -5 * SIZE(BO)
  845. FLD -2 * SIZE(AO)
  846. fmul %st(1), %st
  847. faddp %st, %st(2)
  848. FLD -1 * SIZE(AO)
  849. fmulp %st, %st(1)
  850. faddp %st, %st(2)
  851. addq $8 * SIZE,AO
  852. addq $4 * SIZE,BO
  853. decq %rax
  854. jne .L32
  855. ALIGN_4
  /* Remainder: recompute the trip count and run the leftover (count & 3) single steps. */
  856. .L35:
  857. #if defined(LT) || defined(RN)
  858. movq KK, %rax
  859. #else
  860. movq K, %rax
  861. subq KK, %rax
  862. #endif
  863. and $3, %rax
  864. je .L38
  865. ALIGN_4
  866. .L36:
  867. FLD -8 * SIZE(BO)
  868. FLD -8 * SIZE(AO)
  869. fmul %st(1), %st
  870. faddp %st, %st(2)
  871. FLD -7 * SIZE(AO)
  872. fmulp %st, %st(1)
  873. faddp %st, %st(2)
  874. addq $2 * SIZE,AO
  875. addq $1 * SIZE,BO
  876. decq %rax
  877. jne .L36
  878. ALIGN_4
  /* Solve phase. LN/RT: re-point AO/BO at element offset KK - 2 (LN, tile height) or KK - 1 (RT, tile width). */
  879. .L38:
  880. #if defined(LN) || defined(RT)
  881. movq KK, %rax
  882. #ifdef LN
  883. subq $2, %rax
  884. #else
  885. subq $1, %rax
  886. #endif
  887. salq $BASE_SHIFT, %rax
  888. movq AORIG, AO
  889. leaq (AO, %rax, 2), AO
  890. leaq (B, %rax, 1), BO
  891. #endif
  /*
   * Combine the stored right-hand side (from BO for LN/LT, from AO for RN/RT)
   * with the accumulators. NOTE(review): GAS documents that AT&T
   * "fsubp %st, %st(i)" yields %st - %st(i), i.e. loaded value minus
   * accumulator -- confirm against the assembler manual.
   */
  892. #if defined(LN) || defined(LT)
  893. FLD -8 * SIZE(BO)
  894. fsubp %st, %st(1)
  895. FLD -7 * SIZE(BO)
  896. fsubp %st, %st(2)
  897. #else
  898. FLD -8 * SIZE(AO)
  899. fsubp %st, %st(1)
  900. FLD -7 * SIZE(AO)
  901. fsubp %st, %st(2)
  902. #endif
  /* LN: uses the A entries at -5, -6, -8 in that order (scale second value, eliminate into first, scale first). */
  903. #ifdef LN
  904. FLD -5 * SIZE(AO)
  905. fmulp %st, %st(2)
  906. FLD -6 * SIZE(AO)
  907. fmul %st(2), %st
  908. fsubrp %st, %st(1)
  909. FLD -8 * SIZE(AO)
  910. fmulp %st, %st(1)
  911. #endif
  /* LT: uses the A entries at -8, -7, -5 in that order (scale first value, eliminate into second, scale second). */
  912. #ifdef LT
  913. FLD -8 * SIZE(AO)
  914. fmulp %st, %st(1)
  915. FLD -7 * SIZE(AO)
  916. fmul %st(1), %st
  917. fsubrp %st, %st(2)
  918. FLD -5 * SIZE(AO)
  919. fmulp %st, %st(2)
  920. #endif
  /* RN/RT: one column, so both values are scaled by the single B entry at -8 * SIZE(BO). */
  921. #ifdef RN
  922. FLD -8 * SIZE(BO)
  923. fmul %st, %st(1)
  924. fmulp %st, %st(2)
  925. #endif
  926. #ifdef RT
  927. FLD -8 * SIZE(BO)
  928. fmul %st, %st(1)
  929. fmulp %st, %st(2)
  930. #endif
  /* LN walks C backwards: step CO back two elements before storing. */
  931. #ifdef LN
  932. subq $2 * SIZE, CO
  933. #endif
  /* Write both results back into the packed panel (BO for LN/LT, AO otherwise); each is duplicated first so a copy stays on the stack. */
  934. #if defined(LN) || defined(LT)
  935. fld %st
  936. FST -8 * SIZE(BO)
  937. fxch %st(1)
  938. fld %st
  939. FST -7 * SIZE(BO)
  940. #else
  941. fld %st
  942. FST -8 * SIZE(AO)
  943. fxch %st(1)
  944. fld %st
  945. FST -7 * SIZE(AO)
  946. #endif
  /* Store the remaining copies to C: second row first, then first row. */
  947. FST 1 * SIZE(CO)
  948. FST 0 * SIZE(CO)
  949. #ifndef LN
  950. addq $2 * SIZE, CO
  951. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels so AO/BO land on the next tile. */
  952. #if defined(LT) || defined(RN)
  953. movq K, %rax
  954. subq KK, %rax
  955. salq $BASE_SHIFT, %rax
  956. leaq (AO, %rax, 2), AO
  957. leaq (BO, %rax, 1), BO
  958. #endif
  /* KK moves by the tile height (2) for the left-side variants. */
  959. #ifdef LN
  960. subq $2, KK
  961. #endif
  962. #ifdef LT
  963. addq $2, KK
  964. #endif
  /* RT: advance the A panel base by 2 * K elements. */
  965. #ifdef RT
  966. movq K, %rax
  967. salq $1 + BASE_SHIFT, %rax
  968. addq %rax, AORIG
  969. #endif
  /* Next row pair. */
  970. decq I
  971. jne .L31
  /* End of the odd-column path: final B / KK bookkeeping, mirroring the column-pair loop tail with a step of one column. */
  972. ALIGN_4
  973. .L49:
  /* LN: advance B by K elements. */
  974. #ifdef LN
  975. movq K, %rax
  976. salq $BASE_SHIFT, %rax
  977. leaq (B, %rax, 1), B
  978. #endif
  /* LT/RN: BO already points past the consumed panel, so take it as the new B. */
  979. #if defined(LT) || defined(RN)
  980. movq BO, B
  981. #endif
  /* KK moves by the tile width (1) for the right-side variants. */
  982. #ifdef RN
  983. addq $1, KK
  984. #endif
  985. #ifdef RT
  986. subq $1, KK
  987. #endif
  988. ALIGN_4
  /* Common exit: restore the six registers saved in the prologue, release the STACKSIZE-byte frame and return. */
  989. .L999:
  990. movq 0(%rsp), %rbx
  991. movq 8(%rsp), %rbp
  992. movq 16(%rsp), %r12
  993. movq 24(%rsp), %r13
  994. movq 32(%rsp), %r14
  995. movq 40(%rsp), %r15
  996. addq $STACKSIZE, %rsp
  997. ret
  998. EPILOGUE