You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

qtrsm_kernel_LN_2x2.S 19 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time setup and symbolic names for this kernel. ARG1..ARG6 are not
     defined in this file (presumably provided by common.h). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* M, N and K are used as counts; A, B and C are used as pointers. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  43. #define A ARG4
  44. #define B ARG5
  45. #define C ARG6
  /* LDC is loaded from the stack and scaled by BASE_SHIFT at function entry. */
  46. #define LDC %r10
  /* I counts row pairs (M >> 1); J counts column pairs (N >> 1). */
  47. #define I %r12
  48. #define J %r13
  /* AO, BO and CO are the working pointers derived from A, B and C. */
  49. #define AO %r14
  50. #define BO %r15
  51. #define CO %rbp
  /* KK is a running offset counter initialised from OFFSET (plus M or N in
     some variants). */
  52. #define KK %r11
  /* AORIG is a slot in the local frame that holds the current A position for
     the LN and RT variants. */
  53. #define AORIG 48(%rsp)
  /* Size in bytes of the local frame; offsets 0..40 hold saved registers. */
  54. #define STACKSIZE 64
  /* Stack slots above the local frame. ALPHA is not referenced by the
     visible code. */
  55. #define ALPHA 8 + STACKSIZE(%rsp)
  56. #define OFFSET 32 + STACKSIZE(%rsp)
  /* Prefetch mnemonics per target. PREFETCHW is not used by the visible code. */
  57. #ifdef OPTERON
  58. #define PREFETCH prefetch
  59. #define PREFETCHW prefetchw
  60. #else
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #endif
  /* Prefetch distance, in elements ahead of AO. */
  64. #define PREFETCHSIZE (5 + 4 * 10)
  /* Entry point. PROLOGUE, PROFCODE and EPILOGUE are macros defined outside
     this file. */
  65. PROLOGUE
  66. PROFCODE
  67. #ifdef WINDOWS_ABI
  /* emms marks every x87 register as empty before the x87 pushes below. */
  68. emms
  69. #endif
  /* Reserve the local frame and save rbx, rbp and r12-r15. rbp and r12-r15
     are reused as CO, I, J, AO and BO; rbx is saved although the visible
     code never writes it. */
  70. subq $STACKSIZE, %rsp
  71. movq %rbx, 0(%rsp)
  72. movq %rbp, 8(%rsp)
  73. movq %r12, 16(%rsp)
  74. movq %r13, 24(%rsp)
  75. movq %r14, 32(%rsp)
  76. movq %r15, 40(%rsp)
  /* Load LDC from the stack area above the local frame. */
  77. movq 24 + STACKSIZE(%rsp), LDC
  78. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* KK = -OFFSET. */
  79. movq OFFSET, %rax
  80. negq %rax
  81. movq %rax, KK
  82. #endif
  /* Bias A and B by 8 elements; every access through AO and BO below uses a
     displacement in the range -8 * SIZE .. -1 * SIZE. */
  83. addq $8 * SIZE, A
  84. addq $8 * SIZE, B
  /* Scale LDC by the element size (assumes BASE_SHIFT is log2 of SIZE; both
     are defined outside this file - TODO confirm). */
  85. salq $BASE_SHIFT, LDC
  86. #ifdef LN
  /* LN: move C forward by M elements and A forward by M * K elements; the
     row blocks are then reached by stepping AORIG and CO backwards. */
  87. movq M, %rax
  88. salq $BASE_SHIFT, %rax
  89. addq %rax, C
  90. imulq K, %rax
  91. addq %rax, A
  92. #endif
  93. #ifdef RT
  /* RT: move B forward by N * K elements and C forward by N * LDC; the
     panels are then reached by stepping B and C backwards. */
  94. movq N, %rax
  95. salq $BASE_SHIFT, %rax
  96. imulq K, %rax
  97. addq %rax, B
  98. movq N, %rax
  99. imulq LDC, %rax
  100. addq %rax, C
  101. #endif
  102. #ifdef RN
  /* RN: KK = -OFFSET. */
  103. movq OFFSET, %rax
  104. negq %rax
  105. movq %rax, KK
  106. #endif
  107. #ifdef RT
  /* RT: KK = N - OFFSET. */
  108. movq N, %rax
  109. subq OFFSET, %rax
  110. movq %rax, KK
  111. #endif
  /* J = N >> 1. movq leaves the flags untouched, so je tests the sarq result
     and skips the column-pair loop when it is zero. */
  112. movq N, %rax
  113. sarq $1, %rax
  114. movq %rax, J
  115. je .L30
  116. ALIGN_4
  /* Head of the column-pair loop: one iteration handles two LDC-strided
     columns of C. */
  117. .L01:
  118. #if defined(LT) || defined(RN)
  119. movq A, AO
  120. #else
  /* LN and RT keep the A position in the AORIG stack slot instead. */
  121. movq A, %rax
  122. movq %rax, AORIG
  123. #endif
  124. #ifdef RT
  /* RT: step B back by 2 * K elements. */
  125. movq K, %rax
  126. salq $1 + BASE_SHIFT, %rax
  127. subq %rax, B
  128. #endif
  /* rax = 2 * LDC (LDC was already scaled by the element size at entry). */
  129. lea (, LDC, 2), %rax
  130. #ifdef RT
  /* RT: C moves back before CO is taken from it. */
  131. subq %rax, C
  132. #endif
  /* CO is the working C pointer for this pass. */
  133. movq C, CO
  134. #ifndef RT
  /* Other variants: C moves forward after CO is taken from it. */
  135. addq %rax, C
  136. #endif
  137. #ifdef LN
  /* LN: KK = OFFSET + M. */
  138. movq OFFSET, %rax
  139. addq M, %rax
  140. movq %rax, KK
  141. #endif
  142. #ifdef LT
  /* LT: KK = OFFSET. */
  143. movq OFFSET, %rax
  144. movq %rax, KK
  145. #endif
  /* Skip the single-row block when bit 0 of M is clear. */
  146. movq M, %rax
  147. andq $1, %rax
  148. je .L20
  149. ALIGN_4
  /* Single-row block (bit 0 of M set): one row by two columns, accumulated
     in two x87 registers. FLD is a macro defined outside this file;
     presumably it pushes the addressed element onto the x87 stack. */
  150. .L21:
  151. #ifdef LN
  /* LN: step AORIG back by K elements. */
  152. movq K, %rax
  153. salq $0 + BASE_SHIFT, %rax
  154. subq %rax, AORIG
  155. #endif
  156. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + KK elements, BO = B + 2 * KK elements. */
  157. movq KK, %rax
  158. salq $BASE_SHIFT, %rax
  159. movq AORIG, AO
  160. leaq (AO, %rax, 1), AO
  161. leaq (B, %rax, 2), BO
  162. #else
  163. movq B, BO
  164. #endif
  /* Push two zeroed accumulators. */
  165. fldz
  166. fldz
  /* Step count: KK for LT/RN, K - KK for LN/RT. */
  167. #if defined(LT) || defined(RN)
  168. movq KK, %rax
  169. #else
  170. movq K, %rax
  171. subq KK, %rax
  172. #endif
  /* The main loop below is unrolled four steps per iteration. */
  173. sarq $2, %rax
  174. je .L25
  175. ALIGN_4
  /* Each step reads one element via AO and two via BO and adds the two
     products into the accumulators; the x87 stack is back to two entries
     after every step. */
  176. .L22:
  177. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  178. FLD -8 * SIZE(AO)
  179. FLD -8 * SIZE(BO)
  180. fmul %st(1), %st
  181. faddp %st, %st(2)
  182. FLD -7 * SIZE(BO)
  183. fmulp %st, %st(1)
  184. faddp %st, %st(2)
  185. FLD -7 * SIZE(AO)
  186. FLD -6 * SIZE(BO)
  187. fmul %st(1), %st
  188. faddp %st, %st(2)
  189. FLD -5 * SIZE(BO)
  190. fmulp %st, %st(1)
  191. faddp %st, %st(2)
  192. FLD -6 * SIZE(AO)
  193. FLD -4 * SIZE(BO)
  194. fmul %st(1), %st
  195. faddp %st, %st(2)
  196. FLD -3 * SIZE(BO)
  197. fmulp %st, %st(1)
  198. faddp %st, %st(2)
  199. FLD -5 * SIZE(AO)
  200. FLD -2 * SIZE(BO)
  201. fmul %st(1), %st
  202. faddp %st, %st(2)
  203. FLD -1 * SIZE(BO)
  204. fmulp %st, %st(1)
  205. faddp %st, %st(2)
  /* Four steps consumed: 4 elements via AO, 8 via BO. */
  206. addq $4 * SIZE,AO
  207. addq $8 * SIZE,BO
  208. decq %rax
  209. jne .L22
  210. ALIGN_4
  /* Remainder: the same step count as above, modulo 4. */
  211. .L25:
  212. #if defined(LT) || defined(RN)
  213. movq KK, %rax
  214. #else
  215. movq K, %rax
  216. subq KK, %rax
  217. #endif
  218. and $3, %rax
  219. je .L28
  220. ALIGN_4
  /* One step per iteration. */
  221. .L26:
  222. FLD -8 * SIZE(AO)
  223. FLD -8 * SIZE(BO)
  224. fmul %st(1), %st
  225. faddp %st, %st(2)
  226. FLD -7 * SIZE(BO)
  227. fmulp %st, %st(1)
  228. faddp %st, %st(2)
  229. addq $1 * SIZE,AO
  230. addq $2 * SIZE,BO
  231. decq %rax
  232. jne .L26
  233. ALIGN_4
  /* Solve and store for the 1x2 block. On entry st(0) and st(1) hold the two
     accumulated sums. */
  234. .L28:
  235. #if defined(LN) || defined(RT)
  /* LN/RT: reposition to KK - 1 (LN) or KK - 2 (RT) steps into the panels:
     AO = AORIG + that many elements, BO = B + twice that many. */
  236. movq KK, %rax
  237. #ifdef LN
  238. subq $1, %rax
  239. #else
  240. subq $2, %rax
  241. #endif
  242. salq $BASE_SHIFT, %rax
  243. movq AORIG, AO
  244. leaq (AO, %rax, 1), AO
  245. leaq (B, %rax, 2), BO
  246. #endif
  /* Combine each accumulator with the stored element by subtraction: from
     BO for LN/LT, from AO for RN/RT.
     NOTE(review): GNU as swaps fsub/fsubr for st(i)-destination forms in
     AT&T syntax, so this presumably yields stored value minus accumulator -
     confirm on the disassembly. */
  247. #if defined(LN) || defined(LT)
  248. FLD -8 * SIZE(BO)
  249. fsubp %st, %st(1)
  250. FLD -7 * SIZE(BO)
  251. fsubp %st, %st(2)
  252. #else
  253. FLD -8 * SIZE(AO)
  254. fsubp %st, %st(1)
  255. FLD -7 * SIZE(AO)
  256. fsubp %st, %st(2)
  257. #endif
  258. #if defined(LN) || defined(LT)
  /* LN/LT: multiply both values by the element at -8 * SIZE(AO). Scaling is
     by multiplication, so that element is presumably stored pre-inverted -
     TODO confirm against the packing code. */
  259. FLD -8 * SIZE(AO)
  260. fmul %st, %st(1)
  261. fmulp %st, %st(2)
  262. #endif
  263. #ifdef RN
  /* RN: scale the first value by -8 * SIZE(BO), remove its product with
     -7 * SIZE(BO) from the second value, then scale the second value by
     -5 * SIZE(BO). */
  264. FLD -8 * SIZE(BO)
  265. fmulp %st, %st(1)
  266. FLD -7 * SIZE(BO)
  267. fmul %st(1), %st
  268. fsubrp %st, %st(2)
  269. FLD -5 * SIZE(BO)
  270. fmulp %st, %st(2)
  271. #endif
  272. #ifdef RT
  /* RT: the same elimination in the opposite order: scale the second value
     by -5 * SIZE(BO), remove its product with -6 * SIZE(BO) from the first
     value, then scale the first value by -8 * SIZE(BO). */
  273. FLD -5 * SIZE(BO)
  274. fmulp %st, %st(2)
  275. FLD -6 * SIZE(BO)
  276. fmul %st(2), %st
  277. fsubrp %st, %st(1)
  278. FLD -8 * SIZE(BO)
  279. fmulp %st, %st(1)
  280. #endif
  281. #ifdef LN
  /* LN: CO moves back one element before the store. */
  282. subq $1 * SIZE, CO
  283. #endif
  /* Write both results back to the packed buffer (BO for LN/LT, AO for
     RN/RT). Each value is duplicated with fld first; FST is defined outside
     this file and presumably stores and pops, which is what keeps the
     register stack balanced here. */
  284. #if defined(LN) || defined(LT)
  285. fld %st
  286. FST -8 * SIZE(BO)
  287. fxch %st(1)
  288. fld %st
  289. FST -7 * SIZE(BO)
  290. #else
  291. fld %st
  292. FST -8 * SIZE(AO)
  293. fxch %st(1)
  294. fld %st
  295. FST -7 * SIZE(AO)
  296. #endif
  /* Store the two remaining stack entries to C: the second result at
     CO + LDC, then the first at CO. */
  297. FST 0 * SIZE(CO, LDC)
  298. FST 0 * SIZE(CO)
  299. #ifndef LN
  300. addq $1 * SIZE, CO
  301. #endif
  302. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by K - KK elements and BO by twice that. */
  303. movq K, %rax
  304. subq KK, %rax
  305. salq $BASE_SHIFT, %rax
  306. leaq (AO, %rax, 1), AO
  307. leaq (BO, %rax, 2), BO
  308. #endif
  /* Per-variant bookkeeping for the one row just processed. */
  309. #ifdef LN
  310. subq $1, KK
  311. #endif
  312. #ifdef LT
  313. addq $1, KK
  314. #endif
  315. #ifdef RT
  /* RT: AORIG moves forward by K elements. */
  316. movq K, %rax
  317. salq $0 + BASE_SHIFT, %rax
  318. addq %rax, AORIG
  319. #endif
  320. ALIGN_4
  /* Row-pair loop for the current column pair: I = M >> 1 iterations, each
     accumulating a 2x2 tile in four x87 registers. FLD is a macro defined
     outside this file; presumably it pushes the addressed element. */
  321. .L20:
  322. movq M, I
  323. sarq $1, I
  324. je .L29
  325. ALIGN_4
  326. .L11:
  327. #ifdef LN
  /* LN: step AORIG back by 2 * K elements. */
  328. movq K, %rax
  329. salq $1 + BASE_SHIFT, %rax
  330. subq %rax, AORIG
  331. #endif
  332. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + 2 * KK elements, BO = B + 2 * KK elements. */
  333. movq KK, %rax
  334. salq $BASE_SHIFT, %rax
  335. movq AORIG, AO
  336. leaq (AO, %rax, 2), AO
  337. leaq (B, %rax, 2), BO
  338. #else
  339. movq B, BO
  340. #endif
  /* Push four zeroed accumulators. */
  341. fldz
  342. fldz
  343. fldz
  344. fldz
  /* Prefetch the two C locations of this tile (CO and CO + LDC) when
     HAVE_3DNOW or HAVE_SSE is defined. */
  345. #if defined(HAVE_3DNOW)
  346. prefetchw 2 * SIZE(CO)
  347. prefetchw 2 * SIZE(CO, LDC, 1)
  348. #elif defined(HAVE_SSE)
  349. prefetchnta 2 * SIZE(CO)
  350. prefetchnta 2 * SIZE(CO, LDC, 1)
  351. #endif
  /* Step count: KK for LT/RN, K - KK for LN/RT. */
  352. #if defined(LT) || defined(RN)
  353. movq KK, %rax
  354. #else
  355. movq K, %rax
  356. subq KK, %rax
  357. #endif
  /* The main loop below is unrolled four steps per iteration. */
  358. sarq $2, %rax
  359. je .L15
  360. ALIGN_4
  /* Each step reads two elements via AO and two via BO and adds the four
     products into the accumulators. With a0/a1 from AO and b0/b1 from BO,
     st(0)..st(3) accumulate a0*b0, a0*b1, a1*b0 and a1*b1. All eight x87
     registers are in use at the peak of a step, so no further push fits. */
  361. .L12:
  362. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  363. FLD -8 * SIZE(AO)
  364. FLD -8 * SIZE(BO)
  365. fld %st(1)
  366. fmul %st(1), %st
  367. faddp %st, %st(3)
  368. FLD -7 * SIZE(BO)
  369. fmul %st, %st(2)
  370. FLD -7 * SIZE(AO)
  371. fmul %st, %st(2)
  372. fmulp %st, %st(1)
  373. faddp %st, %st(6)
  374. faddp %st, %st(4)
  375. faddp %st, %st(2)
  376. FLD -6 * SIZE(AO)
  377. FLD -6 * SIZE(BO)
  378. fld %st(1)
  379. fmul %st(1), %st
  380. faddp %st, %st(3)
  381. FLD -5 * SIZE(BO)
  382. fmul %st, %st(2)
  383. FLD -5 * SIZE(AO)
  384. fmul %st, %st(2)
  385. fmulp %st, %st(1)
  386. faddp %st, %st(6)
  387. faddp %st, %st(4)
  388. faddp %st, %st(2)
  389. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  390. FLD -4 * SIZE(AO)
  391. FLD -4 * SIZE(BO)
  392. fld %st(1)
  393. fmul %st(1), %st
  394. faddp %st, %st(3)
  395. FLD -3 * SIZE(BO)
  396. fmul %st, %st(2)
  397. FLD -3 * SIZE(AO)
  398. fmul %st, %st(2)
  399. fmulp %st, %st(1)
  400. faddp %st, %st(6)
  401. faddp %st, %st(4)
  402. faddp %st, %st(2)
  403. FLD -2 * SIZE(AO)
  404. FLD -2 * SIZE(BO)
  405. fld %st(1)
  406. fmul %st(1), %st
  407. faddp %st, %st(3)
  408. FLD -1 * SIZE(BO)
  409. fmul %st, %st(2)
  410. FLD -1 * SIZE(AO)
  411. fmul %st, %st(2)
  412. fmulp %st, %st(1)
  413. faddp %st, %st(6)
  414. faddp %st, %st(4)
  415. faddp %st, %st(2)
  /* Four steps consumed: 8 elements via AO, 8 via BO. */
  416. addq $8 * SIZE,AO
  417. addq $8 * SIZE,BO
  418. decq %rax
  419. jne .L12
  420. ALIGN_4
  /* Remainder: the same step count as above, modulo 4. */
  421. .L15:
  422. #if defined(LT) || defined(RN)
  423. movq KK, %rax
  424. #else
  425. movq K, %rax
  426. subq KK, %rax
  427. #endif
  428. and $3, %rax
  429. je .L18
  430. ALIGN_4
  /* One step per iteration. */
  431. .L16:
  432. FLD -8 * SIZE(AO)
  433. FLD -8 * SIZE(BO)
  434. fld %st(1)
  435. fmul %st(1), %st
  436. faddp %st, %st(3)
  437. FLD -7 * SIZE(BO)
  438. fmul %st, %st(2)
  439. FLD -7 * SIZE(AO)
  440. fmul %st, %st(2)
  441. fmulp %st, %st(1)
  442. faddp %st, %st(6)
  443. faddp %st, %st(4)
  444. faddp %st, %st(2)
  445. addq $2 * SIZE,AO
  446. addq $2 * SIZE,BO
  447. decq %rax
  448. jne .L16
  449. ALIGN_4
  /* Solve and store for the 2x2 tile. On entry st(0)..st(3) hold the four
     accumulated sums; with a0/a1 read via AO and b0/b1 via BO they are the
     sums of a0*b0, a0*b1, a1*b0 and a1*b1. */
  450. .L18:
  451. #if defined(LN) || defined(RT)
  /* LN/RT: reposition to KK - 2 steps into the panels (both preprocessor
     branches subtract 2): AO = AORIG + 2 * (KK - 2) elements,
     BO = B + 2 * (KK - 2) elements. */
  452. movq KK, %rax
  453. #ifdef LN
  454. subq $2, %rax
  455. #else
  456. subq $2, %rax
  457. #endif
  458. salq $BASE_SHIFT, %rax
  459. movq AORIG, AO
  460. leaq (AO, %rax, 2), AO
  461. leaq (B, %rax, 2), BO
  462. #endif
  /* Combine each accumulator with the stored element by subtraction. For
     LN/LT the four elements at BO map to st(0)..st(3) in order; for RN/RT
     the elements at -7 and -6 * SIZE(AO) go to st(2) and st(1), i.e. the
     middle two are swapped.
     NOTE(review): GNU as swaps fsub/fsubr for st(i)-destination forms in
     AT&T syntax, so this presumably yields stored value minus accumulator -
     confirm on the disassembly. */
  463. #if defined(LN) || defined(LT)
  464. FLD -8 * SIZE(BO)
  465. fsubp %st, %st(1)
  466. FLD -7 * SIZE(BO)
  467. fsubp %st, %st(2)
  468. FLD -6 * SIZE(BO)
  469. fsubp %st, %st(3)
  470. FLD -5 * SIZE(BO)
  471. fsubp %st, %st(4)
  472. #else
  473. FLD -8 * SIZE(AO)
  474. fsubp %st, %st(1)
  475. FLD -7 * SIZE(AO)
  476. fsubp %st, %st(3)
  477. FLD -6 * SIZE(AO)
  478. fsubp %st, %st(2)
  479. FLD -5 * SIZE(AO)
  480. fsubp %st, %st(4)
  481. #endif
  482. #ifdef LN
  /* LN: scale st(2), st(3) by -5 * SIZE(AO); remove their products with
     -6 * SIZE(AO) from st(0), st(1); scale st(0), st(1) by -8 * SIZE(AO).
     Scaling is by multiplication, so the -5 and -8 entries are presumably
     stored pre-inverted - TODO confirm against the packing code. */
  483. FLD -5 * SIZE(AO)
  484. fmul %st, %st(3)
  485. fmulp %st, %st(4)
  486. FLD -6 * SIZE(AO)
  487. fmul %st(3), %st
  488. FLD -6 * SIZE(AO)
  489. fmul %st(5), %st
  490. fsubrp %st, %st(3)
  491. fsubrp %st, %st(1)
  492. FLD -8 * SIZE(AO)
  493. fmul %st, %st(1)
  494. fmulp %st, %st(2)
  495. #endif
  496. #ifdef LT
  /* LT: scale st(0), st(1) by -8 * SIZE(AO); remove their products with
     -7 * SIZE(AO) from st(2), st(3); scale st(2), st(3) by -5 * SIZE(AO). */
  497. FLD -8 * SIZE(AO)
  498. fmul %st, %st(1)
  499. fmulp %st, %st(2)
  500. FLD -7 * SIZE(AO)
  501. fmul %st(1), %st
  502. FLD -7 * SIZE(AO)
  503. fmul %st(3), %st
  504. fsubrp %st, %st(5)
  505. fsubrp %st, %st(3)
  506. FLD -5 * SIZE(AO)
  507. fmul %st, %st(3)
  508. fmulp %st, %st(4)
  509. #endif
  510. #ifdef RN
  /* RN: scale st(0), st(2) by -8 * SIZE(BO); remove their products with
     -7 * SIZE(BO) from st(1), st(3); scale st(1), st(3) by -5 * SIZE(BO). */
  511. FLD -8 * SIZE(BO)
  512. fmul %st, %st(1)
  513. fmulp %st, %st(3)
  514. FLD -7 * SIZE(BO)
  515. fmul %st(1), %st
  516. FLD -7 * SIZE(BO)
  517. fmul %st(4), %st
  518. fsubrp %st, %st(5)
  519. fsubrp %st, %st(2)
  520. FLD -5 * SIZE(BO)
  521. fmul %st, %st(2)
  522. fmulp %st, %st(4)
  523. #endif
  524. #ifdef RT
  /* RT: scale st(1), st(3) by -5 * SIZE(BO); remove their products with
     -6 * SIZE(BO) from st(0), st(2); scale st(0), st(2) by -8 * SIZE(BO). */
  525. FLD -5 * SIZE(BO)
  526. fmul %st, %st(2)
  527. fmulp %st, %st(4)
  528. FLD -6 * SIZE(BO)
  529. fmul %st(2), %st
  530. FLD -6 * SIZE(BO)
  531. fmul %st(5), %st
  532. fsubrp %st, %st(4)
  533. fsubrp %st, %st(1)
  534. FLD -8 * SIZE(BO)
  535. fmul %st, %st(1)
  536. fmulp %st, %st(3)
  537. #endif
  538. #ifdef LN
  /* LN: CO moves back two elements before the store. */
  539. subq $2 * SIZE, CO
  540. #endif
  /* Write the four results back to the packed buffer (BO for LN/LT, AO for
     RN/RT) and to C. Each value is duplicated with fld before the buffer
     store; FST is defined outside this file and presumably stores and pops.
     The fxch sequence rotates the stack, so the order of the final four C
     stores differs between the two branches, but each result lands on the
     same C element either way. */
  541. #if defined(LN) || defined(LT)
  542. fld %st
  543. FST -8 * SIZE(BO)
  544. fxch %st(1)
  545. fld %st
  546. FST -7 * SIZE(BO)
  547. fxch %st(2)
  548. fld %st
  549. FST -6 * SIZE(BO)
  550. fxch %st(3)
  551. fld %st
  552. FST -5 * SIZE(BO)
  553. FST 1 * SIZE(CO, LDC)
  554. FST 0 * SIZE(CO)
  555. FST 0 * SIZE(CO, LDC)
  556. FST 1 * SIZE(CO)
  557. #else
  558. fld %st
  559. FST -8 * SIZE(AO)
  560. fxch %st(2)
  561. fld %st
  562. FST -7 * SIZE(AO)
  563. fxch %st(1)
  564. fld %st
  565. FST -6 * SIZE(AO)
  566. fxch %st(3)
  567. fld %st
  568. FST -5 * SIZE(AO)
  569. FST 1 * SIZE(CO, LDC)
  570. FST 1 * SIZE(CO)
  571. FST 0 * SIZE(CO)
  572. FST 0 * SIZE(CO, LDC)
  573. #endif
  574. #ifndef LN
  575. addq $2 * SIZE, CO
  576. #endif
  577. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO and BO by 2 * (K - KK) elements each. */
  578. movq K, %rax
  579. subq KK, %rax
  580. salq $BASE_SHIFT, %rax
  581. leaq (AO, %rax, 2), AO
  582. leaq (BO, %rax, 2), BO
  583. #endif
  /* Per-variant bookkeeping for the two rows just processed. */
  584. #ifdef LN
  585. subq $2, KK
  586. #endif
  587. #ifdef LT
  588. addq $2, KK
  589. #endif
  590. #ifdef RT
  /* RT: AORIG moves forward by 2 * K elements. */
  591. movq K, %rax
  592. salq $1 + BASE_SHIFT, %rax
  593. addq %rax, AORIG
  594. #endif
  /* Next row pair. */
  595. decq I
  596. jne .L11
  597. ALIGN_4
  /* Tail of the column-pair loop: update B and KK for the next pair. */
  598. .L29:
  599. #ifdef LN
  /* LN: B moves forward by 2 * K elements. */
  600. movq K, %rax
  601. salq $BASE_SHIFT, %rax
  602. leaq (B, %rax, 2), B
  603. #endif
  604. #if defined(LT) || defined(RN)
  /* LT/RN: BO has already been advanced past the panel, so B takes its value. */
  605. movq BO, B
  606. #endif
  607. #ifdef RN
  608. addq $2, KK
  609. #endif
  610. #ifdef RT
  611. subq $2, KK
  612. #endif
  /* Next column pair. */
  613. decq J
  614. jne .L01
  615. ALIGN_4
  /* Remaining single column: entered only when bit 0 of N is set, otherwise
     control goes straight to the epilogue at .L999. */
  616. .L30:
  617. movq N, %rax
  618. testq $1, %rax
  619. je .L999
  620. #if defined(LT) || defined(RN)
  621. movq A, AO
  622. #else
  /* LN and RT keep the A position in the AORIG stack slot instead. */
  623. movq A, %rax
  624. movq %rax, AORIG
  625. #endif
  626. #ifdef RT
  /* RT: step B back by K elements. */
  627. movq K, %rax
  628. salq $0 + BASE_SHIFT, %rax
  629. subq %rax, B
  630. #endif
  631. #ifdef RT
  /* RT: C moves back by LDC before CO is taken from it. */
  632. subq LDC, C
  633. #endif
  /* CO is the working C pointer for this column. */
  634. movq C, CO
  635. #ifndef RT
  636. addq LDC, C
  637. #endif
  638. #ifdef LN
  /* LN: KK = OFFSET + M. */
  639. movq OFFSET, %rax
  640. addq M, %rax
  641. movq %rax, KK
  642. #endif
  643. #ifdef LT
  /* LT: KK = OFFSET. */
  644. movq OFFSET, %rax
  645. movq %rax, KK
  646. #endif
  /* Skip the single-row block when bit 0 of M is clear. */
  647. movq M, %rax
  648. andq $1, %rax
  649. je .L40
  650. ALIGN_4
  /* Single row by single column: one x87 accumulator. FLD is a macro defined
     outside this file; presumably it pushes the addressed element. */
  651. .L41:
  652. #ifdef LN
  /* LN: step AORIG back by K elements. */
  653. movq K, %rax
  654. salq $0 + BASE_SHIFT, %rax
  655. subq %rax, AORIG
  656. #endif
  657. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + KK elements, BO = B + KK elements. */
  658. movq KK, %rax
  659. salq $BASE_SHIFT, %rax
  660. movq AORIG, AO
  661. leaq (AO, %rax, 1), AO
  662. leaq (B, %rax, 1), BO
  663. #else
  664. movq B, BO
  665. #endif
  /* Push one zeroed accumulator. */
  666. fldz
  /* Step count: KK for LT/RN, K - KK for LN/RT. */
  667. #if defined(LT) || defined(RN)
  668. movq KK, %rax
  669. #else
  670. movq K, %rax
  671. subq KK, %rax
  672. #endif
  /* The main loop below is unrolled four steps per iteration. */
  673. sarq $2, %rax
  674. je .L45
  675. ALIGN_4
  /* Each step multiplies one element from AO by one from BO and adds the
     product to the accumulator. */
  676. .L42:
  677. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  678. FLD -8 * SIZE(AO)
  679. FLD -8 * SIZE(BO)
  680. fmulp %st, %st(1)
  681. faddp %st, %st(1)
  682. FLD -7 * SIZE(AO)
  683. FLD -7 * SIZE(BO)
  684. fmulp %st, %st(1)
  685. faddp %st, %st(1)
  686. FLD -6 * SIZE(AO)
  687. FLD -6 * SIZE(BO)
  688. fmulp %st, %st(1)
  689. faddp %st, %st(1)
  690. FLD -5 * SIZE(AO)
  691. FLD -5 * SIZE(BO)
  692. fmulp %st, %st(1)
  693. faddp %st, %st(1)
  /* Four steps consumed: 4 elements via AO, 4 via BO. */
  694. addq $4 * SIZE,AO
  695. addq $4 * SIZE,BO
  696. decq %rax
  697. jne .L42
  698. ALIGN_4
  /* Remainder: the same step count as above, modulo 4. */
  699. .L45:
  700. #if defined(LT) || defined(RN)
  701. movq KK, %rax
  702. #else
  703. movq K, %rax
  704. subq KK, %rax
  705. #endif
  706. and $3, %rax
  707. je .L48
  708. ALIGN_4
  /* One step per iteration. */
  709. .L46:
  710. FLD -8 * SIZE(AO)
  711. FLD -8 * SIZE(BO)
  712. fmulp %st, %st(1)
  713. faddp %st, %st(1)
  714. addq $1 * SIZE,AO
  715. addq $1 * SIZE,BO
  716. decq %rax
  717. jne .L46
  718. ALIGN_4
  /* Solve and store for the 1x1 block. On entry st(0) holds the accumulated
     sum. */
  719. .L48:
  720. #if defined(LN) || defined(RT)
  /* LN/RT: reposition to KK - 1 steps into the panels (both preprocessor
     branches subtract 1). */
  721. movq KK, %rax
  722. #ifdef LN
  723. subq $1, %rax
  724. #else
  725. subq $1, %rax
  726. #endif
  727. salq $BASE_SHIFT, %rax
  728. movq AORIG, AO
  729. leaq (AO, %rax, 1), AO
  730. leaq (B, %rax, 1), BO
  731. #endif
  /* Combine the accumulator with the stored element by subtraction: from BO
     for LN/LT, from AO for RN/RT.
     NOTE(review): GNU as swaps fsub/fsubr for st(i)-destination forms in
     AT&T syntax, so this presumably yields stored value minus accumulator -
     confirm on the disassembly. */
  732. #if defined(LN) || defined(LT)
  733. FLD -8 * SIZE(BO)
  734. fsubp %st, %st(1)
  735. #else
  736. FLD -8 * SIZE(AO)
  737. fsubp %st, %st(1)
  738. #endif
  /* Scale by one element: from AO for LN/LT, from BO for RN/RT. Scaling is
     by multiplication, so that element is presumably stored pre-inverted -
     TODO confirm against the packing code. */
  739. #ifdef LN
  740. FLD -8 * SIZE(AO)
  741. fmulp %st, %st(1)
  742. #endif
  743. #ifdef LT
  744. FLD -8 * SIZE(AO)
  745. fmulp %st, %st(1)
  746. #endif
  747. #ifdef RN
  748. FLD -8 * SIZE(BO)
  749. fmulp %st, %st(1)
  750. #endif
  751. #ifdef RT
  752. FLD -8 * SIZE(BO)
  753. fmulp %st, %st(1)
  754. #endif
  755. #ifdef LN
  /* LN: CO moves back one element before the store. */
  756. subq $1 * SIZE, CO
  757. #endif
  /* Write the result back to the packed buffer (duplicated with fld first)
     and then to C. FST is defined outside this file and presumably stores
     and pops. */
  758. #if defined(LN) || defined(LT)
  759. fld %st
  760. FST -8 * SIZE(BO)
  761. #else
  762. fld %st
  763. FST -8 * SIZE(AO)
  764. #endif
  765. FST 0 * SIZE(CO)
  766. #ifndef LN
  767. addq $1 * SIZE, CO
  768. #endif
  769. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO and BO by K - KK elements each. */
  770. movq K, %rax
  771. subq KK, %rax
  772. salq $BASE_SHIFT, %rax
  773. leaq (AO, %rax, 1), AO
  774. leaq (BO, %rax, 1), BO
  775. #endif
  /* Per-variant bookkeeping for the one row just processed. */
  776. #ifdef LN
  777. subq $1, KK
  778. #endif
  779. #ifdef LT
  780. addq $1, KK
  781. #endif
  782. #ifdef RT
  /* RT: AORIG moves forward by K elements. */
  783. movq K, %rax
  784. salq $0 + BASE_SHIFT, %rax
  785. addq %rax, AORIG
  786. #endif
  787. ALIGN_4
  /* Row-pair loop for the single remaining column: I = M >> 1 iterations,
     each accumulating a 2x1 tile in two x87 registers. FLD is a macro
     defined outside this file; presumably it pushes the addressed element. */
  788. .L40:
  789. movq M, I
  790. sarq $1, I
  791. je .L49
  792. ALIGN_4
  793. .L31:
  794. #ifdef LN
  /* LN: step AORIG back by 2 * K elements. */
  795. movq K, %rax
  796. salq $1 + BASE_SHIFT, %rax
  797. subq %rax, AORIG
  798. #endif
  799. #if defined(LN) || defined(RT)
  /* LN/RT: AO = AORIG + 2 * KK elements, BO = B + KK elements. */
  800. movq KK, %rax
  801. salq $BASE_SHIFT, %rax
  802. movq AORIG, AO
  803. leaq (AO, %rax, 2), AO
  804. leaq (B, %rax, 1), BO
  805. #else
  806. movq B, BO
  807. #endif
  /* Push two zeroed accumulators. */
  808. fldz
  809. fldz
  /* Prefetch the C location of this tile when HAVE_3DNOW or HAVE_SSE is
     defined. */
  810. #if defined(HAVE_3DNOW)
  811. prefetchw 2 * SIZE(CO)
  812. #elif defined(HAVE_SSE)
  813. prefetchnta 2 * SIZE(CO)
  814. #endif
  /* Step count: KK for LT/RN, K - KK for LN/RT. */
  815. #if defined(LT) || defined(RN)
  816. movq KK, %rax
  817. #else
  818. movq K, %rax
  819. subq KK, %rax
  820. #endif
  /* The main loop below is unrolled four steps per iteration. */
  821. sarq $2, %rax
  822. je .L35
  823. ALIGN_4
  /* Each step reads one element via BO and two via AO and adds the two
     products into the accumulators; the x87 stack is back to two entries
     after every step. */
  824. .L32:
  825. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  826. FLD -8 * SIZE(BO)
  827. FLD -8 * SIZE(AO)
  828. fmul %st(1), %st
  829. faddp %st, %st(2)
  830. FLD -7 * SIZE(AO)
  831. fmulp %st, %st(1)
  832. faddp %st, %st(2)
  833. FLD -7 * SIZE(BO)
  834. FLD -6 * SIZE(AO)
  835. fmul %st(1), %st
  836. faddp %st, %st(2)
  837. FLD -5 * SIZE(AO)
  838. fmulp %st, %st(1)
  839. faddp %st, %st(2)
  840. FLD -6 * SIZE(BO)
  841. FLD -4 * SIZE(AO)
  842. fmul %st(1), %st
  843. faddp %st, %st(2)
  844. FLD -3 * SIZE(AO)
  845. fmulp %st, %st(1)
  846. faddp %st, %st(2)
  847. FLD -5 * SIZE(BO)
  848. FLD -2 * SIZE(AO)
  849. fmul %st(1), %st
  850. faddp %st, %st(2)
  851. FLD -1 * SIZE(AO)
  852. fmulp %st, %st(1)
  853. faddp %st, %st(2)
  /* Four steps consumed: 8 elements via AO, 4 via BO. */
  854. addq $8 * SIZE,AO
  855. addq $4 * SIZE,BO
  856. decq %rax
  857. jne .L32
  858. ALIGN_4
  /* Remainder: the same step count as above, modulo 4. */
  859. .L35:
  860. #if defined(LT) || defined(RN)
  861. movq KK, %rax
  862. #else
  863. movq K, %rax
  864. subq KK, %rax
  865. #endif
  866. and $3, %rax
  867. je .L38
  868. ALIGN_4
  /* One step per iteration. */
  869. .L36:
  870. FLD -8 * SIZE(BO)
  871. FLD -8 * SIZE(AO)
  872. fmul %st(1), %st
  873. faddp %st, %st(2)
  874. FLD -7 * SIZE(AO)
  875. fmulp %st, %st(1)
  876. faddp %st, %st(2)
  877. addq $2 * SIZE,AO
  878. addq $1 * SIZE,BO
  879. decq %rax
  880. jne .L36
  881. ALIGN_4
  /* Solve and store for the 2x1 tile. On entry st(0) and st(1) hold the two
     accumulated sums. */
  882. .L38:
  883. #if defined(LN) || defined(RT)
  /* LN/RT: reposition to KK - 2 (LN) or KK - 1 (RT) steps into the panels:
     AO = AORIG + twice that many elements, BO = B + that many elements. */
  884. movq KK, %rax
  885. #ifdef LN
  886. subq $2, %rax
  887. #else
  888. subq $1, %rax
  889. #endif
  890. salq $BASE_SHIFT, %rax
  891. movq AORIG, AO
  892. leaq (AO, %rax, 2), AO
  893. leaq (B, %rax, 1), BO
  894. #endif
  /* Combine each accumulator with the stored element by subtraction: from
     BO for LN/LT, from AO for RN/RT.
     NOTE(review): GNU as swaps fsub/fsubr for st(i)-destination forms in
     AT&T syntax, so this presumably yields stored value minus accumulator -
     confirm on the disassembly. */
  895. #if defined(LN) || defined(LT)
  896. FLD -8 * SIZE(BO)
  897. fsubp %st, %st(1)
  898. FLD -7 * SIZE(BO)
  899. fsubp %st, %st(2)
  900. #else
  901. FLD -8 * SIZE(AO)
  902. fsubp %st, %st(1)
  903. FLD -7 * SIZE(AO)
  904. fsubp %st, %st(2)
  905. #endif
  906. #ifdef LN
  /* LN: scale the second value by -5 * SIZE(AO), remove its product with
     -6 * SIZE(AO) from the first value, then scale the first value by
     -8 * SIZE(AO). Scaling is by multiplication, so the -5 and -8 entries
     are presumably stored pre-inverted - TODO confirm. */
  907. FLD -5 * SIZE(AO)
  908. fmulp %st, %st(2)
  909. FLD -6 * SIZE(AO)
  910. fmul %st(2), %st
  911. fsubrp %st, %st(1)
  912. FLD -8 * SIZE(AO)
  913. fmulp %st, %st(1)
  914. #endif
  915. #ifdef LT
  /* LT: scale the first value by -8 * SIZE(AO), remove its product with
     -7 * SIZE(AO) from the second value, then scale the second value by
     -5 * SIZE(AO). */
  916. FLD -8 * SIZE(AO)
  917. fmulp %st, %st(1)
  918. FLD -7 * SIZE(AO)
  919. fmul %st(1), %st
  920. fsubrp %st, %st(2)
  921. FLD -5 * SIZE(AO)
  922. fmulp %st, %st(2)
  923. #endif
  /* RN and RT: both values are scaled by the single element at
     -8 * SIZE(BO). */
  924. #ifdef RN
  925. FLD -8 * SIZE(BO)
  926. fmul %st, %st(1)
  927. fmulp %st, %st(2)
  928. #endif
  929. #ifdef RT
  930. FLD -8 * SIZE(BO)
  931. fmul %st, %st(1)
  932. fmulp %st, %st(2)
  933. #endif
  934. #ifdef LN
  /* LN: CO moves back two elements before the store. */
  935. subq $2 * SIZE, CO
  936. #endif
  /* Write both results back to the packed buffer (BO for LN/LT, AO for
     RN/RT). Each value is duplicated with fld first; FST is defined outside
     this file and presumably stores and pops. */
  937. #if defined(LN) || defined(LT)
  938. fld %st
  939. FST -8 * SIZE(BO)
  940. fxch %st(1)
  941. fld %st
  942. FST -7 * SIZE(BO)
  943. #else
  944. fld %st
  945. FST -8 * SIZE(AO)
  946. fxch %st(1)
  947. fld %st
  948. FST -7 * SIZE(AO)
  949. #endif
  /* After the fxch the second result is on top, so it is stored at
     1 * SIZE(CO) first, followed by the first result at 0 * SIZE(CO). */
  950. FST 1 * SIZE(CO)
  951. FST 0 * SIZE(CO)
  952. #ifndef LN
  953. addq $2 * SIZE, CO
  954. #endif
  955. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by 2 * (K - KK) elements and BO by K - KK elements. */
  956. movq K, %rax
  957. subq KK, %rax
  958. salq $BASE_SHIFT, %rax
  959. leaq (AO, %rax, 2), AO
  960. leaq (BO, %rax, 1), BO
  961. #endif
  /* Per-variant bookkeeping for the two rows just processed. */
  962. #ifdef LN
  963. subq $2, KK
  964. #endif
  965. #ifdef LT
  966. addq $2, KK
  967. #endif
  968. #ifdef RT
  /* RT: AORIG moves forward by 2 * K elements. */
  969. movq K, %rax
  970. salq $1 + BASE_SHIFT, %rax
  971. addq %rax, AORIG
  972. #endif
  /* Next row pair. */
  973. decq I
  974. jne .L31
  975. ALIGN_4
  /* End of the single-column pass. The B and KK updates below mirror the
     column-pair tail at .L29; nothing in the visible code reads them
     afterwards, because control falls through to the epilogue. */
  976. .L49:
  977. #ifdef LN
  978. movq K, %rax
  979. salq $BASE_SHIFT, %rax
  980. leaq (B, %rax, 1), B
  981. #endif
  982. #if defined(LT) || defined(RN)
  983. movq BO, B
  984. #endif
  985. #ifdef RN
  986. addq $1, KK
  987. #endif
  988. #ifdef RT
  989. subq $1, KK
  990. #endif
  991. ALIGN_4
  /* Epilogue: restore the six registers saved at entry, release the local
     frame and return. */
  992. .L999:
  993. movq 0(%rsp), %rbx
  994. movq 8(%rsp), %rbp
  995. movq 16(%rsp), %r12
  996. movq 24(%rsp), %r13
  997. movq 32(%rsp), %r14
  998. movq 40(%rsp), %r15
  999. addq $STACKSIZE, %rsp
  1000. ret
  1001. EPILOGUE