You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_4x4_LT.S 59 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #if !defined(EV4) && !defined(EV5) && !defined(EV6) /* Build guard: one of EV4, EV5 or EV6 must be defined, otherwise compilation stops with the #error below. */
  41. #error "Architecture is not specified."
  42. #endif
  43. #ifdef EV6 /* EV6: PREFETCHSIZE is 56 and UNOP expands to a real unop instruction. */
  44. #define PREFETCHSIZE 56
  45. #define UNOP unop
  46. #endif
  47. #ifdef EV5 /* EV5: same PREFETCHSIZE as EV6, but UNOP expands to nothing. */
  48. #define PREFETCHSIZE 56
  49. #define UNOP
  50. #endif
  51. #ifdef EV4 /* EV4: UNOP expands to nothing and PREFETCHSIZE stays undefined; the visible ldq $31 and ldl $31 loads that use it are wrapped in #ifndef EV4. */
  52. #define UNOP
  53. #endif
  54. #define STACKSIZE 80 /* Bytes taken from $sp at entry; the visible prologue spills $f2-$f9 into offsets 0..56 of this area. */
  55. #define M $16 /* M, N, K: integer size arguments; the routine branches to $L999 when any of them is <= 0. */
  56. #define N $17
  57. #define K $18
  58. #define A $20 /* A, B: base addresses from which the working pointers AO and BO are derived. */
  59. #define B $21
  60. #define C $22 /* C and LDC are loaded at entry from 0 and 8 + STACKSIZE($sp), i.e. from above this routine's own frame. */
  61. #define LDC $23
  62. #define C1 $19 /* C1..C4: four output pointers for the current panel; C1 = C and each following one is LDC further on. */
  63. #define C2 $24
  64. #define C3 $25
  65. #define C4 $27
  66. #define AO $at /* AO, BO: running pointers through which the tile loops load (and, after the solve, store) elements. */
  67. #define BO $5
  68. #define I $6 /* I: row-block counter or flag computed from M; J: panel counter (N >> 2); L: inner-loop counter stepped by -2. */
  69. #define J $7
  70. #define L $8
  71. #define a1 $f16 /* a1..a4: FP operand registers, loaded through AO in the multiply loops and reused for loads in the solve phase. */
  72. #define a2 $f17
  73. #define a3 $f18
  74. #define a4 $f19
  75. #define b1 $f20 /* b1..b4: FP operand registers, loaded through B or BO in the multiply loops and reused for loads in the solve phase. */
  76. #define b2 $f21
  77. #define b3 $f22
  78. #define b4 $f23
  79. #define t1 $f24 /* t1..t4: MUL results that a later ADD or SUB folds into one of the c registers. */
  80. #define t2 $f25
  81. #define t3 $f26
  82. #define t4 $f27
  83. #define a5 $f28 /* a5, a6, b5: extra operand registers used by the unrolled inner loops. */
  84. #define a6 $f30
  85. #define b5 $f29
  86. #define alpha $f30 /* NOTE(review): alpha names the same register as a6 ($f30); alpha is not referenced in the visible part of this file -- confirm it is unused before touching either alias. */
  87. #define c01 $f0 /* c01..c16: sixteen FP accumulators mapped to $f0..$f15; the 4x4 tile uses all of them, the 2x4 tile a subset. */
  88. #define c02 $f1
  89. #define c03 $f2 /* c03..c10 occupy $f2..$f9, the registers the prologue saves on the stack before they are overwritten. */
  90. #define c04 $f3
  91. #define c05 $f4
  92. #define c06 $f5
  93. #define c07 $f6
  94. #define c08 $f7
  95. #define c09 $f8
  96. #define c10 $f9
  97. #define c11 $f10
  98. #define c12 $f11
  99. #define c13 $f12
  100. #define c14 $f13
  101. #define c15 $f14
  102. #define c16 $f15
  103. #define TMP1 $0 /* TMP1, TMP2: integer scratch registers for address and count arithmetic. */
  104. #define TMP2 $1
  105. #define KK $2 /* KK: running count derived from OFFSET (per LN/LT/RN/RT variant) and adjusted by 4 or 2 after each tile in the LT and LN paths. */
  106. #define AORIG $3 /* AORIG: copy of A kept as the base for recomputing AO in the LN and RT variants. */
  107. #define OFFSET $4 /* OFFSET: loaded at entry from 16 + STACKSIZE($sp). */
  108. PROLOGUE
  109. PROFCODE
  110. .frame $sp, STACKSIZE, $26, 0
  111. lda $sp, -STACKSIZE($sp)
  112. ldq C, 0 + STACKSIZE($sp)
  113. ldq LDC, 8 + STACKSIZE($sp)
  114. ldq OFFSET, 16 + STACKSIZE($sp)
  115. SXADDQ LDC, 0, LDC
  116. stt $f2, 0($sp)
  117. stt $f3, 8($sp)
  118. stt $f4, 16($sp)
  119. stt $f5, 24($sp)
  120. stt $f6, 32($sp)
  121. stt $f7, 40($sp)
  122. stt $f8, 48($sp)
  123. stt $f9, 56($sp)
  124. cmple M, 0, $0
  125. cmple N, 0, $1
  126. cmple K, 0, $2
  127. or $0, $1, $0
  128. or $0, $2, $0
  129. bne $0, $L999
  130. #ifdef LN
  131. mulq M, K, TMP1
  132. SXADDQ TMP1, A, A
  133. SXADDQ M, C, C
  134. #endif
  135. #ifdef RN
  136. negq OFFSET, KK
  137. #endif
  138. #ifdef RT
  139. mulq N, K, TMP1
  140. SXADDQ TMP1, B, B
  141. mulq N, LDC, TMP1
  142. addq TMP1, C, C
  143. subq N, OFFSET, KK
  144. #endif
  145. sra N, 2, J
  146. ble J, $L40
  147. .align 4
  148. $L01:
  149. #ifdef RT
  150. sll K, 2 + BASE_SHIFT, TMP1
  151. subq B, TMP1, B
  152. s4addq LDC, 0, TMP1
  153. subq C, TMP1, C
  154. #endif
  155. mov C, C1
  156. addq C, LDC, C2
  157. addq C2, LDC, C3
  158. #ifndef RT
  159. s4addq LDC, C, C
  160. #endif
  161. fclr t1
  162. addq C3, LDC, C4
  163. fclr t2
  164. #ifdef LN
  165. addq M, OFFSET, KK
  166. #endif
  167. #ifdef LT
  168. mov OFFSET, KK
  169. #endif
  170. #if defined(LN) || defined(RT)
  171. mov A, AORIG
  172. #else
  173. mov A, AO
  174. #endif
  175. sra M, 2, I
  176. fclr t3
  177. fclr t4
  178. ble I, $L20
  179. .align 4
  180. $L11:
  181. #if defined(LT) || defined(RN)
  182. LD a1, 0 * SIZE(AO)
  183. fclr c11
  184. LD a2, 1 * SIZE(AO)
  185. fclr c12
  186. LD a3, 2 * SIZE(AO)
  187. fclr c16
  188. LD a4, 3 * SIZE(AO)
  189. fclr c15
  190. LD b1, 0 * SIZE(B)
  191. fclr c01
  192. LD b2, 1 * SIZE(B)
  193. fclr c02
  194. LD b3, 2 * SIZE(B)
  195. fclr c06
  196. LD b4, 3 * SIZE(B)
  197. fclr c05
  198. lds $f31, 4 * SIZE(C1)
  199. fclr c03
  200. lda L, -2(KK)
  201. fclr c04
  202. lds $f31, 7 * SIZE(C2)
  203. fclr c08
  204. lda BO, 4 * SIZE(B)
  205. fclr c13
  206. lds $f31, 4 * SIZE(C3)
  207. fclr c09
  208. lda AO, 4 * SIZE(AO)
  209. fclr c10
  210. lds $f31, 7 * SIZE(C4)
  211. fclr c14
  212. fclr c07
  213. ble KK, $L18
  214. #else
  215. #ifdef LN
  216. sll K, BASE_SHIFT + 2, TMP1
  217. subq AORIG, TMP1, AORIG
  218. #endif
  219. sll KK, BASE_SHIFT + 2, TMP1
  220. addq AORIG, TMP1, AO
  221. addq B, TMP1, BO
  222. subq K, KK, TMP1
  223. LD a1, 0 * SIZE(AO)
  224. fclr c11
  225. LD a2, 1 * SIZE(AO)
  226. fclr c12
  227. LD a3, 2 * SIZE(AO)
  228. fclr c16
  229. LD a4, 3 * SIZE(AO)
  230. fclr c15
  231. LD b1, 0 * SIZE(BO)
  232. fclr c01
  233. LD b2, 1 * SIZE(BO)
  234. fclr c02
  235. LD b3, 2 * SIZE(BO)
  236. fclr c06
  237. LD b4, 3 * SIZE(BO)
  238. fclr c05
  239. lds $f31, 4 * SIZE(C1)
  240. fclr c03
  241. lda L, -2(TMP1)
  242. fclr c04
  243. lds $f31, 7 * SIZE(C2)
  244. fclr c08
  245. lda BO, 4 * SIZE(BO)
  246. fclr c13
  247. lds $f31, 4 * SIZE(C3)
  248. fclr c09
  249. lda AO, 4 * SIZE(AO)
  250. fclr c10
  251. lds $f31, 7 * SIZE(C4)
  252. fclr c14
  253. fclr c07
  254. ble TMP1, $L18
  255. #endif
  256. ble L, $L15
  257. .align 5
  258. $L12:
  259. /* 1 */
  260. ADD c11, t1, c11
  261. #ifndef EV4
  262. ldq $31, PREFETCHSIZE * SIZE(AO)
  263. #else
  264. unop
  265. #endif
  266. MUL b1, a1, t1
  267. #ifndef EV4
  268. ldl $31, PREFETCHSIZE * SIZE(BO)
  269. #else
  270. unop
  271. #endif
  272. ADD c12, t2, c12
  273. unop
  274. MUL b1, a2, t2
  275. unop
  276. ADD c16, t3, c16
  277. unop
  278. MUL b2, a2, t3
  279. LD a5, 0 * SIZE(AO)
  280. ADD c15, t4, c15
  281. unop
  282. MUL b2, a1, t4
  283. LD b5, 0 * SIZE(BO)
  284. /* 2 */
  285. ADD c01, t1, c01
  286. UNOP
  287. MUL b1, a3, t1
  288. UNOP
  289. ADD c02, t2, c02
  290. UNOP
  291. MUL b1, a4, t2
  292. UNOP
  293. ADD c06, t3, c06
  294. unop
  295. MUL b2, a4, t3
  296. unop
  297. ADD c05, t4, c05
  298. unop
  299. MUL b4, a1, t4
  300. unop
  301. /* 3 */
  302. ADD c03, t1, c03
  303. unop
  304. MUL b3, a1, t1
  305. unop
  306. ADD c04, t2, c04
  307. unop
  308. MUL b3, a2, t2
  309. unop
  310. ADD c08, t3, c08
  311. unop
  312. MUL b4, a2, t3
  313. LD a2, 1 * SIZE(AO)
  314. ADD c13, t4, c13
  315. unop
  316. MUL b2, a3, t4
  317. LD b2, 1 * SIZE(BO)
  318. /* 4 */
  319. ADD c09, t1, c09
  320. unop
  321. MUL b3, a3, t1
  322. LD a6, 2 * SIZE(AO)
  323. ADD c10, t2, c10
  324. unop
  325. MUL b3, a4, t2
  326. LD b3, 2 * SIZE(BO)
  327. ADD c14, t3, c14
  328. unop
  329. MUL b4, a4, t3
  330. LD a4, 3 * SIZE(AO)
  331. ADD c07, t4, c07
  332. unop
  333. MUL b4, a3, t4
  334. LD b4, 3 * SIZE(BO)
  335. /* 5 */
  336. ADD c11, t1, c11
  337. unop
  338. MUL b5, a5, t1
  339. LD a1, 4 * SIZE(AO)
  340. ADD c12, t2, c12
  341. lda L, -2(L)
  342. MUL b5, a2, t2
  343. LD b1, 4 * SIZE(BO)
  344. ADD c16, t3, c16
  345. unop
  346. MUL b2, a2, t3
  347. unop
  348. ADD c15, t4, c15
  349. unop
  350. MUL b2, a5, t4
  351. unop
  352. /* 6 */
  353. ADD c01, t1, c01
  354. unop
  355. MUL b5, a6, t1
  356. unop
  357. ADD c02, t2, c02
  358. unop
  359. MUL b5, a4, t2
  360. unop
  361. ADD c06, t3, c06
  362. unop
  363. MUL b2, a4, t3
  364. unop
  365. ADD c05, t4, c05
  366. unop
  367. MUL b4, a5, t4
  368. unop
  369. /* 7 */
  370. ADD c03, t1, c03
  371. lda AO, 8 * SIZE(AO)
  372. MUL b3, a5, t1
  373. unop
  374. ADD c04, t2, c04
  375. lda BO, 8 * SIZE(BO)
  376. MUL b3, a2, t2
  377. unop
  378. ADD c08, t3, c08
  379. unop
  380. MUL b4, a2, t3
  381. LD a2, -3 * SIZE(AO)
  382. ADD c13, t4, c13
  383. unop
  384. MUL b2, a6, t4
  385. LD b2, -3 * SIZE(BO)
  386. /* 8 */
  387. ADD c09, t1, c09
  388. unop
  389. MUL b3, a6, t1
  390. LD a3, -2 * SIZE(AO)
  391. ADD c10, t2, c10
  392. unop
  393. MUL b3, a4, t2
  394. LD b3, -2 * SIZE(BO)
  395. ADD c14, t3, c14
  396. unop
  397. MUL b4, a4, t3
  398. LD a4, -1 * SIZE(AO)
  399. ADD c07, t4, c07
  400. MUL b4, a6, t4
  401. LD b4, -1 * SIZE(BO)
  402. bgt L, $L12
  403. .align 4
  404. $L15:
  405. ADD c11, t1, c11
  406. MUL b1, a1, t1
  407. #if defined(LT) || defined(RN)
  408. blbs KK, $L17
  409. #else
  410. blbs TMP1, $L17
  411. #endif
  412. .align 4
  413. ADD c12, t2, c12
  414. MUL b1, a2, t2
  415. ADD c16, t3, c16
  416. MUL b2, a2, t3
  417. ADD c15, t4, c15
  418. MUL b2, a1, t4
  419. ADD c01, t1, c01
  420. MUL b1, a3, t1
  421. ADD c02, t2, c02
  422. unop
  423. MUL b1, a4, t2
  424. LD b1, 0 * SIZE(BO)
  425. ADD c06, t3, c06
  426. MUL b2, a4, t3
  427. ADD c05, t4, c05
  428. MUL b4, a1, t4
  429. ADD c03, t1, c03
  430. unop
  431. MUL b3, a1, t1
  432. LD a1, 0 * SIZE(AO)
  433. ADD c04, t2, c04
  434. unop
  435. MUL b3, a2, t2
  436. unop
  437. ADD c08, t3, c08
  438. unop
  439. MUL b4, a2, t3
  440. LD a2, 1 * SIZE(AO)
  441. ADD c13, t4, c13
  442. unop
  443. MUL b2, a3, t4
  444. LD b2, 1 * SIZE(BO)
  445. ADD c09, t1, c09
  446. unop
  447. MUL b3, a3, t1
  448. lda AO, 4 * SIZE(AO)
  449. ADD c10, t2, c10
  450. unop
  451. MUL b3, a4, t2
  452. LD b3, 2 * SIZE(BO)
  453. ADD c14, t3, c14
  454. unop
  455. MUL b4, a4, t3
  456. LD a4, -1 * SIZE(AO)
  457. ADD c07, t4, c07
  458. unop
  459. MUL b4, a3, t4
  460. LD a3, -2 * SIZE(AO)
  461. ADD c11, t1, c11
  462. LD b4, 3 * SIZE(BO)
  463. MUL b1, a1, t1
  464. lda BO, 4 * SIZE(BO)
  465. .align 4
  466. $L17:
  467. ADD c12, t2, c12
  468. MUL b1, a2, t2
  469. ADD c16, t3, c16
  470. MUL b2, a2, t3
  471. ADD c15, t4, c15
  472. MUL b2, a1, t4
  473. ADD c01, t1, c01
  474. MUL b1, a3, t1
  475. ADD c02, t2, c02
  476. MUL b1, a4, t2
  477. ADD c06, t3, c06
  478. MUL b2, a4, t3
  479. ADD c05, t4, c05
  480. MUL b4, a1, t4
  481. ADD c03, t1, c03
  482. MUL b3, a1, t1
  483. ADD c04, t2, c04
  484. MUL b3, a2, t2
  485. ADD c08, t3, c08
  486. MUL b4, a2, t3
  487. ADD c13, t4, c13
  488. MUL b2, a3, t4
  489. ADD c09, t1, c09
  490. MUL b3, a3, t1
  491. ADD c10, t2, c10
  492. MUL b3, a4, t2
  493. ADD c14, t3, c14
  494. MUL b4, a4, t3
  495. ADD c07, t4, c07
  496. lda AO, 4 * SIZE(AO)
  497. MUL b4, a3, t4
  498. lda BO, 4 * SIZE(BO)
  499. ADD c11, t1, c11
  500. ADD c12, t2, c12
  501. ADD c16, t3, c16
  502. ADD c15, t4, c15
  503. .align 4
  504. $L18:
  505. #if defined(LN) || defined(RT)
  506. #ifdef LN
  507. subq KK, 4, TMP1
  508. #else
  509. subq KK, 4, TMP1
  510. #endif
  511. sll TMP1, BASE_SHIFT + 2, TMP2
  512. addq AORIG, TMP2, AO
  513. sll TMP1, BASE_SHIFT + 2, TMP2
  514. addq B, TMP2, BO
  515. #else
  516. lda AO, -4 * SIZE(AO)
  517. lda BO, -4 * SIZE(BO)
  518. #endif
  519. #if defined(LN) || defined(LT)
  520. LD a1, 0 * SIZE(BO)
  521. LD a2, 1 * SIZE(BO)
  522. LD a3, 2 * SIZE(BO)
  523. LD a4, 3 * SIZE(BO)
  524. LD b1, 4 * SIZE(BO)
  525. LD b2, 5 * SIZE(BO)
  526. LD b3, 6 * SIZE(BO)
  527. LD b4, 7 * SIZE(BO)
  528. SUB a1, c01, c01
  529. SUB a2, c05, c05
  530. SUB a3, c09, c09
  531. SUB a4, c13, c13
  532. SUB b1, c02, c02
  533. SUB b2, c06, c06
  534. SUB b3, c10, c10
  535. SUB b4, c14, c14
  536. LD a1, 8 * SIZE(BO)
  537. LD a2, 9 * SIZE(BO)
  538. LD a3, 10 * SIZE(BO)
  539. LD a4, 11 * SIZE(BO)
  540. LD b1, 12 * SIZE(BO)
  541. LD b2, 13 * SIZE(BO)
  542. LD b3, 14 * SIZE(BO)
  543. LD b4, 15 * SIZE(BO)
  544. SUB a1, c03, c03
  545. SUB a2, c07, c07
  546. SUB a3, c11, c11
  547. SUB a4, c15, c15
  548. SUB b1, c04, c04
  549. SUB b2, c08, c08
  550. SUB b3, c12, c12
  551. SUB b4, c16, c16
  552. #else
  553. LD a1, 0 * SIZE(AO)
  554. LD a2, 1 * SIZE(AO)
  555. LD a3, 2 * SIZE(AO)
  556. LD a4, 3 * SIZE(AO)
  557. LD b1, 4 * SIZE(AO)
  558. LD b2, 5 * SIZE(AO)
  559. LD b3, 6 * SIZE(AO)
  560. LD b4, 7 * SIZE(AO)
  561. SUB a1, c01, c01
  562. SUB a2, c02, c02
  563. SUB a3, c03, c03
  564. SUB a4, c04, c04
  565. SUB b1, c05, c05
  566. SUB b2, c06, c06
  567. SUB b3, c07, c07
  568. SUB b4, c08, c08
  569. LD a1, 8 * SIZE(AO)
  570. LD a2, 9 * SIZE(AO)
  571. LD a3, 10 * SIZE(AO)
  572. LD a4, 11 * SIZE(AO)
  573. LD b1, 12 * SIZE(AO)
  574. LD b2, 13 * SIZE(AO)
  575. LD b3, 14 * SIZE(AO)
  576. LD b4, 15 * SIZE(AO)
  577. SUB a1, c09, c09
  578. SUB a2, c10, c10
  579. SUB a3, c11, c11
  580. SUB a4, c12, c12
  581. SUB b1, c13, c13
  582. SUB b2, c14, c14
  583. SUB b3, c15, c15
  584. SUB b4, c16, c16
  585. #endif
  586. #ifdef LN
  587. LD a1, 15 * SIZE(AO)
  588. LD a2, 14 * SIZE(AO)
  589. LD a3, 13 * SIZE(AO)
  590. LD a4, 12 * SIZE(AO)
  591. MUL a1, c04, c04
  592. MUL a1, c08, c08
  593. MUL a1, c12, c12
  594. MUL a1, c16, c16
  595. MUL a2, c04, t1
  596. MUL a2, c08, t2
  597. MUL a2, c12, t3
  598. MUL a2, c16, t4
  599. SUB c03, t1, c03
  600. SUB c07, t2, c07
  601. SUB c11, t3, c11
  602. SUB c15, t4, c15
  603. MUL a3, c04, t1
  604. MUL a3, c08, t2
  605. MUL a3, c12, t3
  606. MUL a3, c16, t4
  607. SUB c02, t1, c02
  608. SUB c06, t2, c06
  609. SUB c10, t3, c10
  610. SUB c14, t4, c14
  611. MUL a4, c04, t1
  612. MUL a4, c08, t2
  613. MUL a4, c12, t3
  614. MUL a4, c16, t4
  615. SUB c01, t1, c01
  616. SUB c05, t2, c05
  617. SUB c09, t3, c09
  618. SUB c13, t4, c13
  619. LD b1, 10 * SIZE(AO)
  620. LD b2, 9 * SIZE(AO)
  621. LD b3, 8 * SIZE(AO)
  622. MUL b1, c03, c03
  623. MUL b1, c07, c07
  624. MUL b1, c11, c11
  625. MUL b1, c15, c15
  626. MUL b2, c03, t1
  627. MUL b2, c07, t2
  628. MUL b2, c11, t3
  629. MUL b2, c15, t4
  630. SUB c02, t1, c02
  631. SUB c06, t2, c06
  632. SUB c10, t3, c10
  633. SUB c14, t4, c14
  634. MUL b3, c03, t1
  635. MUL b3, c07, t2
  636. MUL b3, c11, t3
  637. MUL b3, c15, t4
  638. SUB c01, t1, c01
  639. SUB c05, t2, c05
  640. SUB c09, t3, c09
  641. SUB c13, t4, c13
  642. LD a1, 5 * SIZE(AO)
  643. LD a2, 4 * SIZE(AO)
  644. LD a3, 0 * SIZE(AO)
  645. MUL a1, c02, c02
  646. MUL a1, c06, c06
  647. MUL a1, c10, c10
  648. MUL a1, c14, c14
  649. MUL a2, c02, t1
  650. MUL a2, c06, t2
  651. MUL a2, c10, t3
  652. MUL a2, c14, t4
  653. SUB c01, t1, c01
  654. SUB c05, t2, c05
  655. SUB c09, t3, c09
  656. SUB c13, t4, c13
  657. MUL a3, c01, c01
  658. MUL a3, c05, c05
  659. MUL a3, c09, c09
  660. MUL a3, c13, c13
  661. #endif
  662. #ifdef LT
  663. LD a1, 0 * SIZE(AO)
  664. LD a2, 1 * SIZE(AO)
  665. LD a3, 2 * SIZE(AO)
  666. LD a4, 3 * SIZE(AO)
  667. MUL a1, c01, c01
  668. MUL a1, c05, c05
  669. MUL a1, c09, c09
  670. MUL a1, c13, c13
  671. MUL a2, c01, t1
  672. MUL a2, c05, t2
  673. MUL a2, c09, t3
  674. MUL a2, c13, t4
  675. SUB c02, t1, c02
  676. SUB c06, t2, c06
  677. SUB c10, t3, c10
  678. SUB c14, t4, c14
  679. MUL a3, c01, t1
  680. MUL a3, c05, t2
  681. MUL a3, c09, t3
  682. MUL a3, c13, t4
  683. SUB c03, t1, c03
  684. SUB c07, t2, c07
  685. SUB c11, t3, c11
  686. SUB c15, t4, c15
  687. MUL a4, c01, t1
  688. MUL a4, c05, t2
  689. MUL a4, c09, t3
  690. MUL a4, c13, t4
  691. SUB c04, t1, c04
  692. SUB c08, t2, c08
  693. SUB c12, t3, c12
  694. SUB c16, t4, c16
  695. LD b1, 5 * SIZE(AO)
  696. LD b2, 6 * SIZE(AO)
  697. LD b3, 7 * SIZE(AO)
  698. MUL b1, c02, c02
  699. MUL b1, c06, c06
  700. MUL b1, c10, c10
  701. MUL b1, c14, c14
  702. MUL b2, c02, t1
  703. MUL b2, c06, t2
  704. MUL b2, c10, t3
  705. MUL b2, c14, t4
  706. SUB c03, t1, c03
  707. SUB c07, t2, c07
  708. SUB c11, t3, c11
  709. SUB c15, t4, c15
  710. MUL b3, c02, t1
  711. MUL b3, c06, t2
  712. MUL b3, c10, t3
  713. MUL b3, c14, t4
  714. SUB c04, t1, c04
  715. SUB c08, t2, c08
  716. SUB c12, t3, c12
  717. SUB c16, t4, c16
  718. LD a1, 10 * SIZE(AO)
  719. LD a2, 11 * SIZE(AO)
  720. LD a3, 15 * SIZE(AO)
  721. MUL a1, c03, c03
  722. MUL a1, c07, c07
  723. MUL a1, c11, c11
  724. MUL a1, c15, c15
  725. MUL a2, c03, t1
  726. MUL a2, c07, t2
  727. MUL a2, c11, t3
  728. MUL a2, c15, t4
  729. SUB c04, t1, c04
  730. SUB c08, t2, c08
  731. SUB c12, t3, c12
  732. SUB c16, t4, c16
  733. MUL a3, c04, c04
  734. MUL a3, c08, c08
  735. MUL a3, c12, c12
  736. MUL a3, c16, c16
  737. #endif
  738. #ifdef RN
  739. LD a1, 0 * SIZE(BO)
  740. LD a2, 1 * SIZE(BO)
  741. LD a3, 2 * SIZE(BO)
  742. LD a4, 3 * SIZE(BO)
  743. MUL a1, c01, c01
  744. MUL a1, c02, c02
  745. MUL a1, c03, c03
  746. MUL a1, c04, c04
  747. MUL a2, c01, t1
  748. MUL a2, c02, t2
  749. MUL a2, c03, t3
  750. MUL a2, c04, t4
  751. SUB c05, t1, c05
  752. SUB c06, t2, c06
  753. SUB c07, t3, c07
  754. SUB c08, t4, c08
  755. MUL a3, c01, t1
  756. MUL a3, c02, t2
  757. MUL a3, c03, t3
  758. MUL a3, c04, t4
  759. SUB c09, t1, c09
  760. SUB c10, t2, c10
  761. SUB c11, t3, c11
  762. SUB c12, t4, c12
  763. MUL a4, c01, t1
  764. MUL a4, c02, t2
  765. MUL a4, c03, t3
  766. MUL a4, c04, t4
  767. SUB c13, t1, c13
  768. SUB c14, t2, c14
  769. SUB c15, t3, c15
  770. SUB c16, t4, c16
  771. LD b1, 5 * SIZE(BO)
  772. LD b2, 6 * SIZE(BO)
  773. LD b3, 7 * SIZE(BO)
  774. MUL b1, c05, c05
  775. MUL b1, c06, c06
  776. MUL b1, c07, c07
  777. MUL b1, c08, c08
  778. MUL b2, c05, t1
  779. MUL b2, c06, t2
  780. MUL b2, c07, t3
  781. MUL b2, c08, t4
  782. SUB c09, t1, c09
  783. SUB c10, t2, c10
  784. SUB c11, t3, c11
  785. SUB c12, t4, c12
  786. MUL b3, c05, t1
  787. MUL b3, c06, t2
  788. MUL b3, c07, t3
  789. MUL b3, c08, t4
  790. SUB c13, t1, c13
  791. SUB c14, t2, c14
  792. SUB c15, t3, c15
  793. SUB c16, t4, c16
  794. LD a1, 10 * SIZE(BO)
  795. LD a2, 11 * SIZE(BO)
  796. LD a3, 15 * SIZE(BO)
  797. MUL a1, c09, c09
  798. MUL a1, c10, c10
  799. MUL a1, c11, c11
  800. MUL a1, c12, c12
  801. MUL a2, c09, t1
  802. MUL a2, c10, t2
  803. MUL a2, c11, t3
  804. MUL a2, c12, t4
  805. SUB c13, t1, c13
  806. SUB c14, t2, c14
  807. SUB c15, t3, c15
  808. SUB c16, t4, c16
  809. MUL a3, c13, c13
  810. MUL a3, c14, c14
  811. MUL a3, c15, c15
  812. MUL a3, c16, c16
  813. #endif
  814. #ifdef RT
  815. LD a1, 15 * SIZE(BO)
  816. LD a2, 14 * SIZE(BO)
  817. LD a3, 13 * SIZE(BO)
  818. LD a4, 12 * SIZE(BO)
  819. MUL a1, c13, c13
  820. MUL a1, c14, c14
  821. MUL a1, c15, c15
  822. MUL a1, c16, c16
  823. MUL a2, c13, t1
  824. MUL a2, c14, t2
  825. MUL a2, c15, t3
  826. MUL a2, c16, t4
  827. SUB c09, t1, c09
  828. SUB c10, t2, c10
  829. SUB c11, t3, c11
  830. SUB c12, t4, c12
  831. MUL a3, c13, t1
  832. MUL a3, c14, t2
  833. MUL a3, c15, t3
  834. MUL a3, c16, t4
  835. SUB c05, t1, c05
  836. SUB c06, t2, c06
  837. SUB c07, t3, c07
  838. SUB c08, t4, c08
  839. MUL a4, c13, t1
  840. MUL a4, c14, t2
  841. MUL a4, c15, t3
  842. MUL a4, c16, t4
  843. SUB c01, t1, c01
  844. SUB c02, t2, c02
  845. SUB c03, t3, c03
  846. SUB c04, t4, c04
  847. LD b1, 10 * SIZE(BO)
  848. LD b2, 9 * SIZE(BO)
  849. LD b3, 8 * SIZE(BO)
  850. MUL b1, c09, c09
  851. MUL b1, c10, c10
  852. MUL b1, c11, c11
  853. MUL b1, c12, c12
  854. MUL b2, c09, t1
  855. MUL b2, c10, t2
  856. MUL b2, c11, t3
  857. MUL b2, c12, t4
  858. SUB c05, t1, c05
  859. SUB c06, t2, c06
  860. SUB c07, t3, c07
  861. SUB c08, t4, c08
  862. MUL b3, c09, t1
  863. MUL b3, c10, t2
  864. MUL b3, c11, t3
  865. MUL b3, c12, t4
  866. SUB c01, t1, c01
  867. SUB c02, t2, c02
  868. SUB c03, t3, c03
  869. SUB c04, t4, c04
  870. LD a1, 5 * SIZE(BO)
  871. LD a2, 4 * SIZE(BO)
  872. LD a3, 0 * SIZE(BO)
  873. MUL a1, c05, c05
  874. MUL a1, c06, c06
  875. MUL a1, c07, c07
  876. MUL a1, c08, c08
  877. MUL a2, c05, t1
  878. MUL a2, c06, t2
  879. MUL a2, c07, t3
  880. MUL a2, c08, t4
  881. SUB c01, t1, c01
  882. SUB c02, t2, c02
  883. SUB c03, t3, c03
  884. SUB c04, t4, c04
  885. MUL a3, c01, c01
  886. MUL a3, c02, c02
  887. MUL a3, c03, c03
  888. MUL a3, c04, c04
  889. #endif
  890. #if defined(LN) || defined(LT)
  891. ST c01, 0 * SIZE(BO)
  892. ST c05, 1 * SIZE(BO)
  893. ST c09, 2 * SIZE(BO)
  894. ST c13, 3 * SIZE(BO)
  895. ST c02, 4 * SIZE(BO)
  896. ST c06, 5 * SIZE(BO)
  897. ST c10, 6 * SIZE(BO)
  898. ST c14, 7 * SIZE(BO)
  899. ST c03, 8 * SIZE(BO)
  900. ST c07, 9 * SIZE(BO)
  901. ST c11, 10 * SIZE(BO)
  902. ST c15, 11 * SIZE(BO)
  903. ST c04, 12 * SIZE(BO)
  904. ST c08, 13 * SIZE(BO)
  905. ST c12, 14 * SIZE(BO)
  906. ST c16, 15 * SIZE(BO)
  907. #else
  908. ST c01, 0 * SIZE(AO)
  909. ST c02, 1 * SIZE(AO)
  910. ST c03, 2 * SIZE(AO)
  911. ST c04, 3 * SIZE(AO)
  912. ST c05, 4 * SIZE(AO)
  913. ST c06, 5 * SIZE(AO)
  914. ST c07, 6 * SIZE(AO)
  915. ST c08, 7 * SIZE(AO)
  916. ST c09, 8 * SIZE(AO)
  917. ST c10, 9 * SIZE(AO)
  918. ST c11, 10 * SIZE(AO)
  919. ST c12, 11 * SIZE(AO)
  920. ST c13, 12 * SIZE(AO)
  921. ST c14, 13 * SIZE(AO)
  922. ST c15, 14 * SIZE(AO)
  923. ST c16, 15 * SIZE(AO)
  924. #endif
  925. #ifdef LN
  926. lda C1, -4 * SIZE(C1)
  927. lda C2, -4 * SIZE(C2)
  928. lda C3, -4 * SIZE(C3)
  929. lda C4, -4 * SIZE(C4)
  930. #endif
  931. ST c01, 0 * SIZE(C1)
  932. ST c02, 1 * SIZE(C1)
  933. ST c03, 2 * SIZE(C1)
  934. ST c04, 3 * SIZE(C1)
  935. ST c05, 0 * SIZE(C2)
  936. ST c06, 1 * SIZE(C2)
  937. ST c07, 2 * SIZE(C2)
  938. ST c08, 3 * SIZE(C2)
  939. ST c09, 0 * SIZE(C3)
  940. ST c10, 1 * SIZE(C3)
  941. ST c11, 2 * SIZE(C3)
  942. ST c12, 3 * SIZE(C3)
  943. ST c13, 0 * SIZE(C4)
  944. ST c14, 1 * SIZE(C4)
  945. ST c15, 2 * SIZE(C4)
  946. ST c16, 3 * SIZE(C4)
  947. #ifndef LN
  948. lda C1, 4 * SIZE(C1)
  949. lda C2, 4 * SIZE(C2)
  950. lda C3, 4 * SIZE(C3)
  951. lda C4, 4 * SIZE(C4)
  952. #endif
  953. fclr t1
  954. fclr t2
  955. fclr t3
  956. fclr t4
  957. #ifdef RT
  958. sll K, 2 + BASE_SHIFT, TMP1
  959. addq AORIG, TMP1, AORIG
  960. #endif
  961. #if defined(LT) || defined(RN)
  962. subq K, KK, TMP1
  963. sll TMP1, BASE_SHIFT + 2, TMP1
  964. addq AO, TMP1, AO
  965. addq BO, TMP1, BO
  966. #endif
  967. #ifdef LT
  968. addq KK, 4, KK
  969. #endif
  970. #ifdef LN
  971. subq KK, 4, KK
  972. #endif
  973. lda I, -1(I)
  974. bgt I, $L11
  975. .align 4
  976. $L20:
  977. and M, 2, I
  978. ble I, $L30
  979. #if defined(LT) || defined(RN)
  980. LD a1, 0 * SIZE(AO)
  981. fclr c09
  982. LD a2, 1 * SIZE(AO)
  983. fclr c13
  984. LD a3, 2 * SIZE(AO)
  985. fclr c10
  986. LD a4, 3 * SIZE(AO)
  987. fclr c14
  988. LD b1, 0 * SIZE(B)
  989. lda L, -2(KK)
  990. LD b2, 1 * SIZE(B)
  991. lda AO, 2 * SIZE(AO)
  992. LD b3, 2 * SIZE(B)
  993. fclr c01
  994. LD b4, 3 * SIZE(B)
  995. fclr c05
  996. lda BO, 4 * SIZE(B)
  997. fclr c02
  998. fclr c06
  999. ble KK, $L28
  1000. ble L, $L25
  1001. #else
  1002. #ifdef LN
  1003. sll K, BASE_SHIFT + 1, TMP1
  1004. subq AORIG, TMP1, AORIG
  1005. #endif
  1006. sll KK, BASE_SHIFT + 1, TMP1
  1007. addq AORIG, TMP1, AO
  1008. sll KK, BASE_SHIFT + 2, TMP2
  1009. addq B, TMP2, BO
  1010. subq K, KK, TMP1
  1011. LD a1, 0 * SIZE(AO)
  1012. fclr c09
  1013. LD a2, 1 * SIZE(AO)
  1014. fclr c13
  1015. LD a3, 2 * SIZE(AO)
  1016. fclr c10
  1017. LD a4, 3 * SIZE(AO)
  1018. fclr c14
  1019. LD b1, 0 * SIZE(BO)
  1020. lda L, -2(TMP1)
  1021. LD b2, 1 * SIZE(BO)
  1022. lda AO, 2 * SIZE(AO)
  1023. LD b3, 2 * SIZE(BO)
  1024. fclr c01
  1025. LD b4, 3 * SIZE(BO)
  1026. fclr c05
  1027. lda BO, 4 * SIZE(BO)
  1028. fclr c02
  1029. fclr c06
  1030. ble TMP1, $L28
  1031. ble L, $L25
  1032. #endif
  1033. .align 4
  1034. $L22:
  1035. ADD c09, t1, c09
  1036. unop
  1037. MUL a1, b1, t1
  1038. unop
  1039. ADD c10, t2, c10
  1040. unop
  1041. MUL a2, b1, t2
  1042. LD b1, 0 * SIZE(BO)
  1043. ADD c13, t3, c13
  1044. unop
  1045. MUL a1, b2, t3
  1046. lda BO, 8 * SIZE(BO)
  1047. ADD c14, t4, c14
  1048. unop
  1049. MUL a2, b2, t4
  1050. LD b2, -7 * SIZE(BO)
  1051. ADD c01, t1, c01
  1052. unop
  1053. MUL a1, b3, t1
  1054. unop
  1055. ADD c02, t2, c02
  1056. unop
  1057. MUL a2, b3, t2
  1058. LD b3, -6 * SIZE(BO)
  1059. ADD c05, t3, c05
  1060. unop
  1061. MUL a1, b4, t3
  1062. LD a1, 2 * SIZE(AO)
  1063. ADD c06, t4, c06
  1064. MUL a2, b4, t4
  1065. LD b5, -5 * SIZE(BO)
  1066. ADD c09, t1, c09
  1067. unop
  1068. MUL a3, b1, t1
  1069. LD a2, 3 * SIZE(AO)
  1070. ADD c10, t2, c10
  1071. unop
  1072. MUL a4, b1, t2
  1073. LD b1, -4 * SIZE(BO)
  1074. ADD c13, t3, c13
  1075. unop
  1076. MUL a3, b2, t3
  1077. lda AO, 4 * SIZE(AO)
  1078. ADD c14, t4, c14
  1079. MUL a4, b2, t4
  1080. LD b2, -3 * SIZE(BO)
  1081. ADD c01, t1, c01
  1082. lda L, -2(L)
  1083. MUL a3, b3, t1
  1084. LD b4, -1 * SIZE(BO)
  1085. ADD c02, t2, c02
  1086. unop
  1087. MUL a4, b3, t2
  1088. LD b3, -2 * SIZE(BO)
  1089. ADD c05, t3, c05
  1090. unop
  1091. MUL a3, b5, t3
  1092. LD a3, 0 * SIZE(AO)
  1093. ADD c06, t4, c06
  1094. MUL a4, b5, t4
  1095. LD a4, 1 * SIZE(AO)
  1096. bgt L, $L22
  1097. .align 4
  1098. $L25:
  1099. ADD c09, t1, c09
  1100. MUL a1, b1, t1
  1101. #if defined(LT) || defined(RN)
  1102. blbs KK, $L27
  1103. #else
  1104. blbs TMP1, $L27
  1105. #endif
  1106. ADD c10, t2, c10
  1107. unop
  1108. MUL a2, b1, t2
  1109. LD b1, 0 * SIZE(BO)
  1110. ADD c13, t3, c13
  1111. unop
  1112. MUL a1, b2, t3
  1113. unop
  1114. ADD c14, t4, c14
  1115. unop
  1116. MUL a2, b2, t4
  1117. LD b2, 1 * SIZE(BO)
  1118. ADD c01, t1, c01
  1119. unop
  1120. MUL a1, b3, t1
  1121. lda AO, 2 * SIZE(AO)
  1122. ADD c02, t2, c02
  1123. unop
  1124. MUL a2, b3, t2
  1125. LD b3, 2 * SIZE(BO)
  1126. ADD c05, t3, c05
  1127. unop
  1128. MUL a1, b4, t3
  1129. LD a1, -2 * SIZE(AO)
  1130. ADD c06, t4, c06
  1131. unop
  1132. MUL a2, b4, t4
  1133. LD a2, -1 * SIZE(AO)
  1134. ADD c09, t1, c09
  1135. LD b4, 3 * SIZE(BO)
  1136. MUL a1, b1, t1
  1137. lda BO, 4 * SIZE(BO)
  1138. .align 4
  1139. $L27:
  1140. ADD c10, t2, c10
  1141. MUL a2, b1, t2
  1142. ADD c13, t3, c13
  1143. MUL a1, b2, t3
  1144. ADD c14, t4, c14
  1145. MUL a2, b2, t4
  1146. ADD c01, t1, c01
  1147. MUL a1, b3, t1
  1148. ADD c02, t2, c02
  1149. MUL a2, b3, t2
  1150. ADD c05, t3, c05
  1151. MUL a1, b4, t3
  1152. ADD c06, t4, c06
  1153. lda AO, 2 * SIZE(AO)
  1154. MUL a2, b4, t4
  1155. lda BO, 4 * SIZE(BO)
  1156. ADD c09, t1, c09
  1157. ADD c10, t2, c10
  1158. ADD c13, t3, c13
  1159. ADD c14, t4, c14
  1160. .align 4
  1161. $L28:
  1162. #if defined(LN) || defined(RT)
  1163. #ifdef LN
  1164. subq KK, 2, TMP1
  1165. #else
  1166. subq KK, 4, TMP1
  1167. #endif
  1168. sll TMP1, BASE_SHIFT + 1, TMP2
  1169. addq AORIG, TMP2, AO
  1170. sll TMP1, BASE_SHIFT + 2, TMP2
  1171. addq B, TMP2, BO
  1172. #else
  1173. lda AO, -2 * SIZE(AO)
  1174. lda BO, -4 * SIZE(BO)
  1175. #endif
  1176. #if defined(LN) || defined(LT)
  1177. LD a1, 0 * SIZE(BO)
  1178. LD a2, 1 * SIZE(BO)
  1179. LD a3, 2 * SIZE(BO)
  1180. LD a4, 3 * SIZE(BO)
  1181. LD b1, 4 * SIZE(BO)
  1182. LD b2, 5 * SIZE(BO)
  1183. LD b3, 6 * SIZE(BO)
  1184. LD b4, 7 * SIZE(BO)
  1185. SUB a1, c01, c01
  1186. SUB a2, c05, c05
  1187. SUB a3, c09, c09
  1188. SUB a4, c13, c13
  1189. SUB b1, c02, c02
  1190. SUB b2, c06, c06
  1191. SUB b3, c10, c10
  1192. SUB b4, c14, c14
  1193. #else
  1194. LD a1, 0 * SIZE(AO)
  1195. LD a2, 1 * SIZE(AO)
  1196. LD a3, 2 * SIZE(AO)
  1197. LD a4, 3 * SIZE(AO)
  1198. LD b1, 4 * SIZE(AO)
  1199. LD b2, 5 * SIZE(AO)
  1200. LD b3, 6 * SIZE(AO)
  1201. LD b4, 7 * SIZE(AO)
  1202. SUB a1, c01, c01
  1203. SUB a2, c02, c02
  1204. SUB a3, c05, c05
  1205. SUB a4, c06, c06
  1206. SUB b1, c09, c09
  1207. SUB b2, c10, c10
  1208. SUB b3, c13, c13
  1209. SUB b4, c14, c14
  1210. #endif
  1211. #ifdef LN
  1212. LD a1, 3 * SIZE(AO)
  1213. LD a2, 2 * SIZE(AO)
  1214. LD a3, 0 * SIZE(AO)
  1215. MUL a1, c02, c02
  1216. MUL a1, c06, c06
  1217. MUL a1, c10, c10
  1218. MUL a1, c14, c14
  1219. MUL a2, c02, t1
  1220. MUL a2, c06, t2
  1221. MUL a2, c10, t3
  1222. MUL a2, c14, t4
  1223. SUB c01, t1, c01
  1224. SUB c05, t2, c05
  1225. SUB c09, t3, c09
  1226. SUB c13, t4, c13
  1227. MUL a3, c01, c01
  1228. MUL a3, c05, c05
  1229. MUL a3, c09, c09
  1230. MUL a3, c13, c13
  1231. #endif
  1232. #ifdef LT
  1233. LD a1, 0 * SIZE(AO)
  1234. LD a2, 1 * SIZE(AO)
  1235. LD a3, 3 * SIZE(AO)
  1236. MUL a1, c01, c01
  1237. MUL a1, c05, c05
  1238. MUL a1, c09, c09
  1239. MUL a1, c13, c13
  1240. MUL a2, c01, t1
  1241. MUL a2, c05, t2
  1242. MUL a2, c09, t3
  1243. MUL a2, c13, t4
  1244. SUB c02, t1, c02
  1245. SUB c06, t2, c06
  1246. SUB c10, t3, c10
  1247. SUB c14, t4, c14
  1248. MUL a3, c02, c02
  1249. MUL a3, c06, c06
  1250. MUL a3, c10, c10
  1251. MUL a3, c14, c14
  1252. #endif
  1253. #ifdef RN
  1254. LD a1, 0 * SIZE(BO)
  1255. LD a2, 1 * SIZE(BO)
  1256. LD a3, 2 * SIZE(BO)
  1257. LD a4, 3 * SIZE(BO)
  1258. MUL a1, c01, c01
  1259. MUL a1, c02, c02
  1260. MUL a2, c01, t1
  1261. MUL a2, c02, t2
  1262. SUB c05, t1, c05
  1263. SUB c06, t2, c06
  1264. MUL a3, c01, t1
  1265. MUL a3, c02, t2
  1266. SUB c09, t1, c09
  1267. SUB c10, t2, c10
  1268. MUL a4, c01, t1
  1269. MUL a4, c02, t2
  1270. SUB c13, t1, c13
  1271. SUB c14, t2, c14
  1272. LD b1, 5 * SIZE(BO)
  1273. LD b2, 6 * SIZE(BO)
  1274. LD b3, 7 * SIZE(BO)
  1275. MUL b1, c05, c05
  1276. MUL b1, c06, c06
  1277. MUL b2, c05, t1
  1278. MUL b2, c06, t2
  1279. SUB c09, t1, c09
  1280. SUB c10, t2, c10
  1281. MUL b3, c05, t1
  1282. MUL b3, c06, t2
  1283. SUB c13, t1, c13
  1284. SUB c14, t2, c14
  1285. LD a1, 10 * SIZE(BO)
  1286. LD a2, 11 * SIZE(BO)
  1287. LD a3, 15 * SIZE(BO)
  1288. MUL a1, c09, c09
  1289. MUL a1, c10, c10
  1290. MUL a2, c09, t1
  1291. MUL a2, c10, t2
  1292. SUB c13, t1, c13
  1293. SUB c14, t2, c14
  1294. MUL a3, c13, c13
  1295. MUL a3, c14, c14
  1296. #endif
  1297. #ifdef RT
  1298. LD a1, 15 * SIZE(BO)
  1299. LD a2, 14 * SIZE(BO)
  1300. LD a3, 13 * SIZE(BO)
  1301. LD a4, 12 * SIZE(BO)
  1302. MUL a1, c13, c13
  1303. MUL a1, c14, c14
  1304. MUL a2, c13, t1
  1305. MUL a2, c14, t2
  1306. SUB c09, t1, c09
  1307. SUB c10, t2, c10
  1308. MUL a3, c13, t1
  1309. MUL a3, c14, t2
  1310. SUB c05, t1, c05
  1311. SUB c06, t2, c06
  1312. MUL a4, c13, t1
  1313. MUL a4, c14, t2
  1314. SUB c01, t1, c01
  1315. SUB c02, t2, c02
  1316. LD b1, 10 * SIZE(BO)
  1317. LD b2, 9 * SIZE(BO)
  1318. LD b3, 8 * SIZE(BO)
  1319. MUL b1, c09, c09
  1320. MUL b1, c10, c10
  1321. MUL b2, c09, t1
  1322. MUL b2, c10, t2
  1323. SUB c05, t1, c05
  1324. SUB c06, t2, c06
  1325. MUL b3, c09, t1
  1326. MUL b3, c10, t2
  1327. SUB c01, t1, c01
  1328. SUB c02, t2, c02
  1329. LD a1, 5 * SIZE(BO)
  1330. LD a2, 4 * SIZE(BO)
  1331. LD a3, 0 * SIZE(BO)
  1332. MUL a1, c05, c05
  1333. MUL a1, c06, c06
  1334. MUL a2, c05, t1
  1335. MUL a2, c06, t2
  1336. SUB c01, t1, c01
  1337. SUB c02, t2, c02
  1338. MUL a3, c01, c01
  1339. MUL a3, c02, c02
  1340. #endif
  1341. #if defined(LN) || defined(LT)
  1342. ST c01, 0 * SIZE(BO)
  1343. ST c05, 1 * SIZE(BO)
  1344. ST c09, 2 * SIZE(BO)
  1345. ST c13, 3 * SIZE(BO)
  1346. ST c02, 4 * SIZE(BO)
  1347. ST c06, 5 * SIZE(BO)
  1348. ST c10, 6 * SIZE(BO)
  1349. ST c14, 7 * SIZE(BO)
  1350. #else
  1351. ST c01, 0 * SIZE(AO)
  1352. ST c02, 1 * SIZE(AO)
  1353. ST c05, 2 * SIZE(AO)
  1354. ST c06, 3 * SIZE(AO)
  1355. ST c09, 4 * SIZE(AO)
  1356. ST c10, 5 * SIZE(AO)
  1357. ST c13, 6 * SIZE(AO)
  1358. ST c14, 7 * SIZE(AO)
  1359. #endif
  1360. #ifdef LN
  1361. lda C1, -2 * SIZE(C1)
  1362. lda C2, -2 * SIZE(C2)
  1363. lda C3, -2 * SIZE(C3)
  1364. lda C4, -2 * SIZE(C4)
  1365. #endif
  1366. ST c01, 0 * SIZE(C1)
  1367. ST c02, 1 * SIZE(C1)
  1368. ST c05, 0 * SIZE(C2)
  1369. ST c06, 1 * SIZE(C2)
  1370. ST c09, 0 * SIZE(C3)
  1371. ST c10, 1 * SIZE(C3)
  1372. ST c13, 0 * SIZE(C4)
  1373. ST c14, 1 * SIZE(C4)
  1374. #ifndef LN
  1375. lda C1, 2 * SIZE(C1)
  1376. lda C2, 2 * SIZE(C2)
  1377. lda C3, 2 * SIZE(C3)
  1378. lda C4, 2 * SIZE(C4)
  1379. #endif
  1380. fclr t1
  1381. fclr t2
  1382. fclr t3
  1383. fclr t4
  1384. #ifdef RT
  1385. sll K, 1 + BASE_SHIFT, TMP1
  1386. addq AORIG, TMP1, AORIG
  1387. #endif
  1388. #if defined(LT) || defined(RN)
  1389. subq K, KK, TMP1
  1390. sll TMP1, BASE_SHIFT + 1, TMP2
  1391. addq AO, TMP2, AO
  1392. sll TMP1, BASE_SHIFT + 2, TMP2
  1393. addq BO, TMP2, BO
  1394. #endif
  1395. #ifdef LT
  1396. addq KK, 2, KK
  1397. #endif
  1398. #ifdef LN
  1399. subq KK, 2, KK
  1400. #endif
  1401. .align 4
  1402. $L30:
  /* Leftover tile when M is odd: I = M & 1, and the tile is skipped
     (branch to $L39) when that bit is clear.  Four accumulators
     (c01, c05, c09, c13) are built from one AO element and four BO
     elements per k-step. */
  1403. and M, 1, I
  1404. ble I, $L39
  1405. #if defined(LT) || defined(RN)
  /* LT/RN: trip count is KK and BO restarts from B.  L = KK - 2 because
     $L32 retires two k-steps per pass.  The first operands are preloaded
     and AO/BO are advanced past them (1 and 4 elements). */
  1406. LD a1, 0 * SIZE(AO)
  1407. fclr c01
  1408. LD a2, 1 * SIZE(AO)
  1409. fclr c05
  1410. LD b1, 0 * SIZE(B)
  1411. lda L, -2(KK)
  1412. LD b2, 1 * SIZE(B)
  1413. lda AO, 1 * SIZE(AO)
  1414. LD b3, 2 * SIZE(B)
  1415. fclr c09
  1416. LD b4, 3 * SIZE(B)
  1417. fclr c13
  1418. lda BO, 4 * SIZE(B)
  /* KK <= 0: nothing to accumulate.  L <= 0: fewer than three k-steps,
     so go straight to the tail. */
  1419. ble KK, $L38
  1420. ble L, $L35
  1421. #else
  1422. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes before indexing it. */
  1423. sll K, BASE_SHIFT + 0, TMP1
  1424. subq AORIG, TMP1, AORIG
  1425. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT),
     BO = B + (KK << (BASE_SHIFT + 2)); trip count is TMP1 = K - KK. */
  1426. sll KK, BASE_SHIFT + 0, TMP1
  1427. addq AORIG, TMP1, AO
  1428. sll KK, BASE_SHIFT + 2, TMP2
  1429. addq B, TMP2, BO
  1430. subq K, KK, TMP1
  1431. LD a1, 0 * SIZE(AO)
  1432. fclr c01
  1433. LD a2, 1 * SIZE(AO)
  1434. fclr c05
  1435. LD b1, 0 * SIZE(BO)
  1436. lda L, -2(TMP1)
  1437. LD b2, 1 * SIZE(BO)
  1438. lda AO, 1 * SIZE(AO)
  1439. LD b3, 2 * SIZE(BO)
  1440. fclr c09
  1441. LD b4, 3 * SIZE(BO)
  1442. fclr c13
  1443. lda BO, 4 * SIZE(BO)
  1444. ble TMP1, $L38
  1445. ble L, $L35
  1446. #endif
  1447. .align 4
  1448. $L32:
  /* Main loop, two k-steps per pass (L -= 2, AO += 2 elements,
     BO += 8 elements).  Each ADD folds the product left in t1..t4 by the
     preceding MUL into its accumulator.  b5 receives the next step's
     fourth BO value because b4 is still needed by a later MUL. */
  1449. ADD c01, t1, c01
  1450. lda L, -2(L)
  1451. MUL a1, b1, t1
  1452. LD b1, 0 * SIZE(BO)
  1453. ADD c05, t2, c05
  1454. lda AO, 2 * SIZE(AO)
  1455. MUL a1, b2, t2
  1456. LD b2, 1 * SIZE(BO)
  1457. ADD c09, t3, c09
  1458. LD b5, 3 * SIZE(BO)
  1459. MUL a1, b3, t3
  1460. LD b3, 2 * SIZE(BO)
  1461. ADD c13, t4, c13
  1462. MUL a1, b4, t4
  1463. LD a1, -1 * SIZE(AO)
  1464. ADD c01, t1, c01
  1465. MUL a2, b1, t1
  1466. LD b1, 4 * SIZE(BO)
  1467. lda BO, 8 * SIZE(BO)
  1468. ADD c05, t2, c05
  1469. MUL a2, b2, t2
  1470. LD b2, -3 * SIZE(BO)
  1471. ADD c09, t3, c09
  1472. LD b4, -1 * SIZE(BO)
  1473. MUL a2, b3, t3
  1474. LD b3, -2 * SIZE(BO)
  1475. ADD c13, t4, c13
  1476. MUL a2, b5, t4
  1477. LD a2, 0 * SIZE(AO)
  1478. bgt L, $L32
  1479. .align 4
  1480. $L35:
  /* Tail: blbs tests the low bit of the trip count.  An odd count has one
     k-step left and jumps to $L37; an even count falls through and
     performs one extra k-step first. */
  1481. ADD c01, t1, c01
  1482. MUL a1, b1, t1
  1483. #if defined(LT) || defined(RN)
  1484. blbs KK, $L37
  1485. #else
  1486. blbs TMP1, $L37
  1487. #endif
  1488. .align 4
  1489. ADD c05, t2, c05
  1490. LD b1, 0 * SIZE(BO)
  1491. MUL a1, b2, t2
  1492. LD b2, 1 * SIZE(BO)
  1493. ADD c09, t3, c09
  1494. MUL a1, b3, t3
  1495. LD b3, 2 * SIZE(BO)
  1496. ADD c13, t4, c13
  1497. MUL a1, b4, t4
  1498. LD a1, 0 * SIZE(AO)
  1499. lda AO, 1 * SIZE(AO)
  1500. ADD c01, t1, c01
  1501. LD b4, 3 * SIZE(BO)
  1502. MUL a1, b1, t1
  1503. lda BO, 4 * SIZE(BO)
  1504. .align 4
  1505. $L37:
  /* Last k-step; the four trailing ADDs fold the outstanding products
     t1..t4 into the accumulators. */
  1506. ADD c05, t2, c05
  1507. MUL a1, b2, t2
  1508. ADD c09, t3, c09
  1509. MUL a1, b3, t3
  1510. ADD c13, t4, c13
  1511. lda AO, 1 * SIZE(AO)
  1512. MUL a1, b4, t4
  1513. lda BO, 4 * SIZE(BO)
  1514. ADD c01, t1, c01
  1515. ADD c05, t2, c05
  1516. ADD c09, t3, c09
  1517. ADD c13, t4, c13
  1518. $L38:
  /* Substitution and write-back for the four accumulated values
     c01, c05, c09, c13. */
  1519. #if defined(LN) || defined(RT)
  /* LN/RT: recompute AO/BO from AORIG/B at index TMP1 = KK - 1 (LN) or
     KK - 4 (RT). */
  1520. #ifdef LN
  1521. subq KK, 1, TMP1
  1522. #else
  1523. subq KK, 4, TMP1
  1524. #endif
  1525. sll TMP1, BASE_SHIFT + 0, TMP2
  1526. addq AORIG, TMP2, AO
  1527. sll TMP1, BASE_SHIFT + 2, TMP2
  1528. addq B, TMP2, BO
  1529. #else
  /* LT/RN: back AO and BO up by 1 and 4 elements. */
  1530. lda AO, -1 * SIZE(AO)
  1531. lda BO, -4 * SIZE(BO)
  1532. #endif
  /* Each accumulator becomes (value read from BO or AO) - accumulator,
     reading SUB x, y, z as z = x - y (Alpha operand order; SUB is a
     macro defined outside this excerpt). */
  1533. #if defined(LN) || defined(LT)
  1534. LD a1, 0 * SIZE(BO)
  1535. LD a2, 1 * SIZE(BO)
  1536. LD a3, 2 * SIZE(BO)
  1537. LD a4, 3 * SIZE(BO)
  1538. SUB a1, c01, c01
  1539. SUB a2, c05, c05
  1540. SUB a3, c09, c09
  1541. SUB a4, c13, c13
  1542. #else
  1543. LD a1, 0 * SIZE(AO)
  1544. LD a2, 1 * SIZE(AO)
  1545. LD a3, 2 * SIZE(AO)
  1546. LD a4, 3 * SIZE(AO)
  1547. SUB a1, c01, c01
  1548. SUB a2, c05, c05
  1549. SUB a3, c09, c09
  1550. SUB a4, c13, c13
  1551. #endif
  1552. #if defined(LN) || defined(LT)
  /* LN/LT: all four results are scaled by the single coefficient AO[0].
     NOTE(review): scaling is a MUL, so this entry is presumably stored
     already inverted - confirm against the packing code. */
  1553. LD a1, 0 * SIZE(AO)
  1554. MUL a1, c01, c01
  1555. MUL a1, c05, c05
  1556. MUL a1, c09, c09
  1557. MUL a1, c13, c13
  1558. #endif
  1559. #ifdef RN
  /* RN: sweep c01 -> c13.  Each result is scaled by the BO entry at offset
     0, 5, 10 or 15 and then eliminated from the later results using the
     entries that follow it (1..3, 6..7, 11). */
  1560. LD a1, 0 * SIZE(BO)
  1561. LD a2, 1 * SIZE(BO)
  1562. LD a3, 2 * SIZE(BO)
  1563. LD a4, 3 * SIZE(BO)
  1564. MUL a1, c01, c01
  1565. MUL a2, c01, t1
  1566. SUB c05, t1, c05
  1567. MUL a3, c01, t1
  1568. SUB c09, t1, c09
  1569. MUL a4, c01, t1
  1570. SUB c13, t1, c13
  1571. LD b1, 5 * SIZE(BO)
  1572. LD b2, 6 * SIZE(BO)
  1573. LD b3, 7 * SIZE(BO)
  1574. MUL b1, c05, c05
  1575. MUL b2, c05, t1
  1576. SUB c09, t1, c09
  1577. MUL b3, c05, t1
  1578. SUB c13, t1, c13
  1579. LD a1, 10 * SIZE(BO)
  1580. LD a2, 11 * SIZE(BO)
  1581. LD a3, 15 * SIZE(BO)
  1582. MUL a1, c09, c09
  1583. MUL a2, c09, t1
  1584. SUB c13, t1, c13
  1585. MUL a3, c13, c13
  1586. #endif
  1587. #ifdef RT
  /* RT: the mirror sweep, c13 -> c01.  Scale entries are at BO offsets
     15, 10, 5, 0; elimination entries are 14..12, 9..8 and 4. */
  1588. LD a1, 15 * SIZE(BO)
  1589. LD a2, 14 * SIZE(BO)
  1590. LD a3, 13 * SIZE(BO)
  1591. LD a4, 12 * SIZE(BO)
  1592. MUL a1, c13, c13
  1593. MUL a2, c13, t1
  1594. SUB c09, t1, c09
  1595. MUL a3, c13, t1
  1596. SUB c05, t1, c05
  1597. MUL a4, c13, t1
  1598. SUB c01, t1, c01
  1599. LD b1, 10 * SIZE(BO)
  1600. LD b2, 9 * SIZE(BO)
  1601. LD b3, 8 * SIZE(BO)
  1602. MUL b1, c09, c09
  1603. MUL b2, c09, t1
  1604. SUB c05, t1, c05
  1605. MUL b3, c09, t1
  1606. SUB c01, t1, c01
  1607. LD a1, 5 * SIZE(BO)
  1608. LD a2, 4 * SIZE(BO)
  1609. LD a3, 0 * SIZE(BO)
  1610. MUL a1, c05, c05
  1611. MUL a2, c05, t1
  1612. SUB c01, t1, c01
  1613. MUL a3, c01, c01
  1614. #endif
  /* Write the results back to the buffer they were read from (BO for
     LN/LT, AO for RN/RT) ... */
  1615. #if defined(LN) || defined(LT)
  1616. ST c01, 0 * SIZE(BO)
  1617. ST c05, 1 * SIZE(BO)
  1618. ST c09, 2 * SIZE(BO)
  1619. ST c13, 3 * SIZE(BO)
  1620. #else
  1621. ST c01, 0 * SIZE(AO)
  1622. ST c05, 1 * SIZE(AO)
  1623. ST c09, 2 * SIZE(AO)
  1624. ST c13, 3 * SIZE(AO)
  1625. #endif
  /* ... and to one element of each of C1..C4 (LN moves the pointers back
     by one element first). */
  1626. #ifdef LN
  1627. lda C1, -1 * SIZE(C1)
  1628. lda C2, -1 * SIZE(C2)
  1629. lda C3, -1 * SIZE(C3)
  1630. lda C4, -1 * SIZE(C4)
  1631. #endif
  1632. ST c01, 0 * SIZE(C1)
  1633. ST c05, 0 * SIZE(C2)
  1634. ST c09, 0 * SIZE(C3)
  1635. ST c13, 0 * SIZE(C4)
  1636. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  1637. sll K, 0 + BASE_SHIFT, TMP1
  1638. addq AORIG, TMP1, AORIG
  1639. #endif
  1640. #if defined(LT) || defined(RN)
  /* LT/RN: move AO and BO forward over the remaining K - KK steps
     (1 and 4 elements per step). */
  1641. subq K, KK, TMP1
  1642. sll TMP1, BASE_SHIFT + 0, TMP2
  1643. addq AO, TMP2, AO
  1644. sll TMP1, BASE_SHIFT + 2, TMP2
  1645. addq BO, TMP2, BO
  1646. #endif
  /* KK moves by the one element of AO handled per step in this tile. */
  1647. #ifdef LT
  1648. addq KK, 1, KK
  1649. #endif
  1650. #ifdef LN
  1651. subq KK, 1, KK
  1652. #endif
  1653. .align 4
  1654. $L39:
  /* End of one J iteration of the four-wide section: update B and KK for
     the active variant, then decrement J and branch back to $L01 while
     J > 0. */
  1655. #ifdef LN
  /* LN: B += K << (2 + BASE_SHIFT). */
  1656. sll K, 2 + BASE_SHIFT, TMP1
  1657. addq B, TMP1, B
  1658. #endif
  1659. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO ended up. */
  1660. mov BO, B
  1661. #endif
  1662. #ifdef RN
  1663. addq KK, 4, KK
  1664. #endif
  1665. #ifdef RT
  1666. subq KK, 4, KK
  1667. #endif
  1668. lda J, -1(J)
  1669. bgt J, $L01
  1670. .align 4
  1671. $L40:
  /* Two-wide section, entered only when bit 1 of N is set (J = N & 2);
     otherwise control goes to $L80.  Sets up the two output pointers
     C1 and C2 = C1 + LDC, the starting KK, and the AO/AORIG base. */
  1672. and N, 2, J
  1673. ble J, $L80
  1674. #ifdef RT
  /* RT: step B back by K << (1 + BASE_SHIFT) and C back by 2 * LDC before
     use. */
  1675. sll K, 1 + BASE_SHIFT, TMP1
  1676. subq B, TMP1, B
  1677. addq LDC, LDC, TMP1
  1678. subq C, TMP1, C
  1679. #endif
  1680. mov C, C1
  1681. addq C, LDC, C2
  1682. fclr t1
  1683. #ifndef RT
  /* Non-RT: advance C past the two pointers just taken (C = C2 + LDC). */
  1684. addq C2, LDC, C
  1685. #endif
  1686. fclr t2
  1687. #ifdef LN
  1688. addq M, OFFSET, KK
  1689. #endif
  1690. #ifdef LT
  1691. mov OFFSET, KK
  1692. #endif
  1693. #if defined(LN) || defined(RT)
  1694. mov A, AORIG
  1695. #else
  1696. mov A, AO
  1697. #endif
  /* I = M >> 2 counts the passes through $L51; with none, go to $L60.
     t1..t4 are cleared because the accumulation code adds them before
     the first MUL has written them. */
  1698. sra M, 2, I
  1699. fclr t3
  1700. fclr t4
  1701. ble I, $L60
  1702. .align 4
  1703. $L51:
  /* Accumulation for a tile that uses four AO elements and two BO elements
     per k-step.  The eight accumulators are c01..c04 (products with the
     second BO value of a step) and c05..c08 (products with the first). */
  1704. #if defined(LT) || defined(RN)
  /* LT/RN: trip count is KK, BO restarts from B.  L = KK - 2 because $L52
     retires two k-steps per pass.  Operands for the first two steps are
     preloaded; AO/BO are advanced by 4 and 2 elements. */
  1705. LD a1, 0 * SIZE(AO)
  1706. fclr c03
  1707. LD a2, 1 * SIZE(AO)
  1708. fclr c07
  1709. LD a3, 2 * SIZE(AO)
  1710. fclr c04
  1711. LD a4, 3 * SIZE(AO)
  1712. fclr c08
  1713. LD b1, 0 * SIZE(B)
  1714. fclr c01
  1715. LD b2, 1 * SIZE(B)
  1716. fclr c05
  1717. LD b3, 2 * SIZE(B)
  1718. fclr c02
  1719. LD b4, 3 * SIZE(B)
  1720. fclr c06
  1721. lda L, -2(KK)
  1722. lda BO, 2 * SIZE(B)
  1723. lda AO, 4 * SIZE(AO)
  1724. ble KK, $L58
  1725. ble L, $L55
  1726. #else
  1727. #ifdef LN
  /* LN: step AORIG back by K << (BASE_SHIFT + 2) bytes before indexing. */
  1728. sll K, BASE_SHIFT + 2, TMP1
  1729. subq AORIG, TMP1, AORIG
  1730. #endif
  /* LN/RT: AO = AORIG + (KK << (BASE_SHIFT + 2)),
     BO = B + (KK << (BASE_SHIFT + 1)); trip count is TMP1 = K - KK. */
  1731. sll KK, BASE_SHIFT + 2, TMP1
  1732. addq AORIG, TMP1, AO
  1733. sll KK, BASE_SHIFT + 1, TMP1
  1734. addq B, TMP1, BO
  1735. subq K, KK, TMP1
  1736. LD a1, 0 * SIZE(AO)
  1737. fclr c03
  1738. LD a2, 1 * SIZE(AO)
  1739. fclr c07
  1740. LD a3, 2 * SIZE(AO)
  1741. fclr c04
  1742. LD a4, 3 * SIZE(AO)
  1743. fclr c08
  1744. LD b1, 0 * SIZE(BO)
  1745. fclr c01
  1746. LD b2, 1 * SIZE(BO)
  1747. fclr c05
  1748. LD b3, 2 * SIZE(BO)
  1749. fclr c02
  1750. LD b4, 3 * SIZE(BO)
  1751. fclr c06
  1752. lda L, -2(TMP1)
  1753. lda BO, 2 * SIZE(BO)
  1754. lda AO, 4 * SIZE(AO)
  1755. ble TMP1, $L58
  1756. ble L, $L55
  1757. #endif
  1758. .align 4
  1759. $L52:
  /* Main loop, two k-steps per pass (L -= 2, AO += 8 elements,
     BO += 4 elements).  Each ADD folds the product left in t1..t4 by an
     earlier MUL into its accumulator.  a5 holds the fourth AO value of the
     second step so that a4 can be reloaded for the next pass before the
     last MUL has issued.  The unop lines are no-ops.
     NOTE(review): presumably issue-slot padding - confirm. */
  1760. ADD c05, t1, c05
  1761. unop
  1762. MUL a1, b1, t1
  1763. unop
  1764. ADD c06, t2, c06
  1765. lda L, -2(L)
  1766. MUL a2, b1, t2
  1767. unop
  1768. ADD c07, t3, c07
  1769. unop
  1770. MUL a3, b1, t3
  1771. unop
  1772. ADD c08, t4, c08
  1773. unop
  1774. MUL a4, b1, t4
  1775. LD b1, 2 * SIZE(BO)
  1776. ADD c01, t1, c01
  1777. unop
  1778. MUL a1, b2, t1
  1779. LD a1, 0 * SIZE(AO)
  1780. ADD c02, t2, c02
  1781. lda BO, 4 * SIZE(BO)
  1782. MUL a2, b2, t2
  1783. LD a2, 1 * SIZE(AO)
  1784. ADD c03, t3, c03
  1785. unop
  1786. MUL a3, b2, t3
  1787. LD a3, 2 * SIZE(AO)
  1788. ADD c04, t4, c04
  1789. unop
  1790. MUL a4, b2, t4
  1791. LD a5, 3 * SIZE(AO)
  1792. ADD c05, t1, c05
  1793. unop
  1794. MUL a1, b3, t1
  1795. LD b2, -1 * SIZE(BO)
  1796. ADD c06, t2, c06
  1797. unop
  1798. MUL a2, b3, t2
  1799. unop
  1800. ADD c07, t3, c07
  1801. unop
  1802. MUL a3, b3, t3
  1803. lda AO, 8 * SIZE(AO)
  1804. ADD c08, t4, c08
  1805. unop
  1806. MUL a5, b3, t4
  1807. LD b3, 0 * SIZE(BO)
  1808. ADD c01, t1, c01
  1809. unop
  1810. MUL a1, b4, t1
  1811. LD a1, -4 * SIZE(AO)
  1812. ADD c02, t2, c02
  1813. unop
  1814. MUL a2, b4, t2
  1815. LD a2, -3 * SIZE(AO)
  1816. ADD c03, t3, c03
  1817. LD a4, -1 * SIZE(AO)
  1818. MUL a3, b4, t3
  1819. LD a3, -2 * SIZE(AO)
  1820. ADD c04, t4, c04
  1821. MUL a5, b4, t4
  1822. LD b4, 1 * SIZE(BO)
  1823. bgt L, $L52
  1824. .align 4
  1825. $L55:
  /* Tail: an odd trip count (low bit set) has one k-step left and jumps
     to $L57; an even count falls through and performs one extra k-step
     first. */
  1826. ADD c05, t1, c05
  1827. MUL a1, b1, t1
  1828. #if defined(LT) || defined(RN)
  1829. blbs KK, $L57
  1830. #else
  1831. blbs TMP1, $L57
  1832. #endif
  1833. .align 4
  1834. ADD c06, t2, c06
  1835. MUL a2, b1, t2
  1836. ADD c07, t3, c07
  1837. MUL a3, b1, t3
  1838. ADD c08, t4, c08
  1839. unop
  1840. MUL a4, b1, t4
  1841. LD b1, 0 * SIZE(BO)
  1842. ADD c01, t1, c01
  1843. unop
  1844. MUL a1, b2, t1
  1845. LD a1, 0 * SIZE(AO)
  1846. ADD c02, t2, c02
  1847. unop
  1848. MUL a2, b2, t2
  1849. LD a2, 1 * SIZE(AO)
  1850. ADD c03, t3, c03
  1851. unop
  1852. MUL a3, b2, t3
  1853. LD a3, 2 * SIZE(AO)
  1854. ADD c04, t4, c04
  1855. MUL a4, b2, t4
  1856. LD a4, 3 * SIZE(AO)
  1857. lda AO, 4 * SIZE(AO)
  1858. ADD c05, t1, c05
  1859. LD b2, 1 * SIZE(BO)
  1860. MUL a1, b1, t1
  1861. lda BO, 2 * SIZE(BO)
  1862. .align 4
  1863. $L57:
  /* Last k-step; the four trailing ADDs fold the outstanding products
     t1..t4 into c05..c08. */
  1864. ADD c06, t2, c06
  1865. MUL a2, b1, t2
  1866. ADD c07, t3, c07
  1867. MUL a3, b1, t3
  1868. ADD c08, t4, c08
  1869. MUL a4, b1, t4
  1870. ADD c01, t1, c01
  1871. MUL a1, b2, t1
  1872. ADD c02, t2, c02
  1873. MUL a2, b2, t2
  1874. ADD c03, t3, c03
  1875. MUL a3, b2, t3
  1876. ADD c04, t4, c04
  1877. lda AO, 4 * SIZE(AO)
  1878. MUL a4, b2, t4
  1879. lda BO, 2 * SIZE(BO)
  1880. ADD c05, t1, c05
  1881. ADD c06, t2, c06
  1882. ADD c07, t3, c07
  1883. ADD c08, t4, c08
  1884. .align 4
  1885. $L58:
  /* Substitution and write-back for the eight accumulators c01..c08. */
  1886. #if defined(LN) || defined(RT)
  /* LN/RT: recompute AO/BO from AORIG/B at index TMP1 = KK - 4 (LN) or
     KK - 2 (RT). */
  1887. #ifdef LN
  1888. subq KK, 4, TMP1
  1889. #else
  1890. subq KK, 2, TMP1
  1891. #endif
  1892. sll TMP1, BASE_SHIFT + 2, TMP2
  1893. addq AORIG, TMP2, AO
  1894. sll TMP1, BASE_SHIFT + 1, TMP2
  1895. addq B, TMP2, BO
  1896. #else
  /* LT/RN: back AO and BO up by 4 and 2 elements. */
  1897. lda AO, -4 * SIZE(AO)
  1898. lda BO, -2 * SIZE(BO)
  1899. #endif
  /* Each accumulator becomes (value read from the buffer) - accumulator,
     reading SUB x, y, z as z = x - y (Alpha operand order; SUB is a
     macro defined outside this excerpt).  The two branches pair buffer
     slots with accumulators differently: BO slots 0..7 map to
     c01, c05, c02, c06, c03, c07, c04, c08, while AO slots 0..7 map to
     c01..c08 in order. */
  1900. #if defined(LN) || defined(LT)
  1901. LD a1, 0 * SIZE(BO)
  1902. LD a2, 1 * SIZE(BO)
  1903. LD a3, 2 * SIZE(BO)
  1904. LD a4, 3 * SIZE(BO)
  1905. LD b1, 4 * SIZE(BO)
  1906. LD b2, 5 * SIZE(BO)
  1907. LD b3, 6 * SIZE(BO)
  1908. LD b4, 7 * SIZE(BO)
  1909. SUB a1, c01, c01
  1910. SUB a2, c05, c05
  1911. SUB a3, c02, c02
  1912. SUB a4, c06, c06
  1913. SUB b1, c03, c03
  1914. SUB b2, c07, c07
  1915. SUB b3, c04, c04
  1916. SUB b4, c08, c08
  1917. #else
  1918. LD a1, 0 * SIZE(AO)
  1919. LD a2, 1 * SIZE(AO)
  1920. LD a3, 2 * SIZE(AO)
  1921. LD a4, 3 * SIZE(AO)
  1922. LD b1, 4 * SIZE(AO)
  1923. LD b2, 5 * SIZE(AO)
  1924. LD b3, 6 * SIZE(AO)
  1925. LD b4, 7 * SIZE(AO)
  1926. SUB a1, c01, c01
  1927. SUB a2, c02, c02
  1928. SUB a3, c03, c03
  1929. SUB a4, c04, c04
  1930. SUB b1, c05, c05
  1931. SUB b2, c06, c06
  1932. SUB b3, c07, c07
  1933. SUB b4, c08, c08
  1934. #endif
  1935. #ifdef LN
  /* LN: sweep from the last pair (c04, c08) to the first (c01, c05).
     Scale entries are at AO offsets 15, 10, 5, 0; elimination entries are
     14..12, 9..8 and 4.
     NOTE(review): scaling is a MUL, so the scale entries are presumably
     stored already inverted - confirm against the packing code. */
  1936. LD a1, 15 * SIZE(AO)
  1937. LD a2, 14 * SIZE(AO)
  1938. LD a3, 13 * SIZE(AO)
  1939. LD a4, 12 * SIZE(AO)
  1940. MUL a1, c04, c04
  1941. MUL a1, c08, c08
  1942. MUL a2, c04, t1
  1943. MUL a2, c08, t2
  1944. SUB c03, t1, c03
  1945. SUB c07, t2, c07
  1946. MUL a3, c04, t1
  1947. MUL a3, c08, t2
  1948. SUB c02, t1, c02
  1949. SUB c06, t2, c06
  1950. MUL a4, c04, t1
  1951. MUL a4, c08, t2
  1952. SUB c01, t1, c01
  1953. SUB c05, t2, c05
  1954. LD b1, 10 * SIZE(AO)
  1955. LD b2, 9 * SIZE(AO)
  1956. LD b3, 8 * SIZE(AO)
  1957. MUL b1, c03, c03
  1958. MUL b1, c07, c07
  1959. MUL b2, c03, t1
  1960. MUL b2, c07, t2
  1961. SUB c02, t1, c02
  1962. SUB c06, t2, c06
  1963. MUL b3, c03, t1
  1964. MUL b3, c07, t2
  1965. SUB c01, t1, c01
  1966. SUB c05, t2, c05
  1967. LD a1, 5 * SIZE(AO)
  1968. LD a2, 4 * SIZE(AO)
  1969. LD a3, 0 * SIZE(AO)
  1970. MUL a1, c02, c02
  1971. MUL a1, c06, c06
  1972. MUL a2, c02, t1
  1973. MUL a2, c06, t2
  1974. SUB c01, t1, c01
  1975. SUB c05, t2, c05
  1976. MUL a3, c01, c01
  1977. MUL a3, c05, c05
  1978. #endif
  1979. #ifdef LT
  /* LT: the mirror sweep, first pair (c01, c05) to last (c04, c08).
     Scale entries are at AO offsets 0, 5, 10, 15; elimination entries are
     1..3, 6..7 and 11. */
  1980. LD a1, 0 * SIZE(AO)
  1981. LD a2, 1 * SIZE(AO)
  1982. LD a3, 2 * SIZE(AO)
  1983. LD a4, 3 * SIZE(AO)
  1984. MUL a1, c01, c01
  1985. MUL a1, c05, c05
  1986. MUL a2, c01, t1
  1987. MUL a2, c05, t2
  1988. SUB c02, t1, c02
  1989. SUB c06, t2, c06
  1990. MUL a3, c01, t1
  1991. MUL a3, c05, t2
  1992. SUB c03, t1, c03
  1993. SUB c07, t2, c07
  1994. MUL a4, c01, t1
  1995. MUL a4, c05, t2
  1996. SUB c04, t1, c04
  1997. SUB c08, t2, c08
  1998. LD b1, 5 * SIZE(AO)
  1999. LD b2, 6 * SIZE(AO)
  2000. LD b3, 7 * SIZE(AO)
  2001. MUL b1, c02, c02
  2002. MUL b1, c06, c06
  2003. MUL b2, c02, t1
  2004. MUL b2, c06, t2
  2005. SUB c03, t1, c03
  2006. SUB c07, t2, c07
  2007. MUL b3, c02, t1
  2008. MUL b3, c06, t2
  2009. SUB c04, t1, c04
  2010. SUB c08, t2, c08
  2011. LD a1, 10 * SIZE(AO)
  2012. LD a2, 11 * SIZE(AO)
  2013. LD a3, 15 * SIZE(AO)
  2014. MUL a1, c03, c03
  2015. MUL a1, c07, c07
  2016. MUL a2, c03, t1
  2017. MUL a2, c07, t2
  2018. SUB c04, t1, c04
  2019. SUB c08, t2, c08
  2020. MUL a3, c04, c04
  2021. MUL a3, c08, c08
  2022. #endif
  2023. #ifdef RN
  /* RN: scale c01..c04 by BO[0], eliminate them from c05..c08 with BO[1],
     then scale c05..c08 by BO[3]. */
  2024. LD a1, 0 * SIZE(BO)
  2025. LD a2, 1 * SIZE(BO)
  2026. LD a3, 3 * SIZE(BO)
  2027. MUL a1, c01, c01
  2028. MUL a1, c02, c02
  2029. MUL a1, c03, c03
  2030. MUL a1, c04, c04
  2031. MUL a2, c01, t1
  2032. MUL a2, c02, t2
  2033. MUL a2, c03, t3
  2034. MUL a2, c04, t4
  2035. SUB c05, t1, c05
  2036. SUB c06, t2, c06
  2037. SUB c07, t3, c07
  2038. SUB c08, t4, c08
  2039. MUL a3, c05, c05
  2040. MUL a3, c06, c06
  2041. MUL a3, c07, c07
  2042. MUL a3, c08, c08
  2043. #endif
  2044. #ifdef RT
  /* RT: scale c05..c08 by BO[3], eliminate them from c01..c04 with BO[2],
     then scale c01..c04 by BO[0]. */
  2045. LD a1, 3 * SIZE(BO)
  2046. LD a2, 2 * SIZE(BO)
  2047. LD a3, 0 * SIZE(BO)
  2048. MUL a1, c05, c05
  2049. MUL a1, c06, c06
  2050. MUL a1, c07, c07
  2051. MUL a1, c08, c08
  2052. MUL a2, c05, t1
  2053. MUL a2, c06, t2
  2054. MUL a2, c07, t3
  2055. MUL a2, c08, t4
  2056. SUB c01, t1, c01
  2057. SUB c02, t2, c02
  2058. SUB c03, t3, c03
  2059. SUB c04, t4, c04
  2060. MUL a3, c01, c01
  2061. MUL a3, c02, c02
  2062. MUL a3, c03, c03
  2063. MUL a3, c04, c04
  2064. #endif
  /* Write the results back to the buffer they were read from, using the
     same slot order as the loads above. */
  2065. #if defined(LN) || defined(LT)
  2066. ST c01, 0 * SIZE(BO)
  2067. ST c05, 1 * SIZE(BO)
  2068. ST c02, 2 * SIZE(BO)
  2069. ST c06, 3 * SIZE(BO)
  2070. ST c03, 4 * SIZE(BO)
  2071. ST c07, 5 * SIZE(BO)
  2072. ST c04, 6 * SIZE(BO)
  2073. ST c08, 7 * SIZE(BO)
  2074. #else
  2075. ST c01, 0 * SIZE(AO)
  2076. ST c02, 1 * SIZE(AO)
  2077. ST c03, 2 * SIZE(AO)
  2078. ST c04, 3 * SIZE(AO)
  2079. ST c05, 4 * SIZE(AO)
  2080. ST c06, 5 * SIZE(AO)
  2081. ST c07, 6 * SIZE(AO)
  2082. ST c08, 7 * SIZE(AO)
  2083. #endif
  /* Store c01..c04 through C1 and c05..c08 through C2.  LN moves the
     pointers back by four elements before the stores; every other variant
     advances them by four afterwards. */
  2084. #ifdef LN
  2085. lda C1, -4 * SIZE(C1)
  2086. lda C2, -4 * SIZE(C2)
  2087. #endif
  2088. ST c01, 0 * SIZE(C1)
  2089. ST c02, 1 * SIZE(C1)
  2090. ST c03, 2 * SIZE(C1)
  2091. ST c04, 3 * SIZE(C1)
  2092. ST c05, 0 * SIZE(C2)
  2093. ST c06, 1 * SIZE(C2)
  2094. ST c07, 2 * SIZE(C2)
  2095. ST c08, 3 * SIZE(C2)
  2096. #ifndef LN
  2097. lda C1, 4 * SIZE(C1)
  2098. lda C2, 4 * SIZE(C2)
  2099. #endif
  /* t1..t4 must be zero again because the accumulation code adds them
     before the first MUL has written them. */
  2100. fclr t1
  2101. fclr t2
  2102. fclr t3
  2103. fclr t4
  2104. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  2105. sll K, 2 + BASE_SHIFT, TMP1
  2106. addq AORIG, TMP1, AORIG
  2107. #endif
  2108. #if defined(LT) || defined(RN)
  /* LT/RN: move AO and BO forward over the remaining K - KK steps
     (4 and 2 elements per step). */
  2109. subq K, KK, TMP1
  2110. sll TMP1, BASE_SHIFT + 2, TMP2
  2111. addq AO, TMP2, AO
  2112. sll TMP1, BASE_SHIFT + 1, TMP2
  2113. addq BO, TMP2, BO
  2114. #endif
  2115. #ifdef LT
  2116. addq KK, 4, KK
  2117. #endif
  2118. #ifdef LN
  2119. subq KK, 4, KK
  2120. #endif
  /* Next pass through $L51 while I > 0. */
  2121. lda I, -1(I)
  2122. bgt I, $L51
  2123. .align 4
  2124. $L60:
  /* Entered when bit 1 of M is set (I = M & 2); otherwise control goes to
     $L70.  Accumulates four values (c01, c02, c05, c06) from two AO and
     two BO elements per k-step. */
  2125. and M, 2, I
  2126. ble I, $L70
  2127. #if defined(LT) || defined(RN)
  /* LT/RN: trip count is KK, BO restarts from B.  L = KK - 2 because $L62
     retires two k-steps per pass.  Operands for the first two steps are
     preloaded; AO and BO are each advanced by 2 elements. */
  2128. LD a1, 0 * SIZE(AO)
  2129. fclr c01
  2130. LD a2, 1 * SIZE(AO)
  2131. fclr c05
  2132. LD a3, 2 * SIZE(AO)
  2133. fclr c02
  2134. LD a4, 3 * SIZE(AO)
  2135. fclr c06
  2136. LD b1, 0 * SIZE(B)
  2137. lda L, -2(KK)
  2138. LD b2, 1 * SIZE(B)
  2139. lda AO, 2 * SIZE(AO)
  2140. LD b3, 2 * SIZE(B)
  2141. LD b4, 3 * SIZE(B)
  2142. lda BO, 2 * SIZE(B)
  2143. ble KK, $L68
  2144. ble L, $L65
  2145. #else
  2146. #ifdef LN
  /* LN: step AORIG back by K << (BASE_SHIFT + 1) bytes before indexing. */
  2147. sll K, BASE_SHIFT + 1, TMP1
  2148. subq AORIG, TMP1, AORIG
  2149. #endif
  /* LN/RT: AO = AORIG + (KK << (BASE_SHIFT + 1)),
     BO = B + (KK << (BASE_SHIFT + 1)); trip count is TMP1 = K - KK. */
  2150. sll KK, BASE_SHIFT + 1, TMP1
  2151. addq AORIG, TMP1, AO
  2152. sll KK, BASE_SHIFT + 1, TMP1
  2153. addq B, TMP1, BO
  2154. subq K, KK, TMP1
  2155. LD a1, 0 * SIZE(AO)
  2156. fclr c01
  2157. LD a2, 1 * SIZE(AO)
  2158. fclr c05
  2159. LD a3, 2 * SIZE(AO)
  2160. fclr c02
  2161. LD a4, 3 * SIZE(AO)
  2162. fclr c06
  2163. LD b1, 0 * SIZE(BO)
  2164. lda L, -2(TMP1)
  2165. LD b2, 1 * SIZE(BO)
  2166. lda AO, 2 * SIZE(AO)
  2167. LD b3, 2 * SIZE(BO)
  2168. LD b4, 3 * SIZE(BO)
  2169. lda BO, 2 * SIZE(BO)
  2170. ble TMP1, $L68
  2171. ble L, $L65
  2172. #endif
  2173. .align 4
  2174. $L62:
  /* Main loop, two k-steps per pass (L -= 2, AO += 4 elements,
     BO += 4 elements).  The first step multiplies a1/a2 by b1/b2, the
     second a3/a4 by b3/b4; each ADD folds in the product left in t1..t4
     by an earlier MUL. */
  2175. ADD c01, t1, c01
  2176. unop
  2177. MUL a1, b1, t1
  2178. unop
  2179. ADD c02, t2, c02
  2180. lda AO, 4 * SIZE(AO)
  2181. MUL a2, b1, t2
  2182. LD b1, 2 * SIZE(BO)
  2183. ADD c05, t3, c05
  2184. lda L, -2(L)
  2185. MUL a1, b2, t3
  2186. LD a1, -2 * SIZE(AO)
  2187. ADD c06, t4, c06
  2188. unop
  2189. MUL a2, b2, t4
  2190. LD a2, -1 * SIZE(AO)
  2191. ADD c01, t1, c01
  2192. LD b2, 3 * SIZE(BO)
  2193. MUL a3, b3, t1
  2194. lda BO, 4 * SIZE(BO)
  2195. ADD c02, t2, c02
  2196. unop
  2197. MUL a4, b3, t2
  2198. LD b3, 0 * SIZE(BO)
  2199. ADD c05, t3, c05
  2200. unop
  2201. MUL a3, b4, t3
  2202. LD a3, 0 * SIZE(AO)
  2203. ADD c06, t4, c06
  2204. MUL a4, b4, t4
  2205. LD b4, 1 * SIZE(BO)
  2206. unop
  2207. LD a4, 1 * SIZE(AO)
  2208. unop
  2209. unop
  2210. bgt L, $L62
  2211. .align 4
  2212. $L65:
  /* Tail: an odd trip count (low bit set) has one k-step left and jumps
     to $L67; an even count falls through and performs one extra k-step
     first. */
  2213. ADD c01, t1, c01
  2214. MUL a1, b1, t1
  2215. #if defined(LT) || defined(RN)
  2216. blbs KK, $L67
  2217. #else
  2218. blbs TMP1, $L67
  2219. #endif
  2220. .align 4
  2221. ADD c02, t2, c02
  2222. unop
  2223. MUL a2, b1, t2
  2224. LD b1, 0 * SIZE(BO)
  2225. ADD c05, t3, c05
  2226. lda BO, 2 * SIZE(BO)
  2227. MUL a1, b2, t3
  2228. LD a1, 0 * SIZE(AO)
  2229. ADD c06, t4, c06
  2230. unop
  2231. MUL a2, b2, t4
  2232. LD a2, 1 * SIZE(AO)
  2233. ADD c01, t1, c01
  2234. LD b2, -1 * SIZE(BO)
  2235. MUL a1, b1, t1
  2236. lda AO, 2 * SIZE(AO)
  2237. .align 4
  2238. $L67:
  /* Last k-step; the four trailing ADDs fold the outstanding products
     t1..t4 into the accumulators. */
  2239. ADD c02, t2, c02
  2240. MUL a2, b1, t2
  2241. ADD c05, t3, c05
  2242. MUL a1, b2, t3
  2243. ADD c06, t4, c06
  2244. lda AO, 2 * SIZE(AO)
  2245. MUL a2, b2, t4
  2246. lda BO, 2 * SIZE(BO)
  2247. ADD c01, t1, c01
  2248. ADD c02, t2, c02
  2249. ADD c05, t3, c05
  2250. ADD c06, t4, c06
  2251. .align 4
  2252. $L68:
  /* Substitution and write-back for the four accumulators
     c01, c02, c05, c06. */
  2253. #if defined(LN) || defined(RT)
  /* LN/RT: recompute AO/BO from AORIG/B at index TMP1 = KK - 2.  Both arms
     of the inner #ifdef are identical in this tile. */
  2254. #ifdef LN
  2255. subq KK, 2, TMP1
  2256. #else
  2257. subq KK, 2, TMP1
  2258. #endif
  2259. sll TMP1, BASE_SHIFT + 1, TMP2
  2260. addq AORIG, TMP2, AO
  2261. sll TMP1, BASE_SHIFT + 1, TMP2
  2262. addq B, TMP2, BO
  2263. #else
  /* LT/RN: back AO and BO up by 2 elements each. */
  2264. lda AO, -2 * SIZE(AO)
  2265. lda BO, -2 * SIZE(BO)
  2266. #endif
  /* Each accumulator becomes (value read from the buffer) - accumulator,
     reading SUB x, y, z as z = x - y (Alpha operand order; SUB is a
     macro defined outside this excerpt).  BO slots 0..3 map to
     c01, c05, c02, c06; AO slots 0..3 map to c01, c02, c05, c06. */
  2267. #if defined(LN) || defined(LT)
  2268. LD a1, 0 * SIZE(BO)
  2269. LD a2, 1 * SIZE(BO)
  2270. LD a3, 2 * SIZE(BO)
  2271. LD a4, 3 * SIZE(BO)
  2272. SUB a1, c01, c01
  2273. SUB a2, c05, c05
  2274. SUB a3, c02, c02
  2275. SUB a4, c06, c06
  2276. #else
  2277. LD a1, 0 * SIZE(AO)
  2278. LD a2, 1 * SIZE(AO)
  2279. LD a3, 2 * SIZE(AO)
  2280. LD a4, 3 * SIZE(AO)
  2281. SUB a1, c01, c01
  2282. SUB a2, c02, c02
  2283. SUB a3, c05, c05
  2284. SUB a4, c06, c06
  2285. #endif
  2286. #ifdef LN
  /* LN: scale (c02, c06) by AO[3], eliminate from (c01, c05) with AO[2],
     then scale (c01, c05) by AO[0].
     NOTE(review): scaling is a MUL, so the scale entries are presumably
     stored already inverted - confirm against the packing code. */
  2287. LD a1, 3 * SIZE(AO)
  2288. LD a2, 2 * SIZE(AO)
  2289. LD a3, 0 * SIZE(AO)
  2290. MUL a1, c02, c02
  2291. MUL a1, c06, c06
  2292. MUL a2, c02, t1
  2293. MUL a2, c06, t2
  2294. SUB c01, t1, c01
  2295. SUB c05, t2, c05
  2296. MUL a3, c01, c01
  2297. MUL a3, c05, c05
  2298. #endif
  2299. #ifdef LT
  /* LT: scale (c01, c05) by AO[0], eliminate from (c02, c06) with AO[1],
     then scale (c02, c06) by AO[3]. */
  2300. LD a1, 0 * SIZE(AO)
  2301. LD a2, 1 * SIZE(AO)
  2302. LD a3, 3 * SIZE(AO)
  2303. MUL a1, c01, c01
  2304. MUL a1, c05, c05
  2305. MUL a2, c01, t1
  2306. MUL a2, c05, t2
  2307. SUB c02, t1, c02
  2308. SUB c06, t2, c06
  2309. MUL a3, c02, c02
  2310. MUL a3, c06, c06
  2311. #endif
  2312. #ifdef RN
  /* RN: scale (c01, c02) by BO[0], eliminate from (c05, c06) with BO[1],
     then scale (c05, c06) by BO[3]. */
  2313. LD a1, 0 * SIZE(BO)
  2314. LD a2, 1 * SIZE(BO)
  2315. LD a3, 3 * SIZE(BO)
  2316. MUL a1, c01, c01
  2317. MUL a1, c02, c02
  2318. MUL a2, c01, t1
  2319. MUL a2, c02, t2
  2320. SUB c05, t1, c05
  2321. SUB c06, t2, c06
  2322. MUL a3, c05, c05
  2323. MUL a3, c06, c06
  2324. #endif
  2325. #ifdef RT
  /* RT: scale (c05, c06) by BO[3], eliminate from (c01, c02) with BO[2],
     then scale (c01, c02) by BO[0]. */
  2326. LD a1, 3 * SIZE(BO)
  2327. LD a2, 2 * SIZE(BO)
  2328. LD a3, 0 * SIZE(BO)
  2329. MUL a1, c05, c05
  2330. MUL a1, c06, c06
  2331. MUL a2, c05, t1
  2332. MUL a2, c06, t2
  2333. SUB c01, t1, c01
  2334. SUB c02, t2, c02
  2335. MUL a3, c01, c01
  2336. MUL a3, c02, c02
  2337. #endif
  /* Write the results back to the buffer they were read from, using the
     same slot order as the loads above. */
  2338. #if defined(LN) || defined(LT)
  2339. ST c01, 0 * SIZE(BO)
  2340. ST c05, 1 * SIZE(BO)
  2341. ST c02, 2 * SIZE(BO)
  2342. ST c06, 3 * SIZE(BO)
  2343. #else
  2344. ST c01, 0 * SIZE(AO)
  2345. ST c02, 1 * SIZE(AO)
  2346. ST c05, 2 * SIZE(AO)
  2347. ST c06, 3 * SIZE(AO)
  2348. #endif
  /* Store c01/c02 through C1 and c05/c06 through C2.  LN moves the
     pointers back by two elements first; the other variants advance them
     by two afterwards. */
  2349. #ifdef LN
  2350. lda C1, -2 * SIZE(C1)
  2351. lda C2, -2 * SIZE(C2)
  2352. #endif
  2353. ST c01, 0 * SIZE(C1)
  2354. ST c02, 1 * SIZE(C1)
  2355. ST c05, 0 * SIZE(C2)
  2356. ST c06, 1 * SIZE(C2)
  2357. #ifndef LN
  2358. lda C1, 2 * SIZE(C1)
  2359. lda C2, 2 * SIZE(C2)
  2360. #endif
  /* t1..t4 must be zero again because the accumulation code adds them
     before the first MUL has written them. */
  2361. fclr t1
  2362. fclr t2
  2363. fclr t3
  2364. fclr t4
  2365. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2366. sll K, 1 + BASE_SHIFT, TMP1
  2367. addq AORIG, TMP1, AORIG
  2368. #endif
  2369. #if defined(LT) || defined(RN)
  /* LT/RN: move AO and BO forward over the remaining K - KK steps
     (2 elements per step each). */
  2370. subq K, KK, TMP1
  2371. sll TMP1, BASE_SHIFT + 1, TMP2
  2372. addq AO, TMP2, AO
  2373. sll TMP1, BASE_SHIFT + 1, TMP2
  2374. addq BO, TMP2, BO
  2375. #endif
  2376. #ifdef LT
  2377. addq KK, 2, KK
  2378. #endif
  2379. #ifdef LN
  2380. subq KK, 2, KK
  2381. #endif
  2382. .align 4
  2383. $L70:
  /* Entered when M is odd (I = M & 1); otherwise control goes to $L79.
     Uses one AO element and two BO elements per k-step.  Four accumulators
     are kept during the loop (c01/c05 for even steps, c02/c06 for odd
     steps) and merged into c01/c05 at $L77. */
  2384. and M, 1, I
  2385. ble I, $L79
  2386. #if defined(LT) || defined(RN)
  /* LT/RN: trip count is KK, BO restarts from B.  L = KK - 2 because $L72
     retires two k-steps per pass.  Operands for the first two steps are
     preloaded; AO/BO are advanced by 1 and 2 elements. */
  2387. LD a1, 0 * SIZE(AO)
  2388. fclr c01
  2389. LD a2, 1 * SIZE(AO)
  2390. fclr c05
  2391. LD b1, 0 * SIZE(B)
  2392. fclr c02
  2393. LD b2, 1 * SIZE(B)
  2394. fclr c06
  2395. lda L, -2(KK)
  2396. LD b3, 2 * SIZE(B)
  2397. lda AO, 1 * SIZE(AO)
  2398. LD b4, 3 * SIZE(B)
  2399. lda BO, 2 * SIZE(B)
  2400. ble KK, $L78
  2401. ble L, $L75
  2402. #else
  2403. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes before indexing it. */
  2404. sll K, BASE_SHIFT + 0, TMP1
  2405. subq AORIG, TMP1, AORIG
  2406. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT),
     BO = B + (KK << (BASE_SHIFT + 1)); trip count is TMP1 = K - KK. */
  2407. sll KK, BASE_SHIFT + 0, TMP1
  2408. addq AORIG, TMP1, AO
  2409. sll KK, BASE_SHIFT + 1, TMP1
  2410. addq B, TMP1, BO
  2411. subq K, KK, TMP1
  2412. LD a1, 0 * SIZE(AO)
  2413. fclr c01
  2414. LD a2, 1 * SIZE(AO)
  2415. fclr c05
  2416. LD b1, 0 * SIZE(BO)
  2417. fclr c02
  2418. LD b2, 1 * SIZE(BO)
  2419. fclr c06
  2420. lda L, -2(TMP1)
  2421. LD b3, 2 * SIZE(BO)
  2422. lda AO, 1 * SIZE(AO)
  2423. LD b4, 3 * SIZE(BO)
  2424. lda BO, 2 * SIZE(BO)
  2425. ble TMP1, $L78
  2426. ble L, $L75
  2427. #endif
  2428. .align 4
  2429. $L72:
  /* Main loop, two k-steps per pass (L -= 2, AO += 2 elements,
     BO += 4 elements).  The first step (a1 with b1/b2) feeds c01/c05 via
     t1/t2; the second (a2 with b3/b4) feeds c02/c06 via t3/t4. */
  2430. ADD c01, t1, c01
  2431. lda L, -2(L)
  2432. MUL a1, b1, t1
  2433. LD b1, 2 * SIZE(BO)
  2434. ADD c05, t2, c05
  2435. MUL a1, b2, t2
  2436. LD a1, 1 * SIZE(AO)
  2437. LD b2, 3 * SIZE(BO)
  2438. ADD c02, t3, c02
  2439. lda AO, 2 * SIZE(AO)
  2440. MUL a2, b3, t3
  2441. LD b3, 4 * SIZE(BO)
  2442. ADD c06, t4, c06
  2443. MUL a2, b4, t4
  2444. LD a2, 0 * SIZE(AO)
  2445. LD b4, 5 * SIZE(BO)
  2446. lda BO, 4 * SIZE(BO)
  2447. unop
  2448. unop
  2449. bgt L, $L72
  2450. .align 4
  2451. $L75:
  /* Tail: an odd trip count (low bit set) has one k-step left and jumps
     to $L77; an even count falls through and performs one extra k-step
     first. */
  2452. ADD c01, t1, c01
  2453. MUL a1, b1, t1
  2454. #if defined(LT) || defined(RN)
  2455. blbs KK, $L77
  2456. #else
  2457. blbs TMP1, $L77
  2458. #endif
  2459. .align 4
  2460. ADD c05, t2, c05
  2461. MUL a1, b2, t2
  2462. LD a1, 0 * SIZE(AO)
  2463. LD b1, 0 * SIZE(BO)
  2464. ADD c01, t1, c01
  2465. LD b2, 1 * SIZE(BO)
  2466. lda AO, 1 * SIZE(AO)
  2467. MUL a1, b1, t1
  2468. lda BO, 2 * SIZE(BO)
  2469. .align 4
  2470. $L77:
  /* Last k-step.  The pending t3/t4 are folded into c02/c06, which are
     then merged into c01/c05, and the final t1/t2 are added last. */
  2471. ADD c05, t2, c05
  2472. MUL a1, b2, t2
  2473. ADD c02, t3, c02
  2474. ADD c06, t4, c06
  2475. ADD c01, c02, c01
  2476. lda AO, 1 * SIZE(AO)
  2477. ADD c05, c06, c05
  2478. lda BO, 2 * SIZE(BO)
  2479. ADD c01, t1, c01
  2480. ADD c05, t2, c05
  2481. .align 4
  2482. $L78:
  /* Substitution and write-back for the two accumulators c01 and c05. */
  2483. #if defined(LN) || defined(RT)
  /* LN/RT: recompute AO/BO from AORIG/B at index TMP1 = KK - 1 (LN) or
     KK - 2 (RT). */
  2484. #ifdef LN
  2485. subq KK, 1, TMP1
  2486. #else
  2487. subq KK, 2, TMP1
  2488. #endif
  2489. sll TMP1, BASE_SHIFT + 0, TMP2
  2490. addq AORIG, TMP2, AO
  2491. sll TMP1, BASE_SHIFT + 1, TMP2
  2492. addq B, TMP2, BO
  2493. #else
  /* LT/RN: back AO and BO up by 1 and 2 elements. */
  2494. lda AO, -1 * SIZE(AO)
  2495. lda BO, -2 * SIZE(BO)
  2496. #endif
  /* Each accumulator becomes (value read from BO or AO) - accumulator,
     reading SUB x, y, z as z = x - y (Alpha operand order; SUB is a
     macro defined outside this excerpt). */
  2497. #if defined(LN) || defined(LT)
  2498. LD a1, 0 * SIZE(BO)
  2499. LD a2, 1 * SIZE(BO)
  2500. SUB a1, c01, c01
  2501. SUB a2, c05, c05
  2502. #else
  2503. LD a1, 0 * SIZE(AO)
  2504. LD a2, 1 * SIZE(AO)
  2505. SUB a1, c01, c01
  2506. SUB a2, c05, c05
  2507. #endif
  2508. #if defined(LN) || defined(LT)
  /* LN/LT: both results are scaled by the single coefficient AO[0].
     NOTE(review): scaling is a MUL, so this entry is presumably stored
     already inverted - confirm against the packing code. */
  2509. LD a1, 0 * SIZE(AO)
  2510. MUL a1, c01, c01
  2511. MUL a1, c05, c05
  2512. #endif
  2513. #ifdef RN
  /* RN: scale c01 by BO[0], eliminate it from c05 with BO[1], then scale
     c05 by BO[3]. */
  2514. LD a1, 0 * SIZE(BO)
  2515. LD a2, 1 * SIZE(BO)
  2516. LD a3, 3 * SIZE(BO)
  2517. MUL a1, c01, c01
  2518. MUL a2, c01, t1
  2519. SUB c05, t1, c05
  2520. MUL a3, c05, c05
  2521. #endif
  2522. #ifdef RT
  /* RT: scale c05 by BO[3], eliminate it from c01 with BO[2], then scale
     c01 by BO[0]. */
  2523. LD a1, 3 * SIZE(BO)
  2524. LD a2, 2 * SIZE(BO)
  2525. LD a3, 0 * SIZE(BO)
  2526. MUL a1, c05, c05
  2527. MUL a2, c05, t1
  2528. SUB c01, t1, c01
  2529. MUL a3, c01, c01
  2530. #endif
  /* Write the results back to the buffer they were read from ... */
  2531. #if defined(LN) || defined(LT)
  2532. ST c01, 0 * SIZE(BO)
  2533. ST c05, 1 * SIZE(BO)
  2534. #else
  2535. ST c01, 0 * SIZE(AO)
  2536. ST c05, 1 * SIZE(AO)
  2537. #endif
  /* ... and to one element each of C1 and C2 (LN moves the pointers back
     by one element first). */
  2538. #ifdef LN
  2539. lda C1, -1 * SIZE(C1)
  2540. lda C2, -1 * SIZE(C2)
  2541. #endif
  2542. ST c01, 0 * SIZE(C1)
  2543. ST c05, 0 * SIZE(C2)
  2544. fclr t1
  2545. fclr t2
  2546. fclr t3
  2547. fclr t4
  2548. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  2549. sll K, 0 + BASE_SHIFT, TMP1
  2550. addq AORIG, TMP1, AORIG
  2551. #endif
  2552. #if defined(LT) || defined(RN)
  /* LT/RN: move AO and BO forward over the remaining K - KK steps
     (1 and 2 elements per step). */
  2553. subq K, KK, TMP1
  2554. sll TMP1, BASE_SHIFT + 0, TMP2
  2555. addq AO, TMP2, AO
  2556. sll TMP1, BASE_SHIFT + 1, TMP2
  2557. addq BO, TMP2, BO
  2558. #endif
  2559. #ifdef LT
  2560. addq KK, 1, KK
  2561. #endif
  2562. #ifdef LN
  2563. subq KK, 1, KK
  2564. #endif
  2565. .align 4
  2566. $L79:
  /* End of the two-wide section: update B and KK for the active variant.
     There is no loop back here; control falls through to $L80. */
  2567. #ifdef LN
  /* LN: B += K << (1 + BASE_SHIFT). */
  2568. sll K, 1 + BASE_SHIFT, TMP1
  2569. addq B, TMP1, B
  2570. #endif
  2571. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO ended up. */
  2572. mov BO, B
  2573. #endif
  2574. #ifdef RN
  2575. addq KK, 2, KK
  2576. #endif
  2577. #ifdef RT
  2578. subq KK, 2, KK
  2579. #endif
  2580. .align 4
  2581. $L80:
  /* One-wide section, entered only when N is odd (J = N & 1); otherwise
     control goes to $L999.  Sets up the single output pointer C1, the
     starting KK, and the AO/AORIG base. */
  2582. and N, 1, J
  2583. ble J, $L999
  2584. #ifdef RT
  /* RT: step B back by K << BASE_SHIFT and C back by LDC before use. */
  2585. sll K, BASE_SHIFT, TMP1
  2586. subq B, TMP1, B
  2587. subq C, LDC, C
  2588. #endif
  2589. mov C, C1
  2590. #ifndef RT
  /* Non-RT: advance C past the pointer just taken. */
  2591. addq C, LDC, C
  2592. #endif
  2593. #ifdef LN
  2594. addq M, OFFSET, KK
  2595. #endif
  2596. #ifdef LT
  2597. mov OFFSET, KK
  2598. #endif
  2599. #if defined(LN) || defined(RT)
  2600. mov A, AORIG
  2601. #else
  2602. mov A, AO
  2603. #endif
  /* I = M >> 2 counts the passes through $L91; with none, go to $L100. */
  2604. sra M, 2, I
  2605. ble I, $L100
  2606. .align 4
  2607. $L91:
  /* Prelude of one 4-wide tile: position AO/BO, preload a1..a4 and b1..b4, */
  /* clear the product temporaries t1..t4 and the accumulators c01..c04, and */
  /* set L to the number of 4x-unrolled iterations. The fclr instructions */
  /* are interleaved with the loads. LD is a macro defined outside this */
  /* view (presumably the precision-specific floating-point load). */
  2608. #if defined(LT) || defined(RN)
  /* LT/RN: AO is already in place; B is read from its start and the */
  /* accumulation length is KK */
  2609. LD a1, 0 * SIZE(AO)
  2610. fclr t1
  2611. LD a2, 1 * SIZE(AO)
  2612. fclr t2
  2613. LD a3, 2 * SIZE(AO)
  2614. fclr t3
  2615. LD a4, 3 * SIZE(AO)
  2616. fclr t4
  2617. LD b1, 0 * SIZE(B)
  2618. fclr c01
  2619. LD b2, 1 * SIZE(B)
  2620. fclr c02
  2621. LD b3, 2 * SIZE(B)
  2622. fclr c03
  2623. LD b4, 3 * SIZE(B)
  2624. fclr c04
  /* L = KK >> 2; BO = B; no full unrolled iteration -> remainder at $L95 */
  2625. sra KK, 2, L
  2626. mov B, BO
  2627. ble L, $L95
  2628. #else
  2629. #ifdef LN
  /* LN: move AORIG back by K << (BASE_SHIFT + 2) for this tile */
  2630. sll K, BASE_SHIFT + 2, TMP1
  2631. subq AORIG, TMP1, AORIG
  2632. #endif
  /* AO = AORIG + (KK << (BASE_SHIFT + 2)); BO = B + (KK << BASE_SHIFT); */
  /* the accumulation length is TMP1 = K - KK */
  2633. sll KK, BASE_SHIFT + 2, TMP1
  2634. addq AORIG, TMP1, AO
  2635. sll KK, BASE_SHIFT + 0, TMP1
  2636. addq B, TMP1, BO
  2637. subq K, KK, TMP1
  2638. LD a1, 0 * SIZE(AO)
  2639. fclr t1
  2640. LD a2, 1 * SIZE(AO)
  2641. fclr t2
  2642. LD a3, 2 * SIZE(AO)
  2643. fclr t3
  2644. LD a4, 3 * SIZE(AO)
  2645. fclr t4
  2646. LD b1, 0 * SIZE(BO)
  2647. fclr c01
  2648. LD b2, 1 * SIZE(BO)
  2649. fclr c02
  2650. LD b3, 2 * SIZE(BO)
  2651. fclr c03
  2652. LD b4, 3 * SIZE(BO)
  2653. fclr c04
  /* L = (K - KK) >> 2; TMP1 stays live because $L95 reads it again */
  2654. sra TMP1, 2, L
  2655. unop
  2656. ble L, $L95
  2657. #endif
  2658. .align 5
  2659. $L92:
  /* Main loop of the 4-wide tile, unrolled over four steps (b1, b2, b3, b4). */
  /* Each step multiplies a1..a4 by one b value into t1..t4; the ADD at the */
  /* start of a step folds the products of the PREVIOUS step into c01..c04, */
  /* so the products of the last step are still pending in t1..t4 on exit. */
  /* Per iteration AO advances by 16 * SIZE and BO by 4 * SIZE. The unop */
  /* lines are presumably issue-slot fillers -- keep the instruction order. */
  /* step 1: uses b1 and the preloaded a1..a4; loads AO[4..7] and next b1 */
  2660. ADD c01, t1, c01
  2661. unop
  2662. MUL a1, b1, t1
  2663. LD a1, 4 * SIZE(AO)
  2664. ADD c02, t2, c02
  /* loop counter: L = L - 1 */
  2665. lda L, -1(L)
  2666. MUL a2, b1, t2
  2667. LD a2, 5 * SIZE(AO)
  2668. ADD c03, t3, c03
  2669. unop
  2670. MUL a3, b1, t3
  2671. LD a3, 6 * SIZE(AO)
  2672. ADD c04, t4, c04
  2673. MUL a4, b1, t4
  2674. LD a4, 7 * SIZE(AO)
  2675. LD b1, 4 * SIZE(BO)
  /* step 2: uses b2; loads AO[8..11] and next b2 */
  2676. ADD c01, t1, c01
  2677. unop
  2678. MUL a1, b2, t1
  2679. LD a1, 8 * SIZE(AO)
  2680. ADD c02, t2, c02
  2681. unop
  2682. MUL a2, b2, t2
  2683. LD a2, 9 * SIZE(AO)
  2684. ADD c03, t3, c03
  2685. unop
  2686. MUL a3, b2, t3
  2687. LD a3, 10 * SIZE(AO)
  2688. ADD c04, t4, c04
  2689. MUL a4, b2, t4
  2690. LD a4, 11 * SIZE(AO)
  2691. LD b2, 5 * SIZE(BO)
  /* step 3: uses b3; loads AO[12..14] into a1..a3 and AO[15] into a5 */
  2692. ADD c01, t1, c01
  2693. unop
  2694. MUL a1, b3, t1
  2695. LD a1, 12 * SIZE(AO)
  2696. ADD c02, t2, c02
  2697. unop
  2698. MUL a2, b3, t2
  2699. LD a2, 13 * SIZE(AO)
  2700. ADD c03, t3, c03
  2701. unop
  2702. MUL a3, b3, t3
  2703. LD a3, 14 * SIZE(AO)
  2704. ADD c04, t4, c04
  2705. MUL a4, b3, t4
  /* a5 holds the fourth A value for step 4, because a4 is reloaded for */
  /* the next iteration before the last MUL of step 4 executes */
  2706. LD a5, 15 * SIZE(AO)
  2707. LD b3, 6 * SIZE(BO)
  /* step 4: uses b4; AO and BO are advanced here, so the following loads */
  /* use offsets relative to the NEW pointers */
  2708. ADD c01, t1, c01
  2709. MUL a1, b4, t1
  2710. LD a1, 16 * SIZE(AO)
  2711. lda AO, 16 * SIZE(AO)
  2712. ADD c02, t2, c02
  2713. lda BO, 4 * SIZE(BO)
  2714. MUL a2, b4, t2
  2715. LD a2, 1 * SIZE(AO)
  2716. ADD c03, t3, c03
  2717. LD a4, 3 * SIZE(AO)
  2718. MUL a3, b4, t3
  2719. LD a3, 2 * SIZE(AO)
  2720. ADD c04, t4, c04
  2721. MUL a5, b4, t4
  2722. LD b4, 3 * SIZE(BO)
  2723. bgt L, $L92
  2724. .align 4
  2725. $L95:
  /* Remainder of the 4-wide tile: L = (accumulation length) & 3, where the */
  /* length is KK for LT/RN and TMP1 (set to K - KK in the prelude) otherwise. */
  2726. #if defined(LT) || defined(RN)
  2727. and KK, 3, L
  2728. #else
  2729. and TMP1, 3, L
  2730. #endif
  2731. unop
  2732. ble L, $L98
  2733. .align 4
  2734. $L96:
  /* One step per iteration: fold the pending products into c01..c04, */
  /* multiply a1..a4 by b1, then advance AO by 4 * SIZE and BO by 1 * SIZE. */
  2735. ADD c01, t1, c01
  2736. lda L, -1(L)
  2737. MUL a1, b1, t1
  2738. LD a1, 4 * SIZE(AO)
  2739. ADD c02, t2, c02
  /* BO is advanced early; the b1 reload below therefore uses offset 0 */
  2740. lda BO, 1 * SIZE(BO)
  2741. MUL a2, b1, t2
  2742. LD a2, 5 * SIZE(AO)
  2743. ADD c03, t3, c03
  2744. unop
  2745. MUL a3, b1, t3
  2746. LD a3, 6 * SIZE(AO)
  2747. ADD c04, t4, c04
  2748. MUL a4, b1, t4
  2749. LD a4, 7 * SIZE(AO)
  2750. LD b1, 0 * SIZE(BO)
  2751. lda AO, 4 * SIZE(AO)
  2752. bgt L, $L96
  2753. .align 4
  2754. $L98:
  /* Finish the 4-wide tile: fold the last pending products, subtract the */
  /* accumulated sums from the stored right-hand side, apply the triangular */
  /* solve for the selected variant, write the result to the packed buffer */
  /* and to C1, then update the pointers and KK and loop back to $L91. */
  /* ADD/SUB/MUL/LD/ST are macros defined outside this view; the comments */
  /* below assume they use the same "src1, src2, dest" order as subq. */
  2755. ADD c01, t1, c01
  2756. ADD c02, t2, c02
  2757. ADD c03, t3, c03
  2758. ADD c04, t4, c04
  2759. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO and BO at the block to solve. TMP1 = KK - 4 (LN) or */
  /* KK - 1 (RT); AO = AORIG + (TMP1 << (BASE_SHIFT + 2)), */
  /* BO = B + (TMP1 << BASE_SHIFT). */
  2760. #ifdef LN
  2761. subq KK, 4, TMP1
  2762. #else
  2763. subq KK, 1, TMP1
  2764. #endif
  2765. sll TMP1, BASE_SHIFT + 2, TMP2
  2766. addq AORIG, TMP2, AO
  2767. sll TMP1, BASE_SHIFT + 0, TMP2
  2768. addq B, TMP2, BO
  2769. #endif
  /* c0x = (stored value) - c0x; the stored values come from BO for LN/LT */
  /* and from AO for RN/RT */
  2770. #if defined(LN) || defined(LT)
  2771. LD a1, 0 * SIZE(BO)
  2772. LD a2, 1 * SIZE(BO)
  2773. LD a3, 2 * SIZE(BO)
  2774. LD a4, 3 * SIZE(BO)
  2775. SUB a1, c01, c01
  2776. SUB a2, c02, c02
  2777. SUB a3, c03, c03
  2778. SUB a4, c04, c04
  2779. #else
  2780. LD a1, 0 * SIZE(AO)
  2781. LD a2, 1 * SIZE(AO)
  2782. LD a3, 2 * SIZE(AO)
  2783. LD a4, 3 * SIZE(AO)
  2784. SUB a1, c01, c01
  2785. SUB a2, c02, c02
  2786. SUB a3, c03, c03
  2787. SUB a4, c04, c04
  2788. #endif
  2789. #ifdef LN
  /* LN: substitution from c04 down to c01 using AO entries 15,14,13,12 / */
  /* 10,9,8 / 5,4 / 0. Entries 15, 10, 5 and 0 are applied with MUL, not a */
  /* divide -- presumably the packed diagonal holds reciprocals; confirm */
  /* against the packing routine. */
  2790. LD a1, 15 * SIZE(AO)
  2791. LD a2, 14 * SIZE(AO)
  2792. LD a3, 13 * SIZE(AO)
  2793. LD a4, 12 * SIZE(AO)
  2794. MUL a1, c04, c04
  2795. MUL a2, c04, t1
  2796. SUB c03, t1, c03
  2797. MUL a3, c04, t1
  2798. SUB c02, t1, c02
  2799. MUL a4, c04, t1
  2800. SUB c01, t1, c01
  2801. LD b1, 10 * SIZE(AO)
  2802. LD b2, 9 * SIZE(AO)
  2803. LD b3, 8 * SIZE(AO)
  2804. MUL b1, c03, c03
  2805. MUL b2, c03, t1
  2806. SUB c02, t1, c02
  2807. MUL b3, c03, t1
  2808. SUB c01, t1, c01
  2809. LD a1, 5 * SIZE(AO)
  2810. LD a2, 4 * SIZE(AO)
  2811. LD a3, 0 * SIZE(AO)
  2812. MUL a1, c02, c02
  2813. MUL a2, c02, t1
  2814. SUB c01, t1, c01
  2815. MUL a3, c01, c01
  2816. #endif
  2817. #ifdef LT
  /* LT: substitution from c01 up to c04 using AO entries 0,1,2,3 / 5,6,7 / */
  /* 10,11 / 15; entries 0, 5, 10 and 15 are applied with MUL (same */
  /* reciprocal-diagonal assumption as the LN case). */
  2818. LD a1, 0 * SIZE(AO)
  2819. LD a2, 1 * SIZE(AO)
  2820. LD a3, 2 * SIZE(AO)
  2821. LD a4, 3 * SIZE(AO)
  2822. MUL a1, c01, c01
  2823. MUL a2, c01, t1
  2824. SUB c02, t1, c02
  2825. MUL a3, c01, t1
  2826. SUB c03, t1, c03
  2827. MUL a4, c01, t1
  2828. SUB c04, t1, c04
  2829. LD b1, 5 * SIZE(AO)
  2830. LD b2, 6 * SIZE(AO)
  2831. LD b3, 7 * SIZE(AO)
  2832. MUL b1, c02, c02
  2833. MUL b2, c02, t1
  2834. SUB c03, t1, c03
  2835. MUL b3, c02, t1
  2836. SUB c04, t1, c04
  2837. LD a1, 10 * SIZE(AO)
  2838. LD a2, 11 * SIZE(AO)
  2839. LD a3, 15 * SIZE(AO)
  2840. MUL a1, c03, c03
  2841. MUL a2, c03, t1
  2842. SUB c04, t1, c04
  2843. MUL a3, c04, c04
  2844. #endif
  2845. #if defined(RN) || defined(RT)
  /* RN/RT: all four values are scaled by the single entry BO[0] */
  2846. LD a1, 0 * SIZE(BO)
  2847. MUL a1, c01, c01
  2848. MUL a1, c02, c02
  2849. MUL a1, c03, c03
  2850. MUL a1, c04, c04
  2851. #endif
  /* write the solved values back to the buffer they were read from */
  2852. #if defined(LN) || defined(LT)
  2853. ST c01, 0 * SIZE(BO)
  2854. ST c02, 1 * SIZE(BO)
  2855. ST c03, 2 * SIZE(BO)
  2856. ST c04, 3 * SIZE(BO)
  2857. #else
  2858. ST c01, 0 * SIZE(AO)
  2859. ST c02, 1 * SIZE(AO)
  2860. ST c03, 2 * SIZE(AO)
  2861. ST c04, 3 * SIZE(AO)
  2862. #endif
  /* store to C1: LN moves C1 back by 4 * SIZE before the store, every */
  /* other variant moves it forward by 4 * SIZE after the store */
  2863. #ifdef LN
  2864. lda C1, -4 * SIZE(C1)
  2865. #endif
  2866. ST c01, 0 * SIZE(C1)
  2867. ST c02, 1 * SIZE(C1)
  2868. ST c03, 2 * SIZE(C1)
  2869. ST c04, 3 * SIZE(C1)
  2870. #ifndef LN
  2871. lda C1, 4 * SIZE(C1)
  2872. #endif
  2873. fclr t1
  2874. fclr t2
  2875. fclr t3
  2876. fclr t4
  2877. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT) */
  2878. sll K, 2 + BASE_SHIFT, TMP1
  2879. addq AORIG, TMP1, AORIG
  2880. #endif
  2881. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps: */
  /* AO += (K - KK) << (BASE_SHIFT + 2), BO += (K - KK) << BASE_SHIFT */
  2882. subq K, KK, TMP1
  2883. sll TMP1, BASE_SHIFT + 2, TMP2
  2884. addq AO, TMP2, AO
  2885. sll TMP1, BASE_SHIFT + 0, TMP2
  2886. addq BO, TMP2, BO
  2887. #endif
  2888. #ifdef LT
  2889. addq KK, 4, KK
  2890. #endif
  2891. #ifdef LN
  2892. subq KK, 4, KK
  2893. #endif
  /* next 4-wide tile while I > 0 */
  2894. lda I, -1(I)
  2895. bgt I, $L91
  2896. .align 4
  2897. $L100:
  /* 2-wide tile, run at most once: taken only when bit 1 of M is set */
  /* (I = M & 2), otherwise control goes to the 1-wide case at $L110. */
  /* The prelude mirrors the 4-wide one with a shift of BASE_SHIFT + 1 for */
  /* A: preload a1..a4 and b1..b4, clear t1..t4 and c01..c04, and set L to */
  /* the number of 4x-unrolled iterations. */
  2898. and M, 2, I
  2899. ble I, $L110
  2900. #if defined(LT) || defined(RN)
  /* LT/RN: AO is already in place; accumulation length is KK */
  2901. LD a1, 0 * SIZE(AO)
  2902. fclr t1
  2903. LD a2, 1 * SIZE(AO)
  2904. fclr t2
  2905. LD a3, 2 * SIZE(AO)
  2906. fclr t3
  2907. LD a4, 3 * SIZE(AO)
  2908. fclr t4
  2909. LD b1, 0 * SIZE(B)
  2910. fclr c01
  2911. LD b2, 1 * SIZE(B)
  2912. fclr c02
  2913. LD b3, 2 * SIZE(B)
  2914. fclr c03
  2915. LD b4, 3 * SIZE(B)
  2916. fclr c04
  /* L = KK >> 2; BO = B */
  2917. sra KK, 2, L
  2918. mov B, BO
  2919. ble L, $L105
  2920. #else
  2921. #ifdef LN
  /* LN: move AORIG back by K << (BASE_SHIFT + 1) for this tile */
  2922. sll K, BASE_SHIFT + 1, TMP1
  2923. subq AORIG, TMP1, AORIG
  2924. #endif
  /* AO = AORIG + (KK << (BASE_SHIFT + 1)); BO = B + (KK << BASE_SHIFT); */
  /* accumulation length is TMP1 = K - KK */
  2925. sll KK, BASE_SHIFT + 1, TMP1
  2926. addq AORIG, TMP1, AO
  2927. sll KK, BASE_SHIFT + 0, TMP1
  2928. addq B, TMP1, BO
  2929. subq K, KK, TMP1
  2930. LD a1, 0 * SIZE(AO)
  2931. fclr t1
  2932. LD a2, 1 * SIZE(AO)
  2933. fclr t2
  2934. LD a3, 2 * SIZE(AO)
  2935. fclr t3
  2936. LD a4, 3 * SIZE(AO)
  2937. fclr t4
  2938. LD b1, 0 * SIZE(BO)
  2939. fclr c01
  2940. LD b2, 1 * SIZE(BO)
  2941. fclr c02
  2942. LD b3, 2 * SIZE(BO)
  2943. fclr c03
  2944. LD b4, 3 * SIZE(BO)
  2945. fclr c04
  /* L = (K - KK) >> 2; TMP1 stays live because $L105 reads it again */
  2946. sra TMP1, 2, L
  2947. ble L, $L105
  2948. #endif
  2949. .align 5
  2950. $L102:
  /* Main loop of the 2-wide tile, four steps per iteration. The steps using */
  /* b1 and b3 accumulate into c01/c02 through t1/t2; the steps using b2 and */
  /* b4 accumulate into c03/c04 through t3/t4. The two accumulator pairs are */
  /* summed together after the loops, at $L108. Per iteration AO advances */
  /* by 8 * SIZE and BO by 4 * SIZE. */
  /* step 1 (b1): A values in a1/a2 */
  2951. ADD c01, t1, c01
  2952. lda L, -1(L)
  2953. MUL a1, b1, t1
  2954. LD a1, 4 * SIZE(AO)
  2955. ADD c02, t2, c02
  2956. MUL a2, b1, t2
  2957. LD a2, 5 * SIZE(AO)
  2958. LD b1, 4 * SIZE(BO)
  /* step 2 (b2): A values in a3/a4; BO is advanced here, so the later b */
  /* reloads use offsets relative to the new BO */
  2959. ADD c03, t3, c03
  2960. lda BO, 4 * SIZE(BO)
  2961. MUL a3, b2, t3
  2962. LD a3, 6 * SIZE(AO)
  2963. ADD c04, t4, c04
  2964. MUL a4, b2, t4
  /* a5 carries AO[7] for step 4 because a4 is reloaded for the next */
  /* iteration before the last MUL executes */
  2965. LD a5, 7 * SIZE(AO)
  2966. LD b2, 1 * SIZE(BO)
  /* step 3 (b3): AO is advanced here */
  2967. ADD c01, t1, c01
  2968. MUL a1, b3, t1
  2969. LD a1, 8 * SIZE(AO)
  2970. lda AO, 8 * SIZE(AO)
  2971. ADD c02, t2, c02
  2972. MUL a2, b3, t2
  2973. LD b3, 2 * SIZE(BO)
  2974. LD a2, 1 * SIZE(AO)
  /* step 4 (b4): A values in a3/a5 */
  2975. ADD c03, t3, c03
  2976. LD a4, 3 * SIZE(AO)
  2977. MUL a3, b4, t3
  2978. LD a3, 2 * SIZE(AO)
  2979. ADD c04, t4, c04
  2980. MUL a5, b4, t4
  2981. LD b4, 3 * SIZE(BO)
  2982. bgt L, $L102
  2983. .align 4
  2984. $L105:
  /* Remainder of the 2-wide tile: L = (accumulation length) & 3, where the */
  /* length is KK for LT/RN and TMP1 (set to K - KK in the prelude) otherwise. */
  2985. #if defined(LT) || defined(RN)
  2986. and KK, 3, L
  2987. #else
  2988. and TMP1, 3, L
  2989. #endif
  2990. ble L, $L108
  2991. .align 4
  2992. $L106:
  /* One step per iteration, accumulating only into c01/c02 through t1/t2; */
  /* AO advances by 2 * SIZE and BO by 1 * SIZE. */
  2993. ADD c01, t1, c01
  2994. lda L, -1(L)
  2995. MUL a1, b1, t1
  2996. LD a1, 2 * SIZE(AO)
  2997. ADD c02, t2, c02
  2998. MUL a2, b1, t2
  2999. LD a2, 3 * SIZE(AO)
  3000. LD b1, 1 * SIZE(BO)
  3001. lda AO, 2 * SIZE(AO)
  3002. unop
  3003. lda BO, 1 * SIZE(BO)
  3004. bgt L, $L106
  3005. .align 4
  3006. $L108:
  /* Finish the 2-wide tile: fold the pending products, merge the two */
  /* accumulator pairs, subtract from the stored right-hand side, apply the */
  /* 2x2 triangular solve, store to the packed buffer and to C1, then update */
  /* pointers and KK. There is no loop back: this tile runs at most once. */
  /* ADD/SUB/MUL/LD/ST are macros defined outside this view; the comments */
  /* assume the same "src1, src2, dest" operand order as subq. */
  3007. ADD c01, t1, c01
  3008. ADD c02, t2, c02
  3009. ADD c03, t3, c03
  3010. ADD c04, t4, c04
  /* c01 += c03, c02 += c04: combine the two partial sums built in $L102 */
  3011. ADD c01, c03, c01
  3012. ADD c02, c04, c02
  3013. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO and BO at the block to solve. TMP1 = KK - 2 (LN) or */
  /* KK - 1 (RT); AO = AORIG + (TMP1 << (BASE_SHIFT + 1)), */
  /* BO = B + (TMP1 << BASE_SHIFT). */
  3014. #ifdef LN
  3015. subq KK, 2, TMP1
  3016. #else
  3017. subq KK, 1, TMP1
  3018. #endif
  3019. sll TMP1, BASE_SHIFT + 1, TMP2
  3020. addq AORIG, TMP2, AO
  3021. sll TMP1, BASE_SHIFT + 0, TMP2
  3022. addq B, TMP2, BO
  3023. #endif
  /* c0x = (stored value) - c0x; read from BO for LN/LT, from AO for RN/RT */
  3024. #if defined(LN) || defined(LT)
  3025. LD a1, 0 * SIZE(BO)
  3026. LD a2, 1 * SIZE(BO)
  3027. SUB a1, c01, c01
  3028. SUB a2, c02, c02
  3029. #else
  3030. LD a1, 0 * SIZE(AO)
  3031. LD a2, 1 * SIZE(AO)
  3032. SUB a1, c01, c01
  3033. SUB a2, c02, c02
  3034. #endif
  3035. #ifdef LN
  /* LN: solve c02 first (AO[3]), eliminate with AO[2], then c01 (AO[0]). */
  /* AO[3] and AO[0] are applied with MUL -- presumably reciprocals of the */
  /* diagonal; confirm against the packing routine. */
  3036. LD a1, 3 * SIZE(AO)
  3037. LD a2, 2 * SIZE(AO)
  3038. LD a3, 0 * SIZE(AO)
  3039. MUL a1, c02, c02
  3040. MUL a2, c02, t1
  3041. SUB c01, t1, c01
  3042. MUL a3, c01, c01
  3043. #endif
  3044. #ifdef LT
  /* LT: solve c01 first (AO[0]), eliminate with AO[1], then c02 (AO[3]) */
  3045. LD a1, 0 * SIZE(AO)
  3046. LD a2, 1 * SIZE(AO)
  3047. LD a3, 3 * SIZE(AO)
  3048. MUL a1, c01, c01
  3049. MUL a2, c01, t1
  3050. SUB c02, t1, c02
  3051. MUL a3, c02, c02
  3052. #endif
  3053. #if defined(RN) || defined(RT)
  /* RN/RT: both values are scaled by the single entry BO[0] */
  3054. LD a1, 0 * SIZE(BO)
  3055. MUL a1, c01, c01
  3056. MUL a1, c02, c02
  3057. #endif
  /* write the solved values back to the buffer they were read from */
  3058. #if defined(LN) || defined(LT)
  3059. ST c01, 0 * SIZE(BO)
  3060. ST c02, 1 * SIZE(BO)
  3061. #else
  3062. ST c01, 0 * SIZE(AO)
  3063. ST c02, 1 * SIZE(AO)
  3064. #endif
  /* store to C1: LN moves C1 back by 2 * SIZE before the store, every */
  /* other variant moves it forward by 2 * SIZE after the store */
  3065. #ifdef LN
  3066. lda C1, -2 * SIZE(C1)
  3067. #endif
  3068. ST c01, 0 * SIZE(C1)
  3069. ST c02, 1 * SIZE(C1)
  3070. #ifndef LN
  3071. lda C1, 2 * SIZE(C1)
  3072. #endif
  3073. fclr t1
  3074. fclr t2
  3075. fclr t3
  3076. fclr t4
  3077. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT) */
  3078. sll K, 1 + BASE_SHIFT, TMP1
  3079. addq AORIG, TMP1, AORIG
  3080. #endif
  3081. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps: */
  /* AO += (K - KK) << (BASE_SHIFT + 1), BO += (K - KK) << BASE_SHIFT */
  3082. subq K, KK, TMP1
  3083. sll TMP1, BASE_SHIFT + 1, TMP2
  3084. addq AO, TMP2, AO
  3085. sll TMP1, BASE_SHIFT + 0, TMP2
  3086. addq BO, TMP2, BO
  3087. #endif
  3088. #ifdef LT
  3089. addq KK, 2, KK
  3090. #endif
  3091. #ifdef LN
  3092. subq KK, 2, KK
  3093. #endif
  3094. .align 4
  3095. $L110:
  /* 1-wide tile, run at most once: taken only when bit 0 of M is set */
  /* (I = M & 1), otherwise control goes to $L119. The prelude mirrors the */
  /* wider tiles with a shift of BASE_SHIFT + 0 for A: preload a1..a4 and */
  /* b1..b4, clear t1..t4 and c01..c04, and set L to the number of */
  /* 4x-unrolled iterations. */
  3096. and M, 1, I
  3097. ble I, $L119
  3098. #if defined(LT) || defined(RN)
  /* LT/RN: AO is already in place; accumulation length is KK */
  3099. LD a1, 0 * SIZE(AO)
  3100. fclr t1
  3101. LD a2, 1 * SIZE(AO)
  3102. fclr t2
  3103. LD a3, 2 * SIZE(AO)
  3104. fclr t3
  3105. LD a4, 3 * SIZE(AO)
  3106. fclr t4
  3107. LD b1, 0 * SIZE(B)
  3108. fclr c01
  3109. LD b2, 1 * SIZE(B)
  3110. fclr c02
  3111. LD b3, 2 * SIZE(B)
  3112. fclr c03
  3113. LD b4, 3 * SIZE(B)
  3114. fclr c04
  /* L = KK >> 2; BO = B */
  3115. sra KK, 2, L
  3116. mov B, BO
  3117. unop
  3118. ble L, $L115
  3119. #else
  3120. #ifdef LN
  /* LN: move AORIG back by K << BASE_SHIFT for this tile */
  3121. sll K, BASE_SHIFT + 0, TMP1
  3122. subq AORIG, TMP1, AORIG
  3123. #endif
  /* AO = AORIG + (KK << BASE_SHIFT); BO = B + (KK << BASE_SHIFT); */
  /* accumulation length is TMP1 = K - KK */
  3124. sll KK, BASE_SHIFT + 0, TMP1
  3125. addq AORIG, TMP1, AO
  3126. sll KK, BASE_SHIFT + 0, TMP1
  3127. addq B, TMP1, BO
  3128. subq K, KK, TMP1
  3129. LD a1, 0 * SIZE(AO)
  3130. fclr t1
  3131. LD a2, 1 * SIZE(AO)
  3132. fclr t2
  3133. LD a3, 2 * SIZE(AO)
  3134. fclr t3
  3135. LD a4, 3 * SIZE(AO)
  3136. fclr t4
  3137. LD b1, 0 * SIZE(BO)
  3138. fclr c01
  3139. LD b2, 1 * SIZE(BO)
  3140. fclr c02
  3141. LD b3, 2 * SIZE(BO)
  3142. fclr c03
  3143. LD b4, 3 * SIZE(BO)
  3144. fclr c04
  /* L = (K - KK) >> 2; TMP1 stays live because $L115 reads it again */
  3145. sra TMP1, 2, L
  3146. unop
  3147. ble L, $L115
  3148. #endif
  3149. .align 4
  3150. $L112:
  /* Main loop of the 1-wide tile: four steps per iteration, each with its */
  /* own accumulator (a1*b1 -> c01, a2*b2 -> c02, a3*b3 -> c03, */
  /* a4*b4 -> c04, through t1..t4). The four partial sums are combined at */
  /* $L118. Operands for the next iteration are loaded from offsets 4..7 */
  /* before AO and BO are each advanced by 4 * SIZE at the end. */
  3151. ADD c01, t1, c01
  3152. MUL a1, b1, t1
  3153. LD a1, 4 * SIZE(AO)
  3154. LD b1, 4 * SIZE(BO)
  3155. ADD c02, t2, c02
  3156. MUL a2, b2, t2
  3157. LD a2, 5 * SIZE(AO)
  3158. LD b2, 5 * SIZE(BO)
  3159. ADD c03, t3, c03
  3160. MUL a3, b3, t3
  3161. LD a3, 6 * SIZE(AO)
  3162. LD b3, 6 * SIZE(BO)
  3163. ADD c04, t4, c04
  3164. MUL a4, b4, t4
  3165. LD a4, 7 * SIZE(AO)
  3166. LD b4, 7 * SIZE(BO)
  3167. lda L, -1(L)
  3168. lda AO, 4 * SIZE(AO)
  3169. lda BO, 4 * SIZE(BO)
  3170. bgt L, $L112
  3171. .align 4
  3172. $L115:
  /* Remainder of the 1-wide tile: L = (accumulation length) & 3, where the */
  /* length is KK for LT/RN and TMP1 (set to K - KK in the prelude) otherwise. */
  3173. #if defined(LT) || defined(RN)
  3174. and KK, 3, L
  3175. #else
  3176. and TMP1, 3, L
  3177. #endif
  3178. ble L, $L118
  3179. .align 4
  3180. $L116:
  /* One step per iteration, accumulating a1*b1 into c01 through t1; AO and */
  /* BO each advance by 1 * SIZE. */
  3181. ADD c01, t1, c01
  3182. MUL a1, b1, t1
  3183. LD a1, 1 * SIZE(AO)
  3184. LD b1, 1 * SIZE(BO)
  3185. lda L, -1(L)
  3186. lda AO, 1 * SIZE(AO)
  3187. lda BO, 1 * SIZE(BO)
  3188. bgt L, $L116
  3189. .align 4
  3190. $L118:
  /* Finish the 1-wide tile: fold the pending products, reduce the four */
  /* partial sums to c01, subtract from the stored right-hand side, scale by */
  /* the single triangular entry, store to the packed buffer and to C1, then */
  /* update pointers and KK. ADD/SUB/MUL/LD/ST/SXADDQ are macros defined */
  /* outside this view; the comments assume "src1, src2, dest" order. */
  3191. ADD c01, t1, c01
  3192. ADD c02, t2, c02
  3193. ADD c03, t3, c03
  3194. ADD c04, t4, c04
  /* c01 = (c01 + c02) + (c03 + c04) */
  3195. ADD c01, c02, c01
  3196. ADD c03, c04, c03
  3197. ADD c01, c03, c01
  3198. #if defined(LN) || defined(RT)
  /* LN/RT: the same byte offset (KK - 1) << BASE_SHIFT positions both */
  /* AO (from AORIG) and BO (from B) */
  3199. subq KK, 1, TMP1
  3200. sll TMP1, BASE_SHIFT + 0, TMP2
  3201. addq AORIG, TMP2, AO
  3202. addq B, TMP2, BO
  3203. #endif
  /* c01 = (stored value) - c01; read from BO for LN/LT, from AO for RN/RT */
  3204. #if defined(LN) || defined(LT)
  3205. LD a1, 0 * SIZE(BO)
  3206. SUB a1, c01, c01
  3207. #else
  3208. LD a1, 0 * SIZE(AO)
  3209. SUB a1, c01, c01
  3210. #endif
  3211. #if defined(LN) || defined(LT)
  /* LN/LT: scale by AO[0] (a multiply -- presumably the reciprocal of the */
  /* diagonal entry; confirm against the packing routine) */
  3212. LD a1, 0 * SIZE(AO)
  3213. MUL a1, c01, c01
  3214. #endif
  3215. #if defined(RN) || defined(RT)
  /* RN/RT: scale by BO[0] */
  3216. LD a1, 0 * SIZE(BO)
  3217. MUL a1, c01, c01
  3218. #endif
  /* write the solved value back to the buffer it was read from */
  3219. #if defined(LN) || defined(LT)
  3220. ST c01, 0 * SIZE(BO)
  3221. #else
  3222. ST c01, 0 * SIZE(AO)
  3223. #endif
  /* store to C1: LN moves C1 back by 1 * SIZE before the store, every */
  /* other variant moves it forward by 1 * SIZE after the store */
  3224. #ifdef LN
  3225. lda C1, -1 * SIZE(C1)
  3226. #endif
  3227. ST c01, 0 * SIZE(C1)
  3228. #ifndef LN
  3229. lda C1, 1 * SIZE(C1)
  3230. #endif
  3231. #ifdef RT
  /* RT: presumably AORIG += K * SIZE (SXADDQ is defined elsewhere) -- TODO confirm */
  3232. SXADDQ K, AORIG, AORIG
  3233. #endif
  3234. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps; AO and BO both advance by */
  /* (K - KK) << BASE_SHIFT */
  3235. subq K, KK, TMP1
  3236. sll TMP1, BASE_SHIFT + 0, TMP2
  3237. addq AO, TMP2, AO
  3238. addq BO, TMP2, BO
  3239. #endif
  3240. #ifdef LT
  3241. addq KK, 1, KK
  3242. #endif
  3243. #ifdef LN
  3244. subq KK, 1, KK
  3245. #endif
  3246. .align 4
  3247. $L119:
  /* End of the N & 1 section: per-variant bookkeeping of B and KK before */
  /* falling through to the exit at $L999. */
  3248. #ifdef LN
  /* LN: presumably B += K * SIZE (SXADDQ is defined elsewhere) -- TODO confirm */
  3249. SXADDQ K, B, B
  3250. #endif
  3251. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from the current value of BO */
  3252. mov BO, B
  3253. #endif
  3254. #ifdef RN
  /* RN: KK += 1 */
  3255. addq KK, 1, KK
  3256. #endif
  3257. #ifdef RT
  /* RT: KK -= 1 */
  3258. subq KK, 1, KK
  3259. #endif
  3260. .align 4
  3261. $L999:
  /* Common exit: reload $f2..$f9 from the first 64 bytes of the stack */
  /* frame (presumably the values saved by the prologue, which is outside */
  /* this view), zero $0, release STACKSIZE bytes of stack and return. */
  3262. ldt $f2, 0($sp)
  3263. ldt $f3, 8($sp)
  3264. ldt $f4, 16($sp)
  3265. ldt $f5, 24($sp)
  3266. ldt $f6, 32($sp)
  3267. ldt $f7, 40($sp)
  3268. ldt $f8, 48($sp)
  3269. ldt $f9, 56($sp)
  /* $0 = 0; $0 is the integer return-value register in the Alpha calling */
  /* standard, so the routine presumably reports 0 to its caller */
  3270. clr $0
  3271. lda $sp, STACKSIZE($sp)
  3272. ret
  3273. EPILOGUE