You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_2x2_LT.S 33 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build-time configuration.
   *
   * ASSEMBLER is defined before the shared headers are included
   * (presumably so that common.h exposes its assembler-side definitions;
   * TODO confirm against common.h).  At least one of EV4, EV5 or EV6 must
   * be defined by the build, otherwise preprocessing stops with #error.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  42. #error "Architecture is not specified."
  43. #endif
  /*
   * Per-CPU tuning macros.
   *
   *   PREFETCHSIZE  Offset, in units of SIZE, used by the loads into $31
   *                 from AO and BO at the top of the main 2x2 loop.  It is
   *                 only defined for EV6 and EV5; the loop guards those
   *                 loads with "ifndef EV4", so EV4 never needs it.
   *   UNOP          Expands to a "unop" instruction on EV6 and to nothing
   *                 on EV5 and EV4.
   */
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56
  46. #define UNOP unop
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 48
  50. #define UNOP
  51. #endif
  52. #ifdef EV4
  53. #define UNOP
  54. #endif
  /*
   * Assembler setup and declaration of the kernel entry point CNAME
   * (the symbol name itself is not defined in this file).
   *
   * .set noat is required here because AO is aliased to $at in the
   * register table below, i.e. the kernel uses $at explicitly.
   *
   * NOTE(review): .arch ev6 is emitted unconditionally, even when the
   * build defines EV4 or EV5 -- confirm that this is intended.
   */
  55. .set noat
  56. .set noreorder
  57. .arch ev6
  58. .text
  59. .align 5
  60. .globl CNAME
  61. .ent CNAME
  /*
   * Stack frame size in bytes.  The prologue lowers $sp by this amount,
   * stores $f2-$f9 at offsets 0..56 of the new frame, and reads the
   * stack-passed arguments (B, C, LDC, OFFSET) at STACKSIZE + 0..24.
   */
  62. #define STACKSIZE 80
  /*
   * Register allocation for the kernel.
   *
   * M, N, K and A are used directly from $16, $17, $18 and $21 on entry.
   * B, C, LDC and OFFSET are loaded from the caller's stack area by the
   * prologue.  LDC is then scaled by ZBASE_SHIFT before it is used as a
   * column stride.
   */
  63. #define M $16
  64. #define N $17
  65. #define K $18
  66. #define A $21
  67. #define B $22
  68. #define C $20
  69. #define LDC $23
  /* C1 / C2: pointers to the two output columns of the current panel
     (C1 = C and C2 = C + LDC in the non-RT path). */
  70. #define C1 $19
  71. #define C2 $24
  /* AO / BO: moving read pointers into A and B.  AO lives in $at. */
  72. #define AO $at
  73. #define BO $5
  /* Loop counters: I counts row blocks (M >> 1), J counts column blocks
     (N >> 1), L counts the remaining inner-product iterations. */
  74. #define I $6
  75. #define J $7
  76. #define L $8
  /* a1-a4 / b1-b4: operands loaded from AO / BO (and, in the solve step,
     from the packed triangular data). */
  77. #define a1 $f16
  78. #define a2 $f17
  79. #define a3 $f18
  80. #define a4 $f19
  81. #define b1 $f20
  82. #define b2 $f21
  83. #define b3 $f22
  84. #define b4 $f23
  /* t1-t4: receive the MUL results before they are folded into the
     accumulators by ADD1..ADD6 / SUB. */
  85. #define t1 $f24
  86. #define t2 $f25
  87. #define t3 $f26
  88. #define t4 $f27
  /* a5, a6, b5: extra load targets used inside the unrolled main loop. */
  89. #define a5 $f28
  90. #define a6 $f30
  91. #define b5 $f29
  /* alpha_i / alpha_r share $f29 / $f30 with b5 / a6.  Neither alias is
     referenced in the part of the kernel visible here. */
  92. #define alpha_i $f29
  93. #define alpha_r $f30
  /* c01-c16: accumulators, mapped onto $f0-$f15 in order.  Of these,
     $f2-$f9 (c03-c10) are the registers the prologue saves to the stack. */
  94. #define c01 $f0
  95. #define c02 $f1
  96. #define c03 $f2
  97. #define c04 $f3
  98. #define c05 $f4
  99. #define c06 $f5
  100. #define c07 $f6
  101. #define c08 $f7
  102. #define c09 $f8
  103. #define c10 $f9
  104. #define c11 $f10
  105. #define c12 $f11
  106. #define c13 $f12
  107. #define c14 $f13
  108. #define c15 $f14
  109. #define c16 $f15
  /* TMP1 / TMP2: integer scratch for address and count arithmetic.
     KK:     running count derived from OFFSET; it sets the length of the
             inner-product loop in the LT/RN paths.
     AORIG:  base of A kept for the LN/RT paths, which recompute AO from it.
     OFFSET: value loaded from the stack at entry and used to seed KK. */
  110. #define TMP1 $0
  111. #define TMP2 $1
  112. #define KK $2
  113. #define AORIG $3
  114. #define OFFSET $4
  /*
   * Sign selection for the complex arithmetic.
   *
   * ADD1..ADD4 are used by the multiply-accumulate loops and ADD5/ADD6 by
   * the triangular solve step.  Each one resolves to either ADD or SUB
   * (both defined outside this file) depending on:
   *   - the side of the solve: LN/LT versus everything else, and
   *   - whether CONJ is defined.
   *
   * Without CONJ the mapping is the same for both sides.  With CONJ the
   * two sides differ only in ADD3 and ADD4:
   *   LN/LT : ADD3 = SUB, ADD4 = ADD
   *   other : ADD3 = ADD, ADD4 = SUB
   */
  115. #if defined(LN) || defined(LT)
  116. #ifndef CONJ
  117. #define ADD1 ADD
  118. #define ADD2 SUB
  119. #define ADD3 ADD
  120. #define ADD4 ADD
  121. #define ADD5 SUB
  122. #define ADD6 ADD
  123. #else
  124. #define ADD1 ADD
  125. #define ADD2 ADD
  126. #define ADD3 SUB
  127. #define ADD4 ADD
  128. #define ADD5 ADD
  129. #define ADD6 SUB
  130. #endif
  131. #else
  /* Neither LN nor LT is defined (the kernel body handles RN and RT here). */
  132. #ifndef CONJ
  133. #define ADD1 ADD
  134. #define ADD2 SUB
  135. #define ADD3 ADD
  136. #define ADD4 ADD
  137. #define ADD5 SUB
  138. #define ADD6 ADD
  139. #else
  140. #define ADD1 ADD
  141. #define ADD2 ADD
  142. #define ADD3 ADD
  143. #define ADD4 SUB
  144. #define ADD5 ADD
  145. #define ADD6 SUB
  146. #endif
  147. #endif
  148. CNAME:
  149. .frame $sp, STACKSIZE, $26, 0
  150. #ifdef PROFILE
  151. ldgp $gp, 0($27)
  152. lda $at, _mcount
  153. jsr $at, ($at), _mcount
  154. #endif
  155. #ifndef PROFILE
  156. .prologue 0
  157. #else
  158. .prologue 1
  159. #endif
  160. lda $sp, -STACKSIZE($sp)
  161. ldq B, 0 + STACKSIZE($sp)
  162. ldq C, 8 + STACKSIZE($sp)
  163. ldq LDC, 16 + STACKSIZE($sp)
  164. ldq OFFSET, 24 + STACKSIZE($sp)
  165. sll LDC, ZBASE_SHIFT, LDC
  166. stt $f2, 0($sp)
  167. stt $f3, 8($sp)
  168. stt $f4, 16($sp)
  169. stt $f5, 24($sp)
  170. stt $f6, 32($sp)
  171. stt $f7, 40($sp)
  172. stt $f8, 48($sp)
  173. stt $f9, 56($sp)
  174. cmple M, 0, $0
  175. cmple N, 0, $1
  176. cmple K, 0, $2
  177. or $0, $1, $0
  178. or $0, $2, $0
  179. bne $0, $L999
  180. #ifdef LN
  181. addq M, M, TMP2
  182. mulq TMP2, K, TMP1
  183. SXADDQ TMP1, A, A
  184. SXADDQ TMP2, C, C
  185. #endif
  186. #ifdef RN
  187. negq OFFSET, KK
  188. #endif
  189. #ifdef RT
  190. mulq N, K, TMP1
  191. addq TMP1, TMP1, TMP1
  192. SXADDQ TMP1, B, B
  193. mulq N, LDC, TMP1
  194. addq TMP1, C, C
  195. subq N, OFFSET, KK
  196. #endif
  197. sra N, 1, J
  198. ble J, $L30
  199. .align 4
  200. $L01:
  201. #ifdef RT
  202. sll K, ZBASE_SHIFT + 1, TMP1
  203. subq B, TMP1, B
  204. subq C, LDC, C2
  205. subq C2, LDC, C1
  206. subq C2, LDC, C
  207. #else
  208. mov C, C1
  209. addq C, LDC, C2
  210. addq C2, LDC, C
  211. #endif
  212. #ifdef LN
  213. addq M, OFFSET, KK
  214. #endif
  215. #ifdef LT
  216. mov OFFSET, KK
  217. #endif
  218. #if defined(LN) || defined(RT)
  219. mov A, AORIG
  220. #else
  221. mov A, AO
  222. #endif
  223. sra M, 1, I
  224. fclr t1
  225. fclr t2
  226. fclr t3
  227. fclr t4
  228. fclr c01
  229. fclr c05
  230. ble I, $L20
  231. .align 4
  232. $L11:
  233. #if defined(LT) || defined(RN)
  234. LD a1, 0 * SIZE(AO)
  235. fclr c09
  236. LD a2, 1 * SIZE(AO)
  237. fclr c13
  238. LD a3, 2 * SIZE(AO)
  239. fclr c02
  240. LD a4, 3 * SIZE(AO)
  241. fclr c06
  242. LD b1, 0 * SIZE(B)
  243. fclr c10
  244. LD b2, 1 * SIZE(B)
  245. fclr c14
  246. LD b3, 2 * SIZE(B)
  247. fclr c03
  248. LD b4, 3 * SIZE(B)
  249. fclr c07
  250. lda BO, 4 * SIZE(B)
  251. fclr c11
  252. lda AO, 4 * SIZE(AO)
  253. fclr c15
  254. lds $f31, 4 * SIZE(C1)
  255. fclr c04
  256. lda L, -2(KK)
  257. fclr c08
  258. lds $f31, 4 * SIZE(C2)
  259. fclr c12
  260. fclr c16
  261. ble KK, $L18
  262. ble L, $L15
  263. #else
  264. #ifdef LN
  265. sll K, ZBASE_SHIFT + 1, TMP1
  266. subq AORIG, TMP1, AORIG
  267. #endif
  268. sll KK, ZBASE_SHIFT + 1, TMP1
  269. addq AORIG, TMP1, AO
  270. addq B, TMP1, BO
  271. subq K, KK, TMP1
  272. LD a1, 0 * SIZE(AO)
  273. fclr c09
  274. LD a2, 1 * SIZE(AO)
  275. fclr c13
  276. LD a3, 2 * SIZE(AO)
  277. fclr c02
  278. LD a4, 3 * SIZE(AO)
  279. fclr c06
  280. LD b1, 0 * SIZE(BO)
  281. fclr c10
  282. LD b2, 1 * SIZE(BO)
  283. fclr c14
  284. LD b3, 2 * SIZE(BO)
  285. fclr c03
  286. LD b4, 3 * SIZE(BO)
  287. fclr c07
  288. lda BO, 4 * SIZE(BO)
  289. fclr c11
  290. lda AO, 4 * SIZE(AO)
  291. fclr c15
  292. lds $f31, 4 * SIZE(C1)
  293. fclr c04
  294. lda L, -2(TMP1)
  295. fclr c08
  296. lds $f31, 4 * SIZE(C2)
  297. fclr c12
  298. fclr c16
  299. ble TMP1, $L18
  300. ble L, $L15
  301. #endif
  302. .align 5
  303. $L12:
  304. /* 1 */
  305. ADD1 c11, t1, c11
  306. #ifndef EV4
  307. ldq $31, PREFETCHSIZE * SIZE(AO)
  308. #else
  309. unop
  310. #endif
  311. MUL b1, a1, t1
  312. #ifndef EV4
  313. ldl $31, PREFETCHSIZE * SIZE(BO)
  314. #else
  315. unop
  316. #endif
  317. ADD3 c12, t2, c12
  318. unop
  319. MUL b1, a2, t2
  320. unop
  321. ADD2 c16, t3, c16
  322. unop
  323. MUL b2, a2, t3
  324. LD a5, 0 * SIZE(AO)
  325. ADD4 c15, t4, c15
  326. unop
  327. MUL b2, a1, t4
  328. LD b5, 0 * SIZE(BO)
  329. /* 2 */
  330. ADD1 c01, t1, c01
  331. UNOP
  332. MUL b1, a3, t1
  333. UNOP
  334. ADD3 c02, t2, c02
  335. UNOP
  336. MUL b1, a4, t2
  337. UNOP
  338. ADD2 c06, t3, c06
  339. unop
  340. MUL b2, a4, t3
  341. unop
  342. ADD4 c05, t4, c05
  343. unop
  344. MUL b4, a1, t4
  345. unop
  346. /* 3 */
  347. ADD1 c03, t1, c03
  348. unop
  349. MUL b3, a1, t1
  350. unop
  351. ADD3 c04, t2, c04
  352. unop
  353. MUL b3, a2, t2
  354. unop
  355. ADD2 c08, t3, c08
  356. unop
  357. MUL b4, a2, t3
  358. LD a2, 1 * SIZE(AO)
  359. ADD4 c13, t4, c13
  360. unop
  361. MUL b2, a3, t4
  362. LD b2, 1 * SIZE(BO)
  363. /* 4 */
  364. ADD1 c09, t1, c09
  365. unop
  366. MUL b3, a3, t1
  367. LD a6, 2 * SIZE(AO)
  368. ADD3 c10, t2, c10
  369. unop
  370. MUL b3, a4, t2
  371. LD b3, 2 * SIZE(BO)
  372. ADD2 c14, t3, c14
  373. unop
  374. MUL b4, a4, t3
  375. LD a4, 3 * SIZE(AO)
  376. ADD4 c07, t4, c07
  377. unop
  378. MUL b4, a3, t4
  379. LD b4, 3 * SIZE(BO)
  380. /* 5 */
  381. ADD1 c11, t1, c11
  382. unop
  383. MUL b5, a5, t1
  384. LD a1, 4 * SIZE(AO)
  385. ADD3 c12, t2, c12
  386. lda L, -2(L)
  387. MUL b5, a2, t2
  388. LD b1, 4 * SIZE(BO)
  389. ADD2 c16, t3, c16
  390. unop
  391. MUL b2, a2, t3
  392. unop
  393. ADD4 c15, t4, c15
  394. unop
  395. MUL b2, a5, t4
  396. unop
  397. /* 6 */
  398. ADD1 c01, t1, c01
  399. unop
  400. MUL b5, a6, t1
  401. unop
  402. ADD3 c02, t2, c02
  403. unop
  404. MUL b5, a4, t2
  405. unop
  406. ADD2 c06, t3, c06
  407. unop
  408. MUL b2, a4, t3
  409. unop
  410. ADD4 c05, t4, c05
  411. unop
  412. MUL b4, a5, t4
  413. unop
  414. /* 7 */
  415. ADD1 c03, t1, c03
  416. lda AO, 8 * SIZE(AO)
  417. MUL b3, a5, t1
  418. unop
  419. ADD3 c04, t2, c04
  420. lda BO, 8 * SIZE(BO)
  421. MUL b3, a2, t2
  422. unop
  423. ADD2 c08, t3, c08
  424. unop
  425. MUL b4, a2, t3
  426. LD a2, -3 * SIZE(AO)
  427. ADD4 c13, t4, c13
  428. unop
  429. MUL b2, a6, t4
  430. LD b2, -3 * SIZE(BO)
  431. /* 8 */
  432. ADD1 c09, t1, c09
  433. unop
  434. MUL b3, a6, t1
  435. LD a3, -2 * SIZE(AO)
  436. ADD3 c10, t2, c10
  437. unop
  438. MUL b3, a4, t2
  439. LD b3, -2 * SIZE(BO)
  440. ADD2 c14, t3, c14
  441. unop
  442. MUL b4, a4, t3
  443. LD a4, -1 * SIZE(AO)
  444. ADD4 c07, t4, c07
  445. MUL b4, a6, t4
  446. LD b4, -1 * SIZE(BO)
  447. bgt L, $L12
  448. .align 4
  449. $L15:
  450. ADD1 c11, t1, c11
  451. unop
  452. MUL b1, a1, t1
  453. #if defined(LT) || defined(RN)
  454. blbs KK, $L17
  455. #else
  456. blbs TMP1, $L17
  457. #endif
  458. .align 4
  459. ADD3 c12, t2, c12
  460. MUL b1, a2, t2
  461. ADD2 c16, t3, c16
  462. MUL b2, a2, t3
  463. ADD4 c15, t4, c15
  464. MUL b2, a1, t4
  465. ADD1 c01, t1, c01
  466. MUL b1, a3, t1
  467. ADD3 c02, t2, c02
  468. unop
  469. MUL b1, a4, t2
  470. LD b1, 0 * SIZE(BO)
  471. ADD2 c06, t3, c06
  472. MUL b2, a4, t3
  473. ADD4 c05, t4, c05
  474. MUL b4, a1, t4
  475. ADD1 c03, t1, c03
  476. unop
  477. MUL b3, a1, t1
  478. LD a1, 0 * SIZE(AO)
  479. ADD3 c04, t2, c04
  480. unop
  481. MUL b3, a2, t2
  482. unop
  483. ADD2 c08, t3, c08
  484. unop
  485. MUL b4, a2, t3
  486. LD a2, 1 * SIZE(AO)
  487. ADD4 c13, t4, c13
  488. unop
  489. MUL b2, a3, t4
  490. LD b2, 1 * SIZE(BO)
  491. ADD1 c09, t1, c09
  492. unop
  493. MUL b3, a3, t1
  494. lda AO, 4 * SIZE(AO)
  495. ADD3 c10, t2, c10
  496. unop
  497. MUL b3, a4, t2
  498. LD b3, 2 * SIZE(BO)
  499. ADD2 c14, t3, c14
  500. unop
  501. MUL b4, a4, t3
  502. LD a4, -1 * SIZE(AO)
  503. ADD4 c07, t4, c07
  504. unop
  505. MUL b4, a3, t4
  506. LD a3, -2 * SIZE(AO)
  507. ADD1 c11, t1, c11
  508. LD b4, 3 * SIZE(BO)
  509. MUL b1, a1, t1
  510. lda BO, 4 * SIZE(BO)
  511. .align 4
  512. $L17:
  513. ADD3 c12, t2, c12
  514. MUL b1, a2, t2
  515. ADD2 c16, t3, c16
  516. MUL b2, a2, t3
  517. ADD4 c15, t4, c15
  518. MUL b2, a1, t4
  519. ADD1 c01, t1, c01
  520. MUL b1, a3, t1
  521. ADD3 c02, t2, c02
  522. MUL b1, a4, t2
  523. ADD2 c06, t3, c06
  524. MUL b2, a4, t3
  525. ADD4 c05, t4, c05
  526. MUL b4, a1, t4
  527. ADD1 c03, t1, c03
  528. MUL b3, a1, t1
  529. ADD3 c04, t2, c04
  530. MUL b3, a2, t2
  531. ADD2 c08, t3, c08
  532. MUL b4, a2, t3
  533. ADD4 c13, t4, c13
  534. MUL b2, a3, t4
  535. ADD1 c09, t1, c09
  536. MUL b3, a3, t1
  537. ADD3 c10, t2, c10
  538. MUL b3, a4, t2
  539. ADD2 c14, t3, c14
  540. MUL b4, a4, t3
  541. ADD4 c07, t4, c07
  542. lda AO, 4 * SIZE(AO)
  543. MUL b4, a3, t4
  544. lda BO, 4 * SIZE(BO)
  545. ADD1 c11, t1, c11
  546. ADD3 c12, t2, c12
  547. ADD2 c16, t3, c16
  548. ADD4 c15, t4, c15
  549. ADD c01, c06, c01
  550. ADD c02, c05, c02
  551. ADD c03, c08, c03
  552. ADD c04, c07, c04
  553. ADD c09, c14, c09
  554. ADD c10, c13, c10
  555. ADD c11, c16, c11
  556. ADD c12, c15, c12
  557. .align 4
  558. $L18:
  559. #if defined(LN) || defined(RT)
  560. #ifdef LN
  561. subq KK, 2, TMP1
  562. #else
  563. subq KK, 2, TMP1
  564. #endif
  565. sll TMP1, ZBASE_SHIFT + 1, TMP2
  566. addq AORIG, TMP2, AO
  567. sll TMP1, ZBASE_SHIFT + 1, TMP2
  568. addq B, TMP2, BO
  569. #else
  570. lda AO, -4 * SIZE(AO)
  571. lda BO, -4 * SIZE(BO)
  572. #endif
  573. #if defined(LN) || defined(LT)
  574. LD a1, 0 * SIZE(BO)
  575. LD a2, 1 * SIZE(BO)
  576. LD a3, 2 * SIZE(BO)
  577. LD a4, 3 * SIZE(BO)
  578. LD b1, 4 * SIZE(BO)
  579. LD b2, 5 * SIZE(BO)
  580. LD b3, 6 * SIZE(BO)
  581. LD b4, 7 * SIZE(BO)
  582. SUB a1, c01, c01
  583. SUB a2, c02, c02
  584. SUB a3, c09, c09
  585. SUB a4, c10, c10
  586. SUB b1, c03, c03
  587. SUB b2, c04, c04
  588. SUB b3, c11, c11
  589. SUB b4, c12, c12
  590. #else
  591. LD a1, 0 * SIZE(AO)
  592. LD a2, 1 * SIZE(AO)
  593. LD a3, 2 * SIZE(AO)
  594. LD a4, 3 * SIZE(AO)
  595. LD b1, 4 * SIZE(AO)
  596. LD b2, 5 * SIZE(AO)
  597. LD b3, 6 * SIZE(AO)
  598. LD b4, 7 * SIZE(AO)
  599. SUB a1, c01, c01
  600. SUB a2, c02, c02
  601. SUB a3, c03, c03
  602. SUB a4, c04, c04
  603. SUB b1, c09, c09
  604. SUB b2, c10, c10
  605. SUB b3, c11, c11
  606. SUB b4, c12, c12
  607. #endif
  608. #ifdef LN
  609. LD a1, 6 * SIZE(AO)
  610. LD a2, 7 * SIZE(AO)
  611. LD a3, 4 * SIZE(AO)
  612. LD a4, 5 * SIZE(AO)
  613. MUL a2, c04, t1
  614. MUL a2, c03, t2
  615. MUL a2, c12, t3
  616. MUL a2, c11, t4
  617. MUL a1, c03, c03
  618. MUL a1, c04, c04
  619. MUL a1, c11, c11
  620. MUL a1, c12, c12
  621. ADD5 c03, t1, c03
  622. ADD6 c04, t2, c04
  623. ADD5 c11, t3, c11
  624. ADD6 c12, t4, c12
  625. MUL a3, c03, t1
  626. MUL a3, c04, t2
  627. MUL a3, c11, t3
  628. MUL a3, c12, t4
  629. SUB c01, t1, c01
  630. SUB c02, t2, c02
  631. SUB c09, t3, c09
  632. SUB c10, t4, c10
  633. MUL a4, c04, t1
  634. MUL a4, c03, t2
  635. MUL a4, c12, t3
  636. MUL a4, c11, t4
  637. ADD6 c01, t1, c01
  638. ADD5 c02, t2, c02
  639. ADD6 c09, t3, c09
  640. ADD5 c10, t4, c10
  641. LD a1, 0 * SIZE(AO)
  642. LD a2, 1 * SIZE(AO)
  643. MUL a2, c02, t1
  644. MUL a2, c01, t2
  645. MUL a2, c10, t3
  646. MUL a2, c09, t4
  647. MUL a1, c01, c01
  648. MUL a1, c02, c02
  649. MUL a1, c09, c09
  650. MUL a1, c10, c10
  651. ADD5 c01, t1, c01
  652. ADD6 c02, t2, c02
  653. ADD5 c09, t3, c09
  654. ADD6 c10, t4, c10
  655. #endif
  656. #ifdef LT
  657. LD a1, 0 * SIZE(AO)
  658. LD a2, 1 * SIZE(AO)
  659. LD a3, 2 * SIZE(AO)
  660. LD a4, 3 * SIZE(AO)
  661. MUL a2, c02, t1
  662. MUL a2, c01, t2
  663. MUL a2, c10, t3
  664. MUL a2, c09, t4
  665. MUL a1, c01, c01
  666. MUL a1, c02, c02
  667. MUL a1, c09, c09
  668. MUL a1, c10, c10
  669. ADD5 c01, t1, c01
  670. ADD6 c02, t2, c02
  671. ADD5 c09, t3, c09
  672. ADD6 c10, t4, c10
  673. MUL a3, c01, t1
  674. MUL a3, c02, t2
  675. MUL a3, c09, t3
  676. MUL a3, c10, t4
  677. SUB c03, t1, c03
  678. SUB c04, t2, c04
  679. SUB c11, t3, c11
  680. SUB c12, t4, c12
  681. MUL a4, c02, t1
  682. MUL a4, c01, t2
  683. MUL a4, c10, t3
  684. MUL a4, c09, t4
  685. ADD6 c03, t1, c03
  686. ADD5 c04, t2, c04
  687. ADD6 c11, t3, c11
  688. ADD5 c12, t4, c12
  689. LD a1, 6 * SIZE(AO)
  690. LD a2, 7 * SIZE(AO)
  691. MUL a2, c04, t1
  692. MUL a2, c03, t2
  693. MUL a2, c12, t3
  694. MUL a2, c11, t4
  695. MUL a1, c03, c03
  696. MUL a1, c04, c04
  697. MUL a1, c11, c11
  698. MUL a1, c12, c12
  699. ADD5 c03, t1, c03
  700. ADD6 c04, t2, c04
  701. ADD5 c11, t3, c11
  702. ADD6 c12, t4, c12
  703. #endif
  704. #ifdef RN
  705. LD a1, 0 * SIZE(BO)
  706. LD a2, 1 * SIZE(BO)
  707. LD a3, 2 * SIZE(BO)
  708. LD a4, 3 * SIZE(BO)
  709. MUL a2, c02, t1
  710. MUL a2, c01, t2
  711. MUL a2, c04, t3
  712. MUL a2, c03, t4
  713. MUL a1, c01, c01
  714. MUL a1, c02, c02
  715. MUL a1, c03, c03
  716. MUL a1, c04, c04
  717. ADD5 c01, t1, c01
  718. ADD6 c02, t2, c02
  719. ADD5 c03, t3, c03
  720. ADD6 c04, t4, c04
  721. MUL a3, c01, t1
  722. MUL a3, c02, t2
  723. MUL a3, c03, t3
  724. MUL a3, c04, t4
  725. SUB c09, t1, c09
  726. SUB c10, t2, c10
  727. SUB c11, t3, c11
  728. SUB c12, t4, c12
  729. MUL a4, c02, t1
  730. MUL a4, c01, t2
  731. MUL a4, c04, t3
  732. MUL a4, c03, t4
  733. ADD6 c09, t1, c09
  734. ADD5 c10, t2, c10
  735. ADD6 c11, t3, c11
  736. ADD5 c12, t4, c12
  737. LD a1, 6 * SIZE(BO)
  738. LD a2, 7 * SIZE(BO)
  739. MUL a2, c10, t1
  740. MUL a2, c09, t2
  741. MUL a2, c12, t3
  742. MUL a2, c11, t4
  743. MUL a1, c09, c09
  744. MUL a1, c10, c10
  745. MUL a1, c11, c11
  746. MUL a1, c12, c12
  747. ADD5 c09, t1, c09
  748. ADD6 c10, t2, c10
  749. ADD5 c11, t3, c11
  750. ADD6 c12, t4, c12
  751. #endif
  752. #ifdef RT
  753. LD a1, 6 * SIZE(BO)
  754. LD a2, 7 * SIZE(BO)
  755. LD a3, 4 * SIZE(BO)
  756. LD a4, 5 * SIZE(BO)
  757. MUL a2, c10, t1
  758. MUL a2, c09, t2
  759. MUL a2, c12, t3
  760. MUL a2, c11, t4
  761. MUL a1, c09, c09
  762. MUL a1, c10, c10
  763. MUL a1, c11, c11
  764. MUL a1, c12, c12
  765. ADD5 c09, t1, c09
  766. ADD6 c10, t2, c10
  767. ADD5 c11, t3, c11
  768. ADD6 c12, t4, c12
  769. MUL a3, c09, t1
  770. MUL a3, c10, t2
  771. MUL a3, c11, t3
  772. MUL a3, c12, t4
  773. SUB c01, t1, c01
  774. SUB c02, t2, c02
  775. SUB c03, t3, c03
  776. SUB c04, t4, c04
  777. MUL a4, c10, t1
  778. MUL a4, c09, t2
  779. MUL a4, c12, t3
  780. MUL a4, c11, t4
  781. ADD6 c01, t1, c01
  782. ADD5 c02, t2, c02
  783. ADD6 c03, t3, c03
  784. ADD5 c04, t4, c04
  785. LD a1, 0 * SIZE(BO)
  786. LD a2, 1 * SIZE(BO)
  787. MUL a2, c02, t1
  788. MUL a2, c01, t2
  789. MUL a2, c04, t3
  790. MUL a2, c03, t4
  791. MUL a1, c01, c01
  792. MUL a1, c02, c02
  793. MUL a1, c03, c03
  794. MUL a1, c04, c04
  795. ADD5 c01, t1, c01
  796. ADD6 c02, t2, c02
  797. ADD5 c03, t3, c03
  798. ADD6 c04, t4, c04
  799. #endif
  800. #if defined(LN) || defined(LT)
  801. ST c01, 0 * SIZE(BO)
  802. ST c02, 1 * SIZE(BO)
  803. ST c09, 2 * SIZE(BO)
  804. ST c10, 3 * SIZE(BO)
  805. ST c03, 4 * SIZE(BO)
  806. ST c04, 5 * SIZE(BO)
  807. ST c11, 6 * SIZE(BO)
  808. ST c12, 7 * SIZE(BO)
  809. #else
  810. ST c01, 0 * SIZE(AO)
  811. ST c02, 1 * SIZE(AO)
  812. ST c03, 2 * SIZE(AO)
  813. ST c04, 3 * SIZE(AO)
  814. ST c09, 4 * SIZE(AO)
  815. ST c10, 5 * SIZE(AO)
  816. ST c11, 6 * SIZE(AO)
  817. ST c12, 7 * SIZE(AO)
  818. #endif
  819. #ifdef LN
  820. lda C1, -4 * SIZE(C1)
  821. lda C2, -4 * SIZE(C2)
  822. #endif
  823. ST c01, 0 * SIZE(C1)
  824. ST c02, 1 * SIZE(C1)
  825. ST c03, 2 * SIZE(C1)
  826. ST c04, 3 * SIZE(C1)
  827. ST c09, 0 * SIZE(C2)
  828. ST c10, 1 * SIZE(C2)
  829. ST c11, 2 * SIZE(C2)
  830. ST c12, 3 * SIZE(C2)
  831. #ifndef LN
  832. lda C1, 4 * SIZE(C1)
  833. lda C2, 4 * SIZE(C2)
  834. #endif
  835. fclr t1
  836. fclr t2
  837. fclr t3
  838. fclr t4
  839. #ifdef RT
  840. sll K, ZBASE_SHIFT + 1, TMP1
  841. addq AORIG, TMP1, AORIG
  842. #endif
  843. #if defined(LT) || defined(RN)
  844. subq K, KK, TMP1
  845. sll TMP1, ZBASE_SHIFT + 1, TMP1
  846. addq AO, TMP1, AO
  847. addq BO, TMP1, BO
  848. #endif
  849. #ifdef LT
  850. addq KK, 2, KK
  851. #endif
  852. #ifdef LN
  853. subq KK, 2, KK
  854. #endif
  855. fclr c01
  856. fclr c05
  857. lda I, -1(I)
  858. bgt I, $L11
  859. .align 4
  860. $L20:
  861. and M, 1, I
  862. ble I, $L29
  863. #if defined(LT) || defined(RN)
  864. LD a1, 0 * SIZE(AO)
  865. fclr c09
  866. LD a2, 1 * SIZE(AO)
  867. fclr c13
  868. LD a3, 2 * SIZE(AO)
  869. fclr c02
  870. LD a4, 3 * SIZE(AO)
  871. fclr c06
  872. LD b1, 0 * SIZE(B)
  873. fclr c10
  874. LD b2, 1 * SIZE(B)
  875. fclr c14
  876. LD b3, 2 * SIZE(B)
  877. lda AO, 2 * SIZE(AO)
  878. LD b4, 3 * SIZE(B)
  879. lda BO, 4 * SIZE(B)
  880. lda L, -2(KK)
  881. ble KK, $L28
  882. ble L, $L25
  883. #else
  884. #ifdef LN
  885. sll K, ZBASE_SHIFT + 0, TMP1
  886. subq AORIG, TMP1, AORIG
  887. #endif
  888. sll KK, ZBASE_SHIFT + 0, TMP1
  889. addq AORIG, TMP1, AO
  890. sll KK, ZBASE_SHIFT + 1, TMP1
  891. addq B, TMP1, BO
  892. subq K, KK, TMP1
  893. LD a1, 0 * SIZE(AO)
  894. fclr c09
  895. LD a2, 1 * SIZE(AO)
  896. fclr c13
  897. LD a3, 2 * SIZE(AO)
  898. fclr c02
  899. LD a4, 3 * SIZE(AO)
  900. fclr c06
  901. LD b1, 0 * SIZE(BO)
  902. fclr c10
  903. LD b2, 1 * SIZE(BO)
  904. fclr c14
  905. LD b3, 2 * SIZE(BO)
  906. lda AO, 2 * SIZE(AO)
  907. LD b4, 3 * SIZE(BO)
  908. lda BO, 4 * SIZE(BO)
  909. lda L, -2(TMP1)
  910. ble TMP1, $L28
  911. ble L, $L25
  912. #endif
  913. .align 5
  914. $L22:
  915. ADD1 c09, t1, c09
  916. unop
  917. MUL a1, b1, t1
  918. unop
  919. ADD3 c10, t2, c10
  920. unop
  921. MUL a2, b1, t2
  922. LD b1, 0 * SIZE(BO)
  923. ADD4 c13, t3, c13
  924. unop
  925. MUL a1, b2, t3
  926. lda BO, 8 * SIZE(BO)
  927. ADD2 c14, t4, c14
  928. unop
  929. MUL a2, b2, t4
  930. LD b2, -7 * SIZE(BO)
  931. ADD1 c01, t1, c01
  932. unop
  933. MUL a1, b3, t1
  934. unop
  935. ADD3 c02, t2, c02
  936. unop
  937. MUL a2, b3, t2
  938. LD b3, -6 * SIZE(BO)
  939. ADD4 c05, t3, c05
  940. unop
  941. MUL a1, b4, t3
  942. LD a1, 2 * SIZE(AO)
  943. ADD2 c06, t4, c06
  944. MUL a2, b4, t4
  945. LD b5, -5 * SIZE(BO)
  946. ADD1 c09, t1, c09
  947. unop
  948. MUL a3, b1, t1
  949. LD a2, 3 * SIZE(AO)
  950. ADD3 c10, t2, c10
  951. unop
  952. MUL a4, b1, t2
  953. LD b1, -4 * SIZE(BO)
  954. ADD4 c13, t3, c13
  955. unop
  956. MUL a3, b2, t3
  957. lda AO, 4 * SIZE(AO)
  958. ADD2 c14, t4, c14
  959. MUL a4, b2, t4
  960. LD b2, -3 * SIZE(BO)
  961. ADD1 c01, t1, c01
  962. lda L, -2(L)
  963. MUL a3, b3, t1
  964. LD b4, -1 * SIZE(BO)
  965. ADD3 c02, t2, c02
  966. unop
  967. MUL a4, b3, t2
  968. LD b3, -2 * SIZE(BO)
  969. ADD4 c05, t3, c05
  970. unop
  971. MUL a3, b5, t3
  972. LD a3, 0 * SIZE(AO)
  973. ADD2 c06, t4, c06
  974. MUL a4, b5, t4
  975. LD a4, 1 * SIZE(AO)
  976. bgt L, $L22
  977. .align 4
  978. $L25:
  979. ADD1 c09, t1, c09
  980. MUL a1, b1, t1
  981. #if defined(LT) || defined(RN)
  982. blbs KK, $L27
  983. #else
  984. blbs TMP1, $L27
  985. #endif
  986. .align 4
  987. ADD3 c10, t2, c10
  988. unop
  989. MUL a2, b1, t2
  990. LD b1, 0 * SIZE(BO)
  991. ADD4 c13, t3, c13
  992. unop
  993. MUL a1, b2, t3
  994. unop
  995. ADD2 c14, t4, c14
  996. unop
  997. MUL a2, b2, t4
  998. LD b2, 1 * SIZE(BO)
  999. ADD1 c01, t1, c01
  1000. unop
  1001. MUL a1, b3, t1
  1002. lda AO, 2 * SIZE(AO)
  1003. ADD3 c02, t2, c02
  1004. unop
  1005. MUL a2, b3, t2
  1006. LD b3, 2 * SIZE(BO)
  1007. ADD4 c05, t3, c05
  1008. unop
  1009. MUL a1, b4, t3
  1010. LD a1, -2 * SIZE(AO)
  1011. ADD2 c06, t4, c06
  1012. unop
  1013. MUL a2, b4, t4
  1014. LD a2, -1 * SIZE(AO)
  1015. ADD1 c09, t1, c09
  1016. LD b4, 3 * SIZE(BO)
  1017. MUL a1, b1, t1
  1018. lda BO, 4 * SIZE(BO)
  1019. .align 4
  1020. $L27:
  1021. ADD3 c10, t2, c10
  1022. MUL a2, b1, t2
  1023. ADD4 c13, t3, c13
  1024. MUL a1, b2, t3
  1025. ADD2 c14, t4, c14
  1026. MUL a2, b2, t4
  1027. ADD1 c01, t1, c01
  1028. MUL a1, b3, t1
  1029. ADD3 c02, t2, c02
  1030. MUL a2, b3, t2
  1031. ADD4 c05, t3, c05
  1032. MUL a1, b4, t3
  1033. ADD2 c06, t4, c06
  1034. lda AO, 2 * SIZE(AO)
  1035. MUL a2, b4, t4
  1036. lda BO, 4 * SIZE(BO)
  1037. ADD1 c09, t1, c09
  1038. ADD3 c10, t2, c10
  1039. ADD4 c13, t3, c13
  1040. ADD2 c14, t4, c14
  1041. ADD c01, c06, c01
  1042. ADD c02, c05, c02
  1043. ADD c09, c14, c09
  1044. ADD c10, c13, c10
  1045. .align 4
  1046. $L28:
  1047. #if defined(LN) || defined(RT)
  1048. #ifdef LN
  1049. subq KK, 1, TMP1
  1050. #else
  1051. subq KK, 2, TMP1
  1052. #endif
  1053. sll TMP1, ZBASE_SHIFT + 0, TMP2
  1054. addq AORIG, TMP2, AO
  1055. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1056. addq B, TMP2, BO
  1057. #else
  1058. lda AO, -2 * SIZE(AO)
  1059. lda BO, -4 * SIZE(BO)
  1060. #endif
  1061. #if defined(LN) || defined(LT)
  1062. LD a1, 0 * SIZE(BO)
  1063. LD a2, 1 * SIZE(BO)
  1064. LD a3, 2 * SIZE(BO)
  1065. LD a4, 3 * SIZE(BO)
  1066. SUB a1, c01, c01
  1067. SUB a2, c02, c02
  1068. SUB a3, c09, c09
  1069. SUB a4, c10, c10
  1070. #else
  1071. LD a1, 0 * SIZE(AO)
  1072. LD a2, 1 * SIZE(AO)
  1073. LD a3, 2 * SIZE(AO)
  1074. LD a4, 3 * SIZE(AO)
  1075. SUB a1, c01, c01
  1076. SUB a2, c02, c02
  1077. SUB a3, c09, c09
  1078. SUB a4, c10, c10
  1079. #endif
  1080. #if defined(LN) || defined(LT)
  1081. LD a1, 0 * SIZE(AO)
  1082. LD a2, 1 * SIZE(AO)
  1083. MUL a2, c02, t1
  1084. MUL a2, c01, t2
  1085. MUL a2, c10, t3
  1086. MUL a2, c09, t4
  1087. MUL a1, c01, c01
  1088. MUL a1, c02, c02
  1089. MUL a1, c09, c09
  1090. MUL a1, c10, c10
  1091. ADD5 c01, t1, c01
  1092. ADD6 c02, t2, c02
  1093. ADD5 c09, t3, c09
  1094. ADD6 c10, t4, c10
  1095. #endif
  1096. #ifdef RN
  1097. LD a1, 0 * SIZE(BO)
  1098. LD a2, 1 * SIZE(BO)
  1099. LD a3, 2 * SIZE(BO)
  1100. LD a4, 3 * SIZE(BO)
  1101. MUL a2, c02, t1
  1102. MUL a2, c01, t2
  1103. MUL a1, c01, c01
  1104. MUL a1, c02, c02
  1105. ADD5 c01, t1, c01
  1106. ADD6 c02, t2, c02
  1107. MUL a3, c01, t1
  1108. MUL a3, c02, t2
  1109. SUB c09, t1, c09
  1110. SUB c10, t2, c10
  1111. MUL a4, c02, t1
  1112. MUL a4, c01, t2
  1113. ADD6 c09, t1, c09
  1114. ADD5 c10, t2, c10
  1115. LD a1, 6 * SIZE(BO)
  1116. LD a2, 7 * SIZE(BO)
  1117. MUL a2, c10, t1
  1118. MUL a2, c09, t2
  1119. MUL a1, c09, c09
  1120. MUL a1, c10, c10
  1121. ADD5 c09, t1, c09
  1122. ADD6 c10, t2, c10
  1123. #endif
  1124. #ifdef RT
  1125. LD a1, 6 * SIZE(BO)
  1126. LD a2, 7 * SIZE(BO)
  1127. LD a3, 4 * SIZE(BO)
  1128. LD a4, 5 * SIZE(BO)
  1129. MUL a2, c10, t1
  1130. MUL a2, c09, t2
  1131. MUL a1, c09, c09
  1132. MUL a1, c10, c10
  1133. ADD5 c09, t1, c09
  1134. ADD6 c10, t2, c10
  1135. MUL a3, c09, t1
  1136. MUL a3, c10, t2
  1137. SUB c01, t1, c01
  1138. SUB c02, t2, c02
  1139. MUL a4, c10, t1
  1140. MUL a4, c09, t2
  1141. ADD6 c01, t1, c01
  1142. ADD5 c02, t2, c02
  1143. LD a1, 0 * SIZE(BO)
  1144. LD a2, 1 * SIZE(BO)
  1145. MUL a2, c02, t1
  1146. MUL a2, c01, t2
  1147. MUL a1, c01, c01
  1148. MUL a1, c02, c02
  1149. ADD5 c01, t1, c01
  1150. ADD6 c02, t2, c02
  1151. #endif
  1152. #if defined(LN) || defined(LT)
  1153. ST c01, 0 * SIZE(BO)
  1154. ST c02, 1 * SIZE(BO)
  1155. ST c09, 2 * SIZE(BO)
  1156. ST c10, 3 * SIZE(BO)
  1157. #else
  1158. ST c01, 0 * SIZE(AO)
  1159. ST c02, 1 * SIZE(AO)
  1160. ST c09, 2 * SIZE(AO)
  1161. ST c10, 3 * SIZE(AO)
  1162. #endif
  1163. #ifdef LN
  1164. lda C1, -2 * SIZE(C1)
  1165. lda C2, -2 * SIZE(C2)
  1166. #endif
  1167. ST c01, 0 * SIZE(C1)
  1168. ST c02, 1 * SIZE(C1)
  1169. ST c09, 0 * SIZE(C2)
  1170. ST c10, 1 * SIZE(C2)
  1171. #ifndef LN
  1172. lda C1, 2 * SIZE(C1)
  1173. lda C2, 2 * SIZE(C2)
  1174. #endif
  1175. #ifdef RT
  1176. sll K, ZBASE_SHIFT, TMP1
  1177. addq AORIG, TMP1, AORIG
  1178. #endif
  1179. #if defined(LT) || defined(RN)
  1180. subq K, KK, TMP1
  1181. sll TMP1, ZBASE_SHIFT + 0, TMP2
  1182. addq AO, TMP2, AO
  1183. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1184. addq BO, TMP2, BO
  1185. #endif
  1186. #ifdef LT
  1187. addq KK, 1, KK
  1188. #endif
  1189. #ifdef LN
  1190. subq KK, 1, KK
  1191. #endif
  1192. .align 4
  1193. $L29:
  1194. #ifdef LN
  1195. sll K, ZBASE_SHIFT + 1, TMP1
  1196. addq B, TMP1, B
  1197. #endif
  1198. #if defined(LT) || defined(RN)
  1199. mov BO, B
  1200. #endif
  1201. #ifdef RN
  1202. addq KK, 2, KK
  1203. #endif
  1204. #ifdef RT
  1205. subq KK, 2, KK
  1206. #endif
  1207. lda J, -1(J)
  1208. bgt J, $L01
  1209. .align 4
  1210. $L30:
  1211. and N, 1, J
  1212. ble J, $L999
  1213. #ifdef RT
  1214. sll K, ZBASE_SHIFT, TMP1
  1215. subq B, TMP1, B
  1216. subq C, LDC, C1
  1217. subq C, LDC, C
  1218. #else
  1219. mov C, C1
  1220. addq C, LDC, C
  1221. #endif
  1222. #ifdef LN
  1223. addq M, OFFSET, KK
  1224. #endif
  1225. #ifdef LT
  1226. mov OFFSET, KK
  1227. #endif
  1228. #if defined(LN) || defined(RT)
  1229. mov A, AORIG
  1230. #else
  1231. mov A, AO
  1232. #endif
  1233. sra M, 1, I
  1234. ble I, $L50
  1235. .align 4
  1236. $L41:
  1237. #if defined(LT) || defined(RN)
  1238. LD a1, 0 * SIZE(AO)
  1239. fclr t1
  1240. LD a2, 1 * SIZE(AO)
  1241. fclr t2
  1242. LD a3, 2 * SIZE(AO)
  1243. fclr t3
  1244. LD a4, 3 * SIZE(AO)
  1245. fclr t4
  1246. LD b1, 0 * SIZE(B)
  1247. fclr c01
  1248. LD b2, 1 * SIZE(B)
  1249. fclr c05
  1250. LD b3, 2 * SIZE(B)
  1251. fclr c02
  1252. LD b4, 3 * SIZE(B)
  1253. fclr c06
  1254. lda BO, 2 * SIZE(B)
  1255. fclr c03
  1256. lda AO, 4 * SIZE(AO)
  1257. fclr c07
  1258. lda L, -2(KK)
  1259. fclr c04
  1260. fclr c08
  1261. ble KK, $L48
  1262. ble L, $L45
  1263. #else
  1264. #ifdef LN
  1265. sll K, ZBASE_SHIFT + 1, TMP1
  1266. subq AORIG, TMP1, AORIG
  1267. #endif
  1268. sll KK, ZBASE_SHIFT + 1, TMP1
  1269. addq AORIG, TMP1, AO
  1270. sll KK, ZBASE_SHIFT, TMP1
  1271. addq B, TMP1, BO
  1272. subq K, KK, TMP1
  1273. LD a1, 0 * SIZE(AO)
  1274. fclr t1
  1275. LD a2, 1 * SIZE(AO)
  1276. fclr t2
  1277. LD a3, 2 * SIZE(AO)
  1278. fclr t3
  1279. LD a4, 3 * SIZE(AO)
  1280. fclr t4
  1281. LD b1, 0 * SIZE(BO)
  1282. fclr c01
  1283. LD b2, 1 * SIZE(BO)
  1284. fclr c05
  1285. LD b3, 2 * SIZE(BO)
  1286. fclr c02
  1287. LD b4, 3 * SIZE(BO)
  1288. fclr c06
  1289. lda BO, 2 * SIZE(BO)
  1290. fclr c03
  1291. lda AO, 4 * SIZE(AO)
  1292. fclr c07
  1293. lda L, -2(TMP1)
  1294. fclr c04
  1295. fclr c08
  1296. ble TMP1, $L48
  1297. ble L, $L45
  1298. #endif
  1299. .align 5
  /*
   * $L42: unrolled multiply-accumulate loop over accumulators c01..c08.
   *
   * One pass issues sixteen MULs: a1..a4 against b1 and b2, then a1..a3
   * and a5 against b3 and b4.  It advances BO by 4 * SIZE and AO by
   * 8 * SIZE, lowers L by 2, and repeats while L > 0.
   *
   * The loop is software-pipelined.  Every MUL writes one of t1..t4 and a
   * later ADDn folds that temporary into an accumulator; the four ADDs at
   * the top consume whatever was left in t1..t4 by the code that ran
   * before (the previous pass, or the prologue).  Operands are reloaded
   * right after their last use, so the instruction order must be kept.
   *
   * ADD1..ADD4, MUL and LD are macros defined outside this chunk.
   * NOTE(review): presumably the ADDn variants choose add or subtract
   * for the conjugation mode; confirm against the macro definitions.
   * The unop entries are no-ops, presumably kept for issue-slot padding.
   */
  1300. $L42:
  1301. ADD4 c05, t1, c05
  1302. unop
  1303. MUL a1, b1, t1
  1304. unop
  1305. ADD2 c06, t2, c06
  1306. lda L, -2(L) /* L -= 2 */
  1307. MUL a2, b1, t2
  1308. unop
  1309. ADD4 c07, t3, c07
  1310. unop
  1311. MUL a3, b1, t3
  1312. unop
  1313. ADD2 c08, t4, c08
  1314. unop
  1315. MUL a4, b1, t4
  1316. LD b1, 2 * SIZE(BO) /* b1 is free after its fourth MUL: reload */
  1317. ADD1 c01, t1, c01
  1318. unop
  1319. MUL a1, b2, t1
  1320. LD a1, 0 * SIZE(AO)
  1321. ADD3 c02, t2, c02
  1322. lda BO, 4 * SIZE(BO) /* BO offsets below are relative to the new BO */
  1323. MUL a2, b2, t2
  1324. LD a2, 1 * SIZE(AO)
  1325. ADD1 c03, t3, c03
  1326. unop
  1327. MUL a3, b2, t3
  1328. LD a3, 2 * SIZE(AO)
  1329. ADD3 c04, t4, c04
  1330. unop
  1331. MUL a4, b2, t4
  1332. LD a5, 3 * SIZE(AO) /* second half uses a5 where the first used a4 */
  1333. ADD4 c05, t1, c05
  1334. unop
  1335. MUL a1, b3, t1
  1336. LD b2, -1 * SIZE(BO)
  1337. ADD2 c06, t2, c06
  1338. unop
  1339. MUL a2, b3, t2
  1340. unop
  1341. ADD4 c07, t3, c07
  1342. unop
  1343. MUL a3, b3, t3
  1344. lda AO, 8 * SIZE(AO) /* AO offsets below are relative to the new AO */
  1345. ADD2 c08, t4, c08
  1346. unop
  1347. MUL a5, b3, t4
  1348. LD b3, 0 * SIZE(BO)
  1349. ADD1 c01, t1, c01
  1350. unop
  1351. MUL a1, b4, t1
  1352. LD a1, -4 * SIZE(AO)
  1353. ADD3 c02, t2, c02
  1354. unop
  1355. MUL a2, b4, t2
  1356. LD a2, -3 * SIZE(AO)
  1357. ADD1 c03, t3, c03
  1358. LD a4, -1 * SIZE(AO) /* a4 reloaded for the code that runs next */
  1359. MUL a3, b4, t3
  1360. LD a3, -2 * SIZE(AO)
  1361. ADD3 c04, t4, c04
  1362. MUL a5, b4, t4
  1363. LD b4, 1 * SIZE(BO)
  1364. bgt L, $L42 /* repeat while L > 0 */
  1365. .align 4
  /*
   * $L45: remainder handling after (or instead of) the $L42 loop.
   *
   * Folds the pending t1 into c05 and starts the next a1 * b1 product,
   * then tests the low bit of the step count: KK for LT/RN, TMP1 for the
   * other variants.  If the bit is set, control goes straight to $L47.
   * Otherwise the fall-through code finishes the step just started,
   * reloads a1..a4 and b1, b2, advances AO by 4 * SIZE and BO by
   * 2 * SIZE, and starts the following a1 * b1 product before dropping
   * into $L47.
   */
  1366. $L45:
  1367. ADD4 c05, t1, c05
  1368. MUL b1, a1, t1
  1369. #if defined(LT) || defined(RN)
  1370. blbs KK, $L47 /* low bit of KK set: skip the extra step */
  1371. #else
  1372. blbs TMP1, $L47 /* low bit of TMP1 set: skip the extra step */
  1373. #endif
  1374. .align 4
  1375. ADD2 c06, t2, c06
  1376. MUL a2, b1, t2
  1377. ADD4 c07, t3, c07
  1378. MUL a3, b1, t3
  1379. ADD2 c08, t4, c08
  1380. unop
  1381. MUL a4, b1, t4
  1382. LD b1, 0 * SIZE(BO)
  1383. ADD1 c01, t1, c01
  1384. unop
  1385. MUL a1, b2, t1
  1386. LD a1, 0 * SIZE(AO)
  1387. ADD3 c02, t2, c02
  1388. unop
  1389. MUL a2, b2, t2
  1390. LD a2, 1 * SIZE(AO)
  1391. ADD1 c03, t3, c03
  1392. unop
  1393. MUL a3, b2, t3
  1394. LD a3, 2 * SIZE(AO)
  1395. ADD3 c04, t4, c04
  1396. MUL a4, b2, t4
  1397. LD a4, 3 * SIZE(AO)
  1398. lda AO, 4 * SIZE(AO)
  1399. ADD4 c05, t1, c05
  1400. LD b2, 1 * SIZE(BO)
  1401. MUL a1, b1, t1 /* first product of the step completed in $L47 */
  1402. lda BO, 2 * SIZE(BO)
  1403. .align 4
  /*
   * $L47: final step and reduction for the c01..c08 accumulators.
   *
   * On entry t1 already holds the a1 * b1 product of the last step.  The
   * code issues the remaining seven MULs of that step, advances AO by
   * 4 * SIZE and BO by 2 * SIZE, drains t1..t4 into c05..c08, and then
   * combines the two accumulator sets pairwise: c01 with c06, c02 with
   * c05, c03 with c08, c04 with c07.  Execution falls through to $L48.
   *
   * NOTE(review): ADD and ADD1..ADD4 are macros defined outside this
   * chunk; the exact sign each one applies is not visible here.
   */
  1404. $L47:
  1405. ADD2 c06, t2, c06
  1406. MUL a2, b1, t2
  1407. ADD4 c07, t3, c07
  1408. MUL a3, b1, t3
  1409. ADD2 c08, t4, c08
  1410. MUL a4, b1, t4
  1411. ADD1 c01, t1, c01
  1412. MUL a1, b2, t1
  1413. ADD3 c02, t2, c02
  1414. MUL a2, b2, t2
  1415. ADD1 c03, t3, c03
  1416. MUL a3, b2, t3
  1417. ADD3 c04, t4, c04
  1418. lda AO, 4 * SIZE(AO)
  1419. MUL a4, b2, t4
  1420. lda BO, 2 * SIZE(BO)
  /* Drain the last four products. */
  1421. ADD4 c05, t1, c05
  1422. ADD2 c06, t2, c06
  1423. ADD4 c07, t3, c07
  1424. ADD2 c08, t4, c08
  /* Merge c05..c08 into c01..c04. */
  1425. ADD c01, c06, c01
  1426. ADD c02, c05, c02
  1427. ADD c03, c08, c03
  1428. ADD c04, c07, c04
  /*
   * $L48: solve and write back the tile held in c01..c04, then move on
   * to the next tile.  Also reached directly from the prologue when the
   * step count is not positive.
   *
   * Sequence: (1) position AO and BO, (2) load four values from the
   * packed buffer and combine them with c01..c04 via SUB, (3) apply the
   * variant-specific solve, (4) store the result to the packed buffer
   * and to C1, (5) update AO/BO/AORIG/KK, (6) decrement I and loop to
   * $L41 while I > 0.
   *
   * NOTE(review): SUB, MUL, ADD5 and ADD6 are macros defined outside
   * this chunk.  The MUL/MUL/MUL/MUL/ADD5/ADD6 groups have the shape of
   * a complex multiply of a (real, imaginary) register pair by a pair
   * loaded from memory; presumably the loaded diagonal entries are
   * already inverted by the packing code - confirm before relying on it.
   */
  1429. $L48:
  /* (1) LN/RT: recompute AO and BO from AORIG and B using KK - 2 (LN) or
     KK - 1 (otherwise); other variants step AO and BO back by the
     amounts $L47 advanced them. */
  1430. #if defined(LN) || defined(RT)
  1431. #ifdef LN
  1432. subq KK, 2, TMP1
  1433. #else
  1434. subq KK, 1, TMP1
  1435. #endif
  1436. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1437. addq AORIG, TMP2, AO
  1438. sll TMP1, ZBASE_SHIFT, TMP2
  1439. addq B, TMP2, BO
  1440. #else
  1441. lda AO, -4 * SIZE(AO)
  1442. lda BO, -2 * SIZE(BO)
  1443. #endif
  /* (2) LN/LT read the four values from BO, RN/RT read them from AO. */
  1444. #if defined(LN) || defined(LT)
  1445. LD a1, 0 * SIZE(BO)
  1446. LD a2, 1 * SIZE(BO)
  1447. LD a3, 2 * SIZE(BO)
  1448. LD a4, 3 * SIZE(BO)
  1449. SUB a1, c01, c01
  1450. SUB a2, c02, c02
  1451. SUB a3, c03, c03
  1452. SUB a4, c04, c04
  1453. #else
  1454. LD a1, 0 * SIZE(AO)
  1455. LD a2, 1 * SIZE(AO)
  1456. LD a3, 2 * SIZE(AO)
  1457. LD a4, 3 * SIZE(AO)
  1458. SUB a1, c01, c01
  1459. SUB a2, c02, c02
  1460. SUB a3, c03, c03
  1461. SUB a4, c04, c04
  1462. #endif
  /* (3) LN: the pair at AO[6..7] scales (c03, c04); the pair at AO[4..5]
     times the new (c03, c04) is folded into (c01, c02); the pair at
     AO[0..1] then scales (c01, c02). */
  1463. #ifdef LN
  1464. LD a1, 6 * SIZE(AO)
  1465. LD a2, 7 * SIZE(AO)
  1466. LD a3, 4 * SIZE(AO)
  1467. LD a4, 5 * SIZE(AO)
  1468. MUL a2, c04, t1
  1469. MUL a2, c03, t2
  1470. MUL a1, c03, c03
  1471. MUL a1, c04, c04
  1472. ADD5 c03, t1, c03
  1473. ADD6 c04, t2, c04
  1474. MUL a3, c03, t1
  1475. MUL a3, c04, t2
  1476. SUB c01, t1, c01
  1477. SUB c02, t2, c02
  1478. MUL a4, c04, t1
  1479. MUL a4, c03, t2
  1480. ADD6 c01, t1, c01
  1481. ADD5 c02, t2, c02
  1482. LD a1, 0 * SIZE(AO)
  1483. LD a2, 1 * SIZE(AO)
  1484. MUL a2, c02, t1
  1485. MUL a2, c01, t2
  1486. MUL a1, c01, c01
  1487. MUL a1, c02, c02
  1488. ADD5 c01, t1, c01
  1489. ADD6 c02, t2, c02
  1490. #endif
  /* (3) LT: the opposite order - the pair at AO[0..1] scales (c01, c02);
     the pair at AO[2..3] times the new (c01, c02) is folded into
     (c03, c04); the pair at AO[6..7] then scales (c03, c04). */
  1491. #ifdef LT
  1492. LD a1, 0 * SIZE(AO)
  1493. LD a2, 1 * SIZE(AO)
  1494. LD a3, 2 * SIZE(AO)
  1495. LD a4, 3 * SIZE(AO)
  1496. MUL a2, c02, t1
  1497. MUL a2, c01, t2
  1498. MUL a1, c01, c01
  1499. MUL a1, c02, c02
  1500. ADD5 c01, t1, c01
  1501. ADD6 c02, t2, c02
  1502. MUL a3, c01, t1
  1503. MUL a3, c02, t2
  1504. SUB c03, t1, c03
  1505. SUB c04, t2, c04
  1506. MUL a4, c02, t1
  1507. MUL a4, c01, t2
  1508. ADD6 c03, t1, c03
  1509. ADD5 c04, t2, c04
  1510. LD a1, 6 * SIZE(AO)
  1511. LD a2, 7 * SIZE(AO)
  1512. MUL a2, c04, t1
  1513. MUL a2, c03, t2
  1514. MUL a1, c03, c03
  1515. MUL a1, c04, c04
  1516. ADD5 c03, t1, c03
  1517. ADD6 c04, t2, c04
  1518. #endif
  /* (3) RN/RT: one pair at BO[0..1] scales both (c01, c02) and
     (c03, c04). */
  1519. #if defined(RN) || defined(RT)
  1520. LD a1, 0 * SIZE(BO)
  1521. LD a2, 1 * SIZE(BO)
  1522. MUL a2, c02, t1
  1523. MUL a2, c01, t2
  1524. MUL a2, c04, t3
  1525. MUL a2, c03, t4
  1526. MUL a1, c01, c01
  1527. MUL a1, c02, c02
  1528. MUL a1, c03, c03
  1529. MUL a1, c04, c04
  1530. ADD5 c01, t1, c01
  1531. ADD6 c02, t2, c02
  1532. ADD5 c03, t3, c03
  1533. ADD6 c04, t4, c04
  1534. #endif
  /* (4) Write the result back to the buffer it was read from ... */
  1535. #if defined(LN) || defined(LT)
  1536. ST c01, 0 * SIZE(BO)
  1537. ST c02, 1 * SIZE(BO)
  1538. ST c03, 2 * SIZE(BO)
  1539. ST c04, 3 * SIZE(BO)
  1540. #else
  1541. ST c01, 0 * SIZE(AO)
  1542. ST c02, 1 * SIZE(AO)
  1543. ST c03, 2 * SIZE(AO)
  1544. ST c04, 3 * SIZE(AO)
  1545. #endif
  /* ... and to C1.  LN moves C1 back by 4 * SIZE before the stores; every
     other variant moves it forward by 4 * SIZE after them. */
  1546. #ifdef LN
  1547. lda C1, -4 * SIZE(C1)
  1548. #endif
  1549. ST c01, 0 * SIZE(C1)
  1550. ST c02, 1 * SIZE(C1)
  1551. ST c03, 2 * SIZE(C1)
  1552. ST c04, 3 * SIZE(C1)
  1553. #ifndef LN
  1554. lda C1, 4 * SIZE(C1)
  1555. #endif
  /* (5) RT: AORIG += K << (ZBASE_SHIFT + 1). */
  1556. #ifdef RT
  1557. sll K, ZBASE_SHIFT + 1, TMP1
  1558. addq AORIG, TMP1, AORIG
  1559. #endif
  /* (5) LT/RN: skip the remaining K - KK steps in both packed buffers. */
  1560. #if defined(LT) || defined(RN)
  1561. subq K, KK, TMP1
  1562. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1563. addq AO, TMP2, AO
  1564. sll TMP1, ZBASE_SHIFT, TMP2
  1565. addq BO, TMP2, BO
  1566. #endif
  /* (5) KK moves by 2 per tile: up for LT, down for LN. */
  1567. #ifdef LT
  1568. addq KK, 2, KK
  1569. #endif
  1570. #ifdef LN
  1571. subq KK, 2, KK
  1572. #endif
  /* (6) Next tile. */
  1573. lda I, -1(I)
  1574. bgt I, $L41
  1575. .align 4
  /*
   * $L50: prologue for the remainder tile, taken only when M is odd
   * (I = M & 1; a zero result branches to $L59).
   *
   * Both variants of the prologue preload a1..a4 from AO offsets 0..3
   * and b1..b4 from the B side offsets 0..3, clear t1..t4 and the
   * accumulators c01, c05, c02, c06, advance AO and BO by 2 * SIZE, and
   * set L to (step count - 2).  A non-positive step count branches to
   * $L58; a non-positive L branches to $L55; otherwise execution falls
   * into the loop at $L52.
   *
   *   LT/RN: AO is used as it stands, B is the base for BO, and the step
   *          count is KK.
   *   LN/RT: AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK <<
   *          ZBASE_SHIFT), and the step count is TMP1 = K - KK.  LN first
   *          moves AORIG back by K << ZBASE_SHIFT.
   */
  1576. $L50:
  1577. and M, 1, I
  1578. ble I, $L59
  1579. #if defined(LT) || defined(RN)
  1580. LD a1, 0 * SIZE(AO)
  1581. fclr t1
  1582. LD a2, 1 * SIZE(AO)
  1583. fclr t2
  1584. LD a3, 2 * SIZE(AO)
  1585. fclr t3
  1586. LD a4, 3 * SIZE(AO)
  1587. fclr t4
  1588. LD b1, 0 * SIZE(B)
  1589. fclr c01
  1590. LD b2, 1 * SIZE(B)
  1591. fclr c05
  1592. LD b3, 2 * SIZE(B)
  1593. fclr c02
  1594. LD b4, 3 * SIZE(B)
  1595. fclr c06
  1596. lda AO, 2 * SIZE(AO)
  1597. lda BO, 2 * SIZE(B)
  1598. lda L, -2(KK)
  1599. ble KK, $L58
  1600. ble L, $L55
  1601. #else
  1602. #ifdef LN
  1603. sll K, ZBASE_SHIFT, TMP1
  1604. subq AORIG, TMP1, AORIG
  1605. #endif
  1606. sll KK, ZBASE_SHIFT, TMP1
  1607. addq AORIG, TMP1, AO
  1608. sll KK, ZBASE_SHIFT, TMP1
  1609. addq B, TMP1, BO
  1610. subq K, KK, TMP1
  1611. LD a1, 0 * SIZE(AO)
  1612. fclr t1
  1613. LD a2, 1 * SIZE(AO)
  1614. fclr t2
  1615. LD a3, 2 * SIZE(AO)
  1616. fclr t3
  1617. LD a4, 3 * SIZE(AO)
  1618. fclr t4
  1619. LD b1, 0 * SIZE(BO)
  1620. fclr c01
  1621. LD b2, 1 * SIZE(BO)
  1622. fclr c05
  1623. LD b3, 2 * SIZE(BO)
  1624. fclr c02
  1625. LD b4, 3 * SIZE(BO)
  1626. fclr c06
  1627. lda AO, 2 * SIZE(AO)
  1628. lda BO, 2 * SIZE(BO)
  1629. lda L, -2(TMP1)
  1630. ble TMP1, $L58
  1631. ble L, $L55
  1632. #endif
  1633. .align 5
  /*
   * $L52: unrolled multiply-accumulate loop for the remainder tile, using
   * accumulators c01, c02, c05 and c06.
   *
   * One pass issues eight MULs: a1 and a2 against b1 and b2, then a3 and
   * a4 against b3 and b4.  It advances AO and BO by 4 * SIZE each, lowers
   * L by 2, and repeats while L > 0.  As in the wider loop, each product
   * lands in t1..t4 and is folded into an accumulator by a later ADDn,
   * and operands are reloaded right after their last use, so instruction
   * order must be kept.
   *
   * NOTE(review): ADD1..ADD4, MUL and LD are macros defined outside this
   * chunk.  The unop entries are no-ops, presumably issue-slot padding.
   */
  1634. $L52:
  1635. ADD1 c01, t1, c01
  1636. unop
  1637. MUL a1, b1, t1
  1638. unop
  1639. ADD3 c02, t2, c02
  1640. lda AO, 4 * SIZE(AO) /* AO offsets below are relative to the new AO */
  1641. MUL a2, b1, t2
  1642. LD b1, 2 * SIZE(BO)
  1643. ADD4 c05, t3, c05
  1644. lda L, -2(L) /* L -= 2 */
  1645. MUL a1, b2, t3
  1646. LD a1, -2 * SIZE(AO)
  1647. ADD2 c06, t4, c06
  1648. unop
  1649. MUL a2, b2, t4
  1650. LD a2, -1 * SIZE(AO)
  1651. ADD1 c01, t1, c01
  1652. LD b2, 3 * SIZE(BO)
  1653. MUL a3, b3, t1
  1654. lda BO, 4 * SIZE(BO) /* BO offsets below are relative to the new BO */
  1655. ADD3 c02, t2, c02
  1656. unop
  1657. MUL a4, b3, t2
  1658. LD b3, 0 * SIZE(BO)
  1659. ADD4 c05, t3, c05
  1660. unop
  1661. MUL a3, b4, t3
  1662. LD a3, 0 * SIZE(AO)
  1663. ADD2 c06, t4, c06
  1664. MUL a4, b4, t4
  1665. LD b4, 1 * SIZE(BO)
  1666. unop
  1667. LD a4, 1 * SIZE(AO)
  1668. unop
  1669. unop
  1670. bgt L, $L52 /* repeat while L > 0 */
  1671. .align 4
  /*
   * $L55: remainder handling after (or instead of) the $L52 loop.
   *
   * Folds the pending t1 into c01 and starts the next a1 * b1 product,
   * then tests the low bit of the step count: KK for LT/RN, TMP1 for the
   * other variants.  If the bit is set, control goes straight to $L57.
   * Otherwise the fall-through code finishes the step just started,
   * reloads a1, a2, b1 and b2, advances AO and BO by 2 * SIZE each, and
   * starts the following a1 * b1 product before dropping into $L57.
   */
  1672. $L55:
  1673. ADD1 c01, t1, c01
  1674. MUL a1, b1, t1
  1675. #if defined(LT) || defined(RN)
  1676. blbs KK, $L57 /* low bit of KK set: skip the extra step */
  1677. #else
  1678. blbs TMP1, $L57 /* low bit of TMP1 set: skip the extra step */
  1679. #endif
  1680. .align 4
  1681. ADD3 c02, t2, c02
  1682. unop
  1683. MUL a2, b1, t2
  1684. LD b1, 0 * SIZE(BO)
  1685. ADD4 c05, t3, c05
  1686. lda BO, 2 * SIZE(BO)
  1687. MUL a1, b2, t3
  1688. LD a1, 0 * SIZE(AO)
  1689. ADD2 c06, t4, c06
  1690. unop
  1691. MUL a2, b2, t4
  1692. LD a2, 1 * SIZE(AO)
  1693. ADD1 c01, t1, c01
  1694. LD b2, -1 * SIZE(BO) /* BO was already advanced by 2 * SIZE above */
  1695. MUL a1, b1, t1 /* first product of the step completed in $L57 */
  1696. lda AO, 2 * SIZE(AO)
  1697. .align 4
  /*
   * $L57: final step and reduction for the remainder tile.
   *
   * On entry t1 already holds the a1 * b1 product of the last step.  The
   * code issues the remaining three MULs of that step, advances AO and
   * BO by 2 * SIZE each, drains t1..t4 into c01, c02, c05 and c06, and
   * then combines c01 with c06 and c02 with c05.  Execution falls
   * through to $L58.
   *
   * NOTE(review): ADD and ADD1..ADD4 are macros defined outside this
   * chunk; the exact sign each one applies is not visible here.
   */
  1698. $L57:
  1699. ADD3 c02, t2, c02
  1700. MUL a2, b1, t2
  1701. ADD4 c05, t3, c05
  1702. MUL a1, b2, t3
  1703. ADD2 c06, t4, c06
  1704. lda AO, 2 * SIZE(AO)
  1705. MUL a2, b2, t4
  1706. lda BO, 2 * SIZE(BO)
  /* Drain the last four products. */
  1707. ADD1 c01, t1, c01
  1708. ADD3 c02, t2, c02
  1709. ADD4 c05, t3, c05
  1710. ADD2 c06, t4, c06
  /* Merge c05 and c06 into c02 and c01. */
  1711. ADD c01, c06, c01
  1712. ADD c02, c05, c02
  /*
   * $L58: solve and write back the remainder tile held in (c01, c02).
   * Also reached directly from the $L50 prologue when the step count is
   * not positive.
   *
   * Sequence: position AO and BO, load two values from the packed buffer
   * and combine them with c01, c02 via SUB, scale the pair by a pair
   * loaded from AO (LN/LT) or BO (RN/RT), store the result to the packed
   * buffer and to C1, then update AO/BO/AORIG/KK for the variant.
   *
   * NOTE(review): SUB, MUL, ADD5 and ADD6 are macros defined outside
   * this chunk.  The MUL/MUL/MUL/MUL/ADD5/ADD6 group has the shape of a
   * complex multiply; presumably the loaded diagonal entry is already
   * inverted by the packing code - confirm before relying on it.
   */
  1713. $L58:
  /* LN/RT: recompute AO and BO from AORIG and B using KK - 1; other
     variants step AO and BO back by the amounts $L57 advanced them. */
  1714. #if defined(LN) || defined(RT)
  1715. subq KK, 1, TMP1
  1716. sll TMP1, ZBASE_SHIFT, TMP2
  1717. addq AORIG, TMP2, AO
  1718. sll TMP1, ZBASE_SHIFT, TMP2
  1719. addq B, TMP2, BO
  1720. #else
  1721. lda AO, -2 * SIZE(AO)
  1722. lda BO, -2 * SIZE(BO)
  1723. #endif
  /* LN/LT read the two values from BO, RN/RT read them from AO. */
  1724. #if defined(LN) || defined(LT)
  1725. LD a1, 0 * SIZE(BO)
  1726. LD a2, 1 * SIZE(BO)
  1727. SUB a1, c01, c01
  1728. SUB a2, c02, c02
  1729. #else
  1730. LD a1, 0 * SIZE(AO)
  1731. LD a2, 1 * SIZE(AO)
  1732. SUB a1, c01, c01
  1733. SUB a2, c02, c02
  1734. #endif
  /* LN and LT share one path here: scale (c01, c02) by the pair at
     AO[0..1]. */
  1735. #if defined(LN) || defined(LT)
  1736. LD a1, 0 * SIZE(AO)
  1737. LD a2, 1 * SIZE(AO)
  1738. MUL a2, c02, t1
  1739. MUL a2, c01, t2
  1740. MUL a1, c01, c01
  1741. MUL a1, c02, c02
  1742. ADD5 c01, t1, c01
  1743. ADD6 c02, t2, c02
  1744. #endif
  /* RN/RT: scale (c01, c02) by the pair at BO[0..1]. */
  1745. #if defined(RN) || defined(RT)
  1746. LD a1, 0 * SIZE(BO)
  1747. LD a2, 1 * SIZE(BO)
  1748. MUL a2, c02, t1
  1749. MUL a2, c01, t2
  1750. MUL a1, c01, c01
  1751. MUL a1, c02, c02
  1752. ADD5 c01, t1, c01
  1753. ADD6 c02, t2, c02
  1754. #endif
  /* Write the result back to the buffer it was read from ... */
  1755. #if defined(LN) || defined(LT)
  1756. ST c01, 0 * SIZE(BO)
  1757. ST c02, 1 * SIZE(BO)
  1758. #else
  1759. ST c01, 0 * SIZE(AO)
  1760. ST c02, 1 * SIZE(AO)
  1761. #endif
  /* ... and to C1.  LN moves C1 back by 2 * SIZE before the stores; every
     other variant moves it forward by 2 * SIZE after them. */
  1762. #ifdef LN
  1763. lda C1, -2 * SIZE(C1)
  1764. #endif
  1765. ST c01, 0 * SIZE(C1)
  1766. ST c02, 1 * SIZE(C1)
  1767. #ifndef LN
  1768. lda C1, 2 * SIZE(C1)
  1769. #endif
  /* RT: AORIG += K << ZBASE_SHIFT. */
  1770. #ifdef RT
  1771. sll K, ZBASE_SHIFT, TMP1
  1772. addq AORIG, TMP1, AORIG
  1773. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  1774. #if defined(LT) || defined(RN)
  1775. subq K, KK, TMP1
  1776. sll TMP1, ZBASE_SHIFT, TMP2
  1777. addq AO, TMP2, AO
  1778. sll TMP1, ZBASE_SHIFT, TMP2
  1779. addq BO, TMP2, BO
  1780. #endif
  /* KK moves by 1 for this tile: up for LT, down for LN. */
  1781. #ifdef LT
  1782. addq KK, 1, KK
  1783. #endif
  1784. #ifdef LN
  1785. subq KK, 1, KK
  1786. #endif
  1787. .align 4
  /*
   * $L59: bookkeeping once all tiles of this pass are done; also the
   * target when M is even and $L50 has nothing to do.
   *
   *   LN:    B += K << ZBASE_SHIFT
   *   LT/RN: B = BO (continue from where the tile code left BO)
   *   RN:    KK += 1
   *   RT:    KK -= 1
   *
   * Execution then falls through to the epilogue at $L999.
   */
  1788. $L59:
  1789. #ifdef LN
  1790. sll K, ZBASE_SHIFT, TMP1
  1791. addq B, TMP1, B
  1792. #endif
  1793. #if defined(LT) || defined(RN)
  1794. mov BO, B
  1795. #endif
  1796. #ifdef RN
  1797. addq KK, 1, KK
  1798. #endif
  1799. #ifdef RT
  1800. subq KK, 1, KK
  1801. #endif
  1802. .align 4
  /*
   * $L999: common exit.
   *
   * Reloads floating-point registers $f2..$f9 from the eight 8-byte
   * slots at 0..56($sp), zeroes integer register $0, releases STACKSIZE
   * bytes of stack and returns.
   *
   * NOTE(review): the matching stores of $f2..$f9 are presumably in the
   * prologue, which is outside this chunk.  Under the Alpha calling
   * standard $0 carries the integer return value, so the routine would
   * return 0 - confirm against the prologue and the C prototype.
   */
  1803. $L999:
  1804. ldt $f2, 0($sp)
  1805. ldt $f3, 8($sp)
  1806. ldt $f4, 16($sp)
  1807. ldt $f5, 24($sp)
  1808. ldt $f6, 32($sp)
  1809. ldt $f7, 40($sp)
  1810. ldt $f8, 48($sp)
  1811. ldt $f9, 56($sp)
  1812. clr $0
  1813. lda $sp, STACKSIZE($sp) /* pop the frame */
  1814. ret
  1815. .ident VERSION
  1816. .end CNAME