You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_2x2_LN.S 33 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration for this kernel.  ASSEMBLER is defined before the */
  /* project headers are included; exactly what common.h does with it is   */
  /* not visible here (presumably it selects the assembler-safe parts).    */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* One of EV4 / EV5 / EV6 must be supplied by the build; otherwise the   */
  /* preprocessor stops with the error below.                              */
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  42. #error "Architecture is not specified."
  43. #endif
  /* PREFETCHSIZE is the look-ahead distance (scaled by SIZE) used by the  */
  /* loads into $31 inside the 2x2 inner loop ($L12); those loads are      */
  /* compiled only when EV4 is not defined, which is why the EV4 branch    */
  /* defines no PREFETCHSIZE.                                              */
  /* UNOP expands to a real unop only on EV6; on EV5 and EV4 it expands to */
  /* nothing, so the padding slots that use it disappear.                  */
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56
  46. #define UNOP unop
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 48
  50. #define UNOP
  51. #endif
  52. #ifdef EV4
  53. #define UNOP
  54. #endif
  /* noat: this file aliases AO to $at, so the assembler must not use $at  */
  /* as its own temporary.  noreorder: keep instructions in the written    */
  /* order (the loops below are hand-scheduled with explicit unop slots).  */
  55. .set noat
  56. .set noreorder
  /* NOTE(review): .arch is fixed to ev6 regardless of which of            */
  /* EV4 / EV5 / EV6 was selected above - confirm this is intended.        */
  57. .arch ev6
  58. .text
  59. .align 5
  /* CNAME is not defined in this file; presumably the build system or     */
  /* common.h maps it to the exported routine name.                        */
  60. .globl CNAME
  61. .ent CNAME
  /* Register and frame aliases used throughout the routine.               */
  /* STACKSIZE: bytes reserved on entry by lda $sp, -STACKSIZE($sp).       */
  /* $f2-$f9 are stored at offsets 0..56 of that frame, and the            */
  /* stack-passed arguments are read at STACKSIZE + 0/8/16/24.             */
  62. #define STACKSIZE 80
  /* Problem sizes; the entry code leaves early when any of them is <= 0.  */
  63. #define M $16
  64. #define N $17
  65. #define K $18
  /* Matrix base pointers and C's leading dimension.  B, C and LDC are     */
  /* loaded from the caller's stack area at entry; LDC is then shifted     */
  /* left by ZBASE_SHIFT.                                                  */
  66. #define A $21
  67. #define B $22
  68. #define C $20
  69. #define LDC $23
  /* C1 / C2: store pointers for the two C columns handled per J step.     */
  70. #define C1 $19
  71. #define C2 $24
  /* AO / BO: running read pointers into the A and B panels.  AO lives in  */
  /* $at, which is why the file is assembled with .set noat.               */
  72. #define AO $at
  73. #define BO $5
  /* Loop counters: I over M, J over N, L over the inner dimension         */
  /* (L is decremented by 2 per pass of the unrolled loops).               */
  74. #define I $6
  75. #define J $7
  76. #define L $8
  /* Floating-point operands: a1-a4 are loaded through AO and b1-b4        */
  /* through BO in the multiply loops; the solve step reuses them for      */
  /* values loaded from either pointer.                                    */
  77. #define a1 $f16
  78. #define a2 $f17
  79. #define a3 $f18
  80. #define a4 $f19
  81. #define b1 $f20
  82. #define b2 $f21
  83. #define b3 $f22
  84. #define b4 $f23
  /* t1-t4 hold MUL results that are folded into the accumulators by the   */
  /* following ADDn instruction.                                           */
  85. #define t1 $f24
  86. #define t2 $f25
  87. #define t3 $f26
  88. #define t4 $f27
  /* Extra operand registers used by the unrolled loops.                   */
  89. #define a5 $f28
  90. #define a6 $f30
  91. #define b5 $f29
  /* alpha_i / alpha_r share $f29 / $f30 with b5 / a6.  Neither alpha      */
  /* alias is referenced in the visible part of this file.                 */
  92. #define alpha_i $f29
  93. #define alpha_r $f30
  /* Accumulators c01-c16 map onto $f0-$f15.  c03-c10 occupy $f2-$f9,      */
  /* the registers the entry code stores to the stack frame.               */
  94. #define c01 $f0
  95. #define c02 $f1
  96. #define c03 $f2
  97. #define c04 $f3
  98. #define c05 $f4
  99. #define c06 $f5
  100. #define c07 $f6
  101. #define c08 $f7
  102. #define c09 $f8
  103. #define c10 $f9
  104. #define c11 $f10
  105. #define c12 $f11
  106. #define c13 $f12
  107. #define c14 $f13
  108. #define c15 $f14
  109. #define c16 $f15
  /* Integer scratch registers for address arithmetic.                     */
  110. #define TMP1 $0
  111. #define TMP2 $1
  /* KK: running offset derived from OFFSET (and from M or N depending on  */
  /* the LN / LT / RN / RT variant).  AORIG: base of the current A panel,  */
  /* used in the LN / RT variants.  OFFSET: loaded from the stack at entry. */
  112. #define KK $2
  113. #define AORIG $3
  114. #define OFFSET $4
  /* ADD1..ADD6 choose between ADD and SUB for each accumulation step.     */
  /* ADD1-ADD4 are used in the multiply-accumulate loops; ADD5 / ADD6      */
  /* combine the two partial products in the solve step.  ADD and SUB      */
  /* themselves are not defined in this file (presumably in common.h).     */
  /* The table depends on CONJ and on whether this is a left-side          */
  /* (LN / LT) or right-side build.                                        */
  115. #if defined(LN) || defined(LT)
  116. #ifndef CONJ
  117. #define ADD1 ADD
  118. #define ADD2 SUB
  119. #define ADD3 ADD
  120. #define ADD4 ADD
  121. #define ADD5 SUB
  122. #define ADD6 ADD
  123. #else
  /* Left side with CONJ: ADD2 and ADD5 become ADD, ADD3 and ADD6 become SUB. */
  124. #define ADD1 ADD
  125. #define ADD2 ADD
  126. #define ADD3 SUB
  127. #define ADD4 ADD
  128. #define ADD5 ADD
  129. #define ADD6 SUB
  130. #endif
  131. #else
  /* Right side: the non-CONJ table is identical to the left-side one.     */
  132. #ifndef CONJ
  133. #define ADD1 ADD
  134. #define ADD2 SUB
  135. #define ADD3 ADD
  136. #define ADD4 ADD
  137. #define ADD5 SUB
  138. #define ADD6 ADD
  139. #else
  /* Right side with CONJ: differs from the left-side CONJ table only in   */
  /* ADD3 (ADD here) and ADD4 (SUB here).                                  */
  140. #define ADD1 ADD
  141. #define ADD2 ADD
  142. #define ADD3 ADD
  143. #define ADD4 SUB
  144. #define ADD5 ADD
  145. #define ADD6 SUB
  146. #endif
  147. #endif
  148. CNAME:
  149. .frame $sp, STACKSIZE, $26, 0
  150. #ifdef PROFILE
  151. ldgp $gp, 0($27)
  152. lda $at, _mcount
  153. jsr $at, ($at), _mcount
  154. #endif
  155. #ifndef PROFILE
  156. .prologue 0
  157. #else
  158. .prologue 1
  159. #endif
  160. lda $sp, -STACKSIZE($sp)
  161. ldq B, 0 + STACKSIZE($sp)
  162. ldq C, 8 + STACKSIZE($sp)
  163. ldq LDC, 16 + STACKSIZE($sp)
  164. ldq OFFSET, 24 + STACKSIZE($sp)
  165. sll LDC, ZBASE_SHIFT, LDC
  166. stt $f2, 0($sp)
  167. stt $f3, 8($sp)
  168. stt $f4, 16($sp)
  169. stt $f5, 24($sp)
  170. stt $f6, 32($sp)
  171. stt $f7, 40($sp)
  172. stt $f8, 48($sp)
  173. stt $f9, 56($sp)
  174. cmple M, 0, $0
  175. cmple N, 0, $1
  176. cmple K, 0, $2
  177. or $0, $1, $0
  178. or $0, $2, $0
  179. bne $0, $L999
  180. #ifdef LN
  181. addq M, M, TMP2
  182. mulq TMP2, K, TMP1
  183. SXADDQ TMP1, A, A
  184. SXADDQ TMP2, C, C
  185. #endif
  186. #ifdef RN
  187. negq OFFSET, KK
  188. #endif
  189. #ifdef RT
  190. mulq N, K, TMP1
  191. addq TMP1, TMP1, TMP1
  192. SXADDQ TMP1, B, B
  193. mulq N, LDC, TMP1
  194. addq TMP1, C, C
  195. subq N, OFFSET, KK
  196. #endif
  197. sra N, 1, J
  198. ble J, $L30
  199. .align 4
  200. $L01:
  201. #ifdef RT
  202. sll K, ZBASE_SHIFT + 1, TMP1
  203. subq B, TMP1, B
  204. subq C, LDC, C2
  205. subq C2, LDC, C1
  206. subq C2, LDC, C
  207. #else
  208. mov C, C1
  209. addq C, LDC, C2
  210. addq C2, LDC, C
  211. #endif
  212. #ifdef LN
  213. addq M, OFFSET, KK
  214. #endif
  215. #ifdef LT
  216. mov OFFSET, KK
  217. #endif
  218. #if defined(LN) || defined(RT)
  219. mov A, AORIG
  220. #else
  221. mov A, AO
  222. #endif
  223. and M, 1, I
  224. fclr t1
  225. fclr t2
  226. fclr t3
  227. fclr t4
  228. fclr c01
  229. fclr c05
  230. ble I, $L20
  231. #if defined(LT) || defined(RN)
  232. LD a1, 0 * SIZE(AO)
  233. fclr c09
  234. LD a2, 1 * SIZE(AO)
  235. fclr c13
  236. LD a3, 2 * SIZE(AO)
  237. fclr c02
  238. LD a4, 3 * SIZE(AO)
  239. fclr c06
  240. LD b1, 0 * SIZE(B)
  241. fclr c10
  242. LD b2, 1 * SIZE(B)
  243. fclr c14
  244. LD b3, 2 * SIZE(B)
  245. lda AO, 2 * SIZE(AO)
  246. LD b4, 3 * SIZE(B)
  247. lda BO, 4 * SIZE(B)
  248. lda L, -2(KK)
  249. ble KK, $L28
  250. ble L, $L25
  251. #else
  252. #ifdef LN
  253. sll K, ZBASE_SHIFT + 0, TMP1
  254. subq AORIG, TMP1, AORIG
  255. #endif
  256. sll KK, ZBASE_SHIFT + 0, TMP1
  257. addq AORIG, TMP1, AO
  258. sll KK, ZBASE_SHIFT + 1, TMP1
  259. addq B, TMP1, BO
  260. subq K, KK, TMP1
  261. LD a1, 0 * SIZE(AO)
  262. fclr c09
  263. LD a2, 1 * SIZE(AO)
  264. fclr c13
  265. LD a3, 2 * SIZE(AO)
  266. fclr c02
  267. LD a4, 3 * SIZE(AO)
  268. fclr c06
  269. LD b1, 0 * SIZE(BO)
  270. fclr c10
  271. LD b2, 1 * SIZE(BO)
  272. fclr c14
  273. LD b3, 2 * SIZE(BO)
  274. lda AO, 2 * SIZE(AO)
  275. LD b4, 3 * SIZE(BO)
  276. lda BO, 4 * SIZE(BO)
  277. lda L, -2(TMP1)
  278. ble TMP1, $L28
  279. ble L, $L25
  280. #endif
  281. .align 5
  282. $L22:
  283. ADD1 c09, t1, c09
  284. unop
  285. MUL a1, b1, t1
  286. unop
  287. ADD3 c10, t2, c10
  288. unop
  289. MUL a2, b1, t2
  290. LD b1, 0 * SIZE(BO)
  291. ADD4 c13, t3, c13
  292. unop
  293. MUL a1, b2, t3
  294. lda BO, 8 * SIZE(BO)
  295. ADD2 c14, t4, c14
  296. unop
  297. MUL a2, b2, t4
  298. LD b2, -7 * SIZE(BO)
  299. ADD1 c01, t1, c01
  300. unop
  301. MUL a1, b3, t1
  302. unop
  303. ADD3 c02, t2, c02
  304. unop
  305. MUL a2, b3, t2
  306. LD b3, -6 * SIZE(BO)
  307. ADD4 c05, t3, c05
  308. unop
  309. MUL a1, b4, t3
  310. LD a1, 2 * SIZE(AO)
  311. ADD2 c06, t4, c06
  312. MUL a2, b4, t4
  313. LD b5, -5 * SIZE(BO)
  314. ADD1 c09, t1, c09
  315. unop
  316. MUL a3, b1, t1
  317. LD a2, 3 * SIZE(AO)
  318. ADD3 c10, t2, c10
  319. unop
  320. MUL a4, b1, t2
  321. LD b1, -4 * SIZE(BO)
  322. ADD4 c13, t3, c13
  323. unop
  324. MUL a3, b2, t3
  325. lda AO, 4 * SIZE(AO)
  326. ADD2 c14, t4, c14
  327. MUL a4, b2, t4
  328. LD b2, -3 * SIZE(BO)
  329. ADD1 c01, t1, c01
  330. lda L, -2(L)
  331. MUL a3, b3, t1
  332. LD b4, -1 * SIZE(BO)
  333. ADD3 c02, t2, c02
  334. unop
  335. MUL a4, b3, t2
  336. LD b3, -2 * SIZE(BO)
  337. ADD4 c05, t3, c05
  338. unop
  339. MUL a3, b5, t3
  340. LD a3, 0 * SIZE(AO)
  341. ADD2 c06, t4, c06
  342. MUL a4, b5, t4
  343. LD a4, 1 * SIZE(AO)
  344. bgt L, $L22
  345. .align 4
  346. $L25:
  347. ADD1 c09, t1, c09
  348. MUL a1, b1, t1
  349. #if defined(LT) || defined(RN)
  350. blbs KK, $L27
  351. #else
  352. blbs TMP1, $L27
  353. #endif
  354. .align 4
  355. ADD3 c10, t2, c10
  356. unop
  357. MUL a2, b1, t2
  358. LD b1, 0 * SIZE(BO)
  359. ADD4 c13, t3, c13
  360. unop
  361. MUL a1, b2, t3
  362. unop
  363. ADD2 c14, t4, c14
  364. unop
  365. MUL a2, b2, t4
  366. LD b2, 1 * SIZE(BO)
  367. ADD1 c01, t1, c01
  368. unop
  369. MUL a1, b3, t1
  370. lda AO, 2 * SIZE(AO)
  371. ADD3 c02, t2, c02
  372. unop
  373. MUL a2, b3, t2
  374. LD b3, 2 * SIZE(BO)
  375. ADD4 c05, t3, c05
  376. unop
  377. MUL a1, b4, t3
  378. LD a1, -2 * SIZE(AO)
  379. ADD2 c06, t4, c06
  380. unop
  381. MUL a2, b4, t4
  382. LD a2, -1 * SIZE(AO)
  383. ADD1 c09, t1, c09
  384. LD b4, 3 * SIZE(BO)
  385. MUL a1, b1, t1
  386. lda BO, 4 * SIZE(BO)
  387. .align 4
  388. $L27:
  389. ADD3 c10, t2, c10
  390. MUL a2, b1, t2
  391. ADD4 c13, t3, c13
  392. MUL a1, b2, t3
  393. ADD2 c14, t4, c14
  394. MUL a2, b2, t4
  395. ADD1 c01, t1, c01
  396. MUL a1, b3, t1
  397. ADD3 c02, t2, c02
  398. MUL a2, b3, t2
  399. ADD4 c05, t3, c05
  400. MUL a1, b4, t3
  401. ADD2 c06, t4, c06
  402. lda AO, 2 * SIZE(AO)
  403. MUL a2, b4, t4
  404. lda BO, 4 * SIZE(BO)
  405. ADD1 c09, t1, c09
  406. ADD3 c10, t2, c10
  407. ADD4 c13, t3, c13
  408. ADD2 c14, t4, c14
  409. ADD c01, c06, c01
  410. ADD c02, c05, c02
  411. ADD c09, c14, c09
  412. ADD c10, c13, c10
  413. .align 4
  414. $L28:
  415. #if defined(LN) || defined(RT)
  416. #ifdef LN
  417. subq KK, 1, TMP1
  418. #else
  419. subq KK, 2, TMP1
  420. #endif
  421. sll TMP1, ZBASE_SHIFT + 0, TMP2
  422. addq AORIG, TMP2, AO
  423. sll TMP1, ZBASE_SHIFT + 1, TMP2
  424. addq B, TMP2, BO
  425. #else
  426. lda AO, -2 * SIZE(AO)
  427. lda BO, -4 * SIZE(BO)
  428. #endif
  429. #if defined(LN) || defined(LT)
  430. LD a1, 0 * SIZE(BO)
  431. LD a2, 1 * SIZE(BO)
  432. LD a3, 2 * SIZE(BO)
  433. LD a4, 3 * SIZE(BO)
  434. SUB a1, c01, c01
  435. SUB a2, c02, c02
  436. SUB a3, c09, c09
  437. SUB a4, c10, c10
  438. #else
  439. LD a1, 0 * SIZE(AO)
  440. LD a2, 1 * SIZE(AO)
  441. LD a3, 2 * SIZE(AO)
  442. LD a4, 3 * SIZE(AO)
  443. SUB a1, c01, c01
  444. SUB a2, c02, c02
  445. SUB a3, c09, c09
  446. SUB a4, c10, c10
  447. #endif
  448. #if defined(LN) || defined(LT)
  449. LD a1, 0 * SIZE(AO)
  450. LD a2, 1 * SIZE(AO)
  451. MUL a2, c02, t1
  452. MUL a2, c01, t2
  453. MUL a2, c10, t3
  454. MUL a2, c09, t4
  455. MUL a1, c01, c01
  456. MUL a1, c02, c02
  457. MUL a1, c09, c09
  458. MUL a1, c10, c10
  459. ADD5 c01, t1, c01
  460. ADD6 c02, t2, c02
  461. ADD5 c09, t3, c09
  462. ADD6 c10, t4, c10
  463. #endif
  464. #ifdef RN
  465. LD a1, 0 * SIZE(BO)
  466. LD a2, 1 * SIZE(BO)
  467. LD a3, 2 * SIZE(BO)
  468. LD a4, 3 * SIZE(BO)
  469. MUL a2, c02, t1
  470. MUL a2, c01, t2
  471. MUL a1, c01, c01
  472. MUL a1, c02, c02
  473. ADD5 c01, t1, c01
  474. ADD6 c02, t2, c02
  475. MUL a3, c01, t1
  476. MUL a3, c02, t2
  477. SUB c09, t1, c09
  478. SUB c10, t2, c10
  479. MUL a4, c02, t1
  480. MUL a4, c01, t2
  481. ADD6 c09, t1, c09
  482. ADD5 c10, t2, c10
  483. LD a1, 6 * SIZE(BO)
  484. LD a2, 7 * SIZE(BO)
  485. MUL a2, c10, t1
  486. MUL a2, c09, t2
  487. MUL a1, c09, c09
  488. MUL a1, c10, c10
  489. ADD5 c09, t1, c09
  490. ADD6 c10, t2, c10
  491. #endif
  492. #ifdef RT
  493. LD a1, 6 * SIZE(BO)
  494. LD a2, 7 * SIZE(BO)
  495. LD a3, 4 * SIZE(BO)
  496. LD a4, 5 * SIZE(BO)
  497. MUL a2, c10, t1
  498. MUL a2, c09, t2
  499. MUL a1, c09, c09
  500. MUL a1, c10, c10
  501. ADD5 c09, t1, c09
  502. ADD6 c10, t2, c10
  503. MUL a3, c09, t1
  504. MUL a3, c10, t2
  505. SUB c01, t1, c01
  506. SUB c02, t2, c02
  507. MUL a4, c10, t1
  508. MUL a4, c09, t2
  509. ADD6 c01, t1, c01
  510. ADD5 c02, t2, c02
  511. LD a1, 0 * SIZE(BO)
  512. LD a2, 1 * SIZE(BO)
  513. MUL a2, c02, t1
  514. MUL a2, c01, t2
  515. MUL a1, c01, c01
  516. MUL a1, c02, c02
  517. ADD5 c01, t1, c01
  518. ADD6 c02, t2, c02
  519. #endif
  520. #if defined(LN) || defined(LT)
  521. ST c01, 0 * SIZE(BO)
  522. ST c02, 1 * SIZE(BO)
  523. ST c09, 2 * SIZE(BO)
  524. ST c10, 3 * SIZE(BO)
  525. #else
  526. ST c01, 0 * SIZE(AO)
  527. ST c02, 1 * SIZE(AO)
  528. ST c09, 2 * SIZE(AO)
  529. ST c10, 3 * SIZE(AO)
  530. #endif
  531. #ifdef LN
  532. lda C1, -2 * SIZE(C1)
  533. lda C2, -2 * SIZE(C2)
  534. #endif
  535. ST c01, 0 * SIZE(C1)
  536. ST c02, 1 * SIZE(C1)
  537. ST c09, 0 * SIZE(C2)
  538. ST c10, 1 * SIZE(C2)
  539. #ifndef LN
  540. lda C1, 2 * SIZE(C1)
  541. lda C2, 2 * SIZE(C2)
  542. #endif
  543. #ifdef RT
  544. sll K, ZBASE_SHIFT, TMP1
  545. addq AORIG, TMP1, AORIG
  546. #endif
  547. #if defined(LT) || defined(RN)
  548. subq K, KK, TMP1
  549. sll TMP1, ZBASE_SHIFT + 0, TMP2
  550. addq AO, TMP2, AO
  551. sll TMP1, ZBASE_SHIFT + 1, TMP2
  552. addq BO, TMP2, BO
  553. #endif
  554. #ifdef LT
  555. addq KK, 1, KK
  556. #endif
  557. #ifdef LN
  558. subq KK, 1, KK
  559. #endif
  560. .align 4
  561. $L20:
  562. sra M, 1, I
  563. fclr t1
  564. fclr t2
  565. fclr t3
  566. fclr t4
  567. fclr c01
  568. fclr c05
  569. ble I, $L29
  570. .align 4
  571. $L11:
  572. #if defined(LT) || defined(RN)
  573. LD a1, 0 * SIZE(AO)
  574. fclr c09
  575. LD a2, 1 * SIZE(AO)
  576. fclr c13
  577. LD a3, 2 * SIZE(AO)
  578. fclr c02
  579. LD a4, 3 * SIZE(AO)
  580. fclr c06
  581. LD b1, 0 * SIZE(B)
  582. fclr c10
  583. LD b2, 1 * SIZE(B)
  584. fclr c14
  585. LD b3, 2 * SIZE(B)
  586. fclr c03
  587. LD b4, 3 * SIZE(B)
  588. fclr c07
  589. lda BO, 4 * SIZE(B)
  590. fclr c11
  591. lda AO, 4 * SIZE(AO)
  592. fclr c15
  593. lds $f31, 4 * SIZE(C1)
  594. fclr c04
  595. lda L, -2(KK)
  596. fclr c08
  597. lds $f31, 4 * SIZE(C2)
  598. fclr c12
  599. fclr c16
  600. ble KK, $L18
  601. ble L, $L15
  602. #else
  603. #ifdef LN
  604. sll K, ZBASE_SHIFT + 1, TMP1
  605. subq AORIG, TMP1, AORIG
  606. #endif
  607. sll KK, ZBASE_SHIFT + 1, TMP1
  608. addq AORIG, TMP1, AO
  609. addq B, TMP1, BO
  610. subq K, KK, TMP1
  611. LD a1, 0 * SIZE(AO)
  612. fclr c09
  613. LD a2, 1 * SIZE(AO)
  614. fclr c13
  615. LD a3, 2 * SIZE(AO)
  616. fclr c02
  617. LD a4, 3 * SIZE(AO)
  618. fclr c06
  619. LD b1, 0 * SIZE(BO)
  620. fclr c10
  621. LD b2, 1 * SIZE(BO)
  622. fclr c14
  623. LD b3, 2 * SIZE(BO)
  624. fclr c03
  625. LD b4, 3 * SIZE(BO)
  626. fclr c07
  627. lda BO, 4 * SIZE(BO)
  628. fclr c11
  629. lda AO, 4 * SIZE(AO)
  630. fclr c15
  631. lds $f31, 4 * SIZE(C1)
  632. fclr c04
  633. lda L, -2(TMP1)
  634. fclr c08
  635. lds $f31, 4 * SIZE(C2)
  636. fclr c12
  637. fclr c16
  638. ble TMP1, $L18
  639. ble L, $L15
  640. #endif
  641. .align 5
  642. $L12:
  643. /* 1 */
  644. ADD1 c11, t1, c11
  645. #ifndef EV4
  646. ldq $31, PREFETCHSIZE * SIZE(AO)
  647. #else
  648. unop
  649. #endif
  650. MUL b1, a1, t1
  651. #ifndef EV4
  652. ldl $31, PREFETCHSIZE * SIZE(BO)
  653. #else
  654. unop
  655. #endif
  656. ADD3 c12, t2, c12
  657. unop
  658. MUL b1, a2, t2
  659. unop
  660. ADD2 c16, t3, c16
  661. unop
  662. MUL b2, a2, t3
  663. LD a5, 0 * SIZE(AO)
  664. ADD4 c15, t4, c15
  665. unop
  666. MUL b2, a1, t4
  667. LD b5, 0 * SIZE(BO)
  668. /* 2 */
  669. ADD1 c01, t1, c01
  670. UNOP
  671. MUL b1, a3, t1
  672. UNOP
  673. ADD3 c02, t2, c02
  674. UNOP
  675. MUL b1, a4, t2
  676. UNOP
  677. ADD2 c06, t3, c06
  678. unop
  679. MUL b2, a4, t3
  680. unop
  681. ADD4 c05, t4, c05
  682. unop
  683. MUL b4, a1, t4
  684. unop
  685. /* 3 */
  686. ADD1 c03, t1, c03
  687. unop
  688. MUL b3, a1, t1
  689. unop
  690. ADD3 c04, t2, c04
  691. unop
  692. MUL b3, a2, t2
  693. unop
  694. ADD2 c08, t3, c08
  695. unop
  696. MUL b4, a2, t3
  697. LD a2, 1 * SIZE(AO)
  698. ADD4 c13, t4, c13
  699. unop
  700. MUL b2, a3, t4
  701. LD b2, 1 * SIZE(BO)
  702. /* 4 */
  703. ADD1 c09, t1, c09
  704. unop
  705. MUL b3, a3, t1
  706. LD a6, 2 * SIZE(AO)
  707. ADD3 c10, t2, c10
  708. unop
  709. MUL b3, a4, t2
  710. LD b3, 2 * SIZE(BO)
  711. ADD2 c14, t3, c14
  712. unop
  713. MUL b4, a4, t3
  714. LD a4, 3 * SIZE(AO)
  715. ADD4 c07, t4, c07
  716. unop
  717. MUL b4, a3, t4
  718. LD b4, 3 * SIZE(BO)
  719. /* 5 */
  720. ADD1 c11, t1, c11
  721. unop
  722. MUL b5, a5, t1
  723. LD a1, 4 * SIZE(AO)
  724. ADD3 c12, t2, c12
  725. lda L, -2(L)
  726. MUL b5, a2, t2
  727. LD b1, 4 * SIZE(BO)
  728. ADD2 c16, t3, c16
  729. unop
  730. MUL b2, a2, t3
  731. unop
  732. ADD4 c15, t4, c15
  733. unop
  734. MUL b2, a5, t4
  735. unop
  736. /* 6 */
  737. ADD1 c01, t1, c01
  738. unop
  739. MUL b5, a6, t1
  740. unop
  741. ADD3 c02, t2, c02
  742. unop
  743. MUL b5, a4, t2
  744. unop
  745. ADD2 c06, t3, c06
  746. unop
  747. MUL b2, a4, t3
  748. unop
  749. ADD4 c05, t4, c05
  750. unop
  751. MUL b4, a5, t4
  752. unop
  753. /* 7 */
  754. ADD1 c03, t1, c03
  755. lda AO, 8 * SIZE(AO)
  756. MUL b3, a5, t1
  757. unop
  758. ADD3 c04, t2, c04
  759. lda BO, 8 * SIZE(BO)
  760. MUL b3, a2, t2
  761. unop
  762. ADD2 c08, t3, c08
  763. unop
  764. MUL b4, a2, t3
  765. LD a2, -3 * SIZE(AO)
  766. ADD4 c13, t4, c13
  767. unop
  768. MUL b2, a6, t4
  769. LD b2, -3 * SIZE(BO)
  770. /* 8 */
  771. ADD1 c09, t1, c09
  772. unop
  773. MUL b3, a6, t1
  774. LD a3, -2 * SIZE(AO)
  775. ADD3 c10, t2, c10
  776. unop
  777. MUL b3, a4, t2
  778. LD b3, -2 * SIZE(BO)
  779. ADD2 c14, t3, c14
  780. unop
  781. MUL b4, a4, t3
  782. LD a4, -1 * SIZE(AO)
  783. ADD4 c07, t4, c07
  784. MUL b4, a6, t4
  785. LD b4, -1 * SIZE(BO)
  786. bgt L, $L12
  787. .align 4
  788. $L15:
  789. ADD1 c11, t1, c11
  790. unop
  791. MUL b1, a1, t1
  792. #if defined(LT) || defined(RN)
  793. blbs KK, $L17
  794. #else
  795. blbs TMP1, $L17
  796. #endif
  797. .align 4
  798. ADD3 c12, t2, c12
  799. MUL b1, a2, t2
  800. ADD2 c16, t3, c16
  801. MUL b2, a2, t3
  802. ADD4 c15, t4, c15
  803. MUL b2, a1, t4
  804. ADD1 c01, t1, c01
  805. MUL b1, a3, t1
  806. ADD3 c02, t2, c02
  807. unop
  808. MUL b1, a4, t2
  809. LD b1, 0 * SIZE(BO)
  810. ADD2 c06, t3, c06
  811. MUL b2, a4, t3
  812. ADD4 c05, t4, c05
  813. MUL b4, a1, t4
  814. ADD1 c03, t1, c03
  815. unop
  816. MUL b3, a1, t1
  817. LD a1, 0 * SIZE(AO)
  818. ADD3 c04, t2, c04
  819. unop
  820. MUL b3, a2, t2
  821. unop
  822. ADD2 c08, t3, c08
  823. unop
  824. MUL b4, a2, t3
  825. LD a2, 1 * SIZE(AO)
  826. ADD4 c13, t4, c13
  827. unop
  828. MUL b2, a3, t4
  829. LD b2, 1 * SIZE(BO)
  830. ADD1 c09, t1, c09
  831. unop
  832. MUL b3, a3, t1
  833. lda AO, 4 * SIZE(AO)
  834. ADD3 c10, t2, c10
  835. unop
  836. MUL b3, a4, t2
  837. LD b3, 2 * SIZE(BO)
  838. ADD2 c14, t3, c14
  839. unop
  840. MUL b4, a4, t3
  841. LD a4, -1 * SIZE(AO)
  842. ADD4 c07, t4, c07
  843. unop
  844. MUL b4, a3, t4
  845. LD a3, -2 * SIZE(AO)
  846. ADD1 c11, t1, c11
  847. LD b4, 3 * SIZE(BO)
  848. MUL b1, a1, t1
  849. lda BO, 4 * SIZE(BO)
  850. .align 4
  851. $L17:
  852. ADD3 c12, t2, c12
  853. MUL b1, a2, t2
  854. ADD2 c16, t3, c16
  855. MUL b2, a2, t3
  856. ADD4 c15, t4, c15
  857. MUL b2, a1, t4
  858. ADD1 c01, t1, c01
  859. MUL b1, a3, t1
  860. ADD3 c02, t2, c02
  861. MUL b1, a4, t2
  862. ADD2 c06, t3, c06
  863. MUL b2, a4, t3
  864. ADD4 c05, t4, c05
  865. MUL b4, a1, t4
  866. ADD1 c03, t1, c03
  867. MUL b3, a1, t1
  868. ADD3 c04, t2, c04
  869. MUL b3, a2, t2
  870. ADD2 c08, t3, c08
  871. MUL b4, a2, t3
  872. ADD4 c13, t4, c13
  873. MUL b2, a3, t4
  874. ADD1 c09, t1, c09
  875. MUL b3, a3, t1
  876. ADD3 c10, t2, c10
  877. MUL b3, a4, t2
  878. ADD2 c14, t3, c14
  879. MUL b4, a4, t3
  880. ADD4 c07, t4, c07
  881. lda AO, 4 * SIZE(AO)
  882. MUL b4, a3, t4
  883. lda BO, 4 * SIZE(BO)
  884. ADD1 c11, t1, c11
  885. ADD3 c12, t2, c12
  886. ADD2 c16, t3, c16
  887. ADD4 c15, t4, c15
  888. ADD c01, c06, c01
  889. ADD c02, c05, c02
  890. ADD c03, c08, c03
  891. ADD c04, c07, c04
  892. ADD c09, c14, c09
  893. ADD c10, c13, c10
  894. ADD c11, c16, c11
  895. ADD c12, c15, c12
  896. .align 4
  897. $L18:
  898. #if defined(LN) || defined(RT)
  899. #ifdef LN
  900. subq KK, 2, TMP1
  901. #else
  902. subq KK, 2, TMP1
  903. #endif
  904. sll TMP1, ZBASE_SHIFT + 1, TMP2
  905. addq AORIG, TMP2, AO
  906. sll TMP1, ZBASE_SHIFT + 1, TMP2
  907. addq B, TMP2, BO
  908. #else
  909. lda AO, -4 * SIZE(AO)
  910. lda BO, -4 * SIZE(BO)
  911. #endif
  912. #if defined(LN) || defined(LT)
  913. LD a1, 0 * SIZE(BO)
  914. LD a2, 1 * SIZE(BO)
  915. LD a3, 2 * SIZE(BO)
  916. LD a4, 3 * SIZE(BO)
  917. LD b1, 4 * SIZE(BO)
  918. LD b2, 5 * SIZE(BO)
  919. LD b3, 6 * SIZE(BO)
  920. LD b4, 7 * SIZE(BO)
  921. SUB a1, c01, c01
  922. SUB a2, c02, c02
  923. SUB a3, c09, c09
  924. SUB a4, c10, c10
  925. SUB b1, c03, c03
  926. SUB b2, c04, c04
  927. SUB b3, c11, c11
  928. SUB b4, c12, c12
  929. #else
  930. LD a1, 0 * SIZE(AO)
  931. LD a2, 1 * SIZE(AO)
  932. LD a3, 2 * SIZE(AO)
  933. LD a4, 3 * SIZE(AO)
  934. LD b1, 4 * SIZE(AO)
  935. LD b2, 5 * SIZE(AO)
  936. LD b3, 6 * SIZE(AO)
  937. LD b4, 7 * SIZE(AO)
  938. SUB a1, c01, c01
  939. SUB a2, c02, c02
  940. SUB a3, c03, c03
  941. SUB a4, c04, c04
  942. SUB b1, c09, c09
  943. SUB b2, c10, c10
  944. SUB b3, c11, c11
  945. SUB b4, c12, c12
  946. #endif
  947. #ifdef LN
  948. LD a1, 6 * SIZE(AO)
  949. LD a2, 7 * SIZE(AO)
  950. LD a3, 4 * SIZE(AO)
  951. LD a4, 5 * SIZE(AO)
  952. MUL a2, c04, t1
  953. MUL a2, c03, t2
  954. MUL a2, c12, t3
  955. MUL a2, c11, t4
  956. MUL a1, c03, c03
  957. MUL a1, c04, c04
  958. MUL a1, c11, c11
  959. MUL a1, c12, c12
  960. ADD5 c03, t1, c03
  961. ADD6 c04, t2, c04
  962. ADD5 c11, t3, c11
  963. ADD6 c12, t4, c12
  964. MUL a3, c03, t1
  965. MUL a3, c04, t2
  966. MUL a3, c11, t3
  967. MUL a3, c12, t4
  968. SUB c01, t1, c01
  969. SUB c02, t2, c02
  970. SUB c09, t3, c09
  971. SUB c10, t4, c10
  972. MUL a4, c04, t1
  973. MUL a4, c03, t2
  974. MUL a4, c12, t3
  975. MUL a4, c11, t4
  976. ADD6 c01, t1, c01
  977. ADD5 c02, t2, c02
  978. ADD6 c09, t3, c09
  979. ADD5 c10, t4, c10
  980. LD a1, 0 * SIZE(AO)
  981. LD a2, 1 * SIZE(AO)
  982. MUL a2, c02, t1
  983. MUL a2, c01, t2
  984. MUL a2, c10, t3
  985. MUL a2, c09, t4
  986. MUL a1, c01, c01
  987. MUL a1, c02, c02
  988. MUL a1, c09, c09
  989. MUL a1, c10, c10
  990. ADD5 c01, t1, c01
  991. ADD6 c02, t2, c02
  992. ADD5 c09, t3, c09
  993. ADD6 c10, t4, c10
  994. #endif
  995. #ifdef LT
  996. LD a1, 0 * SIZE(AO)
  997. LD a2, 1 * SIZE(AO)
  998. LD a3, 2 * SIZE(AO)
  999. LD a4, 3 * SIZE(AO)
  1000. MUL a2, c02, t1
  1001. MUL a2, c01, t2
  1002. MUL a2, c10, t3
  1003. MUL a2, c09, t4
  1004. MUL a1, c01, c01
  1005. MUL a1, c02, c02
  1006. MUL a1, c09, c09
  1007. MUL a1, c10, c10
  1008. ADD5 c01, t1, c01
  1009. ADD6 c02, t2, c02
  1010. ADD5 c09, t3, c09
  1011. ADD6 c10, t4, c10
  1012. MUL a3, c01, t1
  1013. MUL a3, c02, t2
  1014. MUL a3, c09, t3
  1015. MUL a3, c10, t4
  1016. SUB c03, t1, c03
  1017. SUB c04, t2, c04
  1018. SUB c11, t3, c11
  1019. SUB c12, t4, c12
  1020. MUL a4, c02, t1
  1021. MUL a4, c01, t2
  1022. MUL a4, c10, t3
  1023. MUL a4, c09, t4
  1024. ADD6 c03, t1, c03
  1025. ADD5 c04, t2, c04
  1026. ADD6 c11, t3, c11
  1027. ADD5 c12, t4, c12
  1028. LD a1, 6 * SIZE(AO)
  1029. LD a2, 7 * SIZE(AO)
  1030. MUL a2, c04, t1
  1031. MUL a2, c03, t2
  1032. MUL a2, c12, t3
  1033. MUL a2, c11, t4
  1034. MUL a1, c03, c03
  1035. MUL a1, c04, c04
  1036. MUL a1, c11, c11
  1037. MUL a1, c12, c12
  1038. ADD5 c03, t1, c03
  1039. ADD6 c04, t2, c04
  1040. ADD5 c11, t3, c11
  1041. ADD6 c12, t4, c12
  1042. #endif
  1043. #ifdef RN
  1044. LD a1, 0 * SIZE(BO)
  1045. LD a2, 1 * SIZE(BO)
  1046. LD a3, 2 * SIZE(BO)
  1047. LD a4, 3 * SIZE(BO)
  1048. MUL a2, c02, t1
  1049. MUL a2, c01, t2
  1050. MUL a2, c04, t3
  1051. MUL a2, c03, t4
  1052. MUL a1, c01, c01
  1053. MUL a1, c02, c02
  1054. MUL a1, c03, c03
  1055. MUL a1, c04, c04
  1056. ADD5 c01, t1, c01
  1057. ADD6 c02, t2, c02
  1058. ADD5 c03, t3, c03
  1059. ADD6 c04, t4, c04
  1060. MUL a3, c01, t1
  1061. MUL a3, c02, t2
  1062. MUL a3, c03, t3
  1063. MUL a3, c04, t4
  1064. SUB c09, t1, c09
  1065. SUB c10, t2, c10
  1066. SUB c11, t3, c11
  1067. SUB c12, t4, c12
  1068. MUL a4, c02, t1
  1069. MUL a4, c01, t2
  1070. MUL a4, c04, t3
  1071. MUL a4, c03, t4
  1072. ADD6 c09, t1, c09
  1073. ADD5 c10, t2, c10
  1074. ADD6 c11, t3, c11
  1075. ADD5 c12, t4, c12
  1076. LD a1, 6 * SIZE(BO)
  1077. LD a2, 7 * SIZE(BO)
  1078. MUL a2, c10, t1
  1079. MUL a2, c09, t2
  1080. MUL a2, c12, t3
  1081. MUL a2, c11, t4
  1082. MUL a1, c09, c09
  1083. MUL a1, c10, c10
  1084. MUL a1, c11, c11
  1085. MUL a1, c12, c12
  1086. ADD5 c09, t1, c09
  1087. ADD6 c10, t2, c10
  1088. ADD5 c11, t3, c11
  1089. ADD6 c12, t4, c12
  1090. #endif
  1091. #ifdef RT
  1092. LD a1, 6 * SIZE(BO)
  1093. LD a2, 7 * SIZE(BO)
  1094. LD a3, 4 * SIZE(BO)
  1095. LD a4, 5 * SIZE(BO)
  1096. MUL a2, c10, t1
  1097. MUL a2, c09, t2
  1098. MUL a2, c12, t3
  1099. MUL a2, c11, t4
  1100. MUL a1, c09, c09
  1101. MUL a1, c10, c10
  1102. MUL a1, c11, c11
  1103. MUL a1, c12, c12
  1104. ADD5 c09, t1, c09
  1105. ADD6 c10, t2, c10
  1106. ADD5 c11, t3, c11
  1107. ADD6 c12, t4, c12
  1108. MUL a3, c09, t1
  1109. MUL a3, c10, t2
  1110. MUL a3, c11, t3
  1111. MUL a3, c12, t4
  1112. SUB c01, t1, c01
  1113. SUB c02, t2, c02
  1114. SUB c03, t3, c03
  1115. SUB c04, t4, c04
  1116. MUL a4, c10, t1
  1117. MUL a4, c09, t2
  1118. MUL a4, c12, t3
  1119. MUL a4, c11, t4
  1120. ADD6 c01, t1, c01
  1121. ADD5 c02, t2, c02
  1122. ADD6 c03, t3, c03
  1123. ADD5 c04, t4, c04
  1124. LD a1, 0 * SIZE(BO)
  1125. LD a2, 1 * SIZE(BO)
  1126. MUL a2, c02, t1
  1127. MUL a2, c01, t2
  1128. MUL a2, c04, t3
  1129. MUL a2, c03, t4
  1130. MUL a1, c01, c01
  1131. MUL a1, c02, c02
  1132. MUL a1, c03, c03
  1133. MUL a1, c04, c04
  1134. ADD5 c01, t1, c01
  1135. ADD6 c02, t2, c02
  1136. ADD5 c03, t3, c03
  1137. ADD6 c04, t4, c04
  1138. #endif
  1139. #if defined(LN) || defined(LT)
  1140. ST c01, 0 * SIZE(BO)
  1141. ST c02, 1 * SIZE(BO)
  1142. ST c09, 2 * SIZE(BO)
  1143. ST c10, 3 * SIZE(BO)
  1144. ST c03, 4 * SIZE(BO)
  1145. ST c04, 5 * SIZE(BO)
  1146. ST c11, 6 * SIZE(BO)
  1147. ST c12, 7 * SIZE(BO)
  1148. #else
  1149. ST c01, 0 * SIZE(AO)
  1150. ST c02, 1 * SIZE(AO)
  1151. ST c03, 2 * SIZE(AO)
  1152. ST c04, 3 * SIZE(AO)
  1153. ST c09, 4 * SIZE(AO)
  1154. ST c10, 5 * SIZE(AO)
  1155. ST c11, 6 * SIZE(AO)
  1156. ST c12, 7 * SIZE(AO)
  1157. #endif
  1158. #ifdef LN
  1159. lda C1, -4 * SIZE(C1)
  1160. lda C2, -4 * SIZE(C2)
  1161. #endif
  1162. ST c01, 0 * SIZE(C1)
  1163. ST c02, 1 * SIZE(C1)
  1164. ST c03, 2 * SIZE(C1)
  1165. ST c04, 3 * SIZE(C1)
  1166. ST c09, 0 * SIZE(C2)
  1167. ST c10, 1 * SIZE(C2)
  1168. ST c11, 2 * SIZE(C2)
  1169. ST c12, 3 * SIZE(C2)
  1170. #ifndef LN
  1171. lda C1, 4 * SIZE(C1)
  1172. lda C2, 4 * SIZE(C2)
  1173. #endif
  1174. fclr t1
  1175. fclr t2
  1176. fclr t3
  1177. fclr t4
  1178. #ifdef RT
  1179. sll K, ZBASE_SHIFT + 1, TMP1
  1180. addq AORIG, TMP1, AORIG
  1181. #endif
  1182. #if defined(LT) || defined(RN)
  1183. subq K, KK, TMP1
  1184. sll TMP1, ZBASE_SHIFT + 1, TMP1
  1185. addq AO, TMP1, AO
  1186. addq BO, TMP1, BO
  1187. #endif
  1188. #ifdef LT
  1189. addq KK, 2, KK
  1190. #endif
  1191. #ifdef LN
  1192. subq KK, 2, KK
  1193. #endif
  1194. fclr c01
  1195. fclr c05
  1196. lda I, -1(I)
  1197. bgt I, $L11
  1198. .align 4
  1199. $L29:
  1200. #ifdef LN
  1201. sll K, ZBASE_SHIFT + 1, TMP1
  1202. addq B, TMP1, B
  1203. #endif
  1204. #if defined(LT) || defined(RN)
  1205. mov BO, B
  1206. #endif
  1207. #ifdef RN
  1208. addq KK, 2, KK
  1209. #endif
  1210. #ifdef RT
  1211. subq KK, 2, KK
  1212. #endif
  1213. lda J, -1(J)
  1214. bgt J, $L01
  1215. .align 4
  1216. $L30:
  1217. and N, 1, J
  1218. ble J, $L999
  1219. #ifdef RT
  1220. sll K, ZBASE_SHIFT, TMP1
  1221. subq B, TMP1, B
  1222. subq C, LDC, C1
  1223. subq C, LDC, C
  1224. #else
  1225. mov C, C1
  1226. addq C, LDC, C
  1227. #endif
  1228. #ifdef LN
  1229. addq M, OFFSET, KK
  1230. #endif
  1231. #ifdef LT
  1232. mov OFFSET, KK
  1233. #endif
  1234. #if defined(LN) || defined(RT)
  1235. mov A, AORIG
  1236. #else
  1237. mov A, AO
  1238. #endif
  1239. and M, 1, I
  1240. ble I, $L50
  1241. #if defined(LT) || defined(RN)
  1242. LD a1, 0 * SIZE(AO)
  1243. fclr t1
  1244. LD a2, 1 * SIZE(AO)
  1245. fclr t2
  1246. LD a3, 2 * SIZE(AO)
  1247. fclr t3
  1248. LD a4, 3 * SIZE(AO)
  1249. fclr t4
  1250. LD b1, 0 * SIZE(B)
  1251. fclr c01
  1252. LD b2, 1 * SIZE(B)
  1253. fclr c05
  1254. LD b3, 2 * SIZE(B)
  1255. fclr c02
  1256. LD b4, 3 * SIZE(B)
  1257. fclr c06
  1258. lda AO, 2 * SIZE(AO)
  1259. lda BO, 2 * SIZE(B)
  1260. lda L, -2(KK)
  1261. ble KK, $L58
  1262. ble L, $L55
  1263. #else
  1264. #ifdef LN
  1265. sll K, ZBASE_SHIFT, TMP1
  1266. subq AORIG, TMP1, AORIG
  1267. #endif
  1268. sll KK, ZBASE_SHIFT, TMP1
  1269. addq AORIG, TMP1, AO
  1270. sll KK, ZBASE_SHIFT, TMP1
  1271. addq B, TMP1, BO
  1272. subq K, KK, TMP1
  1273. LD a1, 0 * SIZE(AO)
  1274. fclr t1
  1275. LD a2, 1 * SIZE(AO)
  1276. fclr t2
  1277. LD a3, 2 * SIZE(AO)
  1278. fclr t3
  1279. LD a4, 3 * SIZE(AO)
  1280. fclr t4
  1281. LD b1, 0 * SIZE(BO)
  1282. fclr c01
  1283. LD b2, 1 * SIZE(BO)
  1284. fclr c05
  1285. LD b3, 2 * SIZE(BO)
  1286. fclr c02
  1287. LD b4, 3 * SIZE(BO)
  1288. fclr c06
  1289. lda AO, 2 * SIZE(AO)
  1290. lda BO, 2 * SIZE(BO)
  1291. lda L, -2(TMP1)
  1292. ble TMP1, $L58
  1293. ble L, $L55
  1294. #endif
  1295. .align 5
  1296. $L52:
  1297. ADD1 c01, t1, c01
  1298. unop
  1299. MUL a1, b1, t1
  1300. unop
  1301. ADD3 c02, t2, c02
  1302. lda AO, 4 * SIZE(AO)
  1303. MUL a2, b1, t2
  1304. LD b1, 2 * SIZE(BO)
  1305. ADD4 c05, t3, c05
  1306. lda L, -2(L)
  1307. MUL a1, b2, t3
  1308. LD a1, -2 * SIZE(AO)
  1309. ADD2 c06, t4, c06
  1310. unop
  1311. MUL a2, b2, t4
  1312. LD a2, -1 * SIZE(AO)
  1313. ADD1 c01, t1, c01
  1314. LD b2, 3 * SIZE(BO)
  1315. MUL a3, b3, t1
  1316. lda BO, 4 * SIZE(BO)
  1317. ADD3 c02, t2, c02
  1318. unop
  1319. MUL a4, b3, t2
  1320. LD b3, 0 * SIZE(BO)
  1321. ADD4 c05, t3, c05
  1322. unop
  1323. MUL a3, b4, t3
  1324. LD a3, 0 * SIZE(AO)
  1325. ADD2 c06, t4, c06
  1326. MUL a4, b4, t4
  1327. LD b4, 1 * SIZE(BO)
  1328. unop
  1329. LD a4, 1 * SIZE(AO)
  1330. unop
  1331. unop
  1332. bgt L, $L52
  /* $L55: loop tail.
   * Folds the pending t1 into c01 and starts the next a1 * b1 product,
   * then tests the low bit of the step count (KK when LT or RN is
   * defined, TMP1 otherwise).  If the bit is set, control jumps to the
   * drain at $L57; if it is clear, one extra multiply-accumulate step
   * runs first, reloading a1, a2, b1, b2 and advancing AO and BO by
   * 2 * SIZE each. */
  1333. .align 4
  1334. $L55:
  1335. ADD1 c01, t1, c01
  1336. MUL a1, b1, t1
  1337. #if defined(LT) || defined(RN)
  1338. blbs KK, $L57
  1339. #else
  1340. blbs TMP1, $L57
  1341. #endif
  1342. .align 4
  /* Low bit clear: perform one more step before draining. */
  1343. ADD3 c02, t2, c02
  1344. unop
  1345. MUL a2, b1, t2
  1346. LD b1, 0 * SIZE(BO)
  1347. ADD4 c05, t3, c05
  1348. lda BO, 2 * SIZE(BO)
  1349. MUL a1, b2, t3
  1350. LD a1, 0 * SIZE(AO)
  1351. ADD2 c06, t4, c06
  1352. unop
  1353. MUL a2, b2, t4
  1354. LD a2, 1 * SIZE(AO)
  1355. ADD1 c01, t1, c01
  /* BO was already advanced, so -1 * SIZE addresses the old 1 * SIZE slot. */
  1356. LD b2, -1 * SIZE(BO)
  1357. MUL a1, b1, t1
  1358. lda AO, 2 * SIZE(AO)
  /* $L57: drain.
   * Issues the remaining three products of the final step, advances AO
   * and BO by 2 * SIZE, folds every pending t1..t4 into its accumulator,
   * and finally merges c06 into c01 and c05 into c02 with ADD, leaving
   * the accumulated result in c01 / c02. */
  1359. .align 4
  1360. $L57:
  1361. ADD3 c02, t2, c02
  1362. MUL a2, b1, t2
  1363. ADD4 c05, t3, c05
  1364. MUL a1, b2, t3
  1365. ADD2 c06, t4, c06
  1366. lda AO, 2 * SIZE(AO)
  1367. MUL a2, b2, t4
  1368. lda BO, 2 * SIZE(BO)
  /* No more products are issued: flush t1..t4. */
  1369. ADD1 c01, t1, c01
  1370. ADD3 c02, t2, c02
  1371. ADD4 c05, t3, c05
  1372. ADD2 c06, t4, c06
  /* Collapse the four partial sums into the two result registers. */
  1373. ADD c01, c06, c01
  1374. ADD c02, c05, c02
  /* $L58: solve and write back the value pair (c01, c02).
   * NOTE(review): c01 / c02 presumably hold the real and imaginary parts
   * of one complex entry; the aliases and the SUB / ADD5 / ADD6 macros
   * are defined outside this chunk. */
  1375. $L58:
  /* Position AO / BO on the operands of the solve.  LN and RT rebuild
   * them from AORIG and B at byte offset (KK - 1) << ZBASE_SHIFT; the
   * other variants step both pointers back by 2 * SIZE. */
  1376. #if defined(LN) || defined(RT)
  1377. subq KK, 1, TMP1
  1378. sll TMP1, ZBASE_SHIFT, TMP2
  1379. addq AORIG, TMP2, AO
  1380. sll TMP1, ZBASE_SHIFT, TMP2
  1381. addq B, TMP2, BO
  1382. #else
  1383. lda AO, -2 * SIZE(AO)
  1384. lda BO, -2 * SIZE(BO)
  1385. #endif
  /* Load the stored pair (from BO for LN/LT, from AO for RN/RT) and
   * replace c01 / c02 with SUB(loaded, accumulated) -- assuming SUB uses
   * the Alpha src1, src2, dest operand order this is loaded minus
   * accumulated. */
  1386. #if defined(LN) || defined(LT)
  1387. LD a1, 0 * SIZE(BO)
  1388. LD a2, 1 * SIZE(BO)
  1389. SUB a1, c01, c01
  1390. SUB a2, c02, c02
  1391. #else
  1392. LD a1, 0 * SIZE(AO)
  1393. LD a2, 1 * SIZE(AO)
  1394. SUB a1, c01, c01
  1395. SUB a2, c02, c02
  1396. #endif
  /* Multiply (c01, c02) by the pair (a1, a2) read from AO: the cross
   * products go to t1 / t2 and are merged through ADD5 / ADD6.
   * NOTE(review): looks like a complex multiply by an already inverted
   * diagonal entry -- confirm against the packing routine. */
  1397. #if defined(LN) || defined(LT)
  1398. LD a1, 0 * SIZE(AO)
  1399. LD a2, 1 * SIZE(AO)
  1400. MUL a2, c02, t1
  1401. MUL a2, c01, t2
  1402. MUL a1, c01, c01
  1403. MUL a1, c02, c02
  1404. ADD5 c01, t1, c01
  1405. ADD6 c02, t2, c02
  1406. #endif
  /* Same multiply for RN / RT, with the pair read from BO instead. */
  1407. #if defined(RN) || defined(RT)
  1408. LD a1, 0 * SIZE(BO)
  1409. LD a2, 1 * SIZE(BO)
  1410. MUL a2, c02, t1
  1411. MUL a2, c01, t2
  1412. MUL a1, c01, c01
  1413. MUL a1, c02, c02
  1414. ADD5 c01, t1, c01
  1415. ADD6 c02, t2, c02
  1416. #endif
  /* Write the result back to the buffer it was loaded from. */
  1417. #if defined(LN) || defined(LT)
  1418. ST c01, 0 * SIZE(BO)
  1419. ST c02, 1 * SIZE(BO)
  1420. #else
  1421. ST c01, 0 * SIZE(AO)
  1422. ST c02, 1 * SIZE(AO)
  1423. #endif
  /* Store to C1 as well.  With LN, C1 is moved back by 2 * SIZE before
   * the store; in every other variant it is advanced by 2 * SIZE after. */
  1424. #ifdef LN
  1425. lda C1, -2 * SIZE(C1)
  1426. #endif
  1427. ST c01, 0 * SIZE(C1)
  1428. ST c02, 1 * SIZE(C1)
  1429. #ifndef LN
  1430. lda C1, 2 * SIZE(C1)
  1431. #endif
  /* Bookkeeping for what follows: RT advances AORIG by K << ZBASE_SHIFT. */
  1432. #ifdef RT
  1433. sll K, ZBASE_SHIFT, TMP1
  1434. addq AORIG, TMP1, AORIG
  1435. #endif
  /* LT / RN advance AO and BO by (K - KK) << ZBASE_SHIFT. */
  1436. #if defined(LT) || defined(RN)
  1437. subq K, KK, TMP1
  1438. sll TMP1, ZBASE_SHIFT, TMP2
  1439. addq AO, TMP2, AO
  1440. sll TMP1, ZBASE_SHIFT, TMP2
  1441. addq BO, TMP2, BO
  1442. #endif
  /* KK moves by one: up for LT, down for LN. */
  1443. #ifdef LT
  1444. addq KK, 1, KK
  1445. #endif
  1446. #ifdef LN
  1447. subq KK, 1, KK
  1448. #endif
  /* $L50: I = M >> 1 (arithmetic shift).  When I <= 0 the $L41 loop is
   * skipped entirely and control continues at $L59. */
  1449. .align 4
  1450. $L50:
  1451. sra M, 1, I
  1452. ble I, $L59
  /* $L41: per-iteration setup for the wider tile (four A values and two
   * B values per step).  Clears t1..t4 and c01..c08, preloads a1..a4 and
   * b1..b4, moves AO forward by 4 * SIZE and BO by 2 * SIZE, and sets
   * L = count - 2.  If count <= 0 control goes to $L48 without
   * accumulating; if L <= 0 it goes straight to the tail at $L45;
   * otherwise it falls into the loop at $L42. */
  1453. .align 4
  1454. $L41:
  /* LT / RN: count is KK, AO continues from its current value and BO is
   * derived from B. */
  1455. #if defined(LT) || defined(RN)
  1456. LD a1, 0 * SIZE(AO)
  1457. fclr t1
  1458. LD a2, 1 * SIZE(AO)
  1459. fclr t2
  1460. LD a3, 2 * SIZE(AO)
  1461. fclr t3
  1462. LD a4, 3 * SIZE(AO)
  1463. fclr t4
  1464. LD b1, 0 * SIZE(B)
  1465. fclr c01
  1466. LD b2, 1 * SIZE(B)
  1467. fclr c05
  1468. LD b3, 2 * SIZE(B)
  1469. fclr c02
  1470. LD b4, 3 * SIZE(B)
  1471. fclr c06
  1472. lda BO, 2 * SIZE(B)
  1473. fclr c03
  1474. lda AO, 4 * SIZE(AO)
  1475. fclr c07
  1476. lda L, -2(KK)
  1477. fclr c04
  1478. fclr c08
  1479. ble KK, $L48
  1480. ble L, $L45
  1481. #else
  /* Other variants: LN first moves AORIG back by K << (ZBASE_SHIFT + 1).
   * Then AO = AORIG + (KK << (ZBASE_SHIFT + 1)),
   *      BO = B     + (KK <<  ZBASE_SHIFT),
   * and the count is TMP1 = K - KK. */
  1482. #ifdef LN
  1483. sll K, ZBASE_SHIFT + 1, TMP1
  1484. subq AORIG, TMP1, AORIG
  1485. #endif
  1486. sll KK, ZBASE_SHIFT + 1, TMP1
  1487. addq AORIG, TMP1, AO
  1488. sll KK, ZBASE_SHIFT, TMP1
  1489. addq B, TMP1, BO
  1490. subq K, KK, TMP1
  1491. LD a1, 0 * SIZE(AO)
  1492. fclr t1
  1493. LD a2, 1 * SIZE(AO)
  1494. fclr t2
  1495. LD a3, 2 * SIZE(AO)
  1496. fclr t3
  1497. LD a4, 3 * SIZE(AO)
  1498. fclr t4
  1499. LD b1, 0 * SIZE(BO)
  1500. fclr c01
  1501. LD b2, 1 * SIZE(BO)
  1502. fclr c05
  1503. LD b3, 2 * SIZE(BO)
  1504. fclr c02
  1505. LD b4, 3 * SIZE(BO)
  1506. fclr c06
  1507. lda BO, 2 * SIZE(BO)
  1508. fclr c03
  1509. lda AO, 4 * SIZE(AO)
  1510. fclr c07
  1511. lda L, -2(TMP1)
  1512. fclr c04
  1513. fclr c08
  1514. ble TMP1, $L48
  1515. ble L, $L45
  1516. #endif
  /* $L42: software-pipelined multiply-accumulate loop for the wider tile.
   * Each pass multiplies one group of four A values by b1 and b2, then a
   * second group of four by b3 and b4, advancing AO by 8 * SIZE and BO by
   * 4 * SIZE and lowering L by 2; it repeats while L > 0.  Every ADDn
   * folds a product issued earlier, so four products are still pending
   * in t1..t4 on exit.
   * The fourth value of the second group is loaded into a5 rather than
   * a4, because a4 is still being consumed at that point; a4 is reloaded
   * near the end of the pass for the next one.
   * NOTE(review): ADDn / MUL / LD and the register aliases are defined
   * outside this chunk; `unop` lines are no-ops, presumably padding. */
  1517. .align 5
  1518. $L42:
  /* Group 1 times b1 -> c05..c08. */
  1519. ADD4 c05, t1, c05
  1520. unop
  1521. MUL a1, b1, t1
  1522. unop
  1523. ADD2 c06, t2, c06
  1524. lda L, -2(L)
  1525. MUL a2, b1, t2
  1526. unop
  1527. ADD4 c07, t3, c07
  1528. unop
  1529. MUL a3, b1, t3
  1530. unop
  1531. ADD2 c08, t4, c08
  1532. unop
  1533. MUL a4, b1, t4
  1534. LD b1, 2 * SIZE(BO)
  /* Group 1 times b2 -> c01..c04; group 2 is loaded as registers free up. */
  1535. ADD1 c01, t1, c01
  1536. unop
  1537. MUL a1, b2, t1
  1538. LD a1, 0 * SIZE(AO)
  1539. ADD3 c02, t2, c02
  1540. lda BO, 4 * SIZE(BO)
  1541. MUL a2, b2, t2
  1542. LD a2, 1 * SIZE(AO)
  1543. ADD1 c03, t3, c03
  1544. unop
  1545. MUL a3, b2, t3
  1546. LD a3, 2 * SIZE(AO)
  1547. ADD3 c04, t4, c04
  1548. unop
  1549. MUL a4, b2, t4
  1550. LD a5, 3 * SIZE(AO)
  /* Group 2 times b3 -> c05..c08. */
  1551. ADD4 c05, t1, c05
  1552. unop
  1553. MUL a1, b3, t1
  1554. LD b2, -1 * SIZE(BO)
  1555. ADD2 c06, t2, c06
  1556. unop
  1557. MUL a2, b3, t2
  1558. unop
  1559. ADD4 c07, t3, c07
  1560. unop
  1561. MUL a3, b3, t3
  1562. lda AO, 8 * SIZE(AO)
  1563. ADD2 c08, t4, c08
  1564. unop
  1565. MUL a5, b3, t4
  1566. LD b3, 0 * SIZE(BO)
  /* Group 2 times b4 -> c01..c04; operands for the next pass are
   * reloaded through the already advanced AO (negative offsets). */
  1567. ADD1 c01, t1, c01
  1568. unop
  1569. MUL a1, b4, t1
  1570. LD a1, -4 * SIZE(AO)
  1571. ADD3 c02, t2, c02
  1572. unop
  1573. MUL a2, b4, t2
  1574. LD a2, -3 * SIZE(AO)
  1575. ADD1 c03, t3, c03
  1576. LD a4, -1 * SIZE(AO)
  1577. MUL a3, b4, t3
  1578. LD a3, -2 * SIZE(AO)
  1579. ADD3 c04, t4, c04
  1580. MUL a5, b4, t4
  1581. LD b4, 1 * SIZE(BO)
  1582. bgt L, $L42
  /* $L45: loop tail for the wider tile.
   * Folds the pending t1 into c05 and starts the next a1 * b1 product,
   * then tests the low bit of the step count (KK when LT or RN is
   * defined, TMP1 otherwise).  If the bit is set, control jumps to the
   * drain at $L47; if it is clear, one extra multiply-accumulate step
   * runs first, reloading a1..a4, b1, b2 and advancing AO by 4 * SIZE and
   * BO by 2 * SIZE. */
  1583. .align 4
  1584. $L45:
  1585. ADD4 c05, t1, c05
  1586. MUL b1, a1, t1
  1587. #if defined(LT) || defined(RN)
  1588. blbs KK, $L47
  1589. #else
  1590. blbs TMP1, $L47
  1591. #endif
  1592. .align 4
  /* Low bit clear: perform one more step before draining. */
  1593. ADD2 c06, t2, c06
  1594. MUL a2, b1, t2
  1595. ADD4 c07, t3, c07
  1596. MUL a3, b1, t3
  1597. ADD2 c08, t4, c08
  1598. unop
  1599. MUL a4, b1, t4
  1600. LD b1, 0 * SIZE(BO)
  1601. ADD1 c01, t1, c01
  1602. unop
  1603. MUL a1, b2, t1
  1604. LD a1, 0 * SIZE(AO)
  1605. ADD3 c02, t2, c02
  1606. unop
  1607. MUL a2, b2, t2
  1608. LD a2, 1 * SIZE(AO)
  1609. ADD1 c03, t3, c03
  1610. unop
  1611. MUL a3, b2, t3
  1612. LD a3, 2 * SIZE(AO)
  1613. ADD3 c04, t4, c04
  1614. MUL a4, b2, t4
  1615. LD a4, 3 * SIZE(AO)
  1616. lda AO, 4 * SIZE(AO)
  1617. ADD4 c05, t1, c05
  1618. LD b2, 1 * SIZE(BO)
  1619. MUL a1, b1, t1
  1620. lda BO, 2 * SIZE(BO)
  /* $L47: drain for the wider tile.
   * Issues the remaining products of the final step, advances AO by
   * 4 * SIZE and BO by 2 * SIZE, folds every pending t1..t4 into its
   * accumulator, and finally merges c06 -> c01, c05 -> c02, c08 -> c03
   * and c07 -> c04 with ADD, leaving the result in c01..c04. */
  1621. .align 4
  1622. $L47:
  1623. ADD2 c06, t2, c06
  1624. MUL a2, b1, t2
  1625. ADD4 c07, t3, c07
  1626. MUL a3, b1, t3
  1627. ADD2 c08, t4, c08
  1628. MUL a4, b1, t4
  1629. ADD1 c01, t1, c01
  1630. MUL a1, b2, t1
  1631. ADD3 c02, t2, c02
  1632. MUL a2, b2, t2
  1633. ADD1 c03, t3, c03
  1634. MUL a3, b2, t3
  1635. ADD3 c04, t4, c04
  1636. lda AO, 4 * SIZE(AO)
  1637. MUL a4, b2, t4
  1638. lda BO, 2 * SIZE(BO)
  /* No more products are issued: flush t1..t4. */
  1639. ADD4 c05, t1, c05
  1640. ADD2 c06, t2, c06
  1641. ADD4 c07, t3, c07
  1642. ADD2 c08, t4, c08
  /* Collapse the eight partial sums into the four result registers. */
  1643. ADD c01, c06, c01
  1644. ADD c02, c05, c02
  1645. ADD c03, c08, c03
  1646. ADD c04, c07, c04
  /* $L48: solve and write back two value pairs, (c01, c02) and
   * (c03, c04), then loop back to $L41 while I > 0.
   * NOTE(review): each pair presumably holds the real and imaginary
   * parts of one complex entry, and the values multiplied in from AO / BO
   * look like pre-inverted diagonal entries -- the aliases, the SUB /
   * ADD5 / ADD6 macros and the packing routine are outside this chunk. */
  1647. $L48:
  /* Position AO / BO on the operands of the solve.  LN uses offset
   * KK - 2 and RT uses KK - 1, scaled by ZBASE_SHIFT + 1 for AO and
   * ZBASE_SHIFT for BO; LT / RN simply step back by 4 * SIZE and
   * 2 * SIZE respectively. */
  1648. #if defined(LN) || defined(RT)
  1649. #ifdef LN
  1650. subq KK, 2, TMP1
  1651. #else
  1652. subq KK, 1, TMP1
  1653. #endif
  1654. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1655. addq AORIG, TMP2, AO
  1656. sll TMP1, ZBASE_SHIFT, TMP2
  1657. addq B, TMP2, BO
  1658. #else
  1659. lda AO, -4 * SIZE(AO)
  1660. lda BO, -2 * SIZE(BO)
  1661. #endif
  /* Load the four stored values (from BO for LN/LT, from AO for RN/RT)
   * and replace c01..c04 with SUB(loaded, accumulated) -- assuming SUB
   * uses the Alpha src1, src2, dest operand order this is loaded minus
   * accumulated. */
  1662. #if defined(LN) || defined(LT)
  1663. LD a1, 0 * SIZE(BO)
  1664. LD a2, 1 * SIZE(BO)
  1665. LD a3, 2 * SIZE(BO)
  1666. LD a4, 3 * SIZE(BO)
  1667. SUB a1, c01, c01
  1668. SUB a2, c02, c02
  1669. SUB a3, c03, c03
  1670. SUB a4, c04, c04
  1671. #else
  1672. LD a1, 0 * SIZE(AO)
  1673. LD a2, 1 * SIZE(AO)
  1674. LD a3, 2 * SIZE(AO)
  1675. LD a4, 3 * SIZE(AO)
  1676. SUB a1, c01, c01
  1677. SUB a2, c02, c02
  1678. SUB a3, c03, c03
  1679. SUB a4, c04, c04
  1680. #endif
  /* LN: second pair first.  Scale (c03, c04) by the pair at AO[6..7],
   * remove its contribution from (c01, c02) using the pair at AO[4..5],
   * then scale (c01, c02) by the pair at AO[0..1]. */
  1681. #ifdef LN
  1682. LD a1, 6 * SIZE(AO)
  1683. LD a2, 7 * SIZE(AO)
  1684. LD a3, 4 * SIZE(AO)
  1685. LD a4, 5 * SIZE(AO)
  1686. MUL a2, c04, t1
  1687. MUL a2, c03, t2
  1688. MUL a1, c03, c03
  1689. MUL a1, c04, c04
  1690. ADD5 c03, t1, c03
  1691. ADD6 c04, t2, c04
  1692. MUL a3, c03, t1
  1693. MUL a3, c04, t2
  1694. SUB c01, t1, c01
  1695. SUB c02, t2, c02
  1696. MUL a4, c04, t1
  1697. MUL a4, c03, t2
  1698. ADD6 c01, t1, c01
  1699. ADD5 c02, t2, c02
  1700. LD a1, 0 * SIZE(AO)
  1701. LD a2, 1 * SIZE(AO)
  1702. MUL a2, c02, t1
  1703. MUL a2, c01, t2
  1704. MUL a1, c01, c01
  1705. MUL a1, c02, c02
  1706. ADD5 c01, t1, c01
  1707. ADD6 c02, t2, c02
  1708. #endif
  /* LT: first pair first.  Scale (c01, c02) by the pair at AO[0..1],
   * remove its contribution from (c03, c04) using the pair at AO[2..3],
   * then scale (c03, c04) by the pair at AO[6..7]. */
  1709. #ifdef LT
  1710. LD a1, 0 * SIZE(AO)
  1711. LD a2, 1 * SIZE(AO)
  1712. LD a3, 2 * SIZE(AO)
  1713. LD a4, 3 * SIZE(AO)
  1714. MUL a2, c02, t1
  1715. MUL a2, c01, t2
  1716. MUL a1, c01, c01
  1717. MUL a1, c02, c02
  1718. ADD5 c01, t1, c01
  1719. ADD6 c02, t2, c02
  1720. MUL a3, c01, t1
  1721. MUL a3, c02, t2
  1722. SUB c03, t1, c03
  1723. SUB c04, t2, c04
  1724. MUL a4, c02, t1
  1725. MUL a4, c01, t2
  1726. ADD6 c03, t1, c03
  1727. ADD5 c04, t2, c04
  1728. LD a1, 6 * SIZE(AO)
  1729. LD a2, 7 * SIZE(AO)
  1730. MUL a2, c04, t1
  1731. MUL a2, c03, t2
  1732. MUL a1, c03, c03
  1733. MUL a1, c04, c04
  1734. ADD5 c03, t1, c03
  1735. ADD6 c04, t2, c04
  1736. #endif
  /* RN / RT: both pairs are scaled by the single pair at BO[0..1]; no
   * elimination step between them. */
  1737. #if defined(RN) || defined(RT)
  1738. LD a1, 0 * SIZE(BO)
  1739. LD a2, 1 * SIZE(BO)
  1740. MUL a2, c02, t1
  1741. MUL a2, c01, t2
  1742. MUL a2, c04, t3
  1743. MUL a2, c03, t4
  1744. MUL a1, c01, c01
  1745. MUL a1, c02, c02
  1746. MUL a1, c03, c03
  1747. MUL a1, c04, c04
  1748. ADD5 c01, t1, c01
  1749. ADD6 c02, t2, c02
  1750. ADD5 c03, t3, c03
  1751. ADD6 c04, t4, c04
  1752. #endif
  /* Write the results back to the buffer they were loaded from. */
  1753. #if defined(LN) || defined(LT)
  1754. ST c01, 0 * SIZE(BO)
  1755. ST c02, 1 * SIZE(BO)
  1756. ST c03, 2 * SIZE(BO)
  1757. ST c04, 3 * SIZE(BO)
  1758. #else
  1759. ST c01, 0 * SIZE(AO)
  1760. ST c02, 1 * SIZE(AO)
  1761. ST c03, 2 * SIZE(AO)
  1762. ST c04, 3 * SIZE(AO)
  1763. #endif
  /* Store to C1 as well.  With LN, C1 is moved back by 4 * SIZE before
   * the store; in every other variant it is advanced by 4 * SIZE after. */
  1764. #ifdef LN
  1765. lda C1, -4 * SIZE(C1)
  1766. #endif
  1767. ST c01, 0 * SIZE(C1)
  1768. ST c02, 1 * SIZE(C1)
  1769. ST c03, 2 * SIZE(C1)
  1770. ST c04, 3 * SIZE(C1)
  1771. #ifndef LN
  1772. lda C1, 4 * SIZE(C1)
  1773. #endif
  /* Bookkeeping: RT advances AORIG by K << (ZBASE_SHIFT + 1). */
  1774. #ifdef RT
  1775. sll K, ZBASE_SHIFT + 1, TMP1
  1776. addq AORIG, TMP1, AORIG
  1777. #endif
  /* LT / RN advance AO by (K - KK) << (ZBASE_SHIFT + 1) and BO by
   * (K - KK) << ZBASE_SHIFT. */
  1778. #if defined(LT) || defined(RN)
  1779. subq K, KK, TMP1
  1780. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1781. addq AO, TMP2, AO
  1782. sll TMP1, ZBASE_SHIFT, TMP2
  1783. addq BO, TMP2, BO
  1784. #endif
  /* KK moves by two: up for LT, down for LN. */
  1785. #ifdef LT
  1786. addq KK, 2, KK
  1787. #endif
  1788. #ifdef LN
  1789. subq KK, 2, KK
  1790. #endif
  /* One iteration done: decrement I and repeat from $L41 while I > 0. */
  1791. lda I, -1(I)
  1792. bgt I, $L41
  /* $L59: bookkeeping after the $L41 loop (also reached directly from
   * $L50 when I <= 0).
   *   LN      : B  += K << ZBASE_SHIFT
   *   LT / RN : B   = BO
   *   RN      : KK += 1
   *   RT      : KK -= 1
   * Control then falls through to the epilogue at $L999. */
  1793. .align 4
  1794. $L59:
  1795. #ifdef LN
  1796. sll K, ZBASE_SHIFT, TMP1
  1797. addq B, TMP1, B
  1798. #endif
  1799. #if defined(LT) || defined(RN)
  1800. mov BO, B
  1801. #endif
  1802. #ifdef RN
  1803. addq KK, 1, KK
  1804. #endif
  1805. #ifdef RT
  1806. subq KK, 1, KK
  1807. #endif
  /* $L999: routine epilogue.
   * Reloads floating-point registers $f2..$f9 from the eight 8-byte slots
   * at the bottom of the stack frame (presumably saved there by the
   * prologue, which is outside this chunk), sets integer register $0 to
   * zero (the Alpha return-value register), releases STACKSIZE bytes of
   * stack and returns. */
  1808. .align 4
  1809. $L999:
  1810. ldt $f2, 0($sp)
  1811. ldt $f3, 8($sp)
  1812. ldt $f4, 16($sp)
  1813. ldt $f5, 24($sp)
  1814. ldt $f6, 32($sp)
  1815. ldt $f7, 40($sp)
  1816. ldt $f8, 48($sp)
  1817. ldt $f9, 56($sp)
  1818. clr $0
  1819. lda $sp, STACKSIZE($sp)
  1820. ret
  /* Assembler trailer: version identification and end of CNAME. */
  1821. .ident VERSION
  1822. .end CNAME