You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_2x2_LN.S 33 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects that header's assembler-side definitions (header not visible here) */
  39. #include "common.h" /* names used below but not defined in this file (SIZE, ZBASE_SHIFT, LD, ST, ADD, SUB, MUL, SXADDQ, CNAME) presumably come from here */
  40. #if !defined(EV4) && !defined(EV5) && !defined(EV6) /* the build must define one of EV4, EV5 or EV6 */
  41. #error "Architecture is not specified."
  42. #endif
  43. #ifdef EV6 /* EV6 settings */
  44. #define PREFETCHSIZE 56 /* look-ahead distance, in SIZE units, for the loads into $31 at the top of the $L12 loop */
  45. #define UNOP unop /* on EV6 each UNOP in the $L12 loop becomes a real unop instruction */
  46. #endif
  47. #ifdef EV5 /* EV5 settings */
  48. #define PREFETCHSIZE 48 /* same role as above, smaller distance */
  49. #define UNOP /* expands to nothing on EV5 */
  50. #endif
  51. #ifdef EV4 /* EV4 settings: no PREFETCHSIZE; its uses in $L12 are guarded by #ifndef EV4 */
  52. #define UNOP /* expands to nothing on EV4 */
  53. #endif
  54. .set noat /* this file names $at directly (AO is $at; the PROFILE prologue uses it too); presumably needed so the assembler accepts that */
  55. .set noreorder /* instruction order in this file is written out by hand */
  56. .arch ev6 /* NOTE(review): emitted unconditionally, even when EV4 or EV5 is the selected target; confirm this is intended */
  57. .text
  58. .align 5 /* alignment request for the entry point that follows */
  59. .globl CNAME
  60. .ent CNAME
  61. #define STACKSIZE 80 /* bytes the prologue subtracts from $sp; $f2-$f9 are stored at offsets 0..56 of this frame */
  62. #define M $16 /* dimension M; the routine branches to $L999 if M, N or K is <= 0 */
  63. #define N $17 /* dimension N */
  64. #define K $18 /* dimension K */
  65. #define A $21 /* base pointer of A; never loaded in the visible code, so presumably passed in $21 (TODO confirm against caller) */
  66. #define B $22 /* base pointer of B; loaded from 0 + STACKSIZE($sp) in the prologue */
  67. #define C $20 /* base pointer of C; loaded from 8 + STACKSIZE($sp) in the prologue */
  68. #define LDC $23 /* loaded from 16 + STACKSIZE($sp), then shifted left by ZBASE_SHIFT */
  69. #define C1 $19 /* first output pointer for the current pass (C, or C - 2*LDC under RT) */
  70. #define C2 $24 /* second output pointer, LDC bytes after C1 */
  71. #define AO $at /* running pointer into A used by the inner loops */
  72. #define BO $5 /* running pointer into B used by the inner loops */
  73. #define I $6 /* loop counter set from M & 1 and from M >> 1 */
  74. #define J $7 /* loop counter set from N >> 1 and from N & 1 */
  75. #define L $8 /* inner-loop counter: starts at KK - 2 or (K - KK) - 2 and steps by -2 */
  76. #define a1 $f16 /* a1-a4: values loaded from AO in the multiply loops; reused as scratch loads in the solve sections */
  77. #define a2 $f17
  78. #define a3 $f18
  79. #define a4 $f19
  80. #define b1 $f20 /* b1-b4: values loaded from BO in the multiply loops; reused as scratch loads in the solve sections */
  81. #define b2 $f21
  82. #define b3 $f22
  83. #define b4 $f23
  84. #define t1 $f24 /* t1-t4: hold the MUL results that the ADD1..ADD6 steps fold into the c registers */
  85. #define t2 $f25
  86. #define t3 $f26
  87. #define t4 $f27
  88. #define a5 $f28 /* extra A operand used in the unrolled $L12 loop */
  89. #define a6 $f30 /* extra A operand used in the unrolled $L12 loop */
  90. #define b5 $f29 /* extra B operand used in the $L12 and $L22 loops */
  91. #define alpha_i $f29 /* same register as b5; not referenced in the visible part of this file */
  92. #define alpha_r $f30 /* same register as a6; not referenced in the visible part of this file */
  93. #define c01 $f0 /* c01-c16: floating-point accumulators in $f0-$f15 */
  94. #define c02 $f1
  95. #define c03 $f2 /* $f2-$f9 (c03-c10) are the registers the prologue saves to the stack */
  96. #define c04 $f3
  97. #define c05 $f4
  98. #define c06 $f5
  99. #define c07 $f6
  100. #define c08 $f7
  101. #define c09 $f8
  102. #define c10 $f9
  103. #define c11 $f10
  104. #define c12 $f11
  105. #define c13 $f12
  106. #define c14 $f13
  107. #define c15 $f14
  108. #define c16 $f15
  109. #define TMP1 $0 /* integer scratch; also holds K - KK as the trip count under LN/RT */
  110. #define TMP2 $1 /* integer scratch */
  111. #define KK $2 /* running offset derived from OFFSET (M + OFFSET for LN, OFFSET for LT, -OFFSET for RN, N - OFFSET for RT) */
  112. #define AORIG $3 /* copy of A used as the base for AO when LN or RT is defined */
  113. #define OFFSET $4 /* loaded from 24 + STACKSIZE($sp) in the prologue */
  114. #if defined(LN) || defined(LT) /* ADD1..ADD6 each resolve to ADD or SUB; ADD and SUB themselves are defined outside this file */
  115. #ifndef CONJ /* LN/LT, non-conjugated */
  116. #define ADD1 ADD /* ADD1-ADD4 are used by the multiply-accumulate loops */
  117. #define ADD2 SUB
  118. #define ADD3 ADD
  119. #define ADD4 ADD
  120. #define ADD5 SUB /* ADD5/ADD6 are used by the solve sections that follow the loops */
  121. #define ADD6 ADD
  122. #else /* LN/LT with CONJ */
  123. #define ADD1 ADD
  124. #define ADD2 ADD
  125. #define ADD3 SUB
  126. #define ADD4 ADD
  127. #define ADD5 ADD
  128. #define ADD6 SUB
  129. #endif
  130. #else /* neither LN nor LT defined (the code below handles RN and RT here) */
  131. #ifndef CONJ /* non-conjugated: same mapping as the LN/LT non-CONJ case above */
  132. #define ADD1 ADD
  133. #define ADD2 SUB
  134. #define ADD3 ADD
  135. #define ADD4 ADD
  136. #define ADD5 SUB
  137. #define ADD6 ADD
  138. #else /* CONJ: differs from the LN/LT CONJ case only in ADD3 and ADD4, which are swapped */
  139. #define ADD1 ADD
  140. #define ADD2 ADD
  141. #define ADD3 ADD
  142. #define ADD4 SUB
  143. #define ADD5 ADD
  144. #define ADD6 SUB
  145. #endif
  146. #endif
  147. CNAME:
  148. .frame $sp, STACKSIZE, $26, 0
  149. #ifdef PROFILE
  150. ldgp $gp, 0($27)
  151. lda $at, _mcount
  152. jsr $at, ($at), _mcount
  153. #endif
  154. #ifndef PROFILE
  155. .prologue 0
  156. #else
  157. .prologue 1
  158. #endif
  159. lda $sp, -STACKSIZE($sp)
  160. ldq B, 0 + STACKSIZE($sp)
  161. ldq C, 8 + STACKSIZE($sp)
  162. ldq LDC, 16 + STACKSIZE($sp)
  163. ldq OFFSET, 24 + STACKSIZE($sp)
  164. sll LDC, ZBASE_SHIFT, LDC
  165. stt $f2, 0($sp)
  166. stt $f3, 8($sp)
  167. stt $f4, 16($sp)
  168. stt $f5, 24($sp)
  169. stt $f6, 32($sp)
  170. stt $f7, 40($sp)
  171. stt $f8, 48($sp)
  172. stt $f9, 56($sp)
  173. cmple M, 0, $0
  174. cmple N, 0, $1
  175. cmple K, 0, $2
  176. or $0, $1, $0
  177. or $0, $2, $0
  178. bne $0, $L999
  179. #ifdef LN
  180. addq M, M, TMP2
  181. mulq TMP2, K, TMP1
  182. SXADDQ TMP1, A, A
  183. SXADDQ TMP2, C, C
  184. #endif
  185. #ifdef RN
  186. negq OFFSET, KK
  187. #endif
  188. #ifdef RT
  189. mulq N, K, TMP1
  190. addq TMP1, TMP1, TMP1
  191. SXADDQ TMP1, B, B
  192. mulq N, LDC, TMP1
  193. addq TMP1, C, C
  194. subq N, OFFSET, KK
  195. #endif
  196. sra N, 1, J
  197. ble J, $L30
  198. .align 4
  199. $L01:
  200. #ifdef RT
  201. sll K, ZBASE_SHIFT + 1, TMP1
  202. subq B, TMP1, B
  203. subq C, LDC, C2
  204. subq C2, LDC, C1
  205. subq C2, LDC, C
  206. #else
  207. mov C, C1
  208. addq C, LDC, C2
  209. addq C2, LDC, C
  210. #endif
  211. #ifdef LN
  212. addq M, OFFSET, KK
  213. #endif
  214. #ifdef LT
  215. mov OFFSET, KK
  216. #endif
  217. #if defined(LN) || defined(RT)
  218. mov A, AORIG
  219. #else
  220. mov A, AO
  221. #endif
  222. and M, 1, I
  223. fclr t1
  224. fclr t2
  225. fclr t3
  226. fclr t4
  227. fclr c01
  228. fclr c05
  229. ble I, $L20
  230. #if defined(LT) || defined(RN)
  231. LD a1, 0 * SIZE(AO)
  232. fclr c09
  233. LD a2, 1 * SIZE(AO)
  234. fclr c13
  235. LD a3, 2 * SIZE(AO)
  236. fclr c02
  237. LD a4, 3 * SIZE(AO)
  238. fclr c06
  239. LD b1, 0 * SIZE(B)
  240. fclr c10
  241. LD b2, 1 * SIZE(B)
  242. fclr c14
  243. LD b3, 2 * SIZE(B)
  244. lda AO, 2 * SIZE(AO)
  245. LD b4, 3 * SIZE(B)
  246. lda BO, 4 * SIZE(B)
  247. lda L, -2(KK)
  248. ble KK, $L28
  249. ble L, $L25
  250. #else
  251. #ifdef LN
  252. sll K, ZBASE_SHIFT + 0, TMP1
  253. subq AORIG, TMP1, AORIG
  254. #endif
  255. sll KK, ZBASE_SHIFT + 0, TMP1
  256. addq AORIG, TMP1, AO
  257. sll KK, ZBASE_SHIFT + 1, TMP1
  258. addq B, TMP1, BO
  259. subq K, KK, TMP1
  260. LD a1, 0 * SIZE(AO)
  261. fclr c09
  262. LD a2, 1 * SIZE(AO)
  263. fclr c13
  264. LD a3, 2 * SIZE(AO)
  265. fclr c02
  266. LD a4, 3 * SIZE(AO)
  267. fclr c06
  268. LD b1, 0 * SIZE(BO)
  269. fclr c10
  270. LD b2, 1 * SIZE(BO)
  271. fclr c14
  272. LD b3, 2 * SIZE(BO)
  273. lda AO, 2 * SIZE(AO)
  274. LD b4, 3 * SIZE(BO)
  275. lda BO, 4 * SIZE(BO)
  276. lda L, -2(TMP1)
  277. ble TMP1, $L28
  278. ble L, $L25
  279. #endif
  280. .align 5
  281. $L22:
  282. ADD1 c09, t1, c09
  283. unop
  284. MUL a1, b1, t1
  285. unop
  286. ADD3 c10, t2, c10
  287. unop
  288. MUL a2, b1, t2
  289. LD b1, 0 * SIZE(BO)
  290. ADD4 c13, t3, c13
  291. unop
  292. MUL a1, b2, t3
  293. lda BO, 8 * SIZE(BO)
  294. ADD2 c14, t4, c14
  295. unop
  296. MUL a2, b2, t4
  297. LD b2, -7 * SIZE(BO)
  298. ADD1 c01, t1, c01
  299. unop
  300. MUL a1, b3, t1
  301. unop
  302. ADD3 c02, t2, c02
  303. unop
  304. MUL a2, b3, t2
  305. LD b3, -6 * SIZE(BO)
  306. ADD4 c05, t3, c05
  307. unop
  308. MUL a1, b4, t3
  309. LD a1, 2 * SIZE(AO)
  310. ADD2 c06, t4, c06
  311. MUL a2, b4, t4
  312. LD b5, -5 * SIZE(BO)
  313. ADD1 c09, t1, c09
  314. unop
  315. MUL a3, b1, t1
  316. LD a2, 3 * SIZE(AO)
  317. ADD3 c10, t2, c10
  318. unop
  319. MUL a4, b1, t2
  320. LD b1, -4 * SIZE(BO)
  321. ADD4 c13, t3, c13
  322. unop
  323. MUL a3, b2, t3
  324. lda AO, 4 * SIZE(AO)
  325. ADD2 c14, t4, c14
  326. MUL a4, b2, t4
  327. LD b2, -3 * SIZE(BO)
  328. ADD1 c01, t1, c01
  329. lda L, -2(L)
  330. MUL a3, b3, t1
  331. LD b4, -1 * SIZE(BO)
  332. ADD3 c02, t2, c02
  333. unop
  334. MUL a4, b3, t2
  335. LD b3, -2 * SIZE(BO)
  336. ADD4 c05, t3, c05
  337. unop
  338. MUL a3, b5, t3
  339. LD a3, 0 * SIZE(AO)
  340. ADD2 c06, t4, c06
  341. MUL a4, b5, t4
  342. LD a4, 1 * SIZE(AO)
  343. bgt L, $L22
  344. .align 4
  345. $L25:
  346. ADD1 c09, t1, c09
  347. MUL a1, b1, t1
  348. #if defined(LT) || defined(RN)
  349. blbs KK, $L27
  350. #else
  351. blbs TMP1, $L27
  352. #endif
  353. .align 4
  354. ADD3 c10, t2, c10
  355. unop
  356. MUL a2, b1, t2
  357. LD b1, 0 * SIZE(BO)
  358. ADD4 c13, t3, c13
  359. unop
  360. MUL a1, b2, t3
  361. unop
  362. ADD2 c14, t4, c14
  363. unop
  364. MUL a2, b2, t4
  365. LD b2, 1 * SIZE(BO)
  366. ADD1 c01, t1, c01
  367. unop
  368. MUL a1, b3, t1
  369. lda AO, 2 * SIZE(AO)
  370. ADD3 c02, t2, c02
  371. unop
  372. MUL a2, b3, t2
  373. LD b3, 2 * SIZE(BO)
  374. ADD4 c05, t3, c05
  375. unop
  376. MUL a1, b4, t3
  377. LD a1, -2 * SIZE(AO)
  378. ADD2 c06, t4, c06
  379. unop
  380. MUL a2, b4, t4
  381. LD a2, -1 * SIZE(AO)
  382. ADD1 c09, t1, c09
  383. LD b4, 3 * SIZE(BO)
  384. MUL a1, b1, t1
  385. lda BO, 4 * SIZE(BO)
  386. .align 4
  387. $L27:
  388. ADD3 c10, t2, c10
  389. MUL a2, b1, t2
  390. ADD4 c13, t3, c13
  391. MUL a1, b2, t3
  392. ADD2 c14, t4, c14
  393. MUL a2, b2, t4
  394. ADD1 c01, t1, c01
  395. MUL a1, b3, t1
  396. ADD3 c02, t2, c02
  397. MUL a2, b3, t2
  398. ADD4 c05, t3, c05
  399. MUL a1, b4, t3
  400. ADD2 c06, t4, c06
  401. lda AO, 2 * SIZE(AO)
  402. MUL a2, b4, t4
  403. lda BO, 4 * SIZE(BO)
  404. ADD1 c09, t1, c09
  405. ADD3 c10, t2, c10
  406. ADD4 c13, t3, c13
  407. ADD2 c14, t4, c14
  408. ADD c01, c06, c01
  409. ADD c02, c05, c02
  410. ADD c09, c14, c09
  411. ADD c10, c13, c10
  412. .align 4
  413. $L28:
  414. #if defined(LN) || defined(RT)
  415. #ifdef LN
  416. subq KK, 1, TMP1
  417. #else
  418. subq KK, 2, TMP1
  419. #endif
  420. sll TMP1, ZBASE_SHIFT + 0, TMP2
  421. addq AORIG, TMP2, AO
  422. sll TMP1, ZBASE_SHIFT + 1, TMP2
  423. addq B, TMP2, BO
  424. #else
  425. lda AO, -2 * SIZE(AO)
  426. lda BO, -4 * SIZE(BO)
  427. #endif
  428. #if defined(LN) || defined(LT)
  429. LD a1, 0 * SIZE(BO)
  430. LD a2, 1 * SIZE(BO)
  431. LD a3, 2 * SIZE(BO)
  432. LD a4, 3 * SIZE(BO)
  433. SUB a1, c01, c01
  434. SUB a2, c02, c02
  435. SUB a3, c09, c09
  436. SUB a4, c10, c10
  437. #else
  438. LD a1, 0 * SIZE(AO)
  439. LD a2, 1 * SIZE(AO)
  440. LD a3, 2 * SIZE(AO)
  441. LD a4, 3 * SIZE(AO)
  442. SUB a1, c01, c01
  443. SUB a2, c02, c02
  444. SUB a3, c09, c09
  445. SUB a4, c10, c10
  446. #endif
  447. #if defined(LN) || defined(LT)
  448. LD a1, 0 * SIZE(AO)
  449. LD a2, 1 * SIZE(AO)
  450. MUL a2, c02, t1
  451. MUL a2, c01, t2
  452. MUL a2, c10, t3
  453. MUL a2, c09, t4
  454. MUL a1, c01, c01
  455. MUL a1, c02, c02
  456. MUL a1, c09, c09
  457. MUL a1, c10, c10
  458. ADD5 c01, t1, c01
  459. ADD6 c02, t2, c02
  460. ADD5 c09, t3, c09
  461. ADD6 c10, t4, c10
  462. #endif
  463. #ifdef RN
  464. LD a1, 0 * SIZE(BO)
  465. LD a2, 1 * SIZE(BO)
  466. LD a3, 2 * SIZE(BO)
  467. LD a4, 3 * SIZE(BO)
  468. MUL a2, c02, t1
  469. MUL a2, c01, t2
  470. MUL a1, c01, c01
  471. MUL a1, c02, c02
  472. ADD5 c01, t1, c01
  473. ADD6 c02, t2, c02
  474. MUL a3, c01, t1
  475. MUL a3, c02, t2
  476. SUB c09, t1, c09
  477. SUB c10, t2, c10
  478. MUL a4, c02, t1
  479. MUL a4, c01, t2
  480. ADD6 c09, t1, c09
  481. ADD5 c10, t2, c10
  482. LD a1, 6 * SIZE(BO)
  483. LD a2, 7 * SIZE(BO)
  484. MUL a2, c10, t1
  485. MUL a2, c09, t2
  486. MUL a1, c09, c09
  487. MUL a1, c10, c10
  488. ADD5 c09, t1, c09
  489. ADD6 c10, t2, c10
  490. #endif
  491. #ifdef RT
  492. LD a1, 6 * SIZE(BO)
  493. LD a2, 7 * SIZE(BO)
  494. LD a3, 4 * SIZE(BO)
  495. LD a4, 5 * SIZE(BO)
  496. MUL a2, c10, t1
  497. MUL a2, c09, t2
  498. MUL a1, c09, c09
  499. MUL a1, c10, c10
  500. ADD5 c09, t1, c09
  501. ADD6 c10, t2, c10
  502. MUL a3, c09, t1
  503. MUL a3, c10, t2
  504. SUB c01, t1, c01
  505. SUB c02, t2, c02
  506. MUL a4, c10, t1
  507. MUL a4, c09, t2
  508. ADD6 c01, t1, c01
  509. ADD5 c02, t2, c02
  510. LD a1, 0 * SIZE(BO)
  511. LD a2, 1 * SIZE(BO)
  512. MUL a2, c02, t1
  513. MUL a2, c01, t2
  514. MUL a1, c01, c01
  515. MUL a1, c02, c02
  516. ADD5 c01, t1, c01
  517. ADD6 c02, t2, c02
  518. #endif
  519. #if defined(LN) || defined(LT)
  520. ST c01, 0 * SIZE(BO)
  521. ST c02, 1 * SIZE(BO)
  522. ST c09, 2 * SIZE(BO)
  523. ST c10, 3 * SIZE(BO)
  524. #else
  525. ST c01, 0 * SIZE(AO)
  526. ST c02, 1 * SIZE(AO)
  527. ST c09, 2 * SIZE(AO)
  528. ST c10, 3 * SIZE(AO)
  529. #endif
  530. #ifdef LN
  531. lda C1, -2 * SIZE(C1)
  532. lda C2, -2 * SIZE(C2)
  533. #endif
  534. ST c01, 0 * SIZE(C1)
  535. ST c02, 1 * SIZE(C1)
  536. ST c09, 0 * SIZE(C2)
  537. ST c10, 1 * SIZE(C2)
  538. #ifndef LN
  539. lda C1, 2 * SIZE(C1)
  540. lda C2, 2 * SIZE(C2)
  541. #endif
  542. #ifdef RT
  543. sll K, ZBASE_SHIFT, TMP1
  544. addq AORIG, TMP1, AORIG
  545. #endif
  546. #if defined(LT) || defined(RN)
  547. subq K, KK, TMP1
  548. sll TMP1, ZBASE_SHIFT + 0, TMP2
  549. addq AO, TMP2, AO
  550. sll TMP1, ZBASE_SHIFT + 1, TMP2
  551. addq BO, TMP2, BO
  552. #endif
  553. #ifdef LT
  554. addq KK, 1, KK
  555. #endif
  556. #ifdef LN
  557. subq KK, 1, KK
  558. #endif
  559. .align 4
  560. $L20:
  561. sra M, 1, I
  562. fclr t1
  563. fclr t2
  564. fclr t3
  565. fclr t4
  566. fclr c01
  567. fclr c05
  568. ble I, $L29
  569. .align 4
  570. $L11:
  571. #if defined(LT) || defined(RN)
  572. LD a1, 0 * SIZE(AO)
  573. fclr c09
  574. LD a2, 1 * SIZE(AO)
  575. fclr c13
  576. LD a3, 2 * SIZE(AO)
  577. fclr c02
  578. LD a4, 3 * SIZE(AO)
  579. fclr c06
  580. LD b1, 0 * SIZE(B)
  581. fclr c10
  582. LD b2, 1 * SIZE(B)
  583. fclr c14
  584. LD b3, 2 * SIZE(B)
  585. fclr c03
  586. LD b4, 3 * SIZE(B)
  587. fclr c07
  588. lda BO, 4 * SIZE(B)
  589. fclr c11
  590. lda AO, 4 * SIZE(AO)
  591. fclr c15
  592. lds $f31, 4 * SIZE(C1)
  593. fclr c04
  594. lda L, -2(KK)
  595. fclr c08
  596. lds $f31, 4 * SIZE(C2)
  597. fclr c12
  598. fclr c16
  599. ble KK, $L18
  600. ble L, $L15
  601. #else
  602. #ifdef LN
  603. sll K, ZBASE_SHIFT + 1, TMP1
  604. subq AORIG, TMP1, AORIG
  605. #endif
  606. sll KK, ZBASE_SHIFT + 1, TMP1
  607. addq AORIG, TMP1, AO
  608. addq B, TMP1, BO
  609. subq K, KK, TMP1
  610. LD a1, 0 * SIZE(AO)
  611. fclr c09
  612. LD a2, 1 * SIZE(AO)
  613. fclr c13
  614. LD a3, 2 * SIZE(AO)
  615. fclr c02
  616. LD a4, 3 * SIZE(AO)
  617. fclr c06
  618. LD b1, 0 * SIZE(BO)
  619. fclr c10
  620. LD b2, 1 * SIZE(BO)
  621. fclr c14
  622. LD b3, 2 * SIZE(BO)
  623. fclr c03
  624. LD b4, 3 * SIZE(BO)
  625. fclr c07
  626. lda BO, 4 * SIZE(BO)
  627. fclr c11
  628. lda AO, 4 * SIZE(AO)
  629. fclr c15
  630. lds $f31, 4 * SIZE(C1)
  631. fclr c04
  632. lda L, -2(TMP1)
  633. fclr c08
  634. lds $f31, 4 * SIZE(C2)
  635. fclr c12
  636. fclr c16
  637. ble TMP1, $L18
  638. ble L, $L15
  639. #endif
  640. .align 5
  641. $L12:
  642. /* 1 */
  643. ADD1 c11, t1, c11
  644. #ifndef EV4
  645. ldq $31, PREFETCHSIZE * SIZE(AO)
  646. #else
  647. unop
  648. #endif
  649. MUL b1, a1, t1
  650. #ifndef EV4
  651. ldl $31, PREFETCHSIZE * SIZE(BO)
  652. #else
  653. unop
  654. #endif
  655. ADD3 c12, t2, c12
  656. unop
  657. MUL b1, a2, t2
  658. unop
  659. ADD2 c16, t3, c16
  660. unop
  661. MUL b2, a2, t3
  662. LD a5, 0 * SIZE(AO)
  663. ADD4 c15, t4, c15
  664. unop
  665. MUL b2, a1, t4
  666. LD b5, 0 * SIZE(BO)
  667. /* 2 */
  668. ADD1 c01, t1, c01
  669. UNOP
  670. MUL b1, a3, t1
  671. UNOP
  672. ADD3 c02, t2, c02
  673. UNOP
  674. MUL b1, a4, t2
  675. UNOP
  676. ADD2 c06, t3, c06
  677. unop
  678. MUL b2, a4, t3
  679. unop
  680. ADD4 c05, t4, c05
  681. unop
  682. MUL b4, a1, t4
  683. unop
  684. /* 3 */
  685. ADD1 c03, t1, c03
  686. unop
  687. MUL b3, a1, t1
  688. unop
  689. ADD3 c04, t2, c04
  690. unop
  691. MUL b3, a2, t2
  692. unop
  693. ADD2 c08, t3, c08
  694. unop
  695. MUL b4, a2, t3
  696. LD a2, 1 * SIZE(AO)
  697. ADD4 c13, t4, c13
  698. unop
  699. MUL b2, a3, t4
  700. LD b2, 1 * SIZE(BO)
  701. /* 4 */
  702. ADD1 c09, t1, c09
  703. unop
  704. MUL b3, a3, t1
  705. LD a6, 2 * SIZE(AO)
  706. ADD3 c10, t2, c10
  707. unop
  708. MUL b3, a4, t2
  709. LD b3, 2 * SIZE(BO)
  710. ADD2 c14, t3, c14
  711. unop
  712. MUL b4, a4, t3
  713. LD a4, 3 * SIZE(AO)
  714. ADD4 c07, t4, c07
  715. unop
  716. MUL b4, a3, t4
  717. LD b4, 3 * SIZE(BO)
  718. /* 5 */
  719. ADD1 c11, t1, c11
  720. unop
  721. MUL b5, a5, t1
  722. LD a1, 4 * SIZE(AO)
  723. ADD3 c12, t2, c12
  724. lda L, -2(L)
  725. MUL b5, a2, t2
  726. LD b1, 4 * SIZE(BO)
  727. ADD2 c16, t3, c16
  728. unop
  729. MUL b2, a2, t3
  730. unop
  731. ADD4 c15, t4, c15
  732. unop
  733. MUL b2, a5, t4
  734. unop
  735. /* 6 */
  736. ADD1 c01, t1, c01
  737. unop
  738. MUL b5, a6, t1
  739. unop
  740. ADD3 c02, t2, c02
  741. unop
  742. MUL b5, a4, t2
  743. unop
  744. ADD2 c06, t3, c06
  745. unop
  746. MUL b2, a4, t3
  747. unop
  748. ADD4 c05, t4, c05
  749. unop
  750. MUL b4, a5, t4
  751. unop
  752. /* 7 */
  753. ADD1 c03, t1, c03
  754. lda AO, 8 * SIZE(AO)
  755. MUL b3, a5, t1
  756. unop
  757. ADD3 c04, t2, c04
  758. lda BO, 8 * SIZE(BO)
  759. MUL b3, a2, t2
  760. unop
  761. ADD2 c08, t3, c08
  762. unop
  763. MUL b4, a2, t3
  764. LD a2, -3 * SIZE(AO)
  765. ADD4 c13, t4, c13
  766. unop
  767. MUL b2, a6, t4
  768. LD b2, -3 * SIZE(BO)
  769. /* 8 */
  770. ADD1 c09, t1, c09
  771. unop
  772. MUL b3, a6, t1
  773. LD a3, -2 * SIZE(AO)
  774. ADD3 c10, t2, c10
  775. unop
  776. MUL b3, a4, t2
  777. LD b3, -2 * SIZE(BO)
  778. ADD2 c14, t3, c14
  779. unop
  780. MUL b4, a4, t3
  781. LD a4, -1 * SIZE(AO)
  782. ADD4 c07, t4, c07
  783. MUL b4, a6, t4
  784. LD b4, -1 * SIZE(BO)
  785. bgt L, $L12
  786. .align 4
  787. $L15:
  788. ADD1 c11, t1, c11
  789. unop
  790. MUL b1, a1, t1
  791. #if defined(LT) || defined(RN)
  792. blbs KK, $L17
  793. #else
  794. blbs TMP1, $L17
  795. #endif
  796. .align 4
  797. ADD3 c12, t2, c12
  798. MUL b1, a2, t2
  799. ADD2 c16, t3, c16
  800. MUL b2, a2, t3
  801. ADD4 c15, t4, c15
  802. MUL b2, a1, t4
  803. ADD1 c01, t1, c01
  804. MUL b1, a3, t1
  805. ADD3 c02, t2, c02
  806. unop
  807. MUL b1, a4, t2
  808. LD b1, 0 * SIZE(BO)
  809. ADD2 c06, t3, c06
  810. MUL b2, a4, t3
  811. ADD4 c05, t4, c05
  812. MUL b4, a1, t4
  813. ADD1 c03, t1, c03
  814. unop
  815. MUL b3, a1, t1
  816. LD a1, 0 * SIZE(AO)
  817. ADD3 c04, t2, c04
  818. unop
  819. MUL b3, a2, t2
  820. unop
  821. ADD2 c08, t3, c08
  822. unop
  823. MUL b4, a2, t3
  824. LD a2, 1 * SIZE(AO)
  825. ADD4 c13, t4, c13
  826. unop
  827. MUL b2, a3, t4
  828. LD b2, 1 * SIZE(BO)
  829. ADD1 c09, t1, c09
  830. unop
  831. MUL b3, a3, t1
  832. lda AO, 4 * SIZE(AO)
  833. ADD3 c10, t2, c10
  834. unop
  835. MUL b3, a4, t2
  836. LD b3, 2 * SIZE(BO)
  837. ADD2 c14, t3, c14
  838. unop
  839. MUL b4, a4, t3
  840. LD a4, -1 * SIZE(AO)
  841. ADD4 c07, t4, c07
  842. unop
  843. MUL b4, a3, t4
  844. LD a3, -2 * SIZE(AO)
  845. ADD1 c11, t1, c11
  846. LD b4, 3 * SIZE(BO)
  847. MUL b1, a1, t1
  848. lda BO, 4 * SIZE(BO)
  849. .align 4
  850. $L17:
  851. ADD3 c12, t2, c12
  852. MUL b1, a2, t2
  853. ADD2 c16, t3, c16
  854. MUL b2, a2, t3
  855. ADD4 c15, t4, c15
  856. MUL b2, a1, t4
  857. ADD1 c01, t1, c01
  858. MUL b1, a3, t1
  859. ADD3 c02, t2, c02
  860. MUL b1, a4, t2
  861. ADD2 c06, t3, c06
  862. MUL b2, a4, t3
  863. ADD4 c05, t4, c05
  864. MUL b4, a1, t4
  865. ADD1 c03, t1, c03
  866. MUL b3, a1, t1
  867. ADD3 c04, t2, c04
  868. MUL b3, a2, t2
  869. ADD2 c08, t3, c08
  870. MUL b4, a2, t3
  871. ADD4 c13, t4, c13
  872. MUL b2, a3, t4
  873. ADD1 c09, t1, c09
  874. MUL b3, a3, t1
  875. ADD3 c10, t2, c10
  876. MUL b3, a4, t2
  877. ADD2 c14, t3, c14
  878. MUL b4, a4, t3
  879. ADD4 c07, t4, c07
  880. lda AO, 4 * SIZE(AO)
  881. MUL b4, a3, t4
  882. lda BO, 4 * SIZE(BO)
  883. ADD1 c11, t1, c11
  884. ADD3 c12, t2, c12
  885. ADD2 c16, t3, c16
  886. ADD4 c15, t4, c15
  887. ADD c01, c06, c01
  888. ADD c02, c05, c02
  889. ADD c03, c08, c03
  890. ADD c04, c07, c04
  891. ADD c09, c14, c09
  892. ADD c10, c13, c10
  893. ADD c11, c16, c11
  894. ADD c12, c15, c12
  895. .align 4
  896. $L18:
  897. #if defined(LN) || defined(RT)
  898. #ifdef LN
  899. subq KK, 2, TMP1
  900. #else
  901. subq KK, 2, TMP1
  902. #endif
  903. sll TMP1, ZBASE_SHIFT + 1, TMP2
  904. addq AORIG, TMP2, AO
  905. sll TMP1, ZBASE_SHIFT + 1, TMP2
  906. addq B, TMP2, BO
  907. #else
  908. lda AO, -4 * SIZE(AO)
  909. lda BO, -4 * SIZE(BO)
  910. #endif
  911. #if defined(LN) || defined(LT)
  912. LD a1, 0 * SIZE(BO)
  913. LD a2, 1 * SIZE(BO)
  914. LD a3, 2 * SIZE(BO)
  915. LD a4, 3 * SIZE(BO)
  916. LD b1, 4 * SIZE(BO)
  917. LD b2, 5 * SIZE(BO)
  918. LD b3, 6 * SIZE(BO)
  919. LD b4, 7 * SIZE(BO)
  920. SUB a1, c01, c01
  921. SUB a2, c02, c02
  922. SUB a3, c09, c09
  923. SUB a4, c10, c10
  924. SUB b1, c03, c03
  925. SUB b2, c04, c04
  926. SUB b3, c11, c11
  927. SUB b4, c12, c12
  928. #else
  929. LD a1, 0 * SIZE(AO)
  930. LD a2, 1 * SIZE(AO)
  931. LD a3, 2 * SIZE(AO)
  932. LD a4, 3 * SIZE(AO)
  933. LD b1, 4 * SIZE(AO)
  934. LD b2, 5 * SIZE(AO)
  935. LD b3, 6 * SIZE(AO)
  936. LD b4, 7 * SIZE(AO)
  937. SUB a1, c01, c01
  938. SUB a2, c02, c02
  939. SUB a3, c03, c03
  940. SUB a4, c04, c04
  941. SUB b1, c09, c09
  942. SUB b2, c10, c10
  943. SUB b3, c11, c11
  944. SUB b4, c12, c12
  945. #endif
  946. #ifdef LN
  947. LD a1, 6 * SIZE(AO)
  948. LD a2, 7 * SIZE(AO)
  949. LD a3, 4 * SIZE(AO)
  950. LD a4, 5 * SIZE(AO)
  951. MUL a2, c04, t1
  952. MUL a2, c03, t2
  953. MUL a2, c12, t3
  954. MUL a2, c11, t4
  955. MUL a1, c03, c03
  956. MUL a1, c04, c04
  957. MUL a1, c11, c11
  958. MUL a1, c12, c12
  959. ADD5 c03, t1, c03
  960. ADD6 c04, t2, c04
  961. ADD5 c11, t3, c11
  962. ADD6 c12, t4, c12
  963. MUL a3, c03, t1
  964. MUL a3, c04, t2
  965. MUL a3, c11, t3
  966. MUL a3, c12, t4
  967. SUB c01, t1, c01
  968. SUB c02, t2, c02
  969. SUB c09, t3, c09
  970. SUB c10, t4, c10
  971. MUL a4, c04, t1
  972. MUL a4, c03, t2
  973. MUL a4, c12, t3
  974. MUL a4, c11, t4
  975. ADD6 c01, t1, c01
  976. ADD5 c02, t2, c02
  977. ADD6 c09, t3, c09
  978. ADD5 c10, t4, c10
  979. LD a1, 0 * SIZE(AO)
  980. LD a2, 1 * SIZE(AO)
  981. MUL a2, c02, t1
  982. MUL a2, c01, t2
  983. MUL a2, c10, t3
  984. MUL a2, c09, t4
  985. MUL a1, c01, c01
  986. MUL a1, c02, c02
  987. MUL a1, c09, c09
  988. MUL a1, c10, c10
  989. ADD5 c01, t1, c01
  990. ADD6 c02, t2, c02
  991. ADD5 c09, t3, c09
  992. ADD6 c10, t4, c10
  993. #endif
  994. #ifdef LT
  995. LD a1, 0 * SIZE(AO)
  996. LD a2, 1 * SIZE(AO)
  997. LD a3, 2 * SIZE(AO)
  998. LD a4, 3 * SIZE(AO)
  999. MUL a2, c02, t1
  1000. MUL a2, c01, t2
  1001. MUL a2, c10, t3
  1002. MUL a2, c09, t4
  1003. MUL a1, c01, c01
  1004. MUL a1, c02, c02
  1005. MUL a1, c09, c09
  1006. MUL a1, c10, c10
  1007. ADD5 c01, t1, c01
  1008. ADD6 c02, t2, c02
  1009. ADD5 c09, t3, c09
  1010. ADD6 c10, t4, c10
  1011. MUL a3, c01, t1
  1012. MUL a3, c02, t2
  1013. MUL a3, c09, t3
  1014. MUL a3, c10, t4
  1015. SUB c03, t1, c03
  1016. SUB c04, t2, c04
  1017. SUB c11, t3, c11
  1018. SUB c12, t4, c12
  1019. MUL a4, c02, t1
  1020. MUL a4, c01, t2
  1021. MUL a4, c10, t3
  1022. MUL a4, c09, t4
  1023. ADD6 c03, t1, c03
  1024. ADD5 c04, t2, c04
  1025. ADD6 c11, t3, c11
  1026. ADD5 c12, t4, c12
  1027. LD a1, 6 * SIZE(AO)
  1028. LD a2, 7 * SIZE(AO)
  1029. MUL a2, c04, t1
  1030. MUL a2, c03, t2
  1031. MUL a2, c12, t3
  1032. MUL a2, c11, t4
  1033. MUL a1, c03, c03
  1034. MUL a1, c04, c04
  1035. MUL a1, c11, c11
  1036. MUL a1, c12, c12
  1037. ADD5 c03, t1, c03
  1038. ADD6 c04, t2, c04
  1039. ADD5 c11, t3, c11
  1040. ADD6 c12, t4, c12
  1041. #endif
  1042. #ifdef RN
  1043. LD a1, 0 * SIZE(BO)
  1044. LD a2, 1 * SIZE(BO)
  1045. LD a3, 2 * SIZE(BO)
  1046. LD a4, 3 * SIZE(BO)
  1047. MUL a2, c02, t1
  1048. MUL a2, c01, t2
  1049. MUL a2, c04, t3
  1050. MUL a2, c03, t4
  1051. MUL a1, c01, c01
  1052. MUL a1, c02, c02
  1053. MUL a1, c03, c03
  1054. MUL a1, c04, c04
  1055. ADD5 c01, t1, c01
  1056. ADD6 c02, t2, c02
  1057. ADD5 c03, t3, c03
  1058. ADD6 c04, t4, c04
  1059. MUL a3, c01, t1
  1060. MUL a3, c02, t2
  1061. MUL a3, c03, t3
  1062. MUL a3, c04, t4
  1063. SUB c09, t1, c09
  1064. SUB c10, t2, c10
  1065. SUB c11, t3, c11
  1066. SUB c12, t4, c12
  1067. MUL a4, c02, t1
  1068. MUL a4, c01, t2
  1069. MUL a4, c04, t3
  1070. MUL a4, c03, t4
  1071. ADD6 c09, t1, c09
  1072. ADD5 c10, t2, c10
  1073. ADD6 c11, t3, c11
  1074. ADD5 c12, t4, c12
  1075. LD a1, 6 * SIZE(BO)
  1076. LD a2, 7 * SIZE(BO)
  1077. MUL a2, c10, t1
  1078. MUL a2, c09, t2
  1079. MUL a2, c12, t3
  1080. MUL a2, c11, t4
  1081. MUL a1, c09, c09
  1082. MUL a1, c10, c10
  1083. MUL a1, c11, c11
  1084. MUL a1, c12, c12
  1085. ADD5 c09, t1, c09
  1086. ADD6 c10, t2, c10
  1087. ADD5 c11, t3, c11
  1088. ADD6 c12, t4, c12
  1089. #endif
  1090. #ifdef RT
  1091. LD a1, 6 * SIZE(BO)
  1092. LD a2, 7 * SIZE(BO)
  1093. LD a3, 4 * SIZE(BO)
  1094. LD a4, 5 * SIZE(BO)
  1095. MUL a2, c10, t1
  1096. MUL a2, c09, t2
  1097. MUL a2, c12, t3
  1098. MUL a2, c11, t4
  1099. MUL a1, c09, c09
  1100. MUL a1, c10, c10
  1101. MUL a1, c11, c11
  1102. MUL a1, c12, c12
  1103. ADD5 c09, t1, c09
  1104. ADD6 c10, t2, c10
  1105. ADD5 c11, t3, c11
  1106. ADD6 c12, t4, c12
  1107. MUL a3, c09, t1
  1108. MUL a3, c10, t2
  1109. MUL a3, c11, t3
  1110. MUL a3, c12, t4
  1111. SUB c01, t1, c01
  1112. SUB c02, t2, c02
  1113. SUB c03, t3, c03
  1114. SUB c04, t4, c04
  1115. MUL a4, c10, t1
  1116. MUL a4, c09, t2
  1117. MUL a4, c12, t3
  1118. MUL a4, c11, t4
  1119. ADD6 c01, t1, c01
  1120. ADD5 c02, t2, c02
  1121. ADD6 c03, t3, c03
  1122. ADD5 c04, t4, c04
  1123. LD a1, 0 * SIZE(BO)
  1124. LD a2, 1 * SIZE(BO)
  1125. MUL a2, c02, t1
  1126. MUL a2, c01, t2
  1127. MUL a2, c04, t3
  1128. MUL a2, c03, t4
  1129. MUL a1, c01, c01
  1130. MUL a1, c02, c02
  1131. MUL a1, c03, c03
  1132. MUL a1, c04, c04
  1133. ADD5 c01, t1, c01
  1134. ADD6 c02, t2, c02
  1135. ADD5 c03, t3, c03
  1136. ADD6 c04, t4, c04
  1137. #endif
  1138. #if defined(LN) || defined(LT)
  1139. ST c01, 0 * SIZE(BO)
  1140. ST c02, 1 * SIZE(BO)
  1141. ST c09, 2 * SIZE(BO)
  1142. ST c10, 3 * SIZE(BO)
  1143. ST c03, 4 * SIZE(BO)
  1144. ST c04, 5 * SIZE(BO)
  1145. ST c11, 6 * SIZE(BO)
  1146. ST c12, 7 * SIZE(BO)
  1147. #else
  1148. ST c01, 0 * SIZE(AO)
  1149. ST c02, 1 * SIZE(AO)
  1150. ST c03, 2 * SIZE(AO)
  1151. ST c04, 3 * SIZE(AO)
  1152. ST c09, 4 * SIZE(AO)
  1153. ST c10, 5 * SIZE(AO)
  1154. ST c11, 6 * SIZE(AO)
  1155. ST c12, 7 * SIZE(AO)
  1156. #endif
  1157. #ifdef LN
  1158. lda C1, -4 * SIZE(C1)
  1159. lda C2, -4 * SIZE(C2)
  1160. #endif
  1161. ST c01, 0 * SIZE(C1)
  1162. ST c02, 1 * SIZE(C1)
  1163. ST c03, 2 * SIZE(C1)
  1164. ST c04, 3 * SIZE(C1)
  1165. ST c09, 0 * SIZE(C2)
  1166. ST c10, 1 * SIZE(C2)
  1167. ST c11, 2 * SIZE(C2)
  1168. ST c12, 3 * SIZE(C2)
  1169. #ifndef LN
  1170. lda C1, 4 * SIZE(C1)
  1171. lda C2, 4 * SIZE(C2)
  1172. #endif
  1173. fclr t1
  1174. fclr t2
  1175. fclr t3
  1176. fclr t4
  1177. #ifdef RT
  1178. sll K, ZBASE_SHIFT + 1, TMP1
  1179. addq AORIG, TMP1, AORIG
  1180. #endif
  1181. #if defined(LT) || defined(RN)
  1182. subq K, KK, TMP1
  1183. sll TMP1, ZBASE_SHIFT + 1, TMP1
  1184. addq AO, TMP1, AO
  1185. addq BO, TMP1, BO
  1186. #endif
  1187. #ifdef LT
  1188. addq KK, 2, KK
  1189. #endif
  1190. #ifdef LN
  1191. subq KK, 2, KK
  1192. #endif
  1193. fclr c01
  1194. fclr c05
  1195. lda I, -1(I)
  1196. bgt I, $L11
  1197. .align 4
  1198. $L29:
  1199. #ifdef LN
  1200. sll K, ZBASE_SHIFT + 1, TMP1
  1201. addq B, TMP1, B
  1202. #endif
  1203. #if defined(LT) || defined(RN)
  1204. mov BO, B
  1205. #endif
  1206. #ifdef RN
  1207. addq KK, 2, KK
  1208. #endif
  1209. #ifdef RT
  1210. subq KK, 2, KK
  1211. #endif
  1212. lda J, -1(J)
  1213. bgt J, $L01
  1214. .align 4
  1215. $L30:
  1216. and N, 1, J
  1217. ble J, $L999
  1218. #ifdef RT
  1219. sll K, ZBASE_SHIFT, TMP1
  1220. subq B, TMP1, B
  1221. subq C, LDC, C1
  1222. subq C, LDC, C
  1223. #else
  1224. mov C, C1
  1225. addq C, LDC, C
  1226. #endif
  1227. #ifdef LN
  1228. addq M, OFFSET, KK
  1229. #endif
  1230. #ifdef LT
  1231. mov OFFSET, KK
  1232. #endif
  1233. #if defined(LN) || defined(RT)
  1234. mov A, AORIG
  1235. #else
  1236. mov A, AO
  1237. #endif
  1238. and M, 1, I
  1239. ble I, $L50
  1240. #if defined(LT) || defined(RN)
  1241. LD a1, 0 * SIZE(AO)
  1242. fclr t1
  1243. LD a2, 1 * SIZE(AO)
  1244. fclr t2
  1245. LD a3, 2 * SIZE(AO)
  1246. fclr t3
  1247. LD a4, 3 * SIZE(AO)
  1248. fclr t4
  1249. LD b1, 0 * SIZE(B)
  1250. fclr c01
  1251. LD b2, 1 * SIZE(B)
  1252. fclr c05
  1253. LD b3, 2 * SIZE(B)
  1254. fclr c02
  1255. LD b4, 3 * SIZE(B)
  1256. fclr c06
  1257. lda AO, 2 * SIZE(AO)
  1258. lda BO, 2 * SIZE(B)
  1259. lda L, -2(KK)
  1260. ble KK, $L58
  1261. ble L, $L55
  1262. #else
  1263. #ifdef LN
  1264. sll K, ZBASE_SHIFT, TMP1
  1265. subq AORIG, TMP1, AORIG
  1266. #endif
  1267. sll KK, ZBASE_SHIFT, TMP1
  1268. addq AORIG, TMP1, AO
  1269. sll KK, ZBASE_SHIFT, TMP1
  1270. addq B, TMP1, BO
  1271. subq K, KK, TMP1
  1272. LD a1, 0 * SIZE(AO)
  1273. fclr t1
  1274. LD a2, 1 * SIZE(AO)
  1275. fclr t2
  1276. LD a3, 2 * SIZE(AO)
  1277. fclr t3
  1278. LD a4, 3 * SIZE(AO)
  1279. fclr t4
  1280. LD b1, 0 * SIZE(BO)
  1281. fclr c01
  1282. LD b2, 1 * SIZE(BO)
  1283. fclr c05
  1284. LD b3, 2 * SIZE(BO)
  1285. fclr c02
  1286. LD b4, 3 * SIZE(BO)
  1287. fclr c06
  1288. lda AO, 2 * SIZE(AO)
  1289. lda BO, 2 * SIZE(BO)
  1290. lda L, -2(TMP1)
  1291. ble TMP1, $L58
  1292. ble L, $L55
  1293. #endif
  1294. .align 5
  1295. $L52:
  1296. ADD1 c01, t1, c01
  1297. unop
  1298. MUL a1, b1, t1
  1299. unop
  1300. ADD3 c02, t2, c02
  1301. lda AO, 4 * SIZE(AO)
  1302. MUL a2, b1, t2
  1303. LD b1, 2 * SIZE(BO)
  1304. ADD4 c05, t3, c05
  1305. lda L, -2(L)
  1306. MUL a1, b2, t3
  1307. LD a1, -2 * SIZE(AO)
  1308. ADD2 c06, t4, c06
  1309. unop
  1310. MUL a2, b2, t4
  1311. LD a2, -1 * SIZE(AO)
  1312. ADD1 c01, t1, c01
  1313. LD b2, 3 * SIZE(BO)
  1314. MUL a3, b3, t1
  1315. lda BO, 4 * SIZE(BO)
  1316. ADD3 c02, t2, c02
  1317. unop
  1318. MUL a4, b3, t2
  1319. LD b3, 0 * SIZE(BO)
  1320. ADD4 c05, t3, c05
  1321. unop
  1322. MUL a3, b4, t3
  1323. LD a3, 0 * SIZE(AO)
  1324. ADD2 c06, t4, c06
  1325. MUL a4, b4, t4
  1326. LD b4, 1 * SIZE(BO)
  1327. unop
  1328. LD a4, 1 * SIZE(AO)
  1329. unop
  1330. unop
  1331. bgt L, $L52
  /*
   * $L55: remainder handling for the single-row tile.
   *
   * The pending t1 is folded into c01 and the next a1 * b1 product is
   * started. The low bit of the step count then decides the path:
   * the count is KK for the LT/RN variants and TMP1 for the others.
   * If the bit is set (odd count) control jumps straight to $L57 for
   * the last step. Otherwise one more complete step is executed here,
   * reloading a1, a2, b1, b2 and advancing AO and BO by 2 * SIZE,
   * before falling through into $L57.
   */
  1332. .align 4
  1333. $L55:
  1334. ADD1 c01, t1, c01
  1335. MUL a1, b1, t1
  1336. #if defined(LT) || defined(RN)
  1337. blbs KK, $L57
  1338. #else
  1339. blbs TMP1, $L57
  1340. #endif
  1341. .align 4
  /* Even count: finish this step and start the products of the next. */
  1342. ADD3 c02, t2, c02
  1343. unop
  1344. MUL a2, b1, t2
  1345. LD b1, 0 * SIZE(BO)
  1346. ADD4 c05, t3, c05
  1347. lda BO, 2 * SIZE(BO)
  1348. MUL a1, b2, t3
  1349. LD a1, 0 * SIZE(AO)
  1350. ADD2 c06, t4, c06
  1351. unop
  1352. MUL a2, b2, t4
  1353. LD a2, 1 * SIZE(AO)
  1354. ADD1 c01, t1, c01
  1355. LD b2, -1 * SIZE(BO)
  1356. MUL a1, b1, t1
  1357. lda AO, 2 * SIZE(AO)
  /*
   * $L57: last step of the single-row accumulation.
   *
   * Issues the remaining three products of (a1, a2) with (b1, b2),
   * advances AO and BO by 2 * SIZE, drains t1..t4 into c01, c02, c05
   * and c06, and then merges the partial sums:
   *   c01 = c01 + c06
   *   c02 = c02 + c05
   * NOTE(review): c01/c02 presumably end up as the real and imaginary
   * parts of one complex dot product -- confirm against the ADDn macros.
   */
  1358. .align 4
  1359. $L57:
  1360. ADD3 c02, t2, c02
  1361. MUL a2, b1, t2
  1362. ADD4 c05, t3, c05
  1363. MUL a1, b2, t3
  1364. ADD2 c06, t4, c06
  1365. lda AO, 2 * SIZE(AO)
  1366. MUL a2, b2, t4
  1367. lda BO, 2 * SIZE(BO)
  /* Drain the products still pending in t1..t4. */
  1368. ADD1 c01, t1, c01
  1369. ADD3 c02, t2, c02
  1370. ADD4 c05, t3, c05
  1371. ADD2 c06, t4, c06
  /* Merge the four partial sums into c01 and c02. */
  1372. ADD c01, c06, c01
  1373. ADD c02, c05, c02
  /*
   * $L58: solve and store for the single-row tile.
   *
   * Steps, each selected by the LN / LT / RN / RT build variant:
   *   1. Point AO and BO at the operands for the solve.
   *   2. Load two values from the packed buffer (BO for LN/LT, AO for
   *      RN/RT) and combine them with the accumulated c01/c02 via SUB.
   *   3. Multiply (c01, c02) by the two-value entry at AO (LN/LT) or
   *      BO (RN/RT) using four MULs plus ADD5/ADD6.
   *   4. Write the result back to the packed buffer and to C1.
   *   5. Advance AORIG / AO / BO / KK for the next tile.
   *
   * SUB, ADD5, ADD6, ST and ZBASE_SHIFT are defined outside this excerpt.
   * NOTE(review): the entry used in step 3 is presumably a pre-inverted
   * diagonal element produced by the packing routine -- confirm.
   */
  1374. $L58:
  1375. #if defined(LN) || defined(RT)
  /* AO = AORIG + ((KK - 1) << ZBASE_SHIFT), BO = B + the same offset. */
  1376. subq KK, 1, TMP1
  1377. sll TMP1, ZBASE_SHIFT, TMP2
  1378. addq AORIG, TMP2, AO
  1379. sll TMP1, ZBASE_SHIFT, TMP2
  1380. addq B, TMP2, BO
  1381. #else
  /* Step AO and BO back by 2 * SIZE each. */
  1382. lda AO, -2 * SIZE(AO)
  1383. lda BO, -2 * SIZE(BO)
  1384. #endif
  1385. #if defined(LN) || defined(LT)
  /* Right-hand side comes from BO. */
  1386. LD a1, 0 * SIZE(BO)
  1387. LD a2, 1 * SIZE(BO)
  1388. SUB a1, c01, c01
  1389. SUB a2, c02, c02
  1390. #else
  /* Right-hand side comes from AO. */
  1391. LD a1, 0 * SIZE(AO)
  1392. LD a2, 1 * SIZE(AO)
  1393. SUB a1, c01, c01
  1394. SUB a2, c02, c02
  1395. #endif
  1396. #if defined(LN) || defined(LT)
  /* Multiply (c01, c02) by the pair (a1, a2) read from AO. */
  1397. LD a1, 0 * SIZE(AO)
  1398. LD a2, 1 * SIZE(AO)
  1399. MUL a2, c02, t1
  1400. MUL a2, c01, t2
  1401. MUL a1, c01, c01
  1402. MUL a1, c02, c02
  1403. ADD5 c01, t1, c01
  1404. ADD6 c02, t2, c02
  1405. #endif
  1406. #if defined(RN) || defined(RT)
  /* Multiply (c01, c02) by the pair (a1, a2) read from BO. */
  1407. LD a1, 0 * SIZE(BO)
  1408. LD a2, 1 * SIZE(BO)
  1409. MUL a2, c02, t1
  1410. MUL a2, c01, t2
  1411. MUL a1, c01, c01
  1412. MUL a1, c02, c02
  1413. ADD5 c01, t1, c01
  1414. ADD6 c02, t2, c02
  1415. #endif
  /* Write the solved pair back into the packed buffer it was read from. */
  1416. #if defined(LN) || defined(LT)
  1417. ST c01, 0 * SIZE(BO)
  1418. ST c02, 1 * SIZE(BO)
  1419. #else
  1420. ST c01, 0 * SIZE(AO)
  1421. ST c02, 1 * SIZE(AO)
  1422. #endif
  /* LN moves C1 back by 2 * SIZE before the store; every other variant
     moves it forward by 2 * SIZE after the store. */
  1423. #ifdef LN
  1424. lda C1, -2 * SIZE(C1)
  1425. #endif
  1426. ST c01, 0 * SIZE(C1)
  1427. ST c02, 1 * SIZE(C1)
  1428. #ifndef LN
  1429. lda C1, 2 * SIZE(C1)
  1430. #endif
  1431. #ifdef RT
  /* AORIG += K << ZBASE_SHIFT */
  1432. sll K, ZBASE_SHIFT, TMP1
  1433. addq AORIG, TMP1, AORIG
  1434. #endif
  1435. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) entries in both AO and BO. */
  1436. subq K, KK, TMP1
  1437. sll TMP1, ZBASE_SHIFT, TMP2
  1438. addq AO, TMP2, AO
  1439. sll TMP1, ZBASE_SHIFT, TMP2
  1440. addq BO, TMP2, BO
  1441. #endif
  /* KK moves by one: up for LT, down for LN. */
  1442. #ifdef LT
  1443. addq KK, 1, KK
  1444. #endif
  1445. #ifdef LN
  1446. subq KK, 1, KK
  1447. #endif
  /*
   * $L50 / $L41: entry and operand setup for the two-row tiles.
   *
   * $L50 sets I = M >> 1 and skips to $L59 when I is not positive.
   * $L41 is the top of the per-tile loop (re-entered from the end of
   * the $L48 section). It preloads a1..a4 and b1..b4, clears t1..t4 and
   * c01..c08, pre-advances AO by 4 * SIZE and BO by 2 * SIZE, and sets
   * L to the step count minus 2.
   *   count <= 0 : go straight to the solve at $L48
   *   L     <= 0 : go to the remainder code at $L45
   *   otherwise  : fall through into the main loop
   * The count is KK for LT/RN and K - KK (held in TMP1) otherwise.
   */
  1448. .align 4
  1449. $L50:
  1450. sra M, 1, I
  1451. ble I, $L59
  1452. .align 4
  1453. $L41:
  1454. #if defined(LT) || defined(RN)
  /* LT / RN: A operands from the current AO, B operands from B itself. */
  1455. LD a1, 0 * SIZE(AO)
  1456. fclr t1
  1457. LD a2, 1 * SIZE(AO)
  1458. fclr t2
  1459. LD a3, 2 * SIZE(AO)
  1460. fclr t3
  1461. LD a4, 3 * SIZE(AO)
  1462. fclr t4
  1463. LD b1, 0 * SIZE(B)
  1464. fclr c01
  1465. LD b2, 1 * SIZE(B)
  1466. fclr c05
  1467. LD b3, 2 * SIZE(B)
  1468. fclr c02
  1469. LD b4, 3 * SIZE(B)
  1470. fclr c06
  1471. lda BO, 2 * SIZE(B)
  1472. fclr c03
  1473. lda AO, 4 * SIZE(AO)
  1474. fclr c07
  1475. lda L, -2(KK)
  1476. fclr c04
  1477. fclr c08
  1478. ble KK, $L48
  1479. ble L, $L45
  1480. #else
  1481. #ifdef LN
  /* LN only: AORIG -= K << (ZBASE_SHIFT + 1) */
  1482. sll K, ZBASE_SHIFT + 1, TMP1
  1483. subq AORIG, TMP1, AORIG
  1484. #endif
  /* AO = AORIG + (KK << (ZBASE_SHIFT + 1)), BO = B + (KK << ZBASE_SHIFT),
     TMP1 = K - KK is the step count for this variant. */
  1485. sll KK, ZBASE_SHIFT + 1, TMP1
  1486. addq AORIG, TMP1, AO
  1487. sll KK, ZBASE_SHIFT, TMP1
  1488. addq B, TMP1, BO
  1489. subq K, KK, TMP1
  1490. LD a1, 0 * SIZE(AO)
  1491. fclr t1
  1492. LD a2, 1 * SIZE(AO)
  1493. fclr t2
  1494. LD a3, 2 * SIZE(AO)
  1495. fclr t3
  1496. LD a4, 3 * SIZE(AO)
  1497. fclr t4
  1498. LD b1, 0 * SIZE(BO)
  1499. fclr c01
  1500. LD b2, 1 * SIZE(BO)
  1501. fclr c05
  1502. LD b3, 2 * SIZE(BO)
  1503. fclr c02
  1504. LD b4, 3 * SIZE(BO)
  1505. fclr c06
  1506. lda BO, 2 * SIZE(BO)
  1507. fclr c03
  1508. lda AO, 4 * SIZE(AO)
  1509. fclr c07
  1510. lda L, -2(TMP1)
  1511. fclr c04
  1512. fclr c08
  1513. ble TMP1, $L48
  1514. ble L, $L45
  1515. #endif
  /*
   * $L42: accumulation loop for the two-row tile.
   *
   * Every pass lowers L by 2, advances BO by 4 * SIZE and AO by
   * 8 * SIZE. The first half multiplies a1..a4 by b1 and then by b2.
   * The A operands are then reloaded (the fourth one into a5 instead of
   * a4) and multiplied by b3 and then by b4. Like the single-row loop
   * this is software pipelined: each ADDn consumes the product left in
   * t1..t4 by the preceding MUL. Running sums live in c01..c08.
   *
   * ADD1..ADD4, MUL and LD are macros defined outside this excerpt.
   * unop instructions are no-op padding.
   */
  1516. .align 5
  1517. $L42:
  /* Products of a1..a4 with b1. */
  1518. ADD4 c05, t1, c05
  1519. unop
  1520. MUL a1, b1, t1
  1521. unop
  1522. ADD2 c06, t2, c06
  1523. lda L, -2(L)
  1524. MUL a2, b1, t2
  1525. unop
  1526. ADD4 c07, t3, c07
  1527. unop
  1528. MUL a3, b1, t3
  1529. unop
  1530. ADD2 c08, t4, c08
  1531. unop
  1532. MUL a4, b1, t4
  1533. LD b1, 2 * SIZE(BO)
  /* Products of a1..a4 with b2; the next A operands are loaded behind them. */
  1534. ADD1 c01, t1, c01
  1535. unop
  1536. MUL a1, b2, t1
  1537. LD a1, 0 * SIZE(AO)
  1538. ADD3 c02, t2, c02
  1539. lda BO, 4 * SIZE(BO)
  1540. MUL a2, b2, t2
  1541. LD a2, 1 * SIZE(AO)
  1542. ADD1 c03, t3, c03
  1543. unop
  1544. MUL a3, b2, t3
  1545. LD a3, 2 * SIZE(AO)
  1546. ADD3 c04, t4, c04
  1547. unop
  1548. MUL a4, b2, t4
  1549. LD a5, 3 * SIZE(AO)
  /* Products of the reloaded a1, a2, a3, a5 with b3. */
  1550. ADD4 c05, t1, c05
  1551. unop
  1552. MUL a1, b3, t1
  1553. LD b2, -1 * SIZE(BO)
  1554. ADD2 c06, t2, c06
  1555. unop
  1556. MUL a2, b3, t2
  1557. unop
  1558. ADD4 c07, t3, c07
  1559. unop
  1560. MUL a3, b3, t3
  1561. lda AO, 8 * SIZE(AO)
  1562. ADD2 c08, t4, c08
  1563. unop
  1564. MUL a5, b3, t4
  1565. LD b3, 0 * SIZE(BO)
  /* Products with b4; a1..a4 are reloaded for the next pass. */
  1566. ADD1 c01, t1, c01
  1567. unop
  1568. MUL a1, b4, t1
  1569. LD a1, -4 * SIZE(AO)
  1570. ADD3 c02, t2, c02
  1571. unop
  1572. MUL a2, b4, t2
  1573. LD a2, -3 * SIZE(AO)
  1574. ADD1 c03, t3, c03
  1575. LD a4, -1 * SIZE(AO)
  1576. MUL a3, b4, t3
  1577. LD a3, -2 * SIZE(AO)
  1578. ADD3 c04, t4, c04
  1579. MUL a5, b4, t4
  1580. LD b4, 1 * SIZE(BO)
  /* Repeat while L is still positive. */
  1581. bgt L, $L42
  /*
   * $L45: remainder handling for the two-row tile.
   *
   * Folds the pending t1 into c05 and starts the next a1 * b1 product,
   * then tests the low bit of the step count (KK for LT/RN, TMP1
   * otherwise). If the bit is set, control jumps to $L47 for the last
   * step. Otherwise one more complete step runs here, reloading a1..a4
   * and b1/b2 and advancing AO by 4 * SIZE and BO by 2 * SIZE, before
   * falling through into $L47.
   */
  1582. .align 4
  1583. $L45:
  1584. ADD4 c05, t1, c05
  1585. MUL b1, a1, t1
  1586. #if defined(LT) || defined(RN)
  1587. blbs KK, $L47
  1588. #else
  1589. blbs TMP1, $L47
  1590. #endif
  1591. .align 4
  /* Even count: complete this step and preload operands for the last one. */
  1592. ADD2 c06, t2, c06
  1593. MUL a2, b1, t2
  1594. ADD4 c07, t3, c07
  1595. MUL a3, b1, t3
  1596. ADD2 c08, t4, c08
  1597. unop
  1598. MUL a4, b1, t4
  1599. LD b1, 0 * SIZE(BO)
  1600. ADD1 c01, t1, c01
  1601. unop
  1602. MUL a1, b2, t1
  1603. LD a1, 0 * SIZE(AO)
  1604. ADD3 c02, t2, c02
  1605. unop
  1606. MUL a2, b2, t2
  1607. LD a2, 1 * SIZE(AO)
  1608. ADD1 c03, t3, c03
  1609. unop
  1610. MUL a3, b2, t3
  1611. LD a3, 2 * SIZE(AO)
  1612. ADD3 c04, t4, c04
  1613. MUL a4, b2, t4
  1614. LD a4, 3 * SIZE(AO)
  1615. lda AO, 4 * SIZE(AO)
  1616. ADD4 c05, t1, c05
  1617. LD b2, 1 * SIZE(BO)
  1618. MUL a1, b1, t1
  1619. lda BO, 2 * SIZE(BO)
  /*
   * $L47: last step of the two-row accumulation.
   *
   * Issues the remaining products of a1..a4 with b1 and b2, advances AO
   * by 4 * SIZE and BO by 2 * SIZE, drains t1..t4 into c05..c08, and
   * merges the partial sums:
   *   c01 = c01 + c06      c02 = c02 + c05
   *   c03 = c03 + c08      c04 = c04 + c07
   * NOTE(review): (c01, c02) and (c03, c04) are presumably the two
   * complex results of the tile -- confirm against the ADDn macros.
   */
  1620. .align 4
  1621. $L47:
  1622. ADD2 c06, t2, c06
  1623. MUL a2, b1, t2
  1624. ADD4 c07, t3, c07
  1625. MUL a3, b1, t3
  1626. ADD2 c08, t4, c08
  1627. MUL a4, b1, t4
  1628. ADD1 c01, t1, c01
  1629. MUL a1, b2, t1
  1630. ADD3 c02, t2, c02
  1631. MUL a2, b2, t2
  1632. ADD1 c03, t3, c03
  1633. MUL a3, b2, t3
  1634. ADD3 c04, t4, c04
  1635. lda AO, 4 * SIZE(AO)
  1636. MUL a4, b2, t4
  1637. lda BO, 2 * SIZE(BO)
  /* Drain the products still pending in t1..t4. */
  1638. ADD4 c05, t1, c05
  1639. ADD2 c06, t2, c06
  1640. ADD4 c07, t3, c07
  1641. ADD2 c08, t4, c08
  /* Merge the eight partial sums into c01..c04. */
  1642. ADD c01, c06, c01
  1643. ADD c02, c05, c02
  1644. ADD c03, c08, c03
  1645. ADD c04, c07, c04
  /*
   * $L48: solve and store for the two-row tile, then loop back.
   *
   * Steps, each selected by the LN / LT / RN / RT build variant:
   *   1. Point AO and BO at the operands for the solve.
   *   2. Load four values from the packed buffer (BO for LN/LT, AO for
   *      RN/RT) and combine them with c01..c04 via SUB.
   *   3. LN : scale (c03, c04) by AO[6..7], remove its contribution
   *           from (c01, c02) using AO[4..5], scale (c01, c02) by
   *           AO[0..1].
   *      LT : scale (c01, c02) by AO[0..1], remove its contribution
   *           from (c03, c04) using AO[2..3], scale (c03, c04) by
   *           AO[6..7].
   *      RN/RT : scale both pairs by BO[0..1].
   *   4. Write the four results to the packed buffer and to C1.
   *   5. Advance AORIG / AO / BO / KK, decrement I and branch back to
   *      $L41 while I is positive.
   *
   * SUB, ADD5, ADD6, ST and ZBASE_SHIFT are defined outside this excerpt.
   * NOTE(review): the scaling entries are presumably pre-inverted
   * diagonal elements from the packing routine -- confirm.
   */
  1646. $L48:
  1647. #if defined(LN) || defined(RT)
  /* TMP1 = KK - 2 for LN, KK - 1 for RT. */
  1648. #ifdef LN
  1649. subq KK, 2, TMP1
  1650. #else
  1651. subq KK, 1, TMP1
  1652. #endif
  /* AO = AORIG + (TMP1 << (ZBASE_SHIFT + 1)), BO = B + (TMP1 << ZBASE_SHIFT). */
  1653. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1654. addq AORIG, TMP2, AO
  1655. sll TMP1, ZBASE_SHIFT, TMP2
  1656. addq B, TMP2, BO
  1657. #else
  /* Step AO back by 4 * SIZE and BO back by 2 * SIZE. */
  1658. lda AO, -4 * SIZE(AO)
  1659. lda BO, -2 * SIZE(BO)
  1660. #endif
  1661. #if defined(LN) || defined(LT)
  /* Right-hand side comes from BO. */
  1662. LD a1, 0 * SIZE(BO)
  1663. LD a2, 1 * SIZE(BO)
  1664. LD a3, 2 * SIZE(BO)
  1665. LD a4, 3 * SIZE(BO)
  1666. SUB a1, c01, c01
  1667. SUB a2, c02, c02
  1668. SUB a3, c03, c03
  1669. SUB a4, c04, c04
  1670. #else
  /* Right-hand side comes from AO. */
  1671. LD a1, 0 * SIZE(AO)
  1672. LD a2, 1 * SIZE(AO)
  1673. LD a3, 2 * SIZE(AO)
  1674. LD a4, 3 * SIZE(AO)
  1675. SUB a1, c01, c01
  1676. SUB a2, c02, c02
  1677. SUB a3, c03, c03
  1678. SUB a4, c04, c04
  1679. #endif
  1680. #ifdef LN
  /* LN: second pair first, using AO[6..7]. */
  1681. LD a1, 6 * SIZE(AO)
  1682. LD a2, 7 * SIZE(AO)
  1683. LD a3, 4 * SIZE(AO)
  1684. LD a4, 5 * SIZE(AO)
  1685. MUL a2, c04, t1
  1686. MUL a2, c03, t2
  1687. MUL a1, c03, c03
  1688. MUL a1, c04, c04
  1689. ADD5 c03, t1, c03
  1690. ADD6 c04, t2, c04
  /* Remove the (c03, c04) contribution from (c01, c02) using AO[4..5]. */
  1691. MUL a3, c03, t1
  1692. MUL a3, c04, t2
  1693. SUB c01, t1, c01
  1694. SUB c02, t2, c02
  1695. MUL a4, c04, t1
  1696. MUL a4, c03, t2
  1697. ADD6 c01, t1, c01
  1698. ADD5 c02, t2, c02
  /* Scale the first pair by AO[0..1]. */
  1699. LD a1, 0 * SIZE(AO)
  1700. LD a2, 1 * SIZE(AO)
  1701. MUL a2, c02, t1
  1702. MUL a2, c01, t2
  1703. MUL a1, c01, c01
  1704. MUL a1, c02, c02
  1705. ADD5 c01, t1, c01
  1706. ADD6 c02, t2, c02
  1707. #endif
  1708. #ifdef LT
  /* LT: first pair first, using AO[0..1]. */
  1709. LD a1, 0 * SIZE(AO)
  1710. LD a2, 1 * SIZE(AO)
  1711. LD a3, 2 * SIZE(AO)
  1712. LD a4, 3 * SIZE(AO)
  1713. MUL a2, c02, t1
  1714. MUL a2, c01, t2
  1715. MUL a1, c01, c01
  1716. MUL a1, c02, c02
  1717. ADD5 c01, t1, c01
  1718. ADD6 c02, t2, c02
  /* Remove the (c01, c02) contribution from (c03, c04) using AO[2..3]. */
  1719. MUL a3, c01, t1
  1720. MUL a3, c02, t2
  1721. SUB c03, t1, c03
  1722. SUB c04, t2, c04
  1723. MUL a4, c02, t1
  1724. MUL a4, c01, t2
  1725. ADD6 c03, t1, c03
  1726. ADD5 c04, t2, c04
  /* Scale the second pair by AO[6..7]. */
  1727. LD a1, 6 * SIZE(AO)
  1728. LD a2, 7 * SIZE(AO)
  1729. MUL a2, c04, t1
  1730. MUL a2, c03, t2
  1731. MUL a1, c03, c03
  1732. MUL a1, c04, c04
  1733. ADD5 c03, t1, c03
  1734. ADD6 c04, t2, c04
  1735. #endif
  1736. #if defined(RN) || defined(RT)
  /* RN / RT: both pairs are scaled by the same entry BO[0..1]. */
  1737. LD a1, 0 * SIZE(BO)
  1738. LD a2, 1 * SIZE(BO)
  1739. MUL a2, c02, t1
  1740. MUL a2, c01, t2
  1741. MUL a2, c04, t3
  1742. MUL a2, c03, t4
  1743. MUL a1, c01, c01
  1744. MUL a1, c02, c02
  1745. MUL a1, c03, c03
  1746. MUL a1, c04, c04
  1747. ADD5 c01, t1, c01
  1748. ADD6 c02, t2, c02
  1749. ADD5 c03, t3, c03
  1750. ADD6 c04, t4, c04
  1751. #endif
  /* Write the solved values back into the packed buffer they came from. */
  1752. #if defined(LN) || defined(LT)
  1753. ST c01, 0 * SIZE(BO)
  1754. ST c02, 1 * SIZE(BO)
  1755. ST c03, 2 * SIZE(BO)
  1756. ST c04, 3 * SIZE(BO)
  1757. #else
  1758. ST c01, 0 * SIZE(AO)
  1759. ST c02, 1 * SIZE(AO)
  1760. ST c03, 2 * SIZE(AO)
  1761. ST c04, 3 * SIZE(AO)
  1762. #endif
  /* LN moves C1 back by 4 * SIZE before the store; every other variant
     moves it forward by 4 * SIZE after the store. */
  1763. #ifdef LN
  1764. lda C1, -4 * SIZE(C1)
  1765. #endif
  1766. ST c01, 0 * SIZE(C1)
  1767. ST c02, 1 * SIZE(C1)
  1768. ST c03, 2 * SIZE(C1)
  1769. ST c04, 3 * SIZE(C1)
  1770. #ifndef LN
  1771. lda C1, 4 * SIZE(C1)
  1772. #endif
  1773. #ifdef RT
  /* AORIG += K << (ZBASE_SHIFT + 1) */
  1774. sll K, ZBASE_SHIFT + 1, TMP1
  1775. addq AORIG, TMP1, AORIG
  1776. #endif
  1777. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) entries: AO by twice the BO stride. */
  1778. subq K, KK, TMP1
  1779. sll TMP1, ZBASE_SHIFT + 1, TMP2
  1780. addq AO, TMP2, AO
  1781. sll TMP1, ZBASE_SHIFT, TMP2
  1782. addq BO, TMP2, BO
  1783. #endif
  /* KK moves by two: up for LT, down for LN. */
  1784. #ifdef LT
  1785. addq KK, 2, KK
  1786. #endif
  1787. #ifdef LN
  1788. subq KK, 2, KK
  1789. #endif
  /* Next two-row tile, if any remain. */
  1790. lda I, -1(I)
  1791. bgt I, $L41
  /*
   * $L59: bookkeeping after all row tiles have been processed.
   *
   *   LN      : B  += K << ZBASE_SHIFT
   *   LT / RN : B   = BO
   *   RN      : KK += 1
   *   RT      : KK -= 1
   * Control then falls through into the epilogue at $L999.
   */
  1792. .align 4
  1793. $L59:
  1794. #ifdef LN
  1795. sll K, ZBASE_SHIFT, TMP1
  1796. addq B, TMP1, B
  1797. #endif
  1798. #if defined(LT) || defined(RN)
  1799. mov BO, B
  1800. #endif
  1801. #ifdef RN
  1802. addq KK, 1, KK
  1803. #endif
  1804. #ifdef RT
  1805. subq KK, 1, KK
  1806. #endif
  /*
   * $L999: common exit path.
   *
   * Reloads floating-point registers $f2..$f9 from the eight 8-byte
   * slots at 0..56($sp), clears integer register $0, releases
   * STACKSIZE bytes of stack and returns.
   * NOTE(review): the matching stores of $f2..$f9 are presumably in the
   * prologue above this excerpt, and $0 is presumably the routine's
   * return value (zero) -- confirm against the prologue and the ABI.
   */
  1807. .align 4
  1808. $L999:
  1809. ldt $f2, 0($sp)
  1810. ldt $f3, 8($sp)
  1811. ldt $f4, 16($sp)
  1812. ldt $f5, 24($sp)
  1813. ldt $f6, 32($sp)
  1814. ldt $f7, 40($sp)
  1815. ldt $f8, 48($sp)
  1816. ldt $f9, 56($sp)
  1817. clr $0
  1818. lda $sp, STACKSIZE($sp)
  1819. ret
  1820. .ident VERSION
  1821. .end CNAME