You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_RT.S 65 kB


  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Symbolic names for the general-purpose registers used by this kernel. */
  /* $r4-$r11 are presumably the incoming arguments (they are the LoongArch */
  /* argument registers $a0-$a7) -- TODO confirm against the C prototype.   */
  /* M, N, K: element counts. M drives the I loops (M >> 1, M & 1), N drives */
  /* the J blocks (N & 1, N & 2, N & 4), K sets the per-panel stride and the */
  /* inner-loop length (K - KK).                                             */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  /* A, B, C: base pointers; the code advances or rewinds them per block. */
  32. #define A $r7
  33. #define B $r8
  34. #define C $r9
  /* LDC: shifted left by BASE_SHIFT at entry, then added to C to step     */
  /* from one CO pointer to the next.                                      */
  35. #define LDC $r10
  /* OFFSET: only read to initialise KK. */
  36. #define OFFSET $r11
  /* AO, BO: running pointers used as the load/store bases in the loops. */
  37. #define AO $r12
  38. #define BO $r13
  /* I: counter over M tiles; J: result of masking N; L: inner-loop counter. */
  39. #define I $r17
  40. #define J $r18
  41. #define L $r29
  /* CO1..CO8: output pointers, each derived from C plus multiples of LDC. */
  /* Only CO1..CO4 are referenced in the portion of the file reviewed here. */
  42. #define CO1 $r14
  43. #define CO2 $r15
  44. #define CO3 $r23
  45. #define CO4 $r24
  46. #define CO5 $r25
  47. #define CO6 $r26
  48. #define CO7 $r27
  49. #define CO8 $r28
  /* KK: running offset seeded from OFFSET (and M or N, depending on the   */
  /* LN/LT/RN/RT variant) and stepped by the tile size after each tile.    */
  50. #define KK $r30
  /* TEMP: scratch register for address and length arithmetic. */
  51. #define TEMP $r20
  /* AORIG: copy of A used in the LN/RT paths as the base for computing AO. */
  52. #define AORIG $r16
  /* Symbolic names for the floating-point registers used by this kernel. */
  /* a1..a4: values loaded through AO in the multiply loops. */
  53. #define a1 $f22
  54. #define a2 $f8
  55. #define a3 $f27
  56. #define a4 $f28
  /* b1..b8: values loaded through B/BO in the multiply loops; the solve   */
  /* steps also reuse them as temporaries for loads through AO or BO.      */
  57. #define b1 $f23
  58. #define b2 $f9
  59. #define b3 $f10
  60. #define b4 $f11
  61. #define b5 $f12
  62. #define b6 $f13
  63. #define b7 $f14
  64. #define b8 $f15
  /* a5 is an alias of b8 (both expand to $f15): writing one overwrites    */
  /* the other, so they must not be live at the same time.                 */
  65. #define a5 b8
  /* cXY: accumulators, cleared with MTC/MOV and updated with MADD in the  */
  /* loops, then solved and stored. Only part of this set is referenced in */
  /* the portion of the file reviewed here.                                */
  66. #define c11 $f16
  67. #define c12 $f17
  68. #define c21 $f3
  69. #define c22 $f1
  70. #define c31 $f2
  71. #define c32 $f4
  72. #define c41 $f5
  73. #define c42 $f6
  74. #define c51 $f7
  75. #define c52 $f18
  76. #define c61 $f19
  77. #define c62 $f20
  78. #define c71 $f21
  79. #define c72 $f24
  80. #define c81 $f25
  81. #define c82 $f26
  /* ALPHA: alias for $f0; not referenced in the portion reviewed here.    */
  /* NOTE(review): presumably the incoming scalar argument -- confirm.     */
  82. #define ALPHA $f0
  83. PROLOGUE
  84. addi.d $sp, $sp, -144
  85. SDARG $r23, $sp, 0
  86. SDARG $r24, $sp, 8
  87. SDARG $r25, $sp, 16
  88. SDARG $r26, $sp, 24
  89. SDARG $r27, $sp, 32
  90. SDARG $r28, $sp, 40
  91. fst.d $f24, $sp, 48
  92. fst.d $f25, $sp, 56
  93. fst.d $f26, $sp, 64
  94. fst.d $f27, $sp, 72
  95. fst.d $f28, $sp, 80
  96. SDARG $r29, $sp, 88
  97. SDARG $r30, $sp, 96
  98. SDARG $r20, $sp, 104
  99. SDARG $r16, $sp, 112
  100. #ifndef __64BIT__
  101. fst.d $f18, $sp, 112
  102. fst.d $f19, $sp, 120
  103. fst.d $f20, $sp, 128
  104. fst.d $f21, $sp, 136
  105. #endif
  106. slli.d LDC, LDC, BASE_SHIFT
  107. #ifdef LN
  108. mul.w TEMP, M, K
  109. slli.d TEMP, TEMP, BASE_SHIFT
  110. add.d A, A, TEMP
  111. slli.d TEMP, M, BASE_SHIFT
  112. add.d C, C, TEMP
  113. #endif
  114. #ifdef RN
  115. sub.d KK, $r0, OFFSET
  116. #endif
  117. #ifdef RT
  118. mul.w TEMP, N, K
  119. slli.d TEMP, TEMP, BASE_SHIFT
  120. add.d B, B, TEMP
  121. mul.w TEMP, N, LDC
  122. add.d C, C, TEMP
  123. sub.d KK, N, OFFSET
  124. #endif
  125. andi J, N, 1
  126. bge $r0, J, .L30
  127. #ifdef RT
  128. slli.d TEMP, K, BASE_SHIFT
  129. sub.d B, B, TEMP
  130. sub.d C, C, LDC
  131. #endif
  132. move AO, A
  133. move CO1, C
  134. #ifdef LN
  135. add.d KK, M, OFFSET
  136. #endif
  137. #ifdef LT
  138. move KK, OFFSET
  139. #endif
  140. #if defined(LN) || defined(RT)
  141. move AORIG, A
  142. #else
  143. move AO, A
  144. #endif
  145. #ifndef RT
  146. add.d C, CO1, LDC
  147. #endif
  148. srai.d I, M, 1
  149. bge $r0, I, .L80
  150. .L71:
  151. #if defined(LT) || defined(RN)
  152. LD a1, AO, 0 * SIZE
  153. MTC c11, $r0
  154. LD a2, AO, 1 * SIZE
  155. MOV c21, c11
  156. LD a5, AO, 4 * SIZE
  157. LD b1, B, 0 * SIZE
  158. MOV c12, c11
  159. LD b2, B, 1 * SIZE
  160. MOV c22, c11
  161. LD b3, B, 2 * SIZE
  162. LD b5, B, 4 * SIZE
  163. srai.d L, KK, 2
  164. LD b6, B, 8 * SIZE
  165. LD b7, B, 12 * SIZE
  166. move BO, B
  167. bge $r0, L, .L75
  168. #else
  169. #ifdef LN
  170. slli.d TEMP, K, 1 + BASE_SHIFT
  171. sub.d AORIG, AORIG, TEMP
  172. #endif
  173. slli.d L, KK, 1 + BASE_SHIFT
  174. slli.d TEMP, KK, 0 + BASE_SHIFT
  175. add.d AO, AORIG, L
  176. add.d BO, B, TEMP
  177. sub.d TEMP, K, KK
  178. LD a1, AO, 0 * SIZE
  179. MTC c11, $r0
  180. LD a2, AO, 1 * SIZE
  181. MOV c21, c11
  182. LD a5, AO, 4 * SIZE
  183. LD b1, BO, 0 * SIZE
  184. MOV c12, c11
  185. LD b2, BO, 1 * SIZE
  186. MOV c22, c11
  187. LD b3, BO, 2 * SIZE
  188. LD b5, BO, 4 * SIZE
  189. srai.d L, TEMP, 2
  190. LD b6, BO, 8 * SIZE
  191. LD b7, BO, 12 * SIZE
  192. bge $r0, L, .L75
  193. #endif
  194. .align 3
  195. .L72:
  196. LD a1, AO, 0 * SIZE
  197. LD a2, AO, 1 * SIZE
  198. LD b1, BO, 0 * SIZE
  199. MADD c11, b1, a1, c11
  200. MADD c12, b1, a2, c12
  201. LD a1, AO, 2 * SIZE
  202. LD a2, AO, 3 * SIZE
  203. LD b1, BO, 1 * SIZE
  204. MADD c11, b1, a1, c11
  205. MADD c12, b1, a2, c12
  206. LD a1, AO, 4 * SIZE
  207. LD a2, AO, 5 * SIZE
  208. LD b1, BO, 2 * SIZE
  209. MADD c11, b1, a1, c11
  210. MADD c12, b1, a2, c12
  211. LD a1, AO, 6 * SIZE
  212. LD a2, AO, 7 * SIZE
  213. LD b1, BO, 3 * SIZE
  214. MADD c11, b1, a1, c11
  215. MADD c12, b1, a2, c12
  216. addi.d L, L, -1
  217. addi.d AO, AO, 8 * SIZE
  218. addi.d BO, BO, 4 * SIZE
  219. blt $r0, L, .L72
  220. .align 3
  221. .L75:
  222. #if defined(LT) || defined(RN)
  223. andi L, KK, 3
  224. #else
  225. andi L, TEMP, 3
  226. #endif
  227. bge $r0, L, .L78
  228. .align 3
  229. .L76:
  230. LD a1, AO, 0 * SIZE
  231. LD a2, AO, 1 * SIZE
  232. LD b1, BO, 0 * SIZE
  233. MADD c11, b1, a1, c11
  234. MADD c12, b1, a2, c12
  235. addi.d L, L, -1
  236. addi.d AO, AO, 2 * SIZE
  237. addi.d BO, BO, 1 * SIZE
  238. blt $r0, L, .L76
  239. .L78:
  240. ADD c11, c11, c21
  241. ADD c12, c12, c22
  242. #if defined(LN) || defined(RT)
  243. #ifdef LN
  244. addi.d TEMP, KK, -2
  245. #else
  246. addi.d TEMP, KK, -1
  247. #endif
  248. slli.d L, TEMP, 1 + BASE_SHIFT
  249. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  250. add.d AO, AORIG, L
  251. add.d BO, B, TEMP
  252. #endif
  253. #if defined(LN) || defined(LT)
  254. LD b1, BO, 0 * SIZE
  255. LD b2, BO, 1 * SIZE
  256. SUB c11, b1, c11
  257. SUB c12, b2, c12
  258. #else
  259. LD b1, AO, 0 * SIZE
  260. LD b2, AO, 1 * SIZE
  261. SUB c11, b1, c11
  262. SUB c12, b2, c12
  263. #endif
  264. #ifdef LN
  265. LD b1, AO, 3 * SIZE
  266. LD b2, AO, 2 * SIZE
  267. LD b3, AO, 0 * SIZE
  268. MUL c12, b1, c12
  269. NMSUB c11, c12, b2, c11
  270. MUL c11, b3, c11
  271. #endif
  272. #ifdef LT
  273. LD b1, AO, 0 * SIZE
  274. LD b2, AO, 1 * SIZE
  275. LD b3, AO, 3 * SIZE
  276. MUL c11, b1, c11
  277. NMSUB c12, c11, b2, c12
  278. MUL c12, b3, c12
  279. #endif
  280. #if defined(RN) || defined(RT)
  281. LD b1, BO, 0 * SIZE
  282. MUL c11, b1, c11
  283. MUL c12, b1, c12
  284. #endif
  285. #ifdef LN
  286. addi.d CO1, CO1, -2 * SIZE
  287. #endif
  288. #if defined(LN) || defined(LT)
  289. ST c11, BO, 0 * SIZE
  290. ST c12, BO, 1 * SIZE
  291. #else
  292. ST c11, AO, 0 * SIZE
  293. ST c12, AO, 1 * SIZE
  294. #endif
  295. ST c11, CO1, 0 * SIZE
  296. ST c12, CO1, 1 * SIZE
  297. #ifndef LN
  298. addi.d CO1, CO1, 2 * SIZE
  299. #endif
  300. #ifdef RT
  301. slli.d TEMP, K, 1 + BASE_SHIFT
  302. add.d AORIG, AORIG, TEMP
  303. #endif
  304. #if defined(LT) || defined(RN)
  305. sub.d TEMP, K, KK
  306. slli.d L, TEMP, 1 + BASE_SHIFT
  307. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  308. add.d AO, AO, L
  309. add.d BO, BO, TEMP
  310. #endif
  311. #ifdef LT
  312. addi.d KK, KK, 2
  313. #endif
  314. #ifdef LN
  315. addi.d KK, KK, -2
  316. #endif
  317. addi.d I, I, -1
  318. blt $r0, I, .L71
  319. .align 3
  320. .L80:
  321. andi I, M, 1
  322. bge $r0, I, .L89
  323. #if defined(LT) || defined(RN)
  324. LD a1, AO, 0 * SIZE
  325. MTC c11, $r0
  326. LD a2, AO, 1 * SIZE
  327. LD a3, AO, 2 * SIZE
  328. LD a4, AO, 3 * SIZE
  329. LD b1, B, 0 * SIZE
  330. LD b2, B, 1 * SIZE
  331. MOV c21, c11
  332. LD b3, B, 2 * SIZE
  333. LD b4, B, 3 * SIZE
  334. LD b5, B, 4 * SIZE
  335. LD b6, B, 8 * SIZE
  336. LD b7, B, 12 * SIZE
  337. srai.d L, KK, 2
  338. move BO, B
  339. bge $r0, L, .L85
  340. #else
  341. #ifdef LN
  342. slli.d TEMP, K, BASE_SHIFT
  343. sub.d AORIG, AORIG, TEMP
  344. #endif
  345. slli.d TEMP, KK, BASE_SHIFT
  346. add.d AO, AORIG, TEMP
  347. add.d BO, B, TEMP
  348. sub.d TEMP, K, KK
  349. LD a1, AO, 0 * SIZE
  350. MTC c11, $r0
  351. LD a2, AO, 1 * SIZE
  352. LD a3, AO, 2 * SIZE
  353. LD a4, AO, 3 * SIZE
  354. LD b1, BO, 0 * SIZE
  355. LD b2, BO, 1 * SIZE
  356. LD b3, BO, 2 * SIZE
  357. LD b4, BO, 3 * SIZE
  358. MOV c21, c11
  359. LD b5, BO, 4 * SIZE
  360. LD b6, BO, 8 * SIZE
  361. LD b7, BO, 12 * SIZE
  362. srai.d L, TEMP, 2
  363. bge $r0, L, .L85
  364. #endif
  365. .align 3
  366. .L82:
  367. LD a1, AO, 0 * SIZE
  368. LD b1, BO, 0 * SIZE
  369. MADD c11, b1, a1, c11
  370. LD a1, AO, 1 * SIZE
  371. LD b1, BO, 1 * SIZE
  372. MADD c21, b1, a1, c21
  373. LD a1, AO, 2 * SIZE
  374. LD b1, BO, 2 * SIZE
  375. MADD c11, b1, a1, c11
  376. LD a1, AO, 3 * SIZE
  377. LD b1, BO, 3 * SIZE
  378. MADD c21, b1, a1, c21
  379. addi.d L, L, -1
  380. addi.d AO, AO, 4 * SIZE
  381. addi.d BO, BO, 4 * SIZE
  382. blt $r0, L, .L82
  383. .align 3
  384. .L85:
  385. #if defined(LT) || defined(RN)
  386. andi L, KK, 3
  387. #else
  388. andi L, TEMP, 3
  389. #endif
  390. bge $r0, L, .L88
  391. .align 3
  392. .L86:
  393. LD a1, AO, 0 * SIZE
  394. LD b1, BO, 0 * SIZE
  395. MADD c11, b1, a1, c11
  396. addi.d L, L, -1
  397. addi.d AO, AO, 1 * SIZE
  398. addi.d BO, BO, 1 * SIZE
  399. blt $r0, L, .L86
  400. .L88:
  401. ADD c11, c11, c21
  402. #if defined(LN) || defined(RT)
  403. #ifdef LN
  404. addi.d TEMP, KK, -1
  405. #else
  406. addi.d TEMP, KK, -1
  407. #endif
  408. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  409. add.d AO, AORIG, TEMP
  410. add.d BO, B, TEMP
  411. #endif
  412. #if defined(LN) || defined(LT)
  413. LD b1, BO, 0 * SIZE
  414. SUB c11, b1, c11
  415. #else
  416. LD b1, AO, 0 * SIZE
  417. SUB c11, b1, c11
  418. #endif
  419. #if defined(LN) || defined(LT)
  420. LD b1, AO, 0 * SIZE
  421. MUL c11, b1, c11
  422. #endif
  423. #if defined(RN) || defined(RT)
  424. LD b1, BO, 0 * SIZE
  425. MUL c11, b1, c11
  426. #endif
  427. #ifdef LN
  428. addi.d CO1, CO1, -1 * SIZE
  429. #endif
  430. #if defined(LN) || defined(LT)
  431. ST c11, BO, 0 * SIZE
  432. #else
  433. ST c11, AO, 0 * SIZE
  434. #endif
  435. ST c11, CO1, 0 * SIZE
  436. #ifndef LN
  437. addi.d CO1, CO1, 1 * SIZE
  438. #endif
  439. #ifdef RT
  440. slli.d TEMP, K, BASE_SHIFT
  441. add.d AORIG, AORIG, TEMP
  442. #endif
  443. #if defined(LT) || defined(RN)
  444. sub.d TEMP, K, KK
  445. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  446. add.d AO, AO, TEMP
  447. add.d BO, BO, TEMP
  448. #endif
  449. #ifdef LT
  450. addi.d KK, KK, 1
  451. #endif
  452. #ifdef LN
  453. addi.d KK, KK, -1
  454. #endif
  455. .align 3
  456. .L89:
  457. #ifdef LN
  458. slli.d TEMP, K, BASE_SHIFT
  459. add.d B, B, TEMP
  460. #endif
  461. #if defined(LT) || defined(RN)
  462. move B, BO
  463. #endif
  464. #ifdef RN
  465. addi.d KK, KK, 1
  466. #endif
  467. #ifdef RT
  468. addi.d KK, KK, -1
  469. #endif
  470. .align 3
  471. .L30:
  472. andi J, N, 2
  473. bge $r0, J, .L50
  474. #ifdef RT
  475. slli.d TEMP, K, 1 + BASE_SHIFT
  476. sub.d B, B, TEMP
  477. slli.d TEMP, LDC, 1
  478. sub.d C, C, TEMP
  479. #endif
  480. move AO, A
  481. move CO1, C
  482. add.d CO2, C, LDC
  483. #ifdef LN
  484. add.d KK, M, OFFSET
  485. #endif
  486. #ifdef LT
  487. move KK, OFFSET
  488. #endif
  489. #if defined(LN) || defined(RT)
  490. move AORIG, A
  491. #else
  492. move AO, A
  493. #endif
  494. #ifndef RT
  495. add.d C, CO2, LDC
  496. #endif
  497. srai.d I, M, 1
  498. bge $r0, I, .L60
  499. .L51:
  500. #if defined(LT) || defined(RN)
  501. LD a1, AO, 0 * SIZE
  502. MTC c11, $r0
  503. LD a2, AO, 1 * SIZE
  504. MOV c21, c11
  505. LD a5, AO, 4 * SIZE
  506. LD b1, B, 0 * SIZE
  507. MOV c12, c11
  508. LD b2, B, 1 * SIZE
  509. MOV c22, c11
  510. LD b3, B, 2 * SIZE
  511. LD b5, B, 4 * SIZE
  512. srai.d L, KK, 2
  513. LD b6, B, 8 * SIZE
  514. LD b7, B, 12 * SIZE
  515. move BO, B
  516. bge $r0, L, .L55
  517. #else
  518. #ifdef LN
  519. slli.d TEMP, K, 1 + BASE_SHIFT
  520. sub.d AORIG, AORIG, TEMP
  521. #endif
  522. slli.d L, KK, 1 + BASE_SHIFT
  523. slli.d TEMP, KK, 1 + BASE_SHIFT
  524. add.d AO, AORIG, L
  525. add.d BO, B, TEMP
  526. sub.d TEMP, K, KK
  527. LD a1, AO, 0 * SIZE
  528. MTC c11, $r0
  529. LD a2, AO, 1 * SIZE
  530. MOV c21, c11
  531. LD a5, AO, 4 * SIZE
  532. LD b1, BO, 0 * SIZE
  533. MOV c12, c11
  534. LD b2, BO, 1 * SIZE
  535. MOV c22, c11
  536. LD b3, BO, 2 * SIZE
  537. LD b5, BO, 4 * SIZE
  538. srai.d L, TEMP, 2
  539. LD b6, BO, 8 * SIZE
  540. LD b7, BO, 12 * SIZE
  541. bge $r0, L, .L55
  542. #endif
  543. .align 3
  544. .L52:
  545. MADD c11, b1, a1, c11
  546. LD a3, AO, 2 * SIZE
  547. MADD c21, b2, a1, c21
  548. LD b4, BO, 3 * SIZE
  549. MADD c12, b1, a2, c12
  550. LD a4, AO, 3 * SIZE
  551. MADD c22, b2, a2, c22
  552. LD b1, BO, 8 * SIZE
  553. MADD c11, b3, a3, c11
  554. LD a1, AO, 8 * SIZE
  555. MADD c21, b4, a3, c21
  556. LD b2, BO, 5 * SIZE
  557. MADD c12, b3, a4, c12
  558. LD a2, AO, 5 * SIZE
  559. MADD c22, b4, a4, c22
  560. LD b3, BO, 6 * SIZE
  561. MADD c11, b5, a5, c11
  562. LD a3, AO, 6 * SIZE
  563. MADD c21, b2, a5, c21
  564. LD b4, BO, 7 * SIZE
  565. MADD c12, b5, a2, c12
  566. LD a4, AO, 7 * SIZE
  567. MADD c22, b2, a2, c22
  568. LD b5, BO, 12 * SIZE
  569. MADD c11, b3, a3, c11
  570. LD a5, AO, 12 * SIZE
  571. MADD c21, b4, a3, c21
  572. LD b2, BO, 9 * SIZE
  573. MADD c12, b3, a4, c12
  574. LD a2, AO, 9 * SIZE
  575. MADD c22, b4, a4, c22
  576. LD b3, BO, 10 * SIZE
  577. addi.d AO, AO, 8 * SIZE
  578. addi.d L, L, -1
  579. addi.d BO, BO, 8 * SIZE
  580. blt $r0, L, .L52
  581. .align 3
  582. .L55:
  583. #if defined(LT) || defined(RN)
  584. andi L, KK, 3
  585. #else
  586. andi L, TEMP, 3
  587. #endif
  588. bge $r0, L, .L58
  589. .align 3
  590. .L56:
  591. MADD c11, b1, a1, c11
  592. LD a2, AO, 1 * SIZE
  593. MADD c21, b2, a1, c21
  594. LD a1, AO, 2 * SIZE
  595. MADD c12, b1, a2, c12
  596. LD b1, BO, 2 * SIZE
  597. MADD c22, b2, a2, c22
  598. LD b2, BO, 3 * SIZE
  599. addi.d L, L, -1
  600. addi.d AO, AO, 2 * SIZE
  601. addi.d BO, BO, 2 * SIZE
  602. blt $r0, L, .L56
  603. .L58:
  604. #if defined(LN) || defined(RT)
  605. #ifdef LN
  606. addi.d TEMP, KK, -2
  607. #else
  608. addi.d TEMP, KK, -2
  609. #endif
  610. slli.d L, TEMP, 1 + BASE_SHIFT
  611. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  612. add.d AO, AORIG, L
  613. add.d BO, B, TEMP
  614. #endif
  615. #if defined(LN) || defined(LT)
  616. LD b1, BO, 0 * SIZE
  617. LD b2, BO, 1 * SIZE
  618. LD b3, BO, 2 * SIZE
  619. LD b4, BO, 3 * SIZE
  620. SUB c11, b1, c11
  621. SUB c21, b2, c21
  622. SUB c12, b3, c12
  623. SUB c22, b4, c22
  624. #else
  625. LD b1, AO, 0 * SIZE
  626. LD b2, AO, 1 * SIZE
  627. LD b3, AO, 2 * SIZE
  628. LD b4, AO, 3 * SIZE
  629. SUB c11, b1, c11
  630. SUB c12, b2, c12
  631. SUB c21, b3, c21
  632. SUB c22, b4, c22
  633. #endif
  634. #ifdef LN
  635. LD b1, AO, 3 * SIZE
  636. LD b2, AO, 2 * SIZE
  637. LD b3, AO, 0 * SIZE
  638. MUL c12, b1, c12
  639. MUL c22, b1, c22
  640. NMSUB c11, c12, b2, c11
  641. NMSUB c21, c22, b2, c21
  642. MUL c11, b3, c11
  643. MUL c21, b3, c21
  644. #endif
  645. #ifdef LT
  646. LD b1, AO, 0 * SIZE
  647. LD b2, AO, 1 * SIZE
  648. LD b3, AO, 3 * SIZE
  649. MUL c11, b1, c11
  650. MUL c21, b1, c21
  651. NMSUB c12, c11, b2, c12
  652. NMSUB c22, c21, b2, c22
  653. MUL c12, b3, c12
  654. MUL c22, b3, c22
  655. #endif
  656. #ifdef RN
  657. LD b1, BO, 0 * SIZE
  658. LD b2, BO, 1 * SIZE
  659. LD b3, BO, 3 * SIZE
  660. MUL c11, b1, c11
  661. MUL c12, b1, c12
  662. NMSUB c21, c11, b2, c21
  663. NMSUB c22, c12, b2, c22
  664. MUL c21, b3, c21
  665. MUL c22, b3, c22
  666. #endif
  667. #ifdef RT
  668. LD b1, BO, 3 * SIZE
  669. LD b2, BO, 2 * SIZE
  670. LD b3, BO, 0 * SIZE
  671. MUL c21, b1, c21
  672. MUL c22, b1, c22
  673. NMSUB c11, c21, b2, c11
  674. NMSUB c12, c22, b2, c12
  675. MUL c11, b3, c11
  676. MUL c12, b3, c12
  677. #endif
  678. #ifdef LN
  679. addi.d CO1, CO1, -2 * SIZE
  680. addi.d CO2, CO2, -2 * SIZE
  681. #endif
  682. #if defined(LN) || defined(LT)
  683. ST c11, BO, 0 * SIZE
  684. ST c21, BO, 1 * SIZE
  685. ST c12, BO, 2 * SIZE
  686. ST c22, BO, 3 * SIZE
  687. #else
  688. ST c11, AO, 0 * SIZE
  689. ST c12, AO, 1 * SIZE
  690. ST c21, AO, 2 * SIZE
  691. ST c22, AO, 3 * SIZE
  692. #endif
  693. ST c11, CO1, 0 * SIZE
  694. ST c12, CO1, 1 * SIZE
  695. ST c21, CO2, 0 * SIZE
  696. ST c22, CO2, 1 * SIZE
  697. #ifndef LN
  698. addi.d CO1, CO1, 2 * SIZE
  699. addi.d CO2, CO2, 2 * SIZE
  700. #endif
  701. #ifdef RT
  702. slli.d TEMP, K, 1 + BASE_SHIFT
  703. add.d AORIG, AORIG, TEMP
  704. #endif
  705. #if defined(LT) || defined(RN)
  706. sub.d TEMP, K, KK
  707. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  708. add.d AO, AO, TEMP
  709. add.d BO, BO, TEMP
  710. #endif
  711. #ifdef LT
  712. addi.d KK, KK, 2
  713. #endif
  714. #ifdef LN
  715. addi.d KK, KK, -2
  716. #endif
  717. MTC a1, $r0
  718. MOV c11, a1
  719. MOV c21, a1
  720. MOV c31, a1
  721. addi.d I, I, -1
  722. MOV c41, c11
  723. blt $r0, I, .L51
  724. .align 3
  725. .L60:
  726. andi I, M, 1
  727. bge $r0, I, .L69
  728. #if defined(LT) || defined(RN)
  729. srai.d L, KK, 2
  730. LD a1, AO, 0 * SIZE
  731. MTC c11, $r0
  732. LD a2, AO, 1 * SIZE
  733. MOV c21, c11
  734. LD a3, AO, 2 * SIZE
  735. MOV c31, c11
  736. LD a4, AO, 3 * SIZE
  737. MOV c41, c11
  738. LD b1, B, 0 * SIZE
  739. LD b2, B, 1 * SIZE
  740. LD b3, B, 2 * SIZE
  741. LD b4, B, 3 * SIZE
  742. LD b5, B, 4 * SIZE
  743. LD b6, B, 8 * SIZE
  744. LD b7, B, 12 * SIZE
  745. move BO, B
  746. bge $r0, L, .L65
  747. #else
  748. #ifdef LN
  749. slli.d TEMP, K, BASE_SHIFT
  750. sub.d AORIG, AORIG, TEMP
  751. #endif
  752. slli.d L, KK, 0 + BASE_SHIFT
  753. slli.d TEMP, KK, 1 + BASE_SHIFT
  754. add.d AO, AORIG, L
  755. add.d BO, B, TEMP
  756. sub.d TEMP, K, KK
  757. srai.d L, TEMP, 2
  758. LD a1, AO, 0 * SIZE
  759. MTC c11, $r0
  760. LD a2, AO, 1 * SIZE
  761. MOV c21, c11
  762. LD a3, AO, 2 * SIZE
  763. MOV c31, c11
  764. LD a4, AO, 3 * SIZE
  765. MOV c41, c11
  766. LD b1, BO, 0 * SIZE
  767. LD b2, BO, 1 * SIZE
  768. LD b3, BO, 2 * SIZE
  769. LD b4, BO, 3 * SIZE
  770. LD b5, BO, 4 * SIZE
  771. LD b6, BO, 8 * SIZE
  772. LD b7, BO, 12 * SIZE
  773. bge $r0, L, .L65
  774. #endif
  775. .align 3
  776. .L62:
  777. MADD c11, b1, a1, c11
  778. LD b1, BO, 4 * SIZE
  779. MADD c21, b2, a1, c21
  780. LD b2, BO, 5 * SIZE
  781. MADD c31, b3, a2, c31
  782. LD b3, BO, 6 * SIZE
  783. MADD c41, b4, a2, c41
  784. LD b4, BO, 7 * SIZE
  785. LD a1, AO, 4 * SIZE
  786. LD a2, AO, 5 * SIZE
  787. MADD c11, b1, a3, c11
  788. LD b1, BO, 8 * SIZE
  789. MADD c21, b2, a3, c21
  790. LD b2, BO, 9 * SIZE
  791. MADD c31, b3, a4, c31
  792. LD b3, BO, 10 * SIZE
  793. MADD c41, b4, a4, c41
  794. LD b4, BO, 11 * SIZE
  795. LD a3, AO, 6 * SIZE
  796. LD a4, AO, 7 * SIZE
  797. addi.d L, L, -1
  798. addi.d AO, AO, 4 * SIZE
  799. addi.d BO, BO, 8 * SIZE
  800. blt $r0, L, .L62
  801. .align 3
  802. .L65:
  803. #if defined(LT) || defined(RN)
  804. andi L, KK, 3
  805. #else
  806. andi L, TEMP, 3
  807. #endif
  808. bge $r0, L, .L68
  809. .align 3
  810. .L66:
  811. MADD c11, b1, a1, c11
  812. LD b1, BO, 2 * SIZE
  813. MADD c21, b2, a1, c21
  814. LD b2, BO, 3 * SIZE
  815. LD a1, AO, 1 * SIZE
  816. addi.d L, L, -1
  817. addi.d AO, AO, 1 * SIZE
  818. addi.d BO, BO, 2 * SIZE
  819. blt $r0, L, .L66
  820. .L68:
  821. ADD c11, c11, c31
  822. ADD c21, c21, c41
  823. #if defined(LN) || defined(RT)
  824. #ifdef LN
  825. addi.d TEMP, KK, -1
  826. #else
  827. addi.d TEMP, KK, -2
  828. #endif
  829. slli.d L, TEMP, 0 + BASE_SHIFT
  830. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  831. add.d AO, AORIG, L
  832. add.d BO, B, TEMP
  833. #endif
  834. #if defined(LN) || defined(LT)
  835. LD b1, BO, 0 * SIZE
  836. LD b2, BO, 1 * SIZE
  837. SUB c11, b1, c11
  838. SUB c21, b2, c21
  839. #else
  840. LD b1, AO, 0 * SIZE
  841. LD b2, AO, 1 * SIZE
  842. SUB c11, b1, c11
  843. SUB c21, b2, c21
  844. #endif
  845. #if defined(LN) || defined(LT)
  846. LD b3, AO, 0 * SIZE
  847. MUL c11, b3, c11
  848. MUL c21, b3, c21
  849. #endif
  850. #ifdef RN
  851. LD b1, BO, 0 * SIZE
  852. LD b2, BO, 1 * SIZE
  853. LD b3, BO, 3 * SIZE
  854. MUL c11, b1, c11
  855. NMSUB c21, c11, b2, c21
  856. MUL c21, b3, c21
  857. #endif
  858. #ifdef RT
  859. LD b1, BO, 3 * SIZE
  860. LD b2, BO, 2 * SIZE
  861. LD b3, BO, 0 * SIZE
  862. MUL c21, b1, c21
  863. NMSUB c11, c21, b2, c11
  864. MUL c11, b3, c11
  865. #endif
  866. #ifdef LN
  867. addi.d CO1, CO1, -1 * SIZE
  868. addi.d CO2, CO2, -1 * SIZE
  869. #endif
  870. #if defined(LN) || defined(LT)
  871. ST c11, BO, 0 * SIZE
  872. ST c21, BO, 1 * SIZE
  873. #else
  874. ST c11, AO, 0 * SIZE
  875. ST c21, AO, 1 * SIZE
  876. #endif
  877. ST c11, CO1, 0 * SIZE
  878. ST c21, CO2, 0 * SIZE
  879. #ifndef LN
  880. addi.d CO1, CO1, 1 * SIZE
  881. addi.d CO2, CO2, 1 * SIZE
  882. #endif
  883. #ifdef RT
  884. slli.d TEMP, K, 0 + BASE_SHIFT
  885. add.d AORIG, AORIG, TEMP
  886. #endif
  887. #if defined(LT) || defined(RN)
  888. sub.d TEMP, K, KK
  889. slli.d L, TEMP, 0 + BASE_SHIFT
  890. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  891. add.d AO, AO, L
  892. add.d BO, BO, TEMP
  893. #endif
  894. #ifdef LT
  895. addi.d KK, KK, 1
  896. #endif
  897. #ifdef LN
  898. addi.d KK, KK, -1
  899. #endif
  900. .align 3
  901. .L69:
  902. #ifdef LN
  903. slli.d TEMP, K, 1 + BASE_SHIFT
  904. add.d B, B, TEMP
  905. #endif
  906. #if defined(LT) || defined(RN)
  907. move B, BO
  908. #endif
  909. #ifdef RN
  910. addi.d KK, KK, 2
  911. #endif
  912. #ifdef RT
  913. addi.d KK, KK, -2
  914. #endif
  915. .align 3
  916. .L50:
  917. andi J, N, 4
  918. move AO, A
  919. bge $r0, J, .L70
  920. #ifdef RT
  921. slli.d TEMP, K, 2 + BASE_SHIFT
  922. sub.d B, B, TEMP
  923. slli.d TEMP, LDC, 2
  924. sub.d C, C, TEMP
  925. #endif
  926. move CO1, C
  927. MTC c11, $r0
  928. add.d CO2, C, LDC
  929. add.d CO3, CO2, LDC
  930. add.d CO4, CO3, LDC
  931. MOV c21, c11
  932. srai.d I, M, 1
  933. MOV c31, c11
  934. #ifdef LN
  935. add.d KK, M, OFFSET
  936. #endif
  937. #ifdef LT
  938. move KK, OFFSET
  939. #endif
  940. #if defined(LN) || defined(RT)
  941. move AORIG, A
  942. #else
  943. move AO, A
  944. #endif
  945. #ifndef RT
  946. add.d C, CO4, LDC
  947. #endif
  948. MOV c41, c11
  949. bge $r0, I, .L40
  950. .L31:
  951. #if defined(LT) || defined(RN)
  952. LD a1, AO, 0 * SIZE
  953. LD a3, AO, 4 * SIZE
  954. LD b1, B, 0 * SIZE
  955. MOV c12, c11
  956. LD b2, B, 1 * SIZE
  957. MOV c22, c11
  958. LD b3, B, 2 * SIZE
  959. MOV c32, c11
  960. LD b4, B, 3 * SIZE
  961. MOV c42, c11
  962. LD b5, B, 4 * SIZE
  963. srai.d L, KK, 2
  964. LD b6, B, 8 * SIZE
  965. LD b7, B, 12 * SIZE
  966. move BO, B
  967. bge $r0, L, .L35
  968. #else
  969. #ifdef LN
  970. slli.d TEMP, K, 1 + BASE_SHIFT
  971. sub.d AORIG, AORIG, TEMP
  972. #endif
  973. slli.d L, KK, 1 + BASE_SHIFT
  974. slli.d TEMP, KK, 2 + BASE_SHIFT
  975. add.d AO, AORIG, L
  976. add.d BO, B, TEMP
  977. sub.d TEMP, K, KK
  978. LD a1, AO, 0 * SIZE
  979. LD a3, AO, 4 * SIZE
  980. LD b1, BO, 0 * SIZE
  981. MOV c12, c11
  982. LD b2, BO, 1 * SIZE
  983. MOV c22, c11
  984. LD b3, BO, 2 * SIZE
  985. MOV c32, c11
  986. LD b4, BO, 3 * SIZE
  987. MOV c42, c11
  988. LD b5, BO, 4 * SIZE
  989. srai.d L, TEMP, 2
  990. LD b6, BO, 8 * SIZE
  991. LD b7, BO, 12 * SIZE
  992. bge $r0, L, .L35
  993. #endif
  994. .align 3
  995. .L32:
  996. MADD c11, b1, a1, c11
  997. LD a2, AO, 1 * SIZE
  998. MADD c21, b2, a1, c21
  999. addi.d L, L, -1
  1000. MADD c31, b3, a1, c31
  1001. MADD c41, b4, a1, c41
  1002. LD a1, AO, 2 * SIZE
  1003. MADD c12, b1, a2, c12
  1004. LD b1, BO, 16 * SIZE
  1005. MADD c22, b2, a2, c22
  1006. LD b2, BO, 5 * SIZE
  1007. MADD c32, b3, a2, c32
  1008. LD b3, BO, 6 * SIZE
  1009. MADD c42, b4, a2, c42
  1010. LD b4, BO, 7 * SIZE
  1011. MADD c11, b5, a1, c11
  1012. LD a2, AO, 3 * SIZE
  1013. MADD c21, b2, a1, c21
  1014. MADD c31, b3, a1, c31
  1015. MADD c41, b4, a1, c41
  1016. LD a1, AO, 8 * SIZE
  1017. MADD c12, b5, a2, c12
  1018. LD b5, BO, 20 * SIZE
  1019. MADD c22, b2, a2, c22
  1020. LD b2, BO, 9 * SIZE
  1021. MADD c32, b3, a2, c32
  1022. LD b3, BO, 10 * SIZE
  1023. MADD c42, b4, a2, c42
  1024. LD b4, BO, 11 * SIZE
  1025. MADD c11, b6, a3, c11
  1026. LD a2, AO, 5 * SIZE
  1027. MADD c21, b2, a3, c21
  1028. MADD c31, b3, a3, c31
  1029. MADD c41, b4, a3, c41
  1030. LD a3, AO, 6 * SIZE
  1031. MADD c12, b6, a2, c12
  1032. LD b6, BO, 24 * SIZE
  1033. MADD c22, b2, a2, c22
  1034. LD b2, BO, 13 * SIZE
  1035. MADD c32, b3, a2, c32
  1036. LD b3, BO, 14 * SIZE
  1037. MADD c42, b4, a2, c42
  1038. LD b4, BO, 15 * SIZE
  1039. MADD c11, b7, a3, c11
  1040. LD a2, AO, 7 * SIZE
  1041. MADD c21, b2, a3, c21
  1042. addi.d AO, AO, 8 * SIZE
  1043. MADD c31, b3, a3, c31
  1044. addi.d BO, BO, 16 * SIZE
  1045. MADD c41, b4, a3, c41
  1046. LD a3, AO, 4 * SIZE
  1047. MADD c12, b7, a2, c12
  1048. LD b7, BO, 12 * SIZE
  1049. MADD c22, b2, a2, c22
  1050. LD b2, BO, 1 * SIZE
  1051. MADD c32, b3, a2, c32
  1052. LD b3, BO, 2 * SIZE
  1053. MADD c42, b4, a2, c42
  1054. LD b4, BO, 3 * SIZE
  1055. blt $r0, L, .L32
  1056. .align 3
  1057. .L35:
  1058. #if defined(LT) || defined(RN)
  1059. andi L, KK, 3
  1060. #else
  1061. andi L, TEMP, 3
  1062. #endif
  1063. bge $r0, L, .L38
  1064. .align 3
  1065. .L36:
  1066. MADD c11, b1, a1, c11
  1067. LD a2, AO, 1 * SIZE
  1068. MADD c21, b2, a1, c21
  1069. addi.d L, L, -1
  1070. MADD c31, b3, a1, c31
  1071. addi.d AO, AO, 2 * SIZE
  1072. MADD c41, b4, a1, c41
  1073. LD a1, AO, 0 * SIZE
  1074. MADD c12, b1, a2, c12
  1075. LD b1, BO, 4 * SIZE
  1076. MADD c22, b2, a2, c22
  1077. LD b2, BO, 5 * SIZE
  1078. MADD c32, b3, a2, c32
  1079. LD b3, BO, 6 * SIZE
  1080. MADD c42, b4, a2, c42
  1081. LD b4, BO, 7 * SIZE
  1082. addi.d BO, BO, 4 * SIZE
  1083. blt $r0, L, .L36
  1084. .L38:
  1085. #if defined(LN) || defined(RT)
  1086. #ifdef LN
  1087. addi.d TEMP, KK, -2
  1088. #else
  1089. addi.d TEMP, KK, -4
  1090. #endif
  1091. slli.d L, TEMP, 1 + BASE_SHIFT
  1092. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1093. add.d AO, AORIG, L
  1094. add.d BO, B, TEMP
  1095. #endif
  1096. #if defined(LN) || defined(LT)
  1097. LD b1, BO, 0 * SIZE
  1098. LD b2, BO, 1 * SIZE
  1099. LD b3, BO, 2 * SIZE
  1100. LD b4, BO, 3 * SIZE
  1101. LD b5, BO, 4 * SIZE
  1102. LD b6, BO, 5 * SIZE
  1103. LD b7, BO, 6 * SIZE
  1104. LD b8, BO, 7 * SIZE
  1105. SUB c11, b1, c11
  1106. SUB c21, b2, c21
  1107. SUB c31, b3, c31
  1108. SUB c41, b4, c41
  1109. SUB c12, b5, c12
  1110. SUB c22, b6, c22
  1111. SUB c32, b7, c32
  1112. SUB c42, b8, c42
  1113. #else
  1114. LD b1, AO, 0 * SIZE
  1115. LD b2, AO, 1 * SIZE
  1116. LD b3, AO, 2 * SIZE
  1117. LD b4, AO, 3 * SIZE
  1118. LD b5, AO, 4 * SIZE
  1119. LD b6, AO, 5 * SIZE
  1120. LD b7, AO, 6 * SIZE
  1121. LD b8, AO, 7 * SIZE
  1122. SUB c11, b1, c11
  1123. SUB c12, b2, c12
  1124. SUB c21, b3, c21
  1125. SUB c22, b4, c22
  1126. SUB c31, b5, c31
  1127. SUB c32, b6, c32
  1128. SUB c41, b7, c41
  1129. SUB c42, b8, c42
  1130. #endif
  1131. #ifdef LN
  1132. LD b1, AO, 3 * SIZE
  1133. LD b2, AO, 2 * SIZE
  1134. LD b3, AO, 0 * SIZE
  1135. MUL c12, b1, c12
  1136. MUL c22, b1, c22
  1137. MUL c32, b1, c32
  1138. MUL c42, b1, c42
  1139. NMSUB c11, c12, b2, c11
  1140. NMSUB c21, c22, b2, c21
  1141. NMSUB c31, c32, b2, c31
  1142. NMSUB c41, c42, b2, c41
  1143. MUL c11, b3, c11
  1144. MUL c21, b3, c21
  1145. MUL c31, b3, c31
  1146. MUL c41, b3, c41
  1147. #endif
  1148. #ifdef LT
  1149. LD b1, AO, 0 * SIZE
  1150. LD b2, AO, 1 * SIZE
  1151. LD b3, AO, 3 * SIZE
  1152. MUL c11, b1, c11
  1153. MUL c21, b1, c21
  1154. MUL c31, b1, c31
  1155. MUL c41, b1, c41
  1156. NMSUB c12, c11, b2, c12
  1157. NMSUB c22, c21, b2, c22
  1158. NMSUB c32, c31, b2, c32
  1159. NMSUB c42, c41, b2, c42
  1160. MUL c12, b3, c12
  1161. MUL c22, b3, c22
  1162. MUL c32, b3, c32
  1163. MUL c42, b3, c42
  1164. #endif
  1165. #ifdef RN
  1166. LD b1, BO, 0 * SIZE
  1167. LD b2, BO, 1 * SIZE
  1168. LD b3, BO, 2 * SIZE
  1169. LD b4, BO, 3 * SIZE
  1170. MUL c11, b1, c11
  1171. MUL c12, b1, c12
  1172. NMSUB c21, c11, b2, c21
  1173. NMSUB c22, c12, b2, c22
  1174. NMSUB c31, c11, b3, c31
  1175. NMSUB c32, c12, b3, c32
  1176. NMSUB c41, c11, b4, c41
  1177. NMSUB c42, c12, b4, c42
  1178. LD b2, BO, 5 * SIZE
  1179. LD b3, BO, 6 * SIZE
  1180. LD b4, BO, 7 * SIZE
  1181. MUL c21, b2, c21
  1182. MUL c22, b2, c22
  1183. NMSUB c31, c21, b3, c31
  1184. NMSUB c32, c22, b3, c32
  1185. NMSUB c41, c21, b4, c41
  1186. NMSUB c42, c22, b4, c42
  1187. LD b3, BO, 10 * SIZE
  1188. LD b4, BO, 11 * SIZE
  1189. MUL c31, b3, c31
  1190. MUL c32, b3, c32
  1191. NMSUB c41, c31, b4, c41
  1192. NMSUB c42, c32, b4, c42
  1193. LD b4, BO, 15 * SIZE
  1194. MUL c41, b4, c41
  1195. MUL c42, b4, c42
  1196. #endif
  1197. #ifdef RT
  1198. LD b5, BO, 15 * SIZE
  1199. LD b6, BO, 14 * SIZE
  1200. LD b7, BO, 13 * SIZE
  1201. LD b8, BO, 12 * SIZE
  1202. MUL c41, b5, c41
  1203. MUL c42, b5, c42
  1204. NMSUB c31, c41, b6, c31
  1205. NMSUB c32, c42, b6, c32
  1206. NMSUB c21, c41, b7, c21
  1207. NMSUB c22, c42, b7, c22
  1208. NMSUB c11, c41, b8, c11
  1209. NMSUB c12, c42, b8, c12
  1210. LD b6, BO, 10 * SIZE
  1211. LD b7, BO, 9 * SIZE
  1212. LD b8, BO, 8 * SIZE
  1213. MUL c31, b6, c31
  1214. MUL c32, b6, c32
  1215. NMSUB c21, c31, b7, c21
  1216. NMSUB c22, c32, b7, c22
  1217. NMSUB c11, c31, b8, c11
  1218. NMSUB c12, c32, b8, c12
  1219. LD b7, BO, 5 * SIZE
  1220. LD b8, BO, 4 * SIZE
  1221. MUL c21, b7, c21
  1222. MUL c22, b7, c22
  1223. NMSUB c11, c21, b8, c11
  1224. NMSUB c12, c22, b8, c12
  1225. LD b8, BO, 0 * SIZE
  1226. MUL c11, b8, c11
  1227. MUL c12, b8, c12
  1228. #endif
  1229. #ifdef LN
  1230. addi.d CO1, CO1, -2 * SIZE
  1231. addi.d CO2, CO2, -2 * SIZE
  1232. addi.d CO3, CO3, -2 * SIZE
  1233. addi.d CO4, CO4, -2 * SIZE
  1234. #endif
  1235. #if defined(LN) || defined(LT)
  1236. ST c11, BO, 0 * SIZE
  1237. ST c21, BO, 1 * SIZE
  1238. ST c31, BO, 2 * SIZE
  1239. ST c41, BO, 3 * SIZE
  1240. ST c12, BO, 4 * SIZE
  1241. ST c22, BO, 5 * SIZE
  1242. ST c32, BO, 6 * SIZE
  1243. ST c42, BO, 7 * SIZE
  1244. #else
  1245. ST c11, AO, 0 * SIZE
  1246. ST c12, AO, 1 * SIZE
  1247. ST c21, AO, 2 * SIZE
  1248. ST c22, AO, 3 * SIZE
  1249. ST c31, AO, 4 * SIZE
  1250. ST c32, AO, 5 * SIZE
  1251. ST c41, AO, 6 * SIZE
  1252. ST c42, AO, 7 * SIZE
  1253. #endif
  1254. ST c11, CO1, 0 * SIZE
  1255. ST c12, CO1, 1 * SIZE
  1256. ST c21, CO2, 0 * SIZE
  1257. ST c22, CO2, 1 * SIZE
  1258. ST c31, CO3, 0 * SIZE
  1259. ST c32, CO3, 1 * SIZE
  1260. ST c41, CO4, 0 * SIZE
  1261. ST c42, CO4, 1 * SIZE
  1262. #ifndef LN
  1263. addi.d CO1, CO1, 2 * SIZE
  1264. addi.d CO2, CO2, 2 * SIZE
  1265. addi.d CO3, CO3, 2 * SIZE
  1266. addi.d CO4, CO4, 2 * SIZE
  1267. #endif
  1268. #ifdef RT
  1269. slli.d TEMP, K, 1 + BASE_SHIFT
  1270. add.d AORIG, AORIG, TEMP
  1271. #endif
  1272. #if defined(LT) || defined(RN)
  1273. sub.d TEMP, K, KK
  1274. slli.d L, TEMP, 1 + BASE_SHIFT
  1275. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1276. add.d AO, AO, L
  1277. add.d BO, BO, TEMP
  1278. #endif
  1279. #ifdef LT
  1280. addi.d KK, KK, 2
  1281. #endif
  1282. #ifdef LN
  1283. addi.d KK, KK, -2
  1284. #endif
  1285. MTC a1, $r0
  1286. MOV c11, a1
  1287. MOV c21, a1
  1288. MOV c31, a1
  1289. addi.d I, I, -1
  1290. MOV c41, c11
  1291. blt $r0, I, .L31
  /* .L40: leftover single row of the tile whose results go to CO1..CO4. */
  /* Runs only when M is odd (I = M & 1); otherwise control goes to .L49. */
  /* c61/c71/c81 are copied from c11, which is expected to be zero here -- TODO confirm for every entry path. */
  1292. .align 3
  1293. .L40:
  1294. andi I, M, 1
  1295. MOV c61, c11
  1296. bge $r0, I, .L49
  1297. #if defined(LT) || defined(RN)
  /* LT/RN: read A from the running AO pointer and B from the panel start. */
  1298. LD a1, AO, 0 * SIZE
  1299. MOV c71, c11
  1300. LD a2, AO, 1 * SIZE
  1301. MOV c81, c11
  1302. LD b1, B, 0 * SIZE
  1303. LD b2, B, 1 * SIZE
  1304. LD b3, B, 2 * SIZE
  1305. LD b4, B, 3 * SIZE
  1306. LD b5, B, 4 * SIZE
  1307. LD b6, B, 8 * SIZE
  1308. LD b7, B, 12 * SIZE
  /* L = KK >> 2 passes of the 4x unrolled loop; skip it when that is not positive. */
  1309. srai.d L, KK, 2
  1310. move BO, B
  1311. bge $r0, L, .L45
  1312. #else
  1313. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes before using it. */
  1314. slli.d TEMP, K, BASE_SHIFT
  1315. sub.d AORIG, AORIG, TEMP
  1316. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (2 + BASE_SHIFT)). */
  1317. slli.d L, KK, 0 + BASE_SHIFT
  1318. slli.d TEMP, KK, 2 + BASE_SHIFT
  1319. add.d AO, AORIG, L
  1320. add.d BO, B, TEMP
  /* TEMP = K - KK is the trip count here; .L45 reads TEMP again for the remainder. */
  1321. sub.d TEMP, K, KK
  1322. LD a1, AO, 0 * SIZE
  1323. MOV c71, c11
  1324. LD a2, AO, 1 * SIZE
  1325. MOV c81, c11
  1326. LD b1, BO, 0 * SIZE
  1327. LD b2, BO, 1 * SIZE
  1328. LD b3, BO, 2 * SIZE
  1329. LD b4, BO, 3 * SIZE
  1330. LD b5, BO, 4 * SIZE
  1331. LD b6, BO, 8 * SIZE
  1332. LD b7, BO, 12 * SIZE
  1333. srai.d L, TEMP, 2
  1334. bge $r0, L, .L45
  1335. #endif
  /* .L42: 4x unrolled accumulate loop for the single-row, 4-accumulator tile. */
  /* Each pass consumes 4 A values (AO += 4 * SIZE) and 16 B values (BO += 16 * SIZE) */
  /* and adds into c11/c21/c31/c41. Loads for later steps are interleaved with the MADDs. */
  /* MADD d, x, y, z is a macro defined outside this view; presumably d = x * y + z -- TODO confirm. */
  1336. .align 3
  1337. .L42:
  /* Step 0: A value in a1. */
  1338. MADD c11, b1, a1, c11
  1339. LD b1, BO, 16 * SIZE
  1340. MADD c21, b2, a1, c21
  1341. LD b2, BO, 5 * SIZE
  1342. MADD c31, b3, a1, c31
  1343. LD b3, BO, 6 * SIZE
  1344. MADD c41, b4, a1, c41
  1345. LD b4, BO, 7 * SIZE
  1346. LD a1, AO, 4 * SIZE
  1347. addi.d L, L, -1
  /* Step 1: A value in a2 (loaded from offset 1 before this pass). */
  1348. MADD c11, b5, a2, c11
  1349. LD b5, BO, 20 * SIZE
  1350. MADD c21, b2, a2, c21
  1351. LD b2, BO, 9 * SIZE
  1352. MADD c31, b3, a2, c31
  1353. LD b3, BO, 10 * SIZE
  1354. MADD c41, b4, a2, c41
  1355. LD b4, BO, 11 * SIZE
  1356. LD a2, AO, 2 * SIZE
  1357. addi.d AO, AO, 4 * SIZE
  /* Step 2: a2 now holds the value from old offset 2. */
  1358. MADD c11, b6, a2, c11
  1359. LD b6, BO, 24 * SIZE
  1360. MADD c21, b2, a2, c21
  1361. LD b2, BO, 13 * SIZE
  1362. MADD c31, b3, a2, c31
  1363. LD b3, BO, 14 * SIZE
  1364. MADD c41, b4, a2, c41
  1365. LD b4, BO, 15 * SIZE
  /* AO was already advanced by 4, so offset -1 is old offset 3. */
  1366. LD a2, AO, -1 * SIZE
  1367. addi.d BO, BO, 16 * SIZE
  /* Step 3; the B loads below are relative to the advanced BO and feed the next pass. */
  1368. MADD c11, b7, a2, c11
  1369. LD b7, BO, 12 * SIZE
  1370. MADD c21, b2, a2, c21
  1371. LD b2, BO, 1 * SIZE
  1372. MADD c31, b3, a2, c31
  1373. LD b3, BO, 2 * SIZE
  1374. MADD c41, b4, a2, c41
  1375. LD b4, BO, 3 * SIZE
  1376. LD a2, AO, 1 * SIZE
  /* Repeat while L > 0. */
  1377. blt $r0, L, .L42
  /* .L45: remainder count for the single-row tile, L = trip count & 3. */
  /* LT/RN use KK; the other variants reuse TEMP, which still holds K - KK from the setup before .L42. */
  1378. .align 3
  1379. .L45:
  1380. #if defined(LT) || defined(RN)
  1381. andi L, KK, 3
  1382. #else
  1383. andi L, TEMP, 3
  1384. #endif
  1385. bge $r0, L, .L48
  /* .L46: one step per pass; AO += 1 * SIZE, BO += 4 * SIZE. */
  1386. .align 3
  1387. .L46:
  1388. MADD c11, b1, a1, c11
  1389. LD b1, BO, 4 * SIZE
  1390. MADD c21, b2, a1, c21
  1391. LD b2, BO, 5 * SIZE
  1392. MADD c31, b3, a1, c31
  1393. LD b3, BO, 6 * SIZE
  1394. MADD c41, b4, a1, c41
  1395. LD a1, AO, 1 * SIZE
  1396. LD b4, BO, 7 * SIZE
  1397. addi.d L, L, -1
  1398. addi.d AO, AO, 1 * SIZE
  /* NOTE(review): a2 is moved onto itself; no effect is visible here, looks like a scheduling filler -- confirm. */
  1399. MOV a2, a2
  1400. addi.d BO, BO, 4 * SIZE
  1401. blt $r0, L, .L46
  /* .L48: solve and write-back for the single-row tile held in c11/c21/c31/c41. */
  /* SUB, MUL and NMSUB are macros defined outside this view. Presumably SUB d, x, y gives x - y */
  /* and NMSUB d, x, y, z gives z - x * y -- TODO confirm against the macro definitions. */
  1402. .L48:
  1403. #if defined(LN) || defined(RT)
  /* LN/RT: re-point AO/BO at index KK - 1 (LN) or KK - 4 (RT) relative to AORIG/B. */
  1404. #ifdef LN
  1405. addi.d TEMP, KK, -1
  1406. #else
  1407. addi.d TEMP, KK, -4
  1408. #endif
  1409. slli.d L, TEMP, 0 + BASE_SHIFT
  1410. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1411. add.d AO, AORIG, L
  1412. add.d BO, B, TEMP
  1413. #endif
  /* Combine four stored values with the accumulators: from BO for LN/LT, from AO otherwise. */
  1414. #if defined(LN) || defined(LT)
  1415. LD b1, BO, 0 * SIZE
  1416. LD b2, BO, 1 * SIZE
  1417. LD b3, BO, 2 * SIZE
  1418. LD b4, BO, 3 * SIZE
  1419. SUB c11, b1, c11
  1420. SUB c21, b2, c21
  1421. SUB c31, b3, c31
  1422. SUB c41, b4, c41
  1423. #else
  1424. LD b1, AO, 0 * SIZE
  1425. LD b2, AO, 1 * SIZE
  1426. LD b3, AO, 2 * SIZE
  1427. LD b4, AO, 3 * SIZE
  1428. SUB c11, b1, c11
  1429. SUB c21, b2, c21
  1430. SUB c31, b3, c31
  1431. SUB c41, b4, c41
  1432. #endif
  1433. #if defined(LN) || defined(LT)
  /* LN/LT: scale all four values by the single entry at AO[0]. */
  /* NOTE(review): a multiply is used, so the packed diagonal is presumably stored inverted -- confirm. */
  1434. LD b1, AO, 0 * SIZE
  1435. MUL c11, b1, c11
  1436. MUL c21, b1, c21
  1437. MUL c31, b1, c31
  1438. MUL c41, b1, c41
  1439. #endif
  1440. #ifdef RN
  /* RN: substitution from c11 towards c41; coefficients are read at BO offsets 4 * i + j with j >= i. */
  1441. LD b1, BO, 0 * SIZE
  1442. LD b2, BO, 1 * SIZE
  1443. LD b3, BO, 2 * SIZE
  1444. LD b4, BO, 3 * SIZE
  1445. MUL c11, b1, c11
  1446. NMSUB c21, c11, b2, c21
  1447. NMSUB c31, c11, b3, c31
  1448. NMSUB c41, c11, b4, c41
  1449. LD b2, BO, 5 * SIZE
  1450. LD b3, BO, 6 * SIZE
  1451. LD b4, BO, 7 * SIZE
  1452. MUL c21, b2, c21
  1453. NMSUB c31, c21, b3, c31
  1454. NMSUB c41, c21, b4, c41
  1455. LD b3, BO, 10 * SIZE
  1456. LD b4, BO, 11 * SIZE
  1457. MUL c31, b3, c31
  1458. NMSUB c41, c31, b4, c41
  1459. LD b4, BO, 15 * SIZE
  1460. MUL c41, b4, c41
  1461. #endif
  1462. #ifdef RT
  /* RT: substitution from c41 back to c11; coefficients are read at BO offsets 4 * i + j with j <= i. */
  1463. LD b5, BO, 15 * SIZE
  1464. LD b6, BO, 14 * SIZE
  1465. LD b7, BO, 13 * SIZE
  1466. LD b8, BO, 12 * SIZE
  1467. MUL c41, b5, c41
  1468. NMSUB c31, c41, b6, c31
  1469. NMSUB c21, c41, b7, c21
  1470. NMSUB c11, c41, b8, c11
  1471. LD b6, BO, 10 * SIZE
  1472. LD b7, BO, 9 * SIZE
  1473. LD b8, BO, 8 * SIZE
  1474. MUL c31, b6, c31
  1475. NMSUB c21, c31, b7, c21
  1476. NMSUB c11, c31, b8, c11
  1477. LD b7, BO, 5 * SIZE
  1478. LD b8, BO, 4 * SIZE
  1479. MUL c21, b7, c21
  1480. NMSUB c11, c21, b8, c11
  1481. LD b8, BO, 0 * SIZE
  1482. MUL c11, b8, c11
  1483. #endif
  1484. #ifdef LN
  /* LN: step the four output pointers back by one element before storing. */
  1485. addi.d CO1, CO1, -1 * SIZE
  1486. addi.d CO2, CO2, -1 * SIZE
  1487. addi.d CO3, CO3, -1 * SIZE
  1488. addi.d CO4, CO4, -1 * SIZE
  1489. #endif
  /* Write the solved values back to the packed buffer they were read from (BO for LN/LT, AO otherwise). */
  1490. #if defined(LN) || defined(LT)
  1491. ST c11, BO, 0 * SIZE
  1492. ST c21, BO, 1 * SIZE
  1493. ST c31, BO, 2 * SIZE
  1494. ST c41, BO, 3 * SIZE
  1495. #else
  1496. ST c11, AO, 0 * SIZE
  1497. ST c21, AO, 1 * SIZE
  1498. ST c31, AO, 2 * SIZE
  1499. ST c41, AO, 3 * SIZE
  1500. #endif
  /* Store one value through each of CO1..CO4. */
  1501. ST c11, CO1, 0 * SIZE
  1502. ST c21, CO2, 0 * SIZE
  1503. ST c31, CO3, 0 * SIZE
  1504. ST c41, CO4, 0 * SIZE
  1505. #ifndef LN
  1506. addi.d CO1, CO1, 1 * SIZE
  1507. addi.d CO2, CO2, 1 * SIZE
  1508. addi.d CO3, CO3, 1 * SIZE
  1509. addi.d CO4, CO4, 1 * SIZE
  1510. #endif
  1511. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  1512. slli.d TEMP, K, BASE_SHIFT
  1513. add.d AORIG, AORIG, TEMP
  1514. #endif
  1515. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed (1 A value and 4 B values per step). */
  1516. sub.d TEMP, K, KK
  1517. slli.d L, TEMP, 0 + BASE_SHIFT
  1518. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1519. add.d AO, AO, L
  1520. add.d BO, BO, TEMP
  1521. #endif
  /* KK moves by the one row just handled: +1 for LT, -1 for LN. */
  1522. #ifdef LT
  1523. addi.d KK, KK, 1
  1524. #endif
  1525. #ifdef LN
  1526. addi.d KK, KK, -1
  1527. #endif
  /* .L49: bookkeeping after the tiles that write through CO1..CO4 are done. */
  1528. .align 3
  1529. .L49:
  1530. #ifdef LN
  /* LN: B += K << (2 + BASE_SHIFT) bytes. */
  1531. slli.d TEMP, K, 2 + BASE_SHIFT
  1532. add.d B, B, TEMP
  1533. #endif
  1534. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over the running BO pointer. */
  1535. move B, BO
  1536. #endif
  /* KK moves by 4: forward for RN, backward for RT. */
  1537. #ifdef RN
  1538. addi.d KK, KK, 4
  1539. #endif
  1540. #ifdef RT
  1541. addi.d KK, KK, -4
  1542. #endif
  /* .L70: J = N >> 3 is the trip count of the loop starting at .L10; go to .L999 when it is not positive. */
  1543. .align 3
  1544. .L70:
  1545. srai.d J, N, 3
  1546. nop
  1547. bge $r0, J, .L999
  /* .L10: top of the J loop; sets up the eight output pointers CO1..CO8 and the counters. */
  1548. .L10:
  1549. #ifdef RT
  /* RT: step B back by K << (3 + BASE_SHIFT) bytes and C back by LDC << 3. */
  /* NOTE(review): LDC is presumably already scaled to bytes by code outside this view -- confirm. */
  1550. slli.d TEMP, K, 3 + BASE_SHIFT
  1551. sub.d B, B, TEMP
  1552. slli.d TEMP, LDC, 3
  1553. sub.d C, C, TEMP
  1554. #endif
  /* CO1 = C and CO(n+1) = CO(n) + LDC. The accumulator clears are interleaved with the pointer adds. */
  /* MTC is a macro defined outside this view; presumably it copies integer zero ($r0) into c11 -- TODO confirm. */
  1555. move CO1, C
  1556. MTC c11, $r0
  1557. add.d CO2, C, LDC
  1558. add.d CO3, CO2, LDC
  1559. addi.d J, J, -1
  1560. add.d CO4, CO3, LDC
  1561. MOV c21, c11
  1562. add.d CO5, CO4, LDC
  1563. MOV c31, c11
  1564. add.d CO6, CO5, LDC
  1565. MOV c41, c11
  1566. add.d CO7, CO6, LDC
  1567. MOV c51, c11
  1568. add.d CO8, CO7, LDC
  /* I = M >> 1 is the number of two-row tiles handled by .L11. */
  1569. srai.d I, M, 1
  1570. #ifdef LN
  1571. add.d KK, M, OFFSET
  1572. #endif
  1573. #ifdef LT
  1574. move KK, OFFSET
  1575. #endif
  /* LN/RT track A through AORIG; LT/RN walk it directly through AO. */
  1576. #if defined(LN) || defined(RT)
  1577. move AORIG, A
  1578. #else
  1579. move AO, A
  1580. #endif
  1581. #ifndef RT
  /* All variants except RT advance C past CO8 here; RT already moved C backwards above. */
  1582. add.d C, CO8, LDC
  1583. #endif
  1584. MOV c61, c11
  /* No two-row tiles: go to the single-row tail at .L20. */
  1585. bge $r0, I, .L20
  /* .L11: setup for one tile of 2 A values by 8 B values (accumulators c11..c81 and c12..c82). */
  /* Preloads a1, a3 and b1..b7, clears the accumulators not already cleared, and computes */
  /* L as the number of passes of the 4x unrolled loop at .L12. */
  1586. .L11:
  1587. #if defined(LT) || defined(RN)
  /* LT/RN: B is read from the panel start, trip count is KK. */
  1588. LD a1, AO, 0 * SIZE
  1589. MOV c71, c11
  1590. LD b1, B, 0 * SIZE
  1591. MOV c81, c11
  1592. LD a3, AO, 4 * SIZE
  1593. MOV c12, c11
  1594. LD b2, B, 1 * SIZE
  1595. MOV c22, c11
  1596. srai.d L, KK, 2
  1597. MOV c32, c11
  1598. LD b3, B, 2 * SIZE
  1599. MOV c42, c11
  1600. LD b4, B, 3 * SIZE
  1601. MOV c52, c11
  1602. LD b5, B, 4 * SIZE
  1603. MOV c62, c11
  1604. LD b6, B, 8 * SIZE
  1605. MOV c72, c11
  1606. LD b7, B, 12 * SIZE
  1607. MOV c82, c11
  1608. move BO, B
  1609. bge $r0, L, .L15
  1610. #else
  1611. #ifdef LN
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes before using it. */
  1612. slli.d TEMP, K, 1 + BASE_SHIFT
  1613. sub.d AORIG, AORIG, TEMP
  1614. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << (3 + BASE_SHIFT)). */
  1615. slli.d L, KK, 1 + BASE_SHIFT
  1616. slli.d TEMP, KK, 3 + BASE_SHIFT
  1617. add.d AO, AORIG, L
  1618. add.d BO, B, TEMP
  /* TEMP = K - KK is the trip count here; .L15 reads TEMP again for the remainder. */
  1619. sub.d TEMP, K, KK
  1620. LD a1, AO, 0 * SIZE
  1621. MOV c71, c11
  1622. LD b1, BO, 0 * SIZE
  1623. MOV c81, c11
  1624. LD a3, AO, 4 * SIZE
  1625. MOV c12, c11
  1626. LD b2, BO, 1 * SIZE
  1627. MOV c22, c11
  1628. MOV c32, c11
  1629. LD b3, BO, 2 * SIZE
  1630. MOV c42, c11
  1631. LD b4, BO, 3 * SIZE
  1632. MOV c52, c11
  1633. LD b5, BO, 4 * SIZE
  1634. MOV c62, c11
  1635. LD b6, BO, 8 * SIZE
  1636. MOV c72, c11
  1637. LD b7, BO, 12 * SIZE
  1638. MOV c82, c11
  1639. srai.d L, TEMP, 2
  1640. bge $r0, L, .L15
  1641. #endif
  /* Pipeline prologue: issue the first four MADDs of step 0 before entering the loop. */
  /* With only one pass (L becomes 0 after the decrement) go straight to the drain copy at .L13. */
  1642. MADD c11, b1, a1, c11
  1643. LD a2, AO, 1 * SIZE
  1644. MADD c21, b2, a1, c21
  1645. addi.d L, L, -1
  1646. MADD c31, b3, a1, c31
  1647. MADD c41, b4, a1, c41
  1648. bge $r0, L, .L13
  /* .L12: steady-state loop for the 2 x 8 tile, four steps per pass. */
  /* On entry the four MADDs c11..c41 with a1 for step 0 have already been issued */
  /* (by the code before the loop or by the tail of the previous pass). */
  /* Per pass AO advances by 8 * SIZE and BO by 32 * SIZE. */
  1649. .align 3
  1650. .L12:
  /* Step 0: remaining products (second A value a2, and B values 5..8 for both A values). */
  1651. MADD c12, b1, a2, c12
  1652. LD b1, BO, 16 * SIZE
  1653. MADD c22, b2, a2, c22
  1654. LD b2, BO, 5 * SIZE
  1655. MADD c32, b3, a2, c32
  1656. LD b3, BO, 6 * SIZE
  1657. MADD c42, b4, a2, c42
  1658. LD b4, BO, 7 * SIZE
  1659. MADD c51, b5, a1, c51
  1660. MADD c61, b2, a1, c61
  1661. LD a4, AO, 2 * SIZE
  1662. MADD c71, b3, a1, c71
  1663. MADD c81, b4, a1, c81
  1664. LD a1, AO, 8 * SIZE
  1665. MADD c52, b5, a2, c52
  1666. LD b5, BO, 20 * SIZE
  1667. MADD c62, b2, a2, c62
  1668. LD b2, BO, 9 * SIZE
  1669. MADD c72, b3, a2, c72
  1670. LD b3, BO, 10 * SIZE
  1671. MADD c82, b4, a2, c82
  1672. LD b4, BO, 11 * SIZE
  /* Step 1: A values a4 (AO[2]) and a2 (AO[3]); B values from BO[8..15]. */
  1673. MADD c11, b6, a4, c11
  1674. LD a2, AO, 3 * SIZE
  1675. MADD c21, b2, a4, c21
  1676. MADD c31, b3, a4, c31
  1677. MADD c41, b4, a4, c41
  1678. MADD c12, b6, a2, c12
  1679. LD b6, BO, 24 * SIZE
  1680. MADD c22, b2, a2, c22
  1681. LD b2, BO, 13 * SIZE
  1682. MADD c32, b3, a2, c32
  1683. LD b3, BO, 14 * SIZE
  1684. MADD c42, b4, a2, c42
  1685. LD b4, BO, 15 * SIZE
  1686. MADD c51, b7, a4, c51
  1687. MADD c61, b2, a4, c61
  1688. MADD c71, b3, a4, c71
  1689. MADD c81, b4, a4, c81
  1690. MADD c52, b7, a2, c52
  1691. LD b7, BO, 28 * SIZE
  1692. MADD c62, b2, a2, c62
  1693. LD b2, BO, 17 * SIZE
  1694. MADD c72, b3, a2, c72
  1695. LD b3, BO, 18 * SIZE
  1696. MADD c82, b4, a2, c82
  1697. LD b4, BO, 19 * SIZE
  /* Step 2: A values a3 (AO[4]) and a2 (AO[5]); B values from BO[16..23]. */
  1698. MADD c11, b1, a3, c11
  1699. LD a2, AO, 5 * SIZE
  1700. MADD c21, b2, a3, c21
  1701. MADD c31, b3, a3, c31
  1702. MADD c41, b4, a3, c41
  1703. MADD c12, b1, a2, c12
  1704. LD b1, BO, 32 * SIZE
  1705. MADD c22, b2, a2, c22
  1706. LD b2, BO, 21 * SIZE
  1707. MADD c32, b3, a2, c32
  1708. LD b3, BO, 22 * SIZE
  1709. MADD c42, b4, a2, c42
  1710. LD b4, BO, 23 * SIZE
  1711. MADD c51, b5, a3, c51
  1712. MADD c61, b2, a3, c61
  1713. LD a4, AO, 6 * SIZE
  1714. MADD c71, b3, a3, c71
  1715. MADD c81, b4, a3, c81
  1716. LD a3, AO, 12 * SIZE
  1717. MADD c52, b5, a2, c52
  1718. LD b5, BO, 36 * SIZE
  1719. MADD c62, b2, a2, c62
  1720. LD b2, BO, 25 * SIZE
  1721. MADD c72, b3, a2, c72
  1722. LD b3, BO, 26 * SIZE
  1723. MADD c82, b4, a2, c82
  1724. LD b4, BO, 27 * SIZE
  /* Step 3: A values a4 (AO[6]) and a2 (AO[7]); B values from BO[24..31]. */
  1725. MADD c11, b6, a4, c11
  1726. LD a2, AO, 7 * SIZE
  1727. MADD c21, b2, a4, c21
  1728. MADD c31, b3, a4, c31
  1729. MADD c41, b4, a4, c41
  1730. addi.d L, L, -1
  1731. MADD c12, b6, a2, c12
  1732. LD b6, BO, 40 * SIZE
  1733. MADD c22, b2, a2, c22
  1734. LD b2, BO, 29 * SIZE
  1735. MADD c32, b3, a2, c32
  1736. LD b3, BO, 30 * SIZE
  1737. MADD c42, b4, a2, c42
  1738. LD b4, BO, 31 * SIZE
  1739. MADD c51, b7, a4, c51
  /* Pointers advance here; every offset below is relative to the new AO/BO. */
  1740. addi.d BO, BO, 32 * SIZE
  1741. MADD c61, b2, a4, c61
  1742. addi.d AO, AO, 8 * SIZE
  1743. MADD c71, b3, a4, c71
  1744. MADD c81, b4, a4, c81
  1745. MADD c52, b7, a2, c52
  1746. LD b7, BO, 12 * SIZE
  1747. MADD c62, b2, a2, c62
  1748. LD b2, BO, 1 * SIZE
  1749. MADD c72, b3, a2, c72
  1750. LD b3, BO, 2 * SIZE
  1751. MADD c82, b4, a2, c82
  1752. LD b4, BO, 3 * SIZE
  /* Start of the next pass: first four MADDs of its step 0, matching the entry condition above. */
  1753. MADD c11, b1, a1, c11
  1754. LD a2, AO, 1 * SIZE
  1755. MADD c21, b2, a1, c21
  1756. MADD c31, b3, a1, c31
  1757. MADD c41, b4, a1, c41
  1758. blt $r0, L, .L12
  /* .L13: final pass of the 2 x 8 unrolled loop (pipeline drain). */
  /* Same instruction sequence as .L12 minus the L decrement, the look-ahead MADDs for a next pass, */
  /* and the loop branch; it falls through to .L15. AO += 8 * SIZE and BO += 32 * SIZE as in .L12. */
  1759. .align 3
  1760. .L13:
  /* Step 0: remaining products (the c11..c41 products with a1 were issued before entry). */
  1761. MADD c12, b1, a2, c12
  1762. LD b1, BO, 16 * SIZE
  1763. MADD c22, b2, a2, c22
  1764. LD b2, BO, 5 * SIZE
  1765. MADD c32, b3, a2, c32
  1766. LD b3, BO, 6 * SIZE
  1767. MADD c42, b4, a2, c42
  1768. LD b4, BO, 7 * SIZE
  1769. MADD c51, b5, a1, c51
  1770. MADD c61, b2, a1, c61
  1771. LD a4, AO, 2 * SIZE
  1772. MADD c71, b3, a1, c71
  1773. MADD c81, b4, a1, c81
  1774. LD a1, AO, 8 * SIZE
  1775. MADD c52, b5, a2, c52
  1776. LD b5, BO, 20 * SIZE
  1777. MADD c62, b2, a2, c62
  1778. LD b2, BO, 9 * SIZE
  1779. MADD c72, b3, a2, c72
  1780. LD b3, BO, 10 * SIZE
  1781. MADD c82, b4, a2, c82
  1782. LD b4, BO, 11 * SIZE
  /* Step 1: A values a4 (AO[2]) and a2 (AO[3]). */
  1783. MADD c11, b6, a4, c11
  1784. LD a2, AO, 3 * SIZE
  1785. MADD c21, b2, a4, c21
  1786. MADD c31, b3, a4, c31
  1787. MADD c41, b4, a4, c41
  1788. MADD c12, b6, a2, c12
  1789. LD b6, BO, 24 * SIZE
  1790. MADD c22, b2, a2, c22
  1791. LD b2, BO, 13 * SIZE
  1792. MADD c32, b3, a2, c32
  1793. LD b3, BO, 14 * SIZE
  1794. MADD c42, b4, a2, c42
  1795. LD b4, BO, 15 * SIZE
  1796. MADD c51, b7, a4, c51
  1797. MADD c61, b2, a4, c61
  1798. MADD c71, b3, a4, c71
  1799. MADD c81, b4, a4, c81
  1800. MADD c52, b7, a2, c52
  1801. LD b7, BO, 28 * SIZE
  1802. MADD c62, b2, a2, c62
  1803. LD b2, BO, 17 * SIZE
  1804. MADD c72, b3, a2, c72
  1805. LD b3, BO, 18 * SIZE
  1806. MADD c82, b4, a2, c82
  1807. LD b4, BO, 19 * SIZE
  /* Step 2: A values a3 (AO[4]) and a2 (AO[5]). */
  1808. MADD c11, b1, a3, c11
  1809. LD a2, AO, 5 * SIZE
  1810. MADD c21, b2, a3, c21
  1811. MADD c31, b3, a3, c31
  1812. MADD c41, b4, a3, c41
  1813. MADD c12, b1, a2, c12
  1814. LD b1, BO, 32 * SIZE
  1815. MADD c22, b2, a2, c22
  1816. LD b2, BO, 21 * SIZE
  1817. MADD c32, b3, a2, c32
  1818. LD b3, BO, 22 * SIZE
  1819. MADD c42, b4, a2, c42
  1820. LD b4, BO, 23 * SIZE
  1821. MADD c51, b5, a3, c51
  1822. MADD c61, b2, a3, c61
  1823. LD a4, AO, 6 * SIZE
  1824. MADD c71, b3, a3, c71
  1825. MADD c81, b4, a3, c81
  1826. LD a3, AO, 12 * SIZE
  1827. MADD c52, b5, a2, c52
  1828. LD b5, BO, 36 * SIZE
  1829. MADD c62, b2, a2, c62
  1830. LD b2, BO, 25 * SIZE
  1831. MADD c72, b3, a2, c72
  1832. LD b3, BO, 26 * SIZE
  1833. MADD c82, b4, a2, c82
  1834. LD b4, BO, 27 * SIZE
  /* Step 3: A values a4 (AO[6]) and a2 (AO[7]). */
  1835. MADD c11, b6, a4, c11
  1836. LD a2, AO, 7 * SIZE
  1837. MADD c21, b2, a4, c21
  1838. MADD c31, b3, a4, c31
  1839. MADD c41, b4, a4, c41
  1840. MADD c12, b6, a2, c12
  1841. LD b6, BO, 40 * SIZE
  1842. MADD c22, b2, a2, c22
  1843. LD b2, BO, 29 * SIZE
  1844. MADD c32, b3, a2, c32
  1845. LD b3, BO, 30 * SIZE
  1846. MADD c42, b4, a2, c42
  1847. LD b4, BO, 31 * SIZE
  1848. MADD c51, b7, a4, c51
  /* Pointers advance here; the loads below leave b1..b7 and a1 primed for the remainder loop. */
  1849. addi.d BO, BO, 32 * SIZE
  1850. MADD c61, b2, a4, c61
  1851. addi.d AO, AO, 8 * SIZE
  1852. MADD c71, b3, a4, c71
  1853. MADD c81, b4, a4, c81
  1854. MADD c52, b7, a2, c52
  1855. LD b7, BO, 12 * SIZE
  1856. MADD c62, b2, a2, c62
  1857. LD b2, BO, 1 * SIZE
  1858. MADD c72, b3, a2, c72
  1859. LD b3, BO, 2 * SIZE
  1860. MADD c82, b4, a2, c82
  1861. LD b4, BO, 3 * SIZE
  /* .L15: remainder count for the 2 x 8 tile, L = trip count & 3. */
  /* LT/RN use KK; the other variants reuse TEMP, which still holds K - KK from the setup at .L11. */
  1862. .align 3
  1863. .L15:
  1864. #if defined(LT) || defined(RN)
  1865. andi L, KK, 3
  1866. #else
  1867. andi L, TEMP, 3
  1868. #endif
  1869. bge $r0, L, .L18
  /* .L16: one step per pass; AO += 2 * SIZE, BO += 8 * SIZE, all 16 accumulators updated. */
  1870. .align 3
  1871. .L16:
  1872. MADD c11, b1, a1, c11
  1873. LD a2, AO, 1 * SIZE
  1874. MADD c21, b2, a1, c21
  1875. MADD c31, b3, a1, c31
  1876. MADD c41, b4, a1, c41
  1877. MADD c12, b1, a2, c12
  1878. LD b1, BO, 8 * SIZE
  1879. MADD c22, b2, a2, c22
  1880. LD b2, BO, 5 * SIZE
  1881. MADD c32, b3, a2, c32
  1882. LD b3, BO, 6 * SIZE
  1883. MADD c42, b4, a2, c42
  1884. LD b4, BO, 7 * SIZE
  1885. MADD c51, b5, a1, c51
  1886. addi.d L, L, -1
  1887. MADD c61, b2, a1, c61
  1888. addi.d AO, AO, 2 * SIZE
  1889. MADD c71, b3, a1, c71
  1890. addi.d BO, BO, 8 * SIZE
  1891. MADD c81, b4, a1, c81
  /* From here on the offsets are relative to the advanced AO/BO and prime the next pass. */
  1892. LD a1, AO, 0 * SIZE
  1893. MADD c52, b5, a2, c52
  1894. LD b5, BO, 4 * SIZE
  1895. MADD c62, b2, a2, c62
  1896. LD b2, BO, 1 * SIZE
  1897. MADD c72, b3, a2, c72
  1898. LD b3, BO, 2 * SIZE
  1899. MADD c82, b4, a2, c82
  1900. LD b4, BO, 3 * SIZE
  1901. blt $r0, L, .L16
  /* .L18: start of the solve phase for the 2 x 8 tile. */
  /* Re-points AO/BO (LN/RT only), then combines 16 stored values with the accumulators. */
  /* SUB is a macro defined outside this view; presumably SUB d, x, y gives d = x - y -- TODO confirm. */
  1902. .L18:
  1903. #if defined(LN) || defined(RT)
  /* LN/RT: index KK - 2 (LN) or KK - 8 (RT), scaled by 2 values for AO and 8 values for BO. */
  1904. #ifdef LN
  1905. addi.d TEMP, KK, -2
  1906. #else
  1907. addi.d TEMP, KK, -8
  1908. #endif
  1909. slli.d L, TEMP, 1 + BASE_SHIFT
  1910. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  1911. add.d AO, AORIG, L
  1912. add.d BO, B, TEMP
  1913. #endif
  1914. #if defined(LN) || defined(LT)
  /* LN/LT: values come from BO; BO[0..7] pair with c11..c81 and BO[8..15] with c12..c82. */
  1915. LD b1, BO, 0 * SIZE
  1916. LD b2, BO, 1 * SIZE
  1917. LD b3, BO, 2 * SIZE
  1918. LD b4, BO, 3 * SIZE
  1919. SUB c11, b1, c11
  1920. LD b5, BO, 4 * SIZE
  1921. SUB c21, b2, c21
  1922. LD b6, BO, 5 * SIZE
  1923. SUB c31, b3, c31
  1924. LD b7, BO, 6 * SIZE
  1925. SUB c41, b4, c41
  1926. LD b8, BO, 7 * SIZE
  1927. SUB c51, b5, c51
  1928. LD b1, BO, 8 * SIZE
  1929. SUB c61, b6, c61
  1930. LD b2, BO, 9 * SIZE
  1931. SUB c71, b7, c71
  1932. LD b3, BO, 10 * SIZE
  1933. SUB c81, b8, c81
  1934. LD b4, BO, 11 * SIZE
  1935. SUB c12, b1, c12
  1936. LD b5, BO, 12 * SIZE
  1937. SUB c22, b2, c22
  1938. LD b6, BO, 13 * SIZE
  1939. SUB c32, b3, c32
  1940. LD b7, BO, 14 * SIZE
  1941. SUB c42, b4, c42
  1942. LD b8, BO, 15 * SIZE
  1943. SUB c52, b5, c52
  /* Preload the first coefficient the LN or LT solve below will multiply by. */
  1944. #ifdef LN
  1945. LD b1, AO, 3 * SIZE
  1946. #else
  1947. LD b1, AO, 0 * SIZE
  1948. #endif
  1949. SUB c62, b6, c62
  1950. SUB c72, b7, c72
  1951. SUB c82, b8, c82
  1952. #else
  /* RN/RT: values come from AO, interleaved as c11, c12, c21, c22, ... c81, c82. */
  1953. LD b1, AO, 0 * SIZE
  1954. LD b2, AO, 1 * SIZE
  1955. LD b3, AO, 2 * SIZE
  1956. LD b4, AO, 3 * SIZE
  1957. SUB c11, b1, c11
  1958. LD b5, AO, 4 * SIZE
  1959. SUB c12, b2, c12
  1960. LD b6, AO, 5 * SIZE
  1961. SUB c21, b3, c21
  1962. LD b7, AO, 6 * SIZE
  1963. SUB c22, b4, c22
  1964. LD b8, AO, 7 * SIZE
  1965. SUB c31, b5, c31
  1966. LD b1, AO, 8 * SIZE
  1967. SUB c32, b6, c32
  1968. LD b2, AO, 9 * SIZE
  1969. SUB c41, b7, c41
  1970. LD b3, AO, 10 * SIZE
  1971. SUB c42, b8, c42
  1972. LD b4, AO, 11 * SIZE
  1973. LD b5, AO, 12 * SIZE
  1974. SUB c51, b1, c51
  1975. LD b6, AO, 13 * SIZE
  1976. SUB c52, b2, c52
  1977. LD b7, AO, 14 * SIZE
  1978. SUB c61, b3, c61
  1979. LD b8, AO, 15 * SIZE
  1980. SUB c62, b4, c62
  1981. SUB c71, b5, c71
  1982. SUB c72, b6, c72
  1983. SUB c81, b7, c81
  1984. SUB c82, b8, c82
  1985. #endif
  /* LN solve for the 2 x 8 tile: second value of each pair first, then the first. */
  /* b1 was loaded from AO[3] by the load section just before this block. */
  /* NMSUB is a macro defined outside this view; presumably NMSUB d, x, y, z gives d = z - x * y -- TODO confirm. */
  1986. #ifdef LN
  /* c?2 *= AO[3] */
  1987. MUL c12, b1, c12
  1988. LD b2, AO, 2 * SIZE
  1989. MUL c22, b1, c22
  1990. MUL c32, b1, c32
  1991. MUL c42, b1, c42
  1992. MUL c52, b1, c52
  1993. MUL c62, b1, c62
  1994. MUL c72, b1, c72
  1995. MUL c82, b1, c82
  /* c?1 -= c?2 * AO[2] */
  1996. NMSUB c11, c12, b2, c11
  1997. LD b3, AO, 0 * SIZE
  1998. NMSUB c21, c22, b2, c21
  1999. NMSUB c31, c32, b2, c31
  2000. NMSUB c41, c42, b2, c41
  2001. NMSUB c51, c52, b2, c51
  2002. NMSUB c61, c62, b2, c61
  2003. NMSUB c71, c72, b2, c71
  2004. NMSUB c81, c82, b2, c81
  /* c?1 *= AO[0]; interleaved, each output pointer steps back two elements ahead of the stores. */
  2005. MUL c11, b3, c11
  2006. addi.d CO1, CO1, -2 * SIZE
  2007. MUL c21, b3, c21
  2008. addi.d CO2, CO2, -2 * SIZE
  2009. MUL c31, b3, c31
  2010. addi.d CO3, CO3, -2 * SIZE
  2011. MUL c41, b3, c41
  2012. addi.d CO4, CO4, -2 * SIZE
  2013. MUL c51, b3, c51
  2014. addi.d CO5, CO5, -2 * SIZE
  2015. MUL c61, b3, c61
  2016. addi.d CO6, CO6, -2 * SIZE
  2017. MUL c71, b3, c71
  2018. addi.d CO7, CO7, -2 * SIZE
  2019. MUL c81, b3, c81
  2020. addi.d CO8, CO8, -2 * SIZE
  2021. #endif
  /* LT solve for the 2 x 8 tile: first value of each pair first, then the second. */
  /* b1 was loaded from AO[0] by the load section just before this block. */
  /* NMSUB is a macro defined outside this view; presumably NMSUB d, x, y, z gives d = z - x * y -- TODO confirm. */
  2022. #ifdef LT
  /* c?1 *= AO[0] */
  2023. MUL c11, b1, c11
  2024. LD b2, AO, 1 * SIZE
  2025. MUL c21, b1, c21
  2026. MUL c31, b1, c31
  2027. MUL c41, b1, c41
  2028. MUL c51, b1, c51
  2029. MUL c61, b1, c61
  2030. MUL c71, b1, c71
  2031. MUL c81, b1, c81
  /* c?2 -= c?1 * AO[1] */
  2032. NMSUB c12, c11, b2, c12
  2033. LD b3, AO, 3 * SIZE
  2034. NMSUB c22, c21, b2, c22
  2035. NMSUB c32, c31, b2, c32
  2036. NMSUB c42, c41, b2, c42
  2037. NMSUB c52, c51, b2, c52
  2038. NMSUB c62, c61, b2, c62
  2039. NMSUB c72, c71, b2, c72
  2040. NMSUB c82, c81, b2, c82
  /* c?2 *= AO[3] */
  2041. MUL c12, b3, c12
  2042. MUL c22, b3, c22
  2043. MUL c32, b3, c32
  2044. MUL c42, b3, c42
  2045. MUL c52, b3, c52
  2046. MUL c62, b3, c62
  2047. MUL c72, b3, c72
  2048. MUL c82, b3, c82
  2049. #endif
  /* RN solve for the 2 x 8 tile: substitution from c1x towards c8x, both tile rows in lockstep. */
  /* Coefficients are read from BO at offsets 8 * i + j with j >= i (i = 0..7). */
  /* Each stage scales c(i)x by the entry at 8 * i + i, then subtracts its multiples from later c(j)x. */
  /* NMSUB is a macro defined outside this view; presumably NMSUB d, x, y, z gives d = z - x * y -- TODO confirm. */
  2050. #ifdef RN
  /* Stage 0: offsets 0..7. */
  2051. LD b1, BO, 0 * SIZE
  2052. LD b2, BO, 1 * SIZE
  2053. LD b3, BO, 2 * SIZE
  2054. LD b4, BO, 3 * SIZE
  2055. MUL c11, b1, c11
  2056. MUL c12, b1, c12
  2057. LD b5, BO, 4 * SIZE
  2058. NMSUB c21, c11, b2, c21
  2059. NMSUB c22, c12, b2, c22
  2060. LD b6, BO, 5 * SIZE
  2061. NMSUB c31, c11, b3, c31
  2062. NMSUB c32, c12, b3, c32
  2063. LD b7, BO, 6 * SIZE
  2064. NMSUB c41, c11, b4, c41
  2065. NMSUB c42, c12, b4, c42
  2066. LD b8, BO, 7 * SIZE
  2067. NMSUB c51, c11, b5, c51
  2068. NMSUB c52, c12, b5, c52
  2069. LD b2, BO, 9 * SIZE
  2070. NMSUB c61, c11, b6, c61
  2071. NMSUB c62, c12, b6, c62
  2072. LD b3, BO, 10 * SIZE
  2073. NMSUB c71, c11, b7, c71
  2074. NMSUB c72, c12, b7, c72
  2075. LD b4, BO, 11 * SIZE
  2076. NMSUB c81, c11, b8, c81
  2077. NMSUB c82, c12, b8, c82
  2078. LD b5, BO, 12 * SIZE
  /* Stage 1: offsets 9..15. */
  2079. MUL c21, b2, c21
  2080. MUL c22, b2, c22
  2081. LD b6, BO, 13 * SIZE
  2082. NMSUB c31, c21, b3, c31
  2083. NMSUB c32, c22, b3, c32
  2084. LD b7, BO, 14 * SIZE
  2085. NMSUB c41, c21, b4, c41
  2086. NMSUB c42, c22, b4, c42
  2087. LD b8, BO, 15 * SIZE
  2088. NMSUB c51, c21, b5, c51
  2089. NMSUB c52, c22, b5, c52
  2090. LD b3, BO, 18 * SIZE
  2091. NMSUB c61, c21, b6, c61
  2092. NMSUB c62, c22, b6, c62
  2093. LD b4, BO, 19 * SIZE
  2094. NMSUB c71, c21, b7, c71
  2095. NMSUB c72, c22, b7, c72
  2096. LD b5, BO, 20 * SIZE
  2097. NMSUB c81, c21, b8, c81
  2098. NMSUB c82, c22, b8, c82
  2099. LD b6, BO, 21 * SIZE
  /* Stage 2: offsets 18..23. */
  2100. MUL c31, b3, c31
  2101. MUL c32, b3, c32
  2102. LD b7, BO, 22 * SIZE
  2103. NMSUB c41, c31, b4, c41
  2104. NMSUB c42, c32, b4, c42
  2105. LD b8, BO, 23 * SIZE
  2106. NMSUB c51, c31, b5, c51
  2107. NMSUB c52, c32, b5, c52
  2108. LD b4, BO, 27 * SIZE
  2109. NMSUB c61, c31, b6, c61
  2110. NMSUB c62, c32, b6, c62
  2111. LD b5, BO, 28 * SIZE
  2112. NMSUB c71, c31, b7, c71
  2113. NMSUB c72, c32, b7, c72
  2114. LD b6, BO, 29 * SIZE
  2115. NMSUB c81, c31, b8, c81
  2116. NMSUB c82, c32, b8, c82
  2117. LD b7, BO, 30 * SIZE
  /* Stage 3: offsets 27..31. */
  2118. MUL c41, b4, c41
  2119. MUL c42, b4, c42
  2120. LD b8, BO, 31 * SIZE
  2121. NMSUB c51, c41, b5, c51
  2122. NMSUB c52, c42, b5, c52
  2123. LD b5, BO, 36 * SIZE
  2124. NMSUB c61, c41, b6, c61
  2125. NMSUB c62, c42, b6, c62
  2126. LD b6, BO, 37 * SIZE
  2127. NMSUB c71, c41, b7, c71
  2128. NMSUB c72, c42, b7, c72
  2129. LD b7, BO, 38 * SIZE
  2130. NMSUB c81, c41, b8, c81
  2131. NMSUB c82, c42, b8, c82
  2132. LD b8, BO, 39 * SIZE
  /* Stage 4: offsets 36..39. */
  2133. MUL c51, b5, c51
  2134. MUL c52, b5, c52
  2135. NMSUB c61, c51, b6, c61
  2136. NMSUB c62, c52, b6, c62
  2137. LD b6, BO, 45 * SIZE
  2138. NMSUB c71, c51, b7, c71
  2139. NMSUB c72, c52, b7, c72
  2140. LD b7, BO, 46 * SIZE
  2141. NMSUB c81, c51, b8, c81
  2142. NMSUB c82, c52, b8, c82
  2143. LD b8, BO, 47 * SIZE
  /* Stage 5: offsets 45..47. */
  2144. MUL c61, b6, c61
  2145. MUL c62, b6, c62
  2146. NMSUB c71, c61, b7, c71
  2147. NMSUB c72, c62, b7, c72
  2148. LD b7, BO, 54 * SIZE
  2149. NMSUB c81, c61, b8, c81
  2150. NMSUB c82, c62, b8, c82
  2151. LD b8, BO, 55 * SIZE
  /* Stage 6: offsets 54..55. */
  2152. MUL c71, b7, c71
  2153. MUL c72, b7, c72
  2154. NMSUB c81, c71, b8, c81
  2155. NMSUB c82, c72, b8, c82
  2156. LD b8, BO, 63 * SIZE
  /* Stage 7: offset 63. */
  2157. MUL c81, b8, c81
  2158. MUL c82, b8, c82
  2159. #endif
  /* RT solve for the 2 x 8 tile: substitution from c8x back to c1x, both tile rows in lockstep. */
  /* Coefficients are read from BO at offsets 8 * i + j with j <= i, starting at i = 7. */
  /* Each stage scales c(i)x by the entry at 8 * i + i, then subtracts its multiples from earlier c(j)x. */
  /* NMSUB is a macro defined outside this view; presumably NMSUB d, x, y, z gives d = z - x * y -- TODO confirm. */
  2160. #ifdef RT
  /* Stage 7: offsets 63 down to 56. */
  2161. LD b1, BO, 63 * SIZE
  2162. LD b2, BO, 62 * SIZE
  2163. LD b3, BO, 61 * SIZE
  2164. LD b4, BO, 60 * SIZE
  2165. MUL c81, b1, c81
  2166. MUL c82, b1, c82
  2167. LD b5, BO, 59 * SIZE
  2168. NMSUB c71, c81, b2, c71
  2169. NMSUB c72, c82, b2, c72
  2170. LD b6, BO, 58 * SIZE
  2171. NMSUB c61, c81, b3, c61
  2172. NMSUB c62, c82, b3, c62
  2173. LD b7, BO, 57 * SIZE
  2174. NMSUB c51, c81, b4, c51
  2175. NMSUB c52, c82, b4, c52
  2176. LD b8, BO, 56 * SIZE
  2177. NMSUB c41, c81, b5, c41
  2178. NMSUB c42, c82, b5, c42
  2179. LD b2, BO, 54 * SIZE
  2180. NMSUB c31, c81, b6, c31
  2181. NMSUB c32, c82, b6, c32
  2182. LD b3, BO, 53 * SIZE
  2183. NMSUB c21, c81, b7, c21
  2184. NMSUB c22, c82, b7, c22
  2185. LD b4, BO, 52 * SIZE
  2186. NMSUB c11, c81, b8, c11
  2187. NMSUB c12, c82, b8, c12
  2188. LD b5, BO, 51 * SIZE
  /* Stage 6: offsets 54 down to 48. */
  2189. MUL c71, b2, c71
  2190. MUL c72, b2, c72
  2191. LD b6, BO, 50 * SIZE
  2192. NMSUB c61, c71, b3, c61
  2193. NMSUB c62, c72, b3, c62
  2194. LD b7, BO, 49 * SIZE
  2195. NMSUB c51, c71, b4, c51
  2196. NMSUB c52, c72, b4, c52
  2197. LD b8, BO, 48 * SIZE
  2198. NMSUB c41, c71, b5, c41
  2199. NMSUB c42, c72, b5, c42
  2200. LD b3, BO, 45 * SIZE
  2201. NMSUB c31, c71, b6, c31
  2202. NMSUB c32, c72, b6, c32
  2203. LD b4, BO, 44 * SIZE
  2204. NMSUB c21, c71, b7, c21
  2205. NMSUB c22, c72, b7, c22
  2206. LD b5, BO, 43 * SIZE
  2207. NMSUB c11, c71, b8, c11
  2208. NMSUB c12, c72, b8, c12
  2209. LD b6, BO, 42 * SIZE
  /* Stage 5: offsets 45 down to 40. */
  2210. MUL c61, b3, c61
  2211. MUL c62, b3, c62
  2212. LD b7, BO, 41 * SIZE
  2213. NMSUB c51, c61, b4, c51
  2214. NMSUB c52, c62, b4, c52
  2215. LD b8, BO, 40 * SIZE
  2216. NMSUB c41, c61, b5, c41
  2217. NMSUB c42, c62, b5, c42
  2218. LD b4, BO, 36 * SIZE
  2219. NMSUB c31, c61, b6, c31
  2220. NMSUB c32, c62, b6, c32
  2221. LD b5, BO, 35 * SIZE
  2222. NMSUB c21, c61, b7, c21
  2223. NMSUB c22, c62, b7, c22
  2224. LD b6, BO, 34 * SIZE
  2225. NMSUB c11, c61, b8, c11
  2226. NMSUB c12, c62, b8, c12
  2227. LD b7, BO, 33 * SIZE
  /* Stage 4: offsets 36 down to 32. */
  2228. MUL c51, b4, c51
  2229. MUL c52, b4, c52
  2230. LD b8, BO, 32 * SIZE
  2231. NMSUB c41, c51, b5, c41
  2232. NMSUB c42, c52, b5, c42
  2233. LD b5, BO, 27 * SIZE
  2234. NMSUB c31, c51, b6, c31
  2235. NMSUB c32, c52, b6, c32
  2236. LD b6, BO, 26 * SIZE
  2237. NMSUB c21, c51, b7, c21
  2238. NMSUB c22, c52, b7, c22
  2239. LD b7, BO, 25 * SIZE
  2240. NMSUB c11, c51, b8, c11
  2241. NMSUB c12, c52, b8, c12
  2242. LD b8, BO, 24 * SIZE
  /* Stage 3: offsets 27 down to 24. */
  2243. MUL c41, b5, c41
  2244. MUL c42, b5, c42
  2245. NMSUB c31, c41, b6, c31
  2246. NMSUB c32, c42, b6, c32
  2247. LD b6, BO, 18 * SIZE
  2248. NMSUB c21, c41, b7, c21
  2249. NMSUB c22, c42, b7, c22
  2250. LD b7, BO, 17 * SIZE
  2251. NMSUB c11, c41, b8, c11
  2252. NMSUB c12, c42, b8, c12
  2253. LD b8, BO, 16 * SIZE
  /* Stage 2: offsets 18 down to 16. */
  2254. MUL c31, b6, c31
  2255. MUL c32, b6, c32
  2256. NMSUB c21, c31, b7, c21
  2257. NMSUB c22, c32, b7, c22
  2258. LD b7, BO, 9 * SIZE
  2259. NMSUB c11, c31, b8, c11
  2260. NMSUB c12, c32, b8, c12
  2261. LD b8, BO, 8 * SIZE
  /* Stage 1: offsets 9 and 8. */
  2262. MUL c21, b7, c21
  2263. MUL c22, b7, c22
  2264. NMSUB c11, c21, b8, c11
  2265. NMSUB c12, c22, b8, c12
  2266. LD b8, BO, 0 * SIZE
  /* Stage 0: offset 0. */
  2267. MUL c11, b8, c11
  2268. MUL c12, b8, c12
  2269. #endif
  /* Write-back and loop tail for the 2 x 8 tile. */
  /* The solved values go back to the packed buffer they were read from, in the same layout: */
  /* LN/LT store c11..c81 then c12..c82 to BO; RN/RT store interleaved pairs to AO. */
  2270. #if defined(LN) || defined(LT)
  2271. ST c11, BO, 0 * SIZE
  2272. ST c21, BO, 1 * SIZE
  2273. ST c31, BO, 2 * SIZE
  2274. ST c41, BO, 3 * SIZE
  2275. ST c51, BO, 4 * SIZE
  2276. ST c61, BO, 5 * SIZE
  2277. ST c71, BO, 6 * SIZE
  2278. ST c81, BO, 7 * SIZE
  2279. ST c12, BO, 8 * SIZE
  2280. ST c22, BO, 9 * SIZE
  2281. ST c32, BO, 10 * SIZE
  2282. ST c42, BO, 11 * SIZE
  2283. ST c52, BO, 12 * SIZE
  2284. ST c62, BO, 13 * SIZE
  2285. ST c72, BO, 14 * SIZE
  2286. ST c82, BO, 15 * SIZE
  2287. #else
  2288. ST c11, AO, 0 * SIZE
  2289. ST c12, AO, 1 * SIZE
  2290. ST c21, AO, 2 * SIZE
  2291. ST c22, AO, 3 * SIZE
  2292. ST c31, AO, 4 * SIZE
  2293. ST c32, AO, 5 * SIZE
  2294. ST c41, AO, 6 * SIZE
  2295. ST c42, AO, 7 * SIZE
  2296. ST c51, AO, 8 * SIZE
  2297. ST c52, AO, 9 * SIZE
  2298. ST c61, AO, 10 * SIZE
  2299. ST c62, AO, 11 * SIZE
  2300. ST c71, AO, 12 * SIZE
  2301. ST c72, AO, 13 * SIZE
  2302. ST c81, AO, 14 * SIZE
  2303. ST c82, AO, 15 * SIZE
  2304. #endif
  /* Two consecutive values are stored through each of CO1..CO8. */
  2305. ST c11, CO1, 0 * SIZE
  2306. ST c12, CO1, 1 * SIZE
  2307. ST c21, CO2, 0 * SIZE
  2308. ST c22, CO2, 1 * SIZE
  2309. ST c31, CO3, 0 * SIZE
  2310. ST c32, CO3, 1 * SIZE
  2311. ST c41, CO4, 0 * SIZE
  2312. ST c42, CO4, 1 * SIZE
  2313. ST c51, CO5, 0 * SIZE
  2314. ST c52, CO5, 1 * SIZE
  2315. ST c61, CO6, 0 * SIZE
  2316. ST c62, CO6, 1 * SIZE
  2317. ST c71, CO7, 0 * SIZE
  2318. ST c72, CO7, 1 * SIZE
  2319. ST c81, CO8, 0 * SIZE
  2320. ST c82, CO8, 1 * SIZE
  /* a1 becomes the source used to clear the accumulators for the next tile. */
  /* MTC is a macro defined outside this view; presumably it copies integer zero ($r0) into a1 -- TODO confirm. */
  2321. MTC a1, $r0
  2322. #ifndef LN
  /* All variants except LN advance the output pointers by two elements (LN stepped them back before storing). */
  2323. addi.d CO1, CO1, 2 * SIZE
  2324. addi.d CO2, CO2, 2 * SIZE
  2325. addi.d CO3, CO3, 2 * SIZE
  2326. addi.d CO4, CO4, 2 * SIZE
  2327. addi.d CO5, CO5, 2 * SIZE
  2328. addi.d CO6, CO6, 2 * SIZE
  2329. addi.d CO7, CO7, 2 * SIZE
  2330. addi.d CO8, CO8, 2 * SIZE
  2331. #endif
  2332. MOV c11, a1
  2333. MOV c21, a1
  2334. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) bytes. */
  2335. slli.d TEMP, K, 1 + BASE_SHIFT
  2336. add.d AORIG, AORIG, TEMP
  2337. #endif
  2338. MOV c31, a1
  2339. MOV c41, a1
  2340. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed (2 A values and 8 B values per step). */
  2341. sub.d TEMP, K, KK
  2342. slli.d L, TEMP, 1 + BASE_SHIFT
  2343. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  2344. add.d AO, AO, L
  2345. add.d BO, BO, TEMP
  2346. #endif
  /* KK moves by the two rows just handled: +2 for LT, -2 for LN. */
  2347. #ifdef LT
  2348. addi.d KK, KK, 2
  2349. #endif
  2350. #ifdef LN
  2351. addi.d KK, KK, -2
  2352. #endif
  /* Next two-row tile while I > 0. */
  2353. addi.d I, I, -1
  2354. MOV c51, a1
  2355. MOV c61, a1
  2356. blt $r0, I, .L11
  /* .L20: leftover single row of the tile whose results use eight accumulators c11..c81. */
  /* Runs only when M is odd (I = M & 1); otherwise control goes to .L29. */
  /* c61/c71/c81 are copied from c11, which is expected to be zero here -- TODO confirm for every entry path. */
  2357. .align 3
  2358. .L20:
  2359. andi I, M, 1
  2360. MOV c61, c11
  2361. MOV c71, c11
  2362. bge $r0, I, .L29
  2363. #if defined(LT) || defined(RN)
  /* LT/RN: preload four A values and B values from the panel start; trip count is KK. */
  2364. LD a1, AO, 0 * SIZE
  2365. LD a2, AO, 1 * SIZE
  2366. LD a3, AO, 2 * SIZE
  2367. LD a4, AO, 3 * SIZE
  2368. LD b1, B, 0 * SIZE
  2369. LD b2, B, 1 * SIZE
  2370. LD b3, B, 2 * SIZE
  2371. LD b4, B, 3 * SIZE
  2372. LD b5, B, 4 * SIZE
  2373. LD b6, B, 8 * SIZE
  2374. LD b7, B, 12 * SIZE
  /* L = KK >> 2 passes of the 4x unrolled loop; skip it when that is not positive. */
  2375. srai.d L, KK, 2
  2376. MOV c81, c11
  2377. move BO, B
  2378. bge $r0, L, .L25
  2379. #else
  2380. #ifdef LN
  /* LN: step AORIG back by K << BASE_SHIFT bytes before using it. */
  2381. slli.d TEMP, K, 0 + BASE_SHIFT
  2382. sub.d AORIG, AORIG, TEMP
  2383. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (3 + BASE_SHIFT)). */
  2384. slli.d L, KK, 0 + BASE_SHIFT
  2385. slli.d TEMP, KK, 3 + BASE_SHIFT
  2386. add.d AO, AORIG, L
  2387. add.d BO, B, TEMP
  /* TEMP = K - KK is the trip count for these variants. */
  2388. sub.d TEMP, K, KK
  2389. LD a1, AO, 0 * SIZE
  2390. LD a2, AO, 1 * SIZE
  2391. LD a3, AO, 2 * SIZE
  2392. LD a4, AO, 3 * SIZE
  2393. LD b1, BO, 0 * SIZE
  2394. LD b2, BO, 1 * SIZE
  2395. LD b3, BO, 2 * SIZE
  2396. LD b4, BO, 3 * SIZE
  2397. LD b5, BO, 4 * SIZE
  2398. LD b6, BO, 8 * SIZE
  2399. LD b7, BO, 12 * SIZE
  2400. srai.d L, TEMP, 2
  2401. MOV c81, c11
  2402. bge $r0, L, .L25
  2403. #endif
  /*
   * .L22: 4x-unrolled accumulation loop for the single leftover row.
   * Each pass issues four groups of eight MADDs, one group per A operand
   * (a1, a2, a3, a4); every group updates the eight accumulators c11..c81
   * using eight consecutive B values (BO offsets 0-7, 8-15, 16-23, 24-31).
   * AO advances by 4 * SIZE and BO by 32 * SIZE per pass; L counts passes.
   * The loads are interleaved with the MADDs so that b2-b4 are recycled
   * within a group and a1-a4 / b1-b7 already hold the next pass's operands
   * when the loop branches back. Do not reorder without re-checking offsets.
   * MADD is a macro defined outside this chunk; presumably
   * "MADD d, x, y, acc" computes d = x * y + acc -- confirm.
   */
  2404. .align 3
  2405. .L22:
  /* Group 1: a1 times B[0..7]. */
  2406. MADD c11, b1, a1, c11
  2407. LD b1, BO, 16 * SIZE
  2408. MADD c21, b2, a1, c21
  2409. LD b2, BO, 5 * SIZE
  2410. MADD c31, b3, a1, c31
  2411. LD b3, BO, 6 * SIZE
  2412. MADD c41, b4, a1, c41
  2413. LD b4, BO, 7 * SIZE
  2414. MADD c51, b5, a1, c51
  2415. LD b5, BO, 20 * SIZE
  2416. MADD c61, b2, a1, c61
  2417. LD b2, BO, 9 * SIZE
  2418. MADD c71, b3, a1, c71
  2419. LD b3, BO, 10 * SIZE
  2420. MADD c81, b4, a1, c81
  2421. LD b4, BO, 11 * SIZE
  /* a1 is reloaded for the next pass (AO has not been advanced yet). */
  2422. LD a1, AO, 4 * SIZE
  2423. addi.d L, L, -1
  /* Group 2: a2 times B[8..15]. */
  2424. MADD c11, b6, a2, c11
  2425. LD b6, BO, 24 * SIZE
  2426. MADD c21, b2, a2, c21
  2427. LD b2, BO, 13 * SIZE
  2428. MADD c31, b3, a2, c31
  2429. LD b3, BO, 14 * SIZE
  2430. MADD c41, b4, a2, c41
  2431. LD b4, BO, 15 * SIZE
  2432. MADD c51, b7, a2, c51
  2433. LD b7, BO, 28 * SIZE
  2434. MADD c61, b2, a2, c61
  2435. LD b2, BO, 17 * SIZE
  2436. MADD c71, b3, a2, c71
  2437. LD b3, BO, 18 * SIZE
  2438. MADD c81, b4, a2, c81
  2439. LD b4, BO, 19 * SIZE
  2440. LD a2, AO, 5 * SIZE
  /* AO moves on here; later A loads use offsets relative to the new AO. */
  2441. addi.d AO, AO, 4 * SIZE
  /* Group 3: a3 times B[16..23]. */
  2442. MADD c11, b1, a3, c11
  2443. LD b1, BO, 32 * SIZE
  2444. MADD c21, b2, a3, c21
  2445. LD b2, BO, 21 * SIZE
  2446. MADD c31, b3, a3, c31
  2447. LD b3, BO, 22 * SIZE
  2448. MADD c41, b4, a3, c41
  2449. LD b4, BO, 23 * SIZE
  2450. MADD c51, b5, a3, c51
  2451. LD b5, BO, 36 * SIZE
  2452. MADD c61, b2, a3, c61
  2453. LD b2, BO, 25 * SIZE
  2454. MADD c71, b3, a3, c71
  2455. LD b3, BO, 26 * SIZE
  2456. MADD c81, b4, a3, c81
  2457. LD b4, BO, 27 * SIZE
  2458. LD a3, AO, 2 * SIZE
  /* BO moves on here; the negative offsets below still address B[29..31]. */
  2459. addi.d BO, BO, 32 * SIZE
  /* Group 4: a4 times B[24..31]. */
  2460. MADD c11, b6, a4, c11
  2461. LD b6, BO, 8 * SIZE
  2462. MADD c21, b2, a4, c21
  2463. LD b2, BO, -3 * SIZE
  2464. MADD c31, b3, a4, c31
  2465. LD b3, BO, -2 * SIZE
  2466. MADD c41, b4, a4, c41
  2467. LD b4, BO, -1 * SIZE
  2468. MADD c51, b7, a4, c51
  2469. LD b7, BO, 12 * SIZE
  2470. MADD c61, b2, a4, c61
  2471. LD b2, BO, 1 * SIZE
  2472. MADD c71, b3, a4, c71
  2473. LD b3, BO, 2 * SIZE
  2474. MADD c81, b4, a4, c81
  2475. LD b4, BO, 3 * SIZE
  2476. LD a4, AO, 3 * SIZE
  /* Repeat while L > 0. */
  2477. blt $r0, L, .L22
  /*
   * .L25: compute the remainder left over by the 4x-unrolled loop.
   * L = count & 3, where the count is KK for LT/RN and TEMP otherwise
   * (TEMP was set to K - KK in the LN/RT setup at .L20). When no steps
   * remain, go straight to the solve at .L28.
   */
  2478. .align 3
  2479. .L25:
  2480. #if defined(LT) || defined(RN)
  2481. andi L, KK, 3
  2482. #else
  2483. andi L, TEMP, 3
  2484. #endif
  2485. bge $r0, L, .L28
  /*
   * .L26: remainder loop, one step per pass.
   * Each pass multiplies a1 by eight consecutive B values (BO offsets 0-7)
   * and accumulates into c11..c81, then advances AO by 1 * SIZE and BO by
   * 8 * SIZE. b2-b4 are reloaded mid-pass for c61..c81, and a1 / b1-b5 are
   * reloaded with the next pass's operands before branching back.
   */
  2486. .align 3
  2487. .L26:
  2488. MADD c11, b1, a1, c11
  2489. LD b1, BO, 8 * SIZE
  2490. MADD c21, b2, a1, c21
  2491. LD b2, BO, 5 * SIZE
  2492. MADD c31, b3, a1, c31
  2493. LD b3, BO, 6 * SIZE
  2494. MADD c41, b4, a1, c41
  2495. LD b4, BO, 7 * SIZE
  2496. addi.d L, L, -1
  /*
   * NOTE(review): a2 is copied onto itself and nothing else in this loop
   * reads or writes a2, so this looks like a scheduling filler with no
   * effect on the result -- confirm before removing.
   */
  2497. MOV a2, a2
  /* Pointers advance here; the loads below use offsets from the new BO/AO. */
  2498. addi.d AO, AO, 1 * SIZE
  2499. addi.d BO, BO, 8 * SIZE
  2500. MADD c51, b5, a1, c51
  2501. LD b5, BO, 4 * SIZE
  2502. MADD c61, b2, a1, c61
  2503. LD b2, BO, 1 * SIZE
  2504. MADD c71, b3, a1, c71
  2505. LD b3, BO, 2 * SIZE
  2506. MADD c81, b4, a1, c81
  2507. LD a1, AO, 0 * SIZE
  2508. LD b4, BO, 3 * SIZE
  /* Repeat while L > 0. */
  2509. blt $r0, L, .L26
  /*
   * .L28: solve step and store-back for the single leftover row.
   * Sequence: (1) for LN/RT, re-point AO/BO at the block to solve;
   * (2) load eight right-hand-side values and combine them with the
   * accumulators c11..c81 via SUB; (3) apply the triangular solve for the
   * active variant (LN/LT, RN or RT); (4) write the eight results back to
   * the packed buffer and to C through CO1..CO8; (5) advance pointers / KK.
   * SUB, MUL and NMSUB are macros defined outside this chunk; presumably
   * SUB d,x,y is d = x - y, MUL d,x,y is d = x * y and NMSUB d,x,y,acc is
   * d = acc - x * y -- confirm against their definitions.
   */
  2510. .L28:
  2511. #if defined(LN) || defined(RT)
  /* TEMP = KK - 1 for LN, KK - 8 otherwise (RT). */
  2512. #ifdef LN
  2513. addi.d TEMP, KK, -1
  2514. #else
  2515. addi.d TEMP, KK, -8
  2516. #endif
  /* AO = AORIG + (TEMP << BASE_SHIFT), BO = B + (TEMP << (3 + BASE_SHIFT)). */
  2517. slli.d L, TEMP, 0 + BASE_SHIFT
  2518. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  2519. add.d AO, AORIG, L
  2520. add.d BO, B, TEMP
  2521. #endif
  2522. #if defined(LN) || defined(LT)
  /* LN/LT: the eight right-hand-side values come from BO[0..7]. */
  2523. LD b1, BO, 0 * SIZE
  2524. LD b2, BO, 1 * SIZE
  2525. LD b3, BO, 2 * SIZE
  2526. LD b4, BO, 3 * SIZE
  2527. LD b5, BO, 4 * SIZE
  2528. LD b6, BO, 5 * SIZE
  2529. LD b7, BO, 6 * SIZE
  2530. LD b8, BO, 7 * SIZE
  2531. SUB c11, b1, c11
  2532. SUB c21, b2, c21
  2533. SUB c31, b3, c31
  2534. SUB c41, b4, c41
  2535. SUB c51, b5, c51
  2536. SUB c61, b6, c61
  2537. SUB c71, b7, c71
  2538. SUB c81, b8, c81
  2539. #else
  /* RN/RT: the eight right-hand-side values come from AO[0..7]. */
  2540. LD b1, AO, 0 * SIZE
  2541. LD b2, AO, 1 * SIZE
  2542. LD b3, AO, 2 * SIZE
  2543. LD b4, AO, 3 * SIZE
  2544. LD b5, AO, 4 * SIZE
  2545. LD b6, AO, 5 * SIZE
  2546. LD b7, AO, 6 * SIZE
  2547. LD b8, AO, 7 * SIZE
  2548. SUB c11, b1, c11
  2549. SUB c21, b2, c21
  2550. SUB c31, b3, c31
  2551. SUB c41, b4, c41
  2552. SUB c51, b5, c51
  2553. SUB c61, b6, c61
  2554. SUB c71, b7, c71
  2555. SUB c81, b8, c81
  2556. #endif
  2557. #if defined(LN) || defined(LT)
  /*
   * LN/LT: a single row means a 1x1 triangular block; all eight values are
   * scaled by the one element at AO[0]. The code multiplies rather than
   * divides, so the packed diagonal presumably holds a reciprocal -- confirm.
   */
  2558. LD b1, AO, 0 * SIZE
  2559. MUL c11, b1, c11
  2560. MUL c21, b1, c21
  2561. MUL c31, b1, c31
  2562. MUL c41, b1, c41
  2563. MUL c51, b1, c51
  2564. MUL c61, b1, c61
  2565. MUL c71, b1, c71
  2566. MUL c81, b1, c81
  2567. #endif
  2568. #ifdef RN
  /*
   * RN: elimination in ascending order c11 -> c81 over the 8x8 block at BO
   * (rows of 8 elements). Step i scales ci1 by the diagonal entry at offset
   * 9 * i (0, 9, 18, ..., 63) and subtracts its multiples from the later
   * values using the entries that follow the diagonal in the same row.
   */
  2569. LD b1, BO, 0 * SIZE
  2570. LD b2, BO, 1 * SIZE
  2571. LD b3, BO, 2 * SIZE
  2572. LD b4, BO, 3 * SIZE
  2573. LD b5, BO, 4 * SIZE
  2574. LD b6, BO, 5 * SIZE
  2575. LD b7, BO, 6 * SIZE
  2576. LD b8, BO, 7 * SIZE
  2577. MUL c11, b1, c11
  2578. NMSUB c21, c11, b2, c21
  2579. NMSUB c31, c11, b3, c31
  2580. NMSUB c41, c11, b4, c41
  2581. NMSUB c51, c11, b5, c51
  2582. NMSUB c61, c11, b6, c61
  2583. NMSUB c71, c11, b7, c71
  2584. NMSUB c81, c11, b8, c81
  2585. LD b2, BO, 9 * SIZE
  2586. LD b3, BO, 10 * SIZE
  2587. LD b4, BO, 11 * SIZE
  2588. LD b5, BO, 12 * SIZE
  2589. LD b6, BO, 13 * SIZE
  2590. LD b7, BO, 14 * SIZE
  2591. LD b8, BO, 15 * SIZE
  2592. MUL c21, b2, c21
  2593. NMSUB c31, c21, b3, c31
  2594. NMSUB c41, c21, b4, c41
  2595. NMSUB c51, c21, b5, c51
  2596. NMSUB c61, c21, b6, c61
  2597. NMSUB c71, c21, b7, c71
  2598. NMSUB c81, c21, b8, c81
  2599. LD b3, BO, 18 * SIZE
  2600. LD b4, BO, 19 * SIZE
  2601. LD b5, BO, 20 * SIZE
  2602. LD b6, BO, 21 * SIZE
  2603. LD b7, BO, 22 * SIZE
  2604. LD b8, BO, 23 * SIZE
  2605. MUL c31, b3, c31
  2606. NMSUB c41, c31, b4, c41
  2607. NMSUB c51, c31, b5, c51
  2608. NMSUB c61, c31, b6, c61
  2609. NMSUB c71, c31, b7, c71
  2610. NMSUB c81, c31, b8, c81
  2611. LD b4, BO, 27 * SIZE
  2612. LD b5, BO, 28 * SIZE
  2613. LD b6, BO, 29 * SIZE
  2614. LD b7, BO, 30 * SIZE
  2615. LD b8, BO, 31 * SIZE
  2616. MUL c41, b4, c41
  2617. NMSUB c51, c41, b5, c51
  2618. NMSUB c61, c41, b6, c61
  2619. NMSUB c71, c41, b7, c71
  2620. NMSUB c81, c41, b8, c81
  2621. LD b5, BO, 36 * SIZE
  2622. LD b6, BO, 37 * SIZE
  2623. LD b7, BO, 38 * SIZE
  2624. LD b8, BO, 39 * SIZE
  2625. MUL c51, b5, c51
  2626. NMSUB c61, c51, b6, c61
  2627. NMSUB c71, c51, b7, c71
  2628. NMSUB c81, c51, b8, c81
  2629. LD b6, BO, 45 * SIZE
  2630. LD b7, BO, 46 * SIZE
  2631. LD b8, BO, 47 * SIZE
  2632. MUL c61, b6, c61
  2633. NMSUB c71, c61, b7, c71
  2634. NMSUB c81, c61, b8, c81
  2635. LD b7, BO, 54 * SIZE
  2636. LD b8, BO, 55 * SIZE
  2637. MUL c71, b7, c71
  2638. NMSUB c81, c71, b8, c81
  2639. LD b8, BO, 63 * SIZE
  2640. MUL c81, b8, c81
  2641. #endif
  2642. #ifdef RT
  /*
   * RT: the same elimination in descending order c81 -> c11. Each step
   * starts at the diagonal entry (offsets 63, 54, 45, ..., 0) and walks
   * backwards through the preceding entries of that row of 8.
   */
  2643. LD b1, BO, 63 * SIZE
  2644. LD b2, BO, 62 * SIZE
  2645. LD b3, BO, 61 * SIZE
  2646. LD b4, BO, 60 * SIZE
  2647. LD b5, BO, 59 * SIZE
  2648. LD b6, BO, 58 * SIZE
  2649. LD b7, BO, 57 * SIZE
  2650. LD b8, BO, 56 * SIZE
  2651. MUL c81, b1, c81
  2652. NMSUB c71, c81, b2, c71
  2653. NMSUB c61, c81, b3, c61
  2654. NMSUB c51, c81, b4, c51
  2655. NMSUB c41, c81, b5, c41
  2656. NMSUB c31, c81, b6, c31
  2657. NMSUB c21, c81, b7, c21
  2658. NMSUB c11, c81, b8, c11
  2659. LD b2, BO, 54 * SIZE
  2660. LD b3, BO, 53 * SIZE
  2661. LD b4, BO, 52 * SIZE
  2662. LD b5, BO, 51 * SIZE
  2663. LD b6, BO, 50 * SIZE
  2664. LD b7, BO, 49 * SIZE
  2665. LD b8, BO, 48 * SIZE
  2666. MUL c71, b2, c71
  2667. NMSUB c61, c71, b3, c61
  2668. NMSUB c51, c71, b4, c51
  2669. NMSUB c41, c71, b5, c41
  2670. NMSUB c31, c71, b6, c31
  2671. NMSUB c21, c71, b7, c21
  2672. NMSUB c11, c71, b8, c11
  2673. LD b3, BO, 45 * SIZE
  2674. LD b4, BO, 44 * SIZE
  2675. LD b5, BO, 43 * SIZE
  2676. LD b6, BO, 42 * SIZE
  2677. LD b7, BO, 41 * SIZE
  2678. LD b8, BO, 40 * SIZE
  2679. MUL c61, b3, c61
  2680. NMSUB c51, c61, b4, c51
  2681. NMSUB c41, c61, b5, c41
  2682. NMSUB c31, c61, b6, c31
  2683. NMSUB c21, c61, b7, c21
  2684. NMSUB c11, c61, b8, c11
  2685. LD b4, BO, 36 * SIZE
  2686. LD b5, BO, 35 * SIZE
  2687. LD b6, BO, 34 * SIZE
  2688. LD b7, BO, 33 * SIZE
  2689. LD b8, BO, 32 * SIZE
  2690. MUL c51, b4, c51
  2691. NMSUB c41, c51, b5, c41
  2692. NMSUB c31, c51, b6, c31
  2693. NMSUB c21, c51, b7, c21
  2694. NMSUB c11, c51, b8, c11
  2695. LD b5, BO, 27 * SIZE
  2696. LD b6, BO, 26 * SIZE
  2697. LD b7, BO, 25 * SIZE
  2698. LD b8, BO, 24 * SIZE
  2699. MUL c41, b5, c41
  2700. NMSUB c31, c41, b6, c31
  2701. NMSUB c21, c41, b7, c21
  2702. NMSUB c11, c41, b8, c11
  2703. LD b6, BO, 18 * SIZE
  2704. LD b7, BO, 17 * SIZE
  2705. LD b8, BO, 16 * SIZE
  2706. MUL c31, b6, c31
  2707. NMSUB c21, c31, b7, c21
  2708. NMSUB c11, c31, b8, c11
  2709. LD b7, BO, 9 * SIZE
  2710. LD b8, BO, 8 * SIZE
  2711. MUL c21, b7, c21
  2712. NMSUB c11, c21, b8, c11
  2713. LD b8, BO, 0 * SIZE
  2714. MUL c11, b8, c11
  2715. #endif
  2716. #ifdef LN
  /* LN: step the eight C pointers back one element before storing. */
  2717. addi.d CO1, CO1, -1 * SIZE
  2718. addi.d CO2, CO2, -1 * SIZE
  2719. addi.d CO3, CO3, -1 * SIZE
  2720. addi.d CO4, CO4, -1 * SIZE
  2721. addi.d CO5, CO5, -1 * SIZE
  2722. addi.d CO6, CO6, -1 * SIZE
  2723. addi.d CO7, CO7, -1 * SIZE
  2724. addi.d CO8, CO8, -1 * SIZE
  2725. #endif
  /*
   * Write the solved values back to the buffer the right-hand side was
   * read from: BO[0..7] for LN/LT, AO[0..7] for RN/RT.
   */
  2726. #if defined(LN) || defined(LT)
  2727. ST c11, BO, 0 * SIZE
  2728. ST c21, BO, 1 * SIZE
  2729. ST c31, BO, 2 * SIZE
  2730. ST c41, BO, 3 * SIZE
  2731. ST c51, BO, 4 * SIZE
  2732. ST c61, BO, 5 * SIZE
  2733. ST c71, BO, 6 * SIZE
  2734. ST c81, BO, 7 * SIZE
  2735. #else
  2736. ST c11, AO, 0 * SIZE
  2737. ST c21, AO, 1 * SIZE
  2738. ST c31, AO, 2 * SIZE
  2739. ST c41, AO, 3 * SIZE
  2740. ST c51, AO, 4 * SIZE
  2741. ST c61, AO, 5 * SIZE
  2742. ST c71, AO, 6 * SIZE
  2743. ST c81, AO, 7 * SIZE
  2744. #endif
  /* Store one element through each of the eight C pointers CO1..CO8. */
  2745. ST c11, CO1, 0 * SIZE
  2746. ST c21, CO2, 0 * SIZE
  2747. ST c31, CO3, 0 * SIZE
  2748. ST c41, CO4, 0 * SIZE
  2749. ST c51, CO5, 0 * SIZE
  2750. ST c61, CO6, 0 * SIZE
  2751. ST c71, CO7, 0 * SIZE
  2752. ST c81, CO8, 0 * SIZE
  2753. #ifndef LN
  /* All variants except LN advance the C pointers past the stored element. */
  2754. addi.d CO1, CO1, 1 * SIZE
  2755. addi.d CO2, CO2, 1 * SIZE
  2756. addi.d CO3, CO3, 1 * SIZE
  2757. addi.d CO4, CO4, 1 * SIZE
  2758. addi.d CO5, CO5, 1 * SIZE
  2759. addi.d CO6, CO6, 1 * SIZE
  2760. addi.d CO7, CO7, 1 * SIZE
  2761. addi.d CO8, CO8, 1 * SIZE
  2762. #endif
  2763. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  2764. slli.d TEMP, K, BASE_SHIFT
  2765. add.d AORIG, AORIG, TEMP
  2766. #endif
  2767. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps in both packed streams. */
  2768. sub.d TEMP, K, KK
  2769. slli.d L, TEMP, 0 + BASE_SHIFT
  2770. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  2771. add.d AO, AO, L
  2772. add.d BO, BO, TEMP
  2773. #endif
  /* KK moves by the one row just processed: +1 for LT, -1 for LN. */
  2774. #ifdef LT
  2775. addi.d KK, KK, 1
  2776. #endif
  2777. #ifdef LN
  2778. addi.d KK, KK, -1
  2779. #endif
  /*
   * .L29: bookkeeping at the end of one 8-column panel.
   * LN advances B by K << (3 + BASE_SHIFT) bytes; LT/RN take the new B from
   * BO. KK is moved by 8 (+8 for RN, -8 for RT). The outer loop then
   * branches back to .L10 while J > 0 (J is updated outside this chunk).
   */
  2780. .align 3
  2781. .L29:
  2782. #ifdef LN
  2783. slli.d TEMP, K, 3 + BASE_SHIFT
  2784. add.d B, B, TEMP
  2785. #endif
  2786. #if defined(LT) || defined(RN)
  2787. move B, BO
  2788. #endif
  2789. #ifdef RN
  2790. addi.d KK, KK, 8
  2791. #endif
  2792. #ifdef RT
  2793. addi.d KK, KK, -8
  2794. #endif
  2795. blt $r0, J, .L10
  /*
   * .L999: common exit path.
   * Reloads $r23-$r30, $r20, $r16 and $f24-$f28 from the stack frame
   * (presumably the values saved by the prologue, which is outside this
   * chunk), releases the 144-byte frame, copies $r17 into $r4 and $f22
   * into $f0, and returns through $r1.
   */
  2796. .align 3
  2797. .L999:
  2798. LDARG $r23, $sp, 0
  2799. LDARG $r24, $sp, 8
  2800. LDARG $r25, $sp, 16
  2801. LDARG $r26, $sp, 24
  2802. LDARG $r27, $sp, 32
  2803. LDARG $r28, $sp, 40
  2804. fld.d $f24, $sp, 48
  2805. fld.d $f25, $sp, 56
  2806. fld.d $f26, $sp, 64
  2807. fld.d $f27, $sp, 72
  2808. fld.d $f28, $sp, 80
  2809. LDARG $r29, $sp, 88
  2810. LDARG $r30, $sp, 96
  2811. LDARG $r20, $sp, 104
  2812. LDARG $r16, $sp, 112
  /*
   * NOTE(review): when __64BIT__ is not defined, $f18 is loaded from offset
   * 112, the same slot used for $r16 just above. Whether this is harmless
   * depends on the prologue and on whether that configuration is ever
   * built -- confirm before changing either side.
   */
  2813. #ifndef __64BIT__
  2814. fld.d $f18, $sp, 112
  2815. fld.d $f19, $sp, 120
  2816. fld.d $f20, $sp, 128
  2817. fld.d $f21, $sp, 136
  2818. #endif
  /* Pop the frame, set the integer and floating-point result registers. */
  2819. addi.d $sp, $sp, 144
  2820. move $r4, $r17
  2821. fmov.d $f0, $f22
  /* Jump to the address in $r1 without linking ($r0 destination). */
  2822. jirl $r0, $r1, 0x0
  2823. EPILOGUE