You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LT.S 65 kB


  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used by the kernel body that follows.
   *
   * Save/restore status as shown by the prologue:
   *   - $r23-$r30, $r20 and $r16 (CO3-CO8, L, KK, TEMP, AORIG) and
   *     $f24-$f28 (c72, c81, c82, a3, a4) are spilled to the stack on entry.
   *   - $f18-$f21 (c52, c61, c62, c71) are spilled only when __64BIT__ is
   *     not defined.
   *     NOTE(review): in that path $f18 is stored at $sp+112, the same slot
   *     the prologue uses for $r16 -- confirm whether this overlap is intended.
   *
   * Integer inputs live in $r4-$r11 (presumably the incoming argument
   * registers -- TODO confirm against the LoongArch calling convention).
   */
  29. #define M $r4 /* row count: I = M >> 1 two-row blocks, then an (M & 1) tail */
  30. #define N $r5 /* column count: J = N >> 3 eight-column panels */
  31. #define K $r6 /* depth; K - KK is the product-loop length in the LN/RT paths */
  32. #define A $r7 /* base of operand A; copied to AO or AORIG for each panel */
  33. #define B $r8 /* base of operand B; BO is derived from it */
  34. #define C $r9 /* base of the output; CO1..CO8 are derived from it */
  35. #define LDC $r10 /* column stride of C; shifted left by BASE_SHIFT right after the prologue */
  36. #define OFFSET $r11 /* seed value for KK */
  37. #define AO $r12 /* running read pointer into A (source of the a1..a4 loads) */
  38. #define BO $r13 /* running read pointer into B (source of the b1..b8 loads) */
  39. #define I $r17 /* countdown over row blocks of the current panel */
  40. #define J $r18 /* countdown over eight-column panels */
  41. #define L $r29 /* countdown for the product loops; also reused as a scratch byte offset */
  /* CO1..CO8: pointers to eight consecutive columns of C, each LDC apart. */
  42. #define CO1 $r14
  43. #define CO2 $r15
  44. #define CO3 $r23
  45. #define CO4 $r24
  46. #define CO5 $r25
  47. #define CO6 $r26
  48. #define CO7 $r27
  49. #define CO8 $r28
  50. #define KK $r30 /* running offset seeded from OFFSET; stepped by +2 (LT) / -2 (LN) per two-row block */
  51. #define TEMP $r20 /* general scratch register */
  52. #define AORIG $r16 /* base of the A panel in the LN/RT paths; AO = AORIG + scaled KK */
  /* a1..a4: floating-point values loaded from AO inside the product loops. */
  53. #define a1 $f22
  54. #define a2 $f8
  55. #define a3 $f27
  56. #define a4 $f28
  /*
   * b1..b8: floating-point values loaded from BO inside the product loops;
   * the solve stage after the loops reuses them as temporaries for values
   * loaded from either AO or BO.
   */
  57. #define b1 $f23
  58. #define b2 $f9
  59. #define b3 $f10
  60. #define b4 $f11
  61. #define b5 $f12
  62. #define b6 $f13
  63. #define b7 $f14
  64. #define b8 $f15
  65. #define a5 b8 /* NOTE(review): shares $f15 with b8; no use of a5 was seen in the reviewed portion */
  /*
   * cXY: accumulators for the current block. X (1..8) selects the column
   * (written back through CO<X>); Y (1..2) selects the row inside a
   * two-row block (written at element offset Y-1). The single-row tail
   * uses only the cX1 registers.
   */
  66. #define c11 $f16
  67. #define c12 $f17
  68. #define c21 $f3
  69. #define c22 $f1
  70. #define c31 $f2
  71. #define c32 $f4
  72. #define c41 $f5
  73. #define c42 $f6
  74. #define c51 $f7
  75. #define c52 $f18
  76. #define c61 $f19
  77. #define c62 $f20
  78. #define c71 $f21
  79. #define c72 $f24
  80. #define c81 $f25
  81. #define c82 $f26
  82. #define ALPHA $f0 /* NOTE(review): not referenced in the reviewed portion of the kernel */
  83. PROLOGUE
  84. addi.d $sp, $sp, -144
  85. SDARG $r23, $sp, 0
  86. SDARG $r24, $sp, 8
  87. SDARG $r25, $sp, 16
  88. SDARG $r26, $sp, 24
  89. SDARG $r27, $sp, 32
  90. SDARG $r28, $sp, 40
  91. fst.d $f24, $sp, 48
  92. fst.d $f25, $sp, 56
  93. fst.d $f26, $sp, 64
  94. fst.d $f27, $sp, 72
  95. fst.d $f28, $sp, 80
  96. SDARG $r29, $sp, 88
  97. SDARG $r30, $sp, 96
  98. SDARG $r20, $sp, 104
  99. SDARG $r16, $sp, 112
  100. #ifndef __64BIT__
  101. fst.d $f18, $sp, 112
  102. fst.d $f19, $sp, 120
  103. fst.d $f20, $sp, 128
  104. fst.d $f21, $sp, 136
  105. #endif
  106. slli.d LDC, LDC, BASE_SHIFT
  107. #ifdef LN
  108. mul.w TEMP, M, K
  109. slli.d TEMP, TEMP, BASE_SHIFT
  110. add.d A, A, TEMP
  111. slli.d TEMP, M, BASE_SHIFT
  112. add.d C, C, TEMP
  113. #endif
  114. #ifdef RN
  115. sub.d KK, $r0, OFFSET
  116. #endif
  117. #ifdef RT
  118. mul.w TEMP, N, K
  119. slli.d TEMP, TEMP, BASE_SHIFT
  120. add.d B, B, TEMP
  121. mul.w TEMP, N, LDC
  122. add.d C, C, TEMP
  123. sub.d KK, N, OFFSET
  124. #endif
  125. srai.d J, N, 3
  126. nop
  127. bge $r0, J, .L30
  128. .L10:
  129. #ifdef RT
  130. slli.d TEMP, K, 3 + BASE_SHIFT
  131. sub.d B, B, TEMP
  132. slli.d TEMP, LDC, 3
  133. sub.d C, C, TEMP
  134. #endif
  135. move CO1, C
  136. MTC c11, $r0
  137. add.d CO2, C, LDC
  138. add.d CO3, CO2, LDC
  139. addi.d J, J, -1
  140. add.d CO4, CO3, LDC
  141. MOV c21, c11
  142. add.d CO5, CO4, LDC
  143. MOV c31, c11
  144. add.d CO6, CO5, LDC
  145. MOV c41, c11
  146. add.d CO7, CO6, LDC
  147. MOV c51, c11
  148. add.d CO8, CO7, LDC
  149. srai.d I, M, 1
  150. #ifdef LN
  151. add.d KK, M, OFFSET
  152. #endif
  153. #ifdef LT
  154. move KK, OFFSET
  155. #endif
  156. #if defined(LN) || defined(RT)
  157. move AORIG, A
  158. #else
  159. move AO, A
  160. #endif
  161. #ifndef RT
  162. add.d C, CO8, LDC
  163. #endif
  164. MOV c61, c11
  165. bge $r0, I, .L20
  166. .L11:
  167. #if defined(LT) || defined(RN)
  168. LD a1, AO, 0 * SIZE
  169. MOV c71, c11
  170. LD b1, B, 0 * SIZE
  171. MOV c81, c11
  172. LD a3, AO, 4 * SIZE
  173. MOV c12, c11
  174. LD b2, B, 1 * SIZE
  175. MOV c22, c11
  176. srai.d L, KK, 2
  177. MOV c32, c11
  178. LD b3, B, 2 * SIZE
  179. MOV c42, c11
  180. LD b4, B, 3 * SIZE
  181. MOV c52, c11
  182. LD b5, B, 4 * SIZE
  183. MOV c62, c11
  184. LD b6, B, 8 * SIZE
  185. MOV c72, c11
  186. LD b7, B, 12 * SIZE
  187. MOV c82, c11
  188. move BO, B
  189. bge $r0, L, .L15
  190. #else
  191. #ifdef LN
  192. slli.d TEMP, K, 1 + BASE_SHIFT
  193. sub.d AORIG, AORIG, TEMP
  194. #endif
  195. slli.d L, KK, 1 + BASE_SHIFT
  196. slli.d TEMP, KK, 3 + BASE_SHIFT
  197. add.d AO, AORIG, L
  198. add.d BO, B, TEMP
  199. sub.d TEMP, K, KK
  200. LD a1, AO, 0 * SIZE
  201. MOV c71, c11
  202. LD b1, BO, 0 * SIZE
  203. MOV c81, c11
  204. LD a3, AO, 4 * SIZE
  205. MOV c12, c11
  206. LD b2, BO, 1 * SIZE
  207. MOV c22, c11
  208. srai.d L, TEMP, 2
  209. MOV c32, c11
  210. LD b3, BO, 2 * SIZE
  211. MOV c42, c11
  212. LD b4, BO, 3 * SIZE
  213. MOV c52, c11
  214. LD b5, BO, 4 * SIZE
  215. MOV c62, c11
  216. LD b6, BO, 8 * SIZE
  217. MOV c72, c11
  218. LD b7, BO, 12 * SIZE
  219. MOV c82, c11
  220. bge $r0, L, .L15
  221. #endif
  222. MADD c11, b1, a1, c11
  223. LD a2, AO, 1 * SIZE
  224. MADD c21, b2, a1, c21
  225. addi.d L, L, -1
  226. MADD c31, b3, a1, c31
  227. MADD c41, b4, a1, c41
  228. bge $r0, L, .L13
  229. .align 3
  230. .L12:
  231. MADD c12, b1, a2, c12
  232. LD b1, BO, 16 * SIZE
  233. MADD c22, b2, a2, c22
  234. LD b2, BO, 5 * SIZE
  235. MADD c32, b3, a2, c32
  236. LD b3, BO, 6 * SIZE
  237. MADD c42, b4, a2, c42
  238. LD b4, BO, 7 * SIZE
  239. MADD c51, b5, a1, c51
  240. MADD c61, b2, a1, c61
  241. LD a4, AO, 2 * SIZE
  242. MADD c71, b3, a1, c71
  243. MADD c81, b4, a1, c81
  244. LD a1, AO, 8 * SIZE
  245. MADD c52, b5, a2, c52
  246. LD b5, BO, 20 * SIZE
  247. MADD c62, b2, a2, c62
  248. LD b2, BO, 9 * SIZE
  249. MADD c72, b3, a2, c72
  250. LD b3, BO, 10 * SIZE
  251. MADD c82, b4, a2, c82
  252. LD b4, BO, 11 * SIZE
  253. MADD c11, b6, a4, c11
  254. LD a2, AO, 3 * SIZE
  255. MADD c21, b2, a4, c21
  256. MADD c31, b3, a4, c31
  257. MADD c41, b4, a4, c41
  258. MADD c12, b6, a2, c12
  259. LD b6, BO, 24 * SIZE
  260. MADD c22, b2, a2, c22
  261. LD b2, BO, 13 * SIZE
  262. MADD c32, b3, a2, c32
  263. LD b3, BO, 14 * SIZE
  264. MADD c42, b4, a2, c42
  265. LD b4, BO, 15 * SIZE
  266. MADD c51, b7, a4, c51
  267. MADD c61, b2, a4, c61
  268. MADD c71, b3, a4, c71
  269. MADD c81, b4, a4, c81
  270. MADD c52, b7, a2, c52
  271. LD b7, BO, 28 * SIZE
  272. MADD c62, b2, a2, c62
  273. LD b2, BO, 17 * SIZE
  274. MADD c72, b3, a2, c72
  275. LD b3, BO, 18 * SIZE
  276. MADD c82, b4, a2, c82
  277. LD b4, BO, 19 * SIZE
  278. MADD c11, b1, a3, c11
  279. LD a2, AO, 5 * SIZE
  280. MADD c21, b2, a3, c21
  281. MADD c31, b3, a3, c31
  282. MADD c41, b4, a3, c41
  283. MADD c12, b1, a2, c12
  284. LD b1, BO, 32 * SIZE
  285. MADD c22, b2, a2, c22
  286. LD b2, BO, 21 * SIZE
  287. MADD c32, b3, a2, c32
  288. LD b3, BO, 22 * SIZE
  289. MADD c42, b4, a2, c42
  290. LD b4, BO, 23 * SIZE
  291. MADD c51, b5, a3, c51
  292. MADD c61, b2, a3, c61
  293. LD a4, AO, 6 * SIZE
  294. MADD c71, b3, a3, c71
  295. MADD c81, b4, a3, c81
  296. LD a3, AO, 12 * SIZE
  297. MADD c52, b5, a2, c52
  298. LD b5, BO, 36 * SIZE
  299. MADD c62, b2, a2, c62
  300. LD b2, BO, 25 * SIZE
  301. MADD c72, b3, a2, c72
  302. LD b3, BO, 26 * SIZE
  303. MADD c82, b4, a2, c82
  304. LD b4, BO, 27 * SIZE
  305. MADD c11, b6, a4, c11
  306. LD a2, AO, 7 * SIZE
  307. MADD c21, b2, a4, c21
  308. MADD c31, b3, a4, c31
  309. MADD c41, b4, a4, c41
  310. addi.d L, L, -1
  311. MADD c12, b6, a2, c12
  312. LD b6, BO, 40 * SIZE
  313. MADD c22, b2, a2, c22
  314. LD b2, BO, 29 * SIZE
  315. MADD c32, b3, a2, c32
  316. LD b3, BO, 30 * SIZE
  317. MADD c42, b4, a2, c42
  318. LD b4, BO, 31 * SIZE
  319. MADD c51, b7, a4, c51
  320. addi.d BO, BO, 32 * SIZE
  321. MADD c61, b2, a4, c61
  322. addi.d AO, AO, 8 * SIZE
  323. MADD c71, b3, a4, c71
  324. MADD c81, b4, a4, c81
  325. MADD c52, b7, a2, c52
  326. LD b7, BO, 12 * SIZE
  327. MADD c62, b2, a2, c62
  328. LD b2, BO, 1 * SIZE
  329. MADD c72, b3, a2, c72
  330. LD b3, BO, 2 * SIZE
  331. MADD c82, b4, a2, c82
  332. LD b4, BO, 3 * SIZE
  333. MADD c11, b1, a1, c11
  334. LD a2, AO, 1 * SIZE
  335. MADD c21, b2, a1, c21
  336. MADD c31, b3, a1, c31
  337. MADD c41, b4, a1, c41
  338. blt $r0, L, .L12
  339. .align 3
  340. .L13:
  341. MADD c12, b1, a2, c12
  342. LD b1, BO, 16 * SIZE
  343. MADD c22, b2, a2, c22
  344. LD b2, BO, 5 * SIZE
  345. MADD c32, b3, a2, c32
  346. LD b3, BO, 6 * SIZE
  347. MADD c42, b4, a2, c42
  348. LD b4, BO, 7 * SIZE
  349. MADD c51, b5, a1, c51
  350. MADD c61, b2, a1, c61
  351. LD a4, AO, 2 * SIZE
  352. MADD c71, b3, a1, c71
  353. MADD c81, b4, a1, c81
  354. LD a1, AO, 8 * SIZE
  355. MADD c52, b5, a2, c52
  356. LD b5, BO, 20 * SIZE
  357. MADD c62, b2, a2, c62
  358. LD b2, BO, 9 * SIZE
  359. MADD c72, b3, a2, c72
  360. LD b3, BO, 10 * SIZE
  361. MADD c82, b4, a2, c82
  362. LD b4, BO, 11 * SIZE
  363. MADD c11, b6, a4, c11
  364. LD a2, AO, 3 * SIZE
  365. MADD c21, b2, a4, c21
  366. MADD c31, b3, a4, c31
  367. MADD c41, b4, a4, c41
  368. MADD c12, b6, a2, c12
  369. LD b6, BO, 24 * SIZE
  370. MADD c22, b2, a2, c22
  371. LD b2, BO, 13 * SIZE
  372. MADD c32, b3, a2, c32
  373. LD b3, BO, 14 * SIZE
  374. MADD c42, b4, a2, c42
  375. LD b4, BO, 15 * SIZE
  376. MADD c51, b7, a4, c51
  377. MADD c61, b2, a4, c61
  378. MADD c71, b3, a4, c71
  379. MADD c81, b4, a4, c81
  380. MADD c52, b7, a2, c52
  381. LD b7, BO, 28 * SIZE
  382. MADD c62, b2, a2, c62
  383. LD b2, BO, 17 * SIZE
  384. MADD c72, b3, a2, c72
  385. LD b3, BO, 18 * SIZE
  386. MADD c82, b4, a2, c82
  387. LD b4, BO, 19 * SIZE
  388. MADD c11, b1, a3, c11
  389. LD a2, AO, 5 * SIZE
  390. MADD c21, b2, a3, c21
  391. MADD c31, b3, a3, c31
  392. MADD c41, b4, a3, c41
  393. MADD c12, b1, a2, c12
  394. LD b1, BO, 32 * SIZE
  395. MADD c22, b2, a2, c22
  396. LD b2, BO, 21 * SIZE
  397. MADD c32, b3, a2, c32
  398. LD b3, BO, 22 * SIZE
  399. MADD c42, b4, a2, c42
  400. LD b4, BO, 23 * SIZE
  401. MADD c51, b5, a3, c51
  402. MADD c61, b2, a3, c61
  403. LD a4, AO, 6 * SIZE
  404. MADD c71, b3, a3, c71
  405. MADD c81, b4, a3, c81
  406. LD a3, AO, 12 * SIZE
  407. MADD c52, b5, a2, c52
  408. LD b5, BO, 36 * SIZE
  409. MADD c62, b2, a2, c62
  410. LD b2, BO, 25 * SIZE
  411. MADD c72, b3, a2, c72
  412. LD b3, BO, 26 * SIZE
  413. MADD c82, b4, a2, c82
  414. LD b4, BO, 27 * SIZE
  415. MADD c11, b6, a4, c11
  416. LD a2, AO, 7 * SIZE
  417. MADD c21, b2, a4, c21
  418. MADD c31, b3, a4, c31
  419. MADD c41, b4, a4, c41
  420. MADD c12, b6, a2, c12
  421. LD b6, BO, 40 * SIZE
  422. MADD c22, b2, a2, c22
  423. LD b2, BO, 29 * SIZE
  424. MADD c32, b3, a2, c32
  425. LD b3, BO, 30 * SIZE
  426. MADD c42, b4, a2, c42
  427. LD b4, BO, 31 * SIZE
  428. MADD c51, b7, a4, c51
  429. addi.d BO, BO, 32 * SIZE
  430. MADD c61, b2, a4, c61
  431. addi.d AO, AO, 8 * SIZE
  432. MADD c71, b3, a4, c71
  433. MADD c81, b4, a4, c81
  434. MADD c52, b7, a2, c52
  435. LD b7, BO, 12 * SIZE
  436. MADD c62, b2, a2, c62
  437. LD b2, BO, 1 * SIZE
  438. MADD c72, b3, a2, c72
  439. LD b3, BO, 2 * SIZE
  440. MADD c82, b4, a2, c82
  441. LD b4, BO, 3 * SIZE
  442. .align 3
  443. .L15:
  444. #if defined(LT) || defined(RN)
  445. andi L, KK, 3
  446. #else
  447. andi L, TEMP, 3
  448. #endif
  449. bge $r0, L, .L18
  450. .align 3
  451. .L16:
  452. MADD c11, b1, a1, c11
  453. LD a2, AO, 1 * SIZE
  454. MADD c21, b2, a1, c21
  455. MADD c31, b3, a1, c31
  456. MADD c41, b4, a1, c41
  457. MADD c12, b1, a2, c12
  458. LD b1, BO, 8 * SIZE
  459. MADD c22, b2, a2, c22
  460. LD b2, BO, 5 * SIZE
  461. MADD c32, b3, a2, c32
  462. LD b3, BO, 6 * SIZE
  463. MADD c42, b4, a2, c42
  464. LD b4, BO, 7 * SIZE
  465. MADD c51, b5, a1, c51
  466. addi.d L, L, -1
  467. MADD c61, b2, a1, c61
  468. addi.d AO, AO, 2 * SIZE
  469. MADD c71, b3, a1, c71
  470. addi.d BO, BO, 8 * SIZE
  471. MADD c81, b4, a1, c81
  472. LD a1, AO, 0 * SIZE
  473. MADD c52, b5, a2, c52
  474. LD b5, BO, 4 * SIZE
  475. MADD c62, b2, a2, c62
  476. LD b2, BO, 1 * SIZE
  477. MADD c72, b3, a2, c72
  478. LD b3, BO, 2 * SIZE
  479. MADD c82, b4, a2, c82
  480. LD b4, BO, 3 * SIZE
  481. blt $r0, L, .L16
  482. .L18:
  483. #if defined(LN) || defined(RT)
  484. #ifdef LN
  485. addi.d TEMP, KK, -2
  486. #else
  487. addi.d TEMP, KK, -8
  488. #endif
  489. slli.d L, TEMP, 1 + BASE_SHIFT
  490. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  491. add.d AO, AORIG, L
  492. add.d BO, B, TEMP
  493. #endif
  494. #if defined(LN) || defined(LT)
  495. LD b1, BO, 0 * SIZE
  496. LD b2, BO, 1 * SIZE
  497. LD b3, BO, 2 * SIZE
  498. LD b4, BO, 3 * SIZE
  499. SUB c11, b1, c11
  500. LD b5, BO, 4 * SIZE
  501. SUB c21, b2, c21
  502. LD b6, BO, 5 * SIZE
  503. SUB c31, b3, c31
  504. LD b7, BO, 6 * SIZE
  505. SUB c41, b4, c41
  506. LD b8, BO, 7 * SIZE
  507. SUB c51, b5, c51
  508. LD b1, BO, 8 * SIZE
  509. SUB c61, b6, c61
  510. LD b2, BO, 9 * SIZE
  511. SUB c71, b7, c71
  512. LD b3, BO, 10 * SIZE
  513. SUB c81, b8, c81
  514. LD b4, BO, 11 * SIZE
  515. SUB c12, b1, c12
  516. LD b5, BO, 12 * SIZE
  517. SUB c22, b2, c22
  518. LD b6, BO, 13 * SIZE
  519. SUB c32, b3, c32
  520. LD b7, BO, 14 * SIZE
  521. SUB c42, b4, c42
  522. LD b8, BO, 15 * SIZE
  523. SUB c52, b5, c52
  524. #ifdef LN
  525. LD b1, AO, 3 * SIZE
  526. #else
  527. LD b1, AO, 0 * SIZE
  528. #endif
  529. SUB c62, b6, c62
  530. SUB c72, b7, c72
  531. SUB c82, b8, c82
  532. #else
  533. LD b1, AO, 0 * SIZE
  534. LD b2, AO, 1 * SIZE
  535. LD b3, AO, 2 * SIZE
  536. LD b4, AO, 3 * SIZE
  537. SUB c11, b1, c11
  538. LD b5, AO, 4 * SIZE
  539. SUB c12, b2, c12
  540. LD b6, AO, 5 * SIZE
  541. SUB c21, b3, c21
  542. LD b7, AO, 6 * SIZE
  543. SUB c22, b4, c22
  544. LD b8, AO, 7 * SIZE
  545. SUB c31, b5, c31
  546. LD b1, AO, 8 * SIZE
  547. SUB c32, b6, c32
  548. LD b2, AO, 9 * SIZE
  549. SUB c41, b7, c41
  550. LD b3, AO, 10 * SIZE
  551. SUB c42, b8, c42
  552. LD b4, AO, 11 * SIZE
  553. LD b5, AO, 12 * SIZE
  554. SUB c51, b1, c51
  555. LD b6, AO, 13 * SIZE
  556. SUB c52, b2, c52
  557. LD b7, AO, 14 * SIZE
  558. SUB c61, b3, c61
  559. LD b8, AO, 15 * SIZE
  560. SUB c62, b4, c62
  561. SUB c71, b5, c71
  562. SUB c72, b6, c72
  563. SUB c81, b7, c81
  564. SUB c82, b8, c82
  565. #endif
  566. #ifdef LN
  567. MUL c12, b1, c12
  568. LD b2, AO, 2 * SIZE
  569. MUL c22, b1, c22
  570. MUL c32, b1, c32
  571. MUL c42, b1, c42
  572. MUL c52, b1, c52
  573. MUL c62, b1, c62
  574. MUL c72, b1, c72
  575. MUL c82, b1, c82
  576. NMSUB c11, c12, b2, c11
  577. LD b3, AO, 0 * SIZE
  578. NMSUB c21, c22, b2, c21
  579. NMSUB c31, c32, b2, c31
  580. NMSUB c41, c42, b2, c41
  581. NMSUB c51, c52, b2, c51
  582. NMSUB c61, c62, b2, c61
  583. NMSUB c71, c72, b2, c71
  584. NMSUB c81, c82, b2, c81
  585. MUL c11, b3, c11
  586. addi.d CO1, CO1, -2 * SIZE
  587. MUL c21, b3, c21
  588. addi.d CO2, CO2, -2 * SIZE
  589. MUL c31, b3, c31
  590. addi.d CO3, CO3, -2 * SIZE
  591. MUL c41, b3, c41
  592. addi.d CO4, CO4, -2 * SIZE
  593. MUL c51, b3, c51
  594. addi.d CO5, CO5, -2 * SIZE
  595. MUL c61, b3, c61
  596. addi.d CO6, CO6, -2 * SIZE
  597. MUL c71, b3, c71
  598. addi.d CO7, CO7, -2 * SIZE
  599. MUL c81, b3, c81
  600. addi.d CO8, CO8, -2 * SIZE
  601. #endif
  602. #ifdef LT
  603. MUL c11, b1, c11
  604. LD b2, AO, 1 * SIZE
  605. MUL c21, b1, c21
  606. MUL c31, b1, c31
  607. MUL c41, b1, c41
  608. MUL c51, b1, c51
  609. MUL c61, b1, c61
  610. MUL c71, b1, c71
  611. MUL c81, b1, c81
  612. NMSUB c12, c11, b2, c12
  613. LD b3, AO, 3 * SIZE
  614. NMSUB c22, c21, b2, c22
  615. NMSUB c32, c31, b2, c32
  616. NMSUB c42, c41, b2, c42
  617. NMSUB c52, c51, b2, c52
  618. NMSUB c62, c61, b2, c62
  619. NMSUB c72, c71, b2, c72
  620. NMSUB c82, c81, b2, c82
  621. MUL c12, b3, c12
  622. MUL c22, b3, c22
  623. MUL c32, b3, c32
  624. MUL c42, b3, c42
  625. MUL c52, b3, c52
  626. MUL c62, b3, c62
  627. MUL c72, b3, c72
  628. MUL c82, b3, c82
  629. #endif
  630. #ifdef RN
  631. LD b1, BO, 0 * SIZE
  632. LD b2, BO, 1 * SIZE
  633. LD b3, BO, 2 * SIZE
  634. LD b4, BO, 3 * SIZE
  635. MUL c11, b1, c11
  636. MUL c12, b1, c12
  637. LD b5, BO, 4 * SIZE
  638. NMSUB c21, c11, b2, c21
  639. NMSUB c22, c12, b2, c22
  640. LD b6, BO, 5 * SIZE
  641. NMSUB c31, c11, b3, c31
  642. NMSUB c32, c12, b3, c32
  643. LD b7, BO, 6 * SIZE
  644. NMSUB c41, c11, b4, c41
  645. NMSUB c42, c12, b4, c42
  646. LD b8, BO, 7 * SIZE
  647. NMSUB c51, c11, b5, c51
  648. NMSUB c52, c12, b5, c52
  649. LD b2, BO, 9 * SIZE
  650. NMSUB c61, c11, b6, c61
  651. NMSUB c62, c12, b6, c62
  652. LD b3, BO, 10 * SIZE
  653. NMSUB c71, c11, b7, c71
  654. NMSUB c72, c12, b7, c72
  655. LD b4, BO, 11 * SIZE
  656. NMSUB c81, c11, b8, c81
  657. NMSUB c82, c12, b8, c82
  658. LD b5, BO, 12 * SIZE
  659. MUL c21, b2, c21
  660. MUL c22, b2, c22
  661. LD b6, BO, 13 * SIZE
  662. NMSUB c31, c21, b3, c31
  663. NMSUB c32, c22, b3, c32
  664. LD b7, BO, 14 * SIZE
  665. NMSUB c41, c21, b4, c41
  666. NMSUB c42, c22, b4, c42
  667. LD b8, BO, 15 * SIZE
  668. NMSUB c51, c21, b5, c51
  669. NMSUB c52, c22, b5, c52
  670. LD b3, BO, 18 * SIZE
  671. NMSUB c61, c21, b6, c61
  672. NMSUB c62, c22, b6, c62
  673. LD b4, BO, 19 * SIZE
  674. NMSUB c71, c21, b7, c71
  675. NMSUB c72, c22, b7, c72
  676. LD b5, BO, 20 * SIZE
  677. NMSUB c81, c21, b8, c81
  678. NMSUB c82, c22, b8, c82
  679. LD b6, BO, 21 * SIZE
  680. MUL c31, b3, c31
  681. MUL c32, b3, c32
  682. LD b7, BO, 22 * SIZE
  683. NMSUB c41, c31, b4, c41
  684. NMSUB c42, c32, b4, c42
  685. LD b8, BO, 23 * SIZE
  686. NMSUB c51, c31, b5, c51
  687. NMSUB c52, c32, b5, c52
  688. LD b4, BO, 27 * SIZE
  689. NMSUB c61, c31, b6, c61
  690. NMSUB c62, c32, b6, c62
  691. LD b5, BO, 28 * SIZE
  692. NMSUB c71, c31, b7, c71
  693. NMSUB c72, c32, b7, c72
  694. LD b6, BO, 29 * SIZE
  695. NMSUB c81, c31, b8, c81
  696. NMSUB c82, c32, b8, c82
  697. LD b7, BO, 30 * SIZE
  698. MUL c41, b4, c41
  699. MUL c42, b4, c42
  700. LD b8, BO, 31 * SIZE
  701. NMSUB c51, c41, b5, c51
  702. NMSUB c52, c42, b5, c52
  703. LD b5, BO, 36 * SIZE
  704. NMSUB c61, c41, b6, c61
  705. NMSUB c62, c42, b6, c62
  706. LD b6, BO, 37 * SIZE
  707. NMSUB c71, c41, b7, c71
  708. NMSUB c72, c42, b7, c72
  709. LD b7, BO, 38 * SIZE
  710. NMSUB c81, c41, b8, c81
  711. NMSUB c82, c42, b8, c82
  712. LD b8, BO, 39 * SIZE
  713. MUL c51, b5, c51
  714. MUL c52, b5, c52
  715. NMSUB c61, c51, b6, c61
  716. NMSUB c62, c52, b6, c62
  717. LD b6, BO, 45 * SIZE
  718. NMSUB c71, c51, b7, c71
  719. NMSUB c72, c52, b7, c72
  720. LD b7, BO, 46 * SIZE
  721. NMSUB c81, c51, b8, c81
  722. NMSUB c82, c52, b8, c82
  723. LD b8, BO, 47 * SIZE
  724. MUL c61, b6, c61
  725. MUL c62, b6, c62
  726. NMSUB c71, c61, b7, c71
  727. NMSUB c72, c62, b7, c72
  728. LD b7, BO, 54 * SIZE
  729. NMSUB c81, c61, b8, c81
  730. NMSUB c82, c62, b8, c82
  731. LD b8, BO, 55 * SIZE
  732. MUL c71, b7, c71
  733. MUL c72, b7, c72
  734. NMSUB c81, c71, b8, c81
  735. NMSUB c82, c72, b8, c82
  736. LD b8, BO, 63 * SIZE
  737. MUL c81, b8, c81
  738. MUL c82, b8, c82
  739. #endif
  740. #ifdef RT
  741. LD b1, BO, 63 * SIZE
  742. LD b2, BO, 62 * SIZE
  743. LD b3, BO, 61 * SIZE
  744. LD b4, BO, 60 * SIZE
  745. MUL c81, b1, c81
  746. MUL c82, b1, c82
  747. LD b5, BO, 59 * SIZE
  748. NMSUB c71, c81, b2, c71
  749. NMSUB c72, c82, b2, c72
  750. LD b6, BO, 58 * SIZE
  751. NMSUB c61, c81, b3, c61
  752. NMSUB c62, c82, b3, c62
  753. LD b7, BO, 57 * SIZE
  754. NMSUB c51, c81, b4, c51
  755. NMSUB c52, c82, b4, c52
  756. LD b8, BO, 56 * SIZE
  757. NMSUB c41, c81, b5, c41
  758. NMSUB c42, c82, b5, c42
  759. LD b2, BO, 54 * SIZE
  760. NMSUB c31, c81, b6, c31
  761. NMSUB c32, c82, b6, c32
  762. LD b3, BO, 53 * SIZE
  763. NMSUB c21, c81, b7, c21
  764. NMSUB c22, c82, b7, c22
  765. LD b4, BO, 52 * SIZE
  766. NMSUB c11, c81, b8, c11
  767. NMSUB c12, c82, b8, c12
  768. LD b5, BO, 51 * SIZE
  769. MUL c71, b2, c71
  770. MUL c72, b2, c72
  771. LD b6, BO, 50 * SIZE
  772. NMSUB c61, c71, b3, c61
  773. NMSUB c62, c72, b3, c62
  774. LD b7, BO, 49 * SIZE
  775. NMSUB c51, c71, b4, c51
  776. NMSUB c52, c72, b4, c52
  777. LD b8, BO, 48 * SIZE
  778. NMSUB c41, c71, b5, c41
  779. NMSUB c42, c72, b5, c42
  780. LD b3, BO, 45 * SIZE
  781. NMSUB c31, c71, b6, c31
  782. NMSUB c32, c72, b6, c32
  783. LD b4, BO, 44 * SIZE
  784. NMSUB c21, c71, b7, c21
  785. NMSUB c22, c72, b7, c22
  786. LD b5, BO, 43 * SIZE
  787. NMSUB c11, c71, b8, c11
  788. NMSUB c12, c72, b8, c12
  789. LD b6, BO, 42 * SIZE
  790. MUL c61, b3, c61
  791. MUL c62, b3, c62
  792. LD b7, BO, 41 * SIZE
  793. NMSUB c51, c61, b4, c51
  794. NMSUB c52, c62, b4, c52
  795. LD b8, BO, 40 * SIZE
  796. NMSUB c41, c61, b5, c41
  797. NMSUB c42, c62, b5, c42
  798. LD b4, BO, 36 * SIZE
  799. NMSUB c31, c61, b6, c31
  800. NMSUB c32, c62, b6, c32
  801. LD b5, BO, 35 * SIZE
  802. NMSUB c21, c61, b7, c21
  803. NMSUB c22, c62, b7, c22
  804. LD b6, BO, 34 * SIZE
  805. NMSUB c11, c61, b8, c11
  806. NMSUB c12, c62, b8, c12
  807. LD b7, BO, 33 * SIZE
  808. MUL c51, b4, c51
  809. MUL c52, b4, c52
  810. LD b8, BO, 32 * SIZE
  811. NMSUB c41, c51, b5, c41
  812. NMSUB c42, c52, b5, c42
  813. LD b5, BO, 27 * SIZE
  814. NMSUB c31, c51, b6, c31
  815. NMSUB c32, c52, b6, c32
  816. LD b6, BO, 26 * SIZE
  817. NMSUB c21, c51, b7, c21
  818. NMSUB c22, c52, b7, c22
  819. LD b7, BO, 25 * SIZE
  820. NMSUB c11, c51, b8, c11
  821. NMSUB c12, c52, b8, c12
  822. LD b8, BO, 24 * SIZE
  823. MUL c41, b5, c41
  824. MUL c42, b5, c42
  825. NMSUB c31, c41, b6, c31
  826. NMSUB c32, c42, b6, c32
  827. LD b6, BO, 18 * SIZE
  828. NMSUB c21, c41, b7, c21
  829. NMSUB c22, c42, b7, c22
  830. LD b7, BO, 17 * SIZE
  831. NMSUB c11, c41, b8, c11
  832. NMSUB c12, c42, b8, c12
  833. LD b8, BO, 16 * SIZE
  834. MUL c31, b6, c31
  835. MUL c32, b6, c32
  836. NMSUB c21, c31, b7, c21
  837. NMSUB c22, c32, b7, c22
  838. LD b7, BO, 9 * SIZE
  839. NMSUB c11, c31, b8, c11
  840. NMSUB c12, c32, b8, c12
  841. LD b8, BO, 8 * SIZE
  842. MUL c21, b7, c21
  843. MUL c22, b7, c22
  844. NMSUB c11, c21, b8, c11
  845. NMSUB c12, c22, b8, c12
  846. LD b8, BO, 0 * SIZE
  847. MUL c11, b8, c11
  848. MUL c12, b8, c12
  849. #endif
  850. #if defined(LN) || defined(LT)
  851. ST c11, BO, 0 * SIZE
  852. ST c21, BO, 1 * SIZE
  853. ST c31, BO, 2 * SIZE
  854. ST c41, BO, 3 * SIZE
  855. ST c51, BO, 4 * SIZE
  856. ST c61, BO, 5 * SIZE
  857. ST c71, BO, 6 * SIZE
  858. ST c81, BO, 7 * SIZE
  859. ST c12, BO, 8 * SIZE
  860. ST c22, BO, 9 * SIZE
  861. ST c32, BO, 10 * SIZE
  862. ST c42, BO, 11 * SIZE
  863. ST c52, BO, 12 * SIZE
  864. ST c62, BO, 13 * SIZE
  865. ST c72, BO, 14 * SIZE
  866. ST c82, BO, 15 * SIZE
  867. #else
  868. ST c11, AO, 0 * SIZE
  869. ST c12, AO, 1 * SIZE
  870. ST c21, AO, 2 * SIZE
  871. ST c22, AO, 3 * SIZE
  872. ST c31, AO, 4 * SIZE
  873. ST c32, AO, 5 * SIZE
  874. ST c41, AO, 6 * SIZE
  875. ST c42, AO, 7 * SIZE
  876. ST c51, AO, 8 * SIZE
  877. ST c52, AO, 9 * SIZE
  878. ST c61, AO, 10 * SIZE
  879. ST c62, AO, 11 * SIZE
  880. ST c71, AO, 12 * SIZE
  881. ST c72, AO, 13 * SIZE
  882. ST c81, AO, 14 * SIZE
  883. ST c82, AO, 15 * SIZE
  884. #endif
  885. ST c11, CO1, 0 * SIZE
  886. ST c12, CO1, 1 * SIZE
  887. ST c21, CO2, 0 * SIZE
  888. ST c22, CO2, 1 * SIZE
  889. ST c31, CO3, 0 * SIZE
  890. ST c32, CO3, 1 * SIZE
  891. ST c41, CO4, 0 * SIZE
  892. ST c42, CO4, 1 * SIZE
  893. ST c51, CO5, 0 * SIZE
  894. ST c52, CO5, 1 * SIZE
  895. ST c61, CO6, 0 * SIZE
  896. ST c62, CO6, 1 * SIZE
  897. ST c71, CO7, 0 * SIZE
  898. ST c72, CO7, 1 * SIZE
  899. ST c81, CO8, 0 * SIZE
  900. ST c82, CO8, 1 * SIZE
  901. MTC a1, $r0
  902. #ifndef LN
  903. addi.d CO1, CO1, 2 * SIZE
  904. addi.d CO2, CO2, 2 * SIZE
  905. addi.d CO3, CO3, 2 * SIZE
  906. addi.d CO4, CO4, 2 * SIZE
  907. addi.d CO5, CO5, 2 * SIZE
  908. addi.d CO6, CO6, 2 * SIZE
  909. addi.d CO7, CO7, 2 * SIZE
  910. addi.d CO8, CO8, 2 * SIZE
  911. #endif
  912. MOV c11, a1
  913. MOV c21, a1
  914. #ifdef RT
  915. slli.d TEMP, K, 1 + BASE_SHIFT
  916. add.d AORIG, AORIG, TEMP
  917. #endif
  918. MOV c31, a1
  919. MOV c41, a1
  920. #if defined(LT) || defined(RN)
  921. sub.d TEMP, K, KK
  922. slli.d L, TEMP, 1 + BASE_SHIFT
  923. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  924. add.d AO, AO, L
  925. add.d BO, BO, TEMP
  926. #endif
  927. #ifdef LT
  928. addi.d KK, KK, 2
  929. #endif
  930. #ifdef LN
  931. addi.d KK, KK, -2
  932. #endif
  933. addi.d I, I, -1
  934. MOV c51, a1
  935. MOV c61, a1
  936. blt $r0, I, .L11
  937. .align 3
  938. .L20:
  939. andi I, M, 1
  940. MOV c61, c11
  941. MOV c71, c11
  942. bge $r0, I, .L29
  943. #if defined(LT) || defined(RN)
  944. LD a1, AO, 0 * SIZE
  945. LD a2, AO, 1 * SIZE
  946. LD a3, AO, 2 * SIZE
  947. LD a4, AO, 3 * SIZE
  948. LD b1, B, 0 * SIZE
  949. LD b2, B, 1 * SIZE
  950. LD b3, B, 2 * SIZE
  951. LD b4, B, 3 * SIZE
  952. LD b5, B, 4 * SIZE
  953. LD b6, B, 8 * SIZE
  954. LD b7, B, 12 * SIZE
  955. srai.d L, KK, 2
  956. MOV c81, c11
  957. move BO, B
  958. bge $r0, L, .L25
  959. #else
  960. #ifdef LN
  961. slli.d TEMP, K, 0 + BASE_SHIFT
  962. sub.d AORIG, AORIG, TEMP
  963. #endif
  964. slli.d L, KK, 0 + BASE_SHIFT
  965. slli.d TEMP, KK, 3 + BASE_SHIFT
  966. add.d AO, AORIG, L
  967. add.d BO, B, TEMP
  968. sub.d TEMP, K, KK
  969. LD a1, AO, 0 * SIZE
  970. LD a2, AO, 1 * SIZE
  971. LD a3, AO, 2 * SIZE
  972. LD a4, AO, 3 * SIZE
  973. LD b1, BO, 0 * SIZE
  974. LD b2, BO, 1 * SIZE
  975. LD b3, BO, 2 * SIZE
  976. LD b4, BO, 3 * SIZE
  977. LD b5, BO, 4 * SIZE
  978. LD b6, BO, 8 * SIZE
  979. LD b7, BO, 12 * SIZE
  980. srai.d L, TEMP, 2
  981. MOV c81, c11
  982. bge $r0, L, .L25
  983. #endif
  984. .align 3
  985. .L22:
  986. MADD c11, b1, a1, c11
  987. LD b1, BO, 16 * SIZE
  988. MADD c21, b2, a1, c21
  989. LD b2, BO, 5 * SIZE
  990. MADD c31, b3, a1, c31
  991. LD b3, BO, 6 * SIZE
  992. MADD c41, b4, a1, c41
  993. LD b4, BO, 7 * SIZE
  994. MADD c51, b5, a1, c51
  995. LD b5, BO, 20 * SIZE
  996. MADD c61, b2, a1, c61
  997. LD b2, BO, 9 * SIZE
  998. MADD c71, b3, a1, c71
  999. LD b3, BO, 10 * SIZE
  1000. MADD c81, b4, a1, c81
  1001. LD b4, BO, 11 * SIZE
  1002. LD a1, AO, 4 * SIZE
  1003. addi.d L, L, -1
  1004. MADD c11, b6, a2, c11
  1005. LD b6, BO, 24 * SIZE
  1006. MADD c21, b2, a2, c21
  1007. LD b2, BO, 13 * SIZE
  1008. MADD c31, b3, a2, c31
  1009. LD b3, BO, 14 * SIZE
  1010. MADD c41, b4, a2, c41
  1011. LD b4, BO, 15 * SIZE
  1012. MADD c51, b7, a2, c51
  1013. LD b7, BO, 28 * SIZE
  1014. MADD c61, b2, a2, c61
  1015. LD b2, BO, 17 * SIZE
  1016. MADD c71, b3, a2, c71
  1017. LD b3, BO, 18 * SIZE
  1018. MADD c81, b4, a2, c81
  1019. LD b4, BO, 19 * SIZE
  1020. LD a2, AO, 5 * SIZE
  1021. addi.d AO, AO, 4 * SIZE
  1022. MADD c11, b1, a3, c11
  1023. LD b1, BO, 32 * SIZE
  1024. MADD c21, b2, a3, c21
  1025. LD b2, BO, 21 * SIZE
  1026. MADD c31, b3, a3, c31
  1027. LD b3, BO, 22 * SIZE
  1028. MADD c41, b4, a3, c41
  1029. LD b4, BO, 23 * SIZE
  1030. MADD c51, b5, a3, c51
  1031. LD b5, BO, 36 * SIZE
  1032. MADD c61, b2, a3, c61
  1033. LD b2, BO, 25 * SIZE
  1034. MADD c71, b3, a3, c71
  1035. LD b3, BO, 26 * SIZE
  1036. MADD c81, b4, a3, c81
  1037. LD b4, BO, 27 * SIZE
  1038. LD a3, AO, 2 * SIZE
  1039. addi.d BO, BO, 32 * SIZE
  1040. MADD c11, b6, a4, c11
  1041. LD b6, BO, 8 * SIZE
  1042. MADD c21, b2, a4, c21
  1043. LD b2, BO, -3 * SIZE
  1044. MADD c31, b3, a4, c31
  1045. LD b3, BO, -2 * SIZE
  1046. MADD c41, b4, a4, c41
  1047. LD b4, BO, -1 * SIZE
  1048. MADD c51, b7, a4, c51
  1049. LD b7, BO, 12 * SIZE
  1050. MADD c61, b2, a4, c61
  1051. LD b2, BO, 1 * SIZE
  1052. MADD c71, b3, a4, c71
  1053. LD b3, BO, 2 * SIZE
  1054. MADD c81, b4, a4, c81
  1055. LD b4, BO, 3 * SIZE
  1056. LD a4, AO, 3 * SIZE
  1057. blt $r0, L, .L22
  1058. .align 3
  1059. .L25:
  1060. #if defined(LT) || defined(RN)
  1061. andi L, KK, 3
  1062. #else
  1063. andi L, TEMP, 3
  1064. #endif
  1065. bge $r0, L, .L28
  1066. .align 3
  1067. .L26:
  1068. MADD c11, b1, a1, c11
  1069. LD b1, BO, 8 * SIZE
  1070. MADD c21, b2, a1, c21
  1071. LD b2, BO, 5 * SIZE
  1072. MADD c31, b3, a1, c31
  1073. LD b3, BO, 6 * SIZE
  1074. MADD c41, b4, a1, c41
  1075. LD b4, BO, 7 * SIZE
  1076. addi.d L, L, -1
  1077. MOV a2, a2
  1078. addi.d AO, AO, 1 * SIZE
  1079. addi.d BO, BO, 8 * SIZE
  1080. MADD c51, b5, a1, c51
  1081. LD b5, BO, 4 * SIZE
  1082. MADD c61, b2, a1, c61
  1083. LD b2, BO, 1 * SIZE
  1084. MADD c71, b3, a1, c71
  1085. LD b3, BO, 2 * SIZE
  1086. MADD c81, b4, a1, c81
  1087. LD a1, AO, 0 * SIZE
  1088. LD b4, BO, 3 * SIZE
  1089. blt $r0, L, .L26
  1090. .L28:
  1091. #if defined(LN) || defined(RT)
  1092. #ifdef LN
  1093. addi.d TEMP, KK, -1
  1094. #else
  1095. addi.d TEMP, KK, -8
  1096. #endif
  1097. slli.d L, TEMP, 0 + BASE_SHIFT
  1098. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  1099. add.d AO, AORIG, L
  1100. add.d BO, B, TEMP
  1101. #endif
  1102. #if defined(LN) || defined(LT)
  1103. LD b1, BO, 0 * SIZE
  1104. LD b2, BO, 1 * SIZE
  1105. LD b3, BO, 2 * SIZE
  1106. LD b4, BO, 3 * SIZE
  1107. LD b5, BO, 4 * SIZE
  1108. LD b6, BO, 5 * SIZE
  1109. LD b7, BO, 6 * SIZE
  1110. LD b8, BO, 7 * SIZE
  1111. SUB c11, b1, c11
  1112. SUB c21, b2, c21
  1113. SUB c31, b3, c31
  1114. SUB c41, b4, c41
  1115. SUB c51, b5, c51
  1116. SUB c61, b6, c61
  1117. SUB c71, b7, c71
  1118. SUB c81, b8, c81
  1119. #else
  1120. LD b1, AO, 0 * SIZE
  1121. LD b2, AO, 1 * SIZE
  1122. LD b3, AO, 2 * SIZE
  1123. LD b4, AO, 3 * SIZE
  1124. LD b5, AO, 4 * SIZE
  1125. LD b6, AO, 5 * SIZE
  1126. LD b7, AO, 6 * SIZE
  1127. LD b8, AO, 7 * SIZE
  1128. SUB c11, b1, c11
  1129. SUB c21, b2, c21
  1130. SUB c31, b3, c31
  1131. SUB c41, b4, c41
  1132. SUB c51, b5, c51
  1133. SUB c61, b6, c61
  1134. SUB c71, b7, c71
  1135. SUB c81, b8, c81
  1136. #endif
  1137. #if defined(LN) || defined(LT)
  1138. LD b1, AO, 0 * SIZE
  1139. MUL c11, b1, c11
  1140. MUL c21, b1, c21
  1141. MUL c31, b1, c31
  1142. MUL c41, b1, c41
  1143. MUL c51, b1, c51
  1144. MUL c61, b1, c61
  1145. MUL c71, b1, c71
  1146. MUL c81, b1, c81
  1147. #endif
  /* RN: substitution running from c11 up to c81. BO is read as an 8x8
     block with row stride 8: step i multiplies c(i+1)1 by BO[9*i] and then
     updates every later value with the entries BO[8*i+j], j > i.
     NOTE(review): assumes NMSUB d,x,y,z yields z - x*y and that the
     BO[9*i] entries are stored pre-inverted (they are applied with MUL,
     not a divide) - confirm against the macro and packing definitions. */
  1148. #ifdef RN
  1149. LD b1, BO, 0 * SIZE
  1150. LD b2, BO, 1 * SIZE
  1151. LD b3, BO, 2 * SIZE
  1152. LD b4, BO, 3 * SIZE
  1153. LD b5, BO, 4 * SIZE
  1154. LD b6, BO, 5 * SIZE
  1155. LD b7, BO, 6 * SIZE
  1156. LD b8, BO, 7 * SIZE
  1157. MUL c11, b1, c11
  1158. NMSUB c21, c11, b2, c21
  1159. NMSUB c31, c11, b3, c31
  1160. NMSUB c41, c11, b4, c41
  1161. NMSUB c51, c11, b5, c51
  1162. NMSUB c61, c11, b6, c61
  1163. NMSUB c71, c11, b7, c71
  1164. NMSUB c81, c11, b8, c81
  /* Row 1: entries BO[9..15]. */
  1165. LD b2, BO, 9 * SIZE
  1166. LD b3, BO, 10 * SIZE
  1167. LD b4, BO, 11 * SIZE
  1168. LD b5, BO, 12 * SIZE
  1169. LD b6, BO, 13 * SIZE
  1170. LD b7, BO, 14 * SIZE
  1171. LD b8, BO, 15 * SIZE
  1172. MUL c21, b2, c21
  1173. NMSUB c31, c21, b3, c31
  1174. NMSUB c41, c21, b4, c41
  1175. NMSUB c51, c21, b5, c51
  1176. NMSUB c61, c21, b6, c61
  1177. NMSUB c71, c21, b7, c71
  1178. NMSUB c81, c21, b8, c81
  /* Row 2: entries BO[18..23]. */
  1179. LD b3, BO, 18 * SIZE
  1180. LD b4, BO, 19 * SIZE
  1181. LD b5, BO, 20 * SIZE
  1182. LD b6, BO, 21 * SIZE
  1183. LD b7, BO, 22 * SIZE
  1184. LD b8, BO, 23 * SIZE
  1185. MUL c31, b3, c31
  1186. NMSUB c41, c31, b4, c41
  1187. NMSUB c51, c31, b5, c51
  1188. NMSUB c61, c31, b6, c61
  1189. NMSUB c71, c31, b7, c71
  1190. NMSUB c81, c31, b8, c81
  /* Row 3: entries BO[27..31]. */
  1191. LD b4, BO, 27 * SIZE
  1192. LD b5, BO, 28 * SIZE
  1193. LD b6, BO, 29 * SIZE
  1194. LD b7, BO, 30 * SIZE
  1195. LD b8, BO, 31 * SIZE
  1196. MUL c41, b4, c41
  1197. NMSUB c51, c41, b5, c51
  1198. NMSUB c61, c41, b6, c61
  1199. NMSUB c71, c41, b7, c71
  1200. NMSUB c81, c41, b8, c81
  /* Row 4: entries BO[36..39]. */
  1201. LD b5, BO, 36 * SIZE
  1202. LD b6, BO, 37 * SIZE
  1203. LD b7, BO, 38 * SIZE
  1204. LD b8, BO, 39 * SIZE
  1205. MUL c51, b5, c51
  1206. NMSUB c61, c51, b6, c61
  1207. NMSUB c71, c51, b7, c71
  1208. NMSUB c81, c51, b8, c81
  /* Row 5: entries BO[45..47]. */
  1209. LD b6, BO, 45 * SIZE
  1210. LD b7, BO, 46 * SIZE
  1211. LD b8, BO, 47 * SIZE
  1212. MUL c61, b6, c61
  1213. NMSUB c71, c61, b7, c71
  1214. NMSUB c81, c61, b8, c81
  /* Row 6: entries BO[54..55]. */
  1215. LD b7, BO, 54 * SIZE
  1216. LD b8, BO, 55 * SIZE
  1217. MUL c71, b7, c71
  1218. NMSUB c81, c71, b8, c81
  /* Row 7: only BO[63] remains. */
  1219. LD b8, BO, 63 * SIZE
  1220. MUL c81, b8, c81
  1221. #endif
  /* RT: substitution running from c81 down to c11, the mirror image of the
     RN order. Step i (7 down to 0) multiplies c(i+1)1 by BO[9*i] and then
     updates every earlier value with BO[8*i+j], j < i, read in descending
     address order.
     NOTE(review): assumes NMSUB d,x,y,z yields z - x*y and that BO[9*i]
     holds a pre-inverted diagonal entry - confirm outside this chunk. */
  1222. #ifdef RT
  1223. LD b1, BO, 63 * SIZE
  1224. LD b2, BO, 62 * SIZE
  1225. LD b3, BO, 61 * SIZE
  1226. LD b4, BO, 60 * SIZE
  1227. LD b5, BO, 59 * SIZE
  1228. LD b6, BO, 58 * SIZE
  1229. LD b7, BO, 57 * SIZE
  1230. LD b8, BO, 56 * SIZE
  1231. MUL c81, b1, c81
  1232. NMSUB c71, c81, b2, c71
  1233. NMSUB c61, c81, b3, c61
  1234. NMSUB c51, c81, b4, c51
  1235. NMSUB c41, c81, b5, c41
  1236. NMSUB c31, c81, b6, c31
  1237. NMSUB c21, c81, b7, c21
  1238. NMSUB c11, c81, b8, c11
  /* Row 6: entries BO[54..48]. */
  1239. LD b2, BO, 54 * SIZE
  1240. LD b3, BO, 53 * SIZE
  1241. LD b4, BO, 52 * SIZE
  1242. LD b5, BO, 51 * SIZE
  1243. LD b6, BO, 50 * SIZE
  1244. LD b7, BO, 49 * SIZE
  1245. LD b8, BO, 48 * SIZE
  1246. MUL c71, b2, c71
  1247. NMSUB c61, c71, b3, c61
  1248. NMSUB c51, c71, b4, c51
  1249. NMSUB c41, c71, b5, c41
  1250. NMSUB c31, c71, b6, c31
  1251. NMSUB c21, c71, b7, c21
  1252. NMSUB c11, c71, b8, c11
  /* Row 5: entries BO[45..40]. */
  1253. LD b3, BO, 45 * SIZE
  1254. LD b4, BO, 44 * SIZE
  1255. LD b5, BO, 43 * SIZE
  1256. LD b6, BO, 42 * SIZE
  1257. LD b7, BO, 41 * SIZE
  1258. LD b8, BO, 40 * SIZE
  1259. MUL c61, b3, c61
  1260. NMSUB c51, c61, b4, c51
  1261. NMSUB c41, c61, b5, c41
  1262. NMSUB c31, c61, b6, c31
  1263. NMSUB c21, c61, b7, c21
  1264. NMSUB c11, c61, b8, c11
  /* Row 4: entries BO[36..32]. */
  1265. LD b4, BO, 36 * SIZE
  1266. LD b5, BO, 35 * SIZE
  1267. LD b6, BO, 34 * SIZE
  1268. LD b7, BO, 33 * SIZE
  1269. LD b8, BO, 32 * SIZE
  1270. MUL c51, b4, c51
  1271. NMSUB c41, c51, b5, c41
  1272. NMSUB c31, c51, b6, c31
  1273. NMSUB c21, c51, b7, c21
  1274. NMSUB c11, c51, b8, c11
  /* Row 3: entries BO[27..24]. */
  1275. LD b5, BO, 27 * SIZE
  1276. LD b6, BO, 26 * SIZE
  1277. LD b7, BO, 25 * SIZE
  1278. LD b8, BO, 24 * SIZE
  1279. MUL c41, b5, c41
  1280. NMSUB c31, c41, b6, c31
  1281. NMSUB c21, c41, b7, c21
  1282. NMSUB c11, c41, b8, c11
  /* Row 2: entries BO[18..16]. */
  1283. LD b6, BO, 18 * SIZE
  1284. LD b7, BO, 17 * SIZE
  1285. LD b8, BO, 16 * SIZE
  1286. MUL c31, b6, c31
  1287. NMSUB c21, c31, b7, c21
  1288. NMSUB c11, c31, b8, c11
  /* Row 1: entries BO[9..8]. */
  1289. LD b7, BO, 9 * SIZE
  1290. LD b8, BO, 8 * SIZE
  1291. MUL c21, b7, c21
  1292. NMSUB c11, c21, b8, c11
  /* Row 0: only BO[0] remains. */
  1293. LD b8, BO, 0 * SIZE
  1294. MUL c11, b8, c11
  1295. #endif
  /* Write-back for the one-row, eight-column tile.
     LN steps CO1..CO8 back by one element before storing; every other
     variant steps them forward by one element after storing.
     The eight results are written twice: into the packed buffer (BO for
     LN/LT, AO for RN/RT) and into the output columns CO1..CO8. */
  1296. #ifdef LN
  1297. addi.d CO1, CO1, -1 * SIZE
  1298. addi.d CO2, CO2, -1 * SIZE
  1299. addi.d CO3, CO3, -1 * SIZE
  1300. addi.d CO4, CO4, -1 * SIZE
  1301. addi.d CO5, CO5, -1 * SIZE
  1302. addi.d CO6, CO6, -1 * SIZE
  1303. addi.d CO7, CO7, -1 * SIZE
  1304. addi.d CO8, CO8, -1 * SIZE
  1305. #endif
  1306. #if defined(LN) || defined(LT)
  1307. ST c11, BO, 0 * SIZE
  1308. ST c21, BO, 1 * SIZE
  1309. ST c31, BO, 2 * SIZE
  1310. ST c41, BO, 3 * SIZE
  1311. ST c51, BO, 4 * SIZE
  1312. ST c61, BO, 5 * SIZE
  1313. ST c71, BO, 6 * SIZE
  1314. ST c81, BO, 7 * SIZE
  1315. #else
  1316. ST c11, AO, 0 * SIZE
  1317. ST c21, AO, 1 * SIZE
  1318. ST c31, AO, 2 * SIZE
  1319. ST c41, AO, 3 * SIZE
  1320. ST c51, AO, 4 * SIZE
  1321. ST c61, AO, 5 * SIZE
  1322. ST c71, AO, 6 * SIZE
  1323. ST c81, AO, 7 * SIZE
  1324. #endif
  1325. ST c11, CO1, 0 * SIZE
  1326. ST c21, CO2, 0 * SIZE
  1327. ST c31, CO3, 0 * SIZE
  1328. ST c41, CO4, 0 * SIZE
  1329. ST c51, CO5, 0 * SIZE
  1330. ST c61, CO6, 0 * SIZE
  1331. ST c71, CO7, 0 * SIZE
  1332. ST c81, CO8, 0 * SIZE
  1333. #ifndef LN
  1334. addi.d CO1, CO1, 1 * SIZE
  1335. addi.d CO2, CO2, 1 * SIZE
  1336. addi.d CO3, CO3, 1 * SIZE
  1337. addi.d CO4, CO4, 1 * SIZE
  1338. addi.d CO5, CO5, 1 * SIZE
  1339. addi.d CO6, CO6, 1 * SIZE
  1340. addi.d CO7, CO7, 1 * SIZE
  1341. addi.d CO8, CO8, 1 * SIZE
  1342. #endif
  1343. #ifdef RT
  /* RT: move AORIG forward by K << BASE_SHIFT bytes. */
  1344. slli.d TEMP, K, BASE_SHIFT
  1345. add.d AORIG, AORIG, TEMP
  1346. #endif
  1347. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by (K-KK) << BASE_SHIFT and BO by
     (K-KK) << (3+BASE_SHIFT), i.e. skip the part of the K range that the
     loop above (bounded by KK) did not walk. */
  1348. sub.d TEMP, K, KK
  1349. slli.d L, TEMP, 0 + BASE_SHIFT
  1350. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  1351. add.d AO, AO, L
  1352. add.d BO, BO, TEMP
  1353. #endif
  1354. #ifdef LT
  1355. addi.d KK, KK, 1
  1356. #endif
  1357. #ifdef LN
  1358. addi.d KK, KK, -1
  1359. #endif
  /* .L29: bookkeeping at the end of one eight-column panel.
     LN: B moves forward by K << (3+BASE_SHIFT) bytes.
     LT/RN: B takes the current BO.
     RN/RT: KK moves by +8 / -8.
     Loops back to .L10 while J is still positive (J is updated outside
     this chunk). */
  1360. .align 3
  1361. .L29:
  1362. #ifdef LN
  1363. slli.d TEMP, K, 3 + BASE_SHIFT
  1364. add.d B, B, TEMP
  1365. #endif
  1366. #if defined(LT) || defined(RN)
  1367. move B, BO
  1368. #endif
  1369. #ifdef RN
  1370. addi.d KK, KK, 8
  1371. #endif
  1372. #ifdef RT
  1373. addi.d KK, KK, -8
  1374. #endif
  1375. blt $r0, J, .L10
  /* .L30: four-column panel, entered only when bit 2 of N is set
     (J = N & 4); otherwise control goes to .L50.
     RT: B is moved back by K << (2+BASE_SHIFT) and C by LDC << 2 first.
     CO1..CO4 are set to four consecutive columns starting at C, spaced by
     LDC. c11, c21, c31 and c41 are cleared from integer zero.
     I = M >> 1 counts two-row tiles; with I <= 0 control skips to .L40.
     KK starts at M + OFFSET (LN) or OFFSET (LT). For non-RT builds C is
     advanced past the four columns here. */
  1376. .align 3
  1377. .L30:
  1378. andi J, N, 4
  1379. move AO, A
  1380. bge $r0, J, .L50
  1381. #ifdef RT
  1382. slli.d TEMP, K, 2 + BASE_SHIFT
  1383. sub.d B, B, TEMP
  1384. slli.d TEMP, LDC, 2
  1385. sub.d C, C, TEMP
  1386. #endif
  1387. move CO1, C
  1388. MTC c11, $r0
  1389. add.d CO2, C, LDC
  1390. add.d CO3, CO2, LDC
  1391. add.d CO4, CO3, LDC
  1392. MOV c21, c11
  1393. srai.d I, M, 1
  1394. MOV c31, c11
  1395. #ifdef LN
  1396. add.d KK, M, OFFSET
  1397. #endif
  1398. #ifdef LT
  1399. move KK, OFFSET
  1400. #endif
  1401. #if defined(LN) || defined(RT)
  1402. move AORIG, A
  1403. #else
  1404. move AO, A
  1405. #endif
  1406. #ifndef RT
  1407. add.d C, CO4, LDC
  1408. #endif
  1409. MOV c41, c11
  1410. bge $r0, I, .L40
  /* .L31: set-up for one two-row by four-column tile.
     LT/RN: operands are read from the current AO and from B; BO is set to
     B, and the unrolled loop count is L = KK >> 2.
     LN/RT: (LN only) AORIG first moves back by K << (1+BASE_SHIFT); then
     AO = AORIG + (KK << (1+BASE_SHIFT)), BO = B + (KK << (2+BASE_SHIFT)),
     TEMP = K - KK and L = TEMP >> 2.
     Both paths preload a1, a3, b1..b7 and copy c11 into c12, c22, c32 and
     c42; c11 is expected to be zero on entry (cleared at .L30 and again at
     the bottom of this tile's loop). With L <= 0 the unrolled loop is
     skipped (jump to .L35). */
  1411. .L31:
  1412. #if defined(LT) || defined(RN)
  1413. LD a1, AO, 0 * SIZE
  1414. LD a3, AO, 4 * SIZE
  1415. LD b1, B, 0 * SIZE
  1416. MOV c12, c11
  1417. LD b2, B, 1 * SIZE
  1418. MOV c22, c11
  1419. LD b3, B, 2 * SIZE
  1420. MOV c32, c11
  1421. LD b4, B, 3 * SIZE
  1422. MOV c42, c11
  1423. LD b5, B, 4 * SIZE
  1424. srai.d L, KK, 2
  1425. LD b6, B, 8 * SIZE
  1426. LD b7, B, 12 * SIZE
  1427. move BO, B
  1428. bge $r0, L, .L35
  1429. #else
  1430. #ifdef LN
  1431. slli.d TEMP, K, 1 + BASE_SHIFT
  1432. sub.d AORIG, AORIG, TEMP
  1433. #endif
  1434. slli.d L, KK, 1 + BASE_SHIFT
  1435. slli.d TEMP, KK, 2 + BASE_SHIFT
  1436. add.d AO, AORIG, L
  1437. add.d BO, B, TEMP
  1438. sub.d TEMP, K, KK
  1439. LD a1, AO, 0 * SIZE
  1440. LD a3, AO, 4 * SIZE
  1441. LD b1, BO, 0 * SIZE
  1442. MOV c12, c11
  1443. LD b2, BO, 1 * SIZE
  1444. MOV c22, c11
  1445. LD b3, BO, 2 * SIZE
  1446. MOV c32, c11
  1447. LD b4, BO, 3 * SIZE
  1448. MOV c42, c11
  1449. LD b5, BO, 4 * SIZE
  1450. srai.d L, TEMP, 2
  1451. LD b6, BO, 8 * SIZE
  1452. LD b7, BO, 12 * SIZE
  1453. bge $r0, L, .L35
  1454. #endif
  /* .L32: main accumulation loop for the two-row by four-column tile,
     unrolled four steps per pass. Each pass consumes 8 elements from AO
     and 16 from BO (see the two addi.d pointer bumps) and decrements L;
     it repeats while L > 0.
     c11/c21/c31/c41 accumulate against the first A value of each step
     and c12/c22/c32/c42 against the second. Loads for later steps are
     interleaved between the MADDs, so the instruction order matters: the
     loads placed after the pointer bumps use offsets relative to the
     already advanced AO/BO.
     NOTE(review): assumes MADD d,x,y,z yields x*y + z (macro defined
     outside this chunk). */
  1455. .align 3
  1456. .L32:
  1457. MADD c11, b1, a1, c11
  1458. LD a2, AO, 1 * SIZE
  1459. MADD c21, b2, a1, c21
  1460. addi.d L, L, -1
  1461. MADD c31, b3, a1, c31
  1462. MADD c41, b4, a1, c41
  1463. LD a1, AO, 2 * SIZE
  1464. MADD c12, b1, a2, c12
  1465. LD b1, BO, 16 * SIZE
  1466. MADD c22, b2, a2, c22
  1467. LD b2, BO, 5 * SIZE
  1468. MADD c32, b3, a2, c32
  1469. LD b3, BO, 6 * SIZE
  1470. MADD c42, b4, a2, c42
  1471. LD b4, BO, 7 * SIZE
  /* Second step: A values from AO[2..3], B values from BO[4..7]. */
  1472. MADD c11, b5, a1, c11
  1473. LD a2, AO, 3 * SIZE
  1474. MADD c21, b2, a1, c21
  1475. MADD c31, b3, a1, c31
  1476. MADD c41, b4, a1, c41
  1477. LD a1, AO, 8 * SIZE
  1478. MADD c12, b5, a2, c12
  1479. LD b5, BO, 20 * SIZE
  1480. MADD c22, b2, a2, c22
  1481. LD b2, BO, 9 * SIZE
  1482. MADD c32, b3, a2, c32
  1483. LD b3, BO, 10 * SIZE
  1484. MADD c42, b4, a2, c42
  1485. LD b4, BO, 11 * SIZE
  /* Third step: A values from AO[4..5], B values from BO[8..11]. */
  1486. MADD c11, b6, a3, c11
  1487. LD a2, AO, 5 * SIZE
  1488. MADD c21, b2, a3, c21
  1489. MADD c31, b3, a3, c31
  1490. MADD c41, b4, a3, c41
  1491. LD a3, AO, 6 * SIZE
  1492. MADD c12, b6, a2, c12
  1493. LD b6, BO, 24 * SIZE
  1494. MADD c22, b2, a2, c22
  1495. LD b2, BO, 13 * SIZE
  1496. MADD c32, b3, a2, c32
  1497. LD b3, BO, 14 * SIZE
  1498. MADD c42, b4, a2, c42
  1499. LD b4, BO, 15 * SIZE
  /* Fourth step: A values from AO[6..7], B values from BO[12..15];
     AO and BO are advanced in the middle of this step. */
  1500. MADD c11, b7, a3, c11
  1501. LD a2, AO, 7 * SIZE
  1502. MADD c21, b2, a3, c21
  1503. addi.d AO, AO, 8 * SIZE
  1504. MADD c31, b3, a3, c31
  1505. addi.d BO, BO, 16 * SIZE
  1506. MADD c41, b4, a3, c41
  1507. LD a3, AO, 4 * SIZE
  1508. MADD c12, b7, a2, c12
  1509. LD b7, BO, 12 * SIZE
  1510. MADD c22, b2, a2, c22
  1511. LD b2, BO, 1 * SIZE
  1512. MADD c32, b3, a2, c32
  1513. LD b3, BO, 2 * SIZE
  1514. MADD c42, b4, a2, c42
  1515. LD b4, BO, 3 * SIZE
  1516. blt $r0, L, .L32
  /* .L35/.L36: remainder of the accumulation for the two-row by
     four-column tile. L is the low two bits of the loop length (KK for
     LT/RN, TEMP otherwise); with L <= 0 control goes straight to .L38.
     .L36 performs one step per pass: AO advances by 2 elements, BO by 4,
     and the next a1/b1..b4 are loaded for the following pass. */
  1517. .align 3
  1518. .L35:
  1519. #if defined(LT) || defined(RN)
  1520. andi L, KK, 3
  1521. #else
  1522. andi L, TEMP, 3
  1523. #endif
  1524. bge $r0, L, .L38
  1525. .align 3
  1526. .L36:
  1527. MADD c11, b1, a1, c11
  1528. LD a2, AO, 1 * SIZE
  1529. MADD c21, b2, a1, c21
  1530. addi.d L, L, -1
  1531. MADD c31, b3, a1, c31
  1532. addi.d AO, AO, 2 * SIZE
  1533. MADD c41, b4, a1, c41
  1534. LD a1, AO, 0 * SIZE
  1535. MADD c12, b1, a2, c12
  1536. LD b1, BO, 4 * SIZE
  1537. MADD c22, b2, a2, c22
  1538. LD b2, BO, 5 * SIZE
  1539. MADD c32, b3, a2, c32
  1540. LD b3, BO, 6 * SIZE
  1541. MADD c42, b4, a2, c42
  1542. LD b4, BO, 7 * SIZE
  1543. addi.d BO, BO, 4 * SIZE
  1544. blt $r0, L, .L36
  /* .L38: start of the solve step for the two-row by four-column tile.
     LN/RT: AO and BO are recomputed from AORIG and B using KK-2 (LN) or
     KK-4 (RT), shifted by 1+BASE_SHIFT and 2+BASE_SHIFT respectively.
     Eight values are loaded and combined with the accumulators via SUB.
     The pairing differs by side: from BO (LN/LT) the order is
     c11,c21,c31,c41,c12,c22,c32,c42; from AO (RN/RT) it is
     c11,c12,c21,c22,c31,c32,c41,c42.
     NOTE(review): presumably SUB d,x,y yields x - y; the macro is defined
     outside this chunk. */
  1545. .L38:
  1546. #if defined(LN) || defined(RT)
  1547. #ifdef LN
  1548. addi.d TEMP, KK, -2
  1549. #else
  1550. addi.d TEMP, KK, -4
  1551. #endif
  1552. slli.d L, TEMP, 1 + BASE_SHIFT
  1553. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1554. add.d AO, AORIG, L
  1555. add.d BO, B, TEMP
  1556. #endif
  1557. #if defined(LN) || defined(LT)
  1558. LD b1, BO, 0 * SIZE
  1559. LD b2, BO, 1 * SIZE
  1560. LD b3, BO, 2 * SIZE
  1561. LD b4, BO, 3 * SIZE
  1562. LD b5, BO, 4 * SIZE
  1563. LD b6, BO, 5 * SIZE
  1564. LD b7, BO, 6 * SIZE
  1565. LD b8, BO, 7 * SIZE
  1566. SUB c11, b1, c11
  1567. SUB c21, b2, c21
  1568. SUB c31, b3, c31
  1569. SUB c41, b4, c41
  1570. SUB c12, b5, c12
  1571. SUB c22, b6, c22
  1572. SUB c32, b7, c32
  1573. SUB c42, b8, c42
  1574. #else
  1575. LD b1, AO, 0 * SIZE
  1576. LD b2, AO, 1 * SIZE
  1577. LD b3, AO, 2 * SIZE
  1578. LD b4, AO, 3 * SIZE
  1579. LD b5, AO, 4 * SIZE
  1580. LD b6, AO, 5 * SIZE
  1581. LD b7, AO, 6 * SIZE
  1582. LD b8, AO, 7 * SIZE
  1583. SUB c11, b1, c11
  1584. SUB c12, b2, c12
  1585. SUB c21, b3, c21
  1586. SUB c22, b4, c22
  1587. SUB c31, b5, c31
  1588. SUB c32, b6, c32
  1589. SUB c41, b7, c41
  1590. SUB c42, b8, c42
  1591. #endif
  /* Left-side solve of the 2x2 block held at AO[0..3], applied to all four
     columns at once.
     LN: second row first - scale c?2 by AO[3], remove its AO[2] multiple
     from c?1, then scale c?1 by AO[0].
     LT: first row first - scale c?1 by AO[0], remove its AO[1] multiple
     from c?2, then scale c?2 by AO[3].
     NOTE(review): assumes NMSUB d,x,y,z yields z - x*y and that AO[0] and
     AO[3] hold pre-inverted diagonal entries - confirm outside this chunk. */
  1592. #ifdef LN
  1593. LD b1, AO, 3 * SIZE
  1594. LD b2, AO, 2 * SIZE
  1595. LD b3, AO, 0 * SIZE
  1596. MUL c12, b1, c12
  1597. MUL c22, b1, c22
  1598. MUL c32, b1, c32
  1599. MUL c42, b1, c42
  1600. NMSUB c11, c12, b2, c11
  1601. NMSUB c21, c22, b2, c21
  1602. NMSUB c31, c32, b2, c31
  1603. NMSUB c41, c42, b2, c41
  1604. MUL c11, b3, c11
  1605. MUL c21, b3, c21
  1606. MUL c31, b3, c31
  1607. MUL c41, b3, c41
  1608. #endif
  1609. #ifdef LT
  1610. LD b1, AO, 0 * SIZE
  1611. LD b2, AO, 1 * SIZE
  1612. LD b3, AO, 3 * SIZE
  1613. MUL c11, b1, c11
  1614. MUL c21, b1, c21
  1615. MUL c31, b1, c31
  1616. MUL c41, b1, c41
  1617. NMSUB c12, c11, b2, c12
  1618. NMSUB c22, c21, b2, c22
  1619. NMSUB c32, c31, b2, c32
  1620. NMSUB c42, c41, b2, c42
  1621. MUL c12, b3, c12
  1622. MUL c22, b3, c22
  1623. MUL c32, b3, c32
  1624. MUL c42, b3, c42
  1625. #endif
  /* Right-side solve with the 4x4 block at BO (row stride 4), applied to
     both rows of the tile in parallel (c?1 and c?2).
     RN: column 1 to column 4; step i multiplies by BO[5*i] and updates the
     later columns with BO[4*i+j], j > i.
     RT: column 4 down to column 1; step i multiplies by BO[5*i] and
     updates the earlier columns with BO[4*i+j], j < i.
     NOTE(review): assumes NMSUB d,x,y,z yields z - x*y and that BO[5*i]
     holds a pre-inverted diagonal entry - confirm outside this chunk. */
  1626. #ifdef RN
  1627. LD b1, BO, 0 * SIZE
  1628. LD b2, BO, 1 * SIZE
  1629. LD b3, BO, 2 * SIZE
  1630. LD b4, BO, 3 * SIZE
  1631. MUL c11, b1, c11
  1632. MUL c12, b1, c12
  1633. NMSUB c21, c11, b2, c21
  1634. NMSUB c22, c12, b2, c22
  1635. NMSUB c31, c11, b3, c31
  1636. NMSUB c32, c12, b3, c32
  1637. NMSUB c41, c11, b4, c41
  1638. NMSUB c42, c12, b4, c42
  1639. LD b2, BO, 5 * SIZE
  1640. LD b3, BO, 6 * SIZE
  1641. LD b4, BO, 7 * SIZE
  1642. MUL c21, b2, c21
  1643. MUL c22, b2, c22
  1644. NMSUB c31, c21, b3, c31
  1645. NMSUB c32, c22, b3, c32
  1646. NMSUB c41, c21, b4, c41
  1647. NMSUB c42, c22, b4, c42
  1648. LD b3, BO, 10 * SIZE
  1649. LD b4, BO, 11 * SIZE
  1650. MUL c31, b3, c31
  1651. MUL c32, b3, c32
  1652. NMSUB c41, c31, b4, c41
  1653. NMSUB c42, c32, b4, c42
  1654. LD b4, BO, 15 * SIZE
  1655. MUL c41, b4, c41
  1656. MUL c42, b4, c42
  1657. #endif
  1658. #ifdef RT
  1659. LD b5, BO, 15 * SIZE
  1660. LD b6, BO, 14 * SIZE
  1661. LD b7, BO, 13 * SIZE
  1662. LD b8, BO, 12 * SIZE
  1663. MUL c41, b5, c41
  1664. MUL c42, b5, c42
  1665. NMSUB c31, c41, b6, c31
  1666. NMSUB c32, c42, b6, c32
  1667. NMSUB c21, c41, b7, c21
  1668. NMSUB c22, c42, b7, c22
  1669. NMSUB c11, c41, b8, c11
  1670. NMSUB c12, c42, b8, c12
  1671. LD b6, BO, 10 * SIZE
  1672. LD b7, BO, 9 * SIZE
  1673. LD b8, BO, 8 * SIZE
  1674. MUL c31, b6, c31
  1675. MUL c32, b6, c32
  1676. NMSUB c21, c31, b7, c21
  1677. NMSUB c22, c32, b7, c22
  1678. NMSUB c11, c31, b8, c11
  1679. NMSUB c12, c32, b8, c12
  1680. LD b7, BO, 5 * SIZE
  1681. LD b8, BO, 4 * SIZE
  1682. MUL c21, b7, c21
  1683. MUL c22, b7, c22
  1684. NMSUB c11, c21, b8, c11
  1685. NMSUB c12, c22, b8, c12
  1686. LD b8, BO, 0 * SIZE
  1687. MUL c11, b8, c11
  1688. MUL c12, b8, c12
  1689. #endif
  /* Write-back for the two-row by four-column tile, then loop control.
     LN steps CO1..CO4 back by two elements before storing; other variants
     step forward by two after storing.
     Results go to the packed buffer (BO for LN/LT in column-group order,
     AO for RN/RT in row-pair order) and to CO1..CO4, two values each.
     RT then moves AORIG forward by K << (1+BASE_SHIFT); LT/RN skip the
     remaining K-KK steps in AO and BO; LT/LN adjust KK by +2 / -2.
     Finally c11, c21, c31 and c41 are cleared again (a1 is used as the
     zero source), I is decremented and control returns to .L31 while
     I > 0. */
  1690. #ifdef LN
  1691. addi.d CO1, CO1, -2 * SIZE
  1692. addi.d CO2, CO2, -2 * SIZE
  1693. addi.d CO3, CO3, -2 * SIZE
  1694. addi.d CO4, CO4, -2 * SIZE
  1695. #endif
  1696. #if defined(LN) || defined(LT)
  1697. ST c11, BO, 0 * SIZE
  1698. ST c21, BO, 1 * SIZE
  1699. ST c31, BO, 2 * SIZE
  1700. ST c41, BO, 3 * SIZE
  1701. ST c12, BO, 4 * SIZE
  1702. ST c22, BO, 5 * SIZE
  1703. ST c32, BO, 6 * SIZE
  1704. ST c42, BO, 7 * SIZE
  1705. #else
  1706. ST c11, AO, 0 * SIZE
  1707. ST c12, AO, 1 * SIZE
  1708. ST c21, AO, 2 * SIZE
  1709. ST c22, AO, 3 * SIZE
  1710. ST c31, AO, 4 * SIZE
  1711. ST c32, AO, 5 * SIZE
  1712. ST c41, AO, 6 * SIZE
  1713. ST c42, AO, 7 * SIZE
  1714. #endif
  1715. ST c11, CO1, 0 * SIZE
  1716. ST c12, CO1, 1 * SIZE
  1717. ST c21, CO2, 0 * SIZE
  1718. ST c22, CO2, 1 * SIZE
  1719. ST c31, CO3, 0 * SIZE
  1720. ST c32, CO3, 1 * SIZE
  1721. ST c41, CO4, 0 * SIZE
  1722. ST c42, CO4, 1 * SIZE
  1723. #ifndef LN
  1724. addi.d CO1, CO1, 2 * SIZE
  1725. addi.d CO2, CO2, 2 * SIZE
  1726. addi.d CO3, CO3, 2 * SIZE
  1727. addi.d CO4, CO4, 2 * SIZE
  1728. #endif
  1729. #ifdef RT
  1730. slli.d TEMP, K, 1 + BASE_SHIFT
  1731. add.d AORIG, AORIG, TEMP
  1732. #endif
  1733. #if defined(LT) || defined(RN)
  1734. sub.d TEMP, K, KK
  1735. slli.d L, TEMP, 1 + BASE_SHIFT
  1736. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1737. add.d AO, AO, L
  1738. add.d BO, BO, TEMP
  1739. #endif
  1740. #ifdef LT
  1741. addi.d KK, KK, 2
  1742. #endif
  1743. #ifdef LN
  1744. addi.d KK, KK, -2
  1745. #endif
  1746. MTC a1, $r0
  1747. MOV c11, a1
  1748. MOV c21, a1
  1749. MOV c31, a1
  1750. addi.d I, I, -1
  1751. MOV c41, c11
  1752. blt $r0, I, .L31
  /* .L40: leftover single row of the four-column panel, taken only when
     M is odd (I = M & 1); otherwise control goes to .L49.
     Set-up mirrors the two-row tile but with a one-element A stride:
     LT/RN use the current AO and B with L = KK >> 2; LN/RT recompute
     AO = AORIG + (KK << BASE_SHIFT) and BO = B + (KK << (2+BASE_SHIFT)),
     with TEMP = K - KK and L = TEMP >> 2 (LN first moves AORIG back by
     K << BASE_SHIFT).
     c61, c71 and c81 are copied from c11 here; nothing in the visible
     single-row path below reads them. */
  1753. .align 3
  1754. .L40:
  1755. andi I, M, 1
  1756. MOV c61, c11
  1757. bge $r0, I, .L49
  1758. #if defined(LT) || defined(RN)
  1759. LD a1, AO, 0 * SIZE
  1760. MOV c71, c11
  1761. LD a2, AO, 1 * SIZE
  1762. MOV c81, c11
  1763. LD b1, B, 0 * SIZE
  1764. LD b2, B, 1 * SIZE
  1765. LD b3, B, 2 * SIZE
  1766. LD b4, B, 3 * SIZE
  1767. LD b5, B, 4 * SIZE
  1768. LD b6, B, 8 * SIZE
  1769. LD b7, B, 12 * SIZE
  1770. srai.d L, KK, 2
  1771. move BO, B
  1772. bge $r0, L, .L45
  1773. #else
  1774. #ifdef LN
  1775. slli.d TEMP, K, BASE_SHIFT
  1776. sub.d AORIG, AORIG, TEMP
  1777. #endif
  1778. slli.d L, KK, 0 + BASE_SHIFT
  1779. slli.d TEMP, KK, 2 + BASE_SHIFT
  1780. add.d AO, AORIG, L
  1781. add.d BO, B, TEMP
  1782. sub.d TEMP, K, KK
  1783. LD a1, AO, 0 * SIZE
  1784. MOV c71, c11
  1785. LD a2, AO, 1 * SIZE
  1786. MOV c81, c11
  1787. LD b1, BO, 0 * SIZE
  1788. LD b2, BO, 1 * SIZE
  1789. LD b3, BO, 2 * SIZE
  1790. LD b4, BO, 3 * SIZE
  1791. LD b5, BO, 4 * SIZE
  1792. LD b6, BO, 8 * SIZE
  1793. LD b7, BO, 12 * SIZE
  1794. srai.d L, TEMP, 2
  1795. bge $r0, L, .L45
  1796. #endif
  /* .L42: accumulation loop for the one-row by four-column tile, unrolled
     four steps per pass. Each pass consumes 4 elements from AO and 16 from
     BO and decrements L; it repeats while L > 0.
     a1 feeds the first step and a2 is reloaded for each of the other
     three. Because AO and BO are bumped mid-pass, the loads that follow a
     bump use offsets relative to the new pointer (for example
     "AO, -1 * SIZE" is element 3 of the pass).
     NOTE(review): assumes MADD d,x,y,z yields x*y + z (macro defined
     outside this chunk). */
  1797. .align 3
  1798. .L42:
  1799. MADD c11, b1, a1, c11
  1800. LD b1, BO, 16 * SIZE
  1801. MADD c21, b2, a1, c21
  1802. LD b2, BO, 5 * SIZE
  1803. MADD c31, b3, a1, c31
  1804. LD b3, BO, 6 * SIZE
  1805. MADD c41, b4, a1, c41
  1806. LD b4, BO, 7 * SIZE
  1807. LD a1, AO, 4 * SIZE
  1808. addi.d L, L, -1
  1809. MADD c11, b5, a2, c11
  1810. LD b5, BO, 20 * SIZE
  1811. MADD c21, b2, a2, c21
  1812. LD b2, BO, 9 * SIZE
  1813. MADD c31, b3, a2, c31
  1814. LD b3, BO, 10 * SIZE
  1815. MADD c41, b4, a2, c41
  1816. LD b4, BO, 11 * SIZE
  1817. LD a2, AO, 2 * SIZE
  1818. addi.d AO, AO, 4 * SIZE
  1819. MADD c11, b6, a2, c11
  1820. LD b6, BO, 24 * SIZE
  1821. MADD c21, b2, a2, c21
  1822. LD b2, BO, 13 * SIZE
  1823. MADD c31, b3, a2, c31
  1824. LD b3, BO, 14 * SIZE
  1825. MADD c41, b4, a2, c41
  1826. LD b4, BO, 15 * SIZE
  1827. LD a2, AO, -1 * SIZE
  1828. addi.d BO, BO, 16 * SIZE
  1829. MADD c11, b7, a2, c11
  1830. LD b7, BO, 12 * SIZE
  1831. MADD c21, b2, a2, c21
  1832. LD b2, BO, 1 * SIZE
  1833. MADD c31, b3, a2, c31
  1834. LD b3, BO, 2 * SIZE
  1835. MADD c41, b4, a2, c41
  1836. LD b4, BO, 3 * SIZE
  1837. LD a2, AO, 1 * SIZE
  1838. blt $r0, L, .L42
  /* .L45/.L46: remainder of the accumulation for the one-row by
     four-column tile. L is the low two bits of the loop length (KK for
     LT/RN, TEMP otherwise); with L <= 0 control goes to .L48.
     .L46 performs one step per pass: AO advances by 1 element and BO by 4.
     The "MOV a2, a2" copies a register onto itself and has no visible
     effect on the results. */
  1839. .align 3
  1840. .L45:
  1841. #if defined(LT) || defined(RN)
  1842. andi L, KK, 3
  1843. #else
  1844. andi L, TEMP, 3
  1845. #endif
  1846. bge $r0, L, .L48
  1847. .align 3
  1848. .L46:
  1849. MADD c11, b1, a1, c11
  1850. LD b1, BO, 4 * SIZE
  1851. MADD c21, b2, a1, c21
  1852. LD b2, BO, 5 * SIZE
  1853. MADD c31, b3, a1, c31
  1854. LD b3, BO, 6 * SIZE
  1855. MADD c41, b4, a1, c41
  1856. LD a1, AO, 1 * SIZE
  1857. LD b4, BO, 7 * SIZE
  1858. addi.d L, L, -1
  1859. addi.d AO, AO, 1 * SIZE
  1860. MOV a2, a2
  1861. addi.d BO, BO, 4 * SIZE
  1862. blt $r0, L, .L46
  /* .L48: start of the solve step for the one-row by four-column tile.
     LN/RT: AO and BO are recomputed from AORIG and B using KK-1 (LN) or
     KK-4 (RT), shifted by BASE_SHIFT and 2+BASE_SHIFT respectively.
     Four values are loaded from BO (LN/LT) or AO (RN/RT) and combined with
     c11..c41 via SUB; for LN/LT all four results are then multiplied by
     the single value at AO[0].
     NOTE(review): presumably SUB d,x,y yields x - y and AO[0] holds a
     pre-inverted diagonal entry - confirm outside this chunk. */
  1863. .L48:
  1864. #if defined(LN) || defined(RT)
  1865. #ifdef LN
  1866. addi.d TEMP, KK, -1
  1867. #else
  1868. addi.d TEMP, KK, -4
  1869. #endif
  1870. slli.d L, TEMP, 0 + BASE_SHIFT
  1871. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1872. add.d AO, AORIG, L
  1873. add.d BO, B, TEMP
  1874. #endif
  1875. #if defined(LN) || defined(LT)
  1876. LD b1, BO, 0 * SIZE
  1877. LD b2, BO, 1 * SIZE
  1878. LD b3, BO, 2 * SIZE
  1879. LD b4, BO, 3 * SIZE
  1880. SUB c11, b1, c11
  1881. SUB c21, b2, c21
  1882. SUB c31, b3, c31
  1883. SUB c41, b4, c41
  1884. #else
  1885. LD b1, AO, 0 * SIZE
  1886. LD b2, AO, 1 * SIZE
  1887. LD b3, AO, 2 * SIZE
  1888. LD b4, AO, 3 * SIZE
  1889. SUB c11, b1, c11
  1890. SUB c21, b2, c21
  1891. SUB c31, b3, c31
  1892. SUB c41, b4, c41
  1893. #endif
  1894. #if defined(LN) || defined(LT)
  1895. LD b1, AO, 0 * SIZE
  1896. MUL c11, b1, c11
  1897. MUL c21, b1, c21
  1898. MUL c31, b1, c31
  1899. MUL c41, b1, c41
  1900. #endif
  /* Right-side solve with the 4x4 block at BO (row stride 4) for the
     single-row tile c11..c41.
     RN walks column 1 to 4 using BO[0..3], BO[5..7], BO[10..11], BO[15].
     RT walks column 4 down to 1 using BO[15..12], BO[10..8], BO[5..4],
     BO[0].
     NOTE(review): assumes NMSUB d,x,y,z yields z - x*y and that BO[5*i]
     holds a pre-inverted diagonal entry - confirm outside this chunk. */
  1901. #ifdef RN
  1902. LD b1, BO, 0 * SIZE
  1903. LD b2, BO, 1 * SIZE
  1904. LD b3, BO, 2 * SIZE
  1905. LD b4, BO, 3 * SIZE
  1906. MUL c11, b1, c11
  1907. NMSUB c21, c11, b2, c21
  1908. NMSUB c31, c11, b3, c31
  1909. NMSUB c41, c11, b4, c41
  1910. LD b2, BO, 5 * SIZE
  1911. LD b3, BO, 6 * SIZE
  1912. LD b4, BO, 7 * SIZE
  1913. MUL c21, b2, c21
  1914. NMSUB c31, c21, b3, c31
  1915. NMSUB c41, c21, b4, c41
  1916. LD b3, BO, 10 * SIZE
  1917. LD b4, BO, 11 * SIZE
  1918. MUL c31, b3, c31
  1919. NMSUB c41, c31, b4, c41
  1920. LD b4, BO, 15 * SIZE
  1921. MUL c41, b4, c41
  1922. #endif
  1923. #ifdef RT
  1924. LD b5, BO, 15 * SIZE
  1925. LD b6, BO, 14 * SIZE
  1926. LD b7, BO, 13 * SIZE
  1927. LD b8, BO, 12 * SIZE
  1928. MUL c41, b5, c41
  1929. NMSUB c31, c41, b6, c31
  1930. NMSUB c21, c41, b7, c21
  1931. NMSUB c11, c41, b8, c11
  1932. LD b6, BO, 10 * SIZE
  1933. LD b7, BO, 9 * SIZE
  1934. LD b8, BO, 8 * SIZE
  1935. MUL c31, b6, c31
  1936. NMSUB c21, c31, b7, c21
  1937. NMSUB c11, c31, b8, c11
  1938. LD b7, BO, 5 * SIZE
  1939. LD b8, BO, 4 * SIZE
  1940. MUL c21, b7, c21
  1941. NMSUB c11, c21, b8, c11
  1942. LD b8, BO, 0 * SIZE
  1943. MUL c11, b8, c11
  1944. #endif
  /* Write-back for the one-row by four-column tile.
     LN steps CO1..CO4 back by one element before storing; other variants
     step forward by one after storing. The four results are written to
     the packed buffer (BO for LN/LT, AO for RN/RT) and to CO1..CO4.
     RT then moves AORIG forward by K << BASE_SHIFT; LT/RN skip the
     remaining K-KK steps in AO and BO; LT/LN adjust KK by +1 / -1. */
  1945. #ifdef LN
  1946. addi.d CO1, CO1, -1 * SIZE
  1947. addi.d CO2, CO2, -1 * SIZE
  1948. addi.d CO3, CO3, -1 * SIZE
  1949. addi.d CO4, CO4, -1 * SIZE
  1950. #endif
  1951. #if defined(LN) || defined(LT)
  1952. ST c11, BO, 0 * SIZE
  1953. ST c21, BO, 1 * SIZE
  1954. ST c31, BO, 2 * SIZE
  1955. ST c41, BO, 3 * SIZE
  1956. #else
  1957. ST c11, AO, 0 * SIZE
  1958. ST c21, AO, 1 * SIZE
  1959. ST c31, AO, 2 * SIZE
  1960. ST c41, AO, 3 * SIZE
  1961. #endif
  1962. ST c11, CO1, 0 * SIZE
  1963. ST c21, CO2, 0 * SIZE
  1964. ST c31, CO3, 0 * SIZE
  1965. ST c41, CO4, 0 * SIZE
  1966. #ifndef LN
  1967. addi.d CO1, CO1, 1 * SIZE
  1968. addi.d CO2, CO2, 1 * SIZE
  1969. addi.d CO3, CO3, 1 * SIZE
  1970. addi.d CO4, CO4, 1 * SIZE
  1971. #endif
  1972. #ifdef RT
  1973. slli.d TEMP, K, BASE_SHIFT
  1974. add.d AORIG, AORIG, TEMP
  1975. #endif
  1976. #if defined(LT) || defined(RN)
  1977. sub.d TEMP, K, KK
  1978. slli.d L, TEMP, 0 + BASE_SHIFT
  1979. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1980. add.d AO, AO, L
  1981. add.d BO, BO, TEMP
  1982. #endif
  1983. #ifdef LT
  1984. addi.d KK, KK, 1
  1985. #endif
  1986. #ifdef LN
  1987. addi.d KK, KK, -1
  1988. #endif
  /* .L49: bookkeeping at the end of the four-column panel.
     LN: B moves forward by K << (2+BASE_SHIFT) bytes.
     LT/RN: B takes the current BO.
     RN/RT: KK moves by +4 / -4.
     Falls through to the two-column panel at .L50. */
  1989. .align 3
  1990. .L49:
  1991. #ifdef LN
  1992. slli.d TEMP, K, 2 + BASE_SHIFT
  1993. add.d B, B, TEMP
  1994. #endif
  1995. #if defined(LT) || defined(RN)
  1996. move B, BO
  1997. #endif
  1998. #ifdef RN
  1999. addi.d KK, KK, 4
  2000. #endif
  2001. #ifdef RT
  2002. addi.d KK, KK, -4
  2003. #endif
  /* .L50: two-column panel, entered only when bit 1 of N is set
     (J = N & 2); otherwise control goes to .L70.
     RT: TEMP = K << (1+BASE_SHIFT) is computed before the branch and used
     after it to move B back; C moves back by LDC << 1.
     CO1 and CO2 are set to two consecutive columns starting at C.
     KK starts at M + OFFSET (LN) or OFFSET (LT). For non-RT builds C is
     advanced past the two columns. I = M >> 1 counts two-row tiles; with
     I <= 0 control skips to .L60.
     Note: AO is assigned from A more than once on some build variants;
     the repeated moves are redundant as written. */
  2004. .align 3
  2005. .L50:
  2006. andi J, N, 2
  2007. #ifdef RT
  2008. slli.d TEMP, K, 1 + BASE_SHIFT
  2009. #else
  2010. move AO, A
  2011. #endif
  2012. bge $r0, J, .L70
  2013. #ifdef RT
  2014. sub.d B, B, TEMP
  2015. slli.d TEMP, LDC, 1
  2016. sub.d C, C, TEMP
  2017. #endif
  2018. move AO, A
  2019. move CO1, C
  2020. add.d CO2, C, LDC
  2021. #ifdef LN
  2022. add.d KK, M, OFFSET
  2023. #endif
  2024. #ifdef LT
  2025. move KK, OFFSET
  2026. #endif
  2027. #if defined(LN) || defined(RT)
  2028. move AORIG, A
  2029. #else
  2030. move AO, A
  2031. #endif
  2032. #ifndef RT
  2033. add.d C, CO2, LDC
  2034. #endif
  2035. srai.d I, M, 1
  2036. bge $r0, I, .L60
  /* .L51: set-up for one two-row by two-column tile.
     LT/RN: operands come from the current AO and from B; BO is set to B
     and L = KK >> 2.
     LN/RT: (LN only) AORIG first moves back by K << (1+BASE_SHIFT); then
     AO = AORIG + (KK << (1+BASE_SHIFT)), BO = B + (KK << (1+BASE_SHIFT)),
     TEMP = K - KK and L = TEMP >> 2.
     Both paths clear c11 from integer zero, copy it into c21, c12 and c22,
     and preload a1, a2, a5, b1, b2, b3, b5, b6 and b7. With L <= 0 the
     unrolled loop is skipped (jump to .L55). */
  2037. .L51:
  2038. #if defined(LT) || defined(RN)
  2039. LD a1, AO, 0 * SIZE
  2040. MTC c11, $r0
  2041. LD a2, AO, 1 * SIZE
  2042. MOV c21, c11
  2043. LD a5, AO, 4 * SIZE
  2044. LD b1, B, 0 * SIZE
  2045. MOV c12, c11
  2046. LD b2, B, 1 * SIZE
  2047. MOV c22, c11
  2048. LD b3, B, 2 * SIZE
  2049. LD b5, B, 4 * SIZE
  2050. srai.d L, KK, 2
  2051. LD b6, B, 8 * SIZE
  2052. LD b7, B, 12 * SIZE
  2053. move BO, B
  2054. bge $r0, L, .L55
  2055. #else
  2056. #ifdef LN
  2057. slli.d TEMP, K, 1 + BASE_SHIFT
  2058. sub.d AORIG, AORIG, TEMP
  2059. #endif
  2060. slli.d L, KK, 1 + BASE_SHIFT
  2061. slli.d TEMP, KK, 1 + BASE_SHIFT
  2062. add.d AO, AORIG, L
  2063. add.d BO, B, TEMP
  2064. sub.d TEMP, K, KK
  2065. LD a1, AO, 0 * SIZE
  2066. MTC c11, $r0
  2067. LD a2, AO, 1 * SIZE
  2068. MOV c21, c11
  2069. LD a5, AO, 4 * SIZE
  2070. LD b1, BO, 0 * SIZE
  2071. MOV c12, c11
  2072. LD b2, BO, 1 * SIZE
  2073. MOV c22, c11
  2074. LD b3, BO, 2 * SIZE
  2075. LD b5, BO, 4 * SIZE
  2076. srai.d L, TEMP, 2
  2077. LD b6, BO, 8 * SIZE
  2078. LD b7, BO, 12 * SIZE
  2079. bge $r0, L, .L55
  2080. #endif
  /* .L52: accumulation loop for the two-row by two-column tile, unrolled
     four steps per pass. Each pass consumes 8 elements from both AO and BO
     (pointer bumps at the bottom) and decrements L; it repeats while
     L > 0.
     c11/c21 accumulate against the first A value of each step and
     c12/c22 against the second. Loads with offsets of 8 and above fetch
     operands for the next pass before the pointers are advanced.
     NOTE(review): assumes MADD d,x,y,z yields x*y + z (macro defined
     outside this chunk). */
  2081. .align 3
  2082. .L52:
  2083. MADD c11, b1, a1, c11
  2084. LD a3, AO, 2 * SIZE
  2085. MADD c21, b2, a1, c21
  2086. LD b4, BO, 3 * SIZE
  2087. MADD c12, b1, a2, c12
  2088. LD a4, AO, 3 * SIZE
  2089. MADD c22, b2, a2, c22
  2090. LD b1, BO, 8 * SIZE
  2091. MADD c11, b3, a3, c11
  2092. LD a1, AO, 8 * SIZE
  2093. MADD c21, b4, a3, c21
  2094. LD b2, BO, 5 * SIZE
  2095. MADD c12, b3, a4, c12
  2096. LD a2, AO, 5 * SIZE
  2097. MADD c22, b4, a4, c22
  2098. LD b3, BO, 6 * SIZE
  2099. MADD c11, b5, a5, c11
  2100. LD a3, AO, 6 * SIZE
  2101. MADD c21, b2, a5, c21
  2102. LD b4, BO, 7 * SIZE
  2103. MADD c12, b5, a2, c12
  2104. LD a4, AO, 7 * SIZE
  2105. MADD c22, b2, a2, c22
  2106. LD b5, BO, 12 * SIZE
  2107. MADD c11, b3, a3, c11
  2108. LD a5, AO, 12 * SIZE
  2109. MADD c21, b4, a3, c21
  2110. LD b2, BO, 9 * SIZE
  2111. MADD c12, b3, a4, c12
  2112. LD a2, AO, 9 * SIZE
  2113. MADD c22, b4, a4, c22
  2114. LD b3, BO, 10 * SIZE
  2115. addi.d AO, AO, 8 * SIZE
  2116. addi.d L, L, -1
  2117. addi.d BO, BO, 8 * SIZE
  2118. blt $r0, L, .L52
  /* .L55/.L56: remainder of the accumulation for the two-row by
     two-column tile. L is the low two bits of the loop length (KK for
     LT/RN, TEMP otherwise); with L <= 0 control goes to .L58.
     .L56 performs one step per pass: AO and BO each advance by 2 elements,
     and a1/b1/b2 are reloaded for the following pass. */
  2119. .align 3
  2120. .L55:
  2121. #if defined(LT) || defined(RN)
  2122. andi L, KK, 3
  2123. #else
  2124. andi L, TEMP, 3
  2125. #endif
  2126. bge $r0, L, .L58
  2127. .align 3
  2128. .L56:
  2129. MADD c11, b1, a1, c11
  2130. LD a2, AO, 1 * SIZE
  2131. MADD c21, b2, a1, c21
  2132. LD a1, AO, 2 * SIZE
  2133. MADD c12, b1, a2, c12
  2134. LD b1, BO, 2 * SIZE
  2135. MADD c22, b2, a2, c22
  2136. LD b2, BO, 3 * SIZE
  2137. addi.d L, L, -1
  2138. addi.d AO, AO, 2 * SIZE
  2139. addi.d BO, BO, 2 * SIZE
  2140. blt $r0, L, .L56
  /* .L58: solve step for the two-row by two-column tile.
     LN/RT: AO and BO are recomputed from AORIG and B using KK-2; both
     arms of the inner #ifdef apply the same -2 because the tile is two
     rows by two columns. Both pointers use a 1+BASE_SHIFT shift.
     Four values are loaded from BO (LN/LT, order c11,c21,c12,c22) or AO
     (RN/RT, order c11,c12,c21,c22) and combined with the accumulators via
     SUB. One of four 2x2 solves follows, each reading elements 0, 3 and
     either 1 or 2 of the packed 2x2 block:
       LN: AO[3], AO[2], AO[0]  - second row first
       LT: AO[0], AO[1], AO[3]  - first row first
       RN: BO[0], BO[1], BO[3]  - first column first
       RT: BO[3], BO[2], BO[0]  - second column first
     NOTE(review): assumes SUB d,x,y yields x - y, NMSUB d,x,y,z yields
     z - x*y, and that elements 0 and 3 hold pre-inverted diagonal
     entries - confirm outside this chunk. */
  2141. .L58:
  2142. #if defined(LN) || defined(RT)
  2143. #ifdef LN
  2144. addi.d TEMP, KK, -2
  2145. #else
  2146. addi.d TEMP, KK, -2
  2147. #endif
  2148. slli.d L, TEMP, 1 + BASE_SHIFT
  2149. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2150. add.d AO, AORIG, L
  2151. add.d BO, B, TEMP
  2152. #endif
  2153. #if defined(LN) || defined(LT)
  2154. LD b1, BO, 0 * SIZE
  2155. LD b2, BO, 1 * SIZE
  2156. LD b3, BO, 2 * SIZE
  2157. LD b4, BO, 3 * SIZE
  2158. SUB c11, b1, c11
  2159. SUB c21, b2, c21
  2160. SUB c12, b3, c12
  2161. SUB c22, b4, c22
  2162. #else
  2163. LD b1, AO, 0 * SIZE
  2164. LD b2, AO, 1 * SIZE
  2165. LD b3, AO, 2 * SIZE
  2166. LD b4, AO, 3 * SIZE
  2167. SUB c11, b1, c11
  2168. SUB c12, b2, c12
  2169. SUB c21, b3, c21
  2170. SUB c22, b4, c22
  2171. #endif
  2172. #ifdef LN
  2173. LD b1, AO, 3 * SIZE
  2174. LD b2, AO, 2 * SIZE
  2175. LD b3, AO, 0 * SIZE
  2176. MUL c12, b1, c12
  2177. MUL c22, b1, c22
  2178. NMSUB c11, c12, b2, c11
  2179. NMSUB c21, c22, b2, c21
  2180. MUL c11, b3, c11
  2181. MUL c21, b3, c21
  2182. #endif
  2183. #ifdef LT
  2184. LD b1, AO, 0 * SIZE
  2185. LD b2, AO, 1 * SIZE
  2186. LD b3, AO, 3 * SIZE
  2187. MUL c11, b1, c11
  2188. MUL c21, b1, c21
  2189. NMSUB c12, c11, b2, c12
  2190. NMSUB c22, c21, b2, c22
  2191. MUL c12, b3, c12
  2192. MUL c22, b3, c22
  2193. #endif
  2194. #ifdef RN
  2195. LD b1, BO, 0 * SIZE
  2196. LD b2, BO, 1 * SIZE
  2197. LD b3, BO, 3 * SIZE
  2198. MUL c11, b1, c11
  2199. MUL c12, b1, c12
  2200. NMSUB c21, c11, b2, c21
  2201. NMSUB c22, c12, b2, c22
  2202. MUL c21, b3, c21
  2203. MUL c22, b3, c22
  2204. #endif
  2205. #ifdef RT
  2206. LD b1, BO, 3 * SIZE
  2207. LD b2, BO, 2 * SIZE
  2208. LD b3, BO, 0 * SIZE
  2209. MUL c21, b1, c21
  2210. MUL c22, b1, c22
  2211. NMSUB c11, c21, b2, c11
  2212. NMSUB c12, c22, b2, c12
  2213. MUL c11, b3, c11
  2214. MUL c12, b3, c12
  2215. #endif
  /* Write-back for the two-row by two-column tile, then loop control.
     LN steps CO1/CO2 back by two elements before storing; other variants
     step forward by two after storing. Results go to the packed buffer
     (BO for LN/LT, AO for RN/RT) and to CO1/CO2, two values each.
     RT then moves AORIG forward by K << (1+BASE_SHIFT); LT/RN advance both
     AO and BO by the same (K-KK) << (1+BASE_SHIFT); LT/LN adjust KK by
     +2 / -2.
     c11, c21, c31 and c41 are cleared (a1 is used as the zero source),
     I is decremented and control returns to .L51 while I > 0. */
  2216. #ifdef LN
  2217. addi.d CO1, CO1, -2 * SIZE
  2218. addi.d CO2, CO2, -2 * SIZE
  2219. #endif
  2220. #if defined(LN) || defined(LT)
  2221. ST c11, BO, 0 * SIZE
  2222. ST c21, BO, 1 * SIZE
  2223. ST c12, BO, 2 * SIZE
  2224. ST c22, BO, 3 * SIZE
  2225. #else
  2226. ST c11, AO, 0 * SIZE
  2227. ST c12, AO, 1 * SIZE
  2228. ST c21, AO, 2 * SIZE
  2229. ST c22, AO, 3 * SIZE
  2230. #endif
  2231. ST c11, CO1, 0 * SIZE
  2232. ST c12, CO1, 1 * SIZE
  2233. ST c21, CO2, 0 * SIZE
  2234. ST c22, CO2, 1 * SIZE
  2235. #ifndef LN
  2236. addi.d CO1, CO1, 2 * SIZE
  2237. addi.d CO2, CO2, 2 * SIZE
  2238. #endif
  2239. #ifdef RT
  2240. slli.d TEMP, K, 1 + BASE_SHIFT
  2241. add.d AORIG, AORIG, TEMP
  2242. #endif
  2243. #if defined(LT) || defined(RN)
  2244. sub.d TEMP, K, KK
  2245. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2246. add.d AO, AO, TEMP
  2247. add.d BO, BO, TEMP
  2248. #endif
  2249. #ifdef LT
  2250. addi.d KK, KK, 2
  2251. #endif
  2252. #ifdef LN
  2253. addi.d KK, KK, -2
  2254. #endif
  2255. MTC a1, $r0
  2256. MOV c11, a1
  2257. MOV c21, a1
  2258. MOV c31, a1
  2259. addi.d I, I, -1
  2260. MOV c41, c11
  2261. blt $r0, I, .L51
  /* .L60: leftover single row of the two-column panel, taken only when
     M is odd (I = M & 1); otherwise control goes to .L69.
     LT/RN: operands come from the current AO and from B; BO is set to B
     and L = KK >> 2.
     LN/RT: (LN only) AORIG first moves back by K << BASE_SHIFT; then
     AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (1+BASE_SHIFT)),
     TEMP = K - KK and L = TEMP >> 2.
     Four accumulators c11, c21, c31 and c41 are cleared; the loop below
     uses c31/c41 as a second accumulator pair that is merged at .L68.
     With L <= 0 the unrolled loop is skipped (jump to .L65). */
  2262. .align 3
  2263. .L60:
  2264. andi I, M, 1
  2265. bge $r0, I, .L69
  2266. #if defined(LT) || defined(RN)
  2267. srai.d L, KK, 2
  2268. LD a1, AO, 0 * SIZE
  2269. MTC c11, $r0
  2270. LD a2, AO, 1 * SIZE
  2271. MOV c21, c11
  2272. LD a3, AO, 2 * SIZE
  2273. MOV c31, c11
  2274. LD a4, AO, 3 * SIZE
  2275. MOV c41, c11
  2276. LD b1, B, 0 * SIZE
  2277. LD b2, B, 1 * SIZE
  2278. LD b3, B, 2 * SIZE
  2279. LD b4, B, 3 * SIZE
  2280. LD b5, B, 4 * SIZE
  2281. LD b6, B, 8 * SIZE
  2282. LD b7, B, 12 * SIZE
  2283. move BO, B
  2284. bge $r0, L, .L65
  2285. #else
  2286. #ifdef LN
  2287. slli.d TEMP, K, BASE_SHIFT
  2288. sub.d AORIG, AORIG, TEMP
  2289. #endif
  2290. slli.d L, KK, 0 + BASE_SHIFT
  2291. slli.d TEMP, KK, 1 + BASE_SHIFT
  2292. add.d AO, AORIG, L
  2293. add.d BO, B, TEMP
  2294. sub.d TEMP, K, KK
  2295. srai.d L, TEMP, 2
  2296. LD a1, AO, 0 * SIZE
  2297. MTC c11, $r0
  2298. LD a2, AO, 1 * SIZE
  2299. MOV c21, c11
  2300. LD a3, AO, 2 * SIZE
  2301. MOV c31, c11
  2302. LD a4, AO, 3 * SIZE
  2303. MOV c41, c11
  2304. LD b1, BO, 0 * SIZE
  2305. LD b2, BO, 1 * SIZE
  2306. LD b3, BO, 2 * SIZE
  2307. LD b4, BO, 3 * SIZE
  2308. LD b5, BO, 4 * SIZE
  2309. LD b6, BO, 8 * SIZE
  2310. LD b7, BO, 12 * SIZE
  2311. bge $r0, L, .L65
  2312. #endif
  /* .L62: accumulation loop for the one-row by two-column tile, unrolled
     four steps per pass. Each pass consumes 4 elements from AO and 8 from
     BO and decrements L; it repeats while L > 0.
     Steps using a1 and a3 accumulate into c11/c21; steps using a2 and a4
     accumulate into the second pair c31/c41, which .L68 later adds back
     into c11/c21.
     NOTE(review): assumes MADD d,x,y,z yields x*y + z (macro defined
     outside this chunk). */
  2313. .align 3
  2314. .L62:
  2315. MADD c11, b1, a1, c11
  2316. LD b1, BO, 4 * SIZE
  2317. MADD c21, b2, a1, c21
  2318. LD b2, BO, 5 * SIZE
  2319. MADD c31, b3, a2, c31
  2320. LD b3, BO, 6 * SIZE
  2321. MADD c41, b4, a2, c41
  2322. LD b4, BO, 7 * SIZE
  2323. LD a1, AO, 4 * SIZE
  2324. LD a2, AO, 5 * SIZE
  2325. MADD c11, b1, a3, c11
  2326. LD b1, BO, 8 * SIZE
  2327. MADD c21, b2, a3, c21
  2328. LD b2, BO, 9 * SIZE
  2329. MADD c31, b3, a4, c31
  2330. LD b3, BO, 10 * SIZE
  2331. MADD c41, b4, a4, c41
  2332. LD b4, BO, 11 * SIZE
  2333. LD a3, AO, 6 * SIZE
  2334. LD a4, AO, 7 * SIZE
  2335. addi.d L, L, -1
  2336. addi.d AO, AO, 4 * SIZE
  2337. addi.d BO, BO, 8 * SIZE
  2338. blt $r0, L, .L62
  /* .L65/.L66: remainder of the accumulation for the one-row by
     two-column tile. L is the low two bits of the loop length (KK for
     LT/RN, TEMP otherwise); with L <= 0 control goes to .L68.
     .L66 performs one step per pass into c11/c21 only: AO advances by
     1 element and BO by 2. */
  2339. .align 3
  2340. .L65:
  2341. #if defined(LT) || defined(RN)
  2342. andi L, KK, 3
  2343. #else
  2344. andi L, TEMP, 3
  2345. #endif
  2346. bge $r0, L, .L68
  2347. .align 3
  2348. .L66:
  2349. MADD c11, b1, a1, c11
  2350. LD b1, BO, 2 * SIZE
  2351. MADD c21, b2, a1, c21
  2352. LD b2, BO, 3 * SIZE
  2353. LD a1, AO, 1 * SIZE
  2354. addi.d L, L, -1
  2355. addi.d AO, AO, 1 * SIZE
  2356. addi.d BO, BO, 2 * SIZE
  2357. blt $r0, L, .L66
  /* .L68: solve step for the one-row by two-column tile.
     The second accumulator pair is first merged: c11 += c31, c21 += c41.
     LN/RT: AO and BO are recomputed from AORIG and B using KK-1 (LN) or
     KK-2 (RT), shifted by BASE_SHIFT and 1+BASE_SHIFT respectively.
     Two values are loaded from BO (LN/LT) or AO (RN/RT) and combined with
     c11/c21 via SUB. LN/LT then scale both by AO[0]; RN solves with
     BO[0], BO[1], BO[3]; RT solves with BO[3], BO[2], BO[0].
     NOTE(review): assumes SUB d,x,y yields x - y, NMSUB d,x,y,z yields
     z - x*y, and that the multiplied entries are pre-inverted diagonal
     values - confirm outside this chunk. */
  2358. .L68:
  2359. ADD c11, c11, c31
  2360. ADD c21, c21, c41
  2361. #if defined(LN) || defined(RT)
  2362. #ifdef LN
  2363. addi.d TEMP, KK, -1
  2364. #else
  2365. addi.d TEMP, KK, -2
  2366. #endif
  2367. slli.d L, TEMP, 0 + BASE_SHIFT
  2368. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2369. add.d AO, AORIG, L
  2370. add.d BO, B, TEMP
  2371. #endif
  2372. #if defined(LN) || defined(LT)
  2373. LD b1, BO, 0 * SIZE
  2374. LD b2, BO, 1 * SIZE
  2375. SUB c11, b1, c11
  2376. SUB c21, b2, c21
  2377. #else
  2378. LD b1, AO, 0 * SIZE
  2379. LD b2, AO, 1 * SIZE
  2380. SUB c11, b1, c11
  2381. SUB c21, b2, c21
  2382. #endif
  2383. #if defined(LN) || defined(LT)
  2384. LD b3, AO, 0 * SIZE
  2385. MUL c11, b3, c11
  2386. MUL c21, b3, c21
  2387. #endif
  2388. #ifdef RN
  2389. LD b1, BO, 0 * SIZE
  2390. LD b2, BO, 1 * SIZE
  2391. LD b3, BO, 3 * SIZE
  2392. MUL c11, b1, c11
  2393. NMSUB c21, c11, b2, c21
  2394. MUL c21, b3, c21
  2395. #endif
  2396. #ifdef RT
  2397. LD b1, BO, 3 * SIZE
  2398. LD b2, BO, 2 * SIZE
  2399. LD b3, BO, 0 * SIZE
  2400. MUL c21, b1, c21
  2401. NMSUB c11, c21, b2, c11
  2402. MUL c11, b3, c11
  2403. #endif
  2404. #ifdef LN
  2405. addi.d CO1, CO1, -1 * SIZE
  2406. addi.d CO2, CO2, -1 * SIZE
  2407. #endif
  2408. #if defined(LN) || defined(LT)
  2409. ST c11, BO, 0 * SIZE
  2410. ST c21, BO, 1 * SIZE
  2411. #else
  2412. ST c11, AO, 0 * SIZE
  2413. ST c21, AO, 1 * SIZE
  2414. #endif
  2415. ST c11, CO1, 0 * SIZE
  2416. ST c21, CO2, 0 * SIZE
  2417. #ifndef LN
  2418. addi.d CO1, CO1, 1 * SIZE
  2419. addi.d CO2, CO2, 1 * SIZE
  2420. #endif
  2421. #ifdef RT
  2422. slli.d TEMP, K, 0 + BASE_SHIFT
  2423. add.d AORIG, AORIG, TEMP
  2424. #endif
  2425. #if defined(LT) || defined(RN)
  2426. sub.d TEMP, K, KK
  2427. slli.d L, TEMP, 0 + BASE_SHIFT
  2428. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2429. add.d AO, AO, L
  2430. add.d BO, BO, TEMP
  2431. #endif
  2432. #ifdef LT
  2433. addi.d KK, KK, 1
  2434. #endif
  2435. #ifdef LN
  2436. addi.d KK, KK, -1
  2437. #endif
  /* .L69: bookkeeping at the end of the two-column panel.
     LN:    B advances by K * 2 elements.
     LT/RN: B takes the current value of BO.
     RN/RT: KK moves by the panel width (2), up for RN and down for RT. */
  2438. .align 3
  2439. .L69:
  2440. #ifdef LN
  2441. slli.d TEMP, K, 1 + BASE_SHIFT
  2442. add.d B, B, TEMP
  2443. #endif
  2444. #if defined(LT) || defined(RN)
  2445. move B, BO
  2446. #endif
  2447. #ifdef RN
  2448. addi.d KK, KK, 2
  2449. #endif
  2450. #ifdef RT
  2451. addi.d KK, KK, -2
  2452. #endif
  /* .L70: handle the last single column when N is odd (N & 1).
     If there is none (J <= 0), jump to the exit at .L999. Otherwise set up
     the per-panel pointers and KK, then start the 2-row tile loop. */
  2453. .align 3
  2454. .L70:
  2455. andi J, N, 1
  2456. bge $r0, J, .L999
  /* RT: step B back by K elements and C back by LDC. */
  2457. #ifdef RT
  2458. slli.d TEMP, K, BASE_SHIFT
  2459. sub.d B, B, TEMP
  2460. sub.d C, C, LDC
  2461. #endif
  2462. move AO, A
  2463. move CO1, C
  /* Initial KK for this panel: M + OFFSET for LN, OFFSET for LT. */
  2464. #ifdef LN
  2465. add.d KK, M, OFFSET
  2466. #endif
  2467. #ifdef LT
  2468. move KK, OFFSET
  2469. #endif
  /* NOTE(review): the #else arm repeats the unconditional "move AO, A"
     a few lines above; it looks redundant but is harmless. */
  2470. #if defined(LN) || defined(RT)
  2471. move AORIG, A
  2472. #else
  2473. move AO, A
  2474. #endif
  2475. #ifndef RT
  2476. add.d C, CO1, LDC
  2477. #endif
  /* I = M / 2 two-row tiles; if there are none, go to the odd-row case. */
  2478. srai.d I, M, 1
  2479. bge $r0, I, .L80
  /* .L71: set up one 2x1 tile: zero the accumulators c11/c21/c12/c22,
     position AO/BO, and compute L = (k-count) / 4 for the unrolled loop.
     MTC c11, $r0 presumably moves integer zero into c11, and MOV copies it
     to the other accumulators -- TODO confirm the macro definitions.
     NOTE(review): the a1/a2/a5 and b1..b7 preloads below are not consumed by
     the visible code that follows (.L72 and .L76 reload a1/a2/b1 before use,
     and .L78 reloads b1..b3); they look like leftovers from a software-
     pipelined version. Confirm before removing, since they also touch memory
     up to 12 * SIZE past B/BO. */
  2480. .L71:
  2481. #if defined(LT) || defined(RN)
  2482. LD a1, AO, 0 * SIZE
  2483. MTC c11, $r0
  2484. LD a2, AO, 1 * SIZE
  2485. MOV c21, c11
  2486. LD a5, AO, 4 * SIZE
  2487. LD b1, B, 0 * SIZE
  2488. MOV c12, c11
  2489. LD b2, B, 1 * SIZE
  2490. MOV c22, c11
  2491. LD b3, B, 2 * SIZE
  2492. LD b5, B, 4 * SIZE
  /* LT/RN: the k-count is KK and BO starts at B. */
  2493. srai.d L, KK, 2
  2494. LD b6, B, 8 * SIZE
  2495. LD b7, B, 12 * SIZE
  2496. move BO, B
  2497. bge $r0, L, .L75
  2498. #else
  /* LN: step AORIG back by K * 2 elements first. */
  2499. #ifdef LN
  2500. slli.d TEMP, K, 1 + BASE_SHIFT
  2501. sub.d AORIG, AORIG, TEMP
  2502. #endif
  /* LN/RT: AO = AORIG + KK * 2 elements, BO = B + KK elements,
     and the k-count is TEMP = K - KK. */
  2503. slli.d L, KK, 1 + BASE_SHIFT
  2504. slli.d TEMP, KK, 0 + BASE_SHIFT
  2505. add.d AO, AORIG, L
  2506. add.d BO, B, TEMP
  2507. sub.d TEMP, K, KK
  2508. LD a1, AO, 0 * SIZE
  2509. MTC c11, $r0
  2510. LD a2, AO, 1 * SIZE
  2511. MOV c21, c11
  2512. LD a5, AO, 4 * SIZE
  2513. LD b1, BO, 0 * SIZE
  2514. MOV c12, c11
  2515. LD b2, BO, 1 * SIZE
  2516. MOV c22, c11
  2517. LD b3, BO, 2 * SIZE
  2518. LD b5, BO, 4 * SIZE
  2519. srai.d L, TEMP, 2
  2520. LD b6, BO, 8 * SIZE
  2521. LD b7, BO, 12 * SIZE
  2522. bge $r0, L, .L75
  2523. #endif
  /* .L72: unrolled accumulation loop for the 2x1 tile, four k-steps per pass.
     Every step reloads a1/a2 (two values from AO) and b1 (one value from BO),
     then accumulates b1 * a1 into c11 and b1 * a2 into c12.
     AO advances by 8 elements and BO by 4 per pass; L counts passes. */
  2524. .align 3
  2525. .L72:
  2526. LD a1, AO, 0 * SIZE
  2527. LD a2, AO, 1 * SIZE
  2528. LD b1, BO, 0 * SIZE
  2529. MADD c11, b1, a1, c11
  2530. MADD c12, b1, a2, c12
  2531. LD a1, AO, 2 * SIZE
  2532. LD a2, AO, 3 * SIZE
  2533. LD b1, BO, 1 * SIZE
  2534. MADD c11, b1, a1, c11
  2535. MADD c12, b1, a2, c12
  2536. LD a1, AO, 4 * SIZE
  2537. LD a2, AO, 5 * SIZE
  2538. LD b1, BO, 2 * SIZE
  2539. MADD c11, b1, a1, c11
  2540. MADD c12, b1, a2, c12
  2541. LD a1, AO, 6 * SIZE
  2542. LD a2, AO, 7 * SIZE
  2543. LD b1, BO, 3 * SIZE
  2544. MADD c11, b1, a1, c11
  2545. MADD c12, b1, a2, c12
  2546. addi.d L, L, -1
  2547. addi.d AO, AO, 8 * SIZE
  2548. addi.d BO, BO, 4 * SIZE
  2549. blt $r0, L, .L72
  /* .L75: leftover k-steps for the 2x1 tile: (KK & 3) for LT/RN, otherwise
     (TEMP & 3). With none left (L <= 0), skip to the solve at .L78. */
  2550. .align 3
  2551. .L75:
  2552. #if defined(LT) || defined(RN)
  2553. andi L, KK, 3
  2554. #else
  2555. andi L, TEMP, 3
  2556. #endif
  2557. bge $r0, L, .L78
  /* .L76: remainder loop for the 2x1 tile, one k-step per pass:
     c11 += b1 * a1 and c12 += b1 * a2, then AO += 2 elements, BO += 1. */
  2558. .align 3
  2559. .L76:
  2560. LD a1, AO, 0 * SIZE
  2561. LD a2, AO, 1 * SIZE
  2562. LD b1, BO, 0 * SIZE
  2563. MADD c11, b1, a1, c11
  2564. MADD c12, b1, a2, c12
  2565. addi.d L, L, -1
  2566. addi.d AO, AO, 2 * SIZE
  2567. addi.d BO, BO, 1 * SIZE
  2568. blt $r0, L, .L76
  /* .L78: finish the 2x1 tile: fold the partial sums, apply the triangular
     solve step for the selected variant, store to the packed buffer and to
     C, update pointers and KK, then loop back to .L71 while tiles remain. */
  2569. .L78:
  /* c21/c22 are zeroed at .L71 and not accumulated into by .L72/.L76, so
     within the visible code these two adds add zero. */
  2570. ADD c11, c11, c21
  2571. ADD c12, c12, c22
  2572. #if defined(LN) || defined(RT)
  /* Re-point AO/BO for the solve: offset (KK - 2) for LN, (KK - 1) for RT,
     scaled by two elements for AO and one for BO. */
  2573. #ifdef LN
  2574. addi.d TEMP, KK, -2
  2575. #else
  2576. addi.d TEMP, KK, -1
  2577. #endif
  2578. slli.d L, TEMP, 1 + BASE_SHIFT
  2579. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2580. add.d AO, AORIG, L
  2581. add.d BO, B, TEMP
  2582. #endif
  /* Right-hand side minus accumulated products; the right-hand side comes
     from BO for LN/LT and from AO for RN/RT. */
  2583. #if defined(LN) || defined(LT)
  2584. LD b1, BO, 0 * SIZE
  2585. LD b2, BO, 1 * SIZE
  2586. SUB c11, b1, c11
  2587. SUB c12, b2, c12
  2588. #else
  2589. LD b1, AO, 0 * SIZE
  2590. LD b2, AO, 1 * SIZE
  2591. SUB c11, b1, c11
  2592. SUB c12, b2, c12
  2593. #endif
  /* LN: 2x2 substitution from the bottom up using AO[3], AO[2], AO[0];
     c12 is solved first and then eliminated from c11.
     NMSUB d, x, y, z is presumably d = z - x * y -- TODO confirm. */
  2594. #ifdef LN
  2595. LD b1, AO, 3 * SIZE
  2596. LD b2, AO, 2 * SIZE
  2597. LD b3, AO, 0 * SIZE
  2598. MUL c12, b1, c12
  2599. NMSUB c11, c12, b2, c11
  2600. MUL c11, b3, c11
  2601. #endif
  /* LT: 2x2 substitution from the top down using AO[0], AO[1], AO[3];
     c11 is solved first and then eliminated from c12. */
  2602. #ifdef LT
  2603. LD b1, AO, 0 * SIZE
  2604. LD b2, AO, 1 * SIZE
  2605. LD b3, AO, 3 * SIZE
  2606. MUL c11, b1, c11
  2607. NMSUB c12, c11, b2, c12
  2608. MUL c12, b3, c12
  2609. #endif
  /* RN/RT: a single column, so both results are scaled by BO[0]. */
  2610. #if defined(RN) || defined(RT)
  2611. LD b1, BO, 0 * SIZE
  2612. MUL c11, b1, c11
  2613. MUL c12, b1, c12
  2614. #endif
  /* LN: step the output pointer back by two elements before storing. */
  2615. #ifdef LN
  2616. addi.d CO1, CO1, -2 * SIZE
  2617. #endif
  /* Write the solved values back into the packed buffer they were read from. */
  2618. #if defined(LN) || defined(LT)
  2619. ST c11, BO, 0 * SIZE
  2620. ST c12, BO, 1 * SIZE
  2621. #else
  2622. ST c11, AO, 0 * SIZE
  2623. ST c12, AO, 1 * SIZE
  2624. #endif
  /* Write the two solved values to consecutive elements at CO1. */
  2625. ST c11, CO1, 0 * SIZE
  2626. ST c12, CO1, 1 * SIZE
  2627. #ifndef LN
  2628. addi.d CO1, CO1, 2 * SIZE
  2629. #endif
  /* RT: advance AORIG by K * 2 elements. */
  2630. #ifdef RT
  2631. slli.d TEMP, K, 1 + BASE_SHIFT
  2632. add.d AORIG, AORIG, TEMP
  2633. #endif
  /* LT/RN: skip the (K - KK) unconsumed k-steps:
     two elements each for AO, one element each for BO. */
  2634. #if defined(LT) || defined(RN)
  2635. sub.d TEMP, K, KK
  2636. slli.d L, TEMP, 1 + BASE_SHIFT
  2637. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2638. add.d AO, AO, L
  2639. add.d BO, BO, TEMP
  2640. #endif
  /* KK moves by the tile height (2): up for LT, down for LN. */
  2641. #ifdef LT
  2642. addi.d KK, KK, 2
  2643. #endif
  2644. #ifdef LN
  2645. addi.d KK, KK, -2
  2646. #endif
  /* Next 2-row tile while 0 < I. */
  2647. addi.d I, I, -1
  2648. blt $r0, I, .L71
  /* .L80: handle the last single row when M is odd (M & 1) for the single-
     column panel, i.e. a 1x1 tile. If there is none, go to .L89.
     Zeroes c11/c21, positions AO/BO and computes L = (k-count) / 4.
     NOTE(review): the a1..a4 and b1..b7 preloads below are not consumed by
     the visible code that follows (.L82 and .L86 reload a1/b1 before use,
     and .L88 reloads b1); confirm before removing, since they also touch
     memory up to 12 * SIZE past B/BO. */
  2649. .align 3
  2650. .L80:
  2651. andi I, M, 1
  2652. bge $r0, I, .L89
  2653. #if defined(LT) || defined(RN)
  2654. LD a1, AO, 0 * SIZE
  2655. MTC c11, $r0
  2656. LD a2, AO, 1 * SIZE
  2657. MOV c21, c11
  2658. LD a3, AO, 2 * SIZE
  2659. LD a4, AO, 3 * SIZE
  2660. LD b1, B, 0 * SIZE
  2661. LD b2, B, 1 * SIZE
  2662. LD b3, B, 2 * SIZE
  2663. LD b4, B, 3 * SIZE
  2664. LD b5, B, 4 * SIZE
  2665. LD b6, B, 8 * SIZE
  2666. LD b7, B, 12 * SIZE
  /* LT/RN: the k-count is KK and BO starts at B. */
  2667. srai.d L, KK, 2
  2668. move BO, B
  2669. bge $r0, L, .L85
  2670. #else
  /* LN: step AORIG back by K elements first. */
  2671. #ifdef LN
  2672. slli.d TEMP, K, BASE_SHIFT
  2673. sub.d AORIG, AORIG, TEMP
  2674. #endif
  /* LN/RT: AO = AORIG + KK elements, BO = B + KK elements,
     and the k-count is TEMP = K - KK. */
  2675. slli.d TEMP, KK, BASE_SHIFT
  2676. add.d AO, AORIG, TEMP
  2677. add.d BO, B, TEMP
  2678. sub.d TEMP, K, KK
  2679. LD a1, AO, 0 * SIZE
  2680. MTC c11, $r0
  2681. LD a2, AO, 1 * SIZE
  2682. MOV c21, c11
  2683. LD a3, AO, 2 * SIZE
  2684. LD a4, AO, 3 * SIZE
  2685. LD b1, BO, 0 * SIZE
  2686. LD b2, BO, 1 * SIZE
  2687. LD b3, BO, 2 * SIZE
  2688. LD b4, BO, 3 * SIZE
  2689. LD b5, BO, 4 * SIZE
  2690. LD b6, BO, 8 * SIZE
  2691. LD b7, BO, 12 * SIZE
  2692. srai.d L, TEMP, 2
  2693. bge $r0, L, .L85
  2694. #endif
  /* .L82: unrolled accumulation loop for the 1x1 tile, four k-steps per pass.
     Each step reloads one value from AO and one from BO; products alternate
     between c11 (steps 0 and 2) and c21 (steps 1 and 3), which are summed
     at .L88. AO and BO each advance by 4 elements per pass. */
  2695. .align 3
  2696. .L82:
  2697. LD a1, AO, 0 * SIZE
  2698. LD b1, BO, 0 * SIZE
  2699. MADD c11, b1, a1, c11
  2700. LD a1, AO, 1 * SIZE
  2701. LD b1, BO, 1 * SIZE
  2702. MADD c21, b1, a1, c21
  2703. LD a1, AO, 2 * SIZE
  2704. LD b1, BO, 2 * SIZE
  2705. MADD c11, b1, a1, c11
  2706. LD a1, AO, 3 * SIZE
  2707. LD b1, BO, 3 * SIZE
  2708. MADD c21, b1, a1, c21
  2709. addi.d L, L, -1
  2710. addi.d AO, AO, 4 * SIZE
  2711. addi.d BO, BO, 4 * SIZE
  2712. blt $r0, L, .L82
  /* .L85: leftover k-steps for the 1x1 tile: (KK & 3) for LT/RN, otherwise
     (TEMP & 3). With none left (L <= 0), skip to the solve at .L88. */
  2713. .align 3
  2714. .L85:
  2715. #if defined(LT) || defined(RN)
  2716. andi L, KK, 3
  2717. #else
  2718. andi L, TEMP, 3
  2719. #endif
  2720. bge $r0, L, .L88
  /* .L86: remainder loop for the 1x1 tile, one k-step per pass:
     c11 += b1 * a1, then AO and BO each advance by one element. */
  2721. .align 3
  2722. .L86:
  2723. LD a1, AO, 0 * SIZE
  2724. LD b1, BO, 0 * SIZE
  2725. MADD c11, b1, a1, c11
  2726. addi.d L, L, -1
  2727. addi.d AO, AO, 1 * SIZE
  2728. addi.d BO, BO, 1 * SIZE
  2729. blt $r0, L, .L86
  /* .L88: finish the 1x1 tile: fold the two partial sums, subtract from the
     right-hand side, scale by the single triangular entry, store to the
     packed buffer and to C, then update pointers and KK. */
  2730. .L88:
  2731. ADD c11, c11, c21
  2732. #if defined(LN) || defined(RT)
  /* NOTE(review): both arms compute KK - 1 here, so the #ifdef is redundant
     for this tile size; it mirrors the larger tiles where the arms differ. */
  2733. #ifdef LN
  2734. addi.d TEMP, KK, -1
  2735. #else
  2736. addi.d TEMP, KK, -1
  2737. #endif
  /* AO = AORIG + (KK - 1) elements, BO = B + (KK - 1) elements. */
  2738. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2739. add.d AO, AORIG, TEMP
  2740. add.d BO, B, TEMP
  2741. #endif
  /* Right-hand side minus accumulated product; the right-hand side comes
     from BO for LN/LT and from AO for RN/RT. */
  2742. #if defined(LN) || defined(LT)
  2743. LD b1, BO, 0 * SIZE
  2744. SUB c11, b1, c11
  2745. #else
  2746. LD b1, AO, 0 * SIZE
  2747. SUB c11, b1, c11
  2748. #endif
  /* Scale by the single triangular entry: AO[0] for LN/LT, BO[0] for RN/RT.
     NOTE(review): a multiply, not a divide -- presumably the packed entry is
     already a reciprocal; confirm against the packing routine. */
  2749. #if defined(LN) || defined(LT)
  2750. LD b1, AO, 0 * SIZE
  2751. MUL c11, b1, c11
  2752. #endif
  2753. #if defined(RN) || defined(RT)
  2754. LD b1, BO, 0 * SIZE
  2755. MUL c11, b1, c11
  2756. #endif
  /* LN: step the output pointer back by one element before storing. */
  2757. #ifdef LN
  2758. addi.d CO1, CO1, -1 * SIZE
  2759. #endif
  /* Write the result back to the packed buffer and to the output. */
  2760. #if defined(LN) || defined(LT)
  2761. ST c11, BO, 0 * SIZE
  2762. #else
  2763. ST c11, AO, 0 * SIZE
  2764. #endif
  2765. ST c11, CO1, 0 * SIZE
  2766. #ifndef LN
  2767. addi.d CO1, CO1, 1 * SIZE
  2768. #endif
  /* RT: advance AORIG by K elements. */
  2769. #ifdef RT
  2770. slli.d TEMP, K, BASE_SHIFT
  2771. add.d AORIG, AORIG, TEMP
  2772. #endif
  /* LT/RN: skip the (K - KK) unconsumed k-steps, one element each for both
     AO and BO. */
  2773. #if defined(LT) || defined(RN)
  2774. sub.d TEMP, K, KK
  2775. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2776. add.d AO, AO, TEMP
  2777. add.d BO, BO, TEMP
  2778. #endif
  /* KK moves by the tile height (1): up for LT, down for LN. */
  2779. #ifdef LT
  2780. addi.d KK, KK, 1
  2781. #endif
  2782. #ifdef LN
  2783. addi.d KK, KK, -1
  2784. #endif
  /* .L89: bookkeeping at the end of the single-column panel.
     LN:    B advances by K elements.
     LT/RN: B takes the current value of BO.
     RN/RT: KK moves by the panel width (1), up for RN and down for RT. */
  2785. .align 3
  2786. .L89:
  2787. #ifdef LN
  2788. slli.d TEMP, K, BASE_SHIFT
  2789. add.d B, B, TEMP
  2790. #endif
  2791. #if defined(LT) || defined(RN)
  2792. move B, BO
  2793. #endif
  2794. #ifdef RN
  2795. addi.d KK, KK, 1
  2796. #endif
  2797. #ifdef RT
  2798. addi.d KK, KK, -1
  2799. #endif
  /* .L999: common exit. Reloads general-purpose and floating-point registers
     from the stack frame (presumably the ones the prologue saved -- the
     prologue is outside this chunk), releases the 144-byte frame, sets up
     $r4 and $f0, and returns. */
  2800. .align 3
  2801. .L999:
  2802. LDARG $r23, $sp, 0
  2803. LDARG $r24, $sp, 8
  2804. LDARG $r25, $sp, 16
  2805. LDARG $r26, $sp, 24
  2806. LDARG $r27, $sp, 32
  2807. LDARG $r28, $sp, 40
  2808. fld.d $f24, $sp, 48
  2809. fld.d $f25, $sp, 56
  2810. fld.d $f26, $sp, 64
  2811. fld.d $f27, $sp, 72
  2812. fld.d $f28, $sp, 80
  2813. LDARG $r29, $sp, 88
  2814. LDARG $r30, $sp, 96
  2815. LDARG $r20, $sp, 104
  2816. LDARG $r16, $sp, 112
  /* NOTE(review): when __64BIT__ is not defined, $f18 is reloaded from offset
     112, the same slot used for $r16 just above. The two restores cannot both
     be correct for that configuration; check the matching stores in the
     prologue before changing either offset. */
  2817. #ifndef __64BIT__
  2818. fld.d $f18, $sp, 112
  2819. fld.d $f19, $sp, 120
  2820. fld.d $f20, $sp, 128
  2821. fld.d $f21, $sp, 136
  2822. #endif
  /* Pop the frame. */
  2823. addi.d $sp, $sp, 144
  /* Copy $r17 into $r4 and $f22 into $f0 before returning (presumably the
     integer and floating-point return registers -- TODO confirm intent). */
  2824. move $r4, $r17
  2825. fmov.d $f0, $f22
  /* Jump to the address in $r1 without writing a link register. */
  2826. jirl $r0, $r1, 0x0
  2827. EPILOGUE