You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LN.S 65 kB


  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /* Register-alias table for the kernel body that follows PROLOGUE. */
  /* ASSEMBLER is defined before common.h is included; presumably it makes the header expose only its assembler-safe parts -- TODO confirm in common.h. */
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Kernel inputs, bound to the consecutive integer registers $r4-$r11. */
  /* NOTE(review): these look like the incoming argument registers -- confirm against the LoongArch calling convention and the C prototype. */
  /* M, N and K supply the loop trip counts; A, B and C are the base pointers from which AO/AORIG, BO and CO1 are derived. */
  /* LDC is scaled by BASE_SHIFT on entry, and OFFSET seeds KK. */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  32. #define A $r7
  33. #define B $r8
  34. #define C $r9
  35. #define LDC $r10
  36. #define OFFSET $r11
  /* Working pointers: AO is the load base for a1-a4 and BO is the load base for b1-b8 inside the multiply loops. */
  37. #define AO $r12
  38. #define BO $r13
  /* Loop counters: J starts at N >> 3, I is taken from M (M & 1, then M >> 1), and L counts the trips of the inner MADD loops. */
  /* L ($r29) is spilled to the stack in the prologue. */
  39. #define I $r17
  40. #define J $r18
  41. #define L $r29
  /* Eight output pointers: CO1 starts at C and each following pointer is LDC further on. */
  /* CO3-CO8 ($r23-$r28) are spilled to the stack in the prologue. */
  42. #define CO1 $r14
  43. #define CO2 $r15
  44. #define CO3 $r23
  45. #define CO4 $r24
  46. #define CO5 $r25
  47. #define CO6 $r26
  48. #define CO7 $r27
  49. #define CO8 $r28
  /* KK is a running position derived from OFFSET (and M or N, depending on LN/LT/RN/RT); it is stepped after each row block. */
  /* TEMP is integer scratch, used mainly for byte-offset arithmetic. */
  /* AORIG keeps the A base pointer in the LN and RT variants so that AO can be recomputed from it. */
  /* KK ($r30), TEMP ($r20) and AORIG ($r16) are all spilled to the stack in the prologue. */
  50. #define KK $r30
  51. #define TEMP $r20
  52. #define AORIG $r16
  /* a1-a4: floating-point registers holding values loaded through AO. */
  /* a3 ($f27) and a4 ($f28) are saved to the stack in the prologue. */
  53. #define a1 $f22
  54. #define a2 $f8
  55. #define a3 $f27
  56. #define a4 $f28
  /* b1-b8: floating-point registers holding values loaded through BO (and through AO in the RN/RT solve phase). */
  57. #define b1 $f23
  58. #define b2 $f9
  59. #define b3 $f10
  60. #define b4 $f11
  61. #define b5 $f12
  62. #define b6 $f13
  63. #define b7 $f14
  64. #define b8 $f15
  /* a5 is a second name for b8's register ($f15), so a5 and b8 cannot hold different live values at the same time. */
  65. #define a5 b8
  /* Accumulators cN1 and cN2 for N = 1..8. */
  /* In the one-row block cN1 is the value stored through CO<N>; in the two-row block cN1 is paired with a1 and cN2 with a2. */
  /* The register numbers are not sequential, so always use these names rather than raw $f numbers. */
  /* c72, c81 and c82 ($f24-$f26) are saved to the stack in the prologue. */
  66. #define c11 $f16
  67. #define c12 $f17
  68. #define c21 $f3
  69. #define c22 $f1
  70. #define c31 $f2
  71. #define c32 $f4
  72. #define c41 $f5
  73. #define c42 $f6
  74. #define c51 $f7
  75. #define c52 $f18
  76. #define c61 $f19
  77. #define c62 $f20
  78. #define c71 $f21
  79. #define c72 $f24
  80. #define c81 $f25
  81. #define c82 $f26
  /* NOTE(review): ALPHA is bound to $f0 but is not referenced in the part of the kernel visible here -- confirm whether it is used later in the file. */
  82. #define ALPHA $f0
  83. PROLOGUE
  84. addi.d $sp, $sp, -144
  85. SDARG $r23, $sp, 0
  86. SDARG $r24, $sp, 8
  87. SDARG $r25, $sp, 16
  88. SDARG $r26, $sp, 24
  89. SDARG $r27, $sp, 32
  90. SDARG $r28, $sp, 40
  91. fst.d $f24, $sp, 48
  92. fst.d $f25, $sp, 56
  93. fst.d $f26, $sp, 64
  94. fst.d $f27, $sp, 72
  95. fst.d $f28, $sp, 80
  96. SDARG $r29, $sp, 88
  97. SDARG $r30, $sp, 96
  98. SDARG $r20, $sp, 104
  99. SDARG $r16, $sp, 112
  100. #ifndef __64BIT__
  101. fst.d $f18, $sp, 112
  102. fst.d $f19, $sp, 120
  103. fst.d $f20, $sp, 128
  104. fst.d $f21, $sp, 136
  105. #endif
  106. slli.d LDC, LDC, BASE_SHIFT
  107. #ifdef LN
  108. mul.w TEMP, M, K
  109. slli.d TEMP, TEMP, BASE_SHIFT
  110. add.d A, A, TEMP
  111. slli.d TEMP, M, BASE_SHIFT
  112. add.d C, C, TEMP
  113. #endif
  114. #ifdef RN
  115. neg KK, OFFSET
  116. #endif
  117. #ifdef RT
  118. mul.w TEMP, N, K
  119. slli.d TEMP, TEMP, BASE_SHIFT
  120. add.d B, B, TEMP
  121. mul.w TEMP, N, LDC
  122. add.d C, C, TEMP
  123. sub.d KK, N, OFFSET
  124. #endif
  125. srai.d J, N, 3
  126. nop
  127. bge $r0, J, .L30
  128. .L10:
  129. #ifdef RT
  130. slli.d TEMP, K, 3 + BASE_SHIFT
  131. sub.d B, B, TEMP
  132. slli.d TEMP, LDC, 3
  133. sub.d C, C, TEMP
  134. #endif
  135. move CO1, C
  136. MTC c11, $r0
  137. add.d CO2, C, LDC
  138. add.d CO3, CO2, LDC
  139. addi.d J, J, -1
  140. add.d CO4, CO3, LDC
  141. MOV c21, c11
  142. add.d CO5, CO4, LDC
  143. MOV c31, c11
  144. add.d CO6, CO5, LDC
  145. MOV c41, c11
  146. add.d CO7, CO6, LDC
  147. MOV c51, c11
  148. add.d CO8, CO7, LDC
  149. #ifdef LN
  150. add.d KK, M, OFFSET
  151. #endif
  152. #ifdef LT
  153. move KK, OFFSET
  154. #endif
  155. #if defined(LN) || defined(RT)
  156. move AORIG, A
  157. #else
  158. move AO, A
  159. #endif
  160. #ifndef RT
  161. add.d C, CO8, LDC
  162. #endif
  163. andi I, M, 1
  164. MOV c61, c11
  165. MOV c71, c11
  166. bge $r0, I, .L20
  167. #if defined(LT) || defined(RN)
  168. LD a1, AO, 0 * SIZE
  169. LD a2, AO, 1 * SIZE
  170. LD a3, AO, 2 * SIZE
  171. LD a4, AO, 3 * SIZE
  172. LD b1, B, 0 * SIZE
  173. LD b2, B, 1 * SIZE
  174. LD b3, B, 2 * SIZE
  175. LD b4, B, 3 * SIZE
  176. LD b5, B, 4 * SIZE
  177. LD b6, B, 8 * SIZE
  178. LD b7, B, 12 * SIZE
  179. srai.d L, KK, 2
  180. MOV c81, c11
  181. move BO, B
  182. bge $r0, L, .L25
  183. #else
  184. #ifdef LN
  185. slli.d TEMP, K, 0 + BASE_SHIFT
  186. sub.d AORIG, AORIG, TEMP
  187. #endif
  188. slli.d L, KK, 0 + BASE_SHIFT
  189. slli.d TEMP, KK, 3 + BASE_SHIFT
  190. add.d AO, AORIG, L
  191. add.d BO, B, TEMP
  192. sub.d TEMP, K, KK
  193. LD a1, AO, 0 * SIZE
  194. LD a2, AO, 1 * SIZE
  195. LD a3, AO, 2 * SIZE
  196. LD a4, AO, 3 * SIZE
  197. LD b1, BO, 0 * SIZE
  198. LD b2, BO, 1 * SIZE
  199. LD b3, BO, 2 * SIZE
  200. LD b4, BO, 3 * SIZE
  201. LD b5, BO, 4 * SIZE
  202. LD b6, BO, 8 * SIZE
  203. LD b7, BO, 12 * SIZE
  204. srai.d L, TEMP, 2
  205. MOV c81, c11
  206. bge $r0, L, .L25
  207. #endif
  208. .align 3
  209. .L22:
  210. MADD c11, b1, a1, c11
  211. LD b1, BO, 16 * SIZE
  212. MADD c21, b2, a1, c21
  213. LD b2, BO, 5 * SIZE
  214. MADD c31, b3, a1, c31
  215. LD b3, BO, 6 * SIZE
  216. MADD c41, b4, a1, c41
  217. LD b4, BO, 7 * SIZE
  218. MADD c51, b5, a1, c51
  219. LD b5, BO, 20 * SIZE
  220. MADD c61, b2, a1, c61
  221. LD b2, BO, 9 * SIZE
  222. MADD c71, b3, a1, c71
  223. LD b3, BO, 10 * SIZE
  224. MADD c81, b4, a1, c81
  225. LD b4, BO, 11 * SIZE
  226. LD a1, AO, 4 * SIZE
  227. addi.d L, L, -1
  228. MADD c11, b6, a2, c11
  229. LD b6, BO, 24 * SIZE
  230. MADD c21, b2, a2, c21
  231. LD b2, BO, 13 * SIZE
  232. MADD c31, b3, a2, c31
  233. LD b3, BO, 14 * SIZE
  234. MADD c41, b4, a2, c41
  235. LD b4, BO, 15 * SIZE
  236. MADD c51, b7, a2, c51
  237. LD b7, BO, 28 * SIZE
  238. MADD c61, b2, a2, c61
  239. LD b2, BO, 17 * SIZE
  240. MADD c71, b3, a2, c71
  241. LD b3, BO, 18 * SIZE
  242. MADD c81, b4, a2, c81
  243. LD b4, BO, 19 * SIZE
  244. LD a2, AO, 5 * SIZE
  245. addi.d AO, AO, 4 * SIZE
  246. MADD c11, b1, a3, c11
  247. LD b1, BO, 32 * SIZE
  248. MADD c21, b2, a3, c21
  249. LD b2, BO, 21 * SIZE
  250. MADD c31, b3, a3, c31
  251. LD b3, BO, 22 * SIZE
  252. MADD c41, b4, a3, c41
  253. LD b4, BO, 23 * SIZE
  254. MADD c51, b5, a3, c51
  255. LD b5, BO, 36 * SIZE
  256. MADD c61, b2, a3, c61
  257. LD b2, BO, 25 * SIZE
  258. MADD c71, b3, a3, c71
  259. LD b3, BO, 26 * SIZE
  260. MADD c81, b4, a3, c81
  261. LD b4, BO, 27 * SIZE
  262. LD a3, AO, 2 * SIZE
  263. addi.d BO, BO, 32 * SIZE
  264. MADD c11, b6, a4, c11
  265. LD b6, BO, 8 * SIZE
  266. MADD c21, b2, a4, c21
  267. LD b2, BO, -3 * SIZE
  268. MADD c31, b3, a4, c31
  269. LD b3, BO, -2 * SIZE
  270. MADD c41, b4, a4, c41
  271. LD b4, BO, -1 * SIZE
  272. MADD c51, b7, a4, c51
  273. LD b7, BO, 12 * SIZE
  274. MADD c61, b2, a4, c61
  275. LD b2, BO, 1 * SIZE
  276. MADD c71, b3, a4, c71
  277. LD b3, BO, 2 * SIZE
  278. MADD c81, b4, a4, c81
  279. LD b4, BO, 3 * SIZE
  280. LD a4, AO, 3 * SIZE
  281. blt $r0, L, .L22
  282. .align 3
  283. .L25:
  284. #if defined(LT) || defined(RN)
  285. andi L, KK, 3
  286. #else
  287. andi L, TEMP, 3
  288. #endif
  289. bge $r0, L, .L28
  290. .align 3
  291. .L26:
  292. MADD c11, b1, a1, c11
  293. LD b1, BO, 8 * SIZE
  294. MADD c21, b2, a1, c21
  295. LD b2, BO, 5 * SIZE
  296. MADD c31, b3, a1, c31
  297. LD b3, BO, 6 * SIZE
  298. MADD c41, b4, a1, c41
  299. LD b4, BO, 7 * SIZE
  300. addi.d L, L, -1
  301. MOV a2, a2
  302. addi.d AO, AO, 1 * SIZE
  303. addi.d BO, BO, 8 * SIZE
  304. MADD c51, b5, a1, c51
  305. LD b5, BO, 4 * SIZE
  306. MADD c61, b2, a1, c61
  307. LD b2, BO, 1 * SIZE
  308. MADD c71, b3, a1, c71
  309. LD b3, BO, 2 * SIZE
  310. MADD c81, b4, a1, c81
  311. LD a1, AO, 0 * SIZE
  312. LD b4, BO, 3 * SIZE
  313. blt $r0, L, .L26
  314. .L28:
  315. #if defined(LN) || defined(RT)
  316. #ifdef LN
  317. addi.d TEMP, KK, -1
  318. #else
  319. addi.d TEMP, KK, -8
  320. #endif
  321. slli.d L, TEMP, 0 + BASE_SHIFT
  322. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  323. add.d AO, AORIG, L
  324. add.d BO, B, TEMP
  325. #endif
  326. #if defined(LN) || defined(LT)
  327. LD b1, BO, 0 * SIZE
  328. LD b2, BO, 1 * SIZE
  329. LD b3, BO, 2 * SIZE
  330. LD b4, BO, 3 * SIZE
  331. LD b5, BO, 4 * SIZE
  332. LD b6, BO, 5 * SIZE
  333. LD b7, BO, 6 * SIZE
  334. LD b8, BO, 7 * SIZE
  335. SUB c11, b1, c11
  336. SUB c21, b2, c21
  337. SUB c31, b3, c31
  338. SUB c41, b4, c41
  339. SUB c51, b5, c51
  340. SUB c61, b6, c61
  341. SUB c71, b7, c71
  342. SUB c81, b8, c81
  343. #else
  344. LD b1, AO, 0 * SIZE
  345. LD b2, AO, 1 * SIZE
  346. LD b3, AO, 2 * SIZE
  347. LD b4, AO, 3 * SIZE
  348. LD b5, AO, 4 * SIZE
  349. LD b6, AO, 5 * SIZE
  350. LD b7, AO, 6 * SIZE
  351. LD b8, AO, 7 * SIZE
  352. SUB c11, b1, c11
  353. SUB c21, b2, c21
  354. SUB c31, b3, c31
  355. SUB c41, b4, c41
  356. SUB c51, b5, c51
  357. SUB c61, b6, c61
  358. SUB c71, b7, c71
  359. SUB c81, b8, c81
  360. #endif
  361. #if defined(LN) || defined(LT)
  362. LD b1, AO, 0 * SIZE
  363. MUL c11, b1, c11
  364. MUL c21, b1, c21
  365. MUL c31, b1, c31
  366. MUL c41, b1, c41
  367. MUL c51, b1, c51
  368. MUL c61, b1, c61
  369. MUL c71, b1, c71
  370. MUL c81, b1, c81
  371. #endif
  372. #ifdef RN
  373. LD b1, BO, 0 * SIZE
  374. LD b2, BO, 1 * SIZE
  375. LD b3, BO, 2 * SIZE
  376. LD b4, BO, 3 * SIZE
  377. LD b5, BO, 4 * SIZE
  378. LD b6, BO, 5 * SIZE
  379. LD b7, BO, 6 * SIZE
  380. LD b8, BO, 7 * SIZE
  381. MUL c11, b1, c11
  382. NMSUB c21, c11, b2, c21
  383. NMSUB c31, c11, b3, c31
  384. NMSUB c41, c11, b4, c41
  385. NMSUB c51, c11, b5, c51
  386. NMSUB c61, c11, b6, c61
  387. NMSUB c71, c11, b7, c71
  388. NMSUB c81, c11, b8, c81
  389. LD b2, BO, 9 * SIZE
  390. LD b3, BO, 10 * SIZE
  391. LD b4, BO, 11 * SIZE
  392. LD b5, BO, 12 * SIZE
  393. LD b6, BO, 13 * SIZE
  394. LD b7, BO, 14 * SIZE
  395. LD b8, BO, 15 * SIZE
  396. MUL c21, b2, c21
  397. NMSUB c31, c21, b3, c31
  398. NMSUB c41, c21, b4, c41
  399. NMSUB c51, c21, b5, c51
  400. NMSUB c61, c21, b6, c61
  401. NMSUB c71, c21, b7, c71
  402. NMSUB c81, c21, b8, c81
  403. LD b3, BO, 18 * SIZE
  404. LD b4, BO, 19 * SIZE
  405. LD b5, BO, 20 * SIZE
  406. LD b6, BO, 21 * SIZE
  407. LD b7, BO, 22 * SIZE
  408. LD b8, BO, 23 * SIZE
  409. MUL c31, b3, c31
  410. NMSUB c41, c31, b4, c41
  411. NMSUB c51, c31, b5, c51
  412. NMSUB c61, c31, b6, c61
  413. NMSUB c71, c31, b7, c71
  414. NMSUB c81, c31, b8, c81
  415. LD b4, BO, 27 * SIZE
  416. LD b5, BO, 28 * SIZE
  417. LD b6, BO, 29 * SIZE
  418. LD b7, BO, 30 * SIZE
  419. LD b8, BO, 31 * SIZE
  420. MUL c41, b4, c41
  421. NMSUB c51, c41, b5, c51
  422. NMSUB c61, c41, b6, c61
  423. NMSUB c71, c41, b7, c71
  424. NMSUB c81, c41, b8, c81
  425. LD b5, BO, 36 * SIZE
  426. LD b6, BO, 37 * SIZE
  427. LD b7, BO, 38 * SIZE
  428. LD b8, BO, 39 * SIZE
  429. MUL c51, b5, c51
  430. NMSUB c61, c51, b6, c61
  431. NMSUB c71, c51, b7, c71
  432. NMSUB c81, c51, b8, c81
  433. LD b6, BO, 45 * SIZE
  434. LD b7, BO, 46 * SIZE
  435. LD b8, BO, 47 * SIZE
  436. MUL c61, b6, c61
  437. NMSUB c71, c61, b7, c71
  438. NMSUB c81, c61, b8, c81
  439. LD b7, BO, 54 * SIZE
  440. LD b8, BO, 55 * SIZE
  441. MUL c71, b7, c71
  442. NMSUB c81, c71, b8, c81
  443. LD b8, BO, 63 * SIZE
  444. MUL c81, b8, c81
  445. #endif
  446. #ifdef RT
  447. LD b1, BO, 63 * SIZE
  448. LD b2, BO, 62 * SIZE
  449. LD b3, BO, 61 * SIZE
  450. LD b4, BO, 60 * SIZE
  451. LD b5, BO, 59 * SIZE
  452. LD b6, BO, 58 * SIZE
  453. LD b7, BO, 57 * SIZE
  454. LD b8, BO, 56 * SIZE
  455. MUL c81, b1, c81
  456. NMSUB c71, c81, b2, c71
  457. NMSUB c61, c81, b3, c61
  458. NMSUB c51, c81, b4, c51
  459. NMSUB c41, c81, b5, c41
  460. NMSUB c31, c81, b6, c31
  461. NMSUB c21, c81, b7, c21
  462. NMSUB c11, c81, b8, c11
  463. LD b2, BO, 54 * SIZE
  464. LD b3, BO, 53 * SIZE
  465. LD b4, BO, 52 * SIZE
  466. LD b5, BO, 51 * SIZE
  467. LD b6, BO, 50 * SIZE
  468. LD b7, BO, 49 * SIZE
  469. LD b8, BO, 48 * SIZE
  470. MUL c71, b2, c71
  471. NMSUB c61, c71, b3, c61
  472. NMSUB c51, c71, b4, c51
  473. NMSUB c41, c71, b5, c41
  474. NMSUB c31, c71, b6, c31
  475. NMSUB c21, c71, b7, c21
  476. NMSUB c11, c71, b8, c11
  477. LD b3, BO, 45 * SIZE
  478. LD b4, BO, 44 * SIZE
  479. LD b5, BO, 43 * SIZE
  480. LD b6, BO, 42 * SIZE
  481. LD b7, BO, 41 * SIZE
  482. LD b8, BO, 40 * SIZE
  483. MUL c61, b3, c61
  484. NMSUB c51, c61, b4, c51
  485. NMSUB c41, c61, b5, c41
  486. NMSUB c31, c61, b6, c31
  487. NMSUB c21, c61, b7, c21
  488. NMSUB c11, c61, b8, c11
  489. LD b4, BO, 36 * SIZE
  490. LD b5, BO, 35 * SIZE
  491. LD b6, BO, 34 * SIZE
  492. LD b7, BO, 33 * SIZE
  493. LD b8, BO, 32 * SIZE
  494. MUL c51, b4, c51
  495. NMSUB c41, c51, b5, c41
  496. NMSUB c31, c51, b6, c31
  497. NMSUB c21, c51, b7, c21
  498. NMSUB c11, c51, b8, c11
  499. LD b5, BO, 27 * SIZE
  500. LD b6, BO, 26 * SIZE
  501. LD b7, BO, 25 * SIZE
  502. LD b8, BO, 24 * SIZE
  503. MUL c41, b5, c41
  504. NMSUB c31, c41, b6, c31
  505. NMSUB c21, c41, b7, c21
  506. NMSUB c11, c41, b8, c11
  507. LD b6, BO, 18 * SIZE
  508. LD b7, BO, 17 * SIZE
  509. LD b8, BO, 16 * SIZE
  510. MUL c31, b6, c31
  511. NMSUB c21, c31, b7, c21
  512. NMSUB c11, c31, b8, c11
  513. LD b7, BO, 9 * SIZE
  514. LD b8, BO, 8 * SIZE
  515. MUL c21, b7, c21
  516. NMSUB c11, c21, b8, c11
  517. LD b8, BO, 0 * SIZE
  518. MUL c11, b8, c11
  519. #endif
  520. #ifdef LN
  521. addi.d CO1, CO1, -1 * SIZE
  522. addi.d CO2, CO2, -1 * SIZE
  523. addi.d CO3, CO3, -1 * SIZE
  524. addi.d CO4, CO4, -1 * SIZE
  525. addi.d CO5, CO5, -1 * SIZE
  526. addi.d CO6, CO6, -1 * SIZE
  527. addi.d CO7, CO7, -1 * SIZE
  528. addi.d CO8, CO8, -1 * SIZE
  529. #endif
  530. #if defined(LN) || defined(LT)
  531. ST c11, BO, 0 * SIZE
  532. ST c21, BO, 1 * SIZE
  533. ST c31, BO, 2 * SIZE
  534. ST c41, BO, 3 * SIZE
  535. ST c51, BO, 4 * SIZE
  536. ST c61, BO, 5 * SIZE
  537. ST c71, BO, 6 * SIZE
  538. ST c81, BO, 7 * SIZE
  539. #else
  540. ST c11, AO, 0 * SIZE
  541. ST c21, AO, 1 * SIZE
  542. ST c31, AO, 2 * SIZE
  543. ST c41, AO, 3 * SIZE
  544. ST c51, AO, 4 * SIZE
  545. ST c61, AO, 5 * SIZE
  546. ST c71, AO, 6 * SIZE
  547. ST c81, AO, 7 * SIZE
  548. #endif
  549. ST c11, CO1, 0 * SIZE
  550. ST c21, CO2, 0 * SIZE
  551. ST c31, CO3, 0 * SIZE
  552. ST c41, CO4, 0 * SIZE
  553. ST c51, CO5, 0 * SIZE
  554. ST c61, CO6, 0 * SIZE
  555. ST c71, CO7, 0 * SIZE
  556. ST c81, CO8, 0 * SIZE
  557. MTC c11, $r0
  558. #ifndef LN
  559. addi.d CO1, CO1, 1 * SIZE
  560. addi.d CO2, CO2, 1 * SIZE
  561. addi.d CO3, CO3, 1 * SIZE
  562. addi.d CO4, CO4, 1 * SIZE
  563. addi.d CO5, CO5, 1 * SIZE
  564. addi.d CO6, CO6, 1 * SIZE
  565. addi.d CO7, CO7, 1 * SIZE
  566. addi.d CO8, CO8, 1 * SIZE
  567. #endif
  568. MOV c21, c11
  569. #ifdef RT
  570. slli.d TEMP, K, BASE_SHIFT
  571. add.d AORIG, AORIG, TEMP
  572. #endif
  573. MOV c31, c11
  574. #if defined(LT) || defined(RN)
  575. sub.d TEMP, K, KK
  576. slli.d L, TEMP, 0 + BASE_SHIFT
  577. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  578. add.d AO, AO, L
  579. add.d BO, BO, TEMP
  580. #endif
  581. MOV c41, c11
  582. #ifdef LT
  583. addi.d KK, KK, 1
  584. #endif
  585. #ifdef LN
  586. addi.d KK, KK, -1
  587. #endif
  588. .align 3
  589. .L20:
  590. srai.d I, M, 1
  591. MOV c51, c11
  592. MOV c61, c11
  593. bge $r0, I, .L29
  594. .L11:
  595. #if defined(LT) || defined(RN)
  596. LD a1, AO, 0 * SIZE
  597. MOV c71, c11
  598. LD b1, B, 0 * SIZE
  599. MOV c81, c11
  600. LD a3, AO, 4 * SIZE
  601. MOV c12, c11
  602. LD b2, B, 1 * SIZE
  603. MOV c22, c11
  604. srai.d L, KK, 2
  605. MOV c32, c11
  606. LD b3, B, 2 * SIZE
  607. MOV c42, c11
  608. LD b4, B, 3 * SIZE
  609. MOV c52, c11
  610. LD b5, B, 4 * SIZE
  611. MOV c62, c11
  612. LD b6, B, 8 * SIZE
  613. MOV c72, c11
  614. LD b7, B, 12 * SIZE
  615. MOV c82, c11
  616. move BO, B
  617. bge $r0, L, .L15
  618. #else
  619. #ifdef LN
  620. slli.d TEMP, K, 1 + BASE_SHIFT
  621. sub.d AORIG, AORIG, TEMP
  622. #endif
  623. slli.d L, KK, 1 + BASE_SHIFT
  624. slli.d TEMP, KK, 3 + BASE_SHIFT
  625. add.d AO, AORIG, L
  626. add.d BO, B, TEMP
  627. sub.d TEMP, K, KK
  628. LD a1, AO, 0 * SIZE
  629. MOV c71, c11
  630. LD b1, BO, 0 * SIZE
  631. MOV c81, c11
  632. LD a3, AO, 4 * SIZE
  633. MOV c12, c11
  634. LD b2, BO, 1 * SIZE
  635. MOV c22, c11
  636. MOV c32, c11
  637. LD b3, BO, 2 * SIZE
  638. MOV c42, c11
  639. LD b4, BO, 3 * SIZE
  640. MOV c52, c11
  641. LD b5, BO, 4 * SIZE
  642. MOV c62, c11
  643. LD b6, BO, 8 * SIZE
  644. MOV c72, c11
  645. LD b7, BO, 12 * SIZE
  646. MOV c82, c11
  647. srai.d L, TEMP, 2
  648. bge $r0, L, .L15
  649. #endif
  650. MADD c11, b1, a1, c11
  651. LD a2, AO, 1 * SIZE
  652. MADD c21, b2, a1, c21
  653. addi.d L, L, -1
  654. MADD c31, b3, a1, c31
  655. MADD c41, b4, a1, c41
  656. bge $r0, L, .L13
  657. .align 3
  658. .L12:
  659. MADD c12, b1, a2, c12
  660. LD b1, BO, 16 * SIZE
  661. MADD c22, b2, a2, c22
  662. LD b2, BO, 5 * SIZE
  663. MADD c32, b3, a2, c32
  664. LD b3, BO, 6 * SIZE
  665. MADD c42, b4, a2, c42
  666. LD b4, BO, 7 * SIZE
  667. MADD c51, b5, a1, c51
  668. MADD c61, b2, a1, c61
  669. LD a4, AO, 2 * SIZE
  670. MADD c71, b3, a1, c71
  671. MADD c81, b4, a1, c81
  672. LD a1, AO, 8 * SIZE
  673. MADD c52, b5, a2, c52
  674. LD b5, BO, 20 * SIZE
  675. MADD c62, b2, a2, c62
  676. LD b2, BO, 9 * SIZE
  677. MADD c72, b3, a2, c72
  678. LD b3, BO, 10 * SIZE
  679. MADD c82, b4, a2, c82
  680. LD b4, BO, 11 * SIZE
  681. MADD c11, b6, a4, c11
  682. LD a2, AO, 3 * SIZE
  683. MADD c21, b2, a4, c21
  684. MADD c31, b3, a4, c31
  685. MADD c41, b4, a4, c41
  686. MADD c12, b6, a2, c12
  687. LD b6, BO, 24 * SIZE
  688. MADD c22, b2, a2, c22
  689. LD b2, BO, 13 * SIZE
  690. MADD c32, b3, a2, c32
  691. LD b3, BO, 14 * SIZE
  692. MADD c42, b4, a2, c42
  693. LD b4, BO, 15 * SIZE
  694. MADD c51, b7, a4, c51
  695. MADD c61, b2, a4, c61
  696. MADD c71, b3, a4, c71
  697. MADD c81, b4, a4, c81
  698. MADD c52, b7, a2, c52
  699. LD b7, BO, 28 * SIZE
  700. MADD c62, b2, a2, c62
  701. LD b2, BO, 17 * SIZE
  702. MADD c72, b3, a2, c72
  703. LD b3, BO, 18 * SIZE
  704. MADD c82, b4, a2, c82
  705. LD b4, BO, 19 * SIZE
  706. MADD c11, b1, a3, c11
  707. LD a2, AO, 5 * SIZE
  708. MADD c21, b2, a3, c21
  709. MADD c31, b3, a3, c31
  710. MADD c41, b4, a3, c41
  711. MADD c12, b1, a2, c12
  712. LD b1, BO, 32 * SIZE
  713. MADD c22, b2, a2, c22
  714. LD b2, BO, 21 * SIZE
  715. MADD c32, b3, a2, c32
  716. LD b3, BO, 22 * SIZE
  717. MADD c42, b4, a2, c42
  718. LD b4, BO, 23 * SIZE
  719. MADD c51, b5, a3, c51
  720. MADD c61, b2, a3, c61
  721. LD a4, AO, 6 * SIZE
  722. MADD c71, b3, a3, c71
  723. MADD c81, b4, a3, c81
  724. LD a3, AO, 12 * SIZE
  725. MADD c52, b5, a2, c52
  726. LD b5, BO, 36 * SIZE
  727. MADD c62, b2, a2, c62
  728. LD b2, BO, 25 * SIZE
  729. MADD c72, b3, a2, c72
  730. LD b3, BO, 26 * SIZE
  731. MADD c82, b4, a2, c82
  732. LD b4, BO, 27 * SIZE
  733. MADD c11, b6, a4, c11
  734. LD a2, AO, 7 * SIZE
  735. MADD c21, b2, a4, c21
  736. MADD c31, b3, a4, c31
  737. MADD c41, b4, a4, c41
  738. addi.d L, L, -1
  739. MADD c12, b6, a2, c12
  740. LD b6, BO, 40 * SIZE
  741. MADD c22, b2, a2, c22
  742. LD b2, BO, 29 * SIZE
  743. MADD c32, b3, a2, c32
  744. LD b3, BO, 30 * SIZE
  745. MADD c42, b4, a2, c42
  746. LD b4, BO, 31 * SIZE
  747. MADD c51, b7, a4, c51
  748. addi.d BO, BO, 32 * SIZE
  749. MADD c61, b2, a4, c61
  750. addi.d AO, AO, 8 * SIZE
  751. MADD c71, b3, a4, c71
  752. MADD c81, b4, a4, c81
  753. MADD c52, b7, a2, c52
  754. LD b7, BO, 12 * SIZE
  755. MADD c62, b2, a2, c62
  756. LD b2, BO, 1 * SIZE
  757. MADD c72, b3, a2, c72
  758. LD b3, BO, 2 * SIZE
  759. MADD c82, b4, a2, c82
  760. LD b4, BO, 3 * SIZE
  761. MADD c11, b1, a1, c11
  762. LD a2, AO, 1 * SIZE
  763. MADD c21, b2, a1, c21
  764. MADD c31, b3, a1, c31
  765. MADD c41, b4, a1, c41
  766. blt $r0, L, .L12
  767. .align 3
  768. .L13:
  769. MADD c12, b1, a2, c12
  770. LD b1, BO, 16 * SIZE
  771. MADD c22, b2, a2, c22
  772. LD b2, BO, 5 * SIZE
  773. MADD c32, b3, a2, c32
  774. LD b3, BO, 6 * SIZE
  775. MADD c42, b4, a2, c42
  776. LD b4, BO, 7 * SIZE
  777. MADD c51, b5, a1, c51
  778. MADD c61, b2, a1, c61
  779. LD a4, AO, 2 * SIZE
  780. MADD c71, b3, a1, c71
  781. MADD c81, b4, a1, c81
  782. LD a1, AO, 8 * SIZE
  783. MADD c52, b5, a2, c52
  784. LD b5, BO, 20 * SIZE
  785. MADD c62, b2, a2, c62
  786. LD b2, BO, 9 * SIZE
  787. MADD c72, b3, a2, c72
  788. LD b3, BO, 10 * SIZE
  789. MADD c82, b4, a2, c82
  790. LD b4, BO, 11 * SIZE
  791. MADD c11, b6, a4, c11
  792. LD a2, AO, 3 * SIZE
  793. MADD c21, b2, a4, c21
  794. MADD c31, b3, a4, c31
  795. MADD c41, b4, a4, c41
  796. MADD c12, b6, a2, c12
  797. LD b6, BO, 24 * SIZE
  798. MADD c22, b2, a2, c22
  799. LD b2, BO, 13 * SIZE
  800. MADD c32, b3, a2, c32
  801. LD b3, BO, 14 * SIZE
  802. MADD c42, b4, a2, c42
  803. LD b4, BO, 15 * SIZE
  804. MADD c51, b7, a4, c51
  805. MADD c61, b2, a4, c61
  806. MADD c71, b3, a4, c71
  807. MADD c81, b4, a4, c81
  808. MADD c52, b7, a2, c52
  809. LD b7, BO, 28 * SIZE
  810. MADD c62, b2, a2, c62
  811. LD b2, BO, 17 * SIZE
  812. MADD c72, b3, a2, c72
  813. LD b3, BO, 18 * SIZE
  814. MADD c82, b4, a2, c82
  815. LD b4, BO, 19 * SIZE
  816. MADD c11, b1, a3, c11
  817. LD a2, AO, 5 * SIZE
  818. MADD c21, b2, a3, c21
  819. MADD c31, b3, a3, c31
  820. MADD c41, b4, a3, c41
  821. MADD c12, b1, a2, c12
  822. LD b1, BO, 32 * SIZE
  823. MADD c22, b2, a2, c22
  824. LD b2, BO, 21 * SIZE
  825. MADD c32, b3, a2, c32
  826. LD b3, BO, 22 * SIZE
  827. MADD c42, b4, a2, c42
  828. LD b4, BO, 23 * SIZE
  829. MADD c51, b5, a3, c51
  830. MADD c61, b2, a3, c61
  831. LD a4, AO, 6 * SIZE
  832. MADD c71, b3, a3, c71
  833. MADD c81, b4, a3, c81
  834. LD a3, AO, 12 * SIZE
  835. MADD c52, b5, a2, c52
  836. LD b5, BO, 36 * SIZE
  837. MADD c62, b2, a2, c62
  838. LD b2, BO, 25 * SIZE
  839. MADD c72, b3, a2, c72
  840. LD b3, BO, 26 * SIZE
  841. MADD c82, b4, a2, c82
  842. LD b4, BO, 27 * SIZE
  843. MADD c11, b6, a4, c11
  844. LD a2, AO, 7 * SIZE
  845. MADD c21, b2, a4, c21
  846. MADD c31, b3, a4, c31
  847. MADD c41, b4, a4, c41
  848. MADD c12, b6, a2, c12
  849. LD b6, BO, 40 * SIZE
  850. MADD c22, b2, a2, c22
  851. LD b2, BO, 29 * SIZE
  852. MADD c32, b3, a2, c32
  853. LD b3, BO, 30 * SIZE
  854. MADD c42, b4, a2, c42
  855. LD b4, BO, 31 * SIZE
  856. MADD c51, b7, a4, c51
  857. addi.d BO, BO, 32 * SIZE
  858. MADD c61, b2, a4, c61
  859. addi.d AO, AO, 8 * SIZE
  860. MADD c71, b3, a4, c71
  861. MADD c81, b4, a4, c81
  862. MADD c52, b7, a2, c52
  863. LD b7, BO, 12 * SIZE
  864. MADD c62, b2, a2, c62
  865. LD b2, BO, 1 * SIZE
  866. MADD c72, b3, a2, c72
  867. LD b3, BO, 2 * SIZE
  868. MADD c82, b4, a2, c82
  869. LD b4, BO, 3 * SIZE
  870. .align 3
  871. .L15:
  872. #if defined(LT) || defined(RN)
  873. andi L, KK, 3
  874. #else
  875. andi L, TEMP, 3
  876. #endif
  877. bge $r0, L, .L18
  878. .align 3
  879. .L16:
  880. MADD c11, b1, a1, c11
  881. LD a2, AO, 1 * SIZE
  882. MADD c21, b2, a1, c21
  883. MADD c31, b3, a1, c31
  884. MADD c41, b4, a1, c41
  885. MADD c12, b1, a2, c12
  886. LD b1, BO, 8 * SIZE
  887. MADD c22, b2, a2, c22
  888. LD b2, BO, 5 * SIZE
  889. MADD c32, b3, a2, c32
  890. LD b3, BO, 6 * SIZE
  891. MADD c42, b4, a2, c42
  892. LD b4, BO, 7 * SIZE
  893. MADD c51, b5, a1, c51
  894. addi.d L, L, -1
  895. MADD c61, b2, a1, c61
  896. addi.d AO, AO, 2 * SIZE
  897. MADD c71, b3, a1, c71
  898. addi.d BO, BO, 8 * SIZE
  899. MADD c81, b4, a1, c81
  900. LD a1, AO, 0 * SIZE
  901. MADD c52, b5, a2, c52
  902. LD b5, BO, 4 * SIZE
  903. MADD c62, b2, a2, c62
  904. LD b2, BO, 1 * SIZE
  905. MADD c72, b3, a2, c72
  906. LD b3, BO, 2 * SIZE
  907. MADD c82, b4, a2, c82
  908. LD b4, BO, 3 * SIZE
  909. blt $r0, L, .L16
  910. .L18:
  911. #if defined(LN) || defined(RT)
  912. #ifdef LN
  913. addi.d TEMP, KK, -2
  914. #else
  915. addi.d TEMP, KK, -8
  916. #endif
  917. slli.d L, TEMP, 1 + BASE_SHIFT
  918. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  919. add.d AO, AORIG, L
  920. add.d BO, B, TEMP
  921. #endif
  922. #if defined(LN) || defined(LT)
  923. LD b1, BO, 0 * SIZE
  924. LD b2, BO, 1 * SIZE
  925. LD b3, BO, 2 * SIZE
  926. LD b4, BO, 3 * SIZE
  927. SUB c11, b1, c11
  928. LD b5, BO, 4 * SIZE
  929. SUB c21, b2, c21
  930. LD b6, BO, 5 * SIZE
  931. SUB c31, b3, c31
  932. LD b7, BO, 6 * SIZE
  933. SUB c41, b4, c41
  934. LD b8, BO, 7 * SIZE
  935. SUB c51, b5, c51
  936. LD b1, BO, 8 * SIZE
  937. SUB c61, b6, c61
  938. LD b2, BO, 9 * SIZE
  939. SUB c71, b7, c71
  940. LD b3, BO, 10 * SIZE
  941. SUB c81, b8, c81
  942. LD b4, BO, 11 * SIZE
  943. SUB c12, b1, c12
  944. LD b5, BO, 12 * SIZE
  945. SUB c22, b2, c22
  946. LD b6, BO, 13 * SIZE
  947. SUB c32, b3, c32
  948. LD b7, BO, 14 * SIZE
  949. SUB c42, b4, c42
  950. LD b8, BO, 15 * SIZE
  951. SUB c52, b5, c52
  952. #ifdef LN
  953. LD b1, AO, 3 * SIZE
  954. #else
  955. LD b1, AO, 0 * SIZE
  956. #endif
  957. SUB c62, b6, c62
  958. SUB c72, b7, c72
  959. SUB c82, b8, c82
  960. #else
  961. LD b1, AO, 0 * SIZE
  962. LD b2, AO, 1 * SIZE
  963. LD b3, AO, 2 * SIZE
  964. LD b4, AO, 3 * SIZE
  965. SUB c11, b1, c11
  966. LD b5, AO, 4 * SIZE
  967. SUB c12, b2, c12
  968. LD b6, AO, 5 * SIZE
  969. SUB c21, b3, c21
  970. LD b7, AO, 6 * SIZE
  971. SUB c22, b4, c22
  972. LD b8, AO, 7 * SIZE
  973. SUB c31, b5, c31
  974. LD b1, AO, 8 * SIZE
  975. SUB c32, b6, c32
  976. LD b2, AO, 9 * SIZE
  977. SUB c41, b7, c41
  978. LD b3, AO, 10 * SIZE
  979. SUB c42, b8, c42
  980. LD b4, AO, 11 * SIZE
  981. LD b5, AO, 12 * SIZE
  982. SUB c51, b1, c51
  983. LD b6, AO, 13 * SIZE
  984. SUB c52, b2, c52
  985. LD b7, AO, 14 * SIZE
  986. SUB c61, b3, c61
  987. LD b8, AO, 15 * SIZE
  988. SUB c62, b4, c62
  989. SUB c71, b5, c71
  990. SUB c72, b6, c72
  991. SUB c81, b7, c81
  992. SUB c82, b8, c82
  993. #endif
  994. #ifdef LN
  995. MUL c12, b1, c12
  996. LD b2, AO, 2 * SIZE
  997. MUL c22, b1, c22
  998. MUL c32, b1, c32
  999. MUL c42, b1, c42
  1000. MUL c52, b1, c52
  1001. MUL c62, b1, c62
  1002. MUL c72, b1, c72
  1003. MUL c82, b1, c82
  1004. NMSUB c11, c12, b2, c11
  1005. LD b3, AO, 0 * SIZE
  1006. NMSUB c21, c22, b2, c21
  1007. NMSUB c31, c32, b2, c31
  1008. NMSUB c41, c42, b2, c41
  1009. NMSUB c51, c52, b2, c51
  1010. NMSUB c61, c62, b2, c61
  1011. NMSUB c71, c72, b2, c71
  1012. NMSUB c81, c82, b2, c81
  1013. MUL c11, b3, c11
  1014. addi.d CO1, CO1, -2 * SIZE
  1015. MUL c21, b3, c21
  1016. addi.d CO2, CO2, -2 * SIZE
  1017. MUL c31, b3, c31
  1018. addi.d CO3, CO3, -2 * SIZE
  1019. MUL c41, b3, c41
  1020. addi.d CO4, CO4, -2 * SIZE
  1021. MUL c51, b3, c51
  1022. addi.d CO5, CO5, -2 * SIZE
  1023. MUL c61, b3, c61
  1024. addi.d CO6, CO6, -2 * SIZE
  1025. MUL c71, b3, c71
  1026. addi.d CO7, CO7, -2 * SIZE
  1027. MUL c81, b3, c81
  1028. addi.d CO8, CO8, -2 * SIZE
  1029. #endif
  1030. #ifdef LT
  1031. MUL c11, b1, c11
  1032. LD b2, AO, 1 * SIZE
  1033. MUL c21, b1, c21
  1034. MUL c31, b1, c31
  1035. MUL c41, b1, c41
  1036. MUL c51, b1, c51
  1037. MUL c61, b1, c61
  1038. MUL c71, b1, c71
  1039. MUL c81, b1, c81
  1040. NMSUB c12, c11, b2, c12
  1041. LD b3, AO, 3 * SIZE
  1042. NMSUB c22, c21, b2, c22
  1043. NMSUB c32, c31, b2, c32
  1044. NMSUB c42, c41, b2, c42
  1045. NMSUB c52, c51, b2, c52
  1046. NMSUB c62, c61, b2, c62
  1047. NMSUB c72, c71, b2, c72
  1048. NMSUB c82, c81, b2, c82
  1049. MUL c12, b3, c12
  1050. MUL c22, b3, c22
  1051. MUL c32, b3, c32
  1052. MUL c42, b3, c42
  1053. MUL c52, b3, c52
  1054. MUL c62, b3, c62
  1055. MUL c72, b3, c72
  1056. MUL c82, b3, c82
  1057. #endif
  1058. #ifdef RN
  1059. LD b1, BO, 0 * SIZE
  1060. LD b2, BO, 1 * SIZE
  1061. LD b3, BO, 2 * SIZE
  1062. LD b4, BO, 3 * SIZE
  1063. MUL c11, b1, c11
  1064. MUL c12, b1, c12
  1065. LD b5, BO, 4 * SIZE
  1066. NMSUB c21, c11, b2, c21
  1067. NMSUB c22, c12, b2, c22
  1068. LD b6, BO, 5 * SIZE
  1069. NMSUB c31, c11, b3, c31
  1070. NMSUB c32, c12, b3, c32
  1071. LD b7, BO, 6 * SIZE
  1072. NMSUB c41, c11, b4, c41
  1073. NMSUB c42, c12, b4, c42
  1074. LD b8, BO, 7 * SIZE
  1075. NMSUB c51, c11, b5, c51
  1076. NMSUB c52, c12, b5, c52
  1077. LD b2, BO, 9 * SIZE
  1078. NMSUB c61, c11, b6, c61
  1079. NMSUB c62, c12, b6, c62
  1080. LD b3, BO, 10 * SIZE
  1081. NMSUB c71, c11, b7, c71
  1082. NMSUB c72, c12, b7, c72
  1083. LD b4, BO, 11 * SIZE
  1084. NMSUB c81, c11, b8, c81
  1085. NMSUB c82, c12, b8, c82
  1086. LD b5, BO, 12 * SIZE
  1087. MUL c21, b2, c21
  1088. MUL c22, b2, c22
  1089. LD b6, BO, 13 * SIZE
  1090. NMSUB c31, c21, b3, c31
  1091. NMSUB c32, c22, b3, c32
  1092. LD b7, BO, 14 * SIZE
  1093. NMSUB c41, c21, b4, c41
  1094. NMSUB c42, c22, b4, c42
  1095. LD b8, BO, 15 * SIZE
  1096. NMSUB c51, c21, b5, c51
  1097. NMSUB c52, c22, b5, c52
  1098. LD b3, BO, 18 * SIZE
  1099. NMSUB c61, c21, b6, c61
  1100. NMSUB c62, c22, b6, c62
  1101. LD b4, BO, 19 * SIZE
  1102. NMSUB c71, c21, b7, c71
  1103. NMSUB c72, c22, b7, c72
  1104. LD b5, BO, 20 * SIZE
  1105. NMSUB c81, c21, b8, c81
  1106. NMSUB c82, c22, b8, c82
  1107. LD b6, BO, 21 * SIZE
  1108. MUL c31, b3, c31
  1109. MUL c32, b3, c32
  1110. LD b7, BO, 22 * SIZE
  1111. NMSUB c41, c31, b4, c41
  1112. NMSUB c42, c32, b4, c42
  1113. LD b8, BO, 23 * SIZE
  1114. NMSUB c51, c31, b5, c51
  1115. NMSUB c52, c32, b5, c52
  1116. LD b4, BO, 27 * SIZE
  1117. NMSUB c61, c31, b6, c61
  1118. NMSUB c62, c32, b6, c62
  1119. LD b5, BO, 28 * SIZE
  1120. NMSUB c71, c31, b7, c71
  1121. NMSUB c72, c32, b7, c72
  1122. LD b6, BO, 29 * SIZE
  1123. NMSUB c81, c31, b8, c81
  1124. NMSUB c82, c32, b8, c82
  1125. LD b7, BO, 30 * SIZE
  1126. MUL c41, b4, c41
  1127. MUL c42, b4, c42
  1128. LD b8, BO, 31 * SIZE
  1129. NMSUB c51, c41, b5, c51
  1130. NMSUB c52, c42, b5, c52
  1131. LD b5, BO, 36 * SIZE
  1132. NMSUB c61, c41, b6, c61
  1133. NMSUB c62, c42, b6, c62
  1134. LD b6, BO, 37 * SIZE
  1135. NMSUB c71, c41, b7, c71
  1136. NMSUB c72, c42, b7, c72
  1137. LD b7, BO, 38 * SIZE
  1138. NMSUB c81, c41, b8, c81
  1139. NMSUB c82, c42, b8, c82
  1140. LD b8, BO, 39 * SIZE
  1141. MUL c51, b5, c51
  1142. MUL c52, b5, c52
  1143. NMSUB c61, c51, b6, c61
  1144. NMSUB c62, c52, b6, c62
  1145. LD b6, BO, 45 * SIZE
  1146. NMSUB c71, c51, b7, c71
  1147. NMSUB c72, c52, b7, c72
  1148. LD b7, BO, 46 * SIZE
  1149. NMSUB c81, c51, b8, c81
  1150. NMSUB c82, c52, b8, c82
  1151. LD b8, BO, 47 * SIZE
  1152. MUL c61, b6, c61
  1153. MUL c62, b6, c62
  1154. NMSUB c71, c61, b7, c71
  1155. NMSUB c72, c62, b7, c72
  1156. LD b7, BO, 54 * SIZE
  1157. NMSUB c81, c61, b8, c81
  1158. NMSUB c82, c62, b8, c82
  1159. LD b8, BO, 55 * SIZE
  1160. MUL c71, b7, c71
  1161. MUL c72, b7, c72
  1162. NMSUB c81, c71, b8, c81
  1163. NMSUB c82, c72, b8, c82
  1164. LD b8, BO, 63 * SIZE
  1165. MUL c81, b8, c81
  1166. MUL c82, b8, c82
  1167. #endif
  1168. #ifdef RT
  1169. LD b1, BO, 63 * SIZE
  1170. LD b2, BO, 62 * SIZE
  1171. LD b3, BO, 61 * SIZE
  1172. LD b4, BO, 60 * SIZE
  1173. MUL c81, b1, c81
  1174. MUL c82, b1, c82
  1175. LD b5, BO, 59 * SIZE
  1176. NMSUB c71, c81, b2, c71
  1177. NMSUB c72, c82, b2, c72
  1178. LD b6, BO, 58 * SIZE
  1179. NMSUB c61, c81, b3, c61
  1180. NMSUB c62, c82, b3, c62
  1181. LD b7, BO, 57 * SIZE
  1182. NMSUB c51, c81, b4, c51
  1183. NMSUB c52, c82, b4, c52
  1184. LD b8, BO, 56 * SIZE
  1185. NMSUB c41, c81, b5, c41
  1186. NMSUB c42, c82, b5, c42
  1187. LD b2, BO, 54 * SIZE
  1188. NMSUB c31, c81, b6, c31
  1189. NMSUB c32, c82, b6, c32
  1190. LD b3, BO, 53 * SIZE
  1191. NMSUB c21, c81, b7, c21
  1192. NMSUB c22, c82, b7, c22
  1193. LD b4, BO, 52 * SIZE
  1194. NMSUB c11, c81, b8, c11
  1195. NMSUB c12, c82, b8, c12
  1196. LD b5, BO, 51 * SIZE
  1197. MUL c71, b2, c71
  1198. MUL c72, b2, c72
  1199. LD b6, BO, 50 * SIZE
  1200. NMSUB c61, c71, b3, c61
  1201. NMSUB c62, c72, b3, c62
  1202. LD b7, BO, 49 * SIZE
  1203. NMSUB c51, c71, b4, c51
  1204. NMSUB c52, c72, b4, c52
  1205. LD b8, BO, 48 * SIZE
  1206. NMSUB c41, c71, b5, c41
  1207. NMSUB c42, c72, b5, c42
  1208. LD b3, BO, 45 * SIZE
  1209. NMSUB c31, c71, b6, c31
  1210. NMSUB c32, c72, b6, c32
  1211. LD b4, BO, 44 * SIZE
  1212. NMSUB c21, c71, b7, c21
  1213. NMSUB c22, c72, b7, c22
  1214. LD b5, BO, 43 * SIZE
  1215. NMSUB c11, c71, b8, c11
  1216. NMSUB c12, c72, b8, c12
  1217. LD b6, BO, 42 * SIZE
  1218. MUL c61, b3, c61
  1219. MUL c62, b3, c62
  1220. LD b7, BO, 41 * SIZE
  1221. NMSUB c51, c61, b4, c51
  1222. NMSUB c52, c62, b4, c52
  1223. LD b8, BO, 40 * SIZE
  1224. NMSUB c41, c61, b5, c41
  1225. NMSUB c42, c62, b5, c42
  1226. LD b4, BO, 36 * SIZE
  1227. NMSUB c31, c61, b6, c31
  1228. NMSUB c32, c62, b6, c32
  1229. LD b5, BO, 35 * SIZE
  1230. NMSUB c21, c61, b7, c21
  1231. NMSUB c22, c62, b7, c22
  1232. LD b6, BO, 34 * SIZE
  1233. NMSUB c11, c61, b8, c11
  1234. NMSUB c12, c62, b8, c12
  1235. LD b7, BO, 33 * SIZE
  1236. MUL c51, b4, c51
  1237. MUL c52, b4, c52
  1238. LD b8, BO, 32 * SIZE
  1239. NMSUB c41, c51, b5, c41
  1240. NMSUB c42, c52, b5, c42
  1241. LD b5, BO, 27 * SIZE
  1242. NMSUB c31, c51, b6, c31
  1243. NMSUB c32, c52, b6, c32
  1244. LD b6, BO, 26 * SIZE
  1245. NMSUB c21, c51, b7, c21
  1246. NMSUB c22, c52, b7, c22
  1247. LD b7, BO, 25 * SIZE
  1248. NMSUB c11, c51, b8, c11
  1249. NMSUB c12, c52, b8, c12
  1250. LD b8, BO, 24 * SIZE
  1251. MUL c41, b5, c41
  1252. MUL c42, b5, c42
  1253. NMSUB c31, c41, b6, c31
  1254. NMSUB c32, c42, b6, c32
  1255. LD b6, BO, 18 * SIZE
  1256. NMSUB c21, c41, b7, c21
  1257. NMSUB c22, c42, b7, c22
  1258. LD b7, BO, 17 * SIZE
  1259. NMSUB c11, c41, b8, c11
  1260. NMSUB c12, c42, b8, c12
  1261. LD b8, BO, 16 * SIZE
  1262. MUL c31, b6, c31
  1263. MUL c32, b6, c32
  1264. NMSUB c21, c31, b7, c21
  1265. NMSUB c22, c32, b7, c22
  1266. LD b7, BO, 9 * SIZE
  1267. NMSUB c11, c31, b8, c11
  1268. NMSUB c12, c32, b8, c12
  1269. LD b8, BO, 8 * SIZE
  1270. MUL c21, b7, c21
  1271. MUL c22, b7, c22
  1272. NMSUB c11, c21, b8, c11
  1273. NMSUB c12, c22, b8, c12
  1274. LD b8, BO, 0 * SIZE
  1275. MUL c11, b8, c11
  1276. MUL c12, b8, c12
  1277. #endif
  1278. #if defined(LN) || defined(LT)
  1279. ST c11, BO, 0 * SIZE
  1280. ST c21, BO, 1 * SIZE
  1281. ST c31, BO, 2 * SIZE
  1282. ST c41, BO, 3 * SIZE
  1283. ST c51, BO, 4 * SIZE
  1284. ST c61, BO, 5 * SIZE
  1285. ST c71, BO, 6 * SIZE
  1286. ST c81, BO, 7 * SIZE
  1287. ST c12, BO, 8 * SIZE
  1288. ST c22, BO, 9 * SIZE
  1289. ST c32, BO, 10 * SIZE
  1290. ST c42, BO, 11 * SIZE
  1291. ST c52, BO, 12 * SIZE
  1292. ST c62, BO, 13 * SIZE
  1293. ST c72, BO, 14 * SIZE
  1294. ST c82, BO, 15 * SIZE
  1295. #else
  1296. ST c11, AO, 0 * SIZE
  1297. ST c12, AO, 1 * SIZE
  1298. ST c21, AO, 2 * SIZE
  1299. ST c22, AO, 3 * SIZE
  1300. ST c31, AO, 4 * SIZE
  1301. ST c32, AO, 5 * SIZE
  1302. ST c41, AO, 6 * SIZE
  1303. ST c42, AO, 7 * SIZE
  1304. ST c51, AO, 8 * SIZE
  1305. ST c52, AO, 9 * SIZE
  1306. ST c61, AO, 10 * SIZE
  1307. ST c62, AO, 11 * SIZE
  1308. ST c71, AO, 12 * SIZE
  1309. ST c72, AO, 13 * SIZE
  1310. ST c81, AO, 14 * SIZE
  1311. ST c82, AO, 15 * SIZE
  1312. #endif
  1313. ST c11, CO1, 0 * SIZE
  1314. ST c12, CO1, 1 * SIZE
  1315. ST c21, CO2, 0 * SIZE
  1316. ST c22, CO2, 1 * SIZE
  1317. ST c31, CO3, 0 * SIZE
  1318. ST c32, CO3, 1 * SIZE
  1319. ST c41, CO4, 0 * SIZE
  1320. ST c42, CO4, 1 * SIZE
  1321. ST c51, CO5, 0 * SIZE
  1322. ST c52, CO5, 1 * SIZE
  1323. ST c61, CO6, 0 * SIZE
  1324. ST c62, CO6, 1 * SIZE
  1325. ST c71, CO7, 0 * SIZE
  1326. ST c72, CO7, 1 * SIZE
  1327. ST c81, CO8, 0 * SIZE
  1328. ST c82, CO8, 1 * SIZE
  1329. MTC a1, $r0
  1330. #ifndef LN
  1331. addi.d CO1, CO1, 2 * SIZE
  1332. addi.d CO2, CO2, 2 * SIZE
  1333. addi.d CO3, CO3, 2 * SIZE
  1334. addi.d CO4, CO4, 2 * SIZE
  1335. addi.d CO5, CO5, 2 * SIZE
  1336. addi.d CO6, CO6, 2 * SIZE
  1337. addi.d CO7, CO7, 2 * SIZE
  1338. addi.d CO8, CO8, 2 * SIZE
  1339. #endif
  1340. MOV c11, a1
  1341. MOV c21, a1
  1342. #ifdef RT
  1343. slli.d TEMP, K, 1 + BASE_SHIFT
  1344. add.d AORIG, AORIG, TEMP
  1345. #endif
  1346. MOV c31, a1
  1347. MOV c41, a1
  1348. #if defined(LT) || defined(RN)
  1349. sub.d TEMP, K, KK
  1350. slli.d L, TEMP, 1 + BASE_SHIFT
  1351. slli.d TEMP, TEMP, 3 + BASE_SHIFT
  1352. add.d AO, AO, L
  1353. add.d BO, BO, TEMP
  1354. #endif
  1355. #ifdef LT
  1356. addi.d KK, KK, 2
  1357. #endif
  1358. #ifdef LN
  1359. addi.d KK, KK, -2
  1360. #endif
  1361. addi.d I, I, -1
  1362. MOV c51, a1
  1363. MOV c61, a1
  1364. blt $r0, I, .L11
  /*
   * .L29: bottom of the outer J loop whose body starts at .L10 (outside this
   * excerpt).  Moves B and KK on by one panel, then repeats while J is positive.
   * The shift of 3 and the KK step of 8 match the eight CO1..CO8 column
   * pointers used by the code just above.
   */
  1365. .align 3
  1366. .L29:
  1367. #ifdef LN
  /* LN: B += K * 2^(3 + BASE_SHIFT) bytes */
  1368. slli.d TEMP, K, 3 + BASE_SHIFT
  1369. add.d B, B, TEMP
  1370. #endif
  1371. #if defined(LT) || defined(RN)
  /* LT/RN: BO has already been walked forward; adopt it as the new B */
  1372. move B, BO
  1373. #endif
  1374. #ifdef RN
  1375. addi.d KK, KK, 8
  1376. #endif
  1377. #ifdef RT
  1378. addi.d KK, KK, -8
  1379. #endif
  /* loop back while 0 is less than J */
  1380. blt $r0, J, .L10
  /*
   * .L30: panel selected by bit 2 of N (N & 4).  Sets up four column
   * pointers CO1..CO4, initialises KK / AORIG / AO according to the
   * LN/LT/RN/RT variant, then handles the single row selected by M & 1.
   * Branches to .L50 when the N & 4 bit is clear and to .L40 when the
   * M & 1 bit is clear.
   */
  1381. .align 3
  1382. .L30:
  1383. andi J, N, 4
  1384. move AO, A
  1385. bge $r0, J, .L50
  1386. #ifdef RT
  /* RT: step B back by K * 2^(2 + BASE_SHIFT) bytes and C back by LDC * 4 */
  1387. slli.d TEMP, K, 2 + BASE_SHIFT
  1388. sub.d B, B, TEMP
  1389. slli.d TEMP, LDC, 2
  1390. sub.d C, C, TEMP
  1391. #endif
  /* CO1..CO4 = C, C + LDC, C + 2*LDC, C + 3*LDC; accumulators are cleared in between */
  1392. move CO1, C
  /* MTC is a macro defined elsewhere; presumably moves integer zero ($r0) into c11 */
  1393. MTC c11, $r0
  1394. add.d CO2, C, LDC
  1395. add.d CO3, CO2, LDC
  1396. MOV c21, c11
  1397. add.d CO4, CO3, LDC
  1398. MOV c31, c11
  1399. #ifdef LN
  1400. add.d KK, M, OFFSET
  1401. #endif
  1402. #ifdef LT
  1403. move KK, OFFSET
  1404. #endif
  1405. #if defined(LN) || defined(RT)
  1406. move AORIG, A
  1407. #else
  1408. move AO, A
  1409. #endif
  1410. #ifndef RT
  /* non-RT: C now points just past the fourth column */
  1411. add.d C, CO4, LDC
  1412. #endif
  1413. andi I, M, 1
  1414. MOV c41, c11
  1415. bge $r0, I, .L40
  1416. #if defined(LT) || defined(RN)
  /* LT/RN: preload operands straight from AO and B; trip count L = KK / 4 */
  1417. LD a1, AO, 0 * SIZE
  1418. MOV c71, c11
  1419. LD a2, AO, 1 * SIZE
  1420. MOV c81, c11
  1421. LD b1, B, 0 * SIZE
  1422. LD b2, B, 1 * SIZE
  1423. LD b3, B, 2 * SIZE
  1424. LD b4, B, 3 * SIZE
  1425. LD b5, B, 4 * SIZE
  1426. LD b6, B, 8 * SIZE
  1427. LD b7, B, 12 * SIZE
  1428. srai.d L, KK, 2
  1429. move BO, B
  1430. bge $r0, L, .L45
  1431. #else
  1432. #ifdef LN
  /* LN: step AORIG back by K * 2^BASE_SHIFT bytes */
  1433. slli.d TEMP, K, BASE_SHIFT
  1434. sub.d AORIG, AORIG, TEMP
  1435. #endif
  /* LN/RT: AO = AORIG + KK * 2^BASE_SHIFT, BO = B + KK * 2^(2 + BASE_SHIFT),
     TEMP = K - KK, trip count L = TEMP / 4 */
  1436. slli.d L, KK, 0 + BASE_SHIFT
  1437. slli.d TEMP, KK, 2 + BASE_SHIFT
  1438. add.d AO, AORIG, L
  1439. add.d BO, B, TEMP
  1440. sub.d TEMP, K, KK
  1441. LD a1, AO, 0 * SIZE
  1442. MOV c71, c11
  1443. LD a2, AO, 1 * SIZE
  1444. MOV c81, c11
  1445. LD b1, BO, 0 * SIZE
  1446. LD b2, BO, 1 * SIZE
  1447. LD b3, BO, 2 * SIZE
  1448. LD b4, BO, 3 * SIZE
  1449. LD b5, BO, 4 * SIZE
  1450. LD b6, BO, 8 * SIZE
  1451. LD b7, BO, 12 * SIZE
  1452. srai.d L, TEMP, 2
  1453. bge $r0, L, .L45
  1454. #endif
  /*
   * .L42: main accumulation loop for the one-row, four-column tile.
   * Each pass issues 16 MADDs into c11/c21/c31/c41, advances AO by
   * 4 * SIZE and BO by 16 * SIZE, and decrements L; it repeats while L is
   * positive.  Loads are interleaved with the arithmetic and refill the
   * registers for the next pass, so the statement order matters.
   * MADD is a macro defined elsewhere; presumably "MADD d, x, y, z" computes
   * d = x * y + z -- confirm against the macro definition.
   */
  1455. .align 3
  1456. .L42:
  1457. MADD c11, b1, a1, c11
  1458. LD b1, BO, 16 * SIZE
  1459. MADD c21, b2, a1, c21
  1460. LD b2, BO, 5 * SIZE
  1461. MADD c31, b3, a1, c31
  1462. LD b3, BO, 6 * SIZE
  1463. MADD c41, b4, a1, c41
  1464. LD b4, BO, 7 * SIZE
  1465. LD a1, AO, 4 * SIZE
  1466. addi.d L, L, -1
  1467. MADD c11, b5, a2, c11
  1468. LD b5, BO, 20 * SIZE
  1469. MADD c21, b2, a2, c21
  1470. LD b2, BO, 9 * SIZE
  1471. MADD c31, b3, a2, c31
  1472. LD b3, BO, 10 * SIZE
  1473. MADD c41, b4, a2, c41
  1474. LD b4, BO, 11 * SIZE
  1475. LD a2, AO, 2 * SIZE
  /* AO moves on here; the later AO offsets (-1, 1) are relative to the new AO */
  1476. addi.d AO, AO, 4 * SIZE
  1477. MADD c11, b6, a2, c11
  1478. LD b6, BO, 24 * SIZE
  1479. MADD c21, b2, a2, c21
  1480. LD b2, BO, 13 * SIZE
  1481. MADD c31, b3, a2, c31
  1482. LD b3, BO, 14 * SIZE
  1483. MADD c41, b4, a2, c41
  1484. LD b4, BO, 15 * SIZE
  1485. LD a2, AO, -1 * SIZE
  /* BO moves on here; the remaining BO offsets are relative to the new BO */
  1486. addi.d BO, BO, 16 * SIZE
  1487. MADD c11, b7, a2, c11
  1488. LD b7, BO, 12 * SIZE
  1489. MADD c21, b2, a2, c21
  1490. LD b2, BO, 1 * SIZE
  1491. MADD c31, b3, a2, c31
  1492. LD b3, BO, 2 * SIZE
  1493. MADD c41, b4, a2, c41
  1494. LD b4, BO, 3 * SIZE
  1495. LD a2, AO, 1 * SIZE
  1496. blt $r0, L, .L42
  /*
   * .L45 / .L46: remainder loop for the one-row, four-column tile.
   * L is the low two bits of the trip-count source (KK for LT/RN, TEMP
   * otherwise); when it is zero control skips to .L48.  Each pass of .L46
   * issues four MADDs and advances AO by 1 * SIZE and BO by 4 * SIZE.
   */
  1497. .align 3
  1498. .L45:
  1499. #if defined(LT) || defined(RN)
  1500. andi L, KK, 3
  1501. #else
  1502. andi L, TEMP, 3
  1503. #endif
  1504. bge $r0, L, .L48
  1505. .align 3
  1506. .L46:
  1507. MADD c11, b1, a1, c11
  1508. LD b1, BO, 4 * SIZE
  1509. MADD c21, b2, a1, c21
  1510. LD b2, BO, 5 * SIZE
  1511. MADD c31, b3, a1, c31
  1512. LD b3, BO, 6 * SIZE
  1513. MADD c41, b4, a1, c41
  1514. LD a1, AO, 1 * SIZE
  1515. LD b4, BO, 7 * SIZE
  1516. addi.d L, L, -1
  1517. addi.d AO, AO, 1 * SIZE
  /* NOTE(review): self-move of a2 has no visible data effect; possibly kept
     as scheduling filler -- confirm before removing */
  1518. MOV a2, a2
  1519. addi.d BO, BO, 4 * SIZE
  1520. blt $r0, L, .L46
  /*
   * .L48: solve and store step for the one-row, four-column tile.
   * Sequence: (1) for LN/RT re-derive AO and BO from KK, (2) replace each
   * accumulator by (loaded value - accumulator), (3) apply the triangular
   * coefficients for the active variant, (4) write the results back to the
   * packed buffer (BO for LN/LT, AO for RN/RT) and to C through CO1..CO4,
   * (5) advance pointers and KK for the next tile.
   * SUB / MUL / NMSUB are macros defined elsewhere; presumably
   * "SUB d, x, y" is d = x - y and "NMSUB d, x, y, z" is d = z - x * y --
   * confirm against the macro definitions.
   */
  1521. .L48:
  1522. #if defined(LN) || defined(RT)
  1523. #ifdef LN
  1524. addi.d TEMP, KK, -1
  1525. #else
  1526. addi.d TEMP, KK, -4
  1527. #endif
  /* AO = AORIG + TEMP * 2^BASE_SHIFT, BO = B + TEMP * 2^(2 + BASE_SHIFT) */
  1528. slli.d L, TEMP, 0 + BASE_SHIFT
  1529. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1530. add.d AO, AORIG, L
  1531. add.d BO, B, TEMP
  1532. #endif
  1533. #if defined(LN) || defined(LT)
  /* LN/LT: the four values to be corrected come from BO */
  1534. LD b1, BO, 0 * SIZE
  1535. LD b2, BO, 1 * SIZE
  1536. LD b3, BO, 2 * SIZE
  1537. LD b4, BO, 3 * SIZE
  1538. SUB c11, b1, c11
  1539. SUB c21, b2, c21
  1540. SUB c31, b3, c31
  1541. SUB c41, b4, c41
  1542. #else
  /* RN/RT: the four values to be corrected come from AO */
  1543. LD b1, AO, 0 * SIZE
  1544. LD b2, AO, 1 * SIZE
  1545. LD b3, AO, 2 * SIZE
  1546. LD b4, AO, 3 * SIZE
  1547. SUB c11, b1, c11
  1548. SUB c21, b2, c21
  1549. SUB c31, b3, c31
  1550. SUB c41, b4, c41
  1551. #endif
  1552. #if defined(LN) || defined(LT)
  /* LN/LT: scale all four results by the single element at AO[0]
     (presumably a pre-inverted diagonal entry -- TODO confirm with the packing code) */
  1553. LD b1, AO, 0 * SIZE
  1554. MUL c11, b1, c11
  1555. MUL c21, b1, c21
  1556. MUL c31, b1, c31
  1557. MUL c41, b1, c41
  1558. #endif
  1559. #ifdef RN
  /* RN: substitution in increasing column order, reading BO offsets
     0..3, 5..7, 10..11 and 15 */
  1560. LD b1, BO, 0 * SIZE
  1561. LD b2, BO, 1 * SIZE
  1562. LD b3, BO, 2 * SIZE
  1563. LD b4, BO, 3 * SIZE
  1564. MUL c11, b1, c11
  1565. NMSUB c21, c11, b2, c21
  1566. NMSUB c31, c11, b3, c31
  1567. NMSUB c41, c11, b4, c41
  1568. LD b2, BO, 5 * SIZE
  1569. LD b3, BO, 6 * SIZE
  1570. LD b4, BO, 7 * SIZE
  1571. MUL c21, b2, c21
  1572. NMSUB c31, c21, b3, c31
  1573. NMSUB c41, c21, b4, c41
  1574. LD b3, BO, 10 * SIZE
  1575. LD b4, BO, 11 * SIZE
  1576. MUL c31, b3, c31
  1577. NMSUB c41, c31, b4, c41
  1578. LD b4, BO, 15 * SIZE
  1579. MUL c41, b4, c41
  1580. #endif
  1581. #ifdef RT
  /* RT: substitution in decreasing column order, reading BO offsets
     15..12, 10..8, 5..4 and 0 */
  1582. LD b5, BO, 15 * SIZE
  1583. LD b6, BO, 14 * SIZE
  1584. LD b7, BO, 13 * SIZE
  1585. LD b8, BO, 12 * SIZE
  1586. MUL c41, b5, c41
  1587. NMSUB c31, c41, b6, c31
  1588. NMSUB c21, c41, b7, c21
  1589. NMSUB c11, c41, b8, c11
  1590. LD b6, BO, 10 * SIZE
  1591. LD b7, BO, 9 * SIZE
  1592. LD b8, BO, 8 * SIZE
  1593. MUL c31, b6, c31
  1594. NMSUB c21, c31, b7, c21
  1595. NMSUB c11, c31, b8, c11
  1596. LD b7, BO, 5 * SIZE
  1597. LD b8, BO, 4 * SIZE
  1598. MUL c21, b7, c21
  1599. NMSUB c11, c21, b8, c11
  1600. LD b8, BO, 0 * SIZE
  1601. MUL c11, b8, c11
  1602. #endif
  1603. #ifdef LN
  /* LN: the column pointers are moved back by one element before the store */
  1604. addi.d CO1, CO1, -1 * SIZE
  1605. addi.d CO2, CO2, -1 * SIZE
  1606. addi.d CO3, CO3, -1 * SIZE
  1607. addi.d CO4, CO4, -1 * SIZE
  1608. #endif
  1609. #if defined(LN) || defined(LT)
  1610. ST c11, BO, 0 * SIZE
  1611. ST c21, BO, 1 * SIZE
  1612. ST c31, BO, 2 * SIZE
  1613. ST c41, BO, 3 * SIZE
  1614. #else
  1615. ST c11, AO, 0 * SIZE
  1616. ST c21, AO, 1 * SIZE
  1617. ST c31, AO, 2 * SIZE
  1618. ST c41, AO, 3 * SIZE
  1619. #endif
  /* one result per column goes to C */
  1620. ST c11, CO1, 0 * SIZE
  1621. ST c21, CO2, 0 * SIZE
  1622. ST c31, CO3, 0 * SIZE
  1623. ST c41, CO4, 0 * SIZE
  /* reset accumulators for the code that follows */
  1624. MTC c11, $r0
  1625. #ifndef LN
  1626. addi.d CO1, CO1, 1 * SIZE
  1627. addi.d CO2, CO2, 1 * SIZE
  1628. addi.d CO3, CO3, 1 * SIZE
  1629. addi.d CO4, CO4, 1 * SIZE
  1630. #endif
  1631. MOV c21, c11
  1632. #ifdef RT
  1633. slli.d TEMP, K, BASE_SHIFT
  1634. add.d AORIG, AORIG, TEMP
  1635. #endif
  1636. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps in both packed buffers */
  1637. sub.d TEMP, K, KK
  1638. slli.d L, TEMP, 0 + BASE_SHIFT
  1639. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1640. add.d AO, AO, L
  1641. add.d BO, BO, TEMP
  1642. #endif
  1643. MOV c31, c11
  1644. #ifdef LT
  1645. addi.d KK, KK, 1
  1646. #endif
  1647. #ifdef LN
  1648. addi.d KK, KK, -1
  1649. #endif
  /*
   * .L40 / .L31: two-row tiles of the four-column panel.  I = M / 2 is the
   * number of tiles; when it is zero control goes to .L49.  .L31 is the
   * top of the per-tile loop: it positions AO/BO, clears the second-row
   * accumulators c12..c42 from c11, preloads operands and sets the unrolled
   * trip count L (KK / 4 for LT/RN, (K - KK) / 4 otherwise).
   */
  1650. .align 3
  1651. .L40:
  1652. srai.d I, M, 1
  1653. MOV c61, c11
  1654. MOV c41, c11
  1655. bge $r0, I, .L49
  1656. .L31:
  1657. #if defined(LT) || defined(RN)
  1658. LD a1, AO, 0 * SIZE
  1659. LD a3, AO, 4 * SIZE
  1660. LD b1, B, 0 * SIZE
  1661. MOV c12, c11
  1662. LD b2, B, 1 * SIZE
  1663. MOV c22, c11
  1664. LD b3, B, 2 * SIZE
  1665. MOV c32, c11
  1666. LD b4, B, 3 * SIZE
  1667. MOV c42, c11
  1668. LD b5, B, 4 * SIZE
  1669. srai.d L, KK, 2
  1670. LD b6, B, 8 * SIZE
  1671. LD b7, B, 12 * SIZE
  1672. move BO, B
  1673. bge $r0, L, .L35
  1674. #else
  1675. #ifdef LN
  /* LN: step AORIG back by K * 2^(1 + BASE_SHIFT) bytes */
  1676. slli.d TEMP, K, 1 + BASE_SHIFT
  1677. sub.d AORIG, AORIG, TEMP
  1678. #endif
  /* AO = AORIG + KK * 2^(1 + BASE_SHIFT), BO = B + KK * 2^(2 + BASE_SHIFT),
     TEMP = K - KK */
  1679. slli.d L, KK, 1 + BASE_SHIFT
  1680. slli.d TEMP, KK, 2 + BASE_SHIFT
  1681. add.d AO, AORIG, L
  1682. add.d BO, B, TEMP
  1683. sub.d TEMP, K, KK
  1684. LD a1, AO, 0 * SIZE
  1685. LD a3, AO, 4 * SIZE
  1686. LD b1, BO, 0 * SIZE
  1687. MOV c12, c11
  1688. LD b2, BO, 1 * SIZE
  1689. MOV c22, c11
  1690. LD b3, BO, 2 * SIZE
  1691. MOV c32, c11
  1692. LD b4, BO, 3 * SIZE
  1693. MOV c42, c11
  1694. LD b5, BO, 4 * SIZE
  1695. srai.d L, TEMP, 2
  1696. LD b6, BO, 8 * SIZE
  1697. LD b7, BO, 12 * SIZE
  1698. bge $r0, L, .L35
  1699. #endif
  /*
   * .L32: main accumulation loop for the two-row, four-column tile.
   * Each pass issues 32 MADDs: c11/c21/c31/c41 accumulate products with the
   * first-row operand (a1 or a3) and c12/c22/c32/c42 with the second-row
   * operand (a2).  AO advances by 8 * SIZE and BO by 16 * SIZE per pass;
   * the loop repeats while L is positive.  Loads are interleaved to refill
   * registers for later steps, so the statement order matters.
   */
  1700. .align 3
  1701. .L32:
  1702. MADD c11, b1, a1, c11
  1703. LD a2, AO, 1 * SIZE
  1704. MADD c21, b2, a1, c21
  1705. addi.d L, L, -1
  1706. MADD c31, b3, a1, c31
  1707. MADD c41, b4, a1, c41
  1708. LD a1, AO, 2 * SIZE
  1709. MADD c12, b1, a2, c12
  1710. LD b1, BO, 16 * SIZE
  1711. MADD c22, b2, a2, c22
  1712. LD b2, BO, 5 * SIZE
  1713. MADD c32, b3, a2, c32
  1714. LD b3, BO, 6 * SIZE
  1715. MADD c42, b4, a2, c42
  1716. LD b4, BO, 7 * SIZE
  1717. MADD c11, b5, a1, c11
  1718. LD a2, AO, 3 * SIZE
  1719. MADD c21, b2, a1, c21
  1720. MADD c31, b3, a1, c31
  1721. MADD c41, b4, a1, c41
  1722. LD a1, AO, 8 * SIZE
  1723. MADD c12, b5, a2, c12
  1724. LD b5, BO, 20 * SIZE
  1725. MADD c22, b2, a2, c22
  1726. LD b2, BO, 9 * SIZE
  1727. MADD c32, b3, a2, c32
  1728. LD b3, BO, 10 * SIZE
  1729. MADD c42, b4, a2, c42
  1730. LD b4, BO, 11 * SIZE
  1731. MADD c11, b6, a3, c11
  1732. LD a2, AO, 5 * SIZE
  1733. MADD c21, b2, a3, c21
  1734. MADD c31, b3, a3, c31
  1735. MADD c41, b4, a3, c41
  1736. LD a3, AO, 6 * SIZE
  1737. MADD c12, b6, a2, c12
  1738. LD b6, BO, 24 * SIZE
  1739. MADD c22, b2, a2, c22
  1740. LD b2, BO, 13 * SIZE
  1741. MADD c32, b3, a2, c32
  1742. LD b3, BO, 14 * SIZE
  1743. MADD c42, b4, a2, c42
  1744. LD b4, BO, 15 * SIZE
  1745. MADD c11, b7, a3, c11
  1746. LD a2, AO, 7 * SIZE
  1747. MADD c21, b2, a3, c21
  /* AO and BO move on here; the offsets that follow are relative to the new values */
  1748. addi.d AO, AO, 8 * SIZE
  1749. MADD c31, b3, a3, c31
  1750. addi.d BO, BO, 16 * SIZE
  1751. MADD c41, b4, a3, c41
  1752. LD a3, AO, 4 * SIZE
  1753. MADD c12, b7, a2, c12
  1754. LD b7, BO, 12 * SIZE
  1755. MADD c22, b2, a2, c22
  1756. LD b2, BO, 1 * SIZE
  1757. MADD c32, b3, a2, c32
  1758. LD b3, BO, 2 * SIZE
  1759. MADD c42, b4, a2, c42
  1760. LD b4, BO, 3 * SIZE
  1761. blt $r0, L, .L32
  /*
   * .L35 / .L36: remainder loop for the two-row, four-column tile.
   * L is the low two bits of KK (LT/RN) or TEMP (otherwise); zero skips to
   * .L38.  Each pass of .L36 issues eight MADDs and advances AO by
   * 2 * SIZE and BO by 4 * SIZE.
   */
  1762. .align 3
  1763. .L35:
  1764. #if defined(LT) || defined(RN)
  1765. andi L, KK, 3
  1766. #else
  1767. andi L, TEMP, 3
  1768. #endif
  1769. bge $r0, L, .L38
  1770. .align 3
  1771. .L36:
  1772. MADD c11, b1, a1, c11
  1773. LD a2, AO, 1 * SIZE
  1774. MADD c21, b2, a1, c21
  1775. addi.d L, L, -1
  1776. MADD c31, b3, a1, c31
  /* AO moves on before the a1 reload, so that reload uses offset 0 */
  1777. addi.d AO, AO, 2 * SIZE
  1778. MADD c41, b4, a1, c41
  1779. LD a1, AO, 0 * SIZE
  1780. MADD c12, b1, a2, c12
  1781. LD b1, BO, 4 * SIZE
  1782. MADD c22, b2, a2, c22
  1783. LD b2, BO, 5 * SIZE
  1784. MADD c32, b3, a2, c32
  1785. LD b3, BO, 6 * SIZE
  1786. MADD c42, b4, a2, c42
  1787. LD b4, BO, 7 * SIZE
  1788. addi.d BO, BO, 4 * SIZE
  1789. blt $r0, L, .L36
  /*
   * .L38: solve and store step for the two-row, four-column tile, followed
   * by the bottom of the I loop (back to .L31 while I is positive).
   * Sequence: re-derive AO/BO for LN/RT, replace accumulators by
   * (loaded value - accumulator), apply the variant-specific substitution,
   * store to the packed buffer (BO for LN/LT, AO for RN/RT) and to C, then
   * advance pointers/KK and clear c11..c41.
   * SUB / MUL / NMSUB are macros defined elsewhere; presumably
   * "NMSUB d, x, y, z" is d = z - x * y -- confirm against the definitions.
   */
  1790. .L38:
  1791. #if defined(LN) || defined(RT)
  1792. #ifdef LN
  1793. addi.d TEMP, KK, -2
  1794. #else
  1795. addi.d TEMP, KK, -4
  1796. #endif
  /* AO = AORIG + TEMP * 2^(1 + BASE_SHIFT), BO = B + TEMP * 2^(2 + BASE_SHIFT) */
  1797. slli.d L, TEMP, 1 + BASE_SHIFT
  1798. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1799. add.d AO, AORIG, L
  1800. add.d BO, B, TEMP
  1801. #endif
  1802. #if defined(LN) || defined(LT)
  /* LN/LT: BO holds the first-row values at 0..3 and the second-row values at 4..7 */
  1803. LD b1, BO, 0 * SIZE
  1804. LD b2, BO, 1 * SIZE
  1805. LD b3, BO, 2 * SIZE
  1806. LD b4, BO, 3 * SIZE
  1807. LD b5, BO, 4 * SIZE
  1808. LD b6, BO, 5 * SIZE
  1809. LD b7, BO, 6 * SIZE
  1810. LD b8, BO, 7 * SIZE
  1811. SUB c11, b1, c11
  1812. SUB c21, b2, c21
  1813. SUB c31, b3, c31
  1814. SUB c41, b4, c41
  1815. SUB c12, b5, c12
  1816. SUB c22, b6, c22
  1817. SUB c32, b7, c32
  1818. SUB c42, b8, c42
  1819. #else
  /* RN/RT: AO holds the values interleaved by row (c11, c12, c21, c22, ...) */
  1820. LD b1, AO, 0 * SIZE
  1821. LD b2, AO, 1 * SIZE
  1822. LD b3, AO, 2 * SIZE
  1823. LD b4, AO, 3 * SIZE
  1824. LD b5, AO, 4 * SIZE
  1825. LD b6, AO, 5 * SIZE
  1826. LD b7, AO, 6 * SIZE
  1827. LD b8, AO, 7 * SIZE
  1828. SUB c11, b1, c11
  1829. SUB c12, b2, c12
  1830. SUB c21, b3, c21
  1831. SUB c22, b4, c22
  1832. SUB c31, b5, c31
  1833. SUB c32, b6, c32
  1834. SUB c41, b7, c41
  1835. SUB c42, b8, c42
  1836. #endif
  1837. #ifdef LN
  /* LN: second row first (scale by AO[3]), eliminate with AO[2], then
     scale the first row by AO[0] */
  1838. LD b1, AO, 3 * SIZE
  1839. LD b2, AO, 2 * SIZE
  1840. LD b3, AO, 0 * SIZE
  1841. MUL c12, b1, c12
  1842. MUL c22, b1, c22
  1843. MUL c32, b1, c32
  1844. MUL c42, b1, c42
  1845. NMSUB c11, c12, b2, c11
  1846. NMSUB c21, c22, b2, c21
  1847. NMSUB c31, c32, b2, c31
  1848. NMSUB c41, c42, b2, c41
  1849. MUL c11, b3, c11
  1850. MUL c21, b3, c21
  1851. MUL c31, b3, c31
  1852. MUL c41, b3, c41
  1853. #endif
  1854. #ifdef LT
  /* LT: first row first (scale by AO[0]), eliminate with AO[1], then
     scale the second row by AO[3] */
  1855. LD b1, AO, 0 * SIZE
  1856. LD b2, AO, 1 * SIZE
  1857. LD b3, AO, 3 * SIZE
  1858. MUL c11, b1, c11
  1859. MUL c21, b1, c21
  1860. MUL c31, b1, c31
  1861. MUL c41, b1, c41
  1862. NMSUB c12, c11, b2, c12
  1863. NMSUB c22, c21, b2, c22
  1864. NMSUB c32, c31, b2, c32
  1865. NMSUB c42, c41, b2, c42
  1866. MUL c12, b3, c12
  1867. MUL c22, b3, c22
  1868. MUL c32, b3, c32
  1869. MUL c42, b3, c42
  1870. #endif
  1871. #ifdef RN
  /* RN: substitution in increasing column order for both rows,
     reading BO offsets 0..3, 5..7, 10..11 and 15 */
  1872. LD b1, BO, 0 * SIZE
  1873. LD b2, BO, 1 * SIZE
  1874. LD b3, BO, 2 * SIZE
  1875. LD b4, BO, 3 * SIZE
  1876. MUL c11, b1, c11
  1877. MUL c12, b1, c12
  1878. NMSUB c21, c11, b2, c21
  1879. NMSUB c22, c12, b2, c22
  1880. NMSUB c31, c11, b3, c31
  1881. NMSUB c32, c12, b3, c32
  1882. NMSUB c41, c11, b4, c41
  1883. NMSUB c42, c12, b4, c42
  1884. LD b2, BO, 5 * SIZE
  1885. LD b3, BO, 6 * SIZE
  1886. LD b4, BO, 7 * SIZE
  1887. MUL c21, b2, c21
  1888. MUL c22, b2, c22
  1889. NMSUB c31, c21, b3, c31
  1890. NMSUB c32, c22, b3, c32
  1891. NMSUB c41, c21, b4, c41
  1892. NMSUB c42, c22, b4, c42
  1893. LD b3, BO, 10 * SIZE
  1894. LD b4, BO, 11 * SIZE
  1895. MUL c31, b3, c31
  1896. MUL c32, b3, c32
  1897. NMSUB c41, c31, b4, c41
  1898. NMSUB c42, c32, b4, c42
  1899. LD b4, BO, 15 * SIZE
  1900. MUL c41, b4, c41
  1901. MUL c42, b4, c42
  1902. #endif
  1903. #ifdef RT
  /* RT: substitution in decreasing column order for both rows,
     reading BO offsets 15..12, 10..8, 5..4 and 0 */
  1904. LD b5, BO, 15 * SIZE
  1905. LD b6, BO, 14 * SIZE
  1906. LD b7, BO, 13 * SIZE
  1907. LD b8, BO, 12 * SIZE
  1908. MUL c41, b5, c41
  1909. MUL c42, b5, c42
  1910. NMSUB c31, c41, b6, c31
  1911. NMSUB c32, c42, b6, c32
  1912. NMSUB c21, c41, b7, c21
  1913. NMSUB c22, c42, b7, c22
  1914. NMSUB c11, c41, b8, c11
  1915. NMSUB c12, c42, b8, c12
  1916. LD b6, BO, 10 * SIZE
  1917. LD b7, BO, 9 * SIZE
  1918. LD b8, BO, 8 * SIZE
  1919. MUL c31, b6, c31
  1920. MUL c32, b6, c32
  1921. NMSUB c21, c31, b7, c21
  1922. NMSUB c22, c32, b7, c22
  1923. NMSUB c11, c31, b8, c11
  1924. NMSUB c12, c32, b8, c12
  1925. LD b7, BO, 5 * SIZE
  1926. LD b8, BO, 4 * SIZE
  1927. MUL c21, b7, c21
  1928. MUL c22, b7, c22
  1929. NMSUB c11, c21, b8, c11
  1930. NMSUB c12, c22, b8, c12
  1931. LD b8, BO, 0 * SIZE
  1932. MUL c11, b8, c11
  1933. MUL c12, b8, c12
  1934. #endif
  1935. #ifdef LN
  /* LN: the column pointers are moved back by two elements before the store */
  1936. addi.d CO1, CO1, -2 * SIZE
  1937. addi.d CO2, CO2, -2 * SIZE
  1938. addi.d CO3, CO3, -2 * SIZE
  1939. addi.d CO4, CO4, -2 * SIZE
  1940. #endif
  1941. #if defined(LN) || defined(LT)
  1942. ST c11, BO, 0 * SIZE
  1943. ST c21, BO, 1 * SIZE
  1944. ST c31, BO, 2 * SIZE
  1945. ST c41, BO, 3 * SIZE
  1946. ST c12, BO, 4 * SIZE
  1947. ST c22, BO, 5 * SIZE
  1948. ST c32, BO, 6 * SIZE
  1949. ST c42, BO, 7 * SIZE
  1950. #else
  1951. ST c11, AO, 0 * SIZE
  1952. ST c12, AO, 1 * SIZE
  1953. ST c21, AO, 2 * SIZE
  1954. ST c22, AO, 3 * SIZE
  1955. ST c31, AO, 4 * SIZE
  1956. ST c32, AO, 5 * SIZE
  1957. ST c41, AO, 6 * SIZE
  1958. ST c42, AO, 7 * SIZE
  1959. #endif
  /* two results per column go to C */
  1960. ST c11, CO1, 0 * SIZE
  1961. ST c12, CO1, 1 * SIZE
  1962. ST c21, CO2, 0 * SIZE
  1963. ST c22, CO2, 1 * SIZE
  1964. ST c31, CO3, 0 * SIZE
  1965. ST c32, CO3, 1 * SIZE
  1966. ST c41, CO4, 0 * SIZE
  1967. ST c42, CO4, 1 * SIZE
  1968. #ifndef LN
  1969. addi.d CO1, CO1, 2 * SIZE
  1970. addi.d CO2, CO2, 2 * SIZE
  1971. addi.d CO3, CO3, 2 * SIZE
  1972. addi.d CO4, CO4, 2 * SIZE
  1973. #endif
  1974. #ifdef RT
  1975. slli.d TEMP, K, 1 + BASE_SHIFT
  1976. add.d AORIG, AORIG, TEMP
  1977. #endif
  1978. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps in both packed buffers */
  1979. sub.d TEMP, K, KK
  1980. slli.d L, TEMP, 1 + BASE_SHIFT
  1981. slli.d TEMP, TEMP, 2 + BASE_SHIFT
  1982. add.d AO, AO, L
  1983. add.d BO, BO, TEMP
  1984. #endif
  1985. #ifdef LT
  1986. addi.d KK, KK, 2
  1987. #endif
  1988. #ifdef LN
  1989. addi.d KK, KK, -2
  1990. #endif
  /* clear c11..c41 for the next tile, count the tile, and loop */
  1991. MTC a1, $r0
  1992. MOV c11, a1
  1993. MOV c21, a1
  1994. MOV c31, a1
  1995. addi.d I, I, -1
  1996. MOV c41, c11
  1997. blt $r0, I, .L31
  /*
   * .L49: end of the four-column panel.  Moves B and KK on by one panel
   * (shift of 2 and KK step of 4, matching CO1..CO4) and falls through to
   * .L50.
   */
  1998. .align 3
  1999. .L49:
  2000. #ifdef LN
  /* LN: B += K * 2^(2 + BASE_SHIFT) bytes */
  2001. slli.d TEMP, K, 2 + BASE_SHIFT
  2002. add.d B, B, TEMP
  2003. #endif
  2004. #if defined(LT) || defined(RN)
  /* LT/RN: BO has already been walked forward; adopt it as the new B */
  2005. move B, BO
  2006. #endif
  2007. #ifdef RN
  2008. addi.d KK, KK, 4
  2009. #endif
  2010. #ifdef RT
  2011. addi.d KK, KK, -4
  2012. #endif
  /*
   * .L50: panel selected by bit 1 of N (N & 2).  Sets up two column
   * pointers CO1/CO2, initialises KK / AORIG / AO for the active variant,
   * then handles the single row selected by M & 1.  Branches to .L70
   * (outside this excerpt) when the N & 2 bit is clear and to .L60 when the
   * M & 1 bit is clear.
   */
  2013. .align 3
  2014. .L50:
  2015. andi J, N, 2
  2016. #ifdef RT
  /* RT: TEMP is computed before the branch and consumed just after it */
  2017. slli.d TEMP, K, 1 + BASE_SHIFT
  2018. #else
  2019. move AO, A
  2020. #endif
  2021. bge $r0, J, .L70
  2022. #ifdef RT
  /* RT: step B back by K * 2^(1 + BASE_SHIFT) bytes and C back by LDC * 2 */
  2023. sub.d B, B, TEMP
  2024. slli.d TEMP, LDC, 1
  2025. sub.d C, C, TEMP
  2026. #endif
  /* NOTE(review): AO is set to A again here and once more in the #else
     branch below; the repeats look redundant but are harmless */
  2027. move AO, A
  2028. move CO1, C
  2029. add.d CO2, C, LDC
  2030. #ifdef LN
  2031. add.d KK, M, OFFSET
  2032. #endif
  2033. #ifdef LT
  2034. move KK, OFFSET
  2035. #endif
  2036. #if defined(LN) || defined(RT)
  2037. move AORIG, A
  2038. #else
  2039. move AO, A
  2040. #endif
  2041. #ifndef RT
  /* non-RT: C now points just past the second column */
  2042. add.d C, CO2, LDC
  2043. #endif
  2044. andi I, M, 1
  2045. bge $r0, I, .L60
  2046. #if defined(LT) || defined(RN)
  /* LT/RN: trip count L = KK / 4; clear c11..c41 and preload operands */
  2047. srai.d L, KK, 2
  2048. LD a1, AO, 0 * SIZE
  2049. MTC c11, $r0
  2050. LD a2, AO, 1 * SIZE
  2051. MOV c21, c11
  2052. LD a3, AO, 2 * SIZE
  2053. MOV c31, c11
  2054. LD a4, AO, 3 * SIZE
  2055. MOV c41, c11
  2056. LD b1, B, 0 * SIZE
  2057. LD b2, B, 1 * SIZE
  2058. LD b3, B, 2 * SIZE
  2059. LD b4, B, 3 * SIZE
  2060. LD b5, B, 4 * SIZE
  2061. LD b6, B, 8 * SIZE
  2062. LD b7, B, 12 * SIZE
  2063. move BO, B
  2064. bge $r0, L, .L65
  2065. #else
  2066. #ifdef LN
  /* LN: step AORIG back by K * 2^BASE_SHIFT bytes */
  2067. slli.d TEMP, K, BASE_SHIFT
  2068. sub.d AORIG, AORIG, TEMP
  2069. #endif
  /* AO = AORIG + KK * 2^BASE_SHIFT, BO = B + KK * 2^(1 + BASE_SHIFT),
     TEMP = K - KK, trip count L = TEMP / 4 */
  2070. slli.d L, KK, 0 + BASE_SHIFT
  2071. slli.d TEMP, KK, 1 + BASE_SHIFT
  2072. add.d AO, AORIG, L
  2073. add.d BO, B, TEMP
  2074. sub.d TEMP, K, KK
  2075. srai.d L, TEMP, 2
  2076. LD a1, AO, 0 * SIZE
  2077. MTC c11, $r0
  2078. LD a2, AO, 1 * SIZE
  2079. MOV c21, c11
  2080. LD a3, AO, 2 * SIZE
  2081. MOV c31, c11
  2082. LD a4, AO, 3 * SIZE
  2083. MOV c41, c11
  2084. LD b1, BO, 0 * SIZE
  2085. LD b2, BO, 1 * SIZE
  2086. LD b3, BO, 2 * SIZE
  2087. LD b4, BO, 3 * SIZE
  2088. LD b5, BO, 4 * SIZE
  2089. LD b6, BO, 8 * SIZE
  2090. LD b7, BO, 12 * SIZE
  2091. bge $r0, L, .L65
  2092. #endif
  /*
   * .L62: main accumulation loop for the one-row, two-column tile.
   * Each pass consumes four A values (a1..a4) and eight B values: products
   * with a1/a3 accumulate into c11/c21 and products with a2/a4 into
   * c31/c41 (the two pairs are summed later at .L68).  AO advances by
   * 4 * SIZE and BO by 8 * SIZE per pass; repeats while L is positive.
   */
  2093. .align 3
  2094. .L62:
  2095. MADD c11, b1, a1, c11
  2096. LD b1, BO, 4 * SIZE
  2097. MADD c21, b2, a1, c21
  2098. LD b2, BO, 5 * SIZE
  2099. MADD c31, b3, a2, c31
  2100. LD b3, BO, 6 * SIZE
  2101. MADD c41, b4, a2, c41
  2102. LD b4, BO, 7 * SIZE
  2103. LD a1, AO, 4 * SIZE
  2104. LD a2, AO, 5 * SIZE
  2105. MADD c11, b1, a3, c11
  2106. LD b1, BO, 8 * SIZE
  2107. MADD c21, b2, a3, c21
  2108. LD b2, BO, 9 * SIZE
  2109. MADD c31, b3, a4, c31
  2110. LD b3, BO, 10 * SIZE
  2111. MADD c41, b4, a4, c41
  2112. LD b4, BO, 11 * SIZE
  2113. LD a3, AO, 6 * SIZE
  2114. LD a4, AO, 7 * SIZE
  2115. addi.d L, L, -1
  2116. addi.d AO, AO, 4 * SIZE
  2117. addi.d BO, BO, 8 * SIZE
  2118. blt $r0, L, .L62
  /*
   * .L65 / .L66: remainder loop for the one-row, two-column tile.
   * L is the low two bits of KK (LT/RN) or TEMP (otherwise); zero skips to
   * .L68.  Each pass of .L66 issues two MADDs and advances AO by 1 * SIZE
   * and BO by 2 * SIZE.
   */
  2119. .align 3
  2120. .L65:
  2121. #if defined(LT) || defined(RN)
  2122. andi L, KK, 3
  2123. #else
  2124. andi L, TEMP, 3
  2125. #endif
  2126. bge $r0, L, .L68
  2127. .align 3
  2128. .L66:
  2129. MADD c11, b1, a1, c11
  2130. LD b1, BO, 2 * SIZE
  2131. MADD c21, b2, a1, c21
  2132. LD b2, BO, 3 * SIZE
  2133. LD a1, AO, 1 * SIZE
  2134. addi.d L, L, -1
  2135. addi.d AO, AO, 1 * SIZE
  2136. addi.d BO, BO, 2 * SIZE
  2137. blt $r0, L, .L66
  /*
   * .L68: solve and store step for the one-row, two-column tile.
   * First folds the secondary accumulators (c31, c41) into c11 and c21,
   * then: re-derive AO/BO for LN/RT, replace accumulators by
   * (loaded value - accumulator), apply the variant-specific coefficients,
   * store to the packed buffer (BO for LN/LT, AO for RN/RT) and to C, and
   * advance pointers and KK.
   * ADD / SUB / MUL / NMSUB are macros defined elsewhere; presumably
   * "NMSUB d, x, y, z" is d = z - x * y -- confirm against the definitions.
   */
  2138. .L68:
  2139. ADD c11, c11, c31
  2140. ADD c21, c21, c41
  2141. #if defined(LN) || defined(RT)
  2142. #ifdef LN
  2143. addi.d TEMP, KK, -1
  2144. #else
  2145. addi.d TEMP, KK, -2
  2146. #endif
  /* AO = AORIG + TEMP * 2^BASE_SHIFT, BO = B + TEMP * 2^(1 + BASE_SHIFT) */
  2147. slli.d L, TEMP, 0 + BASE_SHIFT
  2148. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2149. add.d AO, AORIG, L
  2150. add.d BO, B, TEMP
  2151. #endif
  2152. #if defined(LN) || defined(LT)
  2153. LD b1, BO, 0 * SIZE
  2154. LD b2, BO, 1 * SIZE
  2155. SUB c11, b1, c11
  2156. SUB c21, b2, c21
  2157. #else
  2158. LD b1, AO, 0 * SIZE
  2159. LD b2, AO, 1 * SIZE
  2160. SUB c11, b1, c11
  2161. SUB c21, b2, c21
  2162. #endif
  2163. #if defined(LN) || defined(LT)
  /* LN/LT: scale both results by the single element at AO[0]
     (presumably a pre-inverted diagonal entry -- TODO confirm with the packing code) */
  2164. LD b3, AO, 0 * SIZE
  2165. MUL c11, b3, c11
  2166. MUL c21, b3, c21
  2167. #endif
  2168. #ifdef RN
  /* RN: first column, then second, reading BO offsets 0, 1 and 3 */
  2169. LD b1, BO, 0 * SIZE
  2170. LD b2, BO, 1 * SIZE
  2171. LD b3, BO, 3 * SIZE
  2172. MUL c11, b1, c11
  2173. NMSUB c21, c11, b2, c21
  2174. MUL c21, b3, c21
  2175. #endif
  2176. #ifdef RT
  /* RT: second column, then first, reading BO offsets 3, 2 and 0 */
  2177. LD b1, BO, 3 * SIZE
  2178. LD b2, BO, 2 * SIZE
  2179. LD b3, BO, 0 * SIZE
  2180. MUL c21, b1, c21
  2181. NMSUB c11, c21, b2, c11
  2182. MUL c11, b3, c11
  2183. #endif
  2184. #ifdef LN
  /* LN: the column pointers are moved back by one element before the store */
  2185. addi.d CO1, CO1, -1 * SIZE
  2186. addi.d CO2, CO2, -1 * SIZE
  2187. #endif
  2188. #if defined(LN) || defined(LT)
  2189. ST c11, BO, 0 * SIZE
  2190. ST c21, BO, 1 * SIZE
  2191. #else
  2192. ST c11, AO, 0 * SIZE
  2193. ST c21, AO, 1 * SIZE
  2194. #endif
  2195. ST c11, CO1, 0 * SIZE
  2196. ST c21, CO2, 0 * SIZE
  2197. #ifndef LN
  2198. addi.d CO1, CO1, 1 * SIZE
  2199. addi.d CO2, CO2, 1 * SIZE
  2200. #endif
  2201. #ifdef RT
  2202. slli.d TEMP, K, 0 + BASE_SHIFT
  2203. add.d AORIG, AORIG, TEMP
  2204. #endif
  2205. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps in both packed buffers */
  2206. sub.d TEMP, K, KK
  2207. slli.d L, TEMP, 0 + BASE_SHIFT
  2208. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2209. add.d AO, AO, L
  2210. add.d BO, BO, TEMP
  2211. #endif
  2212. #ifdef LT
  2213. addi.d KK, KK, 1
  2214. #endif
  2215. #ifdef LN
  2216. addi.d KK, KK, -1
  2217. #endif
  /*
   * .L60 / .L51: two-row tiles of the two-column panel.  I = M / 2 is the
   * number of tiles; when it is zero control goes to .L69 (outside this
   * excerpt).  .L51 is the top of the per-tile loop: it positions AO/BO,
   * clears c11, c21, c12 and c22, preloads operands and sets the unrolled
   * trip count L (KK / 4 for LT/RN, (K - KK) / 4 otherwise).
   */
  2218. .align 3
  2219. .L60:
  2220. srai.d I, M, 1
  2221. bge $r0, I, .L69
  2222. .L51:
  2223. #if defined(LT) || defined(RN)
  2224. LD a1, AO, 0 * SIZE
  2225. MTC c11, $r0
  2226. LD a2, AO, 1 * SIZE
  2227. MOV c21, c11
  2228. LD a5, AO, 4 * SIZE
  2229. LD b1, B, 0 * SIZE
  2230. MOV c12, c11
  2231. LD b2, B, 1 * SIZE
  2232. MOV c22, c11
  2233. LD b3, B, 2 * SIZE
  2234. LD b5, B, 4 * SIZE
  2235. srai.d L, KK, 2
  2236. LD b6, B, 8 * SIZE
  2237. LD b7, B, 12 * SIZE
  2238. move BO, B
  2239. bge $r0, L, .L55
  2240. #else
  2241. #ifdef LN
  /* LN: step AORIG back by K * 2^(1 + BASE_SHIFT) bytes */
  2242. slli.d TEMP, K, 1 + BASE_SHIFT
  2243. sub.d AORIG, AORIG, TEMP
  2244. #endif
  /* AO and BO are both offset by KK * 2^(1 + BASE_SHIFT); TEMP = K - KK */
  2245. slli.d L, KK, 1 + BASE_SHIFT
  2246. slli.d TEMP, KK, 1 + BASE_SHIFT
  2247. add.d AO, AORIG, L
  2248. add.d BO, B, TEMP
  2249. sub.d TEMP, K, KK
  2250. LD a1, AO, 0 * SIZE
  2251. MTC c11, $r0
  2252. LD a2, AO, 1 * SIZE
  2253. MOV c21, c11
  2254. LD a5, AO, 4 * SIZE
  2255. LD b1, BO, 0 * SIZE
  2256. MOV c12, c11
  2257. LD b2, BO, 1 * SIZE
  2258. MOV c22, c11
  2259. LD b3, BO, 2 * SIZE
  2260. LD b5, BO, 4 * SIZE
  2261. srai.d L, TEMP, 2
  2262. LD b6, BO, 8 * SIZE
  2263. LD b7, BO, 12 * SIZE
  2264. bge $r0, L, .L55
  2265. #endif
  /*
   * .L52: main accumulation loop for the two-row, two-column tile.
   * Each pass issues 16 MADDs into c11/c21 (first-row operands a1, a3, a5)
   * and c12/c22 (second-row operands a2, a4), then advances both AO and BO
   * by 8 * SIZE and decrements L; repeats while L is positive.  Loads are
   * interleaved to refill registers for later steps and for the next pass,
   * so the statement order matters.
   */
  2266. .align 3
  2267. .L52:
  2268. MADD c11, b1, a1, c11
  2269. LD a3, AO, 2 * SIZE
  2270. MADD c21, b2, a1, c21
  2271. LD b4, BO, 3 * SIZE
  2272. MADD c12, b1, a2, c12
  2273. LD a4, AO, 3 * SIZE
  2274. MADD c22, b2, a2, c22
  2275. LD b1, BO, 8 * SIZE
  2276. MADD c11, b3, a3, c11
  2277. LD a1, AO, 8 * SIZE
  2278. MADD c21, b4, a3, c21
  2279. LD b2, BO, 5 * SIZE
  2280. MADD c12, b3, a4, c12
  2281. LD a2, AO, 5 * SIZE
  2282. MADD c22, b4, a4, c22
  2283. LD b3, BO, 6 * SIZE
  2284. MADD c11, b5, a5, c11
  2285. LD a3, AO, 6 * SIZE
  2286. MADD c21, b2, a5, c21
  2287. LD b4, BO, 7 * SIZE
  2288. MADD c12, b5, a2, c12
  2289. LD a4, AO, 7 * SIZE
  2290. MADD c22, b2, a2, c22
  2291. LD b5, BO, 12 * SIZE
  2292. MADD c11, b3, a3, c11
  2293. LD a5, AO, 12 * SIZE
  2294. MADD c21, b4, a3, c21
  2295. LD b2, BO, 9 * SIZE
  2296. MADD c12, b3, a4, c12
  2297. LD a2, AO, 9 * SIZE
  2298. MADD c22, b4, a4, c22
  2299. LD b3, BO, 10 * SIZE
  2300. addi.d AO, AO, 8 * SIZE
  2301. addi.d L, L, -1
  2302. addi.d BO, BO, 8 * SIZE
  2303. blt $r0, L, .L52
  /*
   * .L55 / .L56: remainder loop for the two-row, two-column tile.
   * L is the low two bits of KK (LT/RN) or TEMP (otherwise); zero skips to
   * .L58.  Each pass of .L56 issues four MADDs and advances AO and BO by
   * 2 * SIZE each.
   */
  2304. .align 3
  2305. .L55:
  2306. #if defined(LT) || defined(RN)
  2307. andi L, KK, 3
  2308. #else
  2309. andi L, TEMP, 3
  2310. #endif
  2311. bge $r0, L, .L58
  2312. .align 3
  2313. .L56:
  2314. MADD c11, b1, a1, c11
  2315. LD a2, AO, 1 * SIZE
  2316. MADD c21, b2, a1, c21
  /* a1 for the next pass is fetched before AO is advanced, hence offset 2 */
  2317. LD a1, AO, 2 * SIZE
  2318. MADD c12, b1, a2, c12
  2319. LD b1, BO, 2 * SIZE
  2320. MADD c22, b2, a2, c22
  2321. LD b2, BO, 3 * SIZE
  2322. addi.d L, L, -1
  2323. addi.d AO, AO, 2 * SIZE
  2324. addi.d BO, BO, 2 * SIZE
  2325. blt $r0, L, .L56
  2326. .L58:
  /* Solve-and-store step for the four accumulators c11, c21, c12, c22.
     SUB/MUL/NMSUB/LD/ST/MTC/MOV are macros defined outside this chunk; the
     notes below assume SUB d,x,y = x - y and NMSUB d,x,y,z = z - x*y
     -- TODO confirm against the macro definitions. */
  2327. #if defined(LN) || defined(RT)
  /* Both arms of the inner #ifdef compute KK - 2. AO and BO are then
     re-based to AORIG and B plus that count shifted by 1 + BASE_SHIFT. */
  2328. #ifdef LN
  2329. addi.d TEMP, KK, -2
  2330. #else
  2331. addi.d TEMP, KK, -2
  2332. #endif
  2333. slli.d L, TEMP, 1 + BASE_SHIFT
  2334. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2335. add.d AO, AORIG, L
  2336. add.d BO, B, TEMP
  2337. #endif
  /* LN/LT read four values from BO, RN/RT read four from AO; each value is
     combined with its accumulator via SUB. The pairing differs between the
     two arms: BO order is c11, c21, c12, c22; AO order is c11, c12, c21, c22. */
  2338. #if defined(LN) || defined(LT)
  2339. LD b1, BO, 0 * SIZE
  2340. LD b2, BO, 1 * SIZE
  2341. LD b3, BO, 2 * SIZE
  2342. LD b4, BO, 3 * SIZE
  2343. SUB c11, b1, c11
  2344. SUB c21, b2, c21
  2345. SUB c12, b3, c12
  2346. SUB c22, b4, c22
  2347. #else
  2348. LD b1, AO, 0 * SIZE
  2349. LD b2, AO, 1 * SIZE
  2350. LD b3, AO, 2 * SIZE
  2351. LD b4, AO, 3 * SIZE
  2352. SUB c11, b1, c11
  2353. SUB c12, b2, c12
  2354. SUB c21, b3, c21
  2355. SUB c22, b4, c22
  2356. #endif
  /* LN: coefficients AO[3], AO[2], AO[0]. c12/c22 are scaled first, then
     folded into c11/c21 through AO[2], and c11/c21 are scaled last.
     The diagonal coefficients are applied with MUL, not a divide --
     presumably they are stored pre-inverted; verify against the packing code. */
  2357. #ifdef LN
  2358. LD b1, AO, 3 * SIZE
  2359. LD b2, AO, 2 * SIZE
  2360. LD b3, AO, 0 * SIZE
  2361. MUL c12, b1, c12
  2362. MUL c22, b1, c22
  2363. NMSUB c11, c12, b2, c11
  2364. NMSUB c21, c22, b2, c21
  2365. MUL c11, b3, c11
  2366. MUL c21, b3, c21
  2367. #endif
  /* LT: coefficients AO[0], AO[1], AO[3]; c11/c21 first, then c12/c22. */
  2368. #ifdef LT
  2369. LD b1, AO, 0 * SIZE
  2370. LD b2, AO, 1 * SIZE
  2371. LD b3, AO, 3 * SIZE
  2372. MUL c11, b1, c11
  2373. MUL c21, b1, c21
  2374. NMSUB c12, c11, b2, c12
  2375. NMSUB c22, c21, b2, c22
  2376. MUL c12, b3, c12
  2377. MUL c22, b3, c22
  2378. #endif
  /* RN: coefficients BO[0], BO[1], BO[3]; c11/c12 first, then c21/c22. */
  2379. #ifdef RN
  2380. LD b1, BO, 0 * SIZE
  2381. LD b2, BO, 1 * SIZE
  2382. LD b3, BO, 3 * SIZE
  2383. MUL c11, b1, c11
  2384. MUL c12, b1, c12
  2385. NMSUB c21, c11, b2, c21
  2386. NMSUB c22, c12, b2, c22
  2387. MUL c21, b3, c21
  2388. MUL c22, b3, c22
  2389. #endif
  /* RT: coefficients BO[3], BO[2], BO[0]; c21/c22 first, then c11/c12. */
  2390. #ifdef RT
  2391. LD b1, BO, 3 * SIZE
  2392. LD b2, BO, 2 * SIZE
  2393. LD b3, BO, 0 * SIZE
  2394. MUL c21, b1, c21
  2395. MUL c22, b1, c22
  2396. NMSUB c11, c21, b2, c11
  2397. NMSUB c12, c22, b2, c12
  2398. MUL c11, b3, c11
  2399. MUL c12, b3, c12
  2400. #endif
  /* LN steps both output pointers back by two elements before storing. */
  2401. #ifdef LN
  2402. addi.d CO1, CO1, -2 * SIZE
  2403. addi.d CO2, CO2, -2 * SIZE
  2404. #endif
  /* Write the results back into the buffer the right-hand side was read
     from (BO for LN/LT, AO otherwise), in the same element order as the
     loads above. */
  2405. #if defined(LN) || defined(LT)
  2406. ST c11, BO, 0 * SIZE
  2407. ST c21, BO, 1 * SIZE
  2408. ST c12, BO, 2 * SIZE
  2409. ST c22, BO, 3 * SIZE
  2410. #else
  2411. ST c11, AO, 0 * SIZE
  2412. ST c12, AO, 1 * SIZE
  2413. ST c21, AO, 2 * SIZE
  2414. ST c22, AO, 3 * SIZE
  2415. #endif
  /* Store to the output: CO1 receives c11, c12 and CO2 receives c21, c22. */
  2416. ST c11, CO1, 0 * SIZE
  2417. ST c12, CO1, 1 * SIZE
  2418. ST c21, CO2, 0 * SIZE
  2419. ST c22, CO2, 1 * SIZE
  /* Every variant except LN advances the output pointers after storing. */
  2420. #ifndef LN
  2421. addi.d CO1, CO1, 2 * SIZE
  2422. addi.d CO2, CO2, 2 * SIZE
  2423. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2424. #ifdef RT
  2425. slli.d TEMP, K, 1 + BASE_SHIFT
  2426. add.d AORIG, AORIG, TEMP
  2427. #endif
  /* LT/RN: advance AO and BO by (K - KK) << (1 + BASE_SHIFT). */
  2428. #if defined(LT) || defined(RN)
  2429. sub.d TEMP, K, KK
  2430. slli.d TEMP, TEMP, 1 + BASE_SHIFT
  2431. add.d AO, AO, TEMP
  2432. add.d BO, BO, TEMP
  2433. #endif
  /* KK moves by 2: up for LT, down for LN. */
  2434. #ifdef LT
  2435. addi.d KK, KK, 2
  2436. #endif
  2437. #ifdef LN
  2438. addi.d KK, KK, -2
  2439. #endif
  /* Load a1 from $r0 (presumably producing 0.0 via the MTC macro) and copy
     it into c11, c21, c31, c41; then decrement I and branch back to .L51
     (outside this chunk) while I > 0. */
  2440. MTC a1, $r0
  2441. MOV c11, a1
  2442. MOV c21, a1
  2443. MOV c31, a1
  2444. addi.d I, I, -1
  2445. MOV c41, c11
  2446. blt $r0, I, .L51
  2447. .align 3
  2448. .L69:
  /* Bookkeeping after the block loop above, one adjustment per variant. */
  /* LN: B += K << (1 + BASE_SHIFT). */
  2449. #ifdef LN
  2450. slli.d TEMP, K, 1 + BASE_SHIFT
  2451. add.d B, B, TEMP
  2452. #endif
  /* LT/RN: B takes over the current value of BO. */
  2453. #if defined(LT) || defined(RN)
  2454. move B, BO
  2455. #endif
  /* KK moves by 2: up for RN, down for RT. */
  2456. #ifdef RN
  2457. addi.d KK, KK, 2
  2458. #endif
  2459. #ifdef RT
  2460. addi.d KK, KK, -2
  2461. #endif
  2462. .align 3
  2463. .L70:
  /* Odd-N tail: J = N & 1; when it is 0 control jumps to .L999.
     This section writes through a single output pointer, CO1. */
  2464. andi J, N, 1
  2465. bge $r0, J, .L999
  /* RT: step B back by K << BASE_SHIFT and C back by LDC before use. */
  2466. #ifdef RT
  2467. slli.d TEMP, K, BASE_SHIFT
  2468. sub.d B, B, TEMP
  2469. sub.d C, C, LDC
  2470. #endif
  2471. move AO, A
  2472. move CO1, C
  /* Initial KK: M + OFFSET for LN, OFFSET for LT (RN/RT leave KK as is). */
  2473. #ifdef LN
  2474. add.d KK, M, OFFSET
  2475. #endif
  2476. #ifdef LT
  2477. move KK, OFFSET
  2478. #endif
  /* LN/RT track the A base in AORIG; the other variants set AO = A again
     (same value as the unconditional move above). */
  2479. #if defined(LN) || defined(RT)
  2480. move AORIG, A
  2481. #else
  2482. move AO, A
  2483. #endif
  /* All variants except RT advance C by LDC past the current CO1. */
  2484. #ifndef RT
  2485. add.d C, CO1, LDC
  2486. #endif
  /* Odd-M part is handled first: I = M & 1; skip to .L80 when it is 0. */
  2487. andi I, M, 1
  2488. bge $r0, I, .L80
  /* LT/RN: operands start at AO and B; c11/c21 are cleared from $r0;
     L = KK >> 2 is the trip count for .L82, BO = B.
     NOTE(review): the a2-a4 and b2-b7 preloads look unused -- .L82, .L86
     and .L88 reload a1/b1 themselves and touch no other a/b register. */
  2489. #if defined(LT) || defined(RN)
  2490. LD a1, AO, 0 * SIZE
  2491. MTC c11, $r0
  2492. LD a2, AO, 1 * SIZE
  2493. MOV c21, c11
  2494. LD a3, AO, 2 * SIZE
  2495. LD a4, AO, 3 * SIZE
  2496. LD b1, B, 0 * SIZE
  2497. LD b2, B, 1 * SIZE
  2498. LD b3, B, 2 * SIZE
  2499. LD b4, B, 3 * SIZE
  2500. LD b5, B, 4 * SIZE
  2501. LD b6, B, 8 * SIZE
  2502. LD b7, B, 12 * SIZE
  2503. srai.d L, KK, 2
  2504. move BO, B
  2505. bge $r0, L, .L85
  2506. #else
  /* LN/RT: LN first steps AORIG back by K << BASE_SHIFT. AO and BO are
     then placed KK << BASE_SHIFT past AORIG and B, TEMP becomes K - KK, and
     L = TEMP >> 2 is the trip count for .L82. */
  2507. #ifdef LN
  2508. slli.d TEMP, K, BASE_SHIFT
  2509. sub.d AORIG, AORIG, TEMP
  2510. #endif
  2511. slli.d TEMP, KK, BASE_SHIFT
  2512. add.d AO, AORIG, TEMP
  2513. add.d BO, B, TEMP
  2514. sub.d TEMP, K, KK
  2515. LD a1, AO, 0 * SIZE
  2516. MTC c11, $r0
  2517. LD a2, AO, 1 * SIZE
  2518. MOV c21, c11
  2519. LD a3, AO, 2 * SIZE
  2520. LD a4, AO, 3 * SIZE
  2521. LD b1, BO, 0 * SIZE
  2522. LD b2, BO, 1 * SIZE
  2523. LD b3, BO, 2 * SIZE
  2524. LD b4, BO, 3 * SIZE
  2525. LD b5, BO, 4 * SIZE
  2526. LD b6, BO, 8 * SIZE
  2527. LD b7, BO, 12 * SIZE
  2528. srai.d L, TEMP, 2
  2529. bge $r0, L, .L85
  2530. #endif
  2531. .align 3
  2532. .L82:
  /* Main K loop for the single-element case, four steps per pass.
     Each step loads one value from AO and one from BO and feeds a MADD;
     the steps alternate between the c11 and c21 accumulators.
     (MADD/LD are macros defined outside this chunk -- presumably
     MADD d,x,y,z = x*y + z; TODO confirm.) */
  2533. LD a1, AO, 0 * SIZE
  2534. LD b1, BO, 0 * SIZE
  2535. MADD c11, b1, a1, c11
  2536. LD a1, AO, 1 * SIZE
  2537. LD b1, BO, 1 * SIZE
  2538. MADD c21, b1, a1, c21
  2539. LD a1, AO, 2 * SIZE
  2540. LD b1, BO, 2 * SIZE
  2541. MADD c11, b1, a1, c11
  2542. LD a1, AO, 3 * SIZE
  2543. LD b1, BO, 3 * SIZE
  2544. MADD c21, b1, a1, c21
  /* Count down and advance both pointers by the four elements consumed. */
  2545. addi.d L, L, -1
  2546. addi.d AO, AO, 4 * SIZE
  2547. addi.d BO, BO, 4 * SIZE
  2548. blt $r0, L, .L82
  2549. .align 3
  2550. .L85:
  /* Remainder count: low two bits of KK (LT/RN) or TEMP (other variants);
     skip to .L88 when it is 0. */
  2551. #if defined(LT) || defined(RN)
  2552. andi L, KK, 3
  2553. #else
  2554. andi L, TEMP, 3
  2555. #endif
  2556. bge $r0, L, .L88
  2557. .align 3
  2558. .L86:
  /* Remainder loop: one step per pass, accumulating into c11 only. */
  2559. LD a1, AO, 0 * SIZE
  2560. LD b1, BO, 0 * SIZE
  2561. MADD c11, b1, a1, c11
  2562. addi.d L, L, -1
  2563. addi.d AO, AO, 1 * SIZE
  2564. addi.d BO, BO, 1 * SIZE
  2565. blt $r0, L, .L86
  2566. .L88:
  /* Solve-and-store step for the single-element case.
     ADD/SUB/MUL/LD/ST are macros defined outside this chunk; the notes
     assume SUB d,x,y = x - y -- TODO confirm. */
  /* Merge the second partial sum (c21) into c11. */
  2567. ADD c11, c11, c21
  /* LN/RT: both arms of the inner #ifdef compute KK - 1; AO and BO are
     re-based to AORIG and B plus that count shifted by BASE_SHIFT. */
  2568. #if defined(LN) || defined(RT)
  2569. #ifdef LN
  2570. addi.d TEMP, KK, -1
  2571. #else
  2572. addi.d TEMP, KK, -1
  2573. #endif
  2574. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2575. add.d AO, AORIG, TEMP
  2576. add.d BO, B, TEMP
  2577. #endif
  /* Read one value (from BO for LN/LT, from AO otherwise) and combine it
     with the accumulated sum via SUB. */
  2578. #if defined(LN) || defined(LT)
  2579. LD b1, BO, 0 * SIZE
  2580. SUB c11, b1, c11
  2581. #else
  2582. LD b1, AO, 0 * SIZE
  2583. SUB c11, b1, c11
  2584. #endif
  /* Scale by a single coefficient: AO[0] for LN/LT, BO[0] for RN/RT.
     A MUL is used rather than a divide -- presumably the coefficient is
     stored pre-inverted; verify against the packing code. */
  2585. #if defined(LN) || defined(LT)
  2586. LD b1, AO, 0 * SIZE
  2587. MUL c11, b1, c11
  2588. #endif
  2589. #if defined(RN) || defined(RT)
  2590. LD b1, BO, 0 * SIZE
  2591. MUL c11, b1, c11
  2592. #endif
  /* LN steps CO1 back by one element before storing. */
  2593. #ifdef LN
  2594. addi.d CO1, CO1, -1 * SIZE
  2595. #endif
  /* Write the result back to the buffer it was read from, then to CO1. */
  2596. #if defined(LN) || defined(LT)
  2597. ST c11, BO, 0 * SIZE
  2598. #else
  2599. ST c11, AO, 0 * SIZE
  2600. #endif
  2601. ST c11, CO1, 0 * SIZE
  /* Every variant except LN advances CO1 after storing. */
  2602. #ifndef LN
  2603. addi.d CO1, CO1, 1 * SIZE
  2604. #endif
  /* RT: AORIG += K << BASE_SHIFT. */
  2605. #ifdef RT
  2606. slli.d TEMP, K, BASE_SHIFT
  2607. add.d AORIG, AORIG, TEMP
  2608. #endif
  /* LT/RN: advance AO and BO by (K - KK) << BASE_SHIFT. */
  2609. #if defined(LT) || defined(RN)
  2610. sub.d TEMP, K, KK
  2611. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2612. add.d AO, AO, TEMP
  2613. add.d BO, BO, TEMP
  2614. #endif
  /* KK moves by 1: up for LT, down for LN. */
  2615. #ifdef LT
  2616. addi.d KK, KK, 1
  2617. #endif
  2618. #ifdef LN
  2619. addi.d KK, KK, -1
  2620. #endif
  2621. .align 3
  2622. .L80:
  /* Two-element blocks of the odd-N tail: I = M >> 1 iterations; skip to
     .L89 when I <= 0. */
  2623. srai.d I, M, 1
  2624. bge $r0, I, .L89
  2625. .L71:
  /* Per-iteration setup. c11 is cleared from $r0 and copied into c21, c12
     and c22. */
  /* LT/RN: operands start at AO and B; L = KK >> 2 is the trip count for
     .L72 and BO = B. When L <= 0 control goes to .L75. */
  2626. #if defined(LT) || defined(RN)
  2627. LD a1, AO, 0 * SIZE
  2628. MTC c11, $r0
  2629. LD a2, AO, 1 * SIZE
  2630. MOV c21, c11
  2631. LD a5, AO, 4 * SIZE
  2632. LD b1, B, 0 * SIZE
  2633. MOV c12, c11
  2634. LD b2, B, 1 * SIZE
  2635. MOV c22, c11
  2636. LD b3, B, 2 * SIZE
  2637. LD b5, B, 4 * SIZE
  2638. srai.d L, KK, 2
  2639. LD b6, B, 8 * SIZE
  2640. LD b7, B, 12 * SIZE
  2641. move BO, B
  2642. bge $r0, L, .L75
  2643. #else
  /* LN/RT: LN first steps AORIG back by K << (1 + BASE_SHIFT). AO is placed
     KK << (1 + BASE_SHIFT) past AORIG and BO is placed KK << BASE_SHIFT past
     B (A advances two elements per K step, B one). TEMP becomes K - KK and
     L = TEMP >> 2 is the trip count for .L72. */
  2644. #ifdef LN
  2645. slli.d TEMP, K, 1 + BASE_SHIFT
  2646. sub.d AORIG, AORIG, TEMP
  2647. #endif
  2648. slli.d L, KK, 1 + BASE_SHIFT
  2649. slli.d TEMP, KK, 0 + BASE_SHIFT
  2650. add.d AO, AORIG, L
  2651. add.d BO, B, TEMP
  2652. sub.d TEMP, K, KK
  2653. LD a1, AO, 0 * SIZE
  2654. MTC c11, $r0
  2655. LD a2, AO, 1 * SIZE
  2656. MOV c21, c11
  2657. LD a5, AO, 4 * SIZE
  2658. LD b1, BO, 0 * SIZE
  2659. MOV c12, c11
  2660. LD b2, BO, 1 * SIZE
  2661. MOV c22, c11
  2662. LD b3, BO, 2 * SIZE
  2663. LD b5, BO, 4 * SIZE
  2664. srai.d L, TEMP, 2
  2665. LD b6, BO, 8 * SIZE
  2666. LD b7, BO, 12 * SIZE
  2667. bge $r0, L, .L75
  2668. #endif
  2669. .align 3
  2670. .L72:
  /* Main K loop for the two-element case, four steps per pass.
     Each step loads two consecutive values from AO and one from BO, then
     accumulates b1*a1 into c11 and b1*a2 into c12.
     (MADD/LD are macros defined outside this chunk -- presumably
     MADD d,x,y,z = x*y + z; TODO confirm.) */
  2671. LD a1, AO, 0 * SIZE
  2672. LD a2, AO, 1 * SIZE
  2673. LD b1, BO, 0 * SIZE
  2674. MADD c11, b1, a1, c11
  2675. MADD c12, b1, a2, c12
  2676. LD a1, AO, 2 * SIZE
  2677. LD a2, AO, 3 * SIZE
  2678. LD b1, BO, 1 * SIZE
  2679. MADD c11, b1, a1, c11
  2680. MADD c12, b1, a2, c12
  2681. LD a1, AO, 4 * SIZE
  2682. LD a2, AO, 5 * SIZE
  2683. LD b1, BO, 2 * SIZE
  2684. MADD c11, b1, a1, c11
  2685. MADD c12, b1, a2, c12
  2686. LD a1, AO, 6 * SIZE
  2687. LD a2, AO, 7 * SIZE
  2688. LD b1, BO, 3 * SIZE
  2689. MADD c11, b1, a1, c11
  2690. MADD c12, b1, a2, c12
  /* Count down; AO moves 8 elements and BO moves 4 per pass. */
  2691. addi.d L, L, -1
  2692. addi.d AO, AO, 8 * SIZE
  2693. addi.d BO, BO, 4 * SIZE
  2694. blt $r0, L, .L72
  2695. .align 3
  2696. .L75:
  /* Remainder count: low two bits of KK (LT/RN) or TEMP (other variants);
     skip to .L78 when it is 0. */
  2697. #if defined(LT) || defined(RN)
  2698. andi L, KK, 3
  2699. #else
  2700. andi L, TEMP, 3
  2701. #endif
  2702. bge $r0, L, .L78
  2703. .align 3
  2704. .L76:
  /* Remainder loop: one step per pass; AO moves 2 elements, BO moves 1. */
  2705. LD a1, AO, 0 * SIZE
  2706. LD a2, AO, 1 * SIZE
  2707. LD b1, BO, 0 * SIZE
  2708. MADD c11, b1, a1, c11
  2709. MADD c12, b1, a2, c12
  2710. addi.d L, L, -1
  2711. addi.d AO, AO, 2 * SIZE
  2712. addi.d BO, BO, 1 * SIZE
  2713. blt $r0, L, .L76
  2714. .L78:
  /* Solve-and-store step for the two-element case.
     ADD/SUB/MUL/NMSUB/LD/ST are macros defined outside this chunk; the notes
     assume SUB d,x,y = x - y and NMSUB d,x,y,z = z - x*y -- TODO confirm. */
  /* Fold c21 into c11 and c22 into c12.
     NOTE(review): .L72/.L76 never write c21/c22, so these appear to add
     the zeros set at .L71. */
  2715. ADD c11, c11, c21
  2716. ADD c12, c12, c22
  /* LN/RT: TEMP = KK - 2 for LN, KK - 1 for RT. AO is re-based to
     AORIG + (TEMP << (1 + BASE_SHIFT)) and BO to B + (TEMP << BASE_SHIFT). */
  2717. #if defined(LN) || defined(RT)
  2718. #ifdef LN
  2719. addi.d TEMP, KK, -2
  2720. #else
  2721. addi.d TEMP, KK, -1
  2722. #endif
  2723. slli.d L, TEMP, 1 + BASE_SHIFT
  2724. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2725. add.d AO, AORIG, L
  2726. add.d BO, B, TEMP
  2727. #endif
  /* Read two values (from BO for LN/LT, from AO otherwise) and combine
     them with c11 and c12 via SUB. */
  2728. #if defined(LN) || defined(LT)
  2729. LD b1, BO, 0 * SIZE
  2730. LD b2, BO, 1 * SIZE
  2731. SUB c11, b1, c11
  2732. SUB c12, b2, c12
  2733. #else
  2734. LD b1, AO, 0 * SIZE
  2735. LD b2, AO, 1 * SIZE
  2736. SUB c11, b1, c11
  2737. SUB c12, b2, c12
  2738. #endif
  /* LN: coefficients AO[3], AO[2], AO[0]; c12 is scaled first, folded into
     c11 through AO[2], then c11 is scaled. Diagonal coefficients are applied
     with MUL -- presumably stored pre-inverted; verify against the packing
     code. */
  2739. #ifdef LN
  2740. LD b1, AO, 3 * SIZE
  2741. LD b2, AO, 2 * SIZE
  2742. LD b3, AO, 0 * SIZE
  2743. MUL c12, b1, c12
  2744. NMSUB c11, c12, b2, c11
  2745. MUL c11, b3, c11
  2746. #endif
  /* LT: coefficients AO[0], AO[1], AO[3]; c11 first, then c12. */
  2747. #ifdef LT
  2748. LD b1, AO, 0 * SIZE
  2749. LD b2, AO, 1 * SIZE
  2750. LD b3, AO, 3 * SIZE
  2751. MUL c11, b1, c11
  2752. NMSUB c12, c11, b2, c12
  2753. MUL c12, b3, c12
  2754. #endif
  /* RN/RT: a single coefficient BO[0] scales both c11 and c12. */
  2755. #if defined(RN) || defined(RT)
  2756. LD b1, BO, 0 * SIZE
  2757. MUL c11, b1, c11
  2758. MUL c12, b1, c12
  2759. #endif
  /* LN steps CO1 back by two elements before storing. */
  2760. #ifdef LN
  2761. addi.d CO1, CO1, -2 * SIZE
  2762. #endif
  /* Write the results back to the buffer they were read from, then to
     CO1[0] and CO1[1]. */
  2763. #if defined(LN) || defined(LT)
  2764. ST c11, BO, 0 * SIZE
  2765. ST c12, BO, 1 * SIZE
  2766. #else
  2767. ST c11, AO, 0 * SIZE
  2768. ST c12, AO, 1 * SIZE
  2769. #endif
  2770. ST c11, CO1, 0 * SIZE
  2771. ST c12, CO1, 1 * SIZE
  /* Every variant except LN advances CO1 after storing. */
  2772. #ifndef LN
  2773. addi.d CO1, CO1, 2 * SIZE
  2774. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2775. #ifdef RT
  2776. slli.d TEMP, K, 1 + BASE_SHIFT
  2777. add.d AORIG, AORIG, TEMP
  2778. #endif
  /* LT/RN: advance AO by (K - KK) << (1 + BASE_SHIFT) and BO by
     (K - KK) << BASE_SHIFT. */
  2779. #if defined(LT) || defined(RN)
  2780. sub.d TEMP, K, KK
  2781. slli.d L, TEMP, 1 + BASE_SHIFT
  2782. slli.d TEMP, TEMP, 0 + BASE_SHIFT
  2783. add.d AO, AO, L
  2784. add.d BO, BO, TEMP
  2785. #endif
  /* KK moves by 2: up for LT, down for LN. */
  2786. #ifdef LT
  2787. addi.d KK, KK, 2
  2788. #endif
  2789. #ifdef LN
  2790. addi.d KK, KK, -2
  2791. #endif
  /* Next block while I > 0. */
  2792. addi.d I, I, -1
  2793. blt $r0, I, .L71
  2794. .align 3
  2795. .L89:
  /* Bookkeeping after the odd-N tail, one adjustment per variant; control
     then falls through to .L999. */
  /* LN: B += K << BASE_SHIFT. */
  2796. #ifdef LN
  2797. slli.d TEMP, K, BASE_SHIFT
  2798. add.d B, B, TEMP
  2799. #endif
  /* LT/RN: B takes over the current value of BO. */
  2800. #if defined(LT) || defined(RN)
  2801. move B, BO
  2802. #endif
  /* KK moves by 1: up for RN, down for RT. */
  2803. #ifdef RN
  2804. addi.d KK, KK, 1
  2805. #endif
  2806. #ifdef RT
  2807. addi.d KK, KK, -1
  2808. #endif
  2809. .align 3
  2810. .L999:
  /* Common exit: reload the registers saved on the stack, release the
     144-byte frame and return. The matching saves are in the prologue,
     which is outside this chunk. */
  /* Integer registers $r23-$r28 from sp + 0 .. sp + 40. */
  2811. LDARG $r23, $sp, 0
  2812. LDARG $r24, $sp, 8
  2813. LDARG $r25, $sp, 16
  2814. LDARG $r26, $sp, 24
  2815. LDARG $r27, $sp, 32
  2816. LDARG $r28, $sp, 40
  /* FP registers $f24-$f28 from sp + 48 .. sp + 80. */
  2817. fld.d $f24, $sp, 48
  2818. fld.d $f25, $sp, 56
  2819. fld.d $f26, $sp, 64
  2820. fld.d $f27, $sp, 72
  2821. fld.d $f28, $sp, 80
  /* Integer registers $r29, $r30, $r20, $r16 from sp + 88 .. sp + 112. */
  2822. LDARG $r29, $sp, 88
  2823. LDARG $r30, $sp, 96
  2824. LDARG $r20, $sp, 104
  2825. LDARG $r16, $sp, 112
  /* Only when __64BIT__ is not defined: $f18-$f21 from sp + 112 .. sp + 136.
     NOTE(review): offset 112 is also the slot read into $r16 just above;
     confirm against the prologue that this overlap is intended. */
  2826. #ifndef __64BIT__
  2827. fld.d $f18, $sp, 112
  2828. fld.d $f19, $sp, 120
  2829. fld.d $f20, $sp, 128
  2830. fld.d $f21, $sp, 136
  2831. #endif
  /* Pop the frame, copy $r17 into $r4 and $f22 into $f0, then jump to the
     address in $r1 with the link result discarded into $r0. */
  2832. addi.d $sp, $sp, 144
  2833. move $r4, $r17
  2834. fmov.d $f0, $f22
  2835. jirl $r0, $r1, 0x0
  /* EPILOGUE is a macro defined outside this chunk. */
  2836. EPILOGUE