You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrsm_kernel_RT.S 31 kB


  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer register aliases.
     M, N, K, A, B, C, LDC and OFFSET are read by the kernel before it ever
     writes them, i.e. they carry the values handed in by the caller. */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  32. #define A $r7
  33. #define B $r8
  34. #define C $r9
  35. #define LDC $r10
  36. #define OFFSET $r11
  /* AO / BO: running read pointers advanced by the inner loops. */
  37. #define AO $r12
  38. #define BO $r13
  /* I: row-loop counter (initialised from M).  J: result of the N & 1 / N & 2
     tests and counter of four-column panels.  L: inner-loop trip counter,
     also reused as a scratch byte offset. */
  39. #define I $r17
  40. #define J $r18
  41. #define L $r25
  /* CO1..CO4: store pointers for up to four output columns, LDC bytes apart. */
  42. #define CO1 $r14
  43. #define CO2 $r15
  44. #define CO3 $r23
  45. #define CO4 $r24
  /* KK: running offset count derived from OFFSET.  TEMP: scratch.
     AORIG: base of the current A panel in the LN / RT variants. */
  46. #define KK $r26
  47. #define TEMP $r27
  48. #define AORIG $r28
  /* Floating-point aliases.  a1..a4 receive values loaded through AO in the
     accumulate loops and double as scratch products in the solve phase. */
  49. #define a1 $f22
  50. #define a2 $f8
  51. #define a3 $f26
  52. #define a4 $f27
  /* b1..b8 receive values loaded through BO (or AO in the solve phase). */
  53. #define b1 $f23
  54. #define b2 $f9
  55. #define b3 $f10
  56. #define b4 $f11
  57. #define b5 $f12
  58. #define b6 $f13
  59. #define b7 $f14
  60. #define b8 $f15
  /* a5 shares $f15 with b8; it is not referenced between PROLOGUE and
     EPILOGUE in this file. */
  61. #define a5 b8
  /* Accumulators.  Each output value is built from partial sums that are
     combined after the inner loops (e.g. c11 += c22, c12 += c21). */
  62. #define c11 $f16
  63. #define c12 $f17
  64. #define c21 $f0
  65. #define c22 $f1
  66. #define c31 $f2
  67. #define c32 $f3
  68. #define c41 $f4
  69. #define c42 $f5
  70. #define c51 $f6
  71. #define c52 $f7
  72. #define c61 $f18
  73. #define c62 $f19
  74. #define c71 $f20
  75. #define c72 $f21
  76. #define c81 $f24
  77. #define c82 $f25
  /* Select the fused multiply-add flavour behind MADD1..MADD8.
     MADD1..MADD4 are used by the accumulate loops, MADD5..MADD8 by the solve
     phase after them.  MADD, MSUB and NMSUB themselves are not defined in
     this file (presumably they come from common.h - confirm there for the
     exact operand/sign convention).
     Without CONJ the mapping does not depend on the LN/LT/RN/RT variant. */
  78. #ifndef CONJ
  79. #define MADD1 MADD
  80. #define MADD2 MADD
  81. #define MADD3 MADD
  82. #define MADD4 NMSUB
  83. #define MADD5 MSUB
  84. #define MADD6 MADD
  85. #define MADD7 NMSUB
  86. #define MADD8 MADD
  87. #else
  /* CONJ build: the accumulate-loop signs differ between the left-side
     (LN/LT) and right-side variants; the solve-phase signs (MADD5..MADD8)
     are the same for all four. */
  88. #if defined(LN) || defined(LT)
  89. #define MADD1 MADD
  90. #define MADD2 NMSUB
  91. #define MADD3 MADD
  92. #define MADD4 MADD
  93. #else
  94. #define MADD1 MADD
  95. #define MADD2 MADD
  96. #define MADD3 NMSUB
  97. #define MADD4 MADD
  98. #endif
  99. #define MADD5 MADD
  100. #define MADD6 MSUB
  101. #define MADD7 MADD
  102. #define MADD8 NMSUB
  103. #endif
  /* Kernel entry.
     Reserves a 128-byte stack frame and spills the registers this routine
     overwrites: $r23-$r28 (CO3, CO4, L, KK, TEMP, AORIG) and $f24-$f27;
     $f18-$f21 are spilled as well when __64BIT__ is not defined.
     The matching reloads are at .L999. */
  104. PROLOGUE
  105. addi.d $sp, $sp, -128
  106. SDARG $r23, $sp, 0
  107. SDARG $r24, $sp, 8
  108. SDARG $r25, $sp, 16
  109. SDARG $r26, $sp, 24
  110. SDARG $r27, $sp, 32
  111. SDARG $r28, $sp, 40
  112. fst.d $f24, $sp, 48
  113. fst.d $f25, $sp, 56
  114. fst.d $f26, $sp, 64
  115. fst.d $f27, $sp, 72
  116. #ifndef __64BIT__
  117. fst.d $f18, $sp, 88
  118. fst.d $f19, $sp, 96
  119. fst.d $f20, $sp, 104
  120. fst.d $f21, $sp, 112
  121. #endif
  /* Scale LDC by ZBASE_SHIFT; from here on LDC is added directly to the
     column pointers, i.e. it is used as a byte stride. */
  122. slli.d LDC, LDC, ZBASE_SHIFT
  /* LN: start from the far end of A and C (A += M*K, C += M, both scaled). */
  123. #ifdef LN
  /* 64-bit product: every other size/offset computation in this routine is
     64-bit (slli.d / add.d / sub.d); mul.w would keep only the low 32 bits. */
  124. mul.d TEMP, M, K
  125. slli.d TEMP, TEMP, ZBASE_SHIFT
  126. add.d A, A, TEMP
  127. slli.d TEMP, M, ZBASE_SHIFT
  128. add.d C, C, TEMP
  129. #endif
  /* RN: KK = -OFFSET. */
  130. #ifdef RN
  131. sub.d KK, $r0, OFFSET
  132. #endif
  /* RT: start from the far end of B and C and walk backwards;
     KK = N - OFFSET. */
  133. #ifdef RT
  134. mul.d TEMP, N, K
  135. slli.d TEMP, TEMP, ZBASE_SHIFT
  136. add.d B, B, TEMP
  /* LDC is already a byte stride, so N * LDC is a byte offset that can exceed
     32 bits for large leading dimensions - it must be a 64-bit multiply. */
  137. mul.d TEMP, N, LDC
  138. add.d C, C, TEMP
  139. sub.d KK, N, OFFSET
  140. #endif
  /* ---- Single-column panel: executed only when N is odd (N & 1). ---- */
  141. andi J, N, 1
  142. bge $r0, J, .L20
  /* RT walks backwards: step B back by K (scaled) and C back by one column. */
  143. #ifdef RT
  144. slli.d TEMP, K, ZBASE_SHIFT
  145. sub.d B, B, TEMP
  146. sub.d C, C, LDC
  147. #endif
  /* c11 is loaded from $r0 and used below as the source for clearing the
     other accumulators. */
  148. MTC c11, $r0
  149. move CO1, C
  150. #ifdef LN
  151. add.d KK, M, OFFSET
  152. #endif
  153. #ifdef LT
  154. move KK, OFFSET
  155. #endif
  /* LN / RT recompute AO from AORIG for every row; LT / RN stream through AO. */
  156. #if defined(LN) || defined(RT)
  157. move AORIG, A
  158. #else
  159. move AO, A
  160. #endif
  /* Forward variants advance C to the next column now. */
  161. #ifndef RT
  162. add.d C, CO1, LDC
  163. #endif
  /* Row loop: I = M, skipped entirely when M <= 0. */
  164. move I, M
  165. bge $r0, I, .L39
  166. .align 3
  167. .L31:
  /* LT / RN: accumulate over the first KK steps, reading B from its start.
     Preload the first operands, clear the accumulators from c11 and set
     L = KK / 4 for the unrolled loop. */
  168. #if defined(LT) || defined(RN)
  169. LD a1, AO, 0 * SIZE
  170. MOV c21, c11
  171. LD b1, B, 0 * SIZE
  172. MOV c31, c11
  173. LD a2, AO, 1 * SIZE
  174. MOV c41, c11
  175. LD b2, B, 1 * SIZE
  176. MOV c12, c11
  177. srai.d L, KK, 2
  178. MOV c22, c11
  179. LD a3, AO, 4 * SIZE
  180. MOV c32, c11
  181. LD b3, B, 4 * SIZE
  182. MOV c42, c11
  183. move BO, B
  184. bge $r0, L, .L35
  185. #else
  /* LN / RT: accumulate over the K - KK steps that start at offset KK.
     LN first moves AORIG back by K (scaled). */
  186. #ifdef LN
  187. slli.d TEMP, K, ZBASE_SHIFT
  188. sub.d AORIG, AORIG, TEMP
  189. #endif
  190. slli.d TEMP, KK, ZBASE_SHIFT
  191. add.d AO, AORIG, TEMP
  192. add.d BO, B, TEMP
  /* TEMP = K - KK is kept for the remainder test at .L35. */
  193. sub.d TEMP, K, KK
  194. LD a1, AO, 0 * SIZE
  195. MOV c21, c11
  196. LD b1, BO, 0 * SIZE
  197. MOV c31, c11
  198. LD a2, AO, 1 * SIZE
  199. MOV c41, c11
  200. LD b2, BO, 1 * SIZE
  201. MOV c12, c11
  202. srai.d L, TEMP, 2
  203. MOV c22, c11
  204. LD a3, AO, 4 * SIZE
  205. MOV c32, c11
  206. LD b3, BO, 4 * SIZE
  207. MOV c42, c11
  208. bge $r0, L, .L35
  209. #endif
  210. .align 3
  /* Unrolled accumulate loop: four steps per trip (AO and BO each advance by
     8 * SIZE).  Loads for later steps are interleaved with the multiply-adds
     of the current one, so the instruction order matters. */
  211. .L32:
  212. MADD1 c11, b1, a1, c11
  213. LD b4, BO, 3 * SIZE
  214. MADD3 c21, b2, a1, c21
  215. LD a1, AO, 2 * SIZE
  216. MADD2 c12, b1, a2, c12
  217. LD b1, BO, 2 * SIZE
  218. MADD4 c22, b2, a2, c22
  219. LD a2, AO, 3 * SIZE
  220. MADD1 c11, b1, a1, c11
  221. LD b2, BO, 5 * SIZE
  222. MADD3 c21, b4, a1, c21
  223. LD a1, AO, 8 * SIZE
  224. MADD2 c12, b1, a2, c12
  225. LD b1, BO, 8 * SIZE
  226. MADD4 c22, b4, a2, c22
  227. LD a2, AO, 5 * SIZE
  228. MADD1 c11, b3, a3, c11
  229. LD b4, BO, 7 * SIZE
  230. MADD3 c21, b2, a3, c21
  231. LD a3, AO, 6 * SIZE
  232. MADD2 c12, b3, a2, c12
  233. LD b3, BO, 6 * SIZE
  234. MADD4 c22, b2, a2, c22
  235. LD a2, AO, 7 * SIZE
  236. MADD1 c11, b3, a3, c11
  237. LD b2, BO, 9 * SIZE
  238. MADD3 c21, b4, a3, c21
  239. LD a3, AO, 12 * SIZE
  240. MADD2 c12, b3, a2, c12
  241. LD b3, BO, 12 * SIZE
  242. MADD4 c22, b4, a2, c22
  243. LD a2, AO, 9 * SIZE
  244. addi.d AO, AO, 8 * SIZE
  245. addi.d L, L, -1
  246. addi.d BO, BO, 8 * SIZE
  247. blt $r0, L, .L32
  248. .align 3
  /* Remainder: the low two bits of the step count (0..3 single steps). */
  249. .L35:
  250. #if defined(LT) || defined(RN)
  251. andi L, KK, 3
  252. #else
  253. andi L, TEMP, 3
  254. #endif
  255. bge $r0, L, .L38
  256. .align 3
  /* One step per trip: AO and BO each advance by 2 * SIZE. */
  257. .L36:
  258. MADD1 c11, b1, a1, c11
  259. addi.d L, L, -1
  260. MADD3 c21, b2, a1, c21
  261. LD a1, AO, 2 * SIZE
  262. MADD2 c12, b1, a2, c12
  263. LD b1, BO, 2 * SIZE
  264. MADD4 c22, b2, a2, c22
  265. LD a2, AO, 3 * SIZE
  266. LD b2, BO, 3 * SIZE
  267. addi.d BO, BO, 2 * SIZE
  268. addi.d AO, AO, 2 * SIZE
  269. blt $r0, L, .L36
  /* Fold the partial sums into the pair (c11, c12). */
  270. .L38:
  271. ADD c11, c11, c22
  272. ADD c12, c12, c21
  /* LN / RT: point AO and BO at entry KK - 1 of the packed panels. */
  273. #if defined(LN) || defined(RT)
  274. addi.d TEMP, KK, -1
  275. slli.d TEMP, TEMP, ZBASE_SHIFT
  276. add.d AO, AORIG, TEMP
  277. add.d BO, B, TEMP
  278. #endif
  /* Combine the accumulated sum with the stored value: read from BO for the
     left-side variants, from AO for the right-side ones. */
  279. #if defined(LN) || defined(LT)
  280. LD b1, BO, 0 * SIZE
  281. LD b2, BO, 1 * SIZE
  282. SUB c11, b1, c11
  283. SUB c12, b2, c12
  284. #else
  285. LD b1, AO, 0 * SIZE
  286. LD b2, AO, 1 * SIZE
  287. SUB c11, b1, c11
  288. SUB c12, b2, c12
  289. #endif
  /* Multiply the pair (c11, c12) by the pair (b1, b2) taken from the
     triangular operand; a1 / a2 hold the cross products and MADD5 / MADD6
     pick the signs.  NOTE(review): the code multiplies rather than divides,
     so the packed diagonal entry is presumably stored already inverted -
     confirm against the packing routine. */
  290. #if defined(LN) || defined(LT)
  291. LD b1, AO, 0 * SIZE
  292. LD b2, AO, 1 * SIZE
  293. MUL a1, b2, c12
  294. MUL a2, b2, c11
  295. MADD5 c11, c11, b1, a1
  296. MADD6 c12, c12, b1, a2
  297. #endif
  298. #if defined(RN) || defined(RT)
  299. LD b1, BO, 0 * SIZE
  300. LD b2, BO, 1 * SIZE
  301. MUL a1, b2, c12
  302. MUL a2, b2, c11
  303. MADD5 c11, c11, b1, a1
  304. MADD6 c12, c12, b1, a2
  305. #endif
  /* Write the result back into the packed buffer it was read from ... */
  306. #if defined(LN) || defined(LT)
  307. ST c11, BO, 0 * SIZE
  308. ST c12, BO, 1 * SIZE
  309. #else
  310. ST c11, AO, 0 * SIZE
  311. ST c12, AO, 1 * SIZE
  312. #endif
  /* ... and into C.  LN moves CO1 backwards (pre-decrement), the other
     variants forwards (post-increment). */
  313. #ifdef LN
  314. addi.d CO1,CO1, -2 * SIZE
  315. #endif
  316. ST c11, CO1, 0 * SIZE
  317. ST c12, CO1, 1 * SIZE
  318. #ifndef LN
  319. addi.d CO1,CO1, 2 * SIZE
  320. #endif
  /* Reload c11 from $r0 for the next row. */
  321. MTC c11, $r0
  322. #ifdef RT
  323. slli.d TEMP, K, ZBASE_SHIFT
  324. add.d AORIG, AORIG, TEMP
  325. #endif
  /* LT / RN: skip the remaining K - KK entries of this row in AO and BO. */
  326. #if defined(LT) || defined(RN)
  327. sub.d TEMP, K, KK
  328. slli.d TEMP, TEMP, ZBASE_SHIFT
  329. add.d AO, AO, TEMP
  330. add.d BO, BO, TEMP
  331. #endif
  332. #ifdef LT
  333. addi.d KK, KK, 1
  334. #endif
  335. #ifdef LN
  336. addi.d KK, KK, -1
  337. #endif
  338. addi.d I, I, -1
  339. blt $r0, I, .L31
  340. .align 3
  /* End of the single-column panel: move B and KK on to the next panel. */
  341. .L39:
  342. #ifdef LN
  343. slli.d TEMP, K, ZBASE_SHIFT
  344. add.d B, B, TEMP
  345. #endif
  346. #if defined(LT) || defined(RN)
  347. move B, BO
  348. #endif
  349. #ifdef RN
  350. addi.d KK, KK, 1
  351. #endif
  352. #ifdef RT
  353. addi.d KK, KK, -1
  354. #endif
  355. .align 3
  /* ---- Two-column panel: executed only when bit 1 of N is set (N & 2). ---- */
  356. .L20:
  357. andi J, N, 2
  358. bge $r0, J, .L30
  /* RT walks backwards: step B back by 2*K (scaled) and C back by two columns. */
  359. #ifdef RT
  360. slli.d TEMP, K, 1 + ZBASE_SHIFT
  361. sub.d B, B, TEMP
  362. slli.d TEMP, LDC, 1
  363. sub.d C, C, TEMP
  364. #endif
  /* c11 is loaded from $r0 and used as the source for clearing accumulators. */
  365. MTC c11, $r0
  366. move CO1, C
  367. add.d CO2, C, LDC
  368. #ifdef LN
  369. add.d KK, M, OFFSET
  370. #endif
  371. #ifdef LT
  372. move KK, OFFSET
  373. #endif
  374. #if defined(LN) || defined(RT)
  375. move AORIG, A
  376. #else
  377. move AO, A
  378. #endif
  /* Forward variants advance C past the two columns now. */
  379. #ifndef RT
  380. add.d C, CO2, LDC
  381. #endif
  /* Row loop: I = M, skipped entirely when M <= 0. */
  382. move I, M
  383. bge $r0, I, .L29
  384. .align 3
  385. .L21:
  /* LT / RN: accumulate over the first KK steps, reading B from its start.
     Preload operands, clear accumulators, L = KK / 4. */
  386. #if defined(LT) || defined(RN)
  387. LD a1, AO, 0 * SIZE
  388. MOV c21, c11
  389. LD b1, B, 0 * SIZE
  390. MOV c31, c11
  391. LD a3, AO, 4 * SIZE
  392. MOV c41, c11
  393. LD b2, B, 1 * SIZE
  394. srai.d L, KK, 2
  395. LD b3, B, 2 * SIZE
  396. MOV c12, c11
  397. LD b4, B, 3 * SIZE
  398. MOV c22, c11
  399. LD b5, B, 4 * SIZE
  400. MOV c32, c11
  401. MOV c42, c11
  402. move BO, B
  403. bge $r0, L, .L25
  404. #else
  /* LN / RT: accumulate over the K - KK steps that start at offset KK.
     AO is offset by KK entries, BO by 2 * KK entries (two columns). */
  405. #ifdef LN
  406. slli.d TEMP, K, ZBASE_SHIFT
  407. sub.d AORIG, AORIG, TEMP
  408. #endif
  409. slli.d L, KK, ZBASE_SHIFT
  410. slli.d TEMP, KK, 1 + ZBASE_SHIFT
  411. add.d AO, AORIG, L
  412. add.d BO, B, TEMP
  /* TEMP = K - KK is kept for the remainder test at .L25. */
  413. sub.d TEMP, K, KK
  414. LD a1, AO, 0 * SIZE
  415. MOV c21, c11
  416. LD b1, BO, 0 * SIZE
  417. MOV c31, c11
  418. LD a3, AO, 4 * SIZE
  419. MOV c41, c11
  420. LD b2, BO, 1 * SIZE
  421. srai.d L, TEMP, 2
  422. LD b3, BO, 2 * SIZE
  423. MOV c12, c11
  424. LD b4, BO, 3 * SIZE
  425. MOV c22, c11
  426. LD b5, BO, 4 * SIZE
  427. MOV c32, c11
  428. MOV c42, c11
  429. bge $r0, L, .L25
  430. #endif
  431. .align 3
  /* Unrolled accumulate loop: four steps per trip (AO advances by 8 * SIZE,
     BO by 16 * SIZE).  Loads are interleaved with the multiply-adds, so the
     instruction order matters. */
  432. .L22:
  433. MADD1 c11, b1, a1, c11
  434. LD a2, AO, 1 * SIZE
  435. MADD3 c21, b2, a1, c21
  436. addi.d L, L, -1
  437. MADD1 c31, b3, a1, c31
  438. MADD3 c41, b4, a1, c41
  439. LD a1, AO, 2 * SIZE
  440. MADD2 c12, b1, a2, c12
  441. LD b1, BO, 8 * SIZE
  442. MADD4 c22, b2, a2, c22
  443. LD b2, BO, 5 * SIZE
  444. MADD2 c32, b3, a2, c32
  445. LD b3, BO, 6 * SIZE
  446. MADD4 c42, b4, a2, c42
  447. LD b4, BO, 7 * SIZE
  448. MADD1 c11, b5, a1, c11
  449. LD a2, AO, 3 * SIZE
  450. MADD3 c21, b2, a1, c21
  451. MADD1 c31, b3, a1, c31
  452. MADD3 c41, b4, a1, c41
  453. LD a1, AO, 8 * SIZE
  454. MADD2 c12, b5, a2, c12
  455. LD b5, BO, 12 * SIZE
  456. MADD4 c22, b2, a2, c22
  457. LD b2, BO, 9 * SIZE
  458. MADD2 c32, b3, a2, c32
  459. LD b3, BO, 10 * SIZE
  460. MADD4 c42, b4, a2, c42
  461. LD b4, BO, 11 * SIZE
  462. MADD1 c11, b1, a3, c11
  463. LD a2, AO, 5 * SIZE
  464. MADD3 c21, b2, a3, c21
  465. MADD1 c31, b3, a3, c31
  466. MADD3 c41, b4, a3, c41
  467. LD a3, AO, 6 * SIZE
  468. MADD2 c12, b1, a2, c12
  469. LD b1, BO, 16 * SIZE
  470. MADD4 c22, b2, a2, c22
  471. LD b2, BO, 13 * SIZE
  472. MADD2 c32, b3, a2, c32
  473. LD b3, BO, 14 * SIZE
  474. MADD4 c42, b4, a2, c42
  475. LD b4, BO, 15 * SIZE
  476. MADD1 c11, b5, a3, c11
  477. LD a2, AO, 7 * SIZE
  478. MADD3 c21, b2, a3, c21
  479. addi.d AO, AO, 8 * SIZE
  480. MADD1 c31, b3, a3, c31
  481. MADD3 c41, b4, a3, c41
  482. LD a3, AO, 4 * SIZE
  483. MADD2 c12, b5, a2, c12
  484. LD b5, BO, 20 * SIZE
  485. MADD4 c22, b2, a2, c22
  486. LD b2, BO, 17 * SIZE
  487. MADD2 c32, b3, a2, c32
  488. LD b3, BO, 18 * SIZE
  489. MADD4 c42, b4, a2, c42
  490. LD b4, BO, 19 * SIZE
  491. addi.d BO, BO, 16 * SIZE
  492. blt $r0, L, .L22
  493. .align 3
  /* Remainder: the low two bits of the step count (0..3 single steps). */
  494. .L25:
  495. #if defined(LT) || defined(RN)
  496. andi L, KK, 3
  497. #else
  498. andi L, TEMP, 3
  499. #endif
  500. bge $r0, L, .L28
  501. .align 3
  /* One step per trip: AO advances by 2 * SIZE, BO by 4 * SIZE. */
  502. .L26:
  503. MADD1 c11, b1, a1, c11
  504. LD a2, AO, 1 * SIZE
  505. MADD3 c21, b2, a1, c21
  506. addi.d L, L, -1
  507. MADD1 c31, b3, a1, c31
  508. addi.d BO, BO, 4 * SIZE
  509. MADD3 c41, b4, a1, c41
  510. LD a1, AO, 2 * SIZE
  511. MADD2 c12, b1, a2, c12
  512. LD b1, BO, 0 * SIZE
  513. MADD4 c22, b2, a2, c22
  514. LD b2, BO, 1 * SIZE
  515. MADD2 c32, b3, a2, c32
  516. LD b3, BO, 2 * SIZE
  517. MADD4 c42, b4, a2, c42
  518. LD b4, BO, 3 * SIZE
  519. addi.d AO, AO, 2 * SIZE
  520. blt $r0, L, .L26
  /* Fold the partial sums into (c11, c12) for column 1 and (c31, c32) for
     column 2. */
  521. .L28:
  522. ADD c11, c11, c22
  523. ADD c12, c12, c21
  524. ADD c31, c31, c42
  525. ADD c32, c32, c41
  /* LN / RT: re-point AO / BO at the block to solve (KK - 1 for LN,
     KK - 2 for RT); BO uses twice the scale of AO (two columns). */
  526. #if defined(LN) || defined(RT)
  527. #ifdef LN
  528. addi.d TEMP, KK, -1
  529. #else
  530. addi.d TEMP, KK, -2
  531. #endif
  532. slli.d L, TEMP, ZBASE_SHIFT
  533. slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
  534. add.d AO, AORIG, L
  535. add.d BO, B, TEMP
  536. #endif
  /* Combine the accumulated sums with the stored values: read from BO for
     the left-side variants, from AO for the right-side ones. */
  537. #if defined(LN) || defined(LT)
  538. LD b1, BO, 0 * SIZE
  539. LD b2, BO, 1 * SIZE
  540. LD b3, BO, 2 * SIZE
  541. LD b4, BO, 3 * SIZE
  542. SUB c11, b1, c11
  543. SUB c12, b2, c12
  544. SUB c31, b3, c31
  545. SUB c32, b4, c32
  546. #else
  547. LD b1, AO, 0 * SIZE
  548. LD b2, AO, 1 * SIZE
  549. LD b3, AO, 2 * SIZE
  550. LD b4, AO, 3 * SIZE
  551. SUB c11, b1, c11
  552. SUB c12, b2, c12
  553. SUB c31, b3, c31
  554. SUB c32, b4, c32
  555. #endif
  /* LN / LT: both columns are multiplied by the same pair (b1, b2) from AO.
     NOTE(review): multiplication rather than division suggests the packed
     diagonal is stored already inverted - confirm against the packing code. */
  556. #if defined(LN) || defined(LT)
  557. LD b1, AO, 0 * SIZE
  558. LD b2, AO, 1 * SIZE
  559. MUL a1, b2, c12
  560. MUL a2, b2, c11
  561. MUL a3, b2, c32
  562. MUL a4, b2, c31
  563. MADD5 c11, c11, b1, a1
  564. MADD6 c12, c12, b1, a2
  565. MADD5 c31, c31, b1, a3
  566. MADD6 c32, c32, b1, a4
  567. #endif
  /* RN: column 1 first (pair at BO 0/1), then its contribution is removed
     from column 2 using the pair at BO 2/3, then column 2 is scaled by the
     pair at BO 6/7. */
  568. #ifdef RN
  569. LD b1, BO, 0 * SIZE
  570. LD b2, BO, 1 * SIZE
  571. LD b3, BO, 2 * SIZE
  572. LD b4, BO, 3 * SIZE
  573. MUL a1, b2, c12
  574. MUL a2, b2, c11
  575. MADD5 c11, c11, b1, a1
  576. MADD6 c12, c12, b1, a2
  577. NMSUB c31, c11, b3, c31
  578. MADD7 c32, c11, b4, c32
  579. MADD8 c31, c12, b4, c31
  580. NMSUB c32, c12, b3, c32
  581. LD b3, BO, 6 * SIZE
  582. LD b4, BO, 7 * SIZE
  583. MUL a1, b4, c32
  584. MUL a2, b4, c31
  585. MADD5 c31, c31, b3, a1
  586. MADD6 c32, c32, b3, a2
  587. #endif
  /* RT: the mirror image - column 2 first (pair at BO 6/7), then column 1 is
     updated with the pair at BO 4/5 and scaled by the pair at BO 0/1. */
  588. #ifdef RT
  589. LD b5, BO, 6 * SIZE
  590. LD b6, BO, 7 * SIZE
  591. LD b7, BO, 4 * SIZE
  592. LD b8, BO, 5 * SIZE
  593. MUL a1, b6, c32
  594. MUL a2, b6, c31
  595. MADD5 c31, c31, b5, a1
  596. MADD6 c32, c32, b5, a2
  597. NMSUB c11, c31, b7, c11
  598. MADD7 c12, c31, b8, c12
  599. MADD8 c11, c32, b8, c11
  600. NMSUB c12, c32, b7, c12
  601. LD b7, BO, 0 * SIZE
  602. LD b8, BO, 1 * SIZE
  603. MUL a1, b8, c12
  604. MUL a2, b8, c11
  605. MADD5 c11, c11, b7, a1
  606. MADD6 c12, c12, b7, a2
  607. #endif
  /* Write the results back into the packed buffer they were read from ... */
  608. #if defined(LN) || defined(LT)
  609. ST c11, BO, 0 * SIZE
  610. ST c12, BO, 1 * SIZE
  611. ST c31, BO, 2 * SIZE
  612. ST c32, BO, 3 * SIZE
  613. #else
  614. ST c11, AO, 0 * SIZE
  615. ST c12, AO, 1 * SIZE
  616. ST c31, AO, 2 * SIZE
  617. ST c32, AO, 3 * SIZE
  618. #endif
  /* ... and into the two C columns.  LN moves the column pointers backwards
     (pre-decrement), the other variants forwards (post-increment). */
  619. #ifdef LN
  620. addi.d CO1,CO1, -2 * SIZE
  621. addi.d CO2,CO2, -2 * SIZE
  622. #endif
  623. ST c11, CO1, 0 * SIZE
  624. ST c12, CO1, 1 * SIZE
  625. ST c31, CO2, 0 * SIZE
  626. ST c32, CO2, 1 * SIZE
  627. #ifndef LN
  628. addi.d CO1,CO1, 2 * SIZE
  629. addi.d CO2,CO2, 2 * SIZE
  630. #endif
  /* Reload c11 from $r0 for the next row. */
  631. MTC c11, $r0
  632. #ifdef RT
  633. slli.d TEMP, K, ZBASE_SHIFT
  634. add.d AORIG, AORIG, TEMP
  635. #endif
  /* LT / RN: skip the remaining K - KK entries in AO and 2 * (K - KK) in BO. */
  636. #if defined(LT) || defined(RN)
  637. sub.d TEMP, K, KK
  638. slli.d L, TEMP, ZBASE_SHIFT
  639. slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
  640. add.d AO, AO, L
  641. add.d BO, BO, TEMP
  642. #endif
  643. #ifdef LT
  644. addi.d KK, KK, 1
  645. #endif
  646. #ifdef LN
  647. addi.d KK, KK, -1
  648. #endif
  649. addi.d I, I, -1
  650. blt $r0, I, .L21
  651. .align 3
  /* End of the two-column panel: move B and KK on to the next panel. */
  652. .L29:
  653. #ifdef LN
  654. slli.d TEMP, K, 1 + ZBASE_SHIFT
  655. add.d B, B, TEMP
  656. #endif
  657. #if defined(LT) || defined(RN)
  658. move B, BO
  659. #endif
  660. #ifdef RN
  661. addi.d KK, KK, 2
  662. #endif
  663. #ifdef RT
  664. addi.d KK, KK, -2
  665. #endif
  666. .align 3
  /* ---- Four-column panels: J = N / 4 trips; exit to .L999 when none. ---- */
  667. .L30:
  668. srai.d J, N, 2
  669. nop
  670. bge $r0, J, .L999
  671. .L10:
  /* RT walks backwards: step B back by 4*K (scaled) and C back by four
     columns. */
  672. #ifdef RT
  673. slli.d TEMP, K, 2 + ZBASE_SHIFT
  674. sub.d B, B, TEMP
  675. slli.d TEMP, LDC, 2
  676. sub.d C, C, TEMP
  677. #endif
  /* Column pointers CO1..CO4 are LDC apart; c11 is loaded from $r0 and copied
     into the first accumulators.  J is decremented here and tested at the end
     of the panel. */
  678. move CO1, C
  679. MTC c11, $r0
  680. add.d CO2, C, LDC
  681. add.d CO3, CO2, LDC
  682. addi.d J, J, -1
  683. add.d CO4, CO3, LDC
  684. MOV c21, c11
  685. MOV c31, c11
  686. MOV c41, c11
  687. MOV c51, c11
  688. move I, M
  689. #ifdef LN
  690. add.d KK, M, OFFSET
  691. #endif
  692. #ifdef LT
  693. move KK, OFFSET
  694. #endif
  695. #if defined(LN) || defined(RT)
  696. move AORIG, A
  697. #else
  698. move AO, A
  699. #endif
  /* Forward variants advance C past the four columns now. */
  700. #ifndef RT
  701. add.d C, CO4, LDC
  702. #endif
  703. MOV c61, c11
  /* Row loop over I = M, skipped entirely when M <= 0. */
  704. bge $r0, I, .L19
  705. .align 3
  706. .L11:
  /* LT / RN: accumulate over the first KK steps, reading B from its start.
     Preload operands, clear the remaining accumulators, L = KK / 4. */
  707. #if defined(LT) || defined(RN)
  708. LD a1, AO, 0 * SIZE
  709. MOV c71, c11
  710. LD b1, B, 0 * SIZE
  711. MOV c81, c11
  712. LD a3, AO, 4 * SIZE
  713. MOV c12, c11
  714. LD b2, B, 1 * SIZE
  715. MOV c22, c11
  716. srai.d L, KK, 2
  717. MOV c32, c11
  718. LD b3, B, 2 * SIZE
  719. MOV c42, c11
  720. LD b4, B, 3 * SIZE
  721. MOV c52, c11
  722. LD b5, B, 4 * SIZE
  723. MOV c62, c11
  724. LD b6, B, 8 * SIZE
  725. MOV c72, c11
  726. LD b7, B, 12 * SIZE
  727. MOV c82, c11
  728. move BO, B
  729. bge $r0, L, .L15
  730. #else
  /* LN / RT: accumulate over the K - KK steps that start at offset KK.
     AO is offset by KK entries, BO by 4 * KK entries (four columns). */
  731. #ifdef LN
  732. slli.d TEMP, K, ZBASE_SHIFT
  733. sub.d AORIG, AORIG, TEMP
  734. #endif
  735. slli.d L, KK, ZBASE_SHIFT
  736. slli.d TEMP, KK, 2 + ZBASE_SHIFT
  737. add.d AO, AORIG, L
  738. add.d BO, B, TEMP
  /* TEMP = K - KK is kept for the remainder test at .L15. */
  739. sub.d TEMP, K, KK
  740. LD a1, AO, 0 * SIZE
  741. MOV c71, c11
  742. LD b1, BO, 0 * SIZE
  743. MOV c81, c11
  744. LD a3, AO, 4 * SIZE
  745. MOV c12, c11
  746. LD b2, BO, 1 * SIZE
  747. MOV c22, c11
  748. srai.d L, TEMP, 2
  749. MOV c32, c11
  750. LD b3, BO, 2 * SIZE
  751. MOV c42, c11
  752. LD b4, BO, 3 * SIZE
  753. MOV c52, c11
  754. LD b5, BO, 4 * SIZE
  755. MOV c62, c11
  756. LD b6, BO, 8 * SIZE
  757. MOV c72, c11
  758. LD b7, BO, 12 * SIZE
  759. MOV c82, c11
  760. bge $r0, L, .L15
  761. #endif
  /* Software-pipelined accumulate loop for the four-column panel.
     The first four multiply-adds of a trip are issued here, ahead of the
     loop; L is decremented and, if no further trip remains, control goes
     straight to the drain sequence at .L13. */
  762. MADD1 c11, b1, a1, c11
  763. LD a2, AO, 1 * SIZE
  764. MADD3 c21, b2, a1, c21
  765. addi.d L, L, -1
  766. MADD1 c31, b3, a1, c31
  767. MADD3 c41, b4, a1, c41
  768. bge $r0, L, .L13
  769. .align 3
  /* Steady state: each trip covers four steps (AO advances by 8 * SIZE, BO by
     32 * SIZE) and ends by issuing the first four multiply-adds of the next
     trip.  Loads are interleaved with arithmetic; do not reorder. */
  770. .L12:
  771. MADD2 c12, b1, a2, c12
  772. LD b1, BO, 16 * SIZE
  773. MADD4 c22, b2, a2, c22
  774. LD b2, BO, 5 * SIZE
  775. MADD2 c32, b3, a2, c32
  776. LD b3, BO, 6 * SIZE
  777. MADD4 c42, b4, a2, c42
  778. LD b4, BO, 7 * SIZE
  779. MADD1 c51, b5, a1, c51
  780. MADD3 c61, b2, a1, c61
  781. LD a4, AO, 2 * SIZE
  782. MADD1 c71, b3, a1, c71
  783. MADD3 c81, b4, a1, c81
  784. LD a1, AO, 8 * SIZE
  785. MADD2 c52, b5, a2, c52
  786. LD b5, BO, 20 * SIZE
  787. MADD4 c62, b2, a2, c62
  788. LD b2, BO, 9 * SIZE
  789. MADD2 c72, b3, a2, c72
  790. LD b3, BO, 10 * SIZE
  791. MADD4 c82, b4, a2, c82
  792. LD b4, BO, 11 * SIZE
  793. MADD1 c11, b6, a4, c11
  794. LD a2, AO, 3 * SIZE
  795. MADD3 c21, b2, a4, c21
  796. MADD1 c31, b3, a4, c31
  797. MADD3 c41, b4, a4, c41
  798. MADD2 c12, b6, a2, c12
  799. LD b6, BO, 24 * SIZE
  800. MADD4 c22, b2, a2, c22
  801. LD b2, BO, 13 * SIZE
  802. MADD2 c32, b3, a2, c32
  803. LD b3, BO, 14 * SIZE
  804. MADD4 c42, b4, a2, c42
  805. LD b4, BO, 15 * SIZE
  806. MADD1 c51, b7, a4, c51
  807. MADD3 c61, b2, a4, c61
  808. MADD1 c71, b3, a4, c71
  809. MADD3 c81, b4, a4, c81
  810. MADD2 c52, b7, a2, c52
  811. LD b7, BO, 28 * SIZE
  812. MADD4 c62, b2, a2, c62
  813. LD b2, BO, 17 * SIZE
  814. MADD2 c72, b3, a2, c72
  815. LD b3, BO, 18 * SIZE
  816. MADD4 c82, b4, a2, c82
  817. LD b4, BO, 19 * SIZE
  818. MADD1 c11, b1, a3, c11
  819. LD a2, AO, 5 * SIZE
  820. MADD3 c21, b2, a3, c21
  821. MADD1 c31, b3, a3, c31
  822. MADD3 c41, b4, a3, c41
  823. MADD2 c12, b1, a2, c12
  824. LD b1, BO, 32 * SIZE
  825. MADD4 c22, b2, a2, c22
  826. LD b2, BO, 21 * SIZE
  827. MADD2 c32, b3, a2, c32
  828. LD b3, BO, 22 * SIZE
  829. MADD4 c42, b4, a2, c42
  830. LD b4, BO, 23 * SIZE
  831. MADD1 c51, b5, a3, c51
  832. MADD3 c61, b2, a3, c61
  833. LD a4, AO, 6 * SIZE
  834. MADD1 c71, b3, a3, c71
  835. MADD3 c81, b4, a3, c81
  836. LD a3, AO, 12 * SIZE
  837. MADD2 c52, b5, a2, c52
  838. LD b5, BO, 36 * SIZE
  839. MADD4 c62, b2, a2, c62
  840. LD b2, BO, 25 * SIZE
  841. MADD2 c72, b3, a2, c72
  842. LD b3, BO, 26 * SIZE
  843. MADD4 c82, b4, a2, c82
  844. LD b4, BO, 27 * SIZE
  845. MADD1 c11, b6, a4, c11
  846. LD a2, AO, 7 * SIZE
  847. MADD3 c21, b2, a4, c21
  848. MADD1 c31, b3, a4, c31
  849. MADD3 c41, b4, a4, c41
  850. addi.d L, L, -1
  851. MADD2 c12, b6, a2, c12
  852. LD b6, BO, 40 * SIZE
  853. MADD4 c22, b2, a2, c22
  854. LD b2, BO, 29 * SIZE
  855. MADD2 c32, b3, a2, c32
  856. LD b3, BO, 30 * SIZE
  857. MADD4 c42, b4, a2, c42
  858. LD b4, BO, 31 * SIZE
  859. MADD1 c51, b7, a4, c51
  860. addi.d BO, BO, 32 * SIZE
  861. MADD3 c61, b2, a4, c61
  862. addi.d AO, AO, 8 * SIZE
  863. MADD1 c71, b3, a4, c71
  864. MADD3 c81, b4, a4, c81
  865. MADD2 c52, b7, a2, c52
  866. LD b7, BO, 12 * SIZE
  867. MADD4 c62, b2, a2, c62
  868. LD b2, BO, 1 * SIZE
  869. MADD2 c72, b3, a2, c72
  870. LD b3, BO, 2 * SIZE
  871. MADD4 c82, b4, a2, c82
  872. LD b4, BO, 3 * SIZE
  /* Head of the next trip, issued before the loop test. */
  873. MADD1 c11, b1, a1, c11
  874. LD a2, AO, 1 * SIZE
  875. MADD3 c21, b2, a1, c21
  876. MADD1 c31, b3, a1, c31
  877. MADD3 c41, b4, a1, c41
  878. blt $r0, L, .L12
  879. .align 3
  /* Drain: the same sequence as the loop body for the final trip, without the
     L decrement and without starting another trip. */
  880. .L13:
  881. MADD2 c12, b1, a2, c12
  882. LD b1, BO, 16 * SIZE
  883. MADD4 c22, b2, a2, c22
  884. LD b2, BO, 5 * SIZE
  885. MADD2 c32, b3, a2, c32
  886. LD b3, BO, 6 * SIZE
  887. MADD4 c42, b4, a2, c42
  888. LD b4, BO, 7 * SIZE
  889. MADD1 c51, b5, a1, c51
  890. MADD3 c61, b2, a1, c61
  891. LD a4, AO, 2 * SIZE
  892. MADD1 c71, b3, a1, c71
  893. MADD3 c81, b4, a1, c81
  894. LD a1, AO, 8 * SIZE
  895. MADD2 c52, b5, a2, c52
  896. LD b5, BO, 20 * SIZE
  897. MADD4 c62, b2, a2, c62
  898. LD b2, BO, 9 * SIZE
  899. MADD2 c72, b3, a2, c72
  900. LD b3, BO, 10 * SIZE
  901. MADD4 c82, b4, a2, c82
  902. LD b4, BO, 11 * SIZE
  903. MADD1 c11, b6, a4, c11
  904. LD a2, AO, 3 * SIZE
  905. MADD3 c21, b2, a4, c21
  906. MADD1 c31, b3, a4, c31
  907. MADD3 c41, b4, a4, c41
  908. MADD2 c12, b6, a2, c12
  909. LD b6, BO, 24 * SIZE
  910. MADD4 c22, b2, a2, c22
  911. LD b2, BO, 13 * SIZE
  912. MADD2 c32, b3, a2, c32
  913. LD b3, BO, 14 * SIZE
  914. MADD4 c42, b4, a2, c42
  915. LD b4, BO, 15 * SIZE
  916. MADD1 c51, b7, a4, c51
  917. MADD3 c61, b2, a4, c61
  918. MADD1 c71, b3, a4, c71
  919. MADD3 c81, b4, a4, c81
  920. MADD2 c52, b7, a2, c52
  921. LD b7, BO, 28 * SIZE
  922. MADD4 c62, b2, a2, c62
  923. LD b2, BO, 17 * SIZE
  924. MADD2 c72, b3, a2, c72
  925. LD b3, BO, 18 * SIZE
  926. MADD4 c82, b4, a2, c82
  927. LD b4, BO, 19 * SIZE
  928. MADD1 c11, b1, a3, c11
  929. LD a2, AO, 5 * SIZE
  930. MADD3 c21, b2, a3, c21
  931. MADD1 c31, b3, a3, c31
  932. MADD3 c41, b4, a3, c41
  933. MADD2 c12, b1, a2, c12
  934. LD b1, BO, 32 * SIZE
  935. MADD4 c22, b2, a2, c22
  936. LD b2, BO, 21 * SIZE
  937. MADD2 c32, b3, a2, c32
  938. LD b3, BO, 22 * SIZE
  939. MADD4 c42, b4, a2, c42
  940. LD b4, BO, 23 * SIZE
  941. MADD1 c51, b5, a3, c51
  942. MADD3 c61, b2, a3, c61
  943. LD a4, AO, 6 * SIZE
  944. MADD1 c71, b3, a3, c71
  945. MADD3 c81, b4, a3, c81
  946. LD a3, AO, 12 * SIZE
  947. MADD2 c52, b5, a2, c52
  948. LD b5, BO, 36 * SIZE
  949. MADD4 c62, b2, a2, c62
  950. LD b2, BO, 25 * SIZE
  951. MADD2 c72, b3, a2, c72
  952. LD b3, BO, 26 * SIZE
  953. MADD4 c82, b4, a2, c82
  954. LD b4, BO, 27 * SIZE
  955. MADD1 c11, b6, a4, c11
  956. LD a2, AO, 7 * SIZE
  957. MADD3 c21, b2, a4, c21
  958. MADD1 c31, b3, a4, c31
  959. MADD3 c41, b4, a4, c41
  960. MADD2 c12, b6, a2, c12
  961. LD b6, BO, 40 * SIZE
  962. MADD4 c22, b2, a2, c22
  963. LD b2, BO, 29 * SIZE
  964. MADD2 c32, b3, a2, c32
  965. LD b3, BO, 30 * SIZE
  966. MADD4 c42, b4, a2, c42
  967. LD b4, BO, 31 * SIZE
  968. MADD1 c51, b7, a4, c51
  969. addi.d BO, BO, 32 * SIZE
  970. MADD3 c61, b2, a4, c61
  971. addi.d AO, AO, 8 * SIZE
  972. MADD1 c71, b3, a4, c71
  973. MADD3 c81, b4, a4, c81
  974. MADD2 c52, b7, a2, c52
  975. LD b7, BO, 12 * SIZE
  976. MADD4 c62, b2, a2, c62
  977. LD b2, BO, 1 * SIZE
  978. MADD2 c72, b3, a2, c72
  979. LD b3, BO, 2 * SIZE
  980. MADD4 c82, b4, a2, c82
  981. LD b4, BO, 3 * SIZE
  982. .align 3
  /* Remainder of the four-column accumulate: the low two bits of the step
     count (KK for LT / RN, TEMP = K - KK for LN / RT), i.e. 0..3 single
     steps. */
  983. .L15:
  984. #if defined(LT) || defined(RN)
  985. andi L, KK, 3
  986. #else
  987. andi L, TEMP, 3
  988. #endif
  989. bge $r0, L, .L18
  990. .align 3
  /* One step per trip: AO advances by 2 * SIZE and BO by 8 * SIZE in the
     middle of the body, so the loads after the advance use offsets relative
     to the new pointers. */
  991. .L16:
  992. MADD1 c11, b1, a1, c11
  993. LD a2, AO, 1 * SIZE
  994. MADD3 c21, b2, a1, c21
  995. MADD1 c31, b3, a1, c31
  996. MADD3 c41, b4, a1, c41
  997. MADD2 c12, b1, a2, c12
  998. LD b1, BO, 8 * SIZE
  999. MADD4 c22, b2, a2, c22
  1000. LD b2, BO, 5 * SIZE
  1001. MADD2 c32, b3, a2, c32
  1002. LD b3, BO, 6 * SIZE
  1003. MADD4 c42, b4, a2, c42
  1004. LD b4, BO, 7 * SIZE
  1005. MADD1 c51, b5, a1, c51
  1006. addi.d L, L, -1
  1007. MADD3 c61, b2, a1, c61
  1008. addi.d AO, AO, 2 * SIZE
  1009. MADD1 c71, b3, a1, c71
  1010. addi.d BO, BO, 8 * SIZE
  1011. MADD3 c81, b4, a1, c81
  1012. LD a1, AO, 0 * SIZE
  1013. MADD2 c52, b5, a2, c52
  1014. LD b5, BO, 4 * SIZE
  1015. MADD4 c62, b2, a2, c62
  1016. LD b2, BO, 1 * SIZE
  1017. MADD2 c72, b3, a2, c72
  1018. LD b3, BO, 2 * SIZE
  1019. MADD4 c82, b4, a2, c82
  1020. LD b4, BO, 3 * SIZE
  1021. blt $r0, L, .L16
  /* Solve / store phase of the four-column panel.
     Fold the partial sums into one pair per column:
     (c11,c12), (c31,c32), (c51,c52), (c71,c72). */
  1022. .L18:
  1023. ADD c11, c11, c22
  1024. ADD c12, c12, c21
  1025. ADD c31, c31, c42
  1026. ADD c32, c32, c41
  1027. ADD c51, c51, c62
  1028. ADD c52, c52, c61
  1029. ADD c71, c71, c82
  1030. ADD c72, c72, c81
  /* LN / RT: re-point AO / BO at the block to solve (KK - 1 for LN,
     KK - 4 for RT); BO uses four times the scale of AO (four columns). */
  1031. #if defined(LN) || defined(RT)
  1032. #ifdef LN
  1033. addi.d TEMP, KK, -1
  1034. #else
  1035. addi.d TEMP, KK, -4
  1036. #endif
  1037. slli.d L, TEMP, ZBASE_SHIFT
  1038. slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
  1039. add.d AO, AORIG, L
  1040. add.d BO, B, TEMP
  1041. #endif
  /* Combine the accumulated sums with the eight stored values: read from BO
     for the left-side variants, from AO for the right-side ones. */
  1042. #if defined(LN) || defined(LT)
  1043. LD b1, BO, 0 * SIZE
  1044. LD b2, BO, 1 * SIZE
  1045. LD b3, BO, 2 * SIZE
  1046. LD b4, BO, 3 * SIZE
  1047. LD b5, BO, 4 * SIZE
  1048. LD b6, BO, 5 * SIZE
  1049. LD b7, BO, 6 * SIZE
  1050. LD b8, BO, 7 * SIZE
  1051. SUB c11, b1, c11
  1052. SUB c12, b2, c12
  1053. SUB c31, b3, c31
  1054. SUB c32, b4, c32
  1055. SUB c51, b5, c51
  1056. SUB c52, b6, c52
  1057. SUB c71, b7, c71
  1058. SUB c72, b8, c72
  1059. #else
  1060. LD b1, AO, 0 * SIZE
  1061. LD b2, AO, 1 * SIZE
  1062. LD b3, AO, 2 * SIZE
  1063. LD b4, AO, 3 * SIZE
  1064. LD b5, AO, 4 * SIZE
  1065. LD b6, AO, 5 * SIZE
  1066. LD b7, AO, 6 * SIZE
  1067. LD b8, AO, 7 * SIZE
  1068. SUB c11, b1, c11
  1069. SUB c12, b2, c12
  1070. SUB c31, b3, c31
  1071. SUB c32, b4, c32
  1072. SUB c51, b5, c51
  1073. SUB c52, b6, c52
  1074. SUB c71, b7, c71
  1075. SUB c72, b8, c72
  1076. #endif
  /* LN / LT: all four columns are multiplied by the same pair (b1, b2) from
     AO; a1..a4 hold the cross products.  NOTE(review): multiplication rather
     than division suggests the packed diagonal is stored already inverted -
     confirm against the packing code. */
  1077. #if defined(LN) || defined(LT)
  1078. LD b1, AO, 0 * SIZE
  1079. LD b2, AO, 1 * SIZE
  1080. MUL a1, b2, c12
  1081. MUL a2, b2, c11
  1082. MUL a3, b2, c32
  1083. MUL a4, b2, c31
  1084. MADD5 c11, c11, b1, a1
  1085. MADD6 c12, c12, b1, a2
  1086. MADD5 c31, c31, b1, a3
  1087. MADD6 c32, c32, b1, a4
  1088. MUL a1, b2, c52
  1089. MUL a2, b2, c51
  1090. MUL a3, b2, c72
  1091. MUL a4, b2, c71
  1092. MADD5 c51, c51, b1, a1
  1093. MADD6 c52, c52, b1, a2
  1094. MADD5 c71, c71, b1, a3
  1095. MADD6 c72, c72, b1, a4
  1096. #endif
  /* RN: walk the 4x4 block at BO from column 1 to column 4.  The scaling
     pairs are read at BO offsets 0, 10, 20 and 30 (* SIZE); after each column
     is scaled its contribution is removed from the columns that follow. */
  1097. #ifdef RN
  1098. LD b1, BO, 0 * SIZE
  1099. LD b2, BO, 1 * SIZE
  1100. LD b3, BO, 2 * SIZE
  1101. LD b4, BO, 3 * SIZE
  1102. LD b5, BO, 4 * SIZE
  1103. LD b6, BO, 5 * SIZE
  1104. LD b7, BO, 6 * SIZE
  1105. LD b8, BO, 7 * SIZE
  1106. MUL a1, b2, c12
  1107. MUL a2, b2, c11
  1108. MADD5 c11, c11, b1, a1
  1109. MADD6 c12, c12, b1, a2
  1110. NMSUB c31, c11, b3, c31
  1111. MADD7 c32, c11, b4, c32
  1112. NMSUB c51, c11, b5, c51
  1113. MADD7 c52, c11, b6, c52
  1114. NMSUB c71, c11, b7, c71
  1115. MADD7 c72, c11, b8, c72
  1116. MADD8 c31, c12, b4, c31
  1117. NMSUB c32, c12, b3, c32
  1118. MADD8 c51, c12, b6, c51
  1119. NMSUB c52, c12, b5, c52
  1120. MADD8 c71, c12, b8, c71
  1121. NMSUB c72, c12, b7, c72
  1122. LD b3, BO, 10 * SIZE
  1123. LD b4, BO, 11 * SIZE
  1124. LD b5, BO, 12 * SIZE
  1125. LD b6, BO, 13 * SIZE
  1126. LD b7, BO, 14 * SIZE
  1127. LD b8, BO, 15 * SIZE
  1128. MUL a1, b4, c32
  1129. MUL a2, b4, c31
  1130. MADD5 c31, c31, b3, a1
  1131. MADD6 c32, c32, b3, a2
  1132. NMSUB c51, c31, b5, c51
  1133. MADD7 c52, c31, b6, c52
  1134. NMSUB c71, c31, b7, c71
  1135. MADD7 c72, c31, b8, c72
  1136. MADD8 c51, c32, b6, c51
  1137. NMSUB c52, c32, b5, c52
  1138. MADD8 c71, c32, b8, c71
  1139. NMSUB c72, c32, b7, c72
  1140. LD b5, BO, 20 * SIZE
  1141. LD b6, BO, 21 * SIZE
  1142. LD b7, BO, 22 * SIZE
  1143. LD b8, BO, 23 * SIZE
  1144. MUL a1, b6, c52
  1145. MUL a2, b6, c51
  1146. MADD5 c51, c51, b5, a1
  1147. MADD6 c52, c52, b5, a2
  1148. NMSUB c71, c51, b7, c71
  1149. MADD7 c72, c51, b8, c72
  1150. MADD8 c71, c52, b8, c71
  1151. NMSUB c72, c52, b7, c72
  1152. LD b7, BO, 30 * SIZE
  1153. LD b8, BO, 31 * SIZE
  1154. MUL a1, b8, c72
  1155. MUL a2, b8, c71
  1156. MADD5 c71, c71, b7, a1
  1157. MADD6 c72, c72, b7, a2
  1158. #endif
  /* RT: the mirror image - walk the 4x4 block from column 4 back to
     column 1.  Scaling pairs are read at BO offsets 30, 20, 10 and 0
     (* SIZE); after each column is scaled its contribution is removed from
     the columns before it. */
  1159. #ifdef RT
  1160. LD b1, BO, 30 * SIZE
  1161. LD b2, BO, 31 * SIZE
  1162. LD b3, BO, 28 * SIZE
  1163. LD b4, BO, 29 * SIZE
  1164. LD b5, BO, 26 * SIZE
  1165. LD b6, BO, 27 * SIZE
  1166. LD b7, BO, 24 * SIZE
  1167. LD b8, BO, 25 * SIZE
  1168. MUL a1, b2, c72
  1169. MUL a2, b2, c71
  1170. MADD5 c71, c71, b1, a1
  1171. MADD6 c72, c72, b1, a2
  1172. NMSUB c51, c71, b3, c51
  1173. MADD7 c52, c71, b4, c52
  1174. NMSUB c31, c71, b5, c31
  1175. MADD7 c32, c71, b6, c32
  1176. NMSUB c11, c71, b7, c11
  1177. MADD7 c12, c71, b8, c12
  1178. MADD8 c51, c72, b4, c51
  1179. NMSUB c52, c72, b3, c52
  1180. MADD8 c31, c72, b6, c31
  1181. NMSUB c32, c72, b5, c32
  1182. MADD8 c11, c72, b8, c11
  1183. NMSUB c12, c72, b7, c12
  1184. LD b3, BO, 20 * SIZE
  1185. LD b4, BO, 21 * SIZE
  1186. LD b5, BO, 18 * SIZE
  1187. LD b6, BO, 19 * SIZE
  1188. LD b7, BO, 16 * SIZE
  1189. LD b8, BO, 17 * SIZE
  1190. MUL a1, b4, c52
  1191. MUL a2, b4, c51
  1192. MADD5 c51, c51, b3, a1
  1193. MADD6 c52, c52, b3, a2
  1194. NMSUB c31, c51, b5, c31
  1195. MADD7 c32, c51, b6, c32
  1196. NMSUB c11, c51, b7, c11
  1197. MADD7 c12, c51, b8, c12
  1198. MADD8 c31, c52, b6, c31
  1199. NMSUB c32, c52, b5, c32
  1200. MADD8 c11, c52, b8, c11
  1201. NMSUB c12, c52, b7, c12
  1202. LD b5, BO, 10 * SIZE
  1203. LD b6, BO, 11 * SIZE
  1204. LD b7, BO, 8 * SIZE
  1205. LD b8, BO, 9 * SIZE
  1206. MUL a1, b6, c32
  1207. MUL a2, b6, c31
  1208. MADD5 c31, c31, b5, a1
  1209. MADD6 c32, c32, b5, a2
  1210. NMSUB c11, c31, b7, c11
  1211. MADD7 c12, c31, b8, c12
  1212. MADD8 c11, c32, b8, c11
  1213. NMSUB c12, c32, b7, c12
  1214. LD b7, BO, 0 * SIZE
  1215. LD b8, BO, 1 * SIZE
  1216. MUL a1, b8, c12
  1217. MUL a2, b8, c11
  1218. MADD5 c11, c11, b7, a1
  1219. MADD6 c12, c12, b7, a2
  1220. #endif
  /* Write the eight results back into the packed buffer they were read
     from ... */
  1221. #if defined(LN) || defined(LT)
  1222. ST c11, BO, 0 * SIZE
  1223. ST c12, BO, 1 * SIZE
  1224. ST c31, BO, 2 * SIZE
  1225. ST c32, BO, 3 * SIZE
  1226. ST c51, BO, 4 * SIZE
  1227. ST c52, BO, 5 * SIZE
  1228. ST c71, BO, 6 * SIZE
  1229. ST c72, BO, 7 * SIZE
  1230. #else
  1231. ST c11, AO, 0 * SIZE
  1232. ST c12, AO, 1 * SIZE
  1233. ST c31, AO, 2 * SIZE
  1234. ST c32, AO, 3 * SIZE
  1235. ST c51, AO, 4 * SIZE
  1236. ST c52, AO, 5 * SIZE
  1237. ST c71, AO, 6 * SIZE
  1238. ST c72, AO, 7 * SIZE
  1239. #endif
  /* ... and into the four C columns.  LN moves the column pointers backwards
     (pre-decrement), the other variants forwards (post-increment). */
  1240. #ifdef LN
  1241. addi.d CO1,CO1, -2 * SIZE
  1242. addi.d CO2,CO2, -2 * SIZE
  1243. addi.d CO3,CO3, -2 * SIZE
  1244. addi.d CO4,CO4, -2 * SIZE
  1245. #endif
  1246. ST c11, CO1, 0 * SIZE
  1247. ST c12, CO1, 1 * SIZE
  1248. ST c31, CO2, 0 * SIZE
  1249. ST c32, CO2, 1 * SIZE
  1250. ST c51, CO3, 0 * SIZE
  1251. ST c52, CO3, 1 * SIZE
  1252. ST c71, CO4, 0 * SIZE
  1253. ST c72, CO4, 1 * SIZE
  1254. #ifndef LN
  1255. addi.d CO1,CO1, 2 * SIZE
  1256. addi.d CO2,CO2, 2 * SIZE
  1257. addi.d CO3,CO3, 2 * SIZE
  1258. addi.d CO4,CO4, 2 * SIZE
  1259. #endif
  1260. #ifdef RT
  1261. slli.d TEMP, K, ZBASE_SHIFT
  1262. add.d AORIG, AORIG, TEMP
  1263. #endif
  /* LT / RN: skip the remaining K - KK entries in AO and 4 * (K - KK) in BO. */
  1264. #if defined(LT) || defined(RN)
  1265. sub.d TEMP, K, KK
  1266. slli.d L, TEMP, ZBASE_SHIFT
  1267. slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
  1268. add.d AO, AO, L
  1269. add.d BO, BO, TEMP
  1270. #endif
  1271. #ifdef LT
  1272. addi.d KK, KK, 1
  1273. #endif
  1274. #ifdef LN
  1275. addi.d KK, KK, -1
  1276. #endif
  /* Reload c11 from $r0 and re-clear the accumulators that .L11 does not
     clear itself, then loop over the remaining rows. */
  1277. MTC c11, $r0
  1278. addi.d I, I, -1
  1279. MOV c21, c11
  1280. MOV c31, c11
  1281. MOV c41, c11
  1282. MOV c51, c11
  1283. MOV c61, c11
  1284. blt $r0, I, .L11
  1285. .align 3
  /* End of one four-column panel: move B and KK on, then repeat while J > 0. */
  1286. .L19:
  1287. #ifdef LN
  1288. slli.d TEMP, K, 2 + ZBASE_SHIFT
  1289. add.d B, B, TEMP
  1290. #endif
  1291. #if defined(LT) || defined(RN)
  1292. move B, BO
  1293. #endif
  1294. #ifdef RN
  1295. addi.d KK, KK, 4
  1296. #endif
  1297. #ifdef RT
  1298. addi.d KK, KK, -4
  1299. #endif
  1300. blt $r0, J, .L10
  1301. .align 3
  /* Common exit: reload the registers spilled after PROLOGUE from the same
     frame offsets, release the 128-byte frame and return. */
  1302. .L999:
  1303. LDARG $r23, $sp, 0
  1304. LDARG $r24, $sp, 8
  1305. LDARG $r25, $sp, 16
  1306. LDARG $r26, $sp, 24
  1307. LDARG $r27, $sp, 32
  1308. LDARG $r28, $sp, 40
  1309. fld.d $f24, $sp, 48
  1310. fld.d $f25, $sp, 56
  1311. fld.d $f26, $sp, 64
  1312. fld.d $f27, $sp, 72
  1313. #ifndef __64BIT__
  1314. fld.d $f18, $sp, 88
  1315. fld.d $f19, $sp, 96
  1316. fld.d $f20, $sp, 104
  1317. fld.d $f21, $sp, 112
  1318. #endif
  1319. addi.d $sp, $sp, 128
  /* $r4 and $f0 are overwritten with $r17 (the I counter) and $f22 (a1)
     before returning.  NOTE(review): neither source register is given a
     dedicated result value in this routine, so whatever the caller sees in
     $r4 / $f0 is presumably ignored - confirm at the call sites. */
  1320. move $r4, $r17
  1321. fmov.d $f0, $f22
  /* Return through the address in $r1. */
  1322. jirl $r0, $r1, 0x0
  1323. EPILOGUE