You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LT.S 31 kB


  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer register aliases.  $r4-$r11 are read as M, N, K, A, B, C, LDC and
     OFFSET without being written first in this file.
     NOTE(review): presumably these are the incoming kernel arguments in
     argument-register order - confirm against the C prototype of the caller. */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  32. #define A $r7
  33. #define B $r8
  34. #define C $r9
  35. #define LDC $r10
  36. #define OFFSET $r11
  /* Working pointers into A and B, and the loop counters (I, J, L). */
  37. #define AO $r12
  38. #define BO $r13
  39. #define I $r17
  40. #define J $r18
  41. #define L $r25
  /* Output column pointers; CO2..CO4 are built as C + 1..3 * LDC. */
  42. #define CO1 $r14
  43. #define CO2 $r15
  44. #define CO3 $r23
  45. #define CO4 $r24
  /* KK is the running offset counter, TEMP a scratch register, and AORIG the
     base of A used by the LN/RT code paths. */
  46. #define KK $r26
  47. #define TEMP $r27
  48. #define AORIG $r28
  /* Floating-point registers holding values loaded through AO (a*) and
     through BO/B (b*). */
  49. #define a1 $f22
  50. #define a2 $f8
  51. #define a3 $f26
  52. #define a4 $f27
  53. #define b1 $f23
  54. #define b2 $f9
  55. #define b3 $f10
  56. #define b4 $f11
  57. #define b5 $f12
  58. #define b6 $f13
  59. #define b7 $f14
  60. #define b8 $f15
  /* a5 shares $f15 with b8. */
  61. #define a5 b8
  /* Floating-point accumulators c11..c82. */
  62. #define c11 $f16
  63. #define c12 $f17
  64. #define c21 $f0
  65. #define c22 $f1
  66. #define c31 $f2
  67. #define c32 $f3
  68. #define c41 $f4
  69. #define c42 $f5
  70. #define c51 $f6
  71. #define c52 $f7
  72. #define c61 $f18
  73. #define c62 $f19
  74. #define c71 $f20
  75. #define c72 $f21
  76. #define c81 $f24
  77. #define c82 $f25
  /* Map MADD1..MADD8 onto the MADD / NMSUB / MSUB macros (not defined in this
     file; presumably supplied by common.h).  MADD1..MADD4 are used in the
     accumulation loops and MADD5..MADD8 in the solve steps.  The mapping
     depends on CONJ and, when CONJ is defined, on whether LN or LT is defined.
     NOTE(review): the sign pattern presumably implements conjugated versus
     plain complex multiplication - confirm against the macro definitions. */
  78. #ifndef CONJ
  79. #define MADD1 MADD
  80. #define MADD2 MADD
  81. #define MADD3 MADD
  82. #define MADD4 NMSUB
  83. #define MADD5 MSUB
  84. #define MADD6 MADD
  85. #define MADD7 NMSUB
  86. #define MADD8 MADD
  87. #else
  88. #if defined(LN) || defined(LT)
  89. #define MADD1 MADD
  90. #define MADD2 NMSUB
  91. #define MADD3 MADD
  92. #define MADD4 MADD
  93. #else
  94. #define MADD1 MADD
  95. #define MADD2 MADD
  96. #define MADD3 NMSUB
  97. #define MADD4 MADD
  98. #endif
  /* MADD5..MADD8 for CONJ do not depend on LN/LT. */
  99. #define MADD5 MADD
  100. #define MADD6 MSUB
  101. #define MADD7 MADD
  102. #define MADD8 NMSUB
  103. #endif
  /* Kernel entry: allocate a 128-byte frame, spill the registers this routine
     overwrites, convert LDC to a byte stride and bias the A/B/C pointers and
     KK for the selected variant (LN / RN / RT).
     Frame layout: $r23-$r28 at 0..40, $f24-$f27 at 48..72, and (only when
     __64BIT__ is not defined) $f18-$f21 at 88..112. */
  104. PROLOGUE
  105. addi.d $sp, $sp, -128
  106. SDARG $r23, $sp, 0
  107. SDARG $r24, $sp, 8
  108. SDARG $r25, $sp, 16
  109. SDARG $r26, $sp, 24
  110. SDARG $r27, $sp, 32
  111. SDARG $r28, $sp, 40
  112. fst.d $f24, $sp, 48
  113. fst.d $f25, $sp, 56
  114. fst.d $f26, $sp, 64
  115. fst.d $f27, $sp, 72
  116. #ifndef __64BIT__
  117. fst.d $f18, $sp, 88
  118. fst.d $f19, $sp, 96
  119. fst.d $f20, $sp, 104
  120. fst.d $f21, $sp, 112
  121. #endif
  /* LDC becomes a byte stride from here on. */
  122. slli.d LDC, LDC, ZBASE_SHIFT
  123. #ifdef LN
  /* A += (M * K) << ZBASE_SHIFT, C += M << ZBASE_SHIFT.
     mul.d (not mul.w): the operands are 64-bit values and mul.w would keep
     only the sign-extended low 32 bits of the product. */
  124. mul.d TEMP, M, K
  125. slli.d TEMP, TEMP, ZBASE_SHIFT
  126. add.d A, A, TEMP
  127. slli.d TEMP, M, ZBASE_SHIFT
  128. add.d C, C, TEMP
  129. #endif
  130. #ifdef RN
  /* KK = -OFFSET */
  131. sub.d KK, $r0, OFFSET
  132. #endif
  133. #ifdef RT
  /* B += (N * K) << ZBASE_SHIFT, C += N * LDC (LDC is already in bytes),
     KK = N - OFFSET.  N * LDC is a byte offset that can exceed 32 bits for
     large leading dimensions, so the full 64-bit product is required. */
  134. mul.d TEMP, N, K
  135. slli.d TEMP, TEMP, ZBASE_SHIFT
  136. add.d B, B, TEMP
  137. mul.d TEMP, N, LDC
  138. add.d C, C, TEMP
  139. sub.d KK, N, OFFSET
  140. #endif
  /* J = N >> 2: number of four-column panels; skip to .L20 when there are none. */
  141. srai.d J, N, 2
  142. nop
  143. bge $r0, J, .L20
  /* .L10: start of one four-column panel.  Sets CO1..CO4 to four consecutive
     columns of C (stride LDC), clears accumulators, initialises I = M and KK,
     and selects the A base pointer for this panel. */
  144. .L10:
  145. #ifdef RT
  /* RT walks backwards: B -= K << (2 + ZBASE_SHIFT), C -= 4 * LDC. */
  146. slli.d TEMP, K, 2 + ZBASE_SHIFT
  147. sub.d B, B, TEMP
  148. slli.d TEMP, LDC, 2
  149. sub.d C, C, TEMP
  150. #endif
  151. move CO1, C
  /* c11 is set from the zero register and then copied into the other accumulators. */
  152. MTC c11, $r0
  153. add.d CO2, C, LDC
  154. add.d CO3, CO2, LDC
  155. addi.d J, J, -1
  156. add.d CO4, CO3, LDC
  157. MOV c21, c11
  158. MOV c31, c11
  159. MOV c41, c11
  160. MOV c51, c11
  161. move I, M
  162. #ifdef LN
  163. add.d KK, M, OFFSET
  164. #endif
  165. #ifdef LT
  166. move KK, OFFSET
  167. #endif
  168. #if defined(LN) || defined(RT)
  169. move AORIG, A
  170. #else
  171. move AO, A
  172. #endif
  173. #ifndef RT
  /* Advance C past the four columns just assigned to CO1..CO4. */
  174. add.d C, CO4, LDC
  175. #endif
  176. MOV c61, c11
  /* Nothing to do for this panel when I (= M) <= 0. */
  177. bge $r0, I, .L19
  178. .align 3
  /* .L11: per-tile setup for the four-column panel.  Preloads a1/a3 and b1..b7,
     copies the zero in c11 into the remaining accumulators, and computes
     L = (inner length) >> 2, where the inner length is KK for LT/RN and
     K - KK for LN/RT.  Branches to .L15 when L <= 0. */
  179. .L11:
  180. #if defined(LT) || defined(RN)
  181. LD a1, AO, 0 * SIZE
  182. MOV c71, c11
  183. LD b1, B, 0 * SIZE
  184. MOV c81, c11
  185. LD a3, AO, 4 * SIZE
  186. MOV c12, c11
  187. LD b2, B, 1 * SIZE
  188. MOV c22, c11
  189. srai.d L, KK, 2
  190. MOV c32, c11
  191. LD b3, B, 2 * SIZE
  192. MOV c42, c11
  193. LD b4, B, 3 * SIZE
  194. MOV c52, c11
  195. LD b5, B, 4 * SIZE
  196. MOV c62, c11
  197. LD b6, B, 8 * SIZE
  198. MOV c72, c11
  199. LD b7, B, 12 * SIZE
  200. MOV c82, c11
  201. move BO, B
  202. bge $r0, L, .L15
  203. #else
  204. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT before using it. */
  205. slli.d TEMP, K, ZBASE_SHIFT
  206. sub.d AORIG, AORIG, TEMP
  207. #endif
  /* AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << (2 + ZBASE_SHIFT)),
     TEMP = K - KK (also read later at .L15). */
  208. slli.d L, KK, ZBASE_SHIFT
  209. slli.d TEMP, KK, 2 + ZBASE_SHIFT
  210. add.d AO, AORIG, L
  211. add.d BO, B, TEMP
  212. sub.d TEMP, K, KK
  213. LD a1, AO, 0 * SIZE
  214. MOV c71, c11
  215. LD b1, BO, 0 * SIZE
  216. MOV c81, c11
  217. LD a3, AO, 4 * SIZE
  218. MOV c12, c11
  219. LD b2, BO, 1 * SIZE
  220. MOV c22, c11
  221. srai.d L, TEMP, 2
  222. MOV c32, c11
  223. LD b3, BO, 2 * SIZE
  224. MOV c42, c11
  225. LD b4, BO, 3 * SIZE
  226. MOV c52, c11
  227. LD b5, BO, 4 * SIZE
  228. MOV c62, c11
  229. LD b6, BO, 8 * SIZE
  230. MOV c72, c11
  231. LD b7, BO, 12 * SIZE
  232. MOV c82, c11
  233. bge $r0, L, .L15
  234. #endif
  /* Issue the first four multiply-accumulates of the first pass here; .L12
     and .L13 both start from the fifth.  When only one pass is needed
     (L - 1 <= 0) go straight to the final pass at .L13. */
  235. MADD1 c11, b1, a1, c11
  236. LD a2, AO, 1 * SIZE
  237. MADD3 c21, b2, a1, c21
  238. addi.d L, L, -1
  239. MADD1 c31, b3, a1, c31
  240. MADD3 c41, b4, a1, c41
  241. bge $r0, L, .L13
  242. .align 3
  /* .L12: unrolled accumulation loop for the four-column panel.  Each pass
     consumes A values at AO[1..7] plus the preloaded a1/a3 and B values at
     BO[5..31] plus the preloaded b1/b5/b6/b7, then advances AO by 8 * SIZE and
     BO by 32 * SIZE.  The last four MADDs of the pass belong to the next pass
     (they mirror the four issued before the loop is entered).  Loops while
     L > 0 after the decrement. */
  243. .L12:
  244. MADD2 c12, b1, a2, c12
  245. LD b1, BO, 16 * SIZE
  246. MADD4 c22, b2, a2, c22
  247. LD b2, BO, 5 * SIZE
  248. MADD2 c32, b3, a2, c32
  249. LD b3, BO, 6 * SIZE
  250. MADD4 c42, b4, a2, c42
  251. LD b4, BO, 7 * SIZE
  252. MADD1 c51, b5, a1, c51
  253. MADD3 c61, b2, a1, c61
  254. LD a4, AO, 2 * SIZE
  255. MADD1 c71, b3, a1, c71
  256. MADD3 c81, b4, a1, c81
  257. LD a1, AO, 8 * SIZE
  258. MADD2 c52, b5, a2, c52
  259. LD b5, BO, 20 * SIZE
  260. MADD4 c62, b2, a2, c62
  261. LD b2, BO, 9 * SIZE
  262. MADD2 c72, b3, a2, c72
  263. LD b3, BO, 10 * SIZE
  264. MADD4 c82, b4, a2, c82
  265. LD b4, BO, 11 * SIZE
  266. MADD1 c11, b6, a4, c11
  267. LD a2, AO, 3 * SIZE
  268. MADD3 c21, b2, a4, c21
  269. MADD1 c31, b3, a4, c31
  270. MADD3 c41, b4, a4, c41
  271. MADD2 c12, b6, a2, c12
  272. LD b6, BO, 24 * SIZE
  273. MADD4 c22, b2, a2, c22
  274. LD b2, BO, 13 * SIZE
  275. MADD2 c32, b3, a2, c32
  276. LD b3, BO, 14 * SIZE
  277. MADD4 c42, b4, a2, c42
  278. LD b4, BO, 15 * SIZE
  279. MADD1 c51, b7, a4, c51
  280. MADD3 c61, b2, a4, c61
  281. MADD1 c71, b3, a4, c71
  282. MADD3 c81, b4, a4, c81
  283. MADD2 c52, b7, a2, c52
  284. LD b7, BO, 28 * SIZE
  285. MADD4 c62, b2, a2, c62
  286. LD b2, BO, 17 * SIZE
  287. MADD2 c72, b3, a2, c72
  288. LD b3, BO, 18 * SIZE
  289. MADD4 c82, b4, a2, c82
  290. LD b4, BO, 19 * SIZE
  291. MADD1 c11, b1, a3, c11
  292. LD a2, AO, 5 * SIZE
  293. MADD3 c21, b2, a3, c21
  294. MADD1 c31, b3, a3, c31
  295. MADD3 c41, b4, a3, c41
  296. MADD2 c12, b1, a2, c12
  297. LD b1, BO, 32 * SIZE
  298. MADD4 c22, b2, a2, c22
  299. LD b2, BO, 21 * SIZE
  300. MADD2 c32, b3, a2, c32
  301. LD b3, BO, 22 * SIZE
  302. MADD4 c42, b4, a2, c42
  303. LD b4, BO, 23 * SIZE
  304. MADD1 c51, b5, a3, c51
  305. MADD3 c61, b2, a3, c61
  306. LD a4, AO, 6 * SIZE
  307. MADD1 c71, b3, a3, c71
  308. MADD3 c81, b4, a3, c81
  309. LD a3, AO, 12 * SIZE
  310. MADD2 c52, b5, a2, c52
  311. LD b5, BO, 36 * SIZE
  312. MADD4 c62, b2, a2, c62
  313. LD b2, BO, 25 * SIZE
  314. MADD2 c72, b3, a2, c72
  315. LD b3, BO, 26 * SIZE
  316. MADD4 c82, b4, a2, c82
  317. LD b4, BO, 27 * SIZE
  318. MADD1 c11, b6, a4, c11
  319. LD a2, AO, 7 * SIZE
  320. MADD3 c21, b2, a4, c21
  321. MADD1 c31, b3, a4, c31
  322. MADD3 c41, b4, a4, c41
  323. addi.d L, L, -1
  324. MADD2 c12, b6, a2, c12
  325. LD b6, BO, 40 * SIZE
  326. MADD4 c22, b2, a2, c22
  327. LD b2, BO, 29 * SIZE
  328. MADD2 c32, b3, a2, c32
  329. LD b3, BO, 30 * SIZE
  330. MADD4 c42, b4, a2, c42
  331. LD b4, BO, 31 * SIZE
  332. MADD1 c51, b7, a4, c51
  /* Pointer bumps: loads below this point use offsets relative to the new AO/BO. */
  333. addi.d BO, BO, 32 * SIZE
  334. MADD3 c61, b2, a4, c61
  335. addi.d AO, AO, 8 * SIZE
  336. MADD1 c71, b3, a4, c71
  337. MADD3 c81, b4, a4, c81
  338. MADD2 c52, b7, a2, c52
  339. LD b7, BO, 12 * SIZE
  340. MADD4 c62, b2, a2, c62
  341. LD b2, BO, 1 * SIZE
  342. MADD2 c72, b3, a2, c72
  343. LD b3, BO, 2 * SIZE
  344. MADD4 c82, b4, a2, c82
  345. LD b4, BO, 3 * SIZE
  /* First four multiply-accumulates of the next pass. */
  346. MADD1 c11, b1, a1, c11
  347. LD a2, AO, 1 * SIZE
  348. MADD3 c21, b2, a1, c21
  349. MADD1 c31, b3, a1, c31
  350. MADD3 c41, b4, a1, c41
  351. blt $r0, L, .L12
  352. .align 3
  /* .L13: final unrolled pass for the four-column panel.  Same instruction
     sequence as the .L12 loop body, but without the counter decrement, the
     look-ahead multiply-accumulates for a following pass, or a back-branch.
     Leaves a1, a3, b1..b7 reloaded relative to the advanced AO/BO and falls
     through to the remainder handling at .L15. */
  353. .L13:
  354. MADD2 c12, b1, a2, c12
  355. LD b1, BO, 16 * SIZE
  356. MADD4 c22, b2, a2, c22
  357. LD b2, BO, 5 * SIZE
  358. MADD2 c32, b3, a2, c32
  359. LD b3, BO, 6 * SIZE
  360. MADD4 c42, b4, a2, c42
  361. LD b4, BO, 7 * SIZE
  362. MADD1 c51, b5, a1, c51
  363. MADD3 c61, b2, a1, c61
  364. LD a4, AO, 2 * SIZE
  365. MADD1 c71, b3, a1, c71
  366. MADD3 c81, b4, a1, c81
  367. LD a1, AO, 8 * SIZE
  368. MADD2 c52, b5, a2, c52
  369. LD b5, BO, 20 * SIZE
  370. MADD4 c62, b2, a2, c62
  371. LD b2, BO, 9 * SIZE
  372. MADD2 c72, b3, a2, c72
  373. LD b3, BO, 10 * SIZE
  374. MADD4 c82, b4, a2, c82
  375. LD b4, BO, 11 * SIZE
  376. MADD1 c11, b6, a4, c11
  377. LD a2, AO, 3 * SIZE
  378. MADD3 c21, b2, a4, c21
  379. MADD1 c31, b3, a4, c31
  380. MADD3 c41, b4, a4, c41
  381. MADD2 c12, b6, a2, c12
  382. LD b6, BO, 24 * SIZE
  383. MADD4 c22, b2, a2, c22
  384. LD b2, BO, 13 * SIZE
  385. MADD2 c32, b3, a2, c32
  386. LD b3, BO, 14 * SIZE
  387. MADD4 c42, b4, a2, c42
  388. LD b4, BO, 15 * SIZE
  389. MADD1 c51, b7, a4, c51
  390. MADD3 c61, b2, a4, c61
  391. MADD1 c71, b3, a4, c71
  392. MADD3 c81, b4, a4, c81
  393. MADD2 c52, b7, a2, c52
  394. LD b7, BO, 28 * SIZE
  395. MADD4 c62, b2, a2, c62
  396. LD b2, BO, 17 * SIZE
  397. MADD2 c72, b3, a2, c72
  398. LD b3, BO, 18 * SIZE
  399. MADD4 c82, b4, a2, c82
  400. LD b4, BO, 19 * SIZE
  401. MADD1 c11, b1, a3, c11
  402. LD a2, AO, 5 * SIZE
  403. MADD3 c21, b2, a3, c21
  404. MADD1 c31, b3, a3, c31
  405. MADD3 c41, b4, a3, c41
  406. MADD2 c12, b1, a2, c12
  407. LD b1, BO, 32 * SIZE
  408. MADD4 c22, b2, a2, c22
  409. LD b2, BO, 21 * SIZE
  410. MADD2 c32, b3, a2, c32
  411. LD b3, BO, 22 * SIZE
  412. MADD4 c42, b4, a2, c42
  413. LD b4, BO, 23 * SIZE
  414. MADD1 c51, b5, a3, c51
  415. MADD3 c61, b2, a3, c61
  416. LD a4, AO, 6 * SIZE
  417. MADD1 c71, b3, a3, c71
  418. MADD3 c81, b4, a3, c81
  419. LD a3, AO, 12 * SIZE
  420. MADD2 c52, b5, a2, c52
  421. LD b5, BO, 36 * SIZE
  422. MADD4 c62, b2, a2, c62
  423. LD b2, BO, 25 * SIZE
  424. MADD2 c72, b3, a2, c72
  425. LD b3, BO, 26 * SIZE
  426. MADD4 c82, b4, a2, c82
  427. LD b4, BO, 27 * SIZE
  428. MADD1 c11, b6, a4, c11
  429. LD a2, AO, 7 * SIZE
  430. MADD3 c21, b2, a4, c21
  431. MADD1 c31, b3, a4, c31
  432. MADD3 c41, b4, a4, c41
  433. MADD2 c12, b6, a2, c12
  434. LD b6, BO, 40 * SIZE
  435. MADD4 c22, b2, a2, c22
  436. LD b2, BO, 29 * SIZE
  437. MADD2 c32, b3, a2, c32
  438. LD b3, BO, 30 * SIZE
  439. MADD4 c42, b4, a2, c42
  440. LD b4, BO, 31 * SIZE
  441. MADD1 c51, b7, a4, c51
  442. addi.d BO, BO, 32 * SIZE
  443. MADD3 c61, b2, a4, c61
  444. addi.d AO, AO, 8 * SIZE
  445. MADD1 c71, b3, a4, c71
  446. MADD3 c81, b4, a4, c81
  447. MADD2 c52, b7, a2, c52
  448. LD b7, BO, 12 * SIZE
  449. MADD4 c62, b2, a2, c62
  450. LD b2, BO, 1 * SIZE
  451. MADD2 c72, b3, a2, c72
  452. LD b3, BO, 2 * SIZE
  453. MADD4 c82, b4, a2, c82
  454. LD b4, BO, 3 * SIZE
  455. .align 3
  /* .L15 / .L16: remainder loop for the four-column panel.  L = (inner length) & 3,
     taken from KK for LT/RN and from TEMP (set to K - KK during tile setup) for
     LN/RT.  Each .L16 iteration performs all sixteen multiply-accumulates for one
     step and advances AO by 2 * SIZE and BO by 8 * SIZE. */
  456. .L15:
  457. #if defined(LT) || defined(RN)
  458. andi L, KK, 3
  459. #else
  460. andi L, TEMP, 3
  461. #endif
  462. bge $r0, L, .L18
  463. .align 3
  464. .L16:
  465. MADD1 c11, b1, a1, c11
  466. LD a2, AO, 1 * SIZE
  467. MADD3 c21, b2, a1, c21
  468. MADD1 c31, b3, a1, c31
  469. MADD3 c41, b4, a1, c41
  470. MADD2 c12, b1, a2, c12
  471. LD b1, BO, 8 * SIZE
  472. MADD4 c22, b2, a2, c22
  473. LD b2, BO, 5 * SIZE
  474. MADD2 c32, b3, a2, c32
  475. LD b3, BO, 6 * SIZE
  476. MADD4 c42, b4, a2, c42
  477. LD b4, BO, 7 * SIZE
  478. MADD1 c51, b5, a1, c51
  479. addi.d L, L, -1
  480. MADD3 c61, b2, a1, c61
  481. addi.d AO, AO, 2 * SIZE
  482. MADD1 c71, b3, a1, c71
  483. addi.d BO, BO, 8 * SIZE
  484. MADD3 c81, b4, a1, c81
  /* Reloads for the next iteration use offsets relative to the advanced AO/BO. */
  485. LD a1, AO, 0 * SIZE
  486. MADD2 c52, b5, a2, c52
  487. LD b5, BO, 4 * SIZE
  488. MADD4 c62, b2, a2, c62
  489. LD b2, BO, 1 * SIZE
  490. MADD2 c72, b3, a2, c72
  491. LD b3, BO, 2 * SIZE
  492. MADD4 c82, b4, a2, c82
  493. LD b4, BO, 3 * SIZE
  494. blt $r0, L, .L16
  /* .L18: fold the sixteen partial accumulators into eight values
     (c11/c12, c31/c32, c51/c52, c71/c72), then, for LN/RT, point AO and BO at
     the entries used by the solve step. */
  495. .L18:
  496. ADD c11, c11, c22
  497. ADD c12, c12, c21
  498. ADD c31, c31, c42
  499. ADD c32, c32, c41
  500. ADD c51, c51, c62
  501. ADD c52, c52, c61
  502. ADD c71, c71, c82
  503. ADD c72, c72, c81
  504. #if defined(LN) || defined(RT)
  /* TEMP = KK - 1 (LN) or KK - 4 (RT);
     AO = AORIG + (TEMP << ZBASE_SHIFT), BO = B + (TEMP << (2 + ZBASE_SHIFT)). */
  505. #ifdef LN
  506. addi.d TEMP, KK, -1
  507. #else
  508. addi.d TEMP, KK, -4
  509. #endif
  510. slli.d L, TEMP, ZBASE_SHIFT
  511. slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
  512. add.d AO, AORIG, L
  513. add.d BO, B, TEMP
  514. #endif
  /* Load the eight currently stored values (from BO for LN/LT, from AO for
     RN/RT) and replace each accumulator with (stored value - accumulator). */
  515. #if defined(LN) || defined(LT)
  516. LD b1, BO, 0 * SIZE
  517. LD b2, BO, 1 * SIZE
  518. LD b3, BO, 2 * SIZE
  519. LD b4, BO, 3 * SIZE
  520. LD b5, BO, 4 * SIZE
  521. LD b6, BO, 5 * SIZE
  522. LD b7, BO, 6 * SIZE
  523. LD b8, BO, 7 * SIZE
  524. SUB c11, b1, c11
  525. SUB c12, b2, c12
  526. SUB c31, b3, c31
  527. SUB c32, b4, c32
  528. SUB c51, b5, c51
  529. SUB c52, b6, c52
  530. SUB c71, b7, c71
  531. SUB c72, b8, c72
  532. #else
  533. LD b1, AO, 0 * SIZE
  534. LD b2, AO, 1 * SIZE
  535. LD b3, AO, 2 * SIZE
  536. LD b4, AO, 3 * SIZE
  537. LD b5, AO, 4 * SIZE
  538. LD b6, AO, 5 * SIZE
  539. LD b7, AO, 6 * SIZE
  540. LD b8, AO, 7 * SIZE
  541. SUB c11, b1, c11
  542. SUB c12, b2, c12
  543. SUB c31, b3, c31
  544. SUB c32, b4, c32
  545. SUB c51, b5, c51
  546. SUB c52, b6, c52
  547. SUB c71, b7, c71
  548. SUB c72, b8, c72
  549. #endif
  /* LN/LT solve step: combine every (cX1, cX2) pair with the two values at
     AO[0] and AO[1] using MUL followed by MADD5/MADD6.
     NOTE(review): this looks like a complex multiply by a pre-inverted
     diagonal entry of A - confirm against the packing routine. */
  550. #if defined(LN) || defined(LT)
  551. LD b1, AO, 0 * SIZE
  552. LD b2, AO, 1 * SIZE
  553. MUL a1, b2, c12
  554. MUL a2, b2, c11
  555. MUL a3, b2, c32
  556. MUL a4, b2, c31
  557. MADD5 c11, c11, b1, a1
  558. MADD6 c12, c12, b1, a2
  559. MADD5 c31, c31, b1, a3
  560. MADD6 c32, c32, b1, a4
  561. MUL a1, b2, c52
  562. MUL a2, b2, c51
  563. MUL a3, b2, c72
  564. MUL a4, b2, c71
  565. MADD5 c51, c51, b1, a1
  566. MADD6 c52, c52, b1, a2
  567. MADD5 c71, c71, b1, a3
  568. MADD6 c72, c72, b1, a4
  569. #endif
  /* RN solve step: processes the four result pairs in the order
     c1x, c3x, c5x, c7x.  Each stage scales the current pair by the two values
     at BO[0..1], BO[10..11], BO[20..21] or BO[30..31], then updates the
     remaining pairs using the following entries of the same 8-element row
     (NMSUB / MADD7 / MADD8). */
  570. #ifdef RN
  571. LD b1, BO, 0 * SIZE
  572. LD b2, BO, 1 * SIZE
  573. LD b3, BO, 2 * SIZE
  574. LD b4, BO, 3 * SIZE
  575. LD b5, BO, 4 * SIZE
  576. LD b6, BO, 5 * SIZE
  577. LD b7, BO, 6 * SIZE
  578. LD b8, BO, 7 * SIZE
  579. MUL a1, b2, c12
  580. MUL a2, b2, c11
  581. MADD5 c11, c11, b1, a1
  582. MADD6 c12, c12, b1, a2
  583. NMSUB c31, c11, b3, c31
  584. MADD7 c32, c11, b4, c32
  585. NMSUB c51, c11, b5, c51
  586. MADD7 c52, c11, b6, c52
  587. NMSUB c71, c11, b7, c71
  588. MADD7 c72, c11, b8, c72
  589. MADD8 c31, c12, b4, c31
  590. NMSUB c32, c12, b3, c32
  591. MADD8 c51, c12, b6, c51
  592. NMSUB c52, c12, b5, c52
  593. MADD8 c71, c12, b8, c71
  594. NMSUB c72, c12, b7, c72
  /* Second stage: entries 10..15. */
  595. LD b3, BO, 10 * SIZE
  596. LD b4, BO, 11 * SIZE
  597. LD b5, BO, 12 * SIZE
  598. LD b6, BO, 13 * SIZE
  599. LD b7, BO, 14 * SIZE
  600. LD b8, BO, 15 * SIZE
  601. MUL a1, b4, c32
  602. MUL a2, b4, c31
  603. MADD5 c31, c31, b3, a1
  604. MADD6 c32, c32, b3, a2
  605. NMSUB c51, c31, b5, c51
  606. MADD7 c52, c31, b6, c52
  607. NMSUB c71, c31, b7, c71
  608. MADD7 c72, c31, b8, c72
  609. MADD8 c51, c32, b6, c51
  610. NMSUB c52, c32, b5, c52
  611. MADD8 c71, c32, b8, c71
  612. NMSUB c72, c32, b7, c72
  /* Third stage: entries 20..23. */
  613. LD b5, BO, 20 * SIZE
  614. LD b6, BO, 21 * SIZE
  615. LD b7, BO, 22 * SIZE
  616. LD b8, BO, 23 * SIZE
  617. MUL a1, b6, c52
  618. MUL a2, b6, c51
  619. MADD5 c51, c51, b5, a1
  620. MADD6 c52, c52, b5, a2
  621. NMSUB c71, c51, b7, c71
  622. MADD7 c72, c51, b8, c72
  623. MADD8 c71, c52, b8, c71
  624. NMSUB c72, c52, b7, c72
  /* Fourth stage: entries 30..31. */
  625. LD b7, BO, 30 * SIZE
  626. LD b8, BO, 31 * SIZE
  627. MUL a1, b8, c72
  628. MUL a2, b8, c71
  629. MADD5 c71, c71, b7, a1
  630. MADD6 c72, c72, b7, a2
  631. #endif
  /* RT solve step: processes the four result pairs in the reverse order
     c7x, c5x, c3x, c1x.  Each stage scales the current pair by the two values
     at BO[30..31], BO[20..21], BO[10..11] or BO[0..1], then updates the
     remaining pairs using the preceding entries of the same 8-element row
     (NMSUB / MADD7 / MADD8). */
  632. #ifdef RT
  633. LD b1, BO, 30 * SIZE
  634. LD b2, BO, 31 * SIZE
  635. LD b3, BO, 28 * SIZE
  636. LD b4, BO, 29 * SIZE
  637. LD b5, BO, 26 * SIZE
  638. LD b6, BO, 27 * SIZE
  639. LD b7, BO, 24 * SIZE
  640. LD b8, BO, 25 * SIZE
  641. MUL a1, b2, c72
  642. MUL a2, b2, c71
  643. MADD5 c71, c71, b1, a1
  644. MADD6 c72, c72, b1, a2
  645. NMSUB c51, c71, b3, c51
  646. MADD7 c52, c71, b4, c52
  647. NMSUB c31, c71, b5, c31
  648. MADD7 c32, c71, b6, c32
  649. NMSUB c11, c71, b7, c11
  650. MADD7 c12, c71, b8, c12
  651. MADD8 c51, c72, b4, c51
  652. NMSUB c52, c72, b3, c52
  653. MADD8 c31, c72, b6, c31
  654. NMSUB c32, c72, b5, c32
  655. MADD8 c11, c72, b8, c11
  656. NMSUB c12, c72, b7, c12
  /* Second stage: entries 16..21. */
  657. LD b3, BO, 20 * SIZE
  658. LD b4, BO, 21 * SIZE
  659. LD b5, BO, 18 * SIZE
  660. LD b6, BO, 19 * SIZE
  661. LD b7, BO, 16 * SIZE
  662. LD b8, BO, 17 * SIZE
  663. MUL a1, b4, c52
  664. MUL a2, b4, c51
  665. MADD5 c51, c51, b3, a1
  666. MADD6 c52, c52, b3, a2
  667. NMSUB c31, c51, b5, c31
  668. MADD7 c32, c51, b6, c32
  669. NMSUB c11, c51, b7, c11
  670. MADD7 c12, c51, b8, c12
  671. MADD8 c31, c52, b6, c31
  672. NMSUB c32, c52, b5, c32
  673. MADD8 c11, c52, b8, c11
  674. NMSUB c12, c52, b7, c12
  /* Third stage: entries 8..11. */
  675. LD b5, BO, 10 * SIZE
  676. LD b6, BO, 11 * SIZE
  677. LD b7, BO, 8 * SIZE
  678. LD b8, BO, 9 * SIZE
  679. MUL a1, b6, c32
  680. MUL a2, b6, c31
  681. MADD5 c31, c31, b5, a1
  682. MADD6 c32, c32, b5, a2
  683. NMSUB c11, c31, b7, c11
  684. MADD7 c12, c31, b8, c12
  685. MADD8 c11, c32, b8, c11
  686. NMSUB c12, c32, b7, c12
  /* Fourth stage: entries 0..1. */
  687. LD b7, BO, 0 * SIZE
  688. LD b8, BO, 1 * SIZE
  689. MUL a1, b8, c12
  690. MUL a2, b8, c11
  691. MADD5 c11, c11, b7, a1
  692. MADD6 c12, c12, b7, a2
  693. #endif
  /* Write the eight solved values back to the packed buffer (BO for LN/LT,
     AO for RN/RT) and to the four output columns CO1..CO4 (two values each).
     LN moves the column pointers back by 2 * SIZE before storing; all other
     variants move them forward by 2 * SIZE after storing. */
  694. #if defined(LN) || defined(LT)
  695. ST c11, BO, 0 * SIZE
  696. ST c12, BO, 1 * SIZE
  697. ST c31, BO, 2 * SIZE
  698. ST c32, BO, 3 * SIZE
  699. ST c51, BO, 4 * SIZE
  700. ST c52, BO, 5 * SIZE
  701. ST c71, BO, 6 * SIZE
  702. ST c72, BO, 7 * SIZE
  703. #else
  704. ST c11, AO, 0 * SIZE
  705. ST c12, AO, 1 * SIZE
  706. ST c31, AO, 2 * SIZE
  707. ST c32, AO, 3 * SIZE
  708. ST c51, AO, 4 * SIZE
  709. ST c52, AO, 5 * SIZE
  710. ST c71, AO, 6 * SIZE
  711. ST c72, AO, 7 * SIZE
  712. #endif
  713. #ifdef LN
  714. addi.d CO1,CO1, -2 * SIZE
  715. addi.d CO2,CO2, -2 * SIZE
  716. addi.d CO3,CO3, -2 * SIZE
  717. addi.d CO4,CO4, -2 * SIZE
  718. #endif
  719. ST c11, CO1, 0 * SIZE
  720. ST c12, CO1, 1 * SIZE
  721. ST c31, CO2, 0 * SIZE
  722. ST c32, CO2, 1 * SIZE
  723. ST c51, CO3, 0 * SIZE
  724. ST c52, CO3, 1 * SIZE
  725. ST c71, CO4, 0 * SIZE
  726. ST c72, CO4, 1 * SIZE
  727. #ifndef LN
  728. addi.d CO1,CO1, 2 * SIZE
  729. addi.d CO2,CO2, 2 * SIZE
  730. addi.d CO3,CO3, 2 * SIZE
  731. addi.d CO4,CO4, 2 * SIZE
  732. #endif
  /* Advance to the next tile of the four-column panel: adjust AORIG (RT) or
     AO/BO (LT/RN), step KK for LT/LN, clear the accumulators that are not
     re-initialised at .L11, and loop while I > 0 after the decrement. */
  733. #ifdef RT
  734. slli.d TEMP, K, ZBASE_SHIFT
  735. add.d AORIG, AORIG, TEMP
  736. #endif
  737. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps:
     AO += (K - KK) << ZBASE_SHIFT, BO += (K - KK) << (2 + ZBASE_SHIFT). */
  738. sub.d TEMP, K, KK
  739. slli.d L, TEMP, ZBASE_SHIFT
  740. slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
  741. add.d AO, AO, L
  742. add.d BO, BO, TEMP
  743. #endif
  744. #ifdef LT
  745. addi.d KK, KK, 1
  746. #endif
  747. #ifdef LN
  748. addi.d KK, KK, -1
  749. #endif
  750. MTC c11, $r0
  751. addi.d I, I, -1
  752. MOV c21, c11
  753. MOV c31, c11
  754. MOV c41, c11
  755. MOV c51, c11
  756. MOV c61, c11
  757. blt $r0, I, .L11
  758. .align 3
  /* .L19: end of one four-column panel.  Moves B to the next panel
     (LN: B += K << (2 + ZBASE_SHIFT); LT/RN: B = BO), steps KK by +4 (RN) or
     -4 (RT), and repeats from .L10 while J > 0. */
  759. .L19:
  760. #ifdef LN
  761. slli.d TEMP, K, 2 + ZBASE_SHIFT
  762. add.d B, B, TEMP
  763. #endif
  764. #if defined(LT) || defined(RN)
  765. move B, BO
  766. #endif
  767. #ifdef RN
  768. addi.d KK, KK, 4
  769. #endif
  770. #ifdef RT
  771. addi.d KK, KK, -4
  772. #endif
  773. blt $r0, J, .L10
  774. .align 3
  /* .L20: two-column panel, executed only when bit 1 of N is set (N & 2).
     Sets CO1/CO2, KK and the A base pointer, advances C past the two columns
     (unless RT), and skips to .L29 when I (= M) <= 0. */
  775. .L20:
  776. andi J, N, 2
  777. bge $r0, J, .L30
  778. #ifdef RT
  /* RT walks backwards: B -= K << (1 + ZBASE_SHIFT), C -= 2 * LDC. */
  779. slli.d TEMP, K, 1 + ZBASE_SHIFT
  780. sub.d B, B, TEMP
  781. slli.d TEMP, LDC, 1
  782. sub.d C, C, TEMP
  783. #endif
  784. MTC c11, $r0
  785. move CO1, C
  786. add.d CO2, C, LDC
  787. #ifdef LN
  788. add.d KK, M, OFFSET
  789. #endif
  790. #ifdef LT
  791. move KK, OFFSET
  792. #endif
  793. #if defined(LN) || defined(RT)
  794. move AORIG, A
  795. #else
  796. move AO, A
  797. #endif
  798. #ifndef RT
  799. add.d C, CO2, LDC
  800. #endif
  801. move I, M
  802. bge $r0, I, .L29
  803. .align 3
  /* .L21: per-tile setup for the two-column panel.  Preloads a1/a3 and b1..b5,
     copies the zero in c11 into c12..c42, and computes L = (inner length) >> 2
     (KK for LT/RN, K - KK for LN/RT).  Branches to .L25 when L <= 0. */
  804. .L21:
  805. #if defined(LT) || defined(RN)
  806. LD a1, AO, 0 * SIZE
  807. MOV c21, c11
  808. LD b1, B, 0 * SIZE
  809. MOV c31, c11
  810. LD a3, AO, 4 * SIZE
  811. MOV c41, c11
  812. LD b2, B, 1 * SIZE
  813. srai.d L, KK, 2
  814. LD b3, B, 2 * SIZE
  815. MOV c12, c11
  816. LD b4, B, 3 * SIZE
  817. MOV c22, c11
  818. LD b5, B, 4 * SIZE
  819. MOV c32, c11
  820. MOV c42, c11
  821. move BO, B
  822. bge $r0, L, .L25
  823. #else
  824. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT before using it. */
  825. slli.d TEMP, K, ZBASE_SHIFT
  826. sub.d AORIG, AORIG, TEMP
  827. #endif
  /* AO = AORIG + (KK << ZBASE_SHIFT), BO = B + (KK << (1 + ZBASE_SHIFT)),
     TEMP = K - KK (also read later at .L25). */
  828. slli.d L, KK, ZBASE_SHIFT
  829. slli.d TEMP, KK, 1 + ZBASE_SHIFT
  830. add.d AO, AORIG, L
  831. add.d BO, B, TEMP
  832. sub.d TEMP, K, KK
  833. LD a1, AO, 0 * SIZE
  834. MOV c21, c11
  835. LD b1, BO, 0 * SIZE
  836. MOV c31, c11
  837. LD a3, AO, 4 * SIZE
  838. MOV c41, c11
  839. LD b2, BO, 1 * SIZE
  840. srai.d L, TEMP, 2
  841. LD b3, BO, 2 * SIZE
  842. MOV c12, c11
  843. LD b4, BO, 3 * SIZE
  844. MOV c22, c11
  845. LD b5, BO, 4 * SIZE
  846. MOV c32, c11
  847. MOV c42, c11
  848. bge $r0, L, .L25
  849. #endif
  850. .align 3
  /* .L22: unrolled accumulation loop for the two-column panel.  Each pass
     advances AO by 8 * SIZE and BO by 16 * SIZE and reloads a1, a3, b1..b5
     for the next pass.  Loops while L > 0 after the decrement. */
  851. .L22:
  852. MADD1 c11, b1, a1, c11
  853. LD a2, AO, 1 * SIZE
  854. MADD3 c21, b2, a1, c21
  855. addi.d L, L, -1
  856. MADD1 c31, b3, a1, c31
  857. MADD3 c41, b4, a1, c41
  858. LD a1, AO, 2 * SIZE
  859. MADD2 c12, b1, a2, c12
  860. LD b1, BO, 8 * SIZE
  861. MADD4 c22, b2, a2, c22
  862. LD b2, BO, 5 * SIZE
  863. MADD2 c32, b3, a2, c32
  864. LD b3, BO, 6 * SIZE
  865. MADD4 c42, b4, a2, c42
  866. LD b4, BO, 7 * SIZE
  867. MADD1 c11, b5, a1, c11
  868. LD a2, AO, 3 * SIZE
  869. MADD3 c21, b2, a1, c21
  870. MADD1 c31, b3, a1, c31
  871. MADD3 c41, b4, a1, c41
  872. LD a1, AO, 8 * SIZE
  873. MADD2 c12, b5, a2, c12
  874. LD b5, BO, 12 * SIZE
  875. MADD4 c22, b2, a2, c22
  876. LD b2, BO, 9 * SIZE
  877. MADD2 c32, b3, a2, c32
  878. LD b3, BO, 10 * SIZE
  879. MADD4 c42, b4, a2, c42
  880. LD b4, BO, 11 * SIZE
  881. MADD1 c11, b1, a3, c11
  882. LD a2, AO, 5 * SIZE
  883. MADD3 c21, b2, a3, c21
  884. MADD1 c31, b3, a3, c31
  885. MADD3 c41, b4, a3, c41
  886. LD a3, AO, 6 * SIZE
  887. MADD2 c12, b1, a2, c12
  888. LD b1, BO, 16 * SIZE
  889. MADD4 c22, b2, a2, c22
  890. LD b2, BO, 13 * SIZE
  891. MADD2 c32, b3, a2, c32
  892. LD b3, BO, 14 * SIZE
  893. MADD4 c42, b4, a2, c42
  894. LD b4, BO, 15 * SIZE
  895. MADD1 c11, b5, a3, c11
  896. LD a2, AO, 7 * SIZE
  897. MADD3 c21, b2, a3, c21
  /* AO is bumped here; the a3 reload below is relative to the new AO. */
  898. addi.d AO, AO, 8 * SIZE
  899. MADD1 c31, b3, a3, c31
  900. MADD3 c41, b4, a3, c41
  901. LD a3, AO, 4 * SIZE
  902. MADD2 c12, b5, a2, c12
  903. LD b5, BO, 20 * SIZE
  904. MADD4 c22, b2, a2, c22
  905. LD b2, BO, 17 * SIZE
  906. MADD2 c32, b3, a2, c32
  907. LD b3, BO, 18 * SIZE
  908. MADD4 c42, b4, a2, c42
  909. LD b4, BO, 19 * SIZE
  910. addi.d BO, BO, 16 * SIZE
  911. blt $r0, L, .L22
  912. .align 3
  /* .L25 / .L26: remainder loop for the two-column panel.  L = (inner length) & 3,
     taken from KK for LT/RN and from TEMP (set to K - KK during tile setup) for
     LN/RT.  Each .L26 iteration advances AO by 2 * SIZE and BO by 4 * SIZE. */
  913. .L25:
  914. #if defined(LT) || defined(RN)
  915. andi L, KK, 3
  916. #else
  917. andi L, TEMP, 3
  918. #endif
  919. bge $r0, L, .L28
  920. .align 3
  921. .L26:
  922. MADD1 c11, b1, a1, c11
  923. LD a2, AO, 1 * SIZE
  924. MADD3 c21, b2, a1, c21
  925. addi.d L, L, -1
  926. MADD1 c31, b3, a1, c31
  /* BO is bumped early; the b1..b4 reloads below are relative to the new BO,
     while the a1 reload is still relative to the old AO. */
  927. addi.d BO, BO, 4 * SIZE
  928. MADD3 c41, b4, a1, c41
  929. LD a1, AO, 2 * SIZE
  930. MADD2 c12, b1, a2, c12
  931. LD b1, BO, 0 * SIZE
  932. MADD4 c22, b2, a2, c22
  933. LD b2, BO, 1 * SIZE
  934. MADD2 c32, b3, a2, c32
  935. LD b3, BO, 2 * SIZE
  936. MADD4 c42, b4, a2, c42
  937. LD b4, BO, 3 * SIZE
  938. addi.d AO, AO, 2 * SIZE
  939. blt $r0, L, .L26
  /* .L28: finish one tile of the two-column panel.  Folds the eight partial
     accumulators into c11/c12 and c31/c32, repositions AO/BO for LN/RT,
     subtracts the accumulators from the stored values, and runs the solve
     step for the selected variant. */
  940. .L28:
  941. ADD c11, c11, c22
  942. ADD c12, c12, c21
  943. ADD c31, c31, c42
  944. ADD c32, c32, c41
  945. #if defined(LN) || defined(RT)
  /* TEMP = KK - 1 (LN) or KK - 2 (RT);
     AO = AORIG + (TEMP << ZBASE_SHIFT), BO = B + (TEMP << (1 + ZBASE_SHIFT)). */
  946. #ifdef LN
  947. addi.d TEMP, KK, -1
  948. #else
  949. addi.d TEMP, KK, -2
  950. #endif
  951. slli.d L, TEMP, ZBASE_SHIFT
  952. slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
  953. add.d AO, AORIG, L
  954. add.d BO, B, TEMP
  955. #endif
  /* accumulator = stored value - accumulator (stored values come from BO for
     LN/LT and from AO for RN/RT). */
  956. #if defined(LN) || defined(LT)
  957. LD b1, BO, 0 * SIZE
  958. LD b2, BO, 1 * SIZE
  959. LD b3, BO, 2 * SIZE
  960. LD b4, BO, 3 * SIZE
  961. SUB c11, b1, c11
  962. SUB c12, b2, c12
  963. SUB c31, b3, c31
  964. SUB c32, b4, c32
  965. #else
  966. LD b1, AO, 0 * SIZE
  967. LD b2, AO, 1 * SIZE
  968. LD b3, AO, 2 * SIZE
  969. LD b4, AO, 3 * SIZE
  970. SUB c11, b1, c11
  971. SUB c12, b2, c12
  972. SUB c31, b3, c31
  973. SUB c32, b4, c32
  974. #endif
  /* LN/LT: combine both pairs with the two values at AO[0..1]. */
  975. #if defined(LN) || defined(LT)
  976. LD b1, AO, 0 * SIZE
  977. LD b2, AO, 1 * SIZE
  978. MUL a1, b2, c12
  979. MUL a2, b2, c11
  980. MUL a3, b2, c32
  981. MUL a4, b2, c31
  982. MADD5 c11, c11, b1, a1
  983. MADD6 c12, c12, b1, a2
  984. MADD5 c31, c31, b1, a3
  985. MADD6 c32, c32, b1, a4
  986. #endif
  /* RN: scale c1x by BO[0..1], update c3x with BO[2..3], then scale c3x by BO[6..7]. */
  987. #ifdef RN
  988. LD b1, BO, 0 * SIZE
  989. LD b2, BO, 1 * SIZE
  990. LD b3, BO, 2 * SIZE
  991. LD b4, BO, 3 * SIZE
  992. MUL a1, b2, c12
  993. MUL a2, b2, c11
  994. MADD5 c11, c11, b1, a1
  995. MADD6 c12, c12, b1, a2
  996. NMSUB c31, c11, b3, c31
  997. MADD7 c32, c11, b4, c32
  998. MADD8 c31, c12, b4, c31
  999. NMSUB c32, c12, b3, c32
  1000. LD b3, BO, 6 * SIZE
  1001. LD b4, BO, 7 * SIZE
  1002. MUL a1, b4, c32
  1003. MUL a2, b4, c31
  1004. MADD5 c31, c31, b3, a1
  1005. MADD6 c32, c32, b3, a2
  1006. #endif
  /* RT: scale c3x by BO[6..7], update c1x with BO[4..5], then scale c1x by BO[0..1]. */
  1007. #ifdef RT
  1008. LD b5, BO, 6 * SIZE
  1009. LD b6, BO, 7 * SIZE
  1010. LD b7, BO, 4 * SIZE
  1011. LD b8, BO, 5 * SIZE
  1012. MUL a1, b6, c32
  1013. MUL a2, b6, c31
  1014. MADD5 c31, c31, b5, a1
  1015. MADD6 c32, c32, b5, a2
  1016. NMSUB c11, c31, b7, c11
  1017. MADD7 c12, c31, b8, c12
  1018. MADD8 c11, c32, b8, c11
  1019. NMSUB c12, c32, b7, c12
  1020. LD b7, BO, 0 * SIZE
  1021. LD b8, BO, 1 * SIZE
  1022. MUL a1, b8, c12
  1023. MUL a2, b8, c11
  1024. MADD5 c11, c11, b7, a1
  1025. MADD6 c12, c12, b7, a2
  1026. #endif
  /* Store the four solved values for the two-column tile to the packed buffer
     (BO for LN/LT, AO for RN/RT) and to CO1/CO2, then advance the pointers and
     KK and loop back to .L21 while I > 0 after the decrement. */
  1027. #if defined(LN) || defined(LT)
  1028. ST c11, BO, 0 * SIZE
  1029. ST c12, BO, 1 * SIZE
  1030. ST c31, BO, 2 * SIZE
  1031. ST c32, BO, 3 * SIZE
  1032. #else
  1033. ST c11, AO, 0 * SIZE
  1034. ST c12, AO, 1 * SIZE
  1035. ST c31, AO, 2 * SIZE
  1036. ST c32, AO, 3 * SIZE
  1037. #endif
  /* LN moves the column pointers back before storing; other variants move
     them forward afterwards. */
  1038. #ifdef LN
  1039. addi.d CO1,CO1, -2 * SIZE
  1040. addi.d CO2,CO2, -2 * SIZE
  1041. #endif
  1042. ST c11, CO1, 0 * SIZE
  1043. ST c12, CO1, 1 * SIZE
  1044. ST c31, CO2, 0 * SIZE
  1045. ST c32, CO2, 1 * SIZE
  1046. #ifndef LN
  1047. addi.d CO1,CO1, 2 * SIZE
  1048. addi.d CO2,CO2, 2 * SIZE
  1049. #endif
  /* Reset c11 from the zero register for the next tile. */
  1050. MTC c11, $r0
  1051. #ifdef RT
  1052. slli.d TEMP, K, ZBASE_SHIFT
  1053. add.d AORIG, AORIG, TEMP
  1054. #endif
  1055. #if defined(LT) || defined(RN)
  /* AO += (K - KK) << ZBASE_SHIFT, BO += (K - KK) << (1 + ZBASE_SHIFT). */
  1056. sub.d TEMP, K, KK
  1057. slli.d L, TEMP, ZBASE_SHIFT
  1058. slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
  1059. add.d AO, AO, L
  1060. add.d BO, BO, TEMP
  1061. #endif
  1062. #ifdef LT
  1063. addi.d KK, KK, 1
  1064. #endif
  1065. #ifdef LN
  1066. addi.d KK, KK, -1
  1067. #endif
  1068. addi.d I, I, -1
  1069. blt $r0, I, .L21
  1070. .align 3
  /* .L29: end of the two-column panel.  Moves B past the panel
     (LN: B += K << (1 + ZBASE_SHIFT); LT/RN: B = BO) and steps KK by +2 (RN)
     or -2 (RT).  Falls through to .L30. */
  1071. .L29:
  1072. #ifdef LN
  1073. slli.d TEMP, K, 1 + ZBASE_SHIFT
  1074. add.d B, B, TEMP
  1075. #endif
  1076. #if defined(LT) || defined(RN)
  1077. move B, BO
  1078. #endif
  1079. #ifdef RN
  1080. addi.d KK, KK, 2
  1081. #endif
  1082. #ifdef RT
  1083. addi.d KK, KK, -2
  1084. #endif
  1085. .align 3
  /* .L30: single-column panel, executed only when bit 0 of N is set (N & 1).
     Sets CO1, KK and the A base pointer, advances C past the column
     (unless RT), and skips to .L39 when I (= M) <= 0. */
  1086. .L30:
  1087. andi J, N, 1
  1088. bge $r0, J, .L999
  1089. #ifdef RT
  /* RT walks backwards: B -= K << ZBASE_SHIFT, C -= LDC. */
  1090. slli.d TEMP, K, ZBASE_SHIFT
  1091. sub.d B, B, TEMP
  1092. sub.d C, C, LDC
  1093. #endif
  1094. MTC c11, $r0
  1095. move CO1, C
  1096. #ifdef LN
  1097. add.d KK, M, OFFSET
  1098. #endif
  1099. #ifdef LT
  1100. move KK, OFFSET
  1101. #endif
  1102. #if defined(LN) || defined(RT)
  1103. move AORIG, A
  1104. #else
  1105. move AO, A
  1106. #endif
  1107. #ifndef RT
  1108. add.d C, CO1, LDC
  1109. #endif
  1110. move I, M
  1111. bge $r0, I, .L39
  1112. .align 3
  /* .L31: per-tile setup for the single-column panel.  Preloads a1, a2, a3 and
     b1, b2, b3, copies the zero in c11 into the other accumulators, and
     computes L = (inner length) >> 2 (KK for LT/RN, K - KK for LN/RT).
     Branches to .L35 when L <= 0. */
  1113. .L31:
  1114. #if defined(LT) || defined(RN)
  1115. LD a1, AO, 0 * SIZE
  1116. MOV c21, c11
  1117. LD b1, B, 0 * SIZE
  1118. MOV c31, c11
  1119. LD a2, AO, 1 * SIZE
  1120. MOV c41, c11
  1121. LD b2, B, 1 * SIZE
  1122. MOV c12, c11
  1123. srai.d L, KK, 2
  1124. MOV c22, c11
  1125. LD a3, AO, 4 * SIZE
  1126. MOV c32, c11
  1127. LD b3, B, 4 * SIZE
  1128. MOV c42, c11
  1129. move BO, B
  1130. bge $r0, L, .L35
  1131. #else
  1132. #ifdef LN
  /* LN: step AORIG back by K << ZBASE_SHIFT before using it. */
  1133. slli.d TEMP, K, ZBASE_SHIFT
  1134. sub.d AORIG, AORIG, TEMP
  1135. #endif
  /* AO and BO are both offset by KK << ZBASE_SHIFT; TEMP = K - KK
     (also read later at .L35). */
  1136. slli.d TEMP, KK, ZBASE_SHIFT
  1137. add.d AO, AORIG, TEMP
  1138. add.d BO, B, TEMP
  1139. sub.d TEMP, K, KK
  1140. LD a1, AO, 0 * SIZE
  1141. MOV c21, c11
  1142. LD b1, BO, 0 * SIZE
  1143. MOV c31, c11
  1144. LD a2, AO, 1 * SIZE
  1145. MOV c41, c11
  1146. LD b2, BO, 1 * SIZE
  1147. MOV c12, c11
  1148. srai.d L, TEMP, 2
  1149. MOV c22, c11
  1150. LD a3, AO, 4 * SIZE
  1151. MOV c32, c11
  1152. LD b3, BO, 4 * SIZE
  1153. MOV c42, c11
  1154. bge $r0, L, .L35
  1155. #endif
  1156. .align 3
  /* .L32: unrolled accumulation loop for the single-column panel.  Each pass
     accumulates into c11, c12, c21, c22, advances AO and BO by 8 * SIZE each,
     and reloads a1, a2, a3, b1, b2, b3 for the next pass.  Loops while L > 0
     after the decrement. */
  1157. .L32:
  1158. MADD1 c11, b1, a1, c11
  1159. LD b4, BO, 3 * SIZE
  1160. MADD3 c21, b2, a1, c21
  1161. LD a1, AO, 2 * SIZE
  1162. MADD2 c12, b1, a2, c12
  1163. LD b1, BO, 2 * SIZE
  1164. MADD4 c22, b2, a2, c22
  1165. LD a2, AO, 3 * SIZE
  1166. MADD1 c11, b1, a1, c11
  1167. LD b2, BO, 5 * SIZE
  1168. MADD3 c21, b4, a1, c21
  1169. LD a1, AO, 8 * SIZE
  1170. MADD2 c12, b1, a2, c12
  1171. LD b1, BO, 8 * SIZE
  1172. MADD4 c22, b4, a2, c22
  1173. LD a2, AO, 5 * SIZE
  1174. MADD1 c11, b3, a3, c11
  1175. LD b4, BO, 7 * SIZE
  1176. MADD3 c21, b2, a3, c21
  1177. LD a3, AO, 6 * SIZE
  1178. MADD2 c12, b3, a2, c12
  1179. LD b3, BO, 6 * SIZE
  1180. MADD4 c22, b2, a2, c22
  1181. LD a2, AO, 7 * SIZE
  1182. MADD1 c11, b3, a3, c11
  1183. LD b2, BO, 9 * SIZE
  1184. MADD3 c21, b4, a3, c21
  1185. LD a3, AO, 12 * SIZE
  1186. MADD2 c12, b3, a2, c12
  1187. LD b3, BO, 12 * SIZE
  1188. MADD4 c22, b4, a2, c22
  1189. LD a2, AO, 9 * SIZE
  1190. addi.d AO, AO, 8 * SIZE
  1191. addi.d L, L, -1
  1192. addi.d BO, BO, 8 * SIZE
  1193. blt $r0, L, .L32
  1194. .align 3
  /* .L35 / .L36: remainder loop for the single-column panel.  L = (inner length) & 3,
     taken from KK for LT/RN and from TEMP (set to K - KK during tile setup) for
     LN/RT.  Each .L36 iteration advances AO and BO by 2 * SIZE each. */
  1195. .L35:
  1196. #if defined(LT) || defined(RN)
  1197. andi L, KK, 3
  1198. #else
  1199. andi L, TEMP, 3
  1200. #endif
  1201. bge $r0, L, .L38
  1202. .align 3
  1203. .L36:
  1204. MADD1 c11, b1, a1, c11
  1205. addi.d L, L, -1
  1206. MADD3 c21, b2, a1, c21
  1207. LD a1, AO, 2 * SIZE
  1208. MADD2 c12, b1, a2, c12
  1209. LD b1, BO, 2 * SIZE
  1210. MADD4 c22, b2, a2, c22
  1211. LD a2, AO, 3 * SIZE
  1212. LD b2, BO, 3 * SIZE
  1213. addi.d BO, BO, 2 * SIZE
  1214. addi.d AO, AO, 2 * SIZE
  1215. blt $r0, L, .L36
  /* .L38: finish one tile of the single-column panel.  Folds the four partial
     accumulators into c11/c12, repositions AO/BO for LN/RT, subtracts the
     accumulators from the stored values, applies the solve step, stores the
     result to the packed buffer and to CO1, and loops back to .L31 while
     I > 0 after the decrement. */
  1216. .L38:
  1217. ADD c11, c11, c22
  1218. ADD c12, c12, c21
  1219. #if defined(LN) || defined(RT)
  /* AO = AORIG + ((KK - 1) << ZBASE_SHIFT), BO = B + ((KK - 1) << ZBASE_SHIFT). */
  1220. addi.d TEMP, KK, -1
  1221. slli.d TEMP, TEMP, ZBASE_SHIFT
  1222. add.d AO, AORIG, TEMP
  1223. add.d BO, B, TEMP
  1224. #endif
  /* accumulator = stored value - accumulator. */
  1225. #if defined(LN) || defined(LT)
  1226. LD b1, BO, 0 * SIZE
  1227. LD b2, BO, 1 * SIZE
  1228. SUB c11, b1, c11
  1229. SUB c12, b2, c12
  1230. #else
  1231. LD b1, AO, 0 * SIZE
  1232. LD b2, AO, 1 * SIZE
  1233. SUB c11, b1, c11
  1234. SUB c12, b2, c12
  1235. #endif
  /* Solve step: the same MUL / MADD5 / MADD6 sequence with the factor read
     from AO[0..1] (LN/LT) or BO[0..1] (RN/RT). */
  1236. #if defined(LN) || defined(LT)
  1237. LD b1, AO, 0 * SIZE
  1238. LD b2, AO, 1 * SIZE
  1239. MUL a1, b2, c12
  1240. MUL a2, b2, c11
  1241. MADD5 c11, c11, b1, a1
  1242. MADD6 c12, c12, b1, a2
  1243. #endif
  1244. #if defined(RN) || defined(RT)
  1245. LD b1, BO, 0 * SIZE
  1246. LD b2, BO, 1 * SIZE
  1247. MUL a1, b2, c12
  1248. MUL a2, b2, c11
  1249. MADD5 c11, c11, b1, a1
  1250. MADD6 c12, c12, b1, a2
  1251. #endif
  1252. #if defined(LN) || defined(LT)
  1253. ST c11, BO, 0 * SIZE
  1254. ST c12, BO, 1 * SIZE
  1255. #else
  1256. ST c11, AO, 0 * SIZE
  1257. ST c12, AO, 1 * SIZE
  1258. #endif
  /* LN moves CO1 back before storing; other variants move it forward afterwards. */
  1259. #ifdef LN
  1260. addi.d CO1,CO1, -2 * SIZE
  1261. #endif
  1262. ST c11, CO1, 0 * SIZE
  1263. ST c12, CO1, 1 * SIZE
  1264. #ifndef LN
  1265. addi.d CO1,CO1, 2 * SIZE
  1266. #endif
  /* Reset c11 from the zero register for the next tile. */
  1267. MTC c11, $r0
  1268. #ifdef RT
  1269. slli.d TEMP, K, ZBASE_SHIFT
  1270. add.d AORIG, AORIG, TEMP
  1271. #endif
  1272. #if defined(LT) || defined(RN)
  /* AO and BO both advance by (K - KK) << ZBASE_SHIFT. */
  1273. sub.d TEMP, K, KK
  1274. slli.d TEMP, TEMP, ZBASE_SHIFT
  1275. add.d AO, AO, TEMP
  1276. add.d BO, BO, TEMP
  1277. #endif
  1278. #ifdef LT
  1279. addi.d KK, KK, 1
  1280. #endif
  1281. #ifdef LN
  1282. addi.d KK, KK, -1
  1283. #endif
  1284. addi.d I, I, -1
  1285. blt $r0, I, .L31
  1286. .align 3
  /* .L39: end of the single-column panel.  Moves B past the panel
     (LN: B += K << ZBASE_SHIFT; LT/RN: B = BO) and steps KK by +1 (RN) or
     -1 (RT).  Falls through to the epilogue at .L999. */
  1287. .L39:
  1288. #ifdef LN
  1289. slli.d TEMP, K, ZBASE_SHIFT
  1290. add.d B, B, TEMP
  1291. #endif
  1292. #if defined(LT) || defined(RN)
  1293. move B, BO
  1294. #endif
  1295. #ifdef RN
  1296. addi.d KK, KK, 1
  1297. #endif
  1298. #ifdef RT
  1299. addi.d KK, KK, -1
  1300. #endif
  1301. .align 3
  /* .L999: common exit.  Reloads $r23-$r28 and $f24-$f27 (plus $f18-$f21 when
     __64BIT__ is not defined) from the same frame offsets used at entry,
     releases the 128-byte frame, and returns through $r1. */
  1302. .L999:
  1303. LDARG $r23, $sp, 0
  1304. LDARG $r24, $sp, 8
  1305. LDARG $r25, $sp, 16
  1306. LDARG $r26, $sp, 24
  1307. LDARG $r27, $sp, 32
  1308. LDARG $r28, $sp, 40
  1309. fld.d $f24, $sp, 48
  1310. fld.d $f25, $sp, 56
  1311. fld.d $f26, $sp, 64
  1312. fld.d $f27, $sp, 72
  1313. #ifndef __64BIT__
  1314. fld.d $f18, $sp, 88
  1315. fld.d $f19, $sp, 96
  1316. fld.d $f20, $sp, 104
  1317. fld.d $f21, $sp, 112
  1318. #endif
  1319. addi.d $sp, $sp, 128
  /* $r4 receives $r17 (the I counter) and $f0 receives $f22 (a1) before the return.
     NOTE(review): these look like leftover return-value moves; callers
     presumably ignore them - confirm. */
  1320. move $r4, $r17
  1321. fmov.d $f0, $f22
  1322. jirl $r0, $r1, 0x0
  1323. EPILOGUE