You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_small_kernel_tn_lasx.S 22 kB


  1. /***************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  30. #define M $a0 // row count; walked in tiles of 4, then the M & 2 and M & 1 remainders
  31. #define N $a1 // column count; walked in groups of 4, then N & 2 and N & 1
  32. #define K $a2 // length of the inner accumulation loop
  33. #define A $a3 // base pointer of A; advanced by 4*LDA / 2*LDA bytes after each row tile
  34. #define LDA $a4 // stride of A; converted from elements to bytes (<< 3) in the prologue
  35. #define ALPHA $f0 // scalar alpha; the prologue broadcasts lane 0 of $xr0 (VALPHA)
  36. #define B $a5 // base pointer of B
  37. #define LDB $a6 // stride of B; converted to bytes in the prologue
  38. #define C $a7 // base pointer of C; advanced by 0x20 / 0x10 bytes after each row tile
  39. #define LDC $t0 // stride of C; loaded from 0($sp) in the prologue, then converted to bytes
  40. #ifndef B0 // beta only exists when B0 is NOT defined (same guard as VBETA below)
  41. #define BETA $f1 // scalar beta; not referenced directly, the code uses VBETA ($xr1)
  42. #endif
  43. #undef ZERO
  44. #define ZERO $r0 // hard-wired zero register
  45. #define M4 $t1 // M4, M2 and M1 share $t1: only one of them is live at a time
  46. #define M2 $t1
  47. #define M1 $t1
  48. #define N4 $t2 // N4, N2 and N1 share $t2: only one of them is live at a time
  49. #define N2 $t2
  50. #define N1 $t2
  51. #define K8 $t3 // K * 8: bytes walked along K, used to rewind the B pointers
  52. #define A0 $t4 // running pointer into the first line of A of the current tile
  53. #define X0 $t5 // running pointer into the first line of B of the current group
  54. #define B1 $t6 // X0 + LDB
  55. #define B2 $t7 // B1 + LDB
  56. #define B3 $t8 // B2 + LDB
  57. #define C0 $s0 // output pointer, first line of C of the current group
  58. #define C1 $s1 // C0 + LDC
  59. #define C2 $s2 // C1 + LDC
  60. #define C3 $s3 // C2 + LDC
  61. #define K1 $s4 // inner-loop trip counter
  62. #define A1 $s5 // A0 + LDA
  63. #define A2 $s6 // A1 + LDA
  64. #define A3 $s7 // A2 + LDA
  65. #define VALPHA $xr0 // alpha in all four lanes (after the prologue broadcast)
  66. #ifndef B0
  67. #define VBETA $xr1 // beta in all four lanes (after the prologue broadcast)
  68. #endif
  69. #define D0 $xr2 // D0..D3: accumulators, one per line of C in the current group
  70. #define D1 $xr3
  71. #define D2 $xr4
  72. #define D3 $xr5
  73. #define T0 $xr6 // T0..T3: raw A loads, later reused for broadcast B values
  74. #define T1 $xr7
  75. #define T2 $xr8
  76. #define T3 $xr9
  77. #define Y0 $xr10 // Y0..Y3: broadcast B values
  78. #define Y1 $xr11
  79. #define Y2 $xr12
  80. #define Y3 $xr13
  81. #define G0 $xr14 // G0..G3: broadcast B values
  82. #define G1 $xr15
  83. #define G2 $xr16
  84. #define G3 $xr17
  85. #define S0 $xr18 // S0..S3: A operands of the multiply-adds; also used to load C for the beta update
  86. #define S1 $xr19
  87. #define S2 $xr20
  88. #define S3 $xr21
  89. #define Z0 $xr22 // Z0..Z3: broadcast B values; Z0/Z1 are also passed to GTRANSPOSE4x4_D
  90. #define Z1 $xr23
  91. #define Z2 $xr24
  92. #define Z3 $xr25
  93. #define V0 $vr2 // V0..V3: 128-bit names of registers 2..5 (same numbers as D0..D3), stored by the 2-row tile
  94. #define V1 $vr3
  95. #define V2 $vr4
  96. #define V3 $vr5
  97. #define F0 $f2 // F0..F3: 64-bit names of registers 2..5 (same numbers as D0..D3), stored by the 1-row tile
  98. #define F1 $f3
  99. #define F2 $f4
  100. #define F3 $f5
  101. PROLOGUE // entry macro supplied by the included headers
  102. PTR_LD LDC, $sp, 0 // LDC is read from 0($sp); must happen before push_if_used touches the stack
  103. push_if_used 8, 2 // macro from loongarch64_asm.S; presumably saves the callee-saved registers used here - paired with pop_if_used 8, 2 at .L_M0
  104. xvreplve0.d VALPHA, VALPHA // copy lane 0 (alpha) into all four lanes
  105. #ifndef B0
  106. xvreplve0.d VBETA, VBETA // copy lane 0 (beta) into all four lanes
  107. #endif
  108. PTR_SLLI LDA, LDA, 3 // LDA <<= 3: element stride -> byte stride (8-byte elements)
  109. PTR_SLLI LDB, LDB, 3 // LDB <<= 3
  110. PTR_SLLI LDC, LDC, 3 // LDC <<= 3
  111. PTR_SLLI K8, K, 3 // K8 = K << 3: bytes walked along K, used later to rewind B pointers
  112. PTR_SRAI M4, M, 2 // M4 = M >> 2: number of 4-row tiles
  113. beqz M4, .L_M3 // no 4-row tile at all: go straight to the M & 2 remainder
  114. .L_M4: // ---- 4-row tile, executed M >> 2 times; Dj lane i accumulates sum over k of Ai[k] * Bj[k] (B0 == X0) ----
  115. PTR_SRAI N4, N, 2 // N4 = N >> 2: number of 4-column groups
  116. move A0, A // A0..A3: four lines of A, LDA bytes apart
  117. PTR_ADD A1, A0, LDA
  118. PTR_ADD A2, A1, LDA
  119. PTR_ADD A3, A2, LDA
  120. move X0, B // X0, B1..B3: four lines of B, LDB bytes apart
  121. PTR_ADD B1, X0, LDB
  122. PTR_ADD B2, B1, LDB
  123. PTR_ADD B3, B2, LDB
  124. move C0, C // C0..C3: four lines of C, LDC bytes apart
  125. PTR_ADD C1, C0, LDC
  126. PTR_ADD C2, C1, LDC
  127. PTR_ADD C3, C2, LDC
  128. beqz N4, .L_M4_N3 // fewer than 4 columns: skip to the N & 2 remainder
  129. .L_M4_N4: // 4x4 block of C
  130. GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 // Dj = Dj ^ Dj: clear the four accumulators
  131. move K1, K // K1 = K
  132. PTR_ADDI N4, N4, -1 // one 4-column group consumed
  133. bge ZERO, K, .L_M4_N4_END // K <= 0: nothing to accumulate, only scale and store
  134. PTR_SRAI K1, K1, 3 // K1 = K >> 3: trips of the 8-way unrolled loop
  135. beq ZERO, K1, .L_M4_N4_K7 // K < 8: go to the remainder handling
  136. .L_M4_N4_K8: // consumes 8 values of K per trip, as two 4-wide halves
  137. PTR_ADDI K1, K1, -1
  138. GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00 // Ti = Ai[k .. k+3]
  139. GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1 // macro from loongarch64_asm.S; presumably Sn = {A0[k+n], A1[k+n], A2[k+n], A3[k+n]} with Z0/Z1 as scratch
  140. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 // Zj = Bj[k] in every lane
  141. GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08 // Tj = Bj[k+1] (T0..T3 are free after the transpose)
  142. GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10 // Yj = Bj[k+2]
  143. GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18 // Gj = Bj[k+3]
  144. GMADD xvf, d, D0, S0, Z0, D0, \
  145. D1, S0, Z1, D1, \
  146. D2, S0, Z2, D2, \
  147. D3, S0, Z3, D3
  148. GMADD xvf, d, D0, S1, T0, D0, \
  149. D1, S1, T1, D1, \
  150. D2, S1, T2, D2, \
  151. D3, S1, T3, D3
  152. GMADD xvf, d, D0, S2, Y0, D0, \
  153. D1, S2, Y1, D1, \
  154. D2, S2, Y2, D2, \
  155. D3, S2, Y3, D3
  156. GMADD xvf, d, D0, S3, G0, D0, \
  157. D1, S3, G1, D1, \
  158. D2, S3, G2, D2, \
  159. D3, S3, G3, D3
  160. GLD xv, , T0, A0, 0x20, T1, A1, 0x20, T2, A2, 0x20, T3, A3, 0x20 // second half: Ti = Ai[k+4 .. k+7]
  161. GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1
  162. GLDREPL xv, d, Z0, X0, 0x20, Z1, B1, 0x20, Z2, B2, 0x20, Z3, B3, 0x20 // Zj = Bj[k+4]
  163. GLDREPL xv, d, T0, X0, 0x28, T1, B1, 0x28, T2, B2, 0x28, T3, B3, 0x28 // Tj = Bj[k+5]
  164. GLDREPL xv, d, Y0, X0, 0x30, Y1, B1, 0x30, Y2, B2, 0x30, Y3, B3, 0x30 // Yj = Bj[k+6]
  165. GLDREPL xv, d, G0, X0, 0x38, G1, B1, 0x38, G2, B2, 0x38, G3, B3, 0x38 // Gj = Bj[k+7]
  166. GMADD xvf, d, D0, S0, Z0, D0, \
  167. D1, S0, Z1, D1, \
  168. D2, S0, Z2, D2, \
  169. D3, S0, Z3, D3
  170. GMADD xvf, d, D0, S1, T0, D0, \
  171. D1, S1, T1, D1, \
  172. D2, S1, T2, D2, \
  173. D3, S1, T3, D3
  174. GMADD xvf, d, D0, S2, Y0, D0, \
  175. D1, S2, Y1, D1, \
  176. D2, S2, Y2, D2, \
  177. D3, S2, Y3, D3
  178. GMADD xvf, d, D0, S3, G0, D0, \
  179. D1, S3, G1, D1, \
  180. D2, S3, G2, D2, \
  181. D3, S3, G3, D3
  182. PTR_ADDI X0, X0, 0x40 // all eight pointers advance by 8 elements (0x40 bytes)
  183. PTR_ADDI B1, B1, 0x40
  184. PTR_ADDI B2, B2, 0x40
  185. PTR_ADDI B3, B3, 0x40
  186. PTR_ADDI A0, A0, 0x40
  187. PTR_ADDI A1, A1, 0x40
  188. PTR_ADDI A2, A2, 0x40
  189. PTR_ADDI A3, A3, 0x40
  190. bnez K1, .L_M4_N4_K8
  191. .L_M4_N4_K7: // up to 7 values of K remain
  192. andi K1, K, 4 // bit 2 of K set: one more 4-wide step
  193. beqz K1, .L_M4_N4_3
  194. .L_M4_N4_K4: // same body as one half of the unrolled loop
  195. GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00
  196. GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1
  197. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
  198. GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08
  199. GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10
  200. GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18
  201. GMADD xvf, d, D0, S0, Z0, D0, \
  202. D1, S0, Z1, D1, \
  203. D2, S0, Z2, D2, \
  204. D3, S0, Z3, D3
  205. GMADD xvf, d, D0, S1, T0, D0, \
  206. D1, S1, T1, D1, \
  207. D2, S1, T2, D2, \
  208. D3, S1, T3, D3
  209. GMADD xvf, d, D0, S2, Y0, D0, \
  210. D1, S2, Y1, D1, \
  211. D2, S2, Y2, D2, \
  212. D3, S2, Y3, D3
  213. GMADD xvf, d, D0, S3, G0, D0, \
  214. D1, S3, G1, D1, \
  215. D2, S3, G2, D2, \
  216. D3, S3, G3, D3
  217. PTR_ADDI X0, X0, 0x20 // advance by 4 elements (0x20 bytes)
  218. PTR_ADDI B1, B1, 0x20
  219. PTR_ADDI B2, B2, 0x20
  220. PTR_ADDI B3, B3, 0x20
  221. PTR_ADDI A0, A0, 0x20
  222. PTR_ADDI A1, A1, 0x20
  223. PTR_ADDI A2, A2, 0x20
  224. PTR_ADDI A3, A3, 0x20
  225. .L_M4_N4_3:
  226. andi K1, K, 3 // K1 = K & 3: last 0..3 values, one per trip
  227. beqz K1, .L_M4_N4_END
  228. .L_M4_N4_K1:
  229. GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 // 8-byte broadcast loads: only one element per line is needed; a 256-bit load here would read 24 bytes past the last K element
  230. GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 // S0 = {A0[k], A1[k], A2[k], A3[k]} (element 0 of S1..S3 inserted into lanes 1..3)
  231. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
  232. GMADD xvf, d, D0, S0, Z0, D0, \
  233. D1, S0, Z1, D1, \
  234. D2, S0, Z2, D2, \
  235. D3, S0, Z3, D3
  236. PTR_ADDI K1, K1, -1
  237. PTR_ADDI X0, X0, 0x08 // advance by one element
  238. PTR_ADDI B1, B1, 0x08
  239. PTR_ADDI B2, B2, 0x08
  240. PTR_ADDI B3, B3, 0x08
  241. PTR_ADDI A0, A0, 0x08
  242. PTR_ADDI A1, A1, 0x08
  243. PTR_ADDI A2, A2, 0x08
  244. PTR_ADDI A3, A3, 0x08
  245. bnez K1, .L_M4_N4_K1
  246. .L_M4_N4_END:
  247. GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA // Dj *= alpha
  248. #ifndef B0
  249. GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 // Sj = current contents of C line j (4 elements)
  250. GMADD xvf, d, D0, S0, VBETA, D0, \
  251. D1, S1, VBETA, D1, \
  252. D2, S2, VBETA, D2, \
  253. D3, S3, VBETA, D3
  254. #endif
  255. GST xv, , D3, C3, 0x00, \
  256. D2, C2, 0x00, \
  257. D1, C1, 0x00, \
  258. D0, C0, 0x00
  259. // Advance C0..C3 by four lines (LDC << 2) to the next 4-column group
  260. PTR_ALSL C0, LDC, C0, 2
  261. PTR_ALSL C1, LDC, C1, 2
  262. PTR_ALSL C2, LDC, C2, 2
  263. PTR_ALSL C3, LDC, C3, 2
  264. // Rewind X0, B1..B3 by the K8 bytes just walked, then advance four lines (LDB << 2)
  265. PTR_SUB X0, X0, K8
  266. PTR_SUB B1, B1, K8
  267. PTR_SUB B2, B2, K8
  268. PTR_SUB B3, B3, K8
  269. PTR_ALSL X0, LDB, X0, 2
  270. PTR_ALSL B1, LDB, B1, 2
  271. PTR_ALSL B2, LDB, B2, 2
  272. PTR_ALSL B3, LDB, B3, 2
  273. // Reset A0..A3 to the start of the current row tile
  274. move A0, A
  275. PTR_ADD A1, A0, LDA
  276. PTR_ADD A2, A1, LDA
  277. PTR_ADD A3, A2, LDA
  278. bnez N4, .L_M4_N4
  279. .L_M4_N3: // up to 3 columns remain
  280. andi N2, N, 0x02 // bit 1 of N set: one 2-column group (overwrites N4, same register)
  281. beqz N2, .L_M4_N1
  282. .L_M4_N2: // 4x2 block of C; K is walked one element per trip (no unrolling)
  283. GXOR xv, v, D0, D0, D0, D1, D1, D1 // clear the two accumulators
  284. move K1, K // K1 = K
  285. bge ZERO, K, .L_M4_N2_END // K <= 0: nothing to accumulate
  286. .L_M4_N2_K1:
  287. PTR_ADDI K1, K1, -1
  288. GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 // 8-byte broadcast loads: a 256-bit load would read past the last K element
  289. GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 // S0 = {A0[k], A1[k], A2[k], A3[k]}
  290. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
  291. GMADD xvf, d, D0, S0, Z0, D0, \
  292. D1, S0, Z1, D1
  293. PTR_ADDI X0, X0, 0x08
  294. PTR_ADDI B1, B1, 0x08
  295. PTR_ADDI A0, A0, 0x08
  296. PTR_ADDI A1, A1, 0x08
  297. PTR_ADDI A2, A2, 0x08
  298. PTR_ADDI A3, A3, 0x08
  299. bnez K1, .L_M4_N2_K1
  300. .L_M4_N2_END:
  301. GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA // Dj *= alpha
  302. #ifndef B0
  303. GLD xv, , S0, C0, 0x00, S1, C1, 0x00
  304. GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
  305. #endif
  306. GST xv, , D1, C1, 0x00, \
  307. D0, C0, 0x00
  308. // Advance C0, C1 by two lines (LDC << 1)
  309. PTR_ALSL C0, LDC, C0, 1
  310. PTR_ALSL C1, LDC, C1, 1
  311. // Rewind X0, B1 by K8 bytes, then advance two lines (LDB << 1)
  312. PTR_SUB X0, X0, K8
  313. PTR_SUB B1, B1, K8
  314. PTR_ALSL X0, LDB, X0, 1
  315. PTR_ALSL B1, LDB, B1, 1
  316. // Reset A0..A3 to the start of the current row tile
  317. move A0, A
  318. PTR_ADD A1, A0, LDA
  319. PTR_ADD A2, A1, LDA
  320. PTR_ADD A3, A2, LDA
  321. .L_M4_N1:
  322. andi N1, N, 0x01 // bit 0 of N set: one last single column
  323. beqz N1, .L_M4_END
  324. GXOR xv, v, D0, D0, D0 // clear the accumulator
  325. move K1, K // K1 = K
  326. bge ZERO, K, .L_M4_N1_END // K <= 0: nothing to accumulate
  327. .L_M4_N1_K1:
  328. PTR_ADDI K1, K1, -1
  329. GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00 // 8-byte broadcast loads: a 256-bit load would read past the last K element
  330. GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3 // S0 = {A0[k], A1[k], A2[k], A3[k]}
  331. GLDREPL xv, d, Z0, X0, 0x00
  332. GMADD xvf, d, D0, S0, Z0, D0
  333. PTR_ADDI X0, X0, 0x08
  334. PTR_ADDI A0, A0, 0x08
  335. PTR_ADDI A1, A1, 0x08
  336. PTR_ADDI A2, A2, 0x08
  337. PTR_ADDI A3, A3, 0x08
  338. bnez K1, .L_M4_N1_K1
  339. .L_M4_N1_END:
  340. GMUL xvf, d, D0, D0, VALPHA // D0 *= alpha
  341. #ifndef B0
  342. GLD xv, , S0, C0, 0x00
  343. GMADD xvf, d, D0, S0, VBETA, D0
  344. #endif
  345. GST xv, , D0, C0, 0x00
  346. // The C0 / X0 / A0..A3 updates below are never read: every later path reloads them from C, B and A
  347. PTR_ALSL C0, LDC, C0, 2
  348. // Rewind X0 by K8 bytes and step it forward (value unused, see above)
  349. PTR_SUB X0, X0, K8
  350. PTR_ALSL X0, LDB, X0, 2
  351. // Reset A0..A3 (value unused, see above)
  352. move A0, A
  353. PTR_ADD A1, A0, LDA
  354. PTR_ADD A2, A1, LDA
  355. PTR_ADD A3, A2, LDA
  356. .L_M4_END:
  357. PTR_ADDI M4, M4, -1 // one 4-row tile done
  358. PTR_ALSL A, LDA, A, 2 // A += LDA << 2;
  359. PTR_ADDI C, C, 0x20 // C += 4 elements (0x20 bytes)
  360. bnez M4, .L_M4
  361. .L_M3: // ---- up to 3 rows remain ----
  362. andi M2, M, 0x02 // bit 1 of M set: one 2-row tile
  363. beqz M2, .L_M1
  364. .L_M2: // 2-row tile: only lanes 0..1 of each accumulator are stored (through V0..V3)
  365. PTR_SRAI N4, N, 2 // N4 = N >> 2: number of 4-column groups
  366. move A0, A // A0, A1: two lines of A, LDA bytes apart
  367. PTR_ADD A1, A0, LDA
  368. move X0, B // X0, B1..B3: four lines of B, LDB bytes apart
  369. PTR_ADD B1, X0, LDB
  370. PTR_ADD B2, B1, LDB
  371. PTR_ADD B3, B2, LDB
  372. move C0, C // C0..C3: four lines of C, LDC bytes apart
  373. PTR_ADD C1, C0, LDC
  374. PTR_ADD C2, C1, LDC
  375. PTR_ADD C3, C2, LDC
  376. beqz N4, .L_M2_N3 // fewer than 4 columns: skip to the N & 2 remainder
  377. .L_M2_N4: // 2x4 block of C; K is walked one element per trip
  378. GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 // clear the four accumulators
  379. move K1, K // K1 = K
  380. PTR_ADDI N4, N4, -1 // one 4-column group consumed
  381. bge ZERO, K, .L_M2_N4_END // K <= 0: nothing to accumulate
  382. .L_M2_N4_K1:
  383. PTR_ADDI K1, K1, -1
  384. GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00 // 8-byte broadcast loads: only one element per line is needed; a 256-bit load would read past the last K element
  385. GINSVE0 xv, d, S0, S1, 1 // S0 lanes 0..1 = {A0[k], A1[k]}; lanes 2..3 feed results that are never stored
  386. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 // Zj = Bj[k] in every lane
  387. GMADD xvf, d, D0, S0, Z0, D0, \
  388. D1, S0, Z1, D1, \
  389. D2, S0, Z2, D2, \
  390. D3, S0, Z3, D3
  391. PTR_ADDI X0, X0, 0x08 // advance by one element
  392. PTR_ADDI B1, B1, 0x08
  393. PTR_ADDI B2, B2, 0x08
  394. PTR_ADDI B3, B3, 0x08
  395. PTR_ADDI A0, A0, 0x08
  396. PTR_ADDI A1, A1, 0x08
  397. bnez K1, .L_M2_N4_K1
  398. .L_M2_N4_END:
  399. GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA // Dj *= alpha
  400. #ifndef B0
  401. GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 // NOTE(review): 256-bit loads although only 128 bits per line are stored below - looks like a read past the two valid elements; confirm or narrow
  402. GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
  403. #endif
  404. GST v, , V3, C3, 0x00, \
  405. V2, C2, 0x00, \
  406. V1, C1, 0x00, \
  407. V0, C0, 0x00
  408. // Advance C0..C3 by four lines (LDC << 2)
  409. PTR_ALSL C0, LDC, C0, 2
  410. PTR_ALSL C1, LDC, C1, 2
  411. PTR_ALSL C2, LDC, C2, 2
  412. PTR_ALSL C3, LDC, C3, 2
  413. // Rewind X0, B1..B3 by K8 bytes, then advance four lines (LDB << 2)
  414. PTR_SUB X0, X0, K8
  415. PTR_SUB B1, B1, K8
  416. PTR_SUB B2, B2, K8
  417. PTR_SUB B3, B3, K8
  418. PTR_ALSL X0, LDB, X0, 2
  419. PTR_ALSL B1, LDB, B1, 2
  420. PTR_ALSL B2, LDB, B2, 2
  421. PTR_ALSL B3, LDB, B3, 2
  422. // Reset A0, A1 to the start of the current row tile
  423. move A0, A
  424. PTR_ADD A1, A0, LDA
  425. bnez N4, .L_M2_N4
  426. .L_M2_N3: // up to 3 columns remain
  427. andi N2, N, 0x02 // bit 1 of N set: one 2-column group
  428. beqz N2, .L_M2_N1
  429. .L_M2_N2: // 2x2 block of C
  430. GXOR xv, v, D0, D0, D0, D1, D1, D1 // clear the two accumulators
  431. move K1, K // K1 = K
  432. bge ZERO, K, .L_M2_N2_END // K <= 0: nothing to accumulate
  433. .L_M2_N2_K1:
  434. PTR_ADDI K1, K1, -1
  435. GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00 // 8-byte broadcast loads: a 256-bit load would read past the last K element
  436. GINSVE0 xv, d, S0, S1, 1 // S0 lanes 0..1 = {A0[k], A1[k]}
  437. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
  438. GMADD xvf, d, D0, S0, Z0, D0, \
  439. D1, S0, Z1, D1
  440. PTR_ADDI X0, X0, 0x08
  441. PTR_ADDI B1, B1, 0x08
  442. PTR_ADDI A0, A0, 0x08
  443. PTR_ADDI A1, A1, 0x08
  444. bnez K1, .L_M2_N2_K1
  445. .L_M2_N2_END:
  446. GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA // Dj *= alpha
  447. #ifndef B0
  448. GLD xv, , S0, C0, 0x00, S1, C1, 0x00 // NOTE(review): 256-bit loads, 128-bit stores - same possible over-read of C as in the 2x4 block
  449. GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
  450. #endif
  451. GST v, , V1, C1, 0x00, \
  452. V0, C0, 0x00
  453. // Advance C0, C1 by two lines (LDC << 1)
  454. PTR_ALSL C0, LDC, C0, 1
  455. PTR_ALSL C1, LDC, C1, 1
  456. // Rewind X0, B1 by K8 bytes, then advance two lines (LDB << 1)
  457. PTR_SUB X0, X0, K8
  458. PTR_SUB B1, B1, K8
  459. PTR_ALSL X0, LDB, X0, 1
  460. PTR_ALSL B1, LDB, B1, 1
  461. // Reset A0, A1 to the start of the current row tile
  462. move A0, A
  463. PTR_ADD A1, A0, LDA
  464. .L_M2_N1:
  465. andi N1, N, 0x01 // bit 0 of N set: one last single column
  466. beqz N1, .L_M2_END
  467. GXOR xv, v, D0, D0, D0 // clear the accumulator
  468. move K1, K // K1 = K
  469. bge ZERO, K, .L_M2_N1_END // K <= 0: nothing to accumulate
  470. .L_M2_N1_K1:
  471. PTR_ADDI K1, K1, -1
  472. GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00 // 8-byte broadcast loads: a 256-bit load would read past the last K element
  473. GINSVE0 xv, d, S0, S1, 1 // S0 lanes 0..1 = {A0[k], A1[k]}
  474. GLDREPL xv, d, Z0, X0, 0x00
  475. GMADD xvf, d, D0, S0, Z0, D0
  476. PTR_ADDI X0, X0, 0x08
  477. PTR_ADDI A0, A0, 0x08
  478. PTR_ADDI A1, A1, 0x08
  479. bnez K1, .L_M2_N1_K1
  480. .L_M2_N1_END:
  481. GMUL xvf, d, D0, D0, VALPHA // D0 *= alpha
  482. #ifndef B0
  483. GLD xv, , S0, C0, 0x00 // NOTE(review): 256-bit load, 128-bit store - same possible over-read of C
  484. GMADD xvf, d, D0, S0, VBETA, D0
  485. #endif
  486. GST v, , V0, C0, 0x00
  487. // The C0 / X0 / A0, A1 updates below are never read: the 1-row tile reloads them and the exit path ignores them
  488. PTR_ALSL C0, LDC, C0, 2
  489. // Rewind X0 by K8 bytes and step it forward (value unused, see above)
  490. PTR_SUB X0, X0, K8
  491. PTR_ALSL X0, LDB, X0, 2
  492. // Reset A0, A1 (value unused, see above)
  493. move A0, A
  494. PTR_ADD A1, A0, LDA
  495. .L_M2_END:
  496. PTR_ALSL A, LDA, A, 1 // A += LDA << 1;
  497. PTR_ADDI C, C, 0x10 // C += 2 elements (0x10 bytes)
  498. .L_M1: // ---- last single row: only lane 0 of each accumulator is stored (through F0..F3) ----
  499. andi M1, M, 0x01 // bit 0 of M set: one 1-row tile
  500. beqz M1, .L_M0
  501. PTR_SRAI N4, N, 2 // N4 = N >> 2: number of 4-column groups
  502. move A0, A // A0: the single remaining line of A
  503. move X0, B // X0, B1..B3: four lines of B, LDB bytes apart
  504. PTR_ADD B1, X0, LDB
  505. PTR_ADD B2, B1, LDB
  506. PTR_ADD B3, B2, LDB
  507. move C0, C // C0..C3: four lines of C, LDC bytes apart
  508. PTR_ADD C1, C0, LDC
  509. PTR_ADD C2, C1, LDC
  510. PTR_ADD C3, C2, LDC
  511. beqz N4, .L_M1_N3 // fewer than 4 columns: skip to the N & 2 remainder
  512. .L_M1_N4: // 1x4 block of C; K is walked one element per trip
  513. GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3 // clear the four accumulators
  514. move K1, K // K1 = K
  515. PTR_ADDI N4, N4, -1 // one 4-column group consumed
  516. bge ZERO, K, .L_M1_N4_END // K <= 0: nothing to accumulate
  517. .L_M1_N4_K1:
  518. PTR_ADDI K1, K1, -1
  519. GLDREPL xv, d, S0, A0, 0x00 // 8-byte broadcast load: only lane 0 is ever stored; a 256-bit load would read past the last K element
  520. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00 // Zj = Bj[k] in every lane
  521. GMADD xvf, d, D0, S0, Z0, D0, \
  522. D1, S0, Z1, D1, \
  523. D2, S0, Z2, D2, \
  524. D3, S0, Z3, D3
  525. PTR_ADDI X0, X0, 0x08 // advance by one element
  526. PTR_ADDI B1, B1, 0x08
  527. PTR_ADDI B2, B2, 0x08
  528. PTR_ADDI B3, B3, 0x08
  529. PTR_ADDI A0, A0, 0x08
  530. bnez K1, .L_M1_N4_K1
  531. .L_M1_N4_END:
  532. GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA // Dj *= alpha
  533. #ifndef B0
  534. GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00 // NOTE(review): 256-bit loads although only 64 bits per line are stored below - looks like a read past the one valid element; confirm or narrow
  535. GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
  536. #endif
  537. GST f, d, F3, C3, 0x00, \
  538. F2, C2, 0x00, \
  539. F1, C1, 0x00, \
  540. F0, C0, 0x00
  541. // Advance C0..C3 by four lines (LDC << 2)
  542. PTR_ALSL C0, LDC, C0, 2
  543. PTR_ALSL C1, LDC, C1, 2
  544. PTR_ALSL C2, LDC, C2, 2
  545. PTR_ALSL C3, LDC, C3, 2
  546. // Rewind X0, B1..B3 by K8 bytes, then advance four lines (LDB << 2)
  547. PTR_SUB X0, X0, K8
  548. PTR_SUB B1, B1, K8
  549. PTR_SUB B2, B2, K8
  550. PTR_SUB B3, B3, K8
  551. PTR_ALSL X0, LDB, X0, 2
  552. PTR_ALSL B1, LDB, B1, 2
  553. PTR_ALSL B2, LDB, B2, 2
  554. PTR_ALSL B3, LDB, B3, 2
  555. // Reset A0 to the start of the row
  556. move A0, A
  557. bnez N4, .L_M1_N4
  558. .L_M1_N3: // up to 3 columns remain
  559. andi N2, N, 0x02 // bit 1 of N set: one 2-column group
  560. beqz N2, .L_M1_N1
  561. .L_M1_N2: // 1x2 block of C
  562. GXOR xv, v, D0, D0, D0, D1, D1, D1 // clear the two accumulators
  563. move K1, K // K1 = K
  564. bge ZERO, K, .L_M1_N2_END // K <= 0: nothing to accumulate
  565. .L_M1_N2_K1:
  566. PTR_ADDI K1, K1, -1
  567. GLDREPL xv, d, S0, A0, 0x00 // 8-byte broadcast load: a 256-bit load would read past the last K element
  568. GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
  569. GMADD xvf, d, D0, S0, Z0, D0, \
  570. D1, S0, Z1, D1
  571. PTR_ADDI X0, X0, 0x08
  572. PTR_ADDI B1, B1, 0x08
  573. PTR_ADDI A0, A0, 0x08
  574. bnez K1, .L_M1_N2_K1
  575. .L_M1_N2_END:
  576. GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA // Dj *= alpha
  577. #ifndef B0
  578. GLD xv, , S0, C0, 0x00, S1, C1, 0x00 // NOTE(review): 256-bit loads, 64-bit stores - same possible over-read of C as in the 1x4 block
  579. GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
  580. #endif
  581. GST f, d, F1, C1, 0x00, \
  582. F0, C0, 0x00
  583. // Advance C0, C1 by two lines (LDC << 1)
  584. PTR_ALSL C0, LDC, C0, 1
  585. PTR_ALSL C1, LDC, C1, 1
  586. // Rewind X0, B1 by K8 bytes, then advance two lines (LDB << 1)
  587. PTR_SUB X0, X0, K8
  588. PTR_SUB B1, B1, K8
  589. PTR_ALSL X0, LDB, X0, 1
  590. PTR_ALSL B1, LDB, B1, 1
  591. // Reset A0 to the start of the row
  592. move A0, A
  593. .L_M1_N1:
  594. andi N1, N, 0x01 // bit 0 of N set: one last single element of C
  595. beqz N1, .L_M0
  596. GXOR xv, v, D0, D0, D0 // clear the accumulator
  597. move K1, K // K1 = K
  598. bge ZERO, K, .L_M1_N1_END // K <= 0: nothing to accumulate
  599. .L_M1_N1_K1:
  600. PTR_ADDI K1, K1, -1
  601. GLDREPL xv, d, S0, A0, 0x00 // 8-byte broadcast load: a 256-bit load would read past the last K element
  602. GLDREPL xv, d, Z0, X0, 0x00
  603. GMADD xvf, d, D0, S0, Z0, D0
  604. PTR_ADDI X0, X0, 0x08
  605. PTR_ADDI A0, A0, 0x08
  606. bnez K1, .L_M1_N1_K1
  607. .L_M1_N1_END:
  608. GMUL xvf, d, D0, D0, VALPHA // D0 *= alpha
  609. #ifndef B0
  610. GLD xv, , S0, C0, 0x00 // NOTE(review): 256-bit load, 64-bit store - same possible over-read of C
  611. GMADD xvf, d, D0, S0, VBETA, D0
  612. #endif
  613. GST f, d, F0, C0, 0x00
  614. // The C0 / X0 / A0 updates below are never read: execution falls through to the exit at .L_M0
  615. PTR_ALSL C0, LDC, C0, 2
  616. // Rewind X0 by K8 bytes and step it forward (value unused, see above)
  617. PTR_SUB X0, X0, K8
  618. PTR_ALSL X0, LDB, X0, 2
  619. // Reset A0 (value unused, see above)
  620. move A0, A
  621. .L_M0: // common exit: reached when all row tiles are done
  622. pop_if_used 8, 2 // counterpart of push_if_used 8, 2 in the prologue (macro from loongarch64_asm.S; presumably restores the saved registers)
  623. jirl $r0, $r1, 0x0 // return: jump to the address in $r1, link value discarded into $r0
  624. EPILOGUE // exit macro supplied by the included headers