You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_small_kernel_nn_lasx.S 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. /***************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /*
   * Build preamble and register map for the double-precision small-matrix
   * GEMM kernel (A and B both non-transposed) using LASX vectors.
   * ASSEMBLER is defined before common.h is included.
   */
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/*
 * Arguments as the kernel body references them.
 * LDC has no argument register here: the prologue loads it from the stack.
 * LDA, LDB and LDC are converted to byte strides (shifted left by 3) in the
 * prologue before any pointer arithmetic uses them.
 */
#define M      $a0
#define N      $a1
#define K      $a2
#define A      $a3
#define LDA    $a4
#define ALPHA  $f0
#define B      $a5
#define LDB    $a6
#define C      $a7
#define LDC    $t0

/*
 * beta is only consumed when B0 is NOT defined (VBETA below and every
 * beta section of the kernel use that guard), so the scalar alias is
 * declared under the same condition.
 */
#ifndef B0
#define BETA   $f1
#endif

#undef  ZERO
#define ZERO   $r0

/*
 * Row-strip counter / masks. They deliberately share one scratch register:
 * only one of them is live at any time.
 */
#define M16    $t1
#define M8     $t1
#define M4     $t1
#define M2     $t1
#define M1     $t1

/* Column-group counter / masks, sharing one scratch register as well. */
#define N4     $t2
#define N2     $t2
#define N1     $t2

/* K * 8, i.e. the number of bytes a B pointer advances in one K loop. */
#define K8     $t3

/* Walking pointer into the current row strip of A. */
#define A0     $t4

/* Walking pointers into up to four consecutive columns of B. */
#define X0     $t5
#define B1     $t6
#define B2     $t7
#define B3     $t8

/* Pointers to up to four consecutive columns of C. */
#define C0     $s0
#define C1     $s1
#define C2     $s2
#define C3     $s3

/* Inner (K) loop counter. */
#define K1     $s4

/* alpha broadcast to all lanes; beta likewise when it is used. */
#define VALPHA $xr0
#ifndef B0
#define VBETA  $xr1
#endif

/* Accumulators. */
#define D0     $xr2
#define D1     $xr3
#define D2     $xr4
#define D3     $xr5
#define D4     $xr6
#define D5     $xr7
#define D6     $xr8
#define D7     $xr9
#define D8     $xr10
#define D9     $xr11
#define D10    $xr12
#define D11    $xr13
#define D12    $xr14
#define D13    $xr15
#define D14    $xr16
#define D15    $xr17

/* Vectors loaded from A (and from C for the beta update). */
#define S0     $xr18
#define S1     $xr19
#define S2     $xr20
#define S3     $xr21

/* Elements of B replicated across a vector. */
#define Z0     $xr22
#define Z1     $xr23
#define Z2     $xr24
#define Z3     $xr25

/*
 * 128-bit and scalar names carrying the same register numbers as D0-D3.
 * They are used for the narrower stores of the 2-row and 1-row tails.
 */
#define V0     $vr2
#define V1     $vr3
#define V2     $vr4
#define V3     $vr5
#define F0     $f2
#define F1     $f3
#define F2     $f4
#define F3     $f5
  /*
   * DGEMM_SMALL_KERNEL_NN_TAIL M
   *
   * Emits the code for one leftover row strip of M rows; it is instantiated
   * with M = 4, 2 and 1. The strip starts at the current A and C pointers.
   *
   * Columns are processed in groups of four (N >> 2 times), then one group
   * of two (N & 2) and one single column (N & 1). For every group the K loop
   * loads one LASX vector from A0, a replicated element from each B column
   * and accumulates into D0..D3. The sums are scaled by VALPHA, combined
   * with VBETA * C unless B0 is defined, and stored with a width that
   * matches M: full vector for 4, 128-bit for 2, scalar double for 1.
   *
   * Expects LDA, LDB and LDC to be byte strides and K8 to hold K * 8.
   * On exit A and C are advanced by M * 8 bytes for M = 4 and M = 2 and are
   * left unchanged for M = 1.
   * Clobbers N4/N2/N1, K1, A0, X0, B1-B3, C0-C3, D0-D3, S0, Z0-Z3.
   *
   * NOTE(review): the loads from A0 and from C0..C3 are always full vectors,
   * whatever M is. For M = 2 and M = 1 they touch bytes beyond the M valid
   * rows; confirm that callers guarantee those bytes are readable.
   */
.macro DGEMM_SMALL_KERNEL_NN_TAIL M
    // One pointer per column of C and of B for the first group of columns
    move           C0,     C
    PTR_ADD        C1,     C0,     LDC
    PTR_ADD        C2,     C1,     LDC
    PTR_ADD        C3,     C2,     LDC
    move           X0,     B
    PTR_ADD        B1,     X0,     LDB
    PTR_ADD        B2,     B1,     LDB
    PTR_ADD        B3,     B2,     LDB
    move           A0,     A
    PTR_SRAI       N4,     N,      2            // number of 4-column groups
    beqz           N4,     .L_M\M\()_N3

    // ---- four columns at a time ----
.L_M\M\()_N4:
    PTR_ADDI       N4,     N4,     -1
    move           K1,     K
    GXOR xv, v, D0, D0, D0, \
                D1, D1, D1, \
                D2, D2, D2, \
                D3, D3, D3
    bge            ZERO,   K,      .L_M\M\()_N4_END   // K <= 0: keep zero sums
.L_M\M\()_N4_K1:
    GLD xv, , S0, A0, 0x00
    GLDREPL xv, d, Z0, X0, 0x00, \
                   Z1, B1, 0x00, \
                   Z2, B2, 0x00, \
                   Z3, B3, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, \
                  D1, S0, Z1, D1, \
                  D2, S0, Z2, D2, \
                  D3, S0, Z3, D3
    // Next k: one column further in A, one element further in each B column
    PTR_ADD        A0,     A0,     LDA
    PTR_ADDI       B3,     B3,     0x08
    PTR_ADDI       B2,     B2,     0x08
    PTR_ADDI       B1,     B1,     0x08
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       K1,     K1,     -1
    bnez           K1,     .L_M\M\()_N4_K1
.L_M\M\()_N4_END:
    GMUL xvf, d, D0, D0, VALPHA, \
                 D1, D1, VALPHA, \
                 D2, D2, VALPHA, \
                 D3, D3, VALPHA
#ifndef B0
    // Add beta * C, one column at a time through S0
    GLD xv, , S0, C0, 0x00
    GMADD xvf, d, D0, S0, VBETA, D0
    GLD xv, , S0, C1, 0x00
    GMADD xvf, d, D1, S0, VBETA, D1
    GLD xv, , S0, C2, 0x00
    GMADD xvf, d, D2, S0, VBETA, D2
    GLD xv, , S0, C3, 0x00
    GMADD xvf, d, D3, S0, VBETA, D3
#endif
    // Store only the M rows that belong to this strip
.if \M == 4
    GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00
.elseif \M == 2
    GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00
.elseif \M == 1
    GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00
.endif
    // Rewind each B pointer by the K * 8 bytes just consumed, then move
    // it four columns ahead
    PTR_SUB        X0,     X0,     K8
    PTR_ALSL       X0,     LDB,    X0,     2
    PTR_SUB        B1,     B1,     K8
    PTR_ALSL       B1,     LDB,    B1,     2
    PTR_SUB        B2,     B2,     K8
    PTR_ALSL       B2,     LDB,    B2,     2
    PTR_SUB        B3,     B3,     K8
    PTR_ALSL       B3,     LDB,    B3,     2
    // Four columns ahead in C
    PTR_ALSL       C0,     LDC,    C0,     2
    PTR_ALSL       C1,     LDC,    C1,     2
    PTR_ALSL       C2,     LDC,    C2,     2
    PTR_ALSL       C3,     LDC,    C3,     2
    // Back to the first column of the A strip
    move           A0,     A
    bnez           N4,     .L_M\M\()_N4

    // ---- two remaining columns ----
.L_M\M\()_N3:
    andi           N2,     N,      0x02
    beqz           N2,     .L_M\M\()_N1
.L_M\M\()_N2:
    move           K1,     K
    GXOR xv, v, D0, D0, D0, \
                D1, D1, D1
    bge            ZERO,   K,      .L_M\M\()_N2_END
.L_M\M\()_N2_K1:
    GLD xv, , S0, A0, 0x00
    GLDREPL xv, d, Z0, X0, 0x00, \
                   Z1, B1, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, \
                  D1, S0, Z1, D1
    PTR_ADD        A0,     A0,     LDA
    PTR_ADDI       B1,     B1,     0x08
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       K1,     K1,     -1
    bnez           K1,     .L_M\M\()_N2_K1
.L_M\M\()_N2_END:
    GMUL xvf, d, D0, D0, VALPHA, \
                 D1, D1, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00
    GMADD xvf, d, D0, S0, VBETA, D0
    GLD xv, , S0, C1, 0x00
    GMADD xvf, d, D1, S0, VBETA, D1
#endif
.if \M == 4
    GST xv, , D0, C0, 0x00, D1, C1, 0x00
.elseif \M == 2
    GST v, , V0, C0, 0x00, V1, C1, 0x00
.elseif \M == 1
    GST f, d, F0, C0, 0x00, F1, C1, 0x00
.endif
    // Rewind the B pointers and step two columns ahead in B and C
    PTR_SUB        X0,     X0,     K8
    PTR_ALSL       X0,     LDB,    X0,     1
    PTR_SUB        B1,     B1,     K8
    PTR_ALSL       B1,     LDB,    B1,     1
    PTR_ALSL       C0,     LDC,    C0,     1
    PTR_ALSL       C1,     LDC,    C1,     1
    move           A0,     A

    // ---- last single column ----
.L_M\M\()_N1:
    andi           N1,     N,      0x01
    beqz           N1,     .L_M\M\()_END
    move           K1,     K
    GXOR xv, v, D0, D0, D0
    bge            ZERO,   K,      .L_M\M\()_N1_END
.L_M\M\()_N1_K1:
    GLD xv, , S0, A0, 0x00
    GLDREPL xv, d, Z0, X0, 0x00
    GMADD xvf, d, D0, S0, Z0, D0
    PTR_ADD        A0,     A0,     LDA
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       K1,     K1,     -1
    bnez           K1,     .L_M\M\()_N1_K1
.L_M\M\()_N1_END:
    GMUL xvf, d, D0, D0, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00
    GMADD xvf, d, D0, S0, VBETA, D0
#endif
.if \M == 4
    GST xv, , D0, C0, 0x00
.elseif \M == 2
    GST v, , V0, C0, 0x00
.elseif \M == 1
    GST f, d, F0, C0, 0x00
.endif

    // ---- strip finished: step A and C past the rows just handled ----
    // Nothing is emitted for M == 1.
.L_M\M\()_END:
.if \M == 4
    PTR_ADDI       A,      A,      0x20
    PTR_ADDI       C,      C,      0x20
.elseif \M == 2
    PTR_ADDI       A,      A,      0x10
    PTR_ADDI       C,      C,      0x10
.endif
.endm
  /*
   * Kernel entry point.
   *
   * Computes C = alpha * A * B, plus beta * C unless B0 is defined.
   * A is walked with a stride of LDA per k step and contiguous rows,
   * B with 8 bytes per k step and LDB per column, C with LDC per column.
   *
   * Rows are handled in strips: 16 rows at a time (M >> 4 iterations), then
   * one strip of 8 (M & 8), then strips of 4, 2 and 1 rows generated by
   * DGEMM_SMALL_KERNEL_NN_TAIL. Inside each strip the columns are handled
   * in groups of 4, then 2, then 1.
   *
   * Every strip re-derives A0, X0/B1-B3 and C0-C3 from A, B and C, so no
   * pointer fix-up is needed after the last column of a strip.
   */
    PROLOGUE

    // LDC is read from offset 0 of the incoming stack pointer. This is done
    // before push_if_used, which presumably adjusts $sp while saving the
    // callee-saved registers used below (see loongarch64_asm.S).
    PTR_LD         LDC,    $sp,    0
    push_if_used   5, 2

    // Broadcast lane 0 of alpha (and beta) to every lane
    xvreplve0.d    VALPHA, VALPHA
#ifndef B0
    xvreplve0.d    VBETA,  VBETA
#endif

    // Element counts -> byte counts (8 bytes per double)
    PTR_SLLI       LDA,    LDA,    3
    PTR_SLLI       LDB,    LDB,    3
    PTR_SLLI       LDC,    LDC,    3
    PTR_SLLI       K8,     K,      3

    PTR_SRAI       M16,    M,      4            // M >> 4
    beqz           M16,    .L_M15

    // ================= 16-row strips =================
.L_M16:
    PTR_SRAI       N4,     N,      2            // N >> 2
    move           A0,     A                    // Restore A0
    move           X0,     B                    // Restore X0
    PTR_ADD        B1,     X0,     LDB
    PTR_ADD        B2,     B1,     LDB
    PTR_ADD        B3,     B2,     LDB
    move           C0,     C                    // Restore C0
    PTR_ADD        C1,     C0,     LDC
    PTR_ADD        C2,     C1,     LDC
    PTR_ADD        C3,     C2,     LDC
    beqz           N4,     .L_M16_N3
.L_M16_N4:
    // 16 rows x 4 columns: D0-D3 column 0, D4-D7 column 1,
    // D8-D11 column 2, D12-D15 column 3
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \
                D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \
                D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15
    move           K1,     K                    // Restore K1
    PTR_ADDI       N4,     N4,     -1
    bge            ZERO,   K,      .L_M16_N4_END   // K <= 0: keep zero sums
.L_M16_N4_K1:
    PTR_ADDI       K1,     K1,     -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \
                  D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \
                  D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \
                  D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       B1,     B1,     0x08
    PTR_ADDI       B2,     B2,     0x08
    PTR_ADDI       B3,     B3,     0x08
    PTR_ADD        A0,     A0,     LDA
    bnez           K1,     .L_M16_N4_K1
.L_M16_N4_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
                 D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \
                 D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
                 D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60
    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7
    GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60
    GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11
    GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60
    GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15
#endif
    GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \
              D8,  C2, 0x00, D9,  C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \
              D4,  C1, 0x00, D5,  C1, 0x20, D6,  C1, 0x40, D7,  C1, 0x60, \
              D0,  C0, 0x00, D1,  C0, 0x20, D2,  C0, 0x40, D3,  C0, 0x60
    // Update C0, C1, C2, C3: four columns ahead
    PTR_ALSL       C0,     LDC,    C0,     2
    PTR_ALSL       C1,     LDC,    C1,     2
    PTR_ALSL       C2,     LDC,    C2,     2
    PTR_ALSL       C3,     LDC,    C3,     2
    // Update X0, B1, B2, B3: rewind the K * 8 bytes consumed, then
    // four columns ahead
    PTR_SUB        X0,     X0,     K8
    PTR_SUB        B1,     B1,     K8
    PTR_SUB        B2,     B2,     K8
    PTR_SUB        B3,     B3,     K8
    PTR_ALSL       X0,     LDB,    X0,     2
    PTR_ALSL       B1,     LDB,    B1,     2
    PTR_ALSL       B2,     LDB,    B2,     2
    PTR_ALSL       B3,     LDB,    B3,     2
    // Restore A0
    move           A0,     A
    bnez           N4,     .L_M16_N4
.L_M16_N3:
    andi           N2,     N,      0x02
    beqz           N2,     .L_M16_N1
.L_M16_N2:
    // 16 rows x 2 columns: D0-D3 column 0, D4-D7 column 1
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7
    move           K1,     K                    // Restore K1
    bge            ZERO,   K,      .L_M16_N2_END
.L_M16_N2_K1:
    PTR_ADDI       K1,     K1,     -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \
                  D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       B1,     B1,     0x08
    PTR_ADD        A0,     A0,     LDA
    bnez           K1,     .L_M16_N2_K1
.L_M16_N2_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
                 D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60
    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7
#endif
    GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \
              D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
    // Update C0, C1: two columns ahead
    PTR_ALSL       C0,     LDC,    C0,     1
    PTR_ALSL       C1,     LDC,    C1,     1
    // Update X0, B1: rewind, then two columns ahead
    PTR_SUB        X0,     X0,     K8
    PTR_SUB        B1,     B1,     K8
    PTR_ALSL       X0,     LDB,    X0,     1
    PTR_ALSL       B1,     LDB,    B1,     1
    // Restore A0
    move           A0,     A
.L_M16_N1:
    andi           N1,     N,      0x01
    beqz           N1,     .L_M16_END
    // 16 rows x 1 column
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
    move           K1,     K                    // Restore K1
    bge            ZERO,   K,      .L_M16_N1_END
.L_M16_N1_K1:
    PTR_ADDI       K1,     K1,     -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
    GLDREPL xv, d, Z0, X0, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADD        A0,     A0,     LDA
    bnez           K1,     .L_M16_N1_K1
.L_M16_N1_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
#endif
    GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
    // Last column of the strip: A0, X0 and C0 are re-derived from A, B and C
    // by whichever strip runs next, so no pointer update follows.
.L_M16_END:
    PTR_ADDI       M16,    M16,    -1
    PTR_ADDI       A,      A,      0x80         // 16 rows * 8 bytes
    PTR_ADDI       C,      C,      0x80
    bnez           M16,    .L_M16

    // ================= one 8-row strip =================
.L_M15:
    andi           M8,     M,      0x08
    beqz           M8,     .L_M7
.L_M8:
    PTR_SRAI       N4,     N,      2            // N >> 2
    move           A0,     A                    // Restore A0
    move           X0,     B                    // Restore X0
    PTR_ADD        B1,     X0,     LDB
    PTR_ADD        B2,     B1,     LDB
    PTR_ADD        B3,     B2,     LDB
    move           C0,     C                    // Restore C0
    PTR_ADD        C1,     C0,     LDC
    PTR_ADD        C2,     C1,     LDC
    PTR_ADD        C3,     C2,     LDC
    beqz           N4,     .L_M8_N3
.L_M8_N4:
    // 8 rows x 4 columns: D0-D1 column 0, D2-D3 column 1,
    // D4-D5 column 2, D6-D7 column 3
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7
    move           K1,     K                    // Restore K1
    PTR_ADDI       N4,     N4,     -1
    bge            ZERO,   K,      .L_M8_N4_END
.L_M8_N4_K1:
    PTR_ADDI       K1,     K1,     -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \
                  D2, S0, Z1, D2, D3, S1, Z1, D3, \
                  D4, S0, Z2, D4, D5, S1, Z2, D5, \
                  D6, S0, Z3, D6, D7, S1, Z3, D7
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       B1,     B1,     0x08
    PTR_ADDI       B2,     B2,     0x08
    PTR_ADDI       B3,     B3,     0x08
    PTR_ADD        A0,     A0,     LDA
    bnez           K1,     .L_M8_N4_K1
.L_M8_N4_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
                 D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20
    GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3
    GLD xv, , S0, C2, 0x00, S1, C2, 0x20
    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5
    GLD xv, , S0, C3, 0x00, S1, C3, 0x20
    GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7
#endif
    GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \
              D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20
    // Update C0, C1, C2, C3: four columns ahead
    PTR_ALSL       C0,     LDC,    C0,     2
    PTR_ALSL       C1,     LDC,    C1,     2
    PTR_ALSL       C2,     LDC,    C2,     2
    PTR_ALSL       C3,     LDC,    C3,     2
    // Update X0, B1, B2, B3: rewind, then four columns ahead
    PTR_SUB        X0,     X0,     K8
    PTR_SUB        B1,     B1,     K8
    PTR_SUB        B2,     B2,     K8
    PTR_SUB        B3,     B3,     K8
    PTR_ALSL       X0,     LDB,    X0,     2
    PTR_ALSL       B1,     LDB,    B1,     2
    PTR_ALSL       B2,     LDB,    B2,     2
    PTR_ALSL       B3,     LDB,    B3,     2
    // Restore A0
    move           A0,     A
    bnez           N4,     .L_M8_N4
.L_M8_N3:
    andi           N2,     N,      0x02
    beqz           N2,     .L_M8_N1
.L_M8_N2:
    // 8 rows x 2 columns: D0-D1 column 0, D2-D3 column 1
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
    move           K1,     K                    // Restore K1
    bge            ZERO,   K,      .L_M8_N2_END
.L_M8_N2_K1:
    PTR_ADDI       K1,     K1,     -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
    GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \
                  D2, S0, Z1, D2, D3, S1, Z1, D3
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADDI       B1,     B1,     0x08
    PTR_ADD        A0,     A0,     LDA
    bnez           K1,     .L_M8_N2_K1
.L_M8_N2_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20
    GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3
#endif
    GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20
    // Update C0, C1: two columns ahead
    PTR_ALSL       C0,     LDC,    C0,     1
    PTR_ALSL       C1,     LDC,    C1,     1
    // Update X0, B1: rewind, then two columns ahead
    PTR_SUB        X0,     X0,     K8
    PTR_SUB        B1,     B1,     K8
    PTR_ALSL       X0,     LDB,    X0,     1
    PTR_ALSL       B1,     LDB,    B1,     1
    // Restore A0
    move           A0,     A
.L_M8_N1:
    andi           N1,     N,      0x01
    beqz           N1,     .L_M8_END
    // 8 rows x 1 column
    GXOR xv, v, D0, D0, D0, D1, D1, D1
    move           K1,     K                    // Restore K1
    bge            ZERO,   K,      .L_M8_N1_END
.L_M8_N1_K1:
    PTR_ADDI       K1,     K1,     -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
    GLDREPL xv, d, Z0, X0, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1
    PTR_ADDI       X0,     X0,     0x08
    PTR_ADD        A0,     A0,     LDA
    bnez           K1,     .L_M8_N1_K1
.L_M8_N1_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
#endif
    GST xv, , D0, C0, 0x00, D1, C0, 0x20
.L_M8_END:
    PTR_ADDI       A,      A,      0x40         // 8 rows * 8 bytes
    PTR_ADDI       C,      C,      0x40

    // ================= 4-, 2- and 1-row tails =================
.L_M7:
    andi           M4,     M,      0x04
    beqz           M4,     .L_M3
.L_M4:
    DGEMM_SMALL_KERNEL_NN_TAIL 4
.L_M3:
    andi           M2,     M,      0x02
    beqz           M2,     .L_M1
.L_M2:
    DGEMM_SMALL_KERNEL_NN_TAIL 2
.L_M1:
    andi           M1,     M,      0x01
    beqz           M1,     .L_M0
    DGEMM_SMALL_KERNEL_NN_TAIL 1
.L_M0:
    pop_if_used    5, 2
    jirl           $r0,    $r1,    0x0
    EPILOGUE