You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_sve_v1x8.S 20 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  // Kernel arguments, held in the registers listed in the CNAME prototype comment above.
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  // Working registers: temp is scratch; counterL / counterI / counterJ drive the
  // K, M and N loops respectively.
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  // pB and pA are the running B and A pointers; pCRow0-2 address the C rows being updated.
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  // lanes receives the number of active lanes in p1 (written with cntp).
  46. #define lanes x15
  47. #define pA x16
  // alpha receives the bits of s0 (fmov) and is broadcast into alphaZ (dup).
  // NOTE(review): alpha0 is not referenced anywhere in the code visible here.
  48. #define alpha w17
  49. #define alpha0 s10
  50. #define alphaZ z2.s
  // Byte offsets added to the A, B and C pointers in the prfm prefetch hints.
  51. #define A_PRE_SIZE 1536
  52. #define B_PRE_SIZE 512
  53. #define C_PRE_SIZE 128
  54. // 00 origM
  55. // 01 origN
  56. // 02 origK
  57. // 03 origPA
  58. // 04 origPB
  59. // 05 pC
  60. // 06 origLDC -> LDC
  61. // 07 temp
  62. // 08 counterL
  63. // 09 counterI
  64. // 10 counterJ
  65. // 11 pB
  66. // 12 pCRow0
  67. // 13 pCRow1
  68. // 14 pCRow2
  69. // 15 lanes
  70. // 16 pA
  71. // 17 alpha (used as w17)
  72. // 18 must save
  73. // 19 must save
  74. // 20 must save
  75. // 21 must save
  76. // 22 must save
  77. // 23 must save
  78. // 24 must save
  79. // 25 must save
  80. // 26 must save
  81. // 27 must save
  82. // 28 must save
  83. // 29 frame
  84. // 30 link
  85. // 31 sp
  86. //v00 ALPHA -> pA0_0
  87. //v01 pA0_1
  88. //v02 ALPHA0
  89. //v03
  90. //v04
  91. //v05
  92. //v06
  93. //v07
  94. //v08 must save pB0_0
  95. //v09 must save pB0_1
  96. //v10 must save pB0_2
  97. //v11 must save pB0_3
  98. //v12 must save pB0_4
  99. //v13 must save pB0_5
  100. //v14 must save pB0_6
  101. //v15 must save pB0_7
  102. //v16 C0 (accumulator; v16-v31 are caller-saved under AAPCS64)
  103. //v17 C1
  104. //v18 C2
  105. //v19 C3
  106. //v20 C4
  107. //v21 C5
  108. //v22 C6
  109. //v23 C7
  110. /*******************************************************************************
  111. * Macro definitions
  112. *******************************************************************************/
  // Zero the eight accumulator vectors z16-z23 used by the v1x8 kernel macros.
  113. .macro INITv1x8
  114. dup z16.s, #0
  115. dup z17.s, #0
  116. dup z18.s, #0
  117. dup z19.s, #0
  118. dup z20.s, #0
  119. dup z21.s, #0
  120. dup z22.s, #0
  121. dup z23.s, #0
  122. .endm
  // Opening step of the pipelined v1x8 sequence. Loads two consecutive A vectors
  // (z0, z1) and eight B words, accumulates z0 * B into z16-z23, and leaves z1 and
  // a fresh set of B words in z8-z15 for the step that follows (KERNELv1x8_M2 or
  // KERNELv1x8_E, both of which multiply by z1).
  // p1 predicates the A loads and the fmla updates; p0 predicates the B loads.
  123. .macro KERNELv1x8_I
  124. ld1w z0.s, p1/z, [pA] // A vector consumed in this step
  125. ld1w z1.s, p1/z, [pA, lanes, lsl #2] // A vector for the next step, lanes * 4 bytes further on
  126. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4
  127. ld1rw z8.s, p0/z, [pB] // broadcast eight consecutive B words into z8-z15
  128. ld1rw z9.s, p0/z, [pB, 4]
  129. ld1rw z10.s, p0/z, [pB, 8]
  130. ld1rw z11.s, p0/z, [pB, 12]
  131. ld1rw z12.s, p0/z, [pB, 16]
  132. ld1rw z13.s, p0/z, [pB, 20]
  133. ld1rw z14.s, p0/z, [pB, 24]
  134. ld1rw z15.s, p0/z, [pB, 28]
  135. add pB, pB, 32 // pB += 8 words
  // Each fmla is followed by a reload of the B register it has just consumed,
  // so z8-z15 end up holding the B words of the next step.
  136. fmla z16.s, p1/m, z0.s, z8.s
  137. ld1rw z8.s, p0/z, [pB]
  138. fmla z17.s, p1/m, z0.s, z9.s
  139. ld1rw z9.s, p0/z, [pB, 4]
  140. fmla z18.s, p1/m, z0.s, z10.s
  141. ld1rw z10.s, p0/z, [pB, 8]
  142. fmla z19.s, p1/m, z0.s, z11.s
  143. ld1rw z11.s, p0/z, [pB, 12]
  144. fmla z20.s, p1/m, z0.s, z12.s
  145. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch hint ahead of the A pointer
  146. ld1rw z12.s, p0/z, [pB, 16]
  147. fmla z21.s, p1/m, z0.s, z13.s
  148. ld1rw z13.s, p0/z, [pB, 20]
  149. fmla z22.s, p1/m, z0.s, z14.s
  150. ld1rw z14.s, p0/z, [pB, 24]
  151. fmla z23.s, p1/m, z0.s, z15.s
  152. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  153. ld1rw z15.s, p0/z, [pB, 28]
  154. add pB, pB, 32 // pB += 8 words (second set)
  155. .endm
  // Pipelined middle step that consumes z0: accumulates z0 * z8-z15 into z16-z23
  // while loading the next A vector into z1 and refilling z8-z15 from pB.
  // Expects z0 and z8-z15 to have been loaded by the preceding step.
  156. .macro KERNELv1x8_M1
  157. ld1w z1.s, p1/z, [pA] // A vector for the following step
  158. add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
  159. fmla z16.s, p1/m, z0.s, z8.s
  160. ld1rw z8.s, p0/z, [pB] // reload the B register just consumed
  161. fmla z17.s, p1/m, z0.s, z9.s
  162. ld1rw z9.s, p0/z, [pB, 4]
  163. fmla z18.s, p1/m, z0.s, z10.s
  164. ld1rw z10.s, p0/z, [pB, 8]
  165. fmla z19.s, p1/m, z0.s, z11.s
  166. ld1rw z11.s, p0/z, [pB, 12]
  167. fmla z20.s, p1/m, z0.s, z12.s
  168. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch hint ahead of the A pointer
  169. ld1rw z12.s, p0/z, [pB, 16]
  170. fmla z21.s, p1/m, z0.s, z13.s
  171. ld1rw z13.s, p0/z, [pB, 20]
  172. fmla z22.s, p1/m, z0.s, z14.s
  173. ld1rw z14.s, p0/z, [pB, 24]
  174. fmla z23.s, p1/m, z0.s, z15.s
  175. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  176. ld1rw z15.s, p0/z, [pB, 28]
  177. add pB, pB, 32 // pB += 8 words
  178. .endm
  // Pipelined middle step that consumes z1: accumulates z1 * z8-z15 into z16-z23
  // while loading the next A vector into z0 and refilling z8-z15 from pB.
  // Counterpart of KERNELv1x8_M1 with the roles of z0 and z1 swapped; this
  // variant prefetches ahead of pB instead of pA.
  179. .macro KERNELv1x8_M2
  180. ld1w z0.s, p1/z, [pA] // A vector for the following step
  181. add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
  182. fmla z16.s, p1/m, z1.s, z8.s
  183. ld1rw z8.s, p0/z, [pB] // reload the B register just consumed
  184. fmla z17.s, p1/m, z1.s, z9.s
  185. ld1rw z9.s, p0/z, [pB, 4]
  186. fmla z18.s, p1/m, z1.s, z10.s
  187. ld1rw z10.s, p0/z, [pB, 8]
  188. fmla z19.s, p1/m, z1.s, z11.s
  189. ld1rw z11.s, p0/z, [pB, 12]
  190. fmla z20.s, p1/m, z1.s, z12.s
  191. ld1rw z12.s, p0/z, [pB, 16]
  192. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // prefetch hint ahead of the B pointer
  193. fmla z21.s, p1/m, z1.s, z13.s
  194. ld1rw z13.s, p0/z, [pB, 20]
  195. fmla z22.s, p1/m, z1.s, z14.s
  196. ld1rw z14.s, p0/z, [pB, 24]
  197. fmla z23.s, p1/m, z1.s, z15.s
  198. ld1rw z15.s, p0/z, [pB, 28]
  199. add pB, pB, 32 // pB += 8 words
  200. .endm
  // Closing step of the pipelined v1x8 sequence: accumulates z1 * z8-z15 into
  // z16-z23. It loads nothing further and does not advance pA or pB.
  201. .macro KERNELv1x8_E
  202. fmla z16.s, p1/m, z1.s, z8.s
  203. fmla z17.s, p1/m, z1.s, z9.s
  204. fmla z18.s, p1/m, z1.s, z10.s
  205. fmla z19.s, p1/m, z1.s, z11.s
  206. fmla z20.s, p1/m, z1.s, z12.s
  207. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // prefetch hint ahead of the B pointer
  208. fmla z21.s, p1/m, z1.s, z13.s
  209. fmla z22.s, p1/m, z1.s, z14.s
  210. fmla z23.s, p1/m, z1.s, z15.s
  211. .endm
  // Self-contained (non-pipelined) v1x8 step: loads one A vector and eight B
  // words, advances both pointers, then accumulates z0 * z8-z15 into z16-z23.
  // It does not depend on registers preloaded by a previous step.
  212. .macro KERNELv1x8_SUB
  213. ld1w z0.s, p1/z, [pA]
  214. add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
  215. ld1rw z8.s, p0/z, [pB] // broadcast eight consecutive B words into z8-z15
  216. ld1rw z9.s, p0/z, [pB, 4]
  217. ld1rw z10.s, p0/z, [pB, 8]
  218. ld1rw z11.s, p0/z, [pB, 12]
  219. ld1rw z12.s, p0/z, [pB, 16]
  220. ld1rw z13.s, p0/z, [pB, 20]
  221. ld1rw z14.s, p0/z, [pB, 24]
  222. ld1rw z15.s, p0/z, [pB, 28]
  223. add pB, pB, 32 // pB += 8 words
  224. fmla z16.s, p1/m, z0.s, z8.s
  225. fmla z17.s, p1/m, z0.s, z9.s
  226. fmla z18.s, p1/m, z0.s, z10.s
  227. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch hint ahead of the A pointer
  228. fmla z19.s, p1/m, z0.s, z11.s
  229. fmla z20.s, p1/m, z0.s, z12.s
  230. fmla z21.s, p1/m, z0.s, z13.s
  231. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // prefetch hint ahead of the B pointer
  232. fmla z22.s, p1/m, z0.s, z14.s
  233. fmla z23.s, p1/m, z0.s, z15.s
  234. .endm
  // Write back eight rows of C. For each row the current C values are loaded
  // into a temporary (z24-z31), updated as C += accumulator * alphaZ on the
  // lanes active in p1, and stored back. Successive rows are LDC apart;
  // pCRow1 and pCRow2 take turns as the row pointer. On exit pCRow0 has been
  // advanced by lanes * 4 bytes.
  235. .macro SAVEv1x8
  236. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  237. add pCRow1, pCRow0, LDC // row 1
  238. ld1w z24.s, p1/z, [pCRow0]
  239. fmla z24.s, p1/m, z16.s, alphaZ // row 0: C += z16 * alpha
  240. st1w z24.s, p1, [pCRow0]
  241. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  242. add pCRow2, pCRow1, LDC // row 2
  243. ld1w z25.s, p1/z, [pCRow1]
  244. fmla z25.s, p1/m, z17.s, alphaZ
  245. st1w z25.s, p1, [pCRow1]
  246. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  247. add pCRow1, pCRow2, LDC // row 3
  248. ld1w z26.s, p1/z, [pCRow2]
  249. fmla z26.s, p1/m, z18.s, alphaZ
  250. st1w z26.s, p1, [pCRow2]
  251. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  252. add pCRow2, pCRow1, LDC // row 4
  253. ld1w z27.s, p1/z, [pCRow1]
  254. fmla z27.s, p1/m, z19.s, alphaZ
  255. st1w z27.s, p1, [pCRow1]
  256. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  257. add pCRow1, pCRow2, LDC // row 5
  258. ld1w z28.s, p1/z, [pCRow2]
  259. fmla z28.s, p1/m, z20.s, alphaZ
  260. st1w z28.s, p1, [pCRow2]
  261. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  262. add pCRow2, pCRow1, LDC // row 6
  263. ld1w z29.s, p1/z, [pCRow1]
  264. fmla z29.s, p1/m, z21.s, alphaZ
  265. st1w z29.s, p1, [pCRow1]
  266. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  267. add pCRow1, pCRow2, LDC // row 7
  268. ld1w z30.s, p1/z, [pCRow2]
  269. fmla z30.s, p1/m, z22.s, alphaZ
  270. st1w z30.s, p1, [pCRow2]
  271. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  272. ld1w z31.s, p1/z, [pCRow1]
  273. fmla z31.s, p1/m, z23.s, alphaZ
  274. st1w z31.s, p1, [pCRow1]
  275. add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
  276. .endm
  277. /******************************************************************************/
  // Zero the four accumulator vectors z16-z19 used by the v1x4 kernel macros.
  278. .macro INITv1x4
  279. dup z16.s, #0
  280. dup z17.s, #0
  281. dup z18.s, #0
  282. dup z19.s, #0
  283. .endm
  // Single v1x4 step: loads one A vector (p1) and four broadcast B words (p0),
  // advances pA by lanes * 4 bytes and pB by 16 bytes, then accumulates
  // z0 * z8-z11 into z16-z19.
  284. .macro KERNELv1x4_SUB
  285. ld1w z0.s, p1/z, [pA]
  286. add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
  287. ld1rw z8.s, p0/z, [pB]
  288. ld1rw z9.s, p0/z, [pB, 4]
  289. ld1rw z10.s, p0/z, [pB, 8]
  290. ld1rw z11.s, p0/z, [pB, 12]
  291. add pB, pB, 16 // pB += 4 words
  292. fmla z16.s, p1/m, z0.s, z8.s
  293. fmla z17.s, p1/m, z0.s, z9.s
  294. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch hint ahead of the A pointer
  295. fmla z18.s, p1/m, z0.s, z10.s
  296. fmla z19.s, p1/m, z0.s, z11.s
  297. .endm
  // Write back four rows of C: each row is loaded into a temporary (z24-z27),
  // updated as C += accumulator (z16-z19) * alphaZ on the lanes active in p1,
  // and stored. Rows are LDC apart; pCRow0 is then advanced by lanes * 4 bytes.
  298. .macro SAVEv1x4
  299. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  300. add pCRow1, pCRow0, LDC // row 1
  301. ld1w z24.s, p1/z, [pCRow0]
  302. fmla z24.s, p1/m, z16.s, alphaZ // row 0: C += z16 * alpha
  303. st1w z24.s, p1, [pCRow0]
  304. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  305. add pCRow2, pCRow1, LDC // row 2
  306. ld1w z25.s, p1/z, [pCRow1]
  307. fmla z25.s, p1/m, z17.s, alphaZ
  308. st1w z25.s, p1, [pCRow1]
  309. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  310. add pCRow1, pCRow2, LDC // row 3
  311. ld1w z26.s, p1/z, [pCRow2]
  312. fmla z26.s, p1/m, z18.s, alphaZ
  313. st1w z26.s, p1, [pCRow2]
  314. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  315. ld1w z27.s, p1/z, [pCRow1]
  316. fmla z27.s, p1/m, z19.s, alphaZ
  317. st1w z27.s, p1, [pCRow1]
  318. add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
  319. .endm
  320. /******************************************************************************/
  // Zero the two accumulator vectors z16-z17 used by the v1x2 kernel macros.
  321. .macro INITv1x2
  322. dup z16.s, #0
  323. dup z17.s, #0
  324. .endm
  // Single v1x2 step: loads one A vector (p1) and two broadcast B words (p0),
  // advances pA by lanes * 4 bytes and pB by 8 bytes, then accumulates
  // z0 * z8-z9 into z16-z17.
  325. .macro KERNELv1x2_SUB
  326. ld1w z0.s, p1/z, [pA]
  327. add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
  328. ld1rw z8.s, p0/z, [pB]
  329. ld1rw z9.s, p0/z, [pB, 4]
  330. add pB, pB, 8 // pB += 2 words
  331. fmla z16.s, p1/m, z0.s, z8.s
  332. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch hint ahead of the A pointer
  333. fmla z17.s, p1/m, z0.s, z9.s
  334. .endm
  // Write back two rows of C: each row is loaded into a temporary (z24-z25),
  // updated as C += accumulator (z16-z17) * alphaZ on the lanes active in p1,
  // and stored. The second row is LDC after the first; pCRow0 is then advanced
  // by lanes * 4 bytes.
  335. .macro SAVEv1x2
  336. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  337. add pCRow1, pCRow0, LDC // row 1
  338. ld1w z24.s, p1/z, [pCRow0]
  339. fmla z24.s, p1/m, z16.s, alphaZ // row 0: C += z16 * alpha
  340. st1w z24.s, p1, [pCRow0]
  341. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  342. ld1w z25.s, p1/z, [pCRow1]
  343. fmla z25.s, p1/m, z17.s, alphaZ
  344. st1w z25.s, p1, [pCRow1]
  345. add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
  346. .endm
  347. /******************************************************************************/
  // Zero the single accumulator vector z16 used by the v1x1 kernel macros.
  348. .macro INITv1x1
  349. dup z16.s, #0
  350. .endm
  // Single v1x1 step: loads one A vector (p1) and one broadcast B word (p0),
  // advances pA by lanes * 4 bytes and pB by 4 bytes, then accumulates
  // z0 * z8 into z16.
  351. .macro KERNELv1x1_SUB
  352. ld1w z0.s, p1/z, [pA]
  353. add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4
  354. ld1rw z8.s, p0/z, [pB]
  355. add pB, pB, 4 // pB += 1 word
  356. fmla z16.s, p1/m, z0.s, z8.s
  357. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch hint ahead of the A pointer
  358. .endm
  // Write back one row of C: loads it into z24, updates it as
  // C += z16 * alphaZ on the lanes active in p1, stores it, and advances
  // pCRow0 by lanes * 4 bytes.
  359. .macro SAVEv1x1
  360. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  361. ld1w z24.s, p1/z, [pCRow0]
  362. fmla z24.s, p1/m, z16.s, alphaZ
  363. st1w z24.s, p1, [pCRow0]
  364. add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4
  365. .endm
  366. /*******************************************************************************
  367. * End of macro definitions
  368. *******************************************************************************/
  // Kernel entry: spill registers to the stack, broadcast alpha, convert LDC
  // from elements to bytes, and dispatch to the loop over groups of 8 in N.
  // PROLOGUE is not defined in this file (presumably supplied through common.h — confirm).
  369. PROLOGUE
  370. .align 5
  371. add sp, sp, #-(11 * 16) // reserve 11 x 16 bytes of stack
  372. stp d8, d9, [sp, #(0 * 16)]
  373. stp d10, d11, [sp, #(1 * 16)]
  374. stp d12, d13, [sp, #(2 * 16)]
  375. stp d14, d15, [sp, #(3 * 16)]
  376. stp d16, d17, [sp, #(4 * 16)]
  // NOTE(review): x18-x28 are saved and restored but never written by the code visible in this file.
  377. stp x18, x19, [sp, #(5 * 16)]
  378. stp x20, x21, [sp, #(6 * 16)]
  379. stp x22, x23, [sp, #(7 * 16)]
  380. stp x24, x25, [sp, #(8 * 16)]
  381. stp x26, x27, [sp, #(9 * 16)]
  382. str x28, [sp, #(10 * 16)]
  383. prfm PLDL1KEEP, [origPB]
  384. prfm PLDL1KEEP, [origPA]
  385. fmov alpha, s0 // copy the bits of the alpha argument into w17
  386. dup alphaZ, alpha // broadcast alpha to every .s element of z2
  387. lsl LDC, LDC, #2 // ldc = ldc * 4
  388. ptrue p0.s // create true predicate
  389. mov pB, origPB
  390. // Loop over N
  391. mov counterJ, origN
  392. asr counterJ, counterJ, #3 // J = J / 8
  393. cmp counterJ, #0
  394. ble .Ldgemm_kernel_L4_BEGIN // no full group of 8 in N: skip to the 4-wide section
  395. /******************************************************************************/
  396. /* Repeat this as long as there are 8 left in N */
  // NOTE(review): labels carry a "dgemm" prefix although the code operates on
  // 32-bit (.s) elements and the listing header names the file sgemm_kernel_sve_v1x8.S.
  397. .align 5
  398. .Ldgemm_kernel_L8_BEGIN:
  399. mov pCRow0, pC
  400. add pC, pC, LDC, lsl #3 // add 8 x LDC
  401. mov pA, origPA // pA = start of A array
  402. .Ldgemm_kernel_L8_Mv1_BEGIN:
  403. /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
  404. mov counterI, #0
  405. whilelt p1.s, counterI, origM // p1 = lanes whose index counterI + i is below origM
  406. cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
  407. .align 5
  408. .Ldgemm_kernel_L8_Mv1_20:
  409. mov pB, origPB // restart B for this M tile
  410. INITv1x8 // fill with zeros
  411. asr counterL , origK, #3 // L = K / 8
  412. cmp counterL , #2 // at least two blocks of 8 K steps?
  413. blt .Ldgemm_kernel_L8_Mv1_32
  // First block of 8 K steps; it ends with _M2, so the pipeline stays primed for another block.
  414. KERNELv1x8_I
  415. KERNELv1x8_M2
  416. KERNELv1x8_M1
  417. KERNELv1x8_M2
  418. KERNELv1x8_M1
  419. KERNELv1x8_M2
  420. KERNELv1x8_M1
  421. KERNELv1x8_M2
  422. subs counterL, counterL, #2 // subtract 2: the block above and the closing block at _22a
  423. ble .Ldgemm_kernel_L8_Mv1_22a
  424. .align 5
  // Middle blocks: 8 K steps per iteration, executed counterL times.
  425. .Ldgemm_kernel_L8_Mv1_22:
  426. KERNELv1x8_M1
  427. KERNELv1x8_M2
  428. KERNELv1x8_M1
  429. KERNELv1x8_M2
  430. KERNELv1x8_M1
  431. KERNELv1x8_M2
  432. KERNELv1x8_M1
  433. KERNELv1x8_M2
  434. subs counterL, counterL, #1
  435. bgt .Ldgemm_kernel_L8_Mv1_22
  436. .align 5
  // Closing block of 8 K steps; _E drains the pipeline without loading further data.
  437. .Ldgemm_kernel_L8_Mv1_22a:
  438. KERNELv1x8_M1
  439. KERNELv1x8_M2
  440. KERNELv1x8_M1
  441. KERNELv1x8_M2
  442. KERNELv1x8_M1
  443. KERNELv1x8_M2
  444. KERNELv1x8_M1
  445. KERNELv1x8_E
  446. b .Ldgemm_kernel_L8_Mv1_44
  447. .align 5
  // Reached when K / 8 < 2: run a single self-contained block of 8 if bit 0 of counterL is set.
  448. .Ldgemm_kernel_L8_Mv1_32:
  449. tst counterL, #1
  // NOTE(review): after tst the masked result is never negative, so this ble is
  // taken exactly when the tested bit is clear (it acts like b.eq).
  450. ble .Ldgemm_kernel_L8_Mv1_40
  451. KERNELv1x8_I
  452. KERNELv1x8_M2
  453. KERNELv1x8_M1
  454. KERNELv1x8_M2
  455. KERNELv1x8_M1
  456. KERNELv1x8_M2
  457. KERNELv1x8_M1
  458. KERNELv1x8_E
  459. b .Ldgemm_kernel_L8_Mv1_44
  460. .Ldgemm_kernel_L8_Mv1_40:
  461. INITv1x8 // zeroes the accumulators again (INITv1x8 already ran at _20 on this path)
  462. .Ldgemm_kernel_L8_Mv1_44:
  463. ands counterL , origK, #7 // remaining K steps: K mod 8
  464. ble .Ldgemm_kernel_L8_Mv1_100
  465. .align 5
  466. .Ldgemm_kernel_L8_Mv1_46:
  467. KERNELv1x8_SUB // one non-pipelined step per remaining K
  468. subs counterL, counterL, #1
  469. bne .Ldgemm_kernel_L8_Mv1_46
  470. .Ldgemm_kernel_L8_Mv1_100:
  471. prfm PLDL1KEEP, [pA]
  472. prfm PLDL1KEEP, [pA, #64]
  473. prfm PLDL1KEEP, [origPB]
  474. SAVEv1x8 // C tile += alpha * accumulators; pCRow0 moves to the next M tile
  475. .Ldgemm_kernel_L8_Mv1_END:
  476. incw counterI // advance the M index by one vector of 32-bit elements
  477. whilelt p1.s, counterI, origM //SVE instruction
  478. cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
  479. b.any .Ldgemm_kernel_L8_Mv1_20 // another M tile while p1 still has an active lane
  480. .Ldgemm_kernel_L8_END:
  481. lsl temp, origK, #5
  482. add origPB, origPB, temp // B = B + K * 8 * 4
  483. subs counterJ, counterJ , #1 // j--
  484. bgt .Ldgemm_kernel_L8_BEGIN
  485. /******************************************************************************/
  486. /* Repeat the same thing if 4 left in N */
  487. .align 5
  488. .Ldgemm_kernel_L4_BEGIN:
  489. mov counterJ , origN
  490. tst counterJ , #4 // is bit 2 of N set?
  491. ble .Ldgemm_kernel_L2_BEGIN // taken when the tested bit is clear
  492. mov pCRow0, pC
  493. add pC, pC, LDC, lsl #2 // add 4 x LDC
  494. mov pA, origPA // pA = start of A array
  495. .Ldgemm_kernel_L4_Mv1_BEGIN:
  496. mov counterI, #0
  497. whilelt p1.s, counterI, origM //SVE instruction
  498. cntp lanes, p0, p1.s // lanes = number of active lanes in p1
  499. .align 5
  500. .Ldgemm_kernel_L4_Mv1_20:
  501. mov pB, origPB // restart B for this M tile
  502. INITv1x4 // fill with zeros
  503. asr counterL , origK, #3 // L = K / 8
  504. cmp counterL , #0 // at least one block of 8 K steps?
  505. ble .Ldgemm_kernel_L4_Mv1_44
  506. .align 5
  // 8 K steps per iteration, with a B prefetch hint before every pair of steps.
  507. .Ldgemm_kernel_L4_Mv1_22:
  508. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  509. KERNELv1x4_SUB
  510. KERNELv1x4_SUB
  511. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  512. KERNELv1x4_SUB
  513. KERNELv1x4_SUB
  514. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  515. KERNELv1x4_SUB
  516. KERNELv1x4_SUB
  517. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  518. KERNELv1x4_SUB
  519. KERNELv1x4_SUB
  520. subs counterL, counterL, #1
  521. bgt .Ldgemm_kernel_L4_Mv1_22
  522. .Ldgemm_kernel_L4_Mv1_44:
  523. ands counterL , origK, #7 // remaining K steps: K mod 8
  524. ble .Ldgemm_kernel_L4_Mv1_100
  525. .align 5
  526. .Ldgemm_kernel_L4_Mv1_46:
  527. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  528. KERNELv1x4_SUB
  529. subs counterL, counterL, #1
  530. bne .Ldgemm_kernel_L4_Mv1_46
  531. .Ldgemm_kernel_L4_Mv1_100:
  532. prfm PLDL1KEEP, [pA]
  533. prfm PLDL1KEEP, [pA, #64]
  534. prfm PLDL1KEEP, [origPB]
  535. SAVEv1x4 // C tile += alpha * accumulators; pCRow0 moves to the next M tile
  536. .Ldgemm_kernel_L4_Mv1_END:
  537. incw counterI // advance the M index by one vector of 32-bit elements
  538. whilelt p1.s, counterI, origM //SVE instruction
  539. cntp lanes, p0, p1.s
  540. b.any .Ldgemm_kernel_L4_Mv1_20 // another M tile while p1 still has an active lane
  541. .Ldgemm_kernel_L4_END:
  542. lsl temp, origK, #4
  543. add origPB, origPB, temp // B = B + K * 4 * 4
  544. /******************************************************************************/
  545. /* Repeat the same thing if 2 left in N */
  546. .align 5
  547. .Ldgemm_kernel_L2_BEGIN:
  548. mov counterJ , origN
  549. tst counterJ , #2 // is bit 1 of N set?
  550. ble .Ldgemm_kernel_L1_BEGIN // taken when the tested bit is clear
  551. mov pCRow0, pC
  552. add pC, pC, LDC, lsl #1 // add 2 x LDC
  553. mov pA, origPA // pA = start of A array
  554. .Ldgemm_kernel_L2_Mv1_BEGIN:
  555. mov counterI, #0
  556. whilelt p1.s, counterI, origM //SVE instruction
  557. cntp lanes, p0, p1.s // lanes = number of active lanes in p1
  558. .align 5
  559. .Ldgemm_kernel_L2_Mv1_20:
  560. mov pB, origPB // restart B for this M tile
  561. INITv1x2 // fill with zeros
  562. asr counterL , origK, #3 // L = K / 8
  563. cmp counterL , #0 // at least one block of 8 K steps?
  564. ble .Ldgemm_kernel_L2_Mv1_44
  565. .align 5
  // 8 K steps per iteration, with a B prefetch hint before every four steps.
  566. .Ldgemm_kernel_L2_Mv1_22:
  567. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  568. KERNELv1x2_SUB
  569. KERNELv1x2_SUB
  570. KERNELv1x2_SUB
  571. KERNELv1x2_SUB
  572. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  573. KERNELv1x2_SUB
  574. KERNELv1x2_SUB
  575. KERNELv1x2_SUB
  576. KERNELv1x2_SUB
  577. subs counterL, counterL, #1
  578. bgt .Ldgemm_kernel_L2_Mv1_22
  579. .Ldgemm_kernel_L2_Mv1_44:
  580. ands counterL , origK, #7 // remaining K steps: K mod 8
  581. ble .Ldgemm_kernel_L2_Mv1_100
  582. .align 5
  583. .Ldgemm_kernel_L2_Mv1_46:
  584. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  585. KERNELv1x2_SUB
  586. subs counterL, counterL, #1
  587. bne .Ldgemm_kernel_L2_Mv1_46
  588. .Ldgemm_kernel_L2_Mv1_100:
  589. prfm PLDL1KEEP, [pA]
  590. prfm PLDL1KEEP, [pA, #64]
  591. prfm PLDL1KEEP, [origPB]
  592. SAVEv1x2 // C tile += alpha * accumulators; pCRow0 moves to the next M tile
  593. .Ldgemm_kernel_L2_Mv1_END:
  594. incw counterI // advance the M index by one vector of 32-bit elements
  595. whilelt p1.s, counterI, origM //SVE instruction
  596. cntp lanes, p0, p1.s
  597. b.any .Ldgemm_kernel_L2_Mv1_20 // another M tile while p1 still has an active lane
  598. .Ldgemm_kernel_L2_END:
  599. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  600. /******************************************************************************/
  601. /* Repeat the same thing if 1 left in N */
  602. .align 5
  603. .Ldgemm_kernel_L1_BEGIN:
  604. mov counterJ , origN
  605. tst counterJ , #1 // is bit 0 of N set?
  606. ble .Ldgemm_kernel_L999 // done
  607. mov pCRow0, pC
  608. add pC, pC, LDC // add 1 x LDC
  609. mov pA, origPA // pA = start of A array
  610. .Ldgemm_kernel_L1_Mv1_BEGIN:
  611. mov counterI, #0
  612. whilelt p1.s, counterI, origM //SVE instruction
  613. cntp lanes, p0, p1.s // lanes = number of active lanes in p1
  614. .align 5
  615. .Ldgemm_kernel_L1_Mv1_20:
  616. mov pB, origPB // restart B for this M tile
  617. INITv1x1 // fill with zeros
  618. asr counterL , origK, #3 // L = K / 8
  619. cmp counterL , #0 // is there at least 8 to do?
  620. ble .Ldgemm_kernel_L1_Mv1_44
  621. .align 5
  // 8 K steps per iteration, with one B prefetch hint per iteration.
  622. .Ldgemm_kernel_L1_Mv1_22:
  623. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  624. KERNELv1x1_SUB
  625. KERNELv1x1_SUB
  626. KERNELv1x1_SUB
  627. KERNELv1x1_SUB
  628. KERNELv1x1_SUB
  629. KERNELv1x1_SUB
  630. KERNELv1x1_SUB
  631. KERNELv1x1_SUB
  632. subs counterL, counterL, #1
  633. bgt .Ldgemm_kernel_L1_Mv1_22
  634. .Ldgemm_kernel_L1_Mv1_44:
  635. ands counterL , origK, #7 // remaining K steps: K mod 8
  636. ble .Ldgemm_kernel_L1_Mv1_100
  637. .align 5
  638. .Ldgemm_kernel_L1_Mv1_46:
  639. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  640. KERNELv1x1_SUB
  641. subs counterL, counterL, #1
  // NOTE(review): the 8-, 4- and 2-wide remainder loops close with bne; this one uses bgt.
  642. bgt .Ldgemm_kernel_L1_Mv1_46
  643. .Ldgemm_kernel_L1_Mv1_100:
  644. prfm PLDL1KEEP, [pA]
  645. prfm PLDL1KEEP, [pA, #64]
  646. prfm PLDL1KEEP, [origPB]
  647. SAVEv1x1 // C tile += alpha * accumulator; pCRow0 moves to the next M tile
  648. .Ldgemm_kernel_L1_Mv1_END:
  649. incw counterI // advance the M index by one vector of 32-bit elements
  650. whilelt p1.s, counterI, origM //SVE instruction
  651. cntp lanes, p0, p1.s
  652. b.any .Ldgemm_kernel_L1_Mv1_20 // another M tile while p1 still has an active lane
  653. .Ldgemm_kernel_L1_END:
  654. /******************************************************************************/
  // Common exit: return 0 in x0, reload every register spilled at entry from
  // the same stack slots, release the 11 x 16-byte frame and return.
  655. .Ldgemm_kernel_L999:
  656. mov x0, #0 // set return value
  657. ldp d8, d9, [sp, #(0 * 16)]
  658. ldp d10, d11, [sp, #(1 * 16)]
  659. ldp d12, d13, [sp, #(2 * 16)]
  660. ldp d14, d15, [sp, #(3 * 16)]
  661. ldp d16, d17, [sp, #(4 * 16)]
  662. ldp x18, x19, [sp, #(5 * 16)]
  663. ldp x20, x21, [sp, #(6 * 16)]
  664. ldp x22, x23, [sp, #(7 * 16)]
  665. ldp x24, x25, [sp, #(8 * 16)]
  666. ldp x26, x27, [sp, #(9 * 16)]
  667. ldr x28, [sp, #(10 * 16)]
  668. add sp, sp, #(11*16) // release the stack frame reserved at entry
  669. ret
  670. EPILOGUE