
dgemm_kernel_sve_v2x8.S

  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. /* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
  28. However, the data layout is the same as for the 1*SVE_LEN x 8 kernel.
  29. This means that we sweep two panels of packed A when iterating over K.
  30. With this approach, we can reuse the dgemm_n|tcopy_sve_v1.c packing functions. */
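/* Reference computation, an illustrative sketch only (the names A_panel, B_panel and
   acc are not symbols in this file): for one 2*SVE_LEN x 8 tile, every K step does
     for (j = 0; j < 8; j++)
       for (v = 0; v < 2; v++)              // the two A vectors loaded from pA1/pA2
         acc[v][j] += A_panel[v][k][:] * B_panel[k][j];
   and the SAVE macros finish with C[:][j] += alpha * acc[v][j]. */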
  31. #define ASSEMBLER
  32. #include "common.h"
  33. /* x0 x1 x2 d0 x3 x4 x5 x6 */
  34. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  35. #define origM x0
  36. #define origN x1
  37. #define origK x2
  38. #define origPA x3
  39. #define origPB x4
  40. #define pC x5
  41. #define LDC x6
  42. #define temp x7
  43. #define counterL x8
  44. #define counterI x9
  45. #define counterJ x10
  46. #define pB x11
  47. #define pCRow0 x12
  48. #define pCRow1 x13
  49. #define pCRow2 x14
  50. #define lanes x15
  51. #define pA1 x16
  52. #define pA2 x17
  53. #define alpha x18
  54. #define vec_len x19
  55. #define vec_lenx2 x20
  56. #define alpha0 d10
  57. #define alphaZ z7.d
  58. #define A_PRE_SIZE 1536
  59. #define B_PRE_SIZE 512
  60. #define C_PRE_SIZE 128
  61. // 00 origM
  62. // 01 origN
  63. // 02 origK
  64. // 03 origPA
  65. // 04 origPB
  66. // 05 pC
  67. // 06 origLDC -> LDC
  68. // 07 temp
  69. // 08 counterL
  70. // 09 counterI
  71. // 10 counterJ
  72. // 11 pB
  73. // 12 pCRow0
  74. // 13 pCRow1
  75. // 14 pCRow2
  76. // 15 lanes
  77. // 16 pA1
  78. // 17 pA2
  79. // 18 must save alpha
  80. // 19 must save vec_len
  81. // 20 must save
  82. // 21 must save
  83. // 22 must save
  84. // 23 must save
  85. // 24 must save
  86. // 25 must save
  87. // 26 must save
  88. // 27 must save
  89. // 28 must save
  90. // 29 frame
  91. // 30 link
  92. // 31 sp
  93. //v00 ALPHA -> pA10_0
  94. //v01 pA10_1
  95. //v02 pA20_0
  96. //v03 pA20_1
  97. //v04
  98. //v05
  99. //v06
  100. //v07 ALPHA0
  101. //v08 must save pB0_0
  102. //v09 must save pB0_1
  103. //v10 must save pB0_2
  104. //v11 must save pB0_3
  105. //v12 must save pB0_4
  106. //v13 must save pB0_5
  107. //v14 must save pB0_6
  108. //v15 must save pB0_7
  109. //v16 must save C0
  110. //v17 must save C1
  111. //v18 must save C2
  112. //v19 must save C3
  113. //v20 must save C4
  114. //v21 must save C5
  115. //v22 must save C6
  116. //v23 must save C7
  117. //v24 must save C8
  118. //v25 must save C9
  119. //v26 must save C10
  120. //v27 must save C11
  121. //v28 must save C12
  122. //v29 must save C13
  123. //v30 must save C14
  124. //v31 must save C15
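// Accumulator tile mapping used by the v2x8 macros below (as implied by the FMLA
// pattern): for B column j (j = 0..7), z(16+2j) accumulates the pA1 vector and
// z(16+2j+1) the pA2 vector, i.e. z16/z17 -> column 0, ..., z30/z31 -> column 7.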
  125. /*******************************************************************************
  126. * Macro definitions
  127. *******************************************************************************/
  128. .macro INITv2x8
  129. dup z16.d, #0
  130. dup z17.d, #0
  131. dup z18.d, #0
  132. dup z19.d, #0
  133. dup z20.d, #0
  134. dup z21.d, #0
  135. dup z22.d, #0
  136. dup z23.d, #0
  137. dup z24.d, #0
  138. dup z25.d, #0
  139. dup z26.d, #0
  140. dup z27.d, #0
  141. dup z28.d, #0
  142. dup z29.d, #0
  143. dup z30.d, #0
  144. dup z31.d, #0
  145. .endm
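// The main K loop is software-pipelined: KERNELv2x8_I does the first update with
// z0/z1 while preloading the next A vectors into z2/z3 and the next B scalars;
// KERNELv2x8_M2 consumes z2/z3 and reloads z0/z1, KERNELv2x8_M1 does the opposite,
// and KERNELv2x8_E drains the last preloaded z2/z3 without issuing further loads.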
  146. .macro KERNELv2x8_I
  147. ld1d z0.d, p0/z, [pA1]
  148. ld1d z1.d, p0/z, [pA2]
  149. ld1d z2.d, p0/z, [pA1, vec_len, lsl #3]
  150. ld1d z3.d, p0/z, [pA2, vec_len, lsl #3]
  151. add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 2 * 8
  152. add pA2, pA2, vec_len, lsl #4 // pA2 = pA2 + vec_len * 2 * 8
  153. ld1rd z8.d, p0/z, [pB]
  154. ld1rd z9.d, p0/z, [pB, 8]
  155. ld1rd z10.d, p0/z, [pB, 16]
  156. ld1rd z11.d, p0/z, [pB, 24]
  157. ld1rd z12.d, p0/z, [pB, 32]
  158. ld1rd z13.d, p0/z, [pB, 40]
  159. ld1rd z14.d, p0/z, [pB, 48]
  160. ld1rd z15.d, p0/z, [pB, 56]
  161. add pB, pB, 64
  162. fmla z16.d, p0/m, z0.d, z8.d
  163. fmla z17.d, p0/m, z1.d, z8.d
  164. ld1rd z8.d, p0/z, [pB]
  165. fmla z18.d, p0/m, z0.d, z9.d
  166. fmla z19.d, p0/m, z1.d, z9.d
  167. ld1rd z9.d, p0/z, [pB, 8]
  168. fmla z20.d, p0/m, z0.d, z10.d
  169. fmla z21.d, p0/m, z1.d, z10.d
  170. ld1rd z10.d, p0/z, [pB, 16]
  171. fmla z22.d, p0/m, z0.d, z11.d
  172. fmla z23.d, p0/m, z1.d, z11.d
  173. ld1rd z11.d, p0/z, [pB, 24]
  174. fmla z24.d, p0/m, z0.d, z12.d
  175. fmla z25.d, p0/m, z1.d, z12.d
  176. ld1rd z12.d, p0/z, [pB, 32]
  177. fmla z26.d, p0/m, z0.d, z13.d
  178. fmla z27.d, p0/m, z1.d, z13.d
  179. ld1rd z13.d, p0/z, [pB, 40]
  180. fmla z28.d, p0/m, z0.d, z14.d
  181. fmla z29.d, p0/m, z1.d, z14.d
  182. ld1rd z14.d, p0/z, [pB, 48]
  183. fmla z30.d, p0/m, z0.d, z15.d
  184. fmla z31.d, p0/m, z1.d, z15.d
  185. ld1rd z15.d, p0/z, [pB, 56]
  186. add pB, pB, 64
  187. .endm
  188. .macro KERNELv2x8_M1
  189. ld1d z2.d, p0/z, [pA1]
  190. ld1d z3.d, p0/z, [pA2]
  191. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  192. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  193. fmla z16.d, p0/m, z0.d, z8.d
  194. fmla z17.d, p0/m, z1.d, z8.d
  195. ld1rd z8.d, p0/z, [pB]
  196. fmla z18.d, p0/m, z0.d, z9.d
  197. fmla z19.d, p0/m, z1.d, z9.d
  198. ld1rd z9.d, p0/z, [pB, 8]
  199. fmla z20.d, p0/m, z0.d, z10.d
  200. fmla z21.d, p0/m, z1.d, z10.d
  201. ld1rd z10.d, p0/z, [pB, 16]
  202. fmla z22.d, p0/m, z0.d, z11.d
  203. fmla z23.d, p0/m, z1.d, z11.d
  204. ld1rd z11.d, p0/z, [pB, 24]
  205. fmla z24.d, p0/m, z0.d, z12.d
  206. fmla z25.d, p0/m, z1.d, z12.d
  207. ld1rd z12.d, p0/z, [pB, 32]
  208. fmla z26.d, p0/m, z0.d, z13.d
  209. fmla z27.d, p0/m, z1.d, z13.d
  210. ld1rd z13.d, p0/z, [pB, 40]
  211. fmla z28.d, p0/m, z0.d, z14.d
  212. fmla z29.d, p0/m, z1.d, z14.d
  213. ld1rd z14.d, p0/z, [pB, 48]
  214. fmla z30.d, p0/m, z0.d, z15.d
  215. fmla z31.d, p0/m, z1.d, z15.d
  216. ld1rd z15.d, p0/z, [pB, 56]
  217. add pB, pB, 64
  218. .endm
  219. .macro KERNELv2x8_M2
  220. ld1d z0.d, p0/z, [pA1]
  221. ld1d z1.d, p0/z, [pA2]
  222. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  223. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  224. fmla z16.d, p0/m, z2.d, z8.d
  225. fmla z17.d, p0/m, z3.d, z8.d
  226. ld1rd z8.d, p0/z, [pB]
  227. fmla z18.d, p0/m, z2.d, z9.d
  228. fmla z19.d, p0/m, z3.d, z9.d
  229. ld1rd z9.d, p0/z, [pB, 8]
  230. fmla z20.d, p0/m, z2.d, z10.d
  231. fmla z21.d, p0/m, z3.d, z10.d
  232. ld1rd z10.d, p0/z, [pB, 16]
  233. fmla z22.d, p0/m, z2.d, z11.d
  234. fmla z23.d, p0/m, z3.d, z11.d
  235. ld1rd z11.d, p0/z, [pB, 24]
  236. fmla z24.d, p0/m, z2.d, z12.d
  237. fmla z25.d, p0/m, z3.d, z12.d
  238. ld1rd z12.d, p0/z, [pB, 32]
  239. fmla z26.d, p0/m, z2.d, z13.d
  240. fmla z27.d, p0/m, z3.d, z13.d
  241. ld1rd z13.d, p0/z, [pB, 40]
  242. fmla z28.d, p0/m, z2.d, z14.d
  243. fmla z29.d, p0/m, z3.d, z14.d
  244. ld1rd z14.d, p0/z, [pB, 48]
  245. fmla z30.d, p0/m, z2.d, z15.d
  246. fmla z31.d, p0/m, z3.d, z15.d
  247. ld1rd z15.d, p0/z, [pB, 56]
  248. add pB, pB, 64
  249. .endm
  250. .macro KERNELv2x8_E
  251. fmla z16.d, p0/m, z2.d, z8.d
  252. fmla z17.d, p0/m, z3.d, z8.d
  253. fmla z18.d, p0/m, z2.d, z9.d
  254. fmla z19.d, p0/m, z3.d, z9.d
  255. fmla z20.d, p0/m, z2.d, z10.d
  256. fmla z21.d, p0/m, z3.d, z10.d
  257. fmla z22.d, p0/m, z2.d, z11.d
  258. fmla z23.d, p0/m, z3.d, z11.d
  259. fmla z24.d, p0/m, z2.d, z12.d
  260. fmla z25.d, p0/m, z3.d, z12.d
  261. fmla z26.d, p0/m, z2.d, z13.d
  262. fmla z27.d, p0/m, z3.d, z13.d
  263. fmla z28.d, p0/m, z2.d, z14.d
  264. fmla z29.d, p0/m, z3.d, z14.d
  265. fmla z30.d, p0/m, z2.d, z15.d
  266. fmla z31.d, p0/m, z3.d, z15.d
  267. .endm
  268. .macro KERNELv2x8_SUB
  269. ld1d z0.d, p0/z, [pA1]
  270. ld1d z1.d, p0/z, [pA2]
  271. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  272. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  273. ld1rd z8.d, p0/z, [pB]
  274. ld1rd z9.d, p0/z, [pB, 8]
  275. ld1rd z10.d, p0/z, [pB, 16]
  276. ld1rd z11.d, p0/z, [pB, 24]
  277. ld1rd z12.d, p0/z, [pB, 32]
  278. ld1rd z13.d, p0/z, [pB, 40]
  279. ld1rd z14.d, p0/z, [pB, 48]
  280. ld1rd z15.d, p0/z, [pB, 56]
  281. add pB, pB, 64
  282. fmla z16.d, p0/m, z0.d, z8.d
  283. fmla z17.d, p0/m, z1.d, z8.d
  284. fmla z18.d, p0/m, z0.d, z9.d
  285. fmla z19.d, p0/m, z1.d, z9.d
  286. fmla z20.d, p0/m, z0.d, z10.d
  287. fmla z21.d, p0/m, z1.d, z10.d
  288. fmla z22.d, p0/m, z0.d, z11.d
  289. fmla z23.d, p0/m, z1.d, z11.d
  290. fmla z24.d, p0/m, z0.d, z12.d
  291. fmla z25.d, p0/m, z1.d, z12.d
  292. fmla z26.d, p0/m, z0.d, z13.d
  293. fmla z27.d, p0/m, z1.d, z13.d
  294. fmla z28.d, p0/m, z0.d, z14.d
  295. fmla z29.d, p0/m, z1.d, z14.d
  296. fmla z30.d, p0/m, z0.d, z15.d
  297. fmla z31.d, p0/m, z1.d, z15.d
  298. .endm
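// SAVEv2x8 writes the tile back: pCRow0/pCRow1/pCRow2 leapfrog through the eight
// columns of the C tile, each one LDC apart, and every column is updated as
// C := C + alpha * acc using two vectors per column (the second one addressed via
// the "#1, mul vl" offset). Finally pCRow0 advances by 2*SVE_LEN doubles.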
  299. .macro SAVEv2x8
  300. add pCRow1, pCRow0, LDC
  301. ld1d z8.d, p0/z, [pCRow0]
  302. ld1d z9.d, p0/z, [pCRow0, #1, mul vl]
  303. fmla z8.d, p0/m, z16.d, alphaZ
  304. fmla z9.d, p0/m, z17.d, alphaZ
  305. st1d z8.d, p0, [pCRow0]
  306. st1d z9.d, p0, [pCRow0, #1, mul vl]
  307. add pCRow2, pCRow1, LDC
  308. ld1d z10.d, p0/z, [pCRow1]
  309. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  310. fmla z10.d, p0/m, z18.d, alphaZ
  311. fmla z11.d, p0/m, z19.d, alphaZ
  312. st1d z10.d, p0, [pCRow1]
  313. st1d z11.d, p0, [pCRow1, #1, mul vl]
  314. add pCRow1, pCRow2, LDC
  315. ld1d z12.d, p0/z, [pCRow2]
  316. ld1d z13.d, p0/z, [pCRow2, #1, mul vl]
  317. fmla z12.d, p0/m, z20.d, alphaZ
  318. fmla z13.d, p0/m, z21.d, alphaZ
  319. st1d z12.d, p0, [pCRow2]
  320. st1d z13.d, p0, [pCRow2, #1, mul vl]
  321. add pCRow2, pCRow1, LDC
  322. ld1d z14.d, p0/z, [pCRow1]
  323. ld1d z15.d, p0/z, [pCRow1, #1, mul vl]
  324. fmla z14.d, p0/m, z22.d, alphaZ
  325. fmla z15.d, p0/m, z23.d, alphaZ
  326. st1d z14.d, p0, [pCRow1]
  327. st1d z15.d, p0, [pCRow1, #1, mul vl]
  328. add pCRow1, pCRow2, LDC
  329. ld1d z8.d, p0/z, [pCRow2]
  330. ld1d z9.d, p0/z, [pCRow2, #1, mul vl]
  331. fmla z8.d, p0/m, z24.d, alphaZ
  332. fmla z9.d, p0/m, z25.d, alphaZ
  333. st1d z8.d, p0, [pCRow2]
  334. st1d z9.d, p0, [pCRow2, #1, mul vl]
  335. add pCRow2, pCRow1, LDC
  336. ld1d z10.d, p0/z, [pCRow1]
  337. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  338. fmla z10.d, p0/m, z26.d, alphaZ
  339. fmla z11.d, p0/m, z27.d, alphaZ
  340. st1d z10.d, p0, [pCRow1]
  341. st1d z11.d, p0, [pCRow1, #1, mul vl]
  342. add pCRow1, pCRow2, LDC
  343. ld1d z12.d, p0/z, [pCRow2]
  344. ld1d z13.d, p0/z, [pCRow2, #1, mul vl]
  345. fmla z12.d, p0/m, z28.d, alphaZ
  346. fmla z13.d, p0/m, z29.d, alphaZ
  347. st1d z12.d, p0, [pCRow2]
  348. st1d z13.d, p0, [pCRow2, #1, mul vl]
  349. ld1d z14.d, p0/z, [pCRow1]
  350. ld1d z15.d, p0/z, [pCRow1, #1, mul vl]
  351. fmla z14.d, p0/m, z30.d, alphaZ
  352. fmla z15.d, p0/m, z31.d, alphaZ
  353. st1d z14.d, p0, [pCRow1]
  354. st1d z15.d, p0, [pCRow1, #1, mul vl]
  355. add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2
  356. .endm
  357. .macro INITv2x4
  358. dup z16.d, #0
  359. dup z17.d, #0
  360. dup z18.d, #0
  361. dup z19.d, #0
  362. dup z20.d, #0
  363. dup z21.d, #0
  364. dup z22.d, #0
  365. dup z23.d, #0
  366. .endm
  367. .macro KERNELv2x4_SUB
  368. ld1d z0.d, p0/z, [pA1]
  369. ld1d z1.d, p0/z, [pA2]
  370. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  371. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  372. ld1rd z8.d, p0/z, [pB]
  373. ld1rd z9.d, p0/z, [pB, 8]
  374. ld1rd z10.d, p0/z, [pB, 16]
  375. ld1rd z11.d, p0/z, [pB, 24]
  376. add pB, pB, 32
  377. fmla z16.d, p0/m, z0.d, z8.d
  378. fmla z17.d, p0/m, z1.d, z8.d
  379. fmla z18.d, p0/m, z0.d, z9.d
  380. fmla z19.d, p0/m, z1.d, z9.d
  381. fmla z20.d, p0/m, z0.d, z10.d
  382. fmla z21.d, p0/m, z1.d, z10.d
  383. fmla z22.d, p0/m, z0.d, z11.d
  384. fmla z23.d, p0/m, z1.d, z11.d
  385. .endm
  386. .macro SAVEv2x4
  387. add pCRow1, pCRow0, LDC
  388. ld1d z8.d, p0/z, [pCRow0]
  389. ld1d z9.d, p0/z, [pCRow0, #1, mul vl]
  390. fmla z8.d, p0/m, z16.d, alphaZ
  391. fmla z9.d, p0/m, z17.d, alphaZ
  392. st1d z8.d, p0, [pCRow0]
  393. st1d z9.d, p0, [pCRow0, #1, mul vl]
  394. add pCRow2, pCRow1, LDC
  395. ld1d z10.d, p0/z, [pCRow1]
  396. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  397. fmla z10.d, p0/m, z18.d, alphaZ
  398. fmla z11.d, p0/m, z19.d, alphaZ
  399. st1d z10.d, p0, [pCRow1]
  400. st1d z11.d, p0, [pCRow1, #1, mul vl]
  401. add pCRow1, pCRow2, LDC
  402. ld1d z12.d, p0/z, [pCRow2]
  403. ld1d z13.d, p0/z, [pCRow2, #1, mul vl]
  404. fmla z12.d, p0/m, z20.d, alphaZ
  405. fmla z13.d, p0/m, z21.d, alphaZ
  406. st1d z12.d, p0, [pCRow2]
  407. st1d z13.d, p0, [pCRow2, #1, mul vl]
  408. ld1d z14.d, p0/z, [pCRow1]
  409. ld1d z15.d, p0/z, [pCRow1, #1, mul vl]
  410. fmla z14.d, p0/m, z22.d, alphaZ
  411. fmla z15.d, p0/m, z23.d, alphaZ
  412. st1d z14.d, p0, [pCRow1]
  413. st1d z15.d, p0, [pCRow1, #1, mul vl]
  414. add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2
  415. .endm
  416. .macro INITv2x2
  417. dup z16.d, #0
  418. dup z17.d, #0
  419. dup z18.d, #0
  420. dup z19.d, #0
  421. .endm
  422. .macro KERNELv2x2_SUB
  423. ld1d z0.d, p0/z, [pA1]
  424. ld1d z1.d, p0/z, [pA2]
  425. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  426. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  427. ld1rd z8.d, p0/z, [pB]
  428. ld1rd z9.d, p0/z, [pB, 8]
  429. add pB, pB, 16
  430. fmla z16.d, p0/m, z0.d, z8.d
  431. fmla z17.d, p0/m, z1.d, z8.d
  432. fmla z18.d, p0/m, z0.d, z9.d
  433. fmla z19.d, p0/m, z1.d, z9.d
  434. .endm
  435. .macro SAVEv2x2
  436. add pCRow1, pCRow0, LDC
  437. ld1d z8.d, p0/z, [pCRow0]
  438. ld1d z9.d, p0/z, [pCRow0, #1, mul vl]
  439. fmla z8.d, p0/m, z16.d, alphaZ
  440. fmla z9.d, p0/m, z17.d, alphaZ
  441. st1d z8.d, p0, [pCRow0]
  442. st1d z9.d, p0, [pCRow0, #1, mul vl]
  443. ld1d z10.d, p0/z, [pCRow1]
  444. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  445. fmla z10.d, p0/m, z18.d, alphaZ
  446. fmla z11.d, p0/m, z19.d, alphaZ
  447. st1d z10.d, p0, [pCRow1]
  448. st1d z11.d, p0, [pCRow1, #1, mul vl]
  449. add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2
  450. .endm
  451. .macro INITv2x1
  452. dup z16.d, #0
  453. dup z17.d, #0
  454. .endm
  455. .macro KERNELv2x1_SUB
  456. ld1d z0.d, p0/z, [pA1]
  457. ld1d z1.d, p0/z, [pA2]
  458. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  459. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  460. ld1rd z8.d, p0/z, [pB]
  461. add pB, pB, 8
  462. fmla z16.d, p0/m, z0.d, z8.d
  463. fmla z17.d, p0/m, z1.d, z8.d
  464. .endm
  465. .macro SAVEv2x1
  466. add pCRow1, pCRow0, LDC
  467. ld1d z8.d, p0/z, [pCRow0]
  468. ld1d z9.d, p0/z, [pCRow0, #1, mul vl]
  469. fmla z8.d, p0/m, z16.d, alphaZ
  470. fmla z9.d, p0/m, z17.d, alphaZ
  471. st1d z8.d, p0, [pCRow0]
  472. st1d z9.d, p0, [pCRow0, #1, mul vl]
  473. add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2
  474. .endm
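// The v1xN macros below handle the M tail of fewer than 2*SVE_LEN rows. They use
// predicate p1 (set with whilelt against origM) instead of the all-true p0, so the
// last, partially filled vector is loaded, accumulated and stored with inactive
// lanes masked off; 'lanes' holds the number of active lanes.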
  475. .macro INITv1x8
  476. dup z16.d, #0
  477. dup z17.d, #0
  478. dup z18.d, #0
  479. dup z19.d, #0
  480. dup z20.d, #0
  481. dup z21.d, #0
  482. dup z22.d, #0
  483. dup z23.d, #0
  484. .endm
  485. .macro KERNELv1x8_I
  486. ld1d z0.d, p1/z, [pA1]
  487. ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
  488. add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
  489. ld1rd z8.d, p0/z, [pB]
  490. ld1rd z9.d, p0/z, [pB, 8]
  491. ld1rd z10.d, p0/z, [pB, 16]
  492. ld1rd z11.d, p0/z, [pB, 24]
  493. ld1rd z12.d, p0/z, [pB, 32]
  494. ld1rd z13.d, p0/z, [pB, 40]
  495. ld1rd z14.d, p0/z, [pB, 48]
  496. ld1rd z15.d, p0/z, [pB, 56]
  497. add pB, pB, 64
  498. fmla z16.d, p1/m, z0.d, z8.d
  499. ld1rd z8.d, p0/z, [pB]
  500. fmla z17.d, p1/m, z0.d, z9.d
  501. ld1rd z9.d, p0/z, [pB, 8]
  502. fmla z18.d, p1/m, z0.d, z10.d
  503. ld1rd z10.d, p0/z, [pB, 16]
  504. fmla z19.d, p1/m, z0.d, z11.d
  505. ld1rd z11.d, p0/z, [pB, 24]
  506. fmla z20.d, p1/m, z0.d, z12.d
  507. ld1rd z12.d, p0/z, [pB, 32]
  508. fmla z21.d, p1/m, z0.d, z13.d
  509. ld1rd z13.d, p0/z, [pB, 40]
  510. fmla z22.d, p1/m, z0.d, z14.d
  511. ld1rd z14.d, p0/z, [pB, 48]
  512. fmla z23.d, p1/m, z0.d, z15.d
  513. ld1rd z15.d, p0/z, [pB, 56]
  514. add pB, pB, 64
  515. .endm
  516. .macro KERNELv1x8_M1
  517. ld1d z1.d, p1/z, [pA1]
  518. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  519. fmla z16.d, p1/m, z0.d, z8.d
  520. ld1rd z8.d, p0/z, [pB]
  521. fmla z17.d, p1/m, z0.d, z9.d
  522. ld1rd z9.d, p0/z, [pB, 8]
  523. fmla z18.d, p1/m, z0.d, z10.d
  524. ld1rd z10.d, p0/z, [pB, 16]
  525. fmla z19.d, p1/m, z0.d, z11.d
  526. ld1rd z11.d, p0/z, [pB, 24]
  527. fmla z20.d, p1/m, z0.d, z12.d
  528. ld1rd z12.d, p0/z, [pB, 32]
  529. fmla z21.d, p1/m, z0.d, z13.d
  530. ld1rd z13.d, p0/z, [pB, 40]
  531. fmla z22.d, p1/m, z0.d, z14.d
  532. ld1rd z14.d, p0/z, [pB, 48]
  533. fmla z23.d, p1/m, z0.d, z15.d
  534. ld1rd z15.d, p0/z, [pB, 56]
  535. add pB, pB, 64
  536. .endm
  537. .macro KERNELv1x8_M2
  538. ld1d z0.d, p1/z, [pA1]
  539. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  540. fmla z16.d, p1/m, z1.d, z8.d
  541. ld1rd z8.d, p0/z, [pB]
  542. fmla z17.d, p1/m, z1.d, z9.d
  543. ld1rd z9.d, p0/z, [pB, 8]
  544. fmla z18.d, p1/m, z1.d, z10.d
  545. ld1rd z10.d, p0/z, [pB, 16]
  546. fmla z19.d, p1/m, z1.d, z11.d
  547. ld1rd z11.d, p0/z, [pB, 24]
  548. fmla z20.d, p1/m, z1.d, z12.d
  549. ld1rd z12.d, p0/z, [pB, 32]
  550. fmla z21.d, p1/m, z1.d, z13.d
  551. ld1rd z13.d, p0/z, [pB, 40]
  552. fmla z22.d, p1/m, z1.d, z14.d
  553. ld1rd z14.d, p0/z, [pB, 48]
  554. fmla z23.d, p1/m, z1.d, z15.d
  555. ld1rd z15.d, p0/z, [pB, 56]
  556. add pB, pB, 64
  557. .endm
  558. .macro KERNELv1x8_E
  559. fmla z16.d, p1/m, z1.d, z8.d
  560. fmla z17.d, p1/m, z1.d, z9.d
  561. fmla z18.d, p1/m, z1.d, z10.d
  562. fmla z19.d, p1/m, z1.d, z11.d
  563. fmla z20.d, p1/m, z1.d, z12.d
  564. fmla z21.d, p1/m, z1.d, z13.d
  565. fmla z22.d, p1/m, z1.d, z14.d
  566. fmla z23.d, p1/m, z1.d, z15.d
  567. .endm
  568. .macro KERNELv1x8_SUB
  569. ld1d z0.d, p1/z, [pA1]
  570. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  571. ld1rd z8.d, p0/z, [pB]
  572. ld1rd z9.d, p0/z, [pB, 8]
  573. ld1rd z10.d, p0/z, [pB, 16]
  574. ld1rd z11.d, p0/z, [pB, 24]
  575. ld1rd z12.d, p0/z, [pB, 32]
  576. ld1rd z13.d, p0/z, [pB, 40]
  577. ld1rd z14.d, p0/z, [pB, 48]
  578. ld1rd z15.d, p0/z, [pB, 56]
  579. add pB, pB, 64
  580. fmla z16.d, p1/m, z0.d, z8.d
  581. fmla z17.d, p1/m, z0.d, z9.d
  582. fmla z18.d, p1/m, z0.d, z10.d
  583. fmla z19.d, p1/m, z0.d, z11.d
  584. fmla z20.d, p1/m, z0.d, z12.d
  585. fmla z21.d, p1/m, z0.d, z13.d
  586. fmla z22.d, p1/m, z0.d, z14.d
  587. fmla z23.d, p1/m, z0.d, z15.d
  588. .endm
  589. .macro SAVEv1x8
  590. add pCRow1, pCRow0, LDC
  591. ld1d z24.d, p1/z, [pCRow0]
  592. fmla z24.d, p1/m, z16.d, alphaZ
  593. st1d z24.d, p1, [pCRow0]
  594. add pCRow2, pCRow1, LDC
  595. ld1d z25.d, p1/z, [pCRow1]
  596. fmla z25.d, p1/m, z17.d, alphaZ
  597. st1d z25.d, p1, [pCRow1]
  598. add pCRow1, pCRow2, LDC
  599. ld1d z26.d, p1/z, [pCRow2]
  600. fmla z26.d, p1/m, z18.d, alphaZ
  601. st1d z26.d, p1, [pCRow2]
  602. add pCRow2, pCRow1, LDC
  603. ld1d z27.d, p1/z, [pCRow1]
  604. fmla z27.d, p1/m, z19.d, alphaZ
  605. st1d z27.d, p1, [pCRow1]
  606. add pCRow1, pCRow2, LDC
  607. ld1d z28.d, p1/z, [pCRow2]
  608. fmla z28.d, p1/m, z20.d, alphaZ
  609. st1d z28.d, p1, [pCRow2]
  610. add pCRow2, pCRow1, LDC
  611. ld1d z29.d, p1/z, [pCRow1]
  612. fmla z29.d, p1/m, z21.d, alphaZ
  613. st1d z29.d, p1, [pCRow1]
  614. add pCRow1, pCRow2, LDC
  615. ld1d z30.d, p1/z, [pCRow2]
  616. fmla z30.d, p1/m, z22.d, alphaZ
  617. st1d z30.d, p1, [pCRow2]
  618. ld1d z31.d, p1/z, [pCRow1]
  619. fmla z31.d, p1/m, z23.d, alphaZ
  620. st1d z31.d, p1, [pCRow1]
  621. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  622. .endm
  623. /******************************************************************************/
  624. .macro INITv1x4
  625. dup z16.d, #0
  626. dup z17.d, #0
  627. dup z18.d, #0
  628. dup z19.d, #0
  629. .endm
  630. .macro KERNELv1x4_SUB
  631. ld1d z0.d, p1/z, [pA1]
  632. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  633. ld1rd z8.d, p0/z, [pB]
  634. ld1rd z9.d, p0/z, [pB, 8]
  635. ld1rd z10.d, p0/z, [pB, 16]
  636. ld1rd z11.d, p0/z, [pB, 24]
  637. add pB, pB, 32
  638. fmla z16.d, p1/m, z0.d, z8.d
  639. fmla z17.d, p1/m, z0.d, z9.d
  640. fmla z18.d, p1/m, z0.d, z10.d
  641. fmla z19.d, p1/m, z0.d, z11.d
  642. .endm
  643. .macro SAVEv1x4
  644. add pCRow1, pCRow0, LDC
  645. ld1d z24.d, p1/z, [pCRow0]
  646. fmla z24.d, p1/m, z16.d, alphaZ
  647. st1d z24.d, p1, [pCRow0]
  648. add pCRow2, pCRow1, LDC
  649. ld1d z25.d, p1/z, [pCRow1]
  650. fmla z25.d, p1/m, z17.d, alphaZ
  651. st1d z25.d, p1, [pCRow1]
  652. add pCRow1, pCRow2, LDC
  653. ld1d z26.d, p1/z, [pCRow2]
  654. fmla z26.d, p1/m, z18.d, alphaZ
  655. st1d z26.d, p1, [pCRow2]
  656. ld1d z27.d, p1/z, [pCRow1]
  657. fmla z27.d, p1/m, z19.d, alphaZ
  658. st1d z27.d, p1, [pCRow1]
  659. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  660. .endm
  661. /******************************************************************************/
  662. .macro INITv1x2
  663. dup z16.d, #0
  664. dup z17.d, #0
  665. .endm
  666. .macro KERNELv1x2_SUB
  667. ld1d z0.d, p1/z, [pA1]
  668. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  669. ld1rd z8.d, p0/z, [pB]
  670. ld1rd z9.d, p0/z, [pB, 8]
  671. add pB, pB, 16
  672. fmla z16.d, p1/m, z0.d, z8.d
  673. fmla z17.d, p1/m, z0.d, z9.d
  674. .endm
  675. .macro SAVEv1x2
  676. add pCRow1, pCRow0, LDC
  677. ld1d z24.d, p1/z, [pCRow0]
  678. fmla z24.d, p1/m, z16.d, alphaZ
  679. st1d z24.d, p1, [pCRow0]
  680. ld1d z25.d, p1/z, [pCRow1]
  681. fmla z25.d, p1/m, z17.d, alphaZ
  682. st1d z25.d, p1, [pCRow1]
  683. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  684. .endm
  685. /******************************************************************************/
  686. .macro INITv1x1
  687. dup z16.d, #0
  688. .endm
  689. .macro KERNELv1x1_SUB
  690. ld1d z0.d, p1/z, [pA1]
  691. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  692. ld1rd z8.d, p0/z, [pB]
  693. add pB, pB, 8
  694. fmla z16.d, p1/m, z0.d, z8.d
  695. .endm
  696. .macro SAVEv1x1
  697. ld1d z24.d, p1/z, [pCRow0]
  698. fmla z24.d, p1/m, z16.d, alphaZ
  699. st1d z24.d, p1, [pCRow0]
  700. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  701. .endm
  702. /*******************************************************************************
  703. * End of macro definitions
  704. *******************************************************************************/
  705. PROLOGUE
  706. .align 5
  707. add sp, sp, #-(11 * 16)
  708. stp d8, d9, [sp, #(0 * 16)]
  709. stp d10, d11, [sp, #(1 * 16)]
  710. stp d12, d13, [sp, #(2 * 16)]
  711. stp d14, d15, [sp, #(3 * 16)]
  712. stp d16, d17, [sp, #(4 * 16)]
  713. stp x18, x19, [sp, #(5 * 16)]
  714. stp x20, x21, [sp, #(6 * 16)]
  715. stp x22, x23, [sp, #(7 * 16)]
  716. stp x24, x25, [sp, #(8 * 16)]
  717. stp x26, x27, [sp, #(9 * 16)]
  718. str x28, [sp, #(10 * 16)]
  719. fmov alpha, d0
  720. dup alphaZ, alpha
  721. cntd vec_len
  722. lsl vec_lenx2, vec_len, #1
  723. lsl LDC, LDC, #3 // ldc = ldc * 8
  724. ptrue p0.d // create true predicate
  725. mov pB, origPB
  726. // Loop over N
  727. mov counterJ, origN
  728. asr counterJ, counterJ, #3 // J = J / 8
  729. cmp counterJ, #0
  730. ble .Ldgemm_kernel_L4_BEGIN
  731. /******************************************************************************/
  732. /* Repeat this as long as there are 8 left in N */
  733. .align 5
  734. .Ldgemm_kernel_L8_BEGIN:
  735. mov pCRow0, pC
  736. add pC, pC, LDC, lsl #3 // add 8 x LDC
  737. mov pA1, origPA // pA1 = start of A array
  738. .Ldgemm_kernel_L8_Mv2_BEGIN:
  739. mov counterI, #0
  740. cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
  741. blt .Ldgemm_kernel_L8_Mv1_BEGIN
  742. mov counterI, origM
  743. /* As long as there are at least 2*SVE_LEN rows left in M, we process them with the v2x8 kernel */
  744. mul temp, vec_len, origK // generate address of pA2
  745. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
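// Packed A stores each SVE_LEN-row panel contiguously across all K iterations
// (vec_len doubles per K step), so the second panel swept by the v2x8 kernel
// starts vec_len * K doubles after pA1.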
  746. .align 5
  747. .Ldgemm_kernel_L8_Mv2_20:
  748. mov pB, origPB
  749. INITv2x8 // fill with zeros
  750. asr counterL , origK, #3 // L = K / 8
  751. cmp counterL , #2 // is there at least 16 to do?
  752. blt .Ldgemm_kernel_L8_Mv2_32
  753. KERNELv2x8_I
  754. KERNELv2x8_M2
  755. KERNELv2x8_M1
  756. KERNELv2x8_M2
  757. KERNELv2x8_M1
  758. KERNELv2x8_M2
  759. KERNELv2x8_M1
  760. KERNELv2x8_M2
  761. subs counterL, counterL, #2 // subtract 2
  762. ble .Ldgemm_kernel_L8_Mv2_22a
  763. .align 5
  764. .Ldgemm_kernel_L8_Mv2_22:
  765. KERNELv2x8_M1
  766. KERNELv2x8_M2
  767. KERNELv2x8_M1
  768. KERNELv2x8_M2
  769. KERNELv2x8_M1
  770. KERNELv2x8_M2
  771. KERNELv2x8_M1
  772. KERNELv2x8_M2
  773. subs counterL, counterL, #1
  774. bgt .Ldgemm_kernel_L8_Mv2_22
  775. .align 5
  776. .Ldgemm_kernel_L8_Mv2_22a:
  777. KERNELv2x8_M1
  778. KERNELv2x8_M2
  779. KERNELv2x8_M1
  780. KERNELv2x8_M2
  781. KERNELv2x8_M1
  782. KERNELv2x8_M2
  783. KERNELv2x8_M1
  784. KERNELv2x8_E
  785. b .Ldgemm_kernel_L8_Mv2_44
  786. .align 5
  787. .Ldgemm_kernel_L8_Mv2_32:
  788. tst counterL, #1
  789. ble .Ldgemm_kernel_L8_Mv2_40
  790. KERNELv2x8_I
  791. KERNELv2x8_M2
  792. KERNELv2x8_M1
  793. KERNELv2x8_M2
  794. KERNELv2x8_M1
  795. KERNELv2x8_M2
  796. KERNELv2x8_M1
  797. KERNELv2x8_E
  798. b .Ldgemm_kernel_L8_Mv2_44
  799. .Ldgemm_kernel_L8_Mv2_40:
  800. INITv2x8
  801. .Ldgemm_kernel_L8_Mv2_44:
  802. ands counterL , origK, #7
  803. ble .Ldgemm_kernel_L8_Mv2_100
  804. .align 5
  805. .Ldgemm_kernel_L8_Mv2_46:
  806. KERNELv2x8_SUB
  807. subs counterL, counterL, #1
  808. bne .Ldgemm_kernel_L8_Mv2_46
  809. .Ldgemm_kernel_L8_Mv2_100:
  810. SAVEv2x8
  811. mov pA1, pA2 // pA1 = pA2
  812. mul temp, vec_len, origK // generate address of pA2
  813. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  814. .Ldgemm_kernel_L8_Mv2_END:
  815. sub counterI, counterI, vec_lenx2
  816. cmp counterI, vec_lenx2
  817. bge .Ldgemm_kernel_L8_Mv2_20
  818. sub counterI, origM, counterI
  819. cmp counterI, origM
  820. beq .Ldgemm_kernel_L8_END
  821. //////////////////////////////////////////
  822. // We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
  823. .Ldgemm_kernel_L8_Mv1_BEGIN:
  824. whilelt p1.d, counterI, origM //SVE instruction
  825. cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
  826. .align 5
  827. .Ldgemm_kernel_L8_Mv1_20:
  828. mov pB, origPB
  829. INITv1x8 // fill with zeros
  830. asr counterL , origK, #3 // L = K / 8
  831. cmp counterL , #2 // is there at least 16 to do?
  832. blt .Ldgemm_kernel_L8_Mv1_32
  833. KERNELv1x8_I
  834. KERNELv1x8_M2
  835. KERNELv1x8_M1
  836. KERNELv1x8_M2
  837. KERNELv1x8_M1
  838. KERNELv1x8_M2
  839. KERNELv1x8_M1
  840. KERNELv1x8_M2
  841. subs counterL, counterL, #2 // subtract 2
  842. ble .Ldgemm_kernel_L8_Mv1_22a
  843. .align 5
  844. .Ldgemm_kernel_L8_Mv1_22:
  845. KERNELv1x8_M1
  846. KERNELv1x8_M2
  847. KERNELv1x8_M1
  848. KERNELv1x8_M2
  849. KERNELv1x8_M1
  850. KERNELv1x8_M2
  851. KERNELv1x8_M1
  852. KERNELv1x8_M2
  853. subs counterL, counterL, #1
  854. bgt .Ldgemm_kernel_L8_Mv1_22
  855. .align 5
  856. .Ldgemm_kernel_L8_Mv1_22a:
  857. KERNELv1x8_M1
  858. KERNELv1x8_M2
  859. KERNELv1x8_M1
  860. KERNELv1x8_M2
  861. KERNELv1x8_M1
  862. KERNELv1x8_M2
  863. KERNELv1x8_M1
  864. KERNELv1x8_E
  865. b .Ldgemm_kernel_L8_Mv1_44
  866. .align 5
  867. .Ldgemm_kernel_L8_Mv1_32:
  868. tst counterL, #1
  869. ble .Ldgemm_kernel_L8_Mv1_40
  870. KERNELv1x8_I
  871. KERNELv1x8_M2
  872. KERNELv1x8_M1
  873. KERNELv1x8_M2
  874. KERNELv1x8_M1
  875. KERNELv1x8_M2
  876. KERNELv1x8_M1
  877. KERNELv1x8_E
  878. b .Ldgemm_kernel_L8_Mv1_44
  879. .Ldgemm_kernel_L8_Mv1_40:
  880. INITv1x8
  881. .Ldgemm_kernel_L8_Mv1_44:
  882. ands counterL , origK, #7
  883. ble .Ldgemm_kernel_L8_Mv1_100
  884. .align 5
  885. .Ldgemm_kernel_L8_Mv1_46:
  886. KERNELv1x8_SUB
  887. subs counterL, counterL, #1
  888. bne .Ldgemm_kernel_L8_Mv1_46
  889. .Ldgemm_kernel_L8_Mv1_100:
  890. SAVEv1x8
  891. .Ldgemm_kernel_L8_Mv1_END:
  892. incd counterI
  893. whilelt p1.d, counterI, origM //SVE instruction
  894. cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
  895. b.any .Ldgemm_kernel_L8_Mv1_20
  896. .Ldgemm_kernel_L8_END:
  897. lsl temp, origK, #6
  898. add origPB, origPB, temp // B = B + K * 8 * 8
  899. subs counterJ, counterJ , #1 // j--
  900. bgt .Ldgemm_kernel_L8_BEGIN
  901. /******************************************************************************/
  902. /* Repeat the same thing if 4 left in N */
  903. .align 5
  904. .Ldgemm_kernel_L4_BEGIN:
  905. mov counterJ , origN
  906. tst counterJ , #4
  907. ble .Ldgemm_kernel_L2_BEGIN
  908. mov pCRow0, pC
  909. add pC, pC, LDC, lsl #2 // add 4 x LDC
  910. mov pA1, origPA // pA1 = start of A array
  911. .Ldgemm_kernel_L4_Mv2_BEGIN:
  912. mov counterI, #0
  913. cmp origM, vec_lenx2
  914. blt .Ldgemm_kernel_L4_Mv1_BEGIN
  915. mov counterI, origM
  916. mul temp, vec_len, origK // generate address of pA2
  917. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  918. .align 5
  919. .Ldgemm_kernel_L4_Mv2_20:
  920. mov pB, origPB
  921. INITv2x4 // fill with zeros
  922. asr counterL , origK, #3 // L = K / 8
  923. cmp counterL , #0 // is there at least 8 to do?
  924. ble .Ldgemm_kernel_L4_Mv2_44
  925. .align 5
  926. .Ldgemm_kernel_L4_Mv2_22:
  927. KERNELv2x4_SUB
  928. KERNELv2x4_SUB
  929. KERNELv2x4_SUB
  930. KERNELv2x4_SUB
  931. KERNELv2x4_SUB
  932. KERNELv2x4_SUB
  933. KERNELv2x4_SUB
  934. KERNELv2x4_SUB
  935. subs counterL, counterL, #1
  936. bgt .Ldgemm_kernel_L4_Mv2_22
  937. .Ldgemm_kernel_L4_Mv2_44:
  938. ands counterL , origK, #7
  939. ble .Ldgemm_kernel_L4_Mv2_100
  940. .align 5
  941. .Ldgemm_kernel_L4_Mv2_46:
  942. KERNELv2x4_SUB
  943. subs counterL, counterL, #1
  944. bne .Ldgemm_kernel_L4_Mv2_46
  945. .Ldgemm_kernel_L4_Mv2_100:
  946. SAVEv2x4
  947. mov pA1, pA2 // pA1 = pA2
  948. mul temp, vec_len, origK // generate address of pA2
  949. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  950. .Ldgemm_kernel_L4_Mv2_END:
  951. sub counterI, counterI, vec_lenx2
  952. cmp counterI, vec_lenx2
  953. bge .Ldgemm_kernel_L4_Mv2_20
  954. sub counterI, origM, counterI
  955. cmp counterI, origM
  956. beq .Ldgemm_kernel_L4_END
  957. //////////////////////////////////
  958. // We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
  959. .Ldgemm_kernel_L4_Mv1_BEGIN:
  960. whilelt p1.d, counterI, origM //SVE instruction
  961. cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
  962. .align 5
  963. .Ldgemm_kernel_L4_Mv1_20:
  964. mov pB, origPB
  965. INITv1x4 // fill with zeros
  966. asr counterL , origK, #3 // L = K / 8
  967. cmp counterL , #0 // is there at least 8 to do?
  968. ble .Ldgemm_kernel_L4_Mv1_44
  969. .align 5
  970. .Ldgemm_kernel_L4_Mv1_22:
  971. KERNELv1x4_SUB
  972. KERNELv1x4_SUB
  973. KERNELv1x4_SUB
  974. KERNELv1x4_SUB
  975. KERNELv1x4_SUB
  976. KERNELv1x4_SUB
  977. KERNELv1x4_SUB
  978. KERNELv1x4_SUB
  979. subs counterL, counterL, #1
  980. bgt .Ldgemm_kernel_L4_Mv1_22
  981. .Ldgemm_kernel_L4_Mv1_44:
  982. ands counterL , origK, #7
  983. ble .Ldgemm_kernel_L4_Mv1_100
  984. .align 5
  985. .Ldgemm_kernel_L4_Mv1_46:
  986. KERNELv1x4_SUB
  987. subs counterL, counterL, #1
  988. bne .Ldgemm_kernel_L4_Mv1_46
  989. .Ldgemm_kernel_L4_Mv1_100:
  990. SAVEv1x4
  991. .Ldgemm_kernel_L4_Mv1_END:
  992. incd counterI
  993. whilelt p1.d, counterI, origM //SVE instruction
  994. cntp lanes, p0, p1.d
  995. b.any .Ldgemm_kernel_L4_Mv1_20
  996. .Ldgemm_kernel_L4_END:
  997. lsl temp, origK, #5
  998. add origPB, origPB, temp // B = B + K * 4 * 8
  999. /******************************************************************************/
  1000. /* Repeat the same thing if 2 left in N */
  1001. .align 5
  1002. .Ldgemm_kernel_L2_BEGIN:
  1003. mov counterJ , origN
  1004. tst counterJ , #2
  1005. ble .Ldgemm_kernel_L1_BEGIN
  1006. mov pCRow0, pC
  1007. add pC, pC, LDC, lsl #1 // add 2 x LDC
  1008. mov pA1, origPA // pA1 = start of A array
  1009. .Ldgemm_kernel_L2_Mv2_BEGIN:
  1010. mov counterI, #0
  1011. cmp origM, vec_lenx2
  1012. blt .Ldgemm_kernel_L2_Mv1_BEGIN
  1013. mov counterI, origM
  1014. mul temp, vec_len, origK // generate address of pA2
  1015. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1016. .align 5
  1017. .Ldgemm_kernel_L2_Mv2_20:
  1018. mov pB, origPB
  1019. INITv2x2 // fill with zeros
  1020. asr counterL , origK, #3 // L = K / 8
  1021. cmp counterL , #0 // is there at least 8 to do?
  1022. ble .Ldgemm_kernel_L2_Mv2_44
  1023. .align 5
  1024. .Ldgemm_kernel_L2_Mv2_22:
  1025. KERNELv2x2_SUB
  1026. KERNELv2x2_SUB
  1027. KERNELv2x2_SUB
  1028. KERNELv2x2_SUB
  1029. KERNELv2x2_SUB
  1030. KERNELv2x2_SUB
  1031. KERNELv2x2_SUB
  1032. KERNELv2x2_SUB
  1033. subs counterL, counterL, #1
  1034. bgt .Ldgemm_kernel_L2_Mv2_22
  1035. .Ldgemm_kernel_L2_Mv2_44:
  1036. ands counterL , origK, #7
  1037. ble .Ldgemm_kernel_L2_Mv2_100
  1038. .align 5
  1039. .Ldgemm_kernel_L2_Mv2_46:
  1040. KERNELv2x2_SUB
  1041. subs counterL, counterL, #1
  1042. bne .Ldgemm_kernel_L2_Mv2_46
  1043. .Ldgemm_kernel_L2_Mv2_100:
  1044. SAVEv2x2
  1045. mov pA1, pA2 // pA1 = pA2
  1046. mul temp, vec_len, origK // generate address of pA2
  1047. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1048. .Ldgemm_kernel_L2_Mv2_END:
  1049. sub counterI, counterI, vec_lenx2
  1050. cmp counterI, vec_lenx2
  1051. bge .Ldgemm_kernel_L2_Mv2_20
  1052. sub counterI, origM, counterI
  1053. cmp counterI, origM
  1054. beq .Ldgemm_kernel_L2_END
  1055. //////////////////////////////////
  1056. // We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
  1057. .Ldgemm_kernel_L2_Mv1_BEGIN:
  1058. whilelt p1.d, counterI, origM //SVE instruction
  1059. cntp lanes, p0, p1.d
  1060. .align 5
  1061. .Ldgemm_kernel_L2_Mv1_20:
  1062. mov pB, origPB
  1063. INITv1x2 // fill with zeros
  1064. asr counterL , origK, #3 // L = K / 8
  1065. cmp counterL , #0 // is there at least 8 to do?
  1066. ble .Ldgemm_kernel_L2_Mv1_44
  1067. .align 5
  1068. .Ldgemm_kernel_L2_Mv1_22:
  1069. KERNELv1x2_SUB
  1070. KERNELv1x2_SUB
  1071. KERNELv1x2_SUB
  1072. KERNELv1x2_SUB
  1073. KERNELv1x2_SUB
  1074. KERNELv1x2_SUB
  1075. KERNELv1x2_SUB
  1076. KERNELv1x2_SUB
  1077. subs counterL, counterL, #1
  1078. bgt .Ldgemm_kernel_L2_Mv1_22
  1079. .Ldgemm_kernel_L2_Mv1_44:
  1080. ands counterL , origK, #7
  1081. ble .Ldgemm_kernel_L2_Mv1_100
  1082. .align 5
  1083. .Ldgemm_kernel_L2_Mv1_46:
  1084. KERNELv1x2_SUB
  1085. subs counterL, counterL, #1
  1086. bne .Ldgemm_kernel_L2_Mv1_46
  1087. .Ldgemm_kernel_L2_Mv1_100:
  1088. SAVEv1x2
  1089. .Ldgemm_kernel_L2_Mv1_END:
  1090. incd counterI
  1091. whilelt p1.d, counterI, origM //SVE instruction
  1092. cntp lanes, p0, p1.d
  1093. b.any .Ldgemm_kernel_L2_Mv1_20
  1094. .Ldgemm_kernel_L2_END:
  1095. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1096. /******************************************************************************/
  1097. /* Repeat the same thing if 1 left in N */
  1098. .align 5
  1099. .Ldgemm_kernel_L1_BEGIN:
  1100. mov counterJ , origN
  1101. tst counterJ , #1
  1102. ble .Ldgemm_kernel_L999 // done
  1103. mov pCRow0, pC
  1104. add pC, pC, LDC // add 1 x LDC
  1105. mov pA1, origPA // pA1 = start of A array
  1106. .Ldgemm_kernel_L1_Mv2_BEGIN:
  1107. mov counterI, #0
  1108. cmp origM, vec_lenx2
  1109. blt .Ldgemm_kernel_L1_Mv1_BEGIN
  1110. mov counterI, origM
  1111. mul temp, vec_len, origK // generate address of pA2
  1112. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1113. .align 5
  1114. .Ldgemm_kernel_L1_Mv2_20:
  1115. mov pB, origPB
  1116. INITv2x1 // fill with zeros
  1117. asr counterL , origK, #3 // L = K / 8
  1118. cmp counterL , #0 // is there at least 8 to do?
  1119. ble .Ldgemm_kernel_L1_Mv2_44
  1120. .align 5
  1121. .Ldgemm_kernel_L1_Mv2_22:
  1122. KERNELv2x1_SUB
  1123. KERNELv2x1_SUB
  1124. KERNELv2x1_SUB
  1125. KERNELv2x1_SUB
  1126. KERNELv2x1_SUB
  1127. KERNELv2x1_SUB
  1128. KERNELv2x1_SUB
  1129. KERNELv2x1_SUB
  1130. subs counterL, counterL, #1
  1131. bgt .Ldgemm_kernel_L1_Mv2_22
  1132. .Ldgemm_kernel_L1_Mv2_44:
  1133. ands counterL , origK, #7
  1134. ble .Ldgemm_kernel_L1_Mv2_100
  1135. .align 5
  1136. .Ldgemm_kernel_L1_Mv2_46:
  1137. KERNELv2x1_SUB
  1138. subs counterL, counterL, #1
  1139. bgt .Ldgemm_kernel_L1_Mv2_46
  1140. .Ldgemm_kernel_L1_Mv2_100:
  1141. SAVEv2x1
  1142. mov pA1, pA2 // pA1 = pA2
  1143. mul temp, vec_len, origK // generate address of pA2
  1144. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1145. .Ldgemm_kernel_L1_Mv2_END:
  1146. sub counterI, counterI, vec_lenx2
  1147. cmp counterI, vec_lenx2
  1148. bge .Ldgemm_kernel_L1_Mv2_20
  1149. sub counterI, origM, counterI
  1150. cmp counterI, origM
  1151. beq .Ldgemm_kernel_L1_END
  1152. //////////////////////////////////
  1153. // We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
  1154. .Ldgemm_kernel_L1_Mv1_BEGIN:
  1155. whilelt p1.d, counterI, origM //SVE instruction
  1156. cntp lanes, p0, p1.d
  1157. .align 5
  1158. .Ldgemm_kernel_L1_Mv1_20:
  1159. mov pB, origPB
  1160. INITv1x1 // fill with zeros
  1161. asr counterL , origK, #3 // L = K / 8
  1162. cmp counterL , #0 // is there at least 8 to do?
  1163. ble .Ldgemm_kernel_L1_Mv1_44
  1164. .align 5
  1165. .Ldgemm_kernel_L1_Mv1_22:
  1166. KERNELv1x1_SUB
  1167. KERNELv1x1_SUB
  1168. KERNELv1x1_SUB
  1169. KERNELv1x1_SUB
  1170. KERNELv1x1_SUB
  1171. KERNELv1x1_SUB
  1172. KERNELv1x1_SUB
  1173. KERNELv1x1_SUB
  1174. subs counterL, counterL, #1
  1175. bgt .Ldgemm_kernel_L1_Mv1_22
  1176. .Ldgemm_kernel_L1_Mv1_44:
  1177. ands counterL , origK, #7
  1178. ble .Ldgemm_kernel_L1_Mv1_100
  1179. .align 5
  1180. .Ldgemm_kernel_L1_Mv1_46:
  1181. KERNELv1x1_SUB
  1182. subs counterL, counterL, #1
  1183. bgt .Ldgemm_kernel_L1_Mv1_46
  1184. .Ldgemm_kernel_L1_Mv1_100:
  1185. SAVEv1x1
  1186. .Ldgemm_kernel_L1_Mv1_END:
  1187. incd counterI
  1188. whilelt p1.d, counterI, origM //SVE instruction
  1189. cntp lanes, p0, p1.d
  1190. b.any .Ldgemm_kernel_L1_Mv1_20
  1191. .Ldgemm_kernel_L1_END:
  1192. /******************************************************************************/
  1193. .Ldgemm_kernel_L999:
  1194. mov x0, #0 // set return value
  1195. ldp d8, d9, [sp, #(0 * 16)]
  1196. ldp d10, d11, [sp, #(1 * 16)]
  1197. ldp d12, d13, [sp, #(2 * 16)]
  1198. ldp d14, d15, [sp, #(3 * 16)]
  1199. ldp d16, d17, [sp, #(4 * 16)]
  1200. ldp x18, x19, [sp, #(5 * 16)]
  1201. ldp x20, x21, [sp, #(6 * 16)]
  1202. ldp x22, x23, [sp, #(7 * 16)]
  1203. ldp x24, x25, [sp, #(8 * 16)]
  1204. ldp x26, x27, [sp, #(9 * 16)]
  1205. ldr x28, [sp, #(10 * 16)]
  1206. add sp, sp, #(11*16)
  1207. ret
  1208. EPILOGUE