You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_sve_v2x8.S 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. /* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
  28. However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
  29. This means that we sweep two panels of packed A when iterating in a loop over K.
  30. With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */
  31. #define ASSEMBLER
  32. #include "common.h"
  33. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  34. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  35. #define origM x0 /* arg bm, passed in x0 */
  36. #define origN x1 /* arg bn, passed in x1 */
  37. #define origK x2 /* arg bk, passed in x2 */
  38. #define origPA x3 /* arg ba (packed A), passed in x3 */
  39. #define origPB x4 /* arg bb, passed in x4 */
  40. #define pC x5 /* arg C, passed in x5 */
  41. #define LDC x6 /* arg ldc; the prologue scales it to bytes (lsl #3) */
  42. #define temp x7
  43. #define counterL x8
  44. #define counterI x9
  45. #define counterJ x10 /* loop counter over N */
  46. #define pB x11 /* running pointer into B, advanced by the KERNEL macros */
  47. #define pCRow0 x12 /* C pointer of the current tile, advanced by the SAVE macros */
  48. #define pCRow1 x13 /* scratch C column pointer inside the SAVE macros */
  49. #define pCRow2 x14 /* scratch C column pointer inside the SAVE macros */
  50. #define lanes x15 /* element count the v1 macros use to step pA1 and pCRow0 */
  51. #define pA1 x16 /* running pointer into the first A panel */
  52. #define pA2 x17 /* running pointer into the second A panel */
  53. #define alpha x18 /* GPR copy of alpha. NOTE(review): x18 is the AAPCS64 platform register - confirm it is free on every target OS */
  54. #define vec_len x19 /* doubles per SVE vector (result of cntd) */
  55. #define vec_lenx2 x20 /* 2 * vec_len */
  56. #define alpha0 d10 /* NOTE(review): not referenced in the visible code; d10 overlaps z10, which the kernels overwrite */
  57. #define alphaZ z7.d /* alpha broadcast to every lane */
  58. #define A_PRE_SIZE 1536 /* prefetch distance in bytes ahead of pA1/pA2 */
  59. #define B_PRE_SIZE 512 /* prefetch distance in bytes ahead of pB */
  60. #define C_PRE_SIZE 128 /* prefetch distance in bytes ahead of the C pointers */
  61. // 00 origM
  62. // 01 origN
  63. // 02 origK
  64. // 03 origPA
  65. // 04 origPB
  66. // 05 pC
  67. // 06 origLDC -> LDC
  68. // 07 temp
  69. // 08 counterL
  70. // 09 counterI
  71. // 10 counterJ
  72. // 11 pB
  73. // 12 pCRow0
  74. // 13 pCRow1
  75. // 14 pCRow2
  76. // 15 lanes
  77. // 16 pA1
  78. // 17 pA2
  79. // 18 must save alpha
  80. // 19 must save vec_len
  81. // 20 must save vec_lenx2
  82. // 21 must save
  83. // 22 must save
  84. // 23 must save
  85. // 24 must save
  86. // 25 must save
  87. // 26 must save
  88. // 27 must save
  89. // 28 must save
  90. // 29 frame
  91. // 30 link
  92. // 31 sp
  93. //v00 ALPHA -> pA10_0
  94. //v01 pA10_1
  95. //v02 pA20_0
  96. //v03 pA20_1
  97. //v04
  98. //v05
  99. //v06
  100. //v07 ALPHA0
  101. //v08 must save pB0_0
  102. //v09 must save pB0_1
  103. //v10 must save pB0_2
  104. //v11 must save pB0_3
  105. //v12 must save pB0_4
  106. //v13 must save pB0_5
  107. //v14 must save pB0_6
  108. //v15 must save pB0_7
  109. //v16 must save C0
  110. //v17 must save C1
  111. //v18 must save C2
  112. //v19 must save C3
  113. //v20 must save C4
  114. //v21 must save C5
  115. //v22 must save C6
  116. //v23 must save C7
  117. //v24 must save C8
  118. //v25 must save C9
  119. //v26 must save C10
  120. //v27 must save C11
  121. //v28 must save C12
  122. //v29 must save C13
  123. //v30 must save C14
  124. //v31 must save C15
  125. /*******************************************************************************
  126. * Macro definitions
  127. *******************************************************************************/
  128. .macro INITv2x8 // zero the sixteen accumulators z16-z31 used by the v2x8 kernels
  129. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  130. mov z17.d, #0
  131. mov z18.d, #0
  132. mov z19.d, #0
  133. mov z20.d, #0
  134. mov z21.d, #0
  135. mov z22.d, #0
  136. mov z23.d, #0
  137. mov z24.d, #0
  138. mov z25.d, #0
  139. mov z26.d, #0
  140. mov z27.d, #0
  141. mov z28.d, #0
  142. mov z29.d, #0
  143. mov z30.d, #0
  144. mov z31.d, #0
  145. .endm
  146. .macro KERNELv2x8_I // pipeline start: loads two steps of A, multiplies the first, leaves z2/z3 and the following B values pending
  147. ld1d z0.d, p0/z, [pA1] // panel 1, this step
  148. ld1d z1.d, p0/z, [pA2] // panel 2, this step
  149. ld1d z2.d, p0/z, [pA1, vec_len, lsl #3] // panel 1, following step (vec_len doubles further on)
  150. ld1d z3.d, p0/z, [pA2, vec_len, lsl #3] // panel 2, following step
  151. add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2
  152. add pA2, pA2, vec_len, lsl #4 // pA2 = pA2 + vec_len * 8 * 2
  153. ld1rd z8.d, p0/z, [pB] // z8-z15 = eight consecutive B values, each broadcast to all lanes
  154. ld1rd z9.d, p0/z, [pB, 8]
  155. ld1rd z10.d, p0/z, [pB, 16]
  156. ld1rd z11.d, p0/z, [pB, 24]
  157. ld1rd z12.d, p0/z, [pB, 32]
  158. ld1rd z13.d, p0/z, [pB, 40]
  159. ld1rd z14.d, p0/z, [pB, 48]
  160. ld1rd z15.d, p0/z, [pB, 56]
  161. add pB, pB, 64 // eight doubles consumed
  162. fmla z16.d, p0/m, z0.d, z8.d // even accumulators take panel 1 (z0), odd ones panel 2 (z1)
  163. fmla z17.d, p0/m, z1.d, z8.d
  164. ld1rd z8.d, p0/z, [pB] // z8 is free again: refill it with the next B value
  165. fmla z18.d, p0/m, z0.d, z9.d
  166. fmla z19.d, p0/m, z1.d, z9.d
  167. ld1rd z9.d, p0/z, [pB, 8]
  168. fmla z20.d, p0/m, z0.d, z10.d
  169. fmla z21.d, p0/m, z1.d, z10.d
  170. ld1rd z10.d, p0/z, [pB, 16]
  171. fmla z22.d, p0/m, z0.d, z11.d
  172. fmla z23.d, p0/m, z1.d, z11.d
  173. ld1rd z11.d, p0/z, [pB, 24]
  174. fmla z24.d, p0/m, z0.d, z12.d
  175. fmla z25.d, p0/m, z1.d, z12.d
  176. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  177. ld1rd z12.d, p0/z, [pB, 32]
  178. fmla z26.d, p0/m, z0.d, z13.d
  179. fmla z27.d, p0/m, z1.d, z13.d
  180. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE]
  181. ld1rd z13.d, p0/z, [pB, 40]
  182. fmla z28.d, p0/m, z0.d, z14.d
  183. fmla z29.d, p0/m, z1.d, z14.d
  184. ld1rd z14.d, p0/z, [pB, 48]
  185. fmla z30.d, p0/m, z0.d, z15.d
  186. fmla z31.d, p0/m, z1.d, z15.d
  187. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64]
  188. ld1rd z15.d, p0/z, [pB, 56]
  189. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64]
  190. add pB, pB, 64 // second group of eight B values is now held in z8-z15
  191. .endm
  192. .macro KERNELv2x8_M1 // pipelined step: multiplies z0/z1 by the B values already in z8-z15 while loading z2/z3 and the next B values
  193. ld1d z2.d, p0/z, [pA1] // A for the step after this one, panel 1
  194. ld1d z3.d, p0/z, [pA2] // A for the step after this one, panel 2
  195. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  196. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  197. fmla z16.d, p0/m, z0.d, z8.d
  198. fmla z17.d, p0/m, z1.d, z8.d
  199. ld1rd z8.d, p0/z, [pB] // each B register is refilled right after its last use
  200. fmla z18.d, p0/m, z0.d, z9.d
  201. fmla z19.d, p0/m, z1.d, z9.d
  202. ld1rd z9.d, p0/z, [pB, 8]
  203. fmla z20.d, p0/m, z0.d, z10.d
  204. fmla z21.d, p0/m, z1.d, z10.d
  205. ld1rd z10.d, p0/z, [pB, 16]
  206. fmla z22.d, p0/m, z0.d, z11.d
  207. fmla z23.d, p0/m, z1.d, z11.d
  208. ld1rd z11.d, p0/z, [pB, 24]
  209. fmla z24.d, p0/m, z0.d, z12.d
  210. fmla z25.d, p0/m, z1.d, z12.d
  211. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  212. ld1rd z12.d, p0/z, [pB, 32]
  213. fmla z26.d, p0/m, z0.d, z13.d
  214. fmla z27.d, p0/m, z1.d, z13.d
  215. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE]
  216. ld1rd z13.d, p0/z, [pB, 40]
  217. fmla z28.d, p0/m, z0.d, z14.d
  218. fmla z29.d, p0/m, z1.d, z14.d
  219. ld1rd z14.d, p0/z, [pB, 48]
  220. fmla z30.d, p0/m, z0.d, z15.d
  221. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64]
  222. fmla z31.d, p0/m, z1.d, z15.d
  223. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64]
  224. ld1rd z15.d, p0/z, [pB, 56]
  225. add pB, pB, 64 // eight doubles consumed
  226. .endm
  227. .macro KERNELv2x8_M2 // pipelined step: multiplies z2/z3 by the B values already in z8-z15 while loading z0/z1 and the next B values
  228. ld1d z0.d, p0/z, [pA1] // A for the step after this one, panel 1
  229. ld1d z1.d, p0/z, [pA2] // A for the step after this one, panel 2
  230. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  231. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  232. fmla z16.d, p0/m, z2.d, z8.d
  233. fmla z17.d, p0/m, z3.d, z8.d
  234. ld1rd z8.d, p0/z, [pB] // each B register is refilled right after its last use
  235. fmla z18.d, p0/m, z2.d, z9.d
  236. fmla z19.d, p0/m, z3.d, z9.d
  237. ld1rd z9.d, p0/z, [pB, 8]
  238. fmla z20.d, p0/m, z2.d, z10.d
  239. fmla z21.d, p0/m, z3.d, z10.d
  240. ld1rd z10.d, p0/z, [pB, 16]
  241. fmla z22.d, p0/m, z2.d, z11.d
  242. fmla z23.d, p0/m, z3.d, z11.d
  243. ld1rd z11.d, p0/z, [pB, 24]
  244. fmla z24.d, p0/m, z2.d, z12.d
  245. fmla z25.d, p0/m, z3.d, z12.d
  246. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // this variant prefetches B; KERNELv2x8_M1 prefetches A
  247. ld1rd z12.d, p0/z, [pB, 32]
  248. fmla z26.d, p0/m, z2.d, z13.d
  249. fmla z27.d, p0/m, z3.d, z13.d
  250. ld1rd z13.d, p0/z, [pB, 40]
  251. fmla z28.d, p0/m, z2.d, z14.d
  252. fmla z29.d, p0/m, z3.d, z14.d
  253. ld1rd z14.d, p0/z, [pB, 48]
  254. fmla z30.d, p0/m, z2.d, z15.d
  255. fmla z31.d, p0/m, z3.d, z15.d
  256. ld1rd z15.d, p0/z, [pB, 56]
  257. add pB, pB, 64 // eight doubles consumed
  258. .endm
  259. .macro KERNELv2x8_E // pipeline drain: multiplies the pending z2/z3 by z8-z15; loads nothing and moves no pointer
  260. fmla z16.d, p0/m, z2.d, z8.d
  261. fmla z17.d, p0/m, z3.d, z8.d
  262. fmla z18.d, p0/m, z2.d, z9.d
  263. fmla z19.d, p0/m, z3.d, z9.d
  264. fmla z20.d, p0/m, z2.d, z10.d
  265. fmla z21.d, p0/m, z3.d, z10.d
  266. fmla z22.d, p0/m, z2.d, z11.d
  267. fmla z23.d, p0/m, z3.d, z11.d
  268. fmla z24.d, p0/m, z2.d, z12.d
  269. fmla z25.d, p0/m, z3.d, z12.d
  270. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  271. fmla z26.d, p0/m, z2.d, z13.d
  272. fmla z27.d, p0/m, z3.d, z13.d
  273. fmla z28.d, p0/m, z2.d, z14.d
  274. fmla z29.d, p0/m, z3.d, z14.d
  275. fmla z30.d, p0/m, z2.d, z15.d
  276. fmla z31.d, p0/m, z3.d, z15.d
  277. .endm
  278. .macro KERNELv2x8_SUB // one self-contained step: loads A (z0/z1) and eight B values, then accumulates into z16-z31
  279. ld1d z0.d, p0/z, [pA1] // panel 1
  280. ld1d z1.d, p0/z, [pA2] // panel 2
  281. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  282. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  283. ld1rd z8.d, p0/z, [pB] // z8-z15 = eight consecutive B values, each broadcast to all lanes
  284. ld1rd z9.d, p0/z, [pB, 8]
  285. ld1rd z10.d, p0/z, [pB, 16]
  286. ld1rd z11.d, p0/z, [pB, 24]
  287. ld1rd z12.d, p0/z, [pB, 32]
  288. ld1rd z13.d, p0/z, [pB, 40]
  289. ld1rd z14.d, p0/z, [pB, 48]
  290. ld1rd z15.d, p0/z, [pB, 56]
  291. add pB, pB, 64 // eight doubles consumed
  292. fmla z16.d, p0/m, z0.d, z8.d
  293. fmla z17.d, p0/m, z1.d, z8.d
  294. fmla z18.d, p0/m, z0.d, z9.d
  295. fmla z19.d, p0/m, z1.d, z9.d
  296. fmla z20.d, p0/m, z0.d, z10.d
  297. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  298. fmla z21.d, p0/m, z1.d, z10.d
  299. fmla z22.d, p0/m, z0.d, z11.d
  300. fmla z23.d, p0/m, z1.d, z11.d
  301. fmla z24.d, p0/m, z0.d, z12.d
  302. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE]
  303. fmla z25.d, p0/m, z1.d, z12.d
  304. fmla z26.d, p0/m, z0.d, z13.d
  305. fmla z27.d, p0/m, z1.d, z13.d
  306. fmla z28.d, p0/m, z0.d, z14.d
  307. fmla z29.d, p0/m, z1.d, z14.d
  308. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  309. fmla z30.d, p0/m, z0.d, z15.d
  310. fmla z31.d, p0/m, z1.d, z15.d
  311. .endm
  312. .macro SAVEv2x8 // C += alpha * accumulators for 8 columns of two vectors each; z8-z15 are reused as scratch for C
  313. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  314. add pCRow1, pCRow0, LDC // pCRow1/pCRow2 leapfrog one column (LDC bytes) at a time
  315. ld1d z8.d, p0/z, [pCRow0] // column 0, first vector
  316. ld1d z9.d, p0/z, [pCRow0, #1, mul vl] // column 0, second vector
  317. fmla z8.d, p0/m, z16.d, alphaZ // z8 = C + acc * alpha
  318. fmla z9.d, p0/m, z17.d, alphaZ
  319. st1d z8.d, p0, [pCRow0]
  320. st1d z9.d, p0, [pCRow0, #1, mul vl]
  321. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  322. add pCRow2, pCRow1, LDC
  323. ld1d z10.d, p0/z, [pCRow1] // column 1
  324. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  325. fmla z10.d, p0/m, z18.d, alphaZ
  326. fmla z11.d, p0/m, z19.d, alphaZ
  327. st1d z10.d, p0, [pCRow1]
  328. st1d z11.d, p0, [pCRow1, #1, mul vl]
  329. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  330. add pCRow1, pCRow2, LDC
  331. ld1d z12.d, p0/z, [pCRow2] // column 2
  332. ld1d z13.d, p0/z, [pCRow2, #1, mul vl]
  333. fmla z12.d, p0/m, z20.d, alphaZ
  334. fmla z13.d, p0/m, z21.d, alphaZ
  335. st1d z12.d, p0, [pCRow2]
  336. st1d z13.d, p0, [pCRow2, #1, mul vl]
  337. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  338. add pCRow2, pCRow1, LDC
  339. ld1d z14.d, p0/z, [pCRow1] // column 3
  340. ld1d z15.d, p0/z, [pCRow1, #1, mul vl]
  341. fmla z14.d, p0/m, z22.d, alphaZ
  342. fmla z15.d, p0/m, z23.d, alphaZ
  343. st1d z14.d, p0, [pCRow1]
  344. st1d z15.d, p0, [pCRow1, #1, mul vl]
  345. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  346. add pCRow1, pCRow2, LDC
  347. ld1d z8.d, p0/z, [pCRow2] // column 4
  348. ld1d z9.d, p0/z, [pCRow2, #1, mul vl]
  349. fmla z8.d, p0/m, z24.d, alphaZ
  350. fmla z9.d, p0/m, z25.d, alphaZ
  351. st1d z8.d, p0, [pCRow2]
  352. st1d z9.d, p0, [pCRow2, #1, mul vl]
  353. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  354. add pCRow2, pCRow1, LDC
  355. ld1d z10.d, p0/z, [pCRow1] // column 5
  356. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  357. fmla z10.d, p0/m, z26.d, alphaZ
  358. fmla z11.d, p0/m, z27.d, alphaZ
  359. st1d z10.d, p0, [pCRow1]
  360. st1d z11.d, p0, [pCRow1, #1, mul vl]
  361. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  362. add pCRow1, pCRow2, LDC
  363. ld1d z12.d, p0/z, [pCRow2] // column 6
  364. ld1d z13.d, p0/z, [pCRow2, #1, mul vl]
  365. fmla z12.d, p0/m, z28.d, alphaZ
  366. fmla z13.d, p0/m, z29.d, alphaZ
  367. st1d z12.d, p0, [pCRow2]
  368. st1d z13.d, p0, [pCRow2, #1, mul vl]
  369. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  370. ld1d z14.d, p0/z, [pCRow1] // column 7
  371. ld1d z15.d, p0/z, [pCRow1, #1, mul vl]
  372. fmla z14.d, p0/m, z30.d, alphaZ
  373. fmla z15.d, p0/m, z31.d, alphaZ
  374. st1d z14.d, p0, [pCRow1]
  375. st1d z15.d, p0, [pCRow1, #1, mul vl]
  376. add pCRow0, pCRow0, vec_len, lsl #4 // pCRow0 = pCRow0 + vec_len * 8 * 2 (step past the two vectors just written)
  377. .endm
  378. .macro INITv2x4 // zero the eight accumulators z16-z23 used by the v2x4 kernel
  379. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  380. mov z17.d, #0
  381. mov z18.d, #0
  382. mov z19.d, #0
  383. mov z20.d, #0
  384. mov z21.d, #0
  385. mov z22.d, #0
  386. mov z23.d, #0
  387. .endm
  388. .macro KERNELv2x4_SUB // one self-contained step: loads A (z0/z1) and four B values, then accumulates into z16-z23
  389. ld1d z0.d, p0/z, [pA1] // panel 1
  390. ld1d z1.d, p0/z, [pA2] // panel 2
  391. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  392. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  393. ld1rd z8.d, p0/z, [pB] // z8-z11 = four consecutive B values, each broadcast to all lanes
  394. ld1rd z9.d, p0/z, [pB, 8]
  395. ld1rd z10.d, p0/z, [pB, 16]
  396. ld1rd z11.d, p0/z, [pB, 24]
  397. add pB, pB, 32 // four doubles consumed
  398. fmla z16.d, p0/m, z0.d, z8.d
  399. fmla z17.d, p0/m, z1.d, z8.d
  400. fmla z18.d, p0/m, z0.d, z9.d
  401. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  402. fmla z19.d, p0/m, z1.d, z9.d
  403. fmla z20.d, p0/m, z0.d, z10.d
  404. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE]
  405. fmla z21.d, p0/m, z1.d, z10.d
  406. fmla z22.d, p0/m, z0.d, z11.d
  407. fmla z23.d, p0/m, z1.d, z11.d
  408. .endm
  409. .macro SAVEv2x4 // C += alpha * accumulators for 4 columns of two vectors each; z8-z15 are reused as scratch for C
  410. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  411. add pCRow1, pCRow0, LDC // pCRow1/pCRow2 leapfrog one column (LDC bytes) at a time
  412. ld1d z8.d, p0/z, [pCRow0] // column 0, first vector
  413. ld1d z9.d, p0/z, [pCRow0, #1, mul vl] // column 0, second vector
  414. fmla z8.d, p0/m, z16.d, alphaZ // z8 = C + acc * alpha
  415. fmla z9.d, p0/m, z17.d, alphaZ
  416. st1d z8.d, p0, [pCRow0]
  417. st1d z9.d, p0, [pCRow0, #1, mul vl]
  418. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  419. add pCRow2, pCRow1, LDC
  420. ld1d z10.d, p0/z, [pCRow1] // column 1
  421. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  422. fmla z10.d, p0/m, z18.d, alphaZ
  423. fmla z11.d, p0/m, z19.d, alphaZ
  424. st1d z10.d, p0, [pCRow1]
  425. st1d z11.d, p0, [pCRow1, #1, mul vl]
  426. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  427. add pCRow1, pCRow2, LDC
  428. ld1d z12.d, p0/z, [pCRow2] // column 2
  429. ld1d z13.d, p0/z, [pCRow2, #1, mul vl]
  430. fmla z12.d, p0/m, z20.d, alphaZ
  431. fmla z13.d, p0/m, z21.d, alphaZ
  432. st1d z12.d, p0, [pCRow2]
  433. st1d z13.d, p0, [pCRow2, #1, mul vl]
  434. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  435. ld1d z14.d, p0/z, [pCRow1] // column 3
  436. ld1d z15.d, p0/z, [pCRow1, #1, mul vl]
  437. fmla z14.d, p0/m, z22.d, alphaZ
  438. fmla z15.d, p0/m, z23.d, alphaZ
  439. st1d z14.d, p0, [pCRow1]
  440. st1d z15.d, p0, [pCRow1, #1, mul vl]
  441. add pCRow0, pCRow0, vec_len, lsl #4 // pCRow0 = pCRow0 + vec_len * 8 * 2 (step past the two vectors just written)
  442. .endm
  443. .macro INITv2x2 // zero the four accumulators z16-z19 used by the v2x2 kernel
  444. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  445. mov z17.d, #0
  446. mov z18.d, #0
  447. mov z19.d, #0
  448. .endm
  449. .macro KERNELv2x2_SUB // one self-contained step: loads A (z0/z1) and two B values, then accumulates into z16-z19
  450. ld1d z0.d, p0/z, [pA1] // panel 1
  451. ld1d z1.d, p0/z, [pA2] // panel 2
  452. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  453. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  454. ld1rd z8.d, p0/z, [pB] // z8/z9 = two consecutive B values, each broadcast to all lanes
  455. ld1rd z9.d, p0/z, [pB, 8]
  456. add pB, pB, 16 // two doubles consumed
  457. fmla z16.d, p0/m, z0.d, z8.d
  458. fmla z17.d, p0/m, z1.d, z8.d
  459. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  460. fmla z18.d, p0/m, z0.d, z9.d
  461. fmla z19.d, p0/m, z1.d, z9.d
  462. prfm PLDL1KEEP, [pA2, #A_PRE_SIZE]
  463. .endm
  464. .macro SAVEv2x2 // C += alpha * accumulators for 2 columns of two vectors each; z8-z11 are reused as scratch for C
  465. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  466. add pCRow1, pCRow0, LDC // second column starts LDC bytes after the first
  467. ld1d z8.d, p0/z, [pCRow0] // column 0, first vector
  468. ld1d z9.d, p0/z, [pCRow0, #1, mul vl] // column 0, second vector
  469. fmla z8.d, p0/m, z16.d, alphaZ // z8 = C + acc * alpha
  470. fmla z9.d, p0/m, z17.d, alphaZ
  471. st1d z8.d, p0, [pCRow0]
  472. st1d z9.d, p0, [pCRow0, #1, mul vl]
  473. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  474. ld1d z10.d, p0/z, [pCRow1] // column 1
  475. ld1d z11.d, p0/z, [pCRow1, #1, mul vl]
  476. fmla z10.d, p0/m, z18.d, alphaZ
  477. fmla z11.d, p0/m, z19.d, alphaZ
  478. st1d z10.d, p0, [pCRow1]
  479. st1d z11.d, p0, [pCRow1, #1, mul vl]
  480. // no prefetch through pCRow2 here: this macro never computes pCRow2, so its value would be stale
  481. add pCRow0, pCRow0, vec_len, lsl #4 // pCRow0 = pCRow0 + vec_len * 8 * 2 (step past the two vectors just written)
  482. .endm
  483. .macro INITv2x1 // zero the two accumulators z16/z17 used by the v2x1 kernel
  484. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  485. mov z17.d, #0
  486. .endm
  487. .macro KERNELv2x1_SUB // one self-contained step: loads A (z0/z1) and one B value, then accumulates into z16/z17
  488. ld1d z0.d, p0/z, [pA1] // panel 1
  489. ld1d z1.d, p0/z, [pA2] // panel 2
  490. add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8
  491. add pA2, pA2, vec_len, lsl #3 // pA2 = pA2 + vec_len * 8
  492. ld1rd z8.d, p0/z, [pB] // one B value broadcast to all lanes
  493. add pB, pB, 8 // one double consumed
  494. fmla z16.d, p0/m, z0.d, z8.d
  495. fmla z17.d, p0/m, z1.d, z8.d
  496. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] // NOTE(review): pA2 is not prefetched here, unlike KERNELv2x2_SUB - confirm this is intended
  497. .endm
  498. .macro SAVEv2x1 // C += alpha * accumulators for 1 column of two vectors; z8/z9 are reused as scratch for C
  499. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  500. add pCRow1, pCRow0, LDC // NOTE(review): pCRow1 is not read again inside this macro - confirm whether this add is needed
  501. ld1d z8.d, p0/z, [pCRow0] // first vector
  502. ld1d z9.d, p0/z, [pCRow0, #1, mul vl] // second vector
  503. fmla z8.d, p0/m, z16.d, alphaZ // z8 = C + acc * alpha
  504. fmla z9.d, p0/m, z17.d, alphaZ
  505. st1d z8.d, p0, [pCRow0]
  506. st1d z9.d, p0, [pCRow0, #1, mul vl]
  507. add pCRow0, pCRow0, vec_len, lsl #4 // pCRow0 = pCRow0 + vec_len * 8 * 2 (step past the two vectors just written)
  508. .endm
  509. .macro INITv1x8 // zero the eight accumulators z16-z23 used by the v1x8 kernels
  510. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  511. mov z17.d, #0
  512. mov z18.d, #0
  513. mov z19.d, #0
  514. mov z20.d, #0
  515. mov z21.d, #0
  516. mov z22.d, #0
  517. mov z23.d, #0
  518. .endm
  519. .macro KERNELv1x8_I // pipeline start for one A vector: loads two steps of A, multiplies the first, leaves z1 and the following B values pending. p1 and lanes are set outside the visible code - presumably a partial-vector predicate and its lane count; confirm
  520. ld1d z0.d, p1/z, [pA1] // A, this step
  521. ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // A, following step (lanes doubles further on)
  522. add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
  523. ld1rd z8.d, p0/z, [pB] // z8-z15 = eight consecutive B values, each broadcast to all lanes
  524. ld1rd z9.d, p0/z, [pB, 8]
  525. ld1rd z10.d, p0/z, [pB, 16]
  526. ld1rd z11.d, p0/z, [pB, 24]
  527. ld1rd z12.d, p0/z, [pB, 32]
  528. ld1rd z13.d, p0/z, [pB, 40]
  529. ld1rd z14.d, p0/z, [pB, 48]
  530. ld1rd z15.d, p0/z, [pB, 56]
  531. add pB, pB, 64 // eight doubles consumed
  532. fmla z16.d, p1/m, z0.d, z8.d
  533. ld1rd z8.d, p0/z, [pB] // each B register is refilled right after its last use
  534. fmla z17.d, p1/m, z0.d, z9.d
  535. ld1rd z9.d, p0/z, [pB, 8]
  536. fmla z18.d, p1/m, z0.d, z10.d
  537. ld1rd z10.d, p0/z, [pB, 16]
  538. fmla z19.d, p1/m, z0.d, z11.d
  539. ld1rd z11.d, p0/z, [pB, 24]
  540. fmla z20.d, p1/m, z0.d, z12.d
  541. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  542. ld1rd z12.d, p0/z, [pB, 32]
  543. fmla z21.d, p1/m, z0.d, z13.d
  544. ld1rd z13.d, p0/z, [pB, 40]
  545. fmla z22.d, p1/m, z0.d, z14.d
  546. ld1rd z14.d, p0/z, [pB, 48]
  547. fmla z23.d, p1/m, z0.d, z15.d
  548. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64]
  549. ld1rd z15.d, p0/z, [pB, 56]
  550. add pB, pB, 64 // second group of eight B values is now held in z8-z15
  551. .endm
  552. .macro KERNELv1x8_M1 // pipelined step: multiplies z0 by the B values already in z8-z15 while loading z1 and the next B values
  553. ld1d z1.d, p1/z, [pA1] // A for the step after this one
  554. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  555. fmla z16.d, p1/m, z0.d, z8.d
  556. ld1rd z8.d, p0/z, [pB] // each B register is refilled right after its last use
  557. fmla z17.d, p1/m, z0.d, z9.d
  558. ld1rd z9.d, p0/z, [pB, 8]
  559. fmla z18.d, p1/m, z0.d, z10.d
  560. ld1rd z10.d, p0/z, [pB, 16]
  561. fmla z19.d, p1/m, z0.d, z11.d
  562. ld1rd z11.d, p0/z, [pB, 24]
  563. fmla z20.d, p1/m, z0.d, z12.d
  564. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  565. ld1rd z12.d, p0/z, [pB, 32]
  566. fmla z21.d, p1/m, z0.d, z13.d
  567. ld1rd z13.d, p0/z, [pB, 40]
  568. fmla z22.d, p1/m, z0.d, z14.d
  569. ld1rd z14.d, p0/z, [pB, 48]
  570. fmla z23.d, p1/m, z0.d, z15.d
  571. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64]
  572. ld1rd z15.d, p0/z, [pB, 56]
  573. add pB, pB, 64 // eight doubles consumed
  574. .endm
  575. .macro KERNELv1x8_M2 // pipelined step: multiplies z1 by the B values already in z8-z15 while loading z0 and the next B values
  576. ld1d z0.d, p1/z, [pA1] // A for the step after this one
  577. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  578. fmla z16.d, p1/m, z1.d, z8.d
  579. ld1rd z8.d, p0/z, [pB] // each B register is refilled right after its last use
  580. fmla z17.d, p1/m, z1.d, z9.d
  581. ld1rd z9.d, p0/z, [pB, 8]
  582. fmla z18.d, p1/m, z1.d, z10.d
  583. ld1rd z10.d, p0/z, [pB, 16]
  584. fmla z19.d, p1/m, z1.d, z11.d
  585. ld1rd z11.d, p0/z, [pB, 24]
  586. fmla z20.d, p1/m, z1.d, z12.d
  587. ld1rd z12.d, p0/z, [pB, 32]
  588. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // this variant prefetches B; KERNELv1x8_M1 prefetches A
  589. fmla z21.d, p1/m, z1.d, z13.d
  590. ld1rd z13.d, p0/z, [pB, 40]
  591. fmla z22.d, p1/m, z1.d, z14.d
  592. ld1rd z14.d, p0/z, [pB, 48]
  593. fmla z23.d, p1/m, z1.d, z15.d
  594. ld1rd z15.d, p0/z, [pB, 56]
  595. add pB, pB, 64 // eight doubles consumed
  596. .endm
  597. .macro KERNELv1x8_E // pipeline drain: multiplies the pending z1 by z8-z15; loads nothing and moves no pointer
  598. fmla z16.d, p1/m, z1.d, z8.d
  599. fmla z17.d, p1/m, z1.d, z9.d
  600. fmla z18.d, p1/m, z1.d, z10.d
  601. fmla z19.d, p1/m, z1.d, z11.d
  602. fmla z20.d, p1/m, z1.d, z12.d
  603. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  604. fmla z21.d, p1/m, z1.d, z13.d
  605. fmla z22.d, p1/m, z1.d, z14.d
  606. fmla z23.d, p1/m, z1.d, z15.d
  607. .endm
  608. .macro KERNELv1x8_SUB // one self-contained step: loads A (z0, under p1) and eight B values, then accumulates into z16-z23
  609. ld1d z0.d, p1/z, [pA1]
  610. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  611. ld1rd z8.d, p0/z, [pB] // z8-z15 = eight consecutive B values, each broadcast to all lanes
  612. ld1rd z9.d, p0/z, [pB, 8]
  613. ld1rd z10.d, p0/z, [pB, 16]
  614. ld1rd z11.d, p0/z, [pB, 24]
  615. ld1rd z12.d, p0/z, [pB, 32]
  616. ld1rd z13.d, p0/z, [pB, 40]
  617. ld1rd z14.d, p0/z, [pB, 48]
  618. ld1rd z15.d, p0/z, [pB, 56]
  619. add pB, pB, 64 // eight doubles consumed
  620. fmla z16.d, p1/m, z0.d, z8.d
  621. fmla z17.d, p1/m, z0.d, z9.d
  622. fmla z18.d, p1/m, z0.d, z10.d
  623. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  624. fmla z19.d, p1/m, z0.d, z11.d
  625. fmla z20.d, p1/m, z0.d, z12.d
  626. fmla z21.d, p1/m, z0.d, z13.d
  627. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  628. fmla z22.d, p1/m, z0.d, z14.d
  629. fmla z23.d, p1/m, z0.d, z15.d
  630. .endm
  631. .macro SAVEv1x8 // C += alpha * accumulators for 8 columns of one vector each, under predicate p1; z24-z31 are scratch for C
  632. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  633. add pCRow1, pCRow0, LDC // pCRow1/pCRow2 leapfrog one column (LDC bytes) at a time
  634. ld1d z24.d, p1/z, [pCRow0] // column 0
  635. fmla z24.d, p1/m, z16.d, alphaZ // z24 = C + acc * alpha
  636. st1d z24.d, p1, [pCRow0]
  637. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  638. add pCRow2, pCRow1, LDC
  639. ld1d z25.d, p1/z, [pCRow1] // column 1
  640. fmla z25.d, p1/m, z17.d, alphaZ
  641. st1d z25.d, p1, [pCRow1]
  642. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  643. add pCRow1, pCRow2, LDC
  644. ld1d z26.d, p1/z, [pCRow2] // column 2
  645. fmla z26.d, p1/m, z18.d, alphaZ
  646. st1d z26.d, p1, [pCRow2]
  647. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  648. add pCRow2, pCRow1, LDC
  649. ld1d z27.d, p1/z, [pCRow1] // column 3
  650. fmla z27.d, p1/m, z19.d, alphaZ
  651. st1d z27.d, p1, [pCRow1]
  652. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  653. add pCRow1, pCRow2, LDC
  654. ld1d z28.d, p1/z, [pCRow2] // column 4
  655. fmla z28.d, p1/m, z20.d, alphaZ
  656. st1d z28.d, p1, [pCRow2]
  657. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  658. add pCRow2, pCRow1, LDC
  659. ld1d z29.d, p1/z, [pCRow1] // column 5
  660. fmla z29.d, p1/m, z21.d, alphaZ
  661. st1d z29.d, p1, [pCRow1]
  662. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  663. add pCRow1, pCRow2, LDC
  664. ld1d z30.d, p1/z, [pCRow2] // column 6
  665. fmla z30.d, p1/m, z22.d, alphaZ
  666. st1d z30.d, p1, [pCRow2]
  667. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  668. ld1d z31.d, p1/z, [pCRow1] // column 7
  669. fmla z31.d, p1/m, z23.d, alphaZ
  670. st1d z31.d, p1, [pCRow1]
  671. add pCRow0, pCRow0, lanes, lsl #3 // pCRow0 = pCRow0 + lanes * 8
  672. .endm
  673. /******************************************************************************/
  674. .macro INITv1x4 // zero the four accumulators z16-z19 used by the v1x4 kernel
  675. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  676. mov z17.d, #0
  677. mov z18.d, #0
  678. mov z19.d, #0
  679. .endm
  680. .macro KERNELv1x4_SUB // one self-contained step: loads A (z0, under p1) and four B values, then accumulates into z16-z19
  681. ld1d z0.d, p1/z, [pA1]
  682. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  683. ld1rd z8.d, p0/z, [pB] // z8-z11 = four consecutive B values, each broadcast to all lanes
  684. ld1rd z9.d, p0/z, [pB, 8]
  685. ld1rd z10.d, p0/z, [pB, 16]
  686. ld1rd z11.d, p0/z, [pB, 24]
  687. add pB, pB, 32 // four doubles consumed
  688. fmla z16.d, p1/m, z0.d, z8.d
  689. fmla z17.d, p1/m, z0.d, z9.d
  690. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  691. fmla z18.d, p1/m, z0.d, z10.d
  692. fmla z19.d, p1/m, z0.d, z11.d
  693. .endm
  694. .macro SAVEv1x4 // C += alpha * accumulators for 4 columns of one vector each, under predicate p1; z24-z27 are scratch for C
  695. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  696. add pCRow1, pCRow0, LDC // pCRow1/pCRow2 leapfrog one column (LDC bytes) at a time
  697. ld1d z24.d, p1/z, [pCRow0] // column 0
  698. fmla z24.d, p1/m, z16.d, alphaZ // z24 = C + acc * alpha
  699. st1d z24.d, p1, [pCRow0]
  700. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  701. add pCRow2, pCRow1, LDC
  702. ld1d z25.d, p1/z, [pCRow1] // column 1
  703. fmla z25.d, p1/m, z17.d, alphaZ
  704. st1d z25.d, p1, [pCRow1]
  705. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  706. add pCRow1, pCRow2, LDC
  707. ld1d z26.d, p1/z, [pCRow2] // column 2
  708. fmla z26.d, p1/m, z18.d, alphaZ
  709. st1d z26.d, p1, [pCRow2]
  710. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  711. ld1d z27.d, p1/z, [pCRow1] // column 3
  712. fmla z27.d, p1/m, z19.d, alphaZ
  713. st1d z27.d, p1, [pCRow1]
  714. add pCRow0, pCRow0, lanes, lsl #3 // pCRow0 = pCRow0 + lanes * 8
  715. .endm
  716. /******************************************************************************/
  717. .macro INITv1x2 // zero the two accumulators z16/z17 used by the v1x2 kernel
  718. mov z16.d, #0 // mov (immediate) is the SVE alias of dup (immediate)
  719. mov z17.d, #0
  720. .endm
  721. .macro KERNELv1x2_SUB // one self-contained step: loads A (z0, under p1) and two B values, then accumulates into z16/z17
  722. ld1d z0.d, p1/z, [pA1]
  723. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  724. ld1rd z8.d, p0/z, [pB] // z8/z9 = two consecutive B values, each broadcast to all lanes
  725. ld1rd z9.d, p0/z, [pB, 8]
  726. add pB, pB, 16 // two doubles consumed
  727. fmla z16.d, p1/m, z0.d, z8.d
  728. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE]
  729. fmla z17.d, p1/m, z0.d, z9.d
  730. .endm
  731. .macro SAVEv1x2 // C += alpha * accumulators for 2 columns of one vector each, under predicate p1; z24/z25 are scratch for C
  732. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  733. add pCRow1, pCRow0, LDC // second column starts LDC bytes after the first
  734. ld1d z24.d, p1/z, [pCRow0] // column 0
  735. fmla z24.d, p1/m, z16.d, alphaZ // z24 = C + acc * alpha
  736. st1d z24.d, p1, [pCRow0]
  737. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  738. ld1d z25.d, p1/z, [pCRow1] // column 1
  739. fmla z25.d, p1/m, z17.d, alphaZ
  740. st1d z25.d, p1, [pCRow1]
  741. add pCRow0, pCRow0, lanes, lsl #3 // pCRow0 = pCRow0 + lanes * 8
  742. .endm
  743. /******************************************************************************/
  // INITv1x1: zero the single accumulator (z16) used by KERNELv1x1_SUB.
  744. .macro INITv1x1
  745. dup z16.d, #0
  746. .endm
  // KERNELv1x1_SUB: one accumulation step for one vector of A and one value of B.
  // Loads a vector from pA1 under p1, broadcasts one double from pB, and
  // accumulates z16 += A * B. Advances pA1 by lanes * 8 bytes and pB by
  // 8 bytes; clobbers z0 and z8.
  747. .macro KERNELv1x1_SUB
  748. ld1d z0.d, p1/z, [pA1]
  749. add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8
  750. ld1rd z8.d, p0/z, [pB] // broadcast B[0] to all lanes
  751. add pB, pB, 8 // pB = pB + 1 * 8
  752. fmla z16.d, p1/m, z0.d, z8.d
  753. prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] // prefetch ahead in A
  754. .endm
  // SAVEv1x1: add the accumulator z16, scaled by alphaZ, into the line of C
  // at pCRow0 (load and store predicated on p1), using z24 as a temporary,
  // then advance pCRow0 by lanes * 8 bytes.
  755. .macro SAVEv1x1
  756. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  757. ld1d z24.d, p1/z, [pCRow0]
  758. fmla z24.d, p1/m, z16.d, alphaZ // C line 0 += z16 * alphaZ
  759. st1d z24.d, p1, [pCRow0]
  760. add pCRow0, pCRow0, lanes, lsl #3 // pCRow0 = pCRow0 + lanes * 8
  761. .endm
  762. /*******************************************************************************
  763. * End of macro definitions
  764. *******************************************************************************/
  // Kernel entry: save registers on the stack, broadcast alpha, set up the
  // SVE vector-length constants and the all-true predicate, then dispatch to
  // the N-by-8 loop or, when N / 8 == 0, straight to the N & 4 section.
  765. PROLOGUE
  766. .align 5
  767. add sp, sp, #-(11 * 16) // reserve 11 x 16 bytes for the register save area
  768. stp d8, d9, [sp, #(0 * 16)]
  769. stp d10, d11, [sp, #(1 * 16)]
  770. stp d12, d13, [sp, #(2 * 16)]
  771. stp d14, d15, [sp, #(3 * 16)]
  772. stp d16, d17, [sp, #(4 * 16)]
  773. stp x18, x19, [sp, #(5 * 16)]
  774. stp x20, x21, [sp, #(6 * 16)]
  775. stp x22, x23, [sp, #(7 * 16)]
  776. stp x24, x25, [sp, #(8 * 16)]
  777. stp x26, x27, [sp, #(9 * 16)]
  778. str x28, [sp, #(10 * 16)]
  779. prfm PLDL1KEEP, [origPB]
  780. prfm PLDL1KEEP, [origPA]
  781. fmov alpha, d0 // alpha is taken from d0
  782. dup alphaZ, alpha // broadcast alpha to every lane of alphaZ
  783. cntd vec_len // vec_len = number of 64-bit lanes in one SVE vector
  784. lsl vec_lenx2, vec_len, #1 // vec_lenx2 = 2 * vec_len
  785. lsl LDC, LDC, #3 // ldc = ldc * 8
  786. ptrue p0.d // create true predicate
  787. mov pB, origPB
  788. // Loop over N
  789. mov counterJ, origN
  790. asr counterJ, counterJ, #3 // J = J / 8
  791. cmp counterJ, #0
  792. ble .Ldgemm_kernel_L4_BEGIN // no full group of 8 in N: skip the N-by-8 loop
  793. /******************************************************************************/
  794. /* Repeat this as long as there are 8 left in N */
  // Structure of this section, executed counterJ (= N / 8) times:
  //   1. Mv2 loop: while at least 2 * vec_len elements remain in M, run the
  //      v2x8 kernel on two A panels (pA1, pA2) and store with SAVEv2x8.
  //   2. Mv1 loop: handle the remaining elements of M one (possibly partial)
  //      vector at a time under predicate p1, using the v1x8 kernel.
  //   3. Advance origPB by K * 8 * 8 bytes.
  // In both loops the K dimension is split into K / 8 blocks of eight kernel
  // macro invocations (I/M1/M2/E) plus a K & 7 remainder done with *_SUB.
  795. .align 5
  796. .Ldgemm_kernel_L8_BEGIN:
  797. mov pCRow0, pC
  798. add pC, pC, LDC, lsl #3 // add 8 x LDC
  799. mov pA1, origPA // pA1 = start of A array
  800. .Ldgemm_kernel_L8_Mv2_BEGIN:
  801. mov counterI, #0
  802. cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
  803. blt .Ldgemm_kernel_L8_Mv1_BEGIN
  804. mov counterI, origM // counterI = elements of M still to process
  805. /* As long as at least 2*SVE_LEN iters are left in M, we do them with the V2x8 kernel */
  806. mul temp, vec_len, origK // generate address of pA2
  807. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  808. prfm PLDL1KEEP, [pA2]
  809. .align 5
  810. .Ldgemm_kernel_L8_Mv2_20:
  811. mov pB, origPB
  812. INITv2x8 // fill with zeros
  813. asr counterL , origK, #3 // L = K / 8
  814. cmp counterL , #2 // are there at least 2 blocks of 8 to do?
  815. blt .Ldgemm_kernel_L8_Mv2_32
  816. KERNELv2x8_I
  817. KERNELv2x8_M2
  818. KERNELv2x8_M1
  819. KERNELv2x8_M2
  820. KERNELv2x8_M1
  821. KERNELv2x8_M2
  822. KERNELv2x8_M1
  823. KERNELv2x8_M2
  824. subs counterL, counterL, #2 // subtract 2 (the block above and the final block at _22a)
  825. ble .Ldgemm_kernel_L8_Mv2_22a
  826. .align 5
  827. .Ldgemm_kernel_L8_Mv2_22: // middle blocks: one block of 8 per iteration
  828. KERNELv2x8_M1
  829. KERNELv2x8_M2
  830. KERNELv2x8_M1
  831. KERNELv2x8_M2
  832. KERNELv2x8_M1
  833. KERNELv2x8_M2
  834. KERNELv2x8_M1
  835. KERNELv2x8_M2
  836. subs counterL, counterL, #1
  837. bgt .Ldgemm_kernel_L8_Mv2_22
  838. .align 5
  839. .Ldgemm_kernel_L8_Mv2_22a: // final block: ends with the _E variant
  840. KERNELv2x8_M1
  841. KERNELv2x8_M2
  842. KERNELv2x8_M1
  843. KERNELv2x8_M2
  844. KERNELv2x8_M1
  845. KERNELv2x8_M2
  846. KERNELv2x8_M1
  847. KERNELv2x8_E
  848. b .Ldgemm_kernel_L8_Mv2_44
  849. .align 5
  850. .Ldgemm_kernel_L8_Mv2_32: // reached when K / 8 < 2
  851. tst counterL, #1
  852. ble .Ldgemm_kernel_L8_Mv2_40 // bit 0 clear: no full block of 8
  853. KERNELv2x8_I
  854. KERNELv2x8_M2
  855. KERNELv2x8_M1
  856. KERNELv2x8_M2
  857. KERNELv2x8_M1
  858. KERNELv2x8_M2
  859. KERNELv2x8_M1
  860. KERNELv2x8_E
  861. b .Ldgemm_kernel_L8_Mv2_44
  862. .Ldgemm_kernel_L8_Mv2_40:
  863. INITv2x8 // NOTE(review): INITv2x8 already ran at _20 on this path; looks redundant
  864. .Ldgemm_kernel_L8_Mv2_44:
  865. ands counterL , origK, #7 // L = K % 8
  866. ble .Ldgemm_kernel_L8_Mv2_100
  867. .align 5
  868. .Ldgemm_kernel_L8_Mv2_46: // remainder loop: one KERNELv2x8_SUB per leftover k
  869. KERNELv2x8_SUB
  870. subs counterL, counterL, #1
  871. bne .Ldgemm_kernel_L8_Mv2_46
  872. .Ldgemm_kernel_L8_Mv2_100:
  873. prfm PLDL1KEEP, [pA1]
  874. prfm PLDL1KEEP, [pA1, #64]
  875. prfm PLDL1KEEP, [pA2]
  876. prfm PLDL1KEEP, [pA2, #64]
  877. prfm PLDL1KEEP, [origPB]
  878. SAVEv2x8
  879. mov pA1, pA2 // pA1 = pA2
  880. mul temp, vec_len, origK // generate address of pA2
  881. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  882. .Ldgemm_kernel_L8_Mv2_END:
  883. sub counterI, counterI, vec_lenx2 // 2 * vec_len elements of M done
  884. cmp counterI, vec_lenx2
  885. bge .Ldgemm_kernel_L8_Mv2_20 // at least 2 * vec_len left: another V2 pass
  886. sub counterI, origM, counterI // counterI = M - leftover = elements already processed
  887. cmp counterI, origM
  888. beq .Ldgemm_kernel_L8_END // nothing left over in M
  889. //////////////////////////////////////////
  890. // We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
  891. .Ldgemm_kernel_L8_Mv1_BEGIN:
  892. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  893. cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
  894. .align 5
  895. .Ldgemm_kernel_L8_Mv1_20:
  896. mov pB, origPB
  897. INITv1x8 // fill with zeros
  898. asr counterL , origK, #3 // L = K / 8
  899. cmp counterL , #2 // are there at least 2 blocks of 8 to do?
  900. blt .Ldgemm_kernel_L8_Mv1_32
  901. KERNELv1x8_I
  902. KERNELv1x8_M2
  903. KERNELv1x8_M1
  904. KERNELv1x8_M2
  905. KERNELv1x8_M1
  906. KERNELv1x8_M2
  907. KERNELv1x8_M1
  908. KERNELv1x8_M2
  909. subs counterL, counterL, #2 // subtract 2 (the block above and the final block at _22a)
  910. ble .Ldgemm_kernel_L8_Mv1_22a
  911. .align 5
  912. .Ldgemm_kernel_L8_Mv1_22: // middle blocks: one block of 8 per iteration
  913. KERNELv1x8_M1
  914. KERNELv1x8_M2
  915. KERNELv1x8_M1
  916. KERNELv1x8_M2
  917. KERNELv1x8_M1
  918. KERNELv1x8_M2
  919. KERNELv1x8_M1
  920. KERNELv1x8_M2
  921. subs counterL, counterL, #1
  922. bgt .Ldgemm_kernel_L8_Mv1_22
  923. .align 5
  924. .Ldgemm_kernel_L8_Mv1_22a: // final block: ends with the _E variant
  925. KERNELv1x8_M1
  926. KERNELv1x8_M2
  927. KERNELv1x8_M1
  928. KERNELv1x8_M2
  929. KERNELv1x8_M1
  930. KERNELv1x8_M2
  931. KERNELv1x8_M1
  932. KERNELv1x8_E
  933. b .Ldgemm_kernel_L8_Mv1_44
  934. .align 5
  935. .Ldgemm_kernel_L8_Mv1_32: // reached when K / 8 < 2
  936. tst counterL, #1
  937. ble .Ldgemm_kernel_L8_Mv1_40 // bit 0 clear: no full block of 8
  938. KERNELv1x8_I
  939. KERNELv1x8_M2
  940. KERNELv1x8_M1
  941. KERNELv1x8_M2
  942. KERNELv1x8_M1
  943. KERNELv1x8_M2
  944. KERNELv1x8_M1
  945. KERNELv1x8_E
  946. b .Ldgemm_kernel_L8_Mv1_44
  947. .Ldgemm_kernel_L8_Mv1_40:
  948. INITv1x8 // NOTE(review): INITv1x8 already ran at _20 on this path; looks redundant
  949. .Ldgemm_kernel_L8_Mv1_44:
  950. ands counterL , origK, #7 // L = K % 8
  951. ble .Ldgemm_kernel_L8_Mv1_100
  952. .align 5
  953. .Ldgemm_kernel_L8_Mv1_46: // remainder loop: one KERNELv1x8_SUB per leftover k
  954. KERNELv1x8_SUB
  955. subs counterL, counterL, #1
  956. bne .Ldgemm_kernel_L8_Mv1_46
  957. .Ldgemm_kernel_L8_Mv1_100:
  958. prfm PLDL1KEEP, [pA1]
  959. prfm PLDL1KEEP, [pA1, #64]
  960. prfm PLDL1KEEP, [origPB]
  961. SAVEv1x8
  962. .Ldgemm_kernel_L8_Mv1_END:
  963. incd counterI // counterI += number of 64-bit lanes per vector
  964. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  965. cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
  966. b.any .Ldgemm_kernel_L8_Mv1_20 // loop while whilelt left at least one lane active
  967. .Ldgemm_kernel_L8_END:
  968. lsl temp, origK, #6
  969. add origPB, origPB, temp // B = B + K * 8 * 8
  970. subs counterJ, counterJ , #1 // j--
  971. bgt .Ldgemm_kernel_L8_BEGIN
  972. /******************************************************************************/
  973. /* Repeat the same thing if 4 left in N */
  // Runs once when bit 2 of N is set. Same M-loop structure as the N-by-8
  // section, but the K loop is a plain 8x unrolled sequence of *_SUB macros
  // (K / 8 iterations) followed by a K & 7 remainder loop.
  974. .align 5
  975. .Ldgemm_kernel_L4_BEGIN:
  976. mov counterJ , origN
  977. tst counterJ , #4
  978. ble .Ldgemm_kernel_L2_BEGIN // bit 2 of N clear: skip this section
  979. mov pCRow0, pC
  980. add pC, pC, LDC, lsl #2 // add 4 x LDC
  981. mov pA1, origPA // pA1 = start of A array
  982. .Ldgemm_kernel_L4_Mv2_BEGIN:
  983. mov counterI, #0
  984. cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
  985. blt .Ldgemm_kernel_L4_Mv1_BEGIN
  986. mov counterI, origM // counterI = elements of M still to process
  987. mul temp, vec_len, origK // generate address of pA2
  988. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  989. .align 5
  990. .Ldgemm_kernel_L4_Mv2_20:
  991. mov pB, origPB
  992. INITv2x4 // fill with zeros
  993. asr counterL , origK, #3 // L = K / 8
  994. cmp counterL , #0 // is there at least one block of 8 to do?
  995. ble .Ldgemm_kernel_L4_Mv2_44
  996. .align 5
  997. .Ldgemm_kernel_L4_Mv2_22: // 8 x KERNELv2x4_SUB per iteration, B prefetch every 2
  998. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  999. KERNELv2x4_SUB
  1000. KERNELv2x4_SUB
  1001. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1002. KERNELv2x4_SUB
  1003. KERNELv2x4_SUB
  1004. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1005. KERNELv2x4_SUB
  1006. KERNELv2x4_SUB
  1007. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1008. KERNELv2x4_SUB
  1009. KERNELv2x4_SUB
  1010. subs counterL, counterL, #1
  1011. bgt .Ldgemm_kernel_L4_Mv2_22
  1012. .Ldgemm_kernel_L4_Mv2_44:
  1013. ands counterL , origK, #7 // L = K % 8
  1014. ble .Ldgemm_kernel_L4_Mv2_100
  1015. .align 5
  1016. .Ldgemm_kernel_L4_Mv2_46: // remainder loop: one KERNELv2x4_SUB per leftover k
  1017. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1018. KERNELv2x4_SUB
  1019. subs counterL, counterL, #1
  1020. bne .Ldgemm_kernel_L4_Mv2_46
  1021. .Ldgemm_kernel_L4_Mv2_100:
  1022. prfm PLDL1KEEP, [pA1]
  1023. prfm PLDL1KEEP, [pA1, #64]
  1024. prfm PLDL1KEEP, [pA2]
  1025. prfm PLDL1KEEP, [pA2, #64]
  1026. prfm PLDL1KEEP, [origPB]
  1027. SAVEv2x4
  1028. mov pA1, pA2 // pA1 = pA2
  1029. mul temp, vec_len, origK // generate address of pA2
  1030. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1031. .Ldgemm_kernel_L4_Mv2_END:
  1032. sub counterI, counterI, vec_lenx2 // 2 * vec_len elements of M done
  1033. cmp counterI, vec_lenx2
  1034. bge .Ldgemm_kernel_L4_Mv2_20 // at least 2 * vec_len left: another V2 pass
  1035. sub counterI, origM, counterI // counterI = M - leftover = elements already processed
  1036. cmp counterI, origM
  1037. beq .Ldgemm_kernel_L4_END // nothing left over in M
  1038. //////////////////////////////////
  1039. // We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
  1040. .Ldgemm_kernel_L4_Mv1_BEGIN:
  1041. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  1042. cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
  1043. .align 5
  1044. .Ldgemm_kernel_L4_Mv1_20:
  1045. mov pB, origPB
  1046. INITv1x4 // fill with zeros
  1047. asr counterL , origK, #3 // L = K / 8
  1048. cmp counterL , #0 // is there at least one block of 8 to do?
  1049. ble .Ldgemm_kernel_L4_Mv1_44
  1050. .align 5
  1051. .Ldgemm_kernel_L4_Mv1_22: // 8 x KERNELv1x4_SUB per iteration, B prefetch every 2
  1052. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1053. KERNELv1x4_SUB
  1054. KERNELv1x4_SUB
  1055. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1056. KERNELv1x4_SUB
  1057. KERNELv1x4_SUB
  1058. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1059. KERNELv1x4_SUB
  1060. KERNELv1x4_SUB
  1061. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1062. KERNELv1x4_SUB
  1063. KERNELv1x4_SUB
  1064. subs counterL, counterL, #1
  1065. bgt .Ldgemm_kernel_L4_Mv1_22
  1066. .Ldgemm_kernel_L4_Mv1_44:
  1067. ands counterL , origK, #7 // L = K % 8
  1068. ble .Ldgemm_kernel_L4_Mv1_100
  1069. .align 5
  1070. .Ldgemm_kernel_L4_Mv1_46: // remainder loop: one KERNELv1x4_SUB per leftover k
  1071. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1072. KERNELv1x4_SUB
  1073. subs counterL, counterL, #1
  1074. bne .Ldgemm_kernel_L4_Mv1_46
  1075. .Ldgemm_kernel_L4_Mv1_100:
  1076. prfm PLDL1KEEP, [pA1]
  1077. prfm PLDL1KEEP, [pA1, #64]
  1078. prfm PLDL1KEEP, [origPB]
  1079. SAVEv1x4
  1080. .Ldgemm_kernel_L4_Mv1_END:
  1081. incd counterI // counterI += number of 64-bit lanes per vector
  1082. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  1083. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  1084. b.any .Ldgemm_kernel_L4_Mv1_20 // loop while whilelt left at least one lane active
  1085. .Ldgemm_kernel_L4_END:
  1086. lsl temp, origK, #5
  1087. add origPB, origPB, temp // B = B + K * 4 * 8
  1088. /******************************************************************************/
  1089. /* Repeat the same thing if 2 left in N */
  // Runs once when bit 1 of N is set. M is covered first by the v2x2 kernel
  // (2 * vec_len elements per pass), then by the predicated v1x2 kernel.
  // The K loop is 8x unrolled (K / 8 iterations) plus a K & 7 remainder loop.
  1090. .align 5
  1091. .Ldgemm_kernel_L2_BEGIN:
  1092. mov counterJ , origN
  1093. tst counterJ , #2
  1094. ble .Ldgemm_kernel_L1_BEGIN // bit 1 of N clear: skip this section
  1095. mov pCRow0, pC
  1096. add pC, pC, LDC, lsl #1 // add 2 x LDC
  1097. mov pA1, origPA // pA1 = start of A array
  1098. .Ldgemm_kernel_L2_Mv2_BEGIN:
  1099. mov counterI, #0
  1100. cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
  1101. blt .Ldgemm_kernel_L2_Mv1_BEGIN
  1102. mov counterI, origM // counterI = elements of M still to process
  1103. mul temp, vec_len, origK // generate address of pA2
  1104. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1105. .align 5
  1106. .Ldgemm_kernel_L2_Mv2_20:
  1107. mov pB, origPB
  1108. INITv2x2 // fill with zeros
  1109. asr counterL , origK, #3 // L = K / 8
  1110. cmp counterL , #0 // is there at least one block of 8 to do?
  1111. ble .Ldgemm_kernel_L2_Mv2_44
  1112. .align 5
  1113. .Ldgemm_kernel_L2_Mv2_22: // 8 x KERNELv2x2_SUB per iteration, B prefetch every 4
  1114. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1115. KERNELv2x2_SUB
  1116. KERNELv2x2_SUB
  1117. KERNELv2x2_SUB
  1118. KERNELv2x2_SUB
  1119. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1120. KERNELv2x2_SUB
  1121. KERNELv2x2_SUB
  1122. KERNELv2x2_SUB
  1123. KERNELv2x2_SUB
  1124. subs counterL, counterL, #1
  1125. bgt .Ldgemm_kernel_L2_Mv2_22
  1126. .Ldgemm_kernel_L2_Mv2_44:
  1127. ands counterL , origK, #7 // L = K % 8
  1128. ble .Ldgemm_kernel_L2_Mv2_100
  1129. .align 5
  1130. .Ldgemm_kernel_L2_Mv2_46: // remainder loop: one KERNELv2x2_SUB per leftover k
  1131. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1132. KERNELv2x2_SUB
  1133. subs counterL, counterL, #1
  1134. bne .Ldgemm_kernel_L2_Mv2_46
  1135. .Ldgemm_kernel_L2_Mv2_100:
  1136. prfm PLDL1KEEP, [pA1]
  1137. prfm PLDL1KEEP, [pA1, #64]
  1138. prfm PLDL1KEEP, [pA2]
  1139. prfm PLDL1KEEP, [pA2, #64]
  1140. prfm PLDL1KEEP, [origPB]
  1141. SAVEv2x2
  1142. mov pA1, pA2 // pA1 = pA2
  1143. mul temp, vec_len, origK // generate address of pA2
  1144. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1145. .Ldgemm_kernel_L2_Mv2_END:
  1146. sub counterI, counterI, vec_lenx2 // 2 * vec_len elements of M done
  1147. cmp counterI, vec_lenx2
  1148. bge .Ldgemm_kernel_L2_Mv2_20 // at least 2 * vec_len left: another V2 pass
  1149. sub counterI, origM, counterI // counterI = M - leftover = elements already processed
  1150. cmp counterI, origM
  1151. beq .Ldgemm_kernel_L2_END // nothing left over in M
  1152. //////////////////////////////////
  1153. // We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
  1154. .Ldgemm_kernel_L2_Mv1_BEGIN:
  1155. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  1156. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  1157. .align 5
  1158. .Ldgemm_kernel_L2_Mv1_20:
  1159. mov pB, origPB
  1160. INITv1x2 // fill with zeros
  1161. asr counterL , origK, #3 // L = K / 8
  1162. cmp counterL , #0 // is there at least one block of 8 to do?
  1163. ble .Ldgemm_kernel_L2_Mv1_44
  1164. .align 5
  1165. .Ldgemm_kernel_L2_Mv1_22: // 8 x KERNELv1x2_SUB per iteration, B prefetch every 4
  1166. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1167. KERNELv1x2_SUB
  1168. KERNELv1x2_SUB
  1169. KERNELv1x2_SUB
  1170. KERNELv1x2_SUB
  1171. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1172. KERNELv1x2_SUB
  1173. KERNELv1x2_SUB
  1174. KERNELv1x2_SUB
  1175. KERNELv1x2_SUB
  1176. subs counterL, counterL, #1
  1177. bgt .Ldgemm_kernel_L2_Mv1_22
  1178. .Ldgemm_kernel_L2_Mv1_44:
  1179. ands counterL , origK, #7 // L = K % 8
  1180. ble .Ldgemm_kernel_L2_Mv1_100
  1181. .align 5
  1182. .Ldgemm_kernel_L2_Mv1_46: // remainder loop: one KERNELv1x2_SUB per leftover k
  1183. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1184. KERNELv1x2_SUB
  1185. subs counterL, counterL, #1
  1186. bne .Ldgemm_kernel_L2_Mv1_46
  1187. .Ldgemm_kernel_L2_Mv1_100:
  1188. prfm PLDL1KEEP, [pA1]
  1189. prfm PLDL1KEEP, [pA1, #64]
  1190. prfm PLDL1KEEP, [origPB]
  1191. SAVEv1x2
  1192. .Ldgemm_kernel_L2_Mv1_END:
  1193. incd counterI // counterI += number of 64-bit lanes per vector
  1194. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  1195. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  1196. b.any .Ldgemm_kernel_L2_Mv1_20 // loop while whilelt left at least one lane active
  1197. .Ldgemm_kernel_L2_END:
  1198. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1199. /******************************************************************************/
  1200. /* Repeat the same thing if 1 left in N */
  // Runs once when bit 0 of N is set. M is covered first by the v2x1 kernel
  // (2 * vec_len elements per pass), then by the predicated v1x1 kernel.
  // The K loop is 8x unrolled (K / 8 iterations) plus a K & 7 remainder loop.
  1201. .align 5
  1202. .Ldgemm_kernel_L1_BEGIN:
  1203. mov counterJ , origN
  1204. tst counterJ , #1
  1205. ble .Ldgemm_kernel_L999 // done
  1206. mov pCRow0, pC
  1207. add pC, pC, LDC // add 1 x LDC
  1208. mov pA1, origPA // pA1 = start of A array
  1209. .Ldgemm_kernel_L1_Mv2_BEGIN:
  1210. mov counterI, #0
  1211. cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
  1212. blt .Ldgemm_kernel_L1_Mv1_BEGIN
  1213. mov counterI, origM // counterI = elements of M still to process
  1214. mul temp, vec_len, origK // generate address of pA2
  1215. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1216. .align 5
  1217. .Ldgemm_kernel_L1_Mv2_20:
  1218. mov pB, origPB
  1219. INITv2x1 // fill with zeros
  1220. asr counterL , origK, #3 // L = K / 8
  1221. cmp counterL , #0 // is there at least 8 to do?
  1222. ble .Ldgemm_kernel_L1_Mv2_44
  1223. .align 5
  1224. .Ldgemm_kernel_L1_Mv2_22: // 8 x KERNELv2x1_SUB per iteration
  1225. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1226. KERNELv2x1_SUB
  1227. KERNELv2x1_SUB
  1228. KERNELv2x1_SUB
  1229. KERNELv2x1_SUB
  1230. KERNELv2x1_SUB
  1231. KERNELv2x1_SUB
  1232. KERNELv2x1_SUB
  1233. KERNELv2x1_SUB
  1234. subs counterL, counterL, #1
  1235. bgt .Ldgemm_kernel_L1_Mv2_22
  1236. .Ldgemm_kernel_L1_Mv2_44:
  1237. ands counterL , origK, #7 // L = K % 8
  1238. ble .Ldgemm_kernel_L1_Mv2_100
  1239. .align 5
  1240. .Ldgemm_kernel_L1_Mv2_46: // remainder loop: one KERNELv2x1_SUB per leftover k
  1241. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1242. KERNELv2x1_SUB
  1243. subs counterL, counterL, #1
  1244. bgt .Ldgemm_kernel_L1_Mv2_46 // the 8/4/2 sections use bne here; same effect as counterL starts > 0
  1245. .Ldgemm_kernel_L1_Mv2_100:
  1246. prfm PLDL1KEEP, [pA1]
  1247. prfm PLDL1KEEP, [pA1, #64]
  1248. prfm PLDL1KEEP, [origPB] // NOTE(review): unlike the 8/4/2 sections, pA2 is not prefetched here
  1249. SAVEv2x1
  1250. mov pA1, pA2 // pA1 = pA2
  1251. mul temp, vec_len, origK // generate address of pA2
  1252. add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8
  1253. .Ldgemm_kernel_L1_Mv2_END:
  1254. sub counterI, counterI, vec_lenx2 // 2 * vec_len elements of M done
  1255. cmp counterI, vec_lenx2
  1256. bge .Ldgemm_kernel_L1_Mv2_20 // at least 2 * vec_len left: another V2 pass
  1257. sub counterI, origM, counterI // counterI = M - leftover = elements already processed
  1258. cmp counterI, origM
  1259. beq .Ldgemm_kernel_L1_END // nothing left over in M
  1260. //////////////////////////////////
  1261. // We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
  1262. .Ldgemm_kernel_L1_Mv1_BEGIN:
  1263. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  1264. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  1265. .align 5
  1266. .Ldgemm_kernel_L1_Mv1_20:
  1267. mov pB, origPB
  1268. INITv1x1 // fill with zeros
  1269. asr counterL , origK, #3 // L = K / 8
  1270. cmp counterL , #0 // is there at least 8 to do?
  1271. ble .Ldgemm_kernel_L1_Mv1_44
  1272. .align 5
  1273. .Ldgemm_kernel_L1_Mv1_22: // 8 x KERNELv1x1_SUB per iteration
  1274. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1275. KERNELv1x1_SUB
  1276. KERNELv1x1_SUB
  1277. KERNELv1x1_SUB
  1278. KERNELv1x1_SUB
  1279. KERNELv1x1_SUB
  1280. KERNELv1x1_SUB
  1281. KERNELv1x1_SUB
  1282. KERNELv1x1_SUB
  1283. subs counterL, counterL, #1
  1284. bgt .Ldgemm_kernel_L1_Mv1_22
  1285. .Ldgemm_kernel_L1_Mv1_44:
  1286. ands counterL , origK, #7 // L = K % 8
  1287. ble .Ldgemm_kernel_L1_Mv1_100
  1288. .align 5
  1289. .Ldgemm_kernel_L1_Mv1_46: // remainder loop: one KERNELv1x1_SUB per leftover k
  1290. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  1291. KERNELv1x1_SUB
  1292. subs counterL, counterL, #1
  1293. bgt .Ldgemm_kernel_L1_Mv1_46 // the 8/4/2 sections use bne here; same effect as counterL starts > 0
  1294. .Ldgemm_kernel_L1_Mv1_100:
  1295. prfm PLDL1KEEP, [pA1]
  1296. prfm PLDL1KEEP, [pA1, #64]
  1297. prfm PLDL1KEEP, [origPB]
  1298. SAVEv1x1
  1299. .Ldgemm_kernel_L1_Mv1_END:
  1300. incd counterI // counterI += number of 64-bit lanes per vector
  1301. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < M
  1302. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  1303. b.any .Ldgemm_kernel_L1_Mv1_20 // loop while whilelt left at least one lane active
  1304. .Ldgemm_kernel_L1_END:
  1305. /******************************************************************************/
  // Common exit: return 0 in x0, reload every register that was stored in the
  // 11 x 16-byte save area at entry, release the area and return.
  1306. .Ldgemm_kernel_L999:
  1307. mov x0, #0 // set return value
  1308. ldp d8, d9, [sp, #(0 * 16)]
  1309. ldp d10, d11, [sp, #(1 * 16)]
  1310. ldp d12, d13, [sp, #(2 * 16)]
  1311. ldp d14, d15, [sp, #(3 * 16)]
  1312. ldp d16, d17, [sp, #(4 * 16)]
  1313. ldp x18, x19, [sp, #(5 * 16)]
  1314. ldp x20, x21, [sp, #(6 * 16)]
  1315. ldp x22, x23, [sp, #(7 * 16)]
  1316. ldp x24, x25, [sp, #(8 * 16)]
  1317. ldp x26, x27, [sp, #(9 * 16)]
  1318. ldr x28, [sp, #(10 * 16)]
  1319. add sp, sp, #(11*16) // release the register save area
  1320. ret
  1321. EPILOGUE