/* sgemm_kernel_16x4.S (37 kB) — web-viewer chrome and concatenated line-number runs removed from the scraped copy */
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
#define ASSEMBLER
#include "common.h"

/* Argument registers on entry: X0 X1 X2 s0 X3 x4 x5 x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */

// ---- Incoming arguments (AAPCS64 integer regs x0-x6, alpha in s0) ----
#define origM x0       // bm: rows of the C block handled by this call
#define origN x1       // bn: columns of the C block
#define origK x2       // bk: depth of the rank-k update
#define origPA x3      // ba: packed A panel
#define origPB x4      // bb: packed B panel
#define pC x5          // C: output matrix pointer
#define LDC x6         // ldc; SAVE macros use it as a byte stride — presumably
                       // scaled to bytes in the prologue (not visible here) — TODO confirm

// ---- Working registers ----
#define temp x7        // scratch (NOTE(review): register-map comment below says x16 — defines bind x7)
#define counterL x8    // loop counter over K (L dimension)
#define counterI x9    // loop counter over M
#define counterJ x10   // loop counter over N
#define pB x11         // running pointer into packed B
#define pCRow0 x12     // C column/row pointers; pCRow1/pCRow2 leapfrog by LDC
#define pCRow1 x13
#define pCRow2 x14
#define pA x15         // running pointer into packed A

// ---- Four scalar views of alpha (v10/v11/v14/v15) used round-robin by the
// SAVE macros; presumably all hold the same alpha, set up in the prologue
// (not visible in this chunk) — TODO confirm ----
#define alpha0 s10
#define alphaV0 v10.s[0]
#define alpha1 s11
#define alphaV1 v11.s[0]
#define alpha2 s14
#define alphaV2 v14.s[0]
#define alpha3 s15
#define alphaV3 v15.s[0]
  55. // 00 origM
  56. // 01 origN
  57. // 02 origK
  58. // 03 origPA
  59. // 04 origPB
  60. // 05 pC
  61. // 06 origLDC -> LDC
// 07 temp  (NOTE(review): previously documented as "offset"; the #define above binds temp to x7)
  63. // 08 counterL
  64. // 09 counterI
  65. // 10 counterJ
  66. // 11 pB
  67. // 12 pCRow0
  68. // 13 pCRow1
  69. // 14 pCRow2
  70. // 15 pA
// 16       (NOTE(review): previously documented as "temp"; temp is x7 per the #define above)
  72. // 17
  73. // 18 must save
  74. // 19 must save
  75. // 20 must save
  76. // 21 must save
  77. // 22 must save
  78. // 23 must save
  79. // 24 must save
  80. // 25 must save
  81. // 26 must save
  82. // 27 must save
  83. // 28 must save
  84. // 29 frame
  85. // 30 link
  86. // 31 sp
  87. //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
  88. //v01 pA0_04, pA0_05, pA0_06, pA0_07
  89. //v02 pA0_08, pA0_09, pA0_10, pA0_11
  90. //v03 pA0_12, pA0_13, pA0_14, pA0_15
  91. //v04 pA1_00, pA1_01, pA1_02, pA1_03
  92. //v05 pA1_04, pA1_05, pA1_06, pA1_07
  93. //v06 pA1_08, pA1_09, pA1_10, pA1_11
  94. //v07 pA1_12, pA1_13, pA1_14, pA1_15
  95. //v08 must save pB00, pB01
  96. //v09 must save pB02, pB03
  97. //v10 must save ALPHA0
  98. //v11 must save ALPHA1
  99. //v12 must save pB10, pB11
  100. //v13 must save pB12, pB13
  101. //v14 must save ALPHA2
  102. //v15 must save ALPHA3
  103. //v16 must save C00, C01, C02, C03
  104. //v17 must save C04, C05, C06, C07
  105. //v18 C08, C09, C10, C11
  106. //v19 C12, C13, C14, C15
  107. //v20 C16, C17, C18, C19
  108. //v21 C20, C21, C22, C23
  109. //v22 C24, C25, C26, C27
  110. //v23 C28, C29, C30, C31
  111. //v24 C32, C33, C34, C35
  112. //v25 C36, C37, C38, C39
  113. //v26 C40, C41, C42, C43
  114. //v27 C44, C45, C46, C47
  115. //v28 C48, C49, C50, C51
  116. //v29 C52, C53, C54, C55
  117. //v30 C56, C57, C58, C59
  118. //v31 C60, C61, C62, C63
  119. /*******************************************************************************
  120. * Macro definitions
  121. *******************************************************************************/
  122. .macro INIT16x4
  123. fmov s16, wzr
  124. fmov s17, wzr
  125. fmov s18, s16
  126. fmov s19, s17
  127. fmov s20, wzr
  128. fmov s21, s16
  129. fmov s22, s17
  130. fmov s23, s18
  131. fmov s24, wzr
  132. fmov s25, s16
  133. fmov s26, s17
  134. fmov s27, s18
  135. fmov s28, wzr
  136. fmov s29, s16
  137. fmov s30, s17
  138. fmov s31, s18
  139. .endm
.macro KERNEL16x4_I
    // Software-pipeline prologue: the first K-iteration of the 16x4 kernel.
    // Uses fmul (not fmla) so the accumulators v16-v31 need no prior
    // initialisation, then pre-loads the next A/B slivers into v4-v7 /
    // v12-v13 so the loop can start with KERNEL16x4_M2.
    ld1 {v8.2s, v9.2s}, [pB]        // B sliver: b0..b3 for this K step
    add pB, pB, #16
    ld1 {v0.4s}, [pA]               // A sliver: 16 floats in v0-v3
    add pA, pA, #16
    ld1 {v1.4s}, [pA]
    add pA, pA, #16
    ld1 {v2.4s}, [pA]
    add pA, pA, #16
    ld1 {v3.4s}, [pA]
    add pA, pA, #16
    fmul v16.4s, v0.4s, v8.s[0]     // column 0 of the 16x4 tile
    fmul v17.4s, v1.4s, v8.s[0]
    fmul v18.4s, v2.4s, v8.s[0]
    fmul v19.4s, v3.4s, v8.s[0]
    fmul v20.4s, v0.4s, v8.s[1]     // column 1
    fmul v21.4s, v1.4s, v8.s[1]
    fmul v22.4s, v2.4s, v8.s[1]
    fmul v23.4s, v3.4s, v8.s[1]
    fmul v24.4s, v0.4s, v9.s[0]     // column 2
    fmul v25.4s, v1.4s, v9.s[0]
    fmul v26.4s, v2.4s, v9.s[0]
    fmul v27.4s, v3.4s, v9.s[0]
    fmul v28.4s, v0.4s, v9.s[1]     // column 3
    fmul v29.4s, v1.4s, v9.s[1]
    fmul v30.4s, v2.4s, v9.s[1]
    fmul v31.4s, v3.4s, v9.s[1]
    ld1 {v12.2s, v13.2s}, [pB]      // pre-load the odd-set operands for _M2
    add pB, pB, #16
    ld1 {v4.4s}, [pA]
    add pA, pA, #16
    ld1 {v5.4s}, [pA]
    add pA, pA, #16
    ld1 {v6.4s}, [pA]
    add pA, pA, #16
    ld1 {v7.4s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL16x4_M1
    // "Even" pipelined step: accumulate from the even operand set
    // (v0-v3, v8/v9) while fetching the odd set (v4-v7, v12/v13).
    // Must be followed by KERNEL16x4_M2 or KERNEL16x4_E.
    fmla v16.4s, v0.4s, v8.s[0]
    fmla v17.4s, v1.4s, v8.s[0]
    fmla v18.4s, v2.4s, v8.s[0]
    fmla v19.4s, v3.4s, v8.s[0]
    fmla v20.4s, v0.4s, v8.s[1]
    fmla v21.4s, v1.4s, v8.s[1]
    fmla v22.4s, v2.4s, v8.s[1]
    fmla v23.4s, v3.4s, v8.s[1]
    fmla v24.4s, v0.4s, v9.s[0]
    fmla v25.4s, v1.4s, v9.s[0]
    fmla v26.4s, v2.4s, v9.s[0]
    fmla v27.4s, v3.4s, v9.s[0]
    fmla v28.4s, v0.4s, v9.s[1]
    fmla v29.4s, v1.4s, v9.s[1]
    fmla v30.4s, v2.4s, v9.s[1]
    fmla v31.4s, v3.4s, v9.s[1]
    ld1 {v12.2s, v13.2s}, [pB]      // next-iteration operands (odd set)
    add pB, pB, #16
    ld1 {v4.4s}, [pA]
    add pA, pA, #16
    ld1 {v5.4s}, [pA]
    add pA, pA, #16
    ld1 {v6.4s}, [pA]
    add pA, pA, #16
    ld1 {v7.4s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL16x4_M2
    // "Odd" pipelined step: mirror of _M1 — accumulate from v4-v7 /
    // v12-v13 while fetching the even set (v0-v3, v8/v9).
    fmla v16.4s, v4.4s, v12.s[0]
    fmla v17.4s, v5.4s, v12.s[0]
    fmla v18.4s, v6.4s, v12.s[0]
    fmla v19.4s, v7.4s, v12.s[0]
    fmla v20.4s, v4.4s, v12.s[1]
    fmla v21.4s, v5.4s, v12.s[1]
    fmla v22.4s, v6.4s, v12.s[1]
    fmla v23.4s, v7.4s, v12.s[1]
    fmla v24.4s, v4.4s, v13.s[0]
    fmla v25.4s, v5.4s, v13.s[0]
    fmla v26.4s, v6.4s, v13.s[0]
    fmla v27.4s, v7.4s, v13.s[0]
    fmla v28.4s, v4.4s, v13.s[1]
    fmla v29.4s, v5.4s, v13.s[1]
    fmla v30.4s, v6.4s, v13.s[1]
    fmla v31.4s, v7.4s, v13.s[1]
    ld1 {v8.2s, v9.2s}, [pB]        // next-iteration operands (even set)
    add pB, pB, #16
    ld1 {v0.4s}, [pA]
    add pA, pA, #16
    ld1 {v1.4s}, [pA]
    add pA, pA, #16
    ld1 {v2.4s}, [pA]
    add pA, pA, #16
    ld1 {v3.4s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL16x4_E
    // Pipeline epilogue: drain the last pre-loaded odd operand set
    // (v4-v7, v12/v13) without issuing further loads.
    fmla v16.4s, v4.4s, v12.s[0]
    fmla v17.4s, v5.4s, v12.s[0]
    fmla v18.4s, v6.4s, v12.s[0]
    fmla v19.4s, v7.4s, v12.s[0]
    fmla v20.4s, v4.4s, v12.s[1]
    fmla v21.4s, v5.4s, v12.s[1]
    fmla v22.4s, v6.4s, v12.s[1]
    fmla v23.4s, v7.4s, v12.s[1]
    fmla v24.4s, v4.4s, v13.s[0]
    fmla v25.4s, v5.4s, v13.s[0]
    fmla v26.4s, v6.4s, v13.s[0]
    fmla v27.4s, v7.4s, v13.s[0]
    fmla v28.4s, v4.4s, v13.s[1]
    fmla v29.4s, v5.4s, v13.s[1]
    fmla v30.4s, v6.4s, v13.s[1]
    fmla v31.4s, v7.4s, v13.s[1]
.endm
  252. .macro KERNEL16x4_SUB
  253. ld1 {v8.2s, v9.2s}, [pB]
  254. add pB, pB, #16
  255. ld1 {v0.4s}, [pA]
  256. add pA, pA, #16
  257. ld1 {v1.4s}, [pA]
  258. add pA, pA, #16
  259. ld1 {v2.4s}, [pA]
  260. add pA, pA, #16
  261. ld1 {v3.4s}, [pA]
  262. add pA, pA, #16
  263. fmla v16.4s, v0.4s, v8.s[0]
  264. fmla v17.4s, v1.4s, v8.s[0]
  265. fmla v18.4s, v2.4s, v8.s[0]
  266. fmla v19.4s, v3.4s, v8.s[0]
  267. fmla v20.4s, v0.4s, v8.s[1]
  268. fmla v21.4s, v1.4s, v8.s[1]
  269. fmla v22.4s, v2.4s, v8.s[1]
  270. fmla v23.4s, v3.4s, v8.s[1]
  271. fmla v24.4s, v0.4s, v9.s[0]
  272. fmla v25.4s, v1.4s, v9.s[0]
  273. fmla v26.4s, v2.4s, v9.s[0]
  274. fmla v27.4s, v3.4s, v9.s[0]
  275. fmla v28.4s, v0.4s, v9.s[1]
  276. fmla v29.4s, v1.4s, v9.s[1]
  277. fmla v30.4s, v2.4s, v9.s[1]
  278. fmla v31.4s, v3.4s, v9.s[1]
  279. .endm
.macro SAVE16x4
    // Write back the 16x4 tile: C[:,j] += alpha * acc[:,j] for j = 0..3.
    // pCRow1/pCRow2 leapfrog down successive columns (stride LDC bytes);
    // alphaV0..alphaV3 are used round-robin across the four quarters.
    add pCRow1, pCRow0, LDC
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]      // column 0: load, scale-accumulate, store
    fmla v0.4s, v16.4s, alphaV0
    fmla v1.4s, v17.4s, alphaV1
    fmla v2.4s, v18.4s, alphaV2
    fmla v3.4s, v19.4s, alphaV3
    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
    add pCRow2, pCRow1, LDC
    ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]      // column 1
    fmla v4.4s, v20.4s, alphaV0
    fmla v5.4s, v21.4s, alphaV1
    fmla v6.4s, v22.4s, alphaV2
    fmla v7.4s, v23.4s, alphaV3
    st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
    add pCRow1, pCRow2, LDC                         // pCRow1 now points at column 3
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]      // column 2
    fmla v0.4s, v24.4s, alphaV0
    fmla v1.4s, v25.4s, alphaV1
    fmla v2.4s, v26.4s, alphaV2
    fmla v3.4s, v27.4s, alphaV3
    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
    ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]      // column 3
    fmla v4.4s, v28.4s, alphaV0
    fmla v5.4s, v29.4s, alphaV1
    fmla v6.4s, v30.4s, alphaV2
    fmla v7.4s, v31.4s, alphaV3
    st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
    add pCRow0, pCRow0, #64                         // advance to the next 16-row panel (16 floats)
.endm
  310. /******************************************************************************/
  311. .macro INIT8x4
  312. fmov s16, wzr
  313. fmov s17, wzr
  314. fmov s20, wzr
  315. fmov s21, s16
  316. fmov s24, wzr
  317. fmov s25, s16
  318. fmov s28, wzr
  319. fmov s29, s16
  320. .endm
.macro KERNEL8x4_I
    // Pipeline prologue for the 8x4 tile: fmul initialises the eight
    // accumulators, then the next A/B slivers are pre-loaded into
    // v4/v5 and v12/v13 for KERNEL8x4_M2.
    ld1 {v8.2s, v9.2s}, [pB]        // b0..b3
    add pB, pB, #16
    ld1 {v0.4s}, [pA]               // a0..a7 in v0/v1
    add pA, pA, #16
    ld1 {v1.4s}, [pA]
    add pA, pA, #16
    fmul v16.4s, v0.4s, v8.s[0]     // column 0
    fmul v17.4s, v1.4s, v8.s[0]
    fmul v20.4s, v0.4s, v8.s[1]     // column 1
    fmul v21.4s, v1.4s, v8.s[1]
    fmul v24.4s, v0.4s, v9.s[0]     // column 2
    fmul v25.4s, v1.4s, v9.s[0]
    fmul v28.4s, v0.4s, v9.s[1]     // column 3
    fmul v29.4s, v1.4s, v9.s[1]
    ld1 {v12.2s, v13.2s}, [pB]      // pre-load odd operand set
    add pB, pB, #16
    ld1 {v4.4s}, [pA]
    add pA, pA, #16
    ld1 {v5.4s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL8x4_M1
    // "Even" step: accumulate from v0/v1 x v8/v9 while fetching the
    // odd set (v4/v5, v12/v13). Must be followed by _M2 or _E.
    fmla v16.4s, v0.4s, v8.s[0]
    fmla v17.4s, v1.4s, v8.s[0]
    fmla v20.4s, v0.4s, v8.s[1]
    fmla v21.4s, v1.4s, v8.s[1]
    fmla v24.4s, v0.4s, v9.s[0]
    fmla v25.4s, v1.4s, v9.s[0]
    fmla v28.4s, v0.4s, v9.s[1]
    fmla v29.4s, v1.4s, v9.s[1]
    ld1 {v12.2s, v13.2s}, [pB]
    add pB, pB, #16
    ld1 {v4.4s}, [pA]
    add pA, pA, #16
    ld1 {v5.4s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL8x4_M2
    // "Odd" step: mirror of _M1 — accumulate from v4/v5 x v12/v13
    // while fetching the even set (v0/v1, v8/v9).
    fmla v16.4s, v4.4s, v12.s[0]
    fmla v17.4s, v5.4s, v12.s[0]
    fmla v20.4s, v4.4s, v12.s[1]
    fmla v21.4s, v5.4s, v12.s[1]
    fmla v24.4s, v4.4s, v13.s[0]
    fmla v25.4s, v5.4s, v13.s[0]
    fmla v28.4s, v4.4s, v13.s[1]
    fmla v29.4s, v5.4s, v13.s[1]
    ld1 {v8.2s, v9.2s}, [pB]
    add pB, pB, #16
    ld1 {v0.4s}, [pA]
    add pA, pA, #16
    ld1 {v1.4s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL8x4_E
    // Pipeline epilogue: drain the last pre-loaded odd operand set.
    fmla v16.4s, v4.4s, v12.s[0]
    fmla v17.4s, v5.4s, v12.s[0]
    fmla v20.4s, v4.4s, v12.s[1]
    fmla v21.4s, v5.4s, v12.s[1]
    fmla v24.4s, v4.4s, v13.s[0]
    fmla v25.4s, v5.4s, v13.s[0]
    fmla v28.4s, v4.4s, v13.s[1]
    fmla v29.4s, v5.4s, v13.s[1]
.endm
  385. .macro KERNEL8x4_SUB
  386. ld1 {v8.2s, v9.2s}, [pB]
  387. add pB, pB, #16
  388. ld1 {v0.4s}, [pA]
  389. add pA, pA, #16
  390. ld1 {v1.4s}, [pA]
  391. add pA, pA, #16
  392. fmla v16.4s, v0.4s, v8.s[0]
  393. fmla v17.4s, v1.4s, v8.s[0]
  394. fmla v20.4s, v0.4s, v8.s[1]
  395. fmla v21.4s, v1.4s, v8.s[1]
  396. fmla v24.4s, v0.4s, v9.s[0]
  397. fmla v25.4s, v1.4s, v9.s[0]
  398. fmla v28.4s, v0.4s, v9.s[1]
  399. fmla v29.4s, v1.4s, v9.s[1]
  400. .endm
.macro SAVE8x4
    // Write back the 8x4 tile: C[:,j] += alpha * acc[:,j] for j = 0..3.
    // Column pointers leapfrog via pCRow1/pCRow2 (stride LDC bytes).
    add pCRow1, pCRow0, LDC
    ld1 {v0.4s, v1.4s}, [pCRow0]            // column 0
    fmla v0.4s, v16.4s, alphaV0
    fmla v1.4s, v17.4s, alphaV1
    st1 {v0.4s, v1.4s}, [pCRow0]
    add pCRow2, pCRow1, LDC
    ld1 {v4.4s, v5.4s}, [pCRow1]            // column 1
    fmla v4.4s, v20.4s, alphaV0
    fmla v5.4s, v21.4s, alphaV1
    st1 {v4.4s, v5.4s}, [pCRow1]
    add pCRow1, pCRow2, LDC                 // pCRow1 now points at column 3
    ld1 {v0.4s, v1.4s}, [pCRow2]            // column 2
    fmla v0.4s, v24.4s, alphaV0
    fmla v1.4s, v25.4s, alphaV1
    st1 {v0.4s, v1.4s}, [pCRow2]
    ld1 {v4.4s, v5.4s}, [pCRow1]            // column 3
    fmla v4.4s, v28.4s, alphaV0
    fmla v5.4s, v29.4s, alphaV1
    st1 {v4.4s, v5.4s}, [pCRow1]
    add pCRow0, pCRow0, #32                 // advance to the next 8-row panel (8 floats)
.endm
  423. /******************************************************************************/
  424. .macro INIT4x4
  425. fmov s16, wzr
  426. fmov s17, s16
  427. fmov s20, s17
  428. fmov s21, s16
  429. fmov s24, s17
  430. fmov s25, s16
  431. fmov s28, s17
  432. fmov s29, s16
  433. .endm
.macro KERNEL4x4_I
    // Pipeline prologue for the 4x4 tile (2-lane vectors). fmul initialises
    // the accumulators; the fmla/fmul ordering is deliberately scrambled to
    // spread dependent instructions apart. Pre-loads the odd operand set
    // (v4/v5, v12/v13) for KERNEL4x4_M2.
    ld1 {v8.2s, v9.2s}, [pB]        // b0..b3
    add pB, pB, #16
    ld1 {v0.2s, v1.2s}, [pA]        // a0..a3
    add pA, pA, #16
    fmul v16.2s, v0.2s, v8.s[0]
    fmul v29.2s, v1.2s, v9.s[1]
    fmul v20.2s, v0.2s, v8.s[1]
    fmul v25.2s, v1.2s, v9.s[0]
    fmul v24.2s, v0.2s, v9.s[0]
    fmul v21.2s, v1.2s, v8.s[1]
    fmul v28.2s, v0.2s, v9.s[1]
    fmul v17.2s, v1.2s, v8.s[0]
    ld1 {v12.2s, v13.2s}, [pB]      // pre-load odd operand set
    add pB, pB, #16
    ld1 {v4.2s, v5.2s}, [pA]
    add pA, pA, #16
.endm
.macro KERNEL4x4_M1
    // "Even" step: accumulate from v0/v1 x v8/v9; loads for the next
    // round and a B-stream prefetch are interleaved with the FMAs.
    fmla v16.2s, v0.2s, v8.s[0]
    fmla v29.2s, v1.2s, v9.s[1]
    ld1 {v12.2s, v13.2s}, [pB] // For next round
    add pB, pB, #16
    fmla v20.2s, v0.2s, v8.s[1]
    fmla v25.2s, v1.2s, v9.s[0]
    ld1 {v4.2s, v5.2s}, [pA] // For next round
    add pA, pA, #16
    fmla v24.2s, v0.2s, v9.s[0]
    fmla v21.2s, v1.2s, v8.s[1]
    prfm PLDL1KEEP, [pB, #512]      // prefetch ahead in packed B
    fmla v28.2s, v0.2s, v9.s[1]
    fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
    // "Odd" step: mirror of _M1 — accumulate from v4/v5 x v12/v13,
    // load the even set, and prefetch ahead in packed A.
    fmla v16.2s, v4.2s, v12.s[0]
    fmla v29.2s, v5.2s, v13.s[1]
    ld1 {v8.2s, v9.2s}, [pB] // For next round
    add pB, pB, #16
    fmla v20.2s, v4.2s, v12.s[1]
    fmla v25.2s, v5.2s, v13.s[0]
    ld1 {v0.2s, v1.2s}, [pA] // For next round
    add pA, pA, #16
    fmla v24.2s, v4.2s, v13.s[0]
    fmla v21.2s, v5.2s, v12.s[1]
    prfm PLDL1KEEP, [pA, #512]      // prefetch ahead in packed A
    fmla v28.2s, v4.2s, v13.s[1]
    fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
    // Pipeline epilogue: drain the last pre-loaded odd operand set.
    fmla v16.2s, v4.2s, v12.s[0]
    fmla v29.2s, v5.2s, v13.s[1]
    fmla v20.2s, v4.2s, v12.s[1]
    fmla v25.2s, v5.2s, v13.s[0]
    fmla v24.2s, v4.2s, v13.s[0]
    fmla v21.2s, v5.2s, v12.s[1]
    fmla v28.2s, v4.2s, v13.s[1]
    fmla v17.2s, v5.2s, v12.s[0]
.endm
  492. .macro KERNEL4x4_SUB
  493. ld1 {v8.2s, v9.2s}, [pB]
  494. add pB, pB, #16
  495. ld1 {v0.2s, v1.2s}, [pA]
  496. add pA, pA, #16
  497. fmla v16.2s, v0.2s, v8.s[0]
  498. fmla v29.2s, v1.2s, v9.s[1]
  499. fmla v20.2s, v0.2s, v8.s[1]
  500. fmla v25.2s, v1.2s, v9.s[0]
  501. fmla v24.2s, v0.2s, v9.s[0]
  502. fmla v21.2s, v1.2s, v8.s[1]
  503. fmla v28.2s, v0.2s, v9.s[1]
  504. fmla v17.2s, v1.2s, v8.s[0]
  505. .endm
.macro SAVE4x4
    // Write back the 4x4 tile: C[:,j] += alpha * acc[:,j] for j = 0..3.
    // Alternates v8/v9 and v12/v13 as scratch, and alphaV0/V1 vs alphaV2/V3,
    // so consecutive columns use disjoint registers.
    ld1 {v8.2s, v9.2s}, [pCRow0]            // column 0
    fmla v8.2s, v16.2s, alphaV0
    fmla v9.2s, v17.2s, alphaV1
    st1 {v8.2s, v9.2s}, [pCRow0]
    add pCRow1, pCRow0, LDC
    ld1 {v12.2s, v13.2s}, [pCRow1]          // column 1
    fmla v12.2s, v20.2s, alphaV2
    fmla v13.2s, v21.2s, alphaV3
    st1 {v12.2s, v13.2s}, [pCRow1]
    add pCRow2, pCRow1, LDC
    ld1 {v8.2s, v9.2s}, [pCRow2]            // column 2
    fmla v8.2s, v24.2s, alphaV0
    fmla v9.2s, v25.2s, alphaV1
    st1 {v8.2s, v9.2s}, [pCRow2]
    add pCRow1, pCRow2, LDC
    ld1 {v12.2s, v13.2s}, [pCRow1]          // column 3
    fmla v12.2s, v28.2s, alphaV2
    fmla v13.2s, v29.2s, alphaV3
    st1 {v12.2s, v13.2s}, [pCRow1]
    add pCRow0, pCRow0, #16                 // advance to the next 4-row panel (4 floats)
.endm
  528. /******************************************************************************/
  529. .macro INIT2x4
  530. fmov s16, wzr
  531. fmov s20, s16
  532. fmov s24, s20
  533. fmov s28, s16
  534. .endm
  535. .macro KERNEL2x4_SUB
  536. ld1 {v8.2s, v9.2s}, [pB]
  537. add pB, pB, #16
  538. ld1 {v0.2s}, [pA]
  539. add pA, pA, #8
  540. fmla v16.2s, v0.2s, v8.s[0]
  541. fmla v20.2s, v0.2s, v8.s[1]
  542. fmla v24.2s, v0.2s, v9.s[0]
  543. fmla v28.2s, v0.2s, v9.s[1]
  544. .endm
.macro SAVE2x4
    // Write back the 2x4 tile: two C elements per column, four columns,
    // column pointers leapfrogging by LDC bytes.
    ld1 {v8.2s}, [pCRow0]               // column 0
    fmla v8.2s, v16.2s, alphaV0
    st1 {v8.2s}, [pCRow0]
    add pCRow1, pCRow0, LDC
    ld1 {v12.2s}, [pCRow1]              // column 1
    fmla v12.2s, v20.2s, alphaV1
    st1 {v12.2s}, [pCRow1]
    add pCRow2, pCRow1, LDC
    ld1 {v8.2s}, [pCRow2]               // column 2
    fmla v8.2s, v24.2s, alphaV2
    st1 {v8.2s}, [pCRow2]
    add pCRow1, pCRow2, LDC
    ld1 {v12.2s}, [pCRow1]              // column 3
    fmla v12.2s, v28.2s, alphaV3
    st1 {v12.2s}, [pCRow1]
    add pCRow0, pCRow0, #8              // advance to the next 2-row panel (2 floats)
.endm
  563. /******************************************************************************/
  564. .macro INIT1x4
  565. fmov s16, wzr
  566. fmov s20, s16
  567. .endm
  568. .macro KERNEL1x4_SUB
  569. ldr s0, [pA]
  570. add pA, pA, #4
  571. ld1 {v8.2s, v9.2s}, [pB]
  572. add pB, pB, #16
  573. fmla v16.2s, v8.2s, v0.s[0]
  574. fmla v20.2s, v9.2s, v0.s[0]
  575. .endm
.macro SAVE1x4
    // Write back the 1x4 tile. Each vector accumulator holds two column
    // results, so lane-wise ld1/st1 scatter them to two C columns each.
    add pCRow1, pCRow0, LDC
    ld1 {v8.s}[0], [pCRow0]             // gather C[0,0] and C[0,1] into one vector
    ld1 {v8.s}[1], [pCRow1]
    fmla v8.2s, v16.2s, alphaV0
    st1 {v8.s}[0], [pCRow0]
    st1 {v8.s}[1], [pCRow1]
    add pCRow2, pCRow1, LDC
    add pCRow1, pCRow2, LDC
    ld1 {v12.s}[0], [pCRow2]            // gather C[0,2] and C[0,3]
    ld1 {v12.s}[1], [pCRow1]
    fmla v12.2s, v20.2s, alphaV1
    st1 {v12.s}[0], [pCRow2]
    st1 {v12.s}[1], [pCRow1]
    add pCRow0, pCRow0, #4              // advance to the next single-row panel (1 float)
.endm
  592. /******************************************************************************/
  593. .macro INIT16x2
  594. fmov s16, wzr
  595. fmov s17, wzr
  596. fmov s18, wzr
  597. fmov s19, s16
  598. fmov s20, wzr
  599. fmov s21, s16
  600. fmov s22, wzr
  601. fmov s23, s16
  602. .endm
  603. .macro KERNEL16x2_SUB
  604. ld1 {v8.2s}, [pB]
  605. add pB, pB, #8
  606. ld1 {v0.4s}, [pA]
  607. add pA, pA, #16
  608. ld1 {v1.4s}, [pA]
  609. add pA, pA, #16
  610. ld1 {v2.4s}, [pA]
  611. add pA, pA, #16
  612. ld1 {v3.4s}, [pA]
  613. add pA, pA, #16
  614. fmla v16.4s, v0.4s, v8.s[0]
  615. fmla v17.4s, v1.4s, v8.s[0]
  616. fmla v18.4s, v2.4s, v8.s[0]
  617. fmla v19.4s, v3.4s, v8.s[0]
  618. fmla v20.4s, v0.4s, v8.s[1]
  619. fmla v21.4s, v1.4s, v8.s[1]
  620. fmla v22.4s, v2.4s, v8.s[1]
  621. fmla v23.4s, v3.4s, v8.s[1]
  622. .endm
.macro SAVE16x2
    // Write back the 16x2 tile: C[:,j] += alpha * acc[:,j] for j = 0, 1.
    add pCRow1, pCRow0, LDC
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]      // column 0
    fmla v0.4s, v16.4s, alphaV0
    fmla v1.4s, v17.4s, alphaV1
    fmla v2.4s, v18.4s, alphaV2
    fmla v3.4s, v19.4s, alphaV3
    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
    ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]      // column 1
    fmla v4.4s, v20.4s, alphaV0
    fmla v5.4s, v21.4s, alphaV1
    fmla v6.4s, v22.4s, alphaV2
    fmla v7.4s, v23.4s, alphaV3
    st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
    add pCRow0, pCRow0, #64                         // next 16-row panel
.endm
  639. /******************************************************************************/
  640. .macro INIT8x2
  641. fmov s16, wzr
  642. fmov s17, s16
  643. fmov s20, s17
  644. fmov s21, s16
  645. .endm
  646. .macro KERNEL8x2_SUB
  647. ld1 {v8.2s}, [pB]
  648. add pB, pB, #8
  649. ld1 {v0.4s}, [pA]
  650. add pA, pA, #16
  651. ld1 {v1.4s}, [pA]
  652. add pA, pA, #16
  653. fmla v16.4s, v0.4s, v8.s[0]
  654. fmla v17.4s, v1.4s, v8.s[0]
  655. fmla v20.4s, v0.4s, v8.s[1]
  656. fmla v21.4s, v1.4s, v8.s[1]
  657. .endm
.macro SAVE8x2
    // Write back the 8x2 tile: C[:,j] += alpha * acc[:,j] for j = 0, 1.
    add pCRow1, pCRow0, LDC
    ld1 {v0.4s, v1.4s}, [pCRow0]        // column 0
    fmla v0.4s, v16.4s, alphaV0
    fmla v1.4s, v17.4s, alphaV1
    st1 {v0.4s, v1.4s}, [pCRow0]
    add pCRow2, pCRow1, LDC             // computed but unused here (kept for symmetry with SAVE8x4)
    ld1 {v4.4s, v5.4s}, [pCRow1]        // column 1
    fmla v4.4s, v20.4s, alphaV0
    fmla v5.4s, v21.4s, alphaV1
    st1 {v4.4s, v5.4s}, [pCRow1]
    add pCRow0, pCRow0, #32             // next 8-row panel
.endm
  671. /******************************************************************************/
  672. .macro INIT4x2
  673. fmov s16, wzr
  674. fmov s17, s16
  675. fmov s20, s17
  676. fmov s21, s16
  677. .endm
  678. .macro KERNEL4x2_SUB
  679. ld1 {v8.2s}, [pB]
  680. add pB, pB, #8
  681. ld1 {v0.2s, v1.2s}, [pA]
  682. add pA, pA, #16
  683. fmla v16.2s, v0.2s, v8.s[0]
  684. fmla v17.2s, v1.2s, v8.s[0]
  685. fmla v20.2s, v0.2s, v8.s[1]
  686. fmla v21.2s, v1.2s, v8.s[1]
  687. .endm
.macro SAVE4x2
    // Write back the 4x2 tile: C[:,j] += alpha * acc[:,j] for j = 0, 1.
    ld1 {v8.2s, v9.2s}, [pCRow0]        // column 0
    fmla v8.2s, v16.2s, alphaV0
    fmla v9.2s, v17.2s, alphaV1
    st1 {v8.2s, v9.2s}, [pCRow0]
    add pCRow1, pCRow0, LDC
    ld1 {v12.2s, v13.2s}, [pCRow1]      // column 1
    fmla v12.2s, v20.2s, alphaV2
    fmla v13.2s, v21.2s, alphaV3
    st1 {v12.2s, v13.2s}, [pCRow1]
    add pCRow0, pCRow0, #16             // next 4-row panel
.endm
  700. /******************************************************************************/
  701. .macro INIT2x2
  702. fmov s16, wzr
  703. fmov s20, s16
  704. .endm
  705. .macro KERNEL2x2_SUB
  706. ld1 {v8.2s}, [pB]
  707. add pB, pB, #8
  708. ld1 {v0.2s}, [pA]
  709. add pA, pA, #8
  710. fmla v16.2s, v0.2s, v8.s[0]
  711. fmla v20.2s, v0.2s, v8.s[1]
  712. .endm
.macro SAVE2x2
    // Write back the 2x2 tile: two elements per column, two columns.
    ld1 {v8.2s}, [pCRow0]               // column 0
    fmla v8.2s, v16.2s, alphaV0
    st1 {v8.2s}, [pCRow0]
    add pCRow1 , pCRow0, LDC
    ld1 {v12.2s}, [pCRow1]              // column 1
    fmla v12.2s, v20.2s, alphaV1
    st1 {v12.2s}, [pCRow1]
    add pCRow0, pCRow0, #8              // next 2-row panel
.endm
  723. /******************************************************************************/
  724. .macro INIT1x2
  725. fmov s16, wzr
  726. .endm
  727. .macro KERNEL1x2_SUB
  728. ld1 {v8.2s} , [pB]
  729. add pB , pB, #8
  730. ldr s0 , [pA]
  731. add pA, pA, #4
  732. fmla v16.2s, v8.2s, v0.s[0]
  733. .endm
.macro SAVE1x2
    // Write back the 1x2 tile: v16's two lanes go to two C columns
    // via lane-wise loads/stores.
    add pCRow1 , pCRow0, LDC
    ld1 {v8.s}[0], [pCRow0]
    ld1 {v8.s}[1], [pCRow1]
    fmla v8.2s, v16.2s, alphaV0
    st1 {v8.s}[0], [pCRow0]
    st1 {v8.s}[1], [pCRow1]
    add pCRow0, pCRow0, #4              // next single-row panel
.endm
  743. /******************************************************************************/
  744. .macro INIT16x1
  745. fmov s16, wzr
  746. fmov s17, wzr
  747. fmov s18, wzr
  748. fmov s19, s16
  749. .endm
  750. .macro KERNEL16x1_SUB
  751. ldr s8, [pB]
  752. add pB , pB, #4
  753. ld1 {v0.4s}, [pA]
  754. add pA, pA, #16
  755. ld1 {v1.4s}, [pA]
  756. add pA, pA, #16
  757. ld1 {v2.4s}, [pA]
  758. add pA, pA, #16
  759. ld1 {v3.4s}, [pA]
  760. add pA, pA, #16
  761. fmla v16.4s, v0.4s, v8.s[0]
  762. fmla v17.4s, v1.4s, v8.s[0]
  763. fmla v18.4s, v2.4s, v8.s[0]
  764. fmla v19.4s, v3.4s, v8.s[0]
  765. .endm
.macro SAVE16x1
    // Write back the 16x1 tile: C[:,0] += alpha * acc.
    ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
    fmla v0.4s, v16.4s, alphaV0
    fmla v1.4s, v17.4s, alphaV1
    fmla v2.4s, v18.4s, alphaV2
    fmla v3.4s, v19.4s, alphaV3
    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
    add pCRow0, pCRow0, #64             // next 16-row panel
.endm
  775. /******************************************************************************/
  776. .macro INIT8x1
  777. fmov s16, wzr
  778. fmov s17, wzr
  779. .endm
  780. .macro KERNEL8x1_SUB
  781. ldr s8, [pB]
  782. add pB , pB, #4
  783. ld1 {v0.4s}, [pA]
  784. add pA, pA, #16
  785. ld1 {v1.4s}, [pA]
  786. add pA, pA, #16
  787. fmla v16.4s, v0.4s, v8.s[0]
  788. fmla v17.4s, v1.4s, v8.s[0]
  789. .endm
.macro SAVE8x1
    // Write back the 8x1 tile: C[:,0] += alpha * acc.
    ld1 {v0.4s, v1.4s}, [pCRow0]
    fmla v0.4s, v16.4s, alphaV0
    fmla v1.4s, v17.4s, alphaV1
    st1 {v0.4s, v1.4s}, [pCRow0]
    add pCRow0, pCRow0, #32             // next 8-row panel
.endm
  797. /******************************************************************************/
  798. .macro INIT4x1
  799. fmov s16, wzr
  800. fmov s17, s16
  801. .endm
  802. .macro KERNEL4x1_SUB
  803. ldr s8, [pB]
  804. add pB , pB, #4
  805. ld1 {v0.2s, v1.2s}, [pA]
  806. add pA , pA, #16
  807. fmla v16.2s, v0.2s, v8.s[0]
  808. fmla v17.2s, v1.2s, v8.s[0]
  809. .endm
  810. .macro SAVE4x1
  811. ld1 {v8.2s, v9.2s}, [pCRow0]
  812. fmla v8.2s, v16.2s, alphaV0
  813. fmla v9.2s, v17.2s, alphaV1
  814. st1 {v8.2s, v9.2s}, [pCRow0]
  815. add pCRow0, pCRow0, #16
  816. .endm
  817. /******************************************************************************/
  818. .macro INIT2x1
  819. fmov s16, wzr
  820. .endm
  821. .macro KERNEL2x1_SUB
  822. ldr s8, [pB]
  823. add pB , pB, #4
  824. ld1 {v0.2s}, [pA]
  825. add pA , pA, #8
  826. fmla v16.2s, v0.2s, v8.s[0]
  827. .endm
  828. .macro SAVE2x1
  829. ld1 {v8.2s}, [pCRow0]
  830. fmla v8.2s, v16.2s, alphaV0
  831. st1 {v8.2s}, [pCRow0]
  832. add pCRow0, pCRow0, #8
  833. .endm
  834. /******************************************************************************/
  835. .macro INIT1x1
  836. fmov s16, wzr
  837. .endm
  838. .macro KERNEL1x1_SUB
  839. ldr s8, [pB]
  840. add pB , pB, #4
  841. ldr s0, [pA]
  842. add pA , pA, #4
  843. fmadd s16, s0, s8, s16
  844. .endm
  845. .macro SAVE1x1
  846. ldr s8, [pCRow0]
  847. fmla s8, s16, alphaV0
  848. str s8, [pCRow0]
  849. add pCRow0, pCRow0, #4
  850. .endm
  851. /*******************************************************************************
  852. * End of macro definitions
  853. *******************************************************************************/
854. PROLOGUE
855. sgemm_kernel_begin:
856. .align 5
// Reserve 176 bytes and spill registers. AAPCS64 requires saving d8-d15 and
// x19-x28; d16/d17 and x18 are saved here as well (unnecessary but harmless).
857. add sp, sp, #-(11 * 16)
858. stp d8, d9, [sp, #(0 * 16)]
859. stp d10, d11, [sp, #(1 * 16)]
860. stp d12, d13, [sp, #(2 * 16)]
861. stp d14, d15, [sp, #(3 * 16)]
862. stp d16, d17, [sp, #(4 * 16)]
863. stp x18, x19, [sp, #(5 * 16)]
864. stp x20, x21, [sp, #(6 * 16)]
865. stp x22, x23, [sp, #(7 * 16)]
866. stp x24, x25, [sp, #(8 * 16)]
867. stp x26, x27, [sp, #(9 * 16)]
868. str x28, [sp, #(10 * 16)]
// Duplicate scalar alpha (arrives in s0) into four alias registers so the
// SAVE macros can scale four vectors with independent multiplier operands.
869. fmov alpha0, s0
870. fmov alpha1, s0
871. fmov alpha2, s0
872. fmov alpha3, s0
// Convert ldc from elements to bytes (sizeof(float) == 4).
873. lsl LDC, LDC, #2 // ldc = ldc * 4
874. mov pB, origPB
// Outer loop count: J = N / 4 four-column blocks; if none, fall through
// to the N % 4 tail handling at sgemm_kernel_L2_BEGIN.
875. mov counterJ, origN
876. asr counterJ, counterJ, #2 // J = J / 4
877. cmp counterJ, #0
878. ble sgemm_kernel_L2_BEGIN
879. /******************************************************************************/
// L4 section: process C four columns at a time. Rows are consumed in tiles of
// 16, then 8, 4, 2, 1 for the M remainder. The 16x4/8x4/4x4 paths use a
// software-pipelined K loop (_I prologue, _M1/_M2 steady state, _E epilogue).
880. sgemm_kernel_L4_BEGIN:
881. mov pCRow0, pC // pCRow0 = C
882. add pC, pC, LDC, lsl #2
883. mov pA, origPA // pA = start of A array
884. sgemm_kernel_L4_M16_BEGIN:
885. mov counterI, origM
886. asr counterI, counterI, #4 // counterI = counterI / 16
887. cmp counterI, #0
888. ble sgemm_kernel_L4_M8_BEGIN
889. sgemm_kernel_L4_M16_20:
890. mov pB, origPB
// counterL counts K in pairs; the pipelined path needs at least two pairs
// (4 K iterations) so that the _I/_M2 prologue and _M1/_E epilogue fit.
891. asr counterL , origK, #1 // L = K / 2
892. cmp counterL , #2 // is there at least 4 to do?
893. blt sgemm_kernel_L4_M16_32
894. KERNEL16x4_I // do one in the K
895. KERNEL16x4_M2 // do another in the K
896. subs counterL, counterL, #2
897. ble sgemm_kernel_L4_M16_22a
898. .align 5
899. sgemm_kernel_L4_M16_22:
900. KERNEL16x4_M1
901. KERNEL16x4_M2
902. subs counterL, counterL, #1
903. bgt sgemm_kernel_L4_M16_22
904. sgemm_kernel_L4_M16_22a:
905. KERNEL16x4_M1
906. KERNEL16x4_E
907. b sgemm_kernel_L4_M16_44
// Short-K fallback: K/2 in {0,1}; run one _I/_E pair if there is one pair.
908. sgemm_kernel_L4_M16_32:
909. tst counterL, #1
910. ble sgemm_kernel_L4_M16_40
911. KERNEL16x4_I
912. KERNEL16x4_E
913. b sgemm_kernel_L4_M16_44
914. sgemm_kernel_L4_M16_40:
915. INIT16x4
// Odd-K tail: one extra plain subtile iteration if K is odd.
916. sgemm_kernel_L4_M16_44:
917. ands counterL , origK, #1
918. ble sgemm_kernel_L4_M16_100
919. sgemm_kernel_L4_M16_46:
920. KERNEL16x4_SUB
921. sgemm_kernel_L4_M16_100:
922. SAVE16x4
923. sgemm_kernel_L4_M16_END:
924. subs counterI, counterI, #1
925. bne sgemm_kernel_L4_M16_20
926. //------------------------------------------------------------------------------
// M remainder, 8-row tile (runs at most once: tests bit 3 of M).
927. sgemm_kernel_L4_M8_BEGIN:
928. mov counterI, origM
929. tst counterI , #15
930. ble sgemm_kernel_L4_END
931. tst counterI, #8
932. ble sgemm_kernel_L4_M4_BEGIN
933. sgemm_kernel_L4_M8_20:
934. mov pB, origPB
935. asr counterL , origK, #1 // L = K / 2
936. cmp counterL , #2 // is there at least 4 to do?
937. blt sgemm_kernel_L4_M8_32
938. KERNEL8x4_I // do one in the K
939. KERNEL8x4_M2 // do another in the K
940. subs counterL, counterL, #2
941. ble sgemm_kernel_L4_M8_22a
942. .align 5
943. sgemm_kernel_L4_M8_22:
944. KERNEL8x4_M1
945. KERNEL8x4_M2
946. subs counterL, counterL, #1
947. bgt sgemm_kernel_L4_M8_22
948. sgemm_kernel_L4_M8_22a:
949. KERNEL8x4_M1
950. KERNEL8x4_E
951. b sgemm_kernel_L4_M8_44
952. sgemm_kernel_L4_M8_32:
953. tst counterL, #1
954. ble sgemm_kernel_L4_M8_40
955. KERNEL8x4_I
956. KERNEL8x4_E
957. b sgemm_kernel_L4_M8_44
958. sgemm_kernel_L4_M8_40:
959. INIT8x4
960. sgemm_kernel_L4_M8_44:
961. ands counterL , origK, #1
962. ble sgemm_kernel_L4_M8_100
963. sgemm_kernel_L4_M8_46:
964. KERNEL8x4_SUB
965. sgemm_kernel_L4_M8_100:
966. SAVE8x4
967. sgemm_kernel_L4_M8_END:
968. //------------------------------------------------------------------------------
// M remainder, 4-row tile (tests bit 2 of M).
969. sgemm_kernel_L4_M4_BEGIN:
970. mov counterI, origM
971. tst counterI , #7
972. ble sgemm_kernel_L4_END
973. tst counterI, #4
974. ble sgemm_kernel_L4_M2_BEGIN
975. sgemm_kernel_L4_M4_20:
976. mov pB, origPB
977. asr counterL , origK, #1 // L = K / 2
978. cmp counterL , #2 // is there at least 4 to do?
979. blt sgemm_kernel_L4_M4_32
980. KERNEL4x4_I // do one in the K
981. KERNEL4x4_M2 // do another in the K
982. subs counterL, counterL, #2
983. ble sgemm_kernel_L4_M4_22a
984. .align 5
985. sgemm_kernel_L4_M4_22:
986. KERNEL4x4_M1
987. KERNEL4x4_M2
988. subs counterL, counterL, #1
989. bgt sgemm_kernel_L4_M4_22
990. sgemm_kernel_L4_M4_22a:
991. KERNEL4x4_M1
992. KERNEL4x4_E
993. b sgemm_kernel_L4_M4_44
994. sgemm_kernel_L4_M4_32:
995. tst counterL, #1
996. ble sgemm_kernel_L4_M4_40
997. KERNEL4x4_I
998. KERNEL4x4_E
999. b sgemm_kernel_L4_M4_44
1000. sgemm_kernel_L4_M4_40:
1001. INIT4x4
1002. sgemm_kernel_L4_M4_44:
1003. ands counterL , origK, #1
1004. ble sgemm_kernel_L4_M4_100
1005. sgemm_kernel_L4_M4_46:
1006. KERNEL4x4_SUB
1007. sgemm_kernel_L4_M4_100:
1008. SAVE4x4
1009. sgemm_kernel_L4_M4_END:
1010. //------------------------------------------------------------------------------
// M remainder, 2-row tile: simple K loop unrolled 8x plus a K % 8 tail.
1011. sgemm_kernel_L4_M2_BEGIN:
1012. mov counterI, origM
1013. tst counterI , #3
1014. ble sgemm_kernel_L4_END
1015. tst counterI, #2 // counterI = counterI / 2
1016. ble sgemm_kernel_L4_M1_BEGIN
1017. sgemm_kernel_L4_M2_20:
1018. INIT2x4
1019. mov pB, origPB
1020. asr counterL , origK, #3 // counterL = counterL / 8
1021. cmp counterL , #0
1022. ble sgemm_kernel_L4_M2_40
1023. sgemm_kernel_L4_M2_22:
1024. KERNEL2x4_SUB
1025. KERNEL2x4_SUB
1026. KERNEL2x4_SUB
1027. KERNEL2x4_SUB
1028. KERNEL2x4_SUB
1029. KERNEL2x4_SUB
1030. KERNEL2x4_SUB
1031. KERNEL2x4_SUB
1032. subs counterL, counterL, #1
1033. bgt sgemm_kernel_L4_M2_22
1034. sgemm_kernel_L4_M2_40:
1035. ands counterL , origK, #7 // counterL = counterL % 8
1036. ble sgemm_kernel_L4_M2_100
1037. sgemm_kernel_L4_M2_42:
1038. KERNEL2x4_SUB
1039. subs counterL, counterL, #1
1040. bgt sgemm_kernel_L4_M2_42
1041. sgemm_kernel_L4_M2_100:
1042. SAVE2x4
1043. sgemm_kernel_L4_M2_END:
// M remainder, final single row (tests bit 0 of M).
1044. sgemm_kernel_L4_M1_BEGIN:
1045. tst counterI, #1 // counterI = counterI % 2
1046. ble sgemm_kernel_L4_END
1047. sgemm_kernel_L4_M1_20:
1048. INIT1x4
1049. mov pB, origPB
1050. asr counterL , origK, #3 // counterL = counterL / 8
1051. cmp counterL , #0
1052. ble sgemm_kernel_L4_M1_40
1053. sgemm_kernel_L4_M1_22:
1054. KERNEL1x4_SUB
1055. KERNEL1x4_SUB
1056. KERNEL1x4_SUB
1057. KERNEL1x4_SUB
1058. KERNEL1x4_SUB
1059. KERNEL1x4_SUB
1060. KERNEL1x4_SUB
1061. KERNEL1x4_SUB
1062. subs counterL, counterL, #1
1063. bgt sgemm_kernel_L4_M1_22
1064. sgemm_kernel_L4_M1_40:
1065. ands counterL , origK, #7 // counterL = counterL % 8
1066. ble sgemm_kernel_L4_M1_100
1067. sgemm_kernel_L4_M1_42:
1068. KERNEL1x4_SUB
1069. subs counterL, counterL, #1
1070. bgt sgemm_kernel_L4_M1_42
1071. sgemm_kernel_L4_M1_100:
1072. SAVE1x4
// Advance B past the 4 columns just consumed (K rows * 4 cols * 4 bytes)
// and loop to the next 4-column block.
1073. sgemm_kernel_L4_END:
1074. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
1075. subs counterJ, counterJ , #1 // j--
1076. bgt sgemm_kernel_L4_BEGIN
1077. /******************************************************************************/
// L2 section: handle a remaining pair of columns (bit 1 of N). Same row-tile
// cascade as L4 (16/8/4/2/1) but with a plain 8x-unrolled K loop per tile.
1078. sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
1079. mov counterJ , origN
1080. tst counterJ , #3
1081. ble sgemm_kernel_L999
1082. tst counterJ , #2
1083. ble sgemm_kernel_L1_BEGIN
1084. mov pCRow0, pC // pCRow0 = pC
1085. add pC,pC,LDC, lsl #1
1086. mov pA, origPA // pA = A
1087. sgemm_kernel_L2_M16_BEGIN:
1088. mov counterI, origM
1089. asr counterI, counterI, #4 // counterI = counterI / 16
1090. cmp counterI,#0
1091. ble sgemm_kernel_L2_M8_BEGIN
1092. sgemm_kernel_L2_M16_20:
1093. INIT16x2
1094. mov pB, origPB
1095. asr counterL , origK, #3 // counterL = counterL / 8
1096. cmp counterL,#0
1097. ble sgemm_kernel_L2_M16_40
1098. .align 5
1099. sgemm_kernel_L2_M16_22:
1100. KERNEL16x2_SUB
1101. KERNEL16x2_SUB
1102. KERNEL16x2_SUB
1103. KERNEL16x2_SUB
1104. KERNEL16x2_SUB
1105. KERNEL16x2_SUB
1106. KERNEL16x2_SUB
1107. KERNEL16x2_SUB
1108. subs counterL, counterL, #1
1109. bgt sgemm_kernel_L2_M16_22
1110. sgemm_kernel_L2_M16_40:
1111. ands counterL , origK, #7 // counterL = counterL % 8
1112. ble sgemm_kernel_L2_M16_100
1113. sgemm_kernel_L2_M16_42:
1114. KERNEL16x2_SUB
1115. subs counterL, counterL, #1
1116. bgt sgemm_kernel_L2_M16_42
1117. sgemm_kernel_L2_M16_100:
1118. SAVE16x2
1119. sgemm_kernel_L2_M16_END:
1120. subs counterI, counterI, #1
1121. bgt sgemm_kernel_L2_M16_20
1122. //------------------------------------------------------------------------------
// M remainder, 8-row tile.
1123. sgemm_kernel_L2_M8_BEGIN:
1124. mov counterI, origM
1125. tst counterI , #15
1126. ble sgemm_kernel_L2_END
1127. tst counterI, #8
1128. ble sgemm_kernel_L2_M4_BEGIN
1129. sgemm_kernel_L2_M8_20:
1130. INIT8x2
1131. mov pB, origPB
1132. asr counterL , origK, #3 // counterL = counterL / 8
1133. cmp counterL,#0
1134. ble sgemm_kernel_L2_M8_40
1135. .align 5
1136. sgemm_kernel_L2_M8_22:
1137. KERNEL8x2_SUB
1138. KERNEL8x2_SUB
1139. KERNEL8x2_SUB
1140. KERNEL8x2_SUB
1141. KERNEL8x2_SUB
1142. KERNEL8x2_SUB
1143. KERNEL8x2_SUB
1144. KERNEL8x2_SUB
1145. subs counterL, counterL, #1
1146. bgt sgemm_kernel_L2_M8_22
1147. sgemm_kernel_L2_M8_40:
1148. ands counterL , origK, #7 // counterL = counterL % 8
1149. ble sgemm_kernel_L2_M8_100
1150. sgemm_kernel_L2_M8_42:
1151. KERNEL8x2_SUB
1152. subs counterL, counterL, #1
1153. bgt sgemm_kernel_L2_M8_42
1154. sgemm_kernel_L2_M8_100:
1155. SAVE8x2
1156. sgemm_kernel_L2_M8_END:
1157. //------------------------------------------------------------------------------
// M remainder, 4-row tile.
1158. sgemm_kernel_L2_M4_BEGIN:
1159. mov counterI, origM
1160. tst counterI , #7
1161. ble sgemm_kernel_L2_END
1162. tst counterI, #4
1163. ble sgemm_kernel_L2_M2_BEGIN
1164. sgemm_kernel_L2_M4_20:
1165. INIT4x2
1166. mov pB, origPB
1167. asr counterL , origK, #3 // counterL = counterL / 8
1168. cmp counterL,#0
1169. ble sgemm_kernel_L2_M4_40
1170. .align 5
1171. sgemm_kernel_L2_M4_22:
1172. KERNEL4x2_SUB
1173. KERNEL4x2_SUB
1174. KERNEL4x2_SUB
1175. KERNEL4x2_SUB
1176. KERNEL4x2_SUB
1177. KERNEL4x2_SUB
1178. KERNEL4x2_SUB
1179. KERNEL4x2_SUB
1180. subs counterL, counterL, #1
1181. bgt sgemm_kernel_L2_M4_22
1182. sgemm_kernel_L2_M4_40:
1183. ands counterL , origK, #7 // counterL = counterL % 8
1184. ble sgemm_kernel_L2_M4_100
1185. sgemm_kernel_L2_M4_42:
1186. KERNEL4x2_SUB
1187. subs counterL, counterL, #1
1188. bgt sgemm_kernel_L2_M4_42
1189. sgemm_kernel_L2_M4_100:
1190. SAVE4x2
1191. sgemm_kernel_L2_M4_END:
1192. //------------------------------------------------------------------------------
// M remainder, 2-row tile.
1193. sgemm_kernel_L2_M2_BEGIN:
1194. mov counterI, origM
1195. tst counterI , #3
1196. ble sgemm_kernel_L2_END
1197. tst counterI, #2 // counterI = counterI / 2
1198. ble sgemm_kernel_L2_M1_BEGIN
1199. sgemm_kernel_L2_M2_20:
1200. INIT2x2
1201. mov pB, origPB
1202. asr counterL , origK, #3 // counterL = counterL / 8
1203. cmp counterL,#0
1204. ble sgemm_kernel_L2_M2_40
1205. sgemm_kernel_L2_M2_22:
1206. KERNEL2x2_SUB
1207. KERNEL2x2_SUB
1208. KERNEL2x2_SUB
1209. KERNEL2x2_SUB
1210. KERNEL2x2_SUB
1211. KERNEL2x2_SUB
1212. KERNEL2x2_SUB
1213. KERNEL2x2_SUB
1214. subs counterL, counterL, #1
1215. bgt sgemm_kernel_L2_M2_22
1216. sgemm_kernel_L2_M2_40:
1217. ands counterL , origK, #7 // counterL = counterL % 8
1218. ble sgemm_kernel_L2_M2_100
1219. sgemm_kernel_L2_M2_42:
1220. KERNEL2x2_SUB
1221. subs counterL, counterL, #1
1222. bgt sgemm_kernel_L2_M2_42
1223. sgemm_kernel_L2_M2_100:
1224. SAVE2x2
1225. sgemm_kernel_L2_M2_END:
// M remainder, final single row.
1226. sgemm_kernel_L2_M1_BEGIN:
1227. tst counterI, #1 // counterI = counterI % 2
1228. ble sgemm_kernel_L2_END
1229. sgemm_kernel_L2_M1_20:
1230. INIT1x2
1231. mov pB, origPB
1232. asr counterL , origK, #3 // counterL = counterL / 8
1233. cmp counterL, #0
1234. ble sgemm_kernel_L2_M1_40
1235. sgemm_kernel_L2_M1_22:
1236. KERNEL1x2_SUB
1237. KERNEL1x2_SUB
1238. KERNEL1x2_SUB
1239. KERNEL1x2_SUB
1240. KERNEL1x2_SUB
1241. KERNEL1x2_SUB
1242. KERNEL1x2_SUB
1243. KERNEL1x2_SUB
1244. subs counterL, counterL, #1
1245. bgt sgemm_kernel_L2_M1_22
1246. sgemm_kernel_L2_M1_40:
1247. ands counterL , origK, #7 // counterL = counterL % 8
1248. ble sgemm_kernel_L2_M1_100
1249. sgemm_kernel_L2_M1_42:
1250. KERNEL1x2_SUB
1251. subs counterL, counterL, #1
1252. bgt sgemm_kernel_L2_M1_42
1253. sgemm_kernel_L2_M1_100:
1254. SAVE1x2
// Advance B past the 2 columns just consumed (K rows * 2 cols * 4 bytes).
1255. sgemm_kernel_L2_END:
1256. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
1257. /******************************************************************************/
// L1 section: handle the final single column (bit 0 of N), same row-tile
// cascade (16/8/4/2/1), 8x-unrolled K loop per tile.
1258. sgemm_kernel_L1_BEGIN:
1259. mov counterJ , origN
1260. tst counterJ , #1
1261. ble sgemm_kernel_L999 // done
1262. mov pCRow0, pC // pCRow0 = C
1263. add pC , pC , LDC // Update pC to point to next
1264. mov pA, origPA // pA = A
1265. sgemm_kernel_L1_M16_BEGIN:
1266. mov counterI, origM
1267. asr counterI, counterI, #4 // counterI = counterI / 16
1268. cmp counterI, #0
1269. ble sgemm_kernel_L1_M8_BEGIN
1270. sgemm_kernel_L1_M16_20:
1271. INIT16x1
1272. mov pB, origPB
1273. asr counterL , origK, #3 // counterL = counterL / 8
1274. cmp counterL , #0
1275. ble sgemm_kernel_L1_M16_40
1276. .align 5
1277. sgemm_kernel_L1_M16_22:
1278. KERNEL16x1_SUB
1279. KERNEL16x1_SUB
1280. KERNEL16x1_SUB
1281. KERNEL16x1_SUB
1282. KERNEL16x1_SUB
1283. KERNEL16x1_SUB
1284. KERNEL16x1_SUB
1285. KERNEL16x1_SUB
1286. subs counterL, counterL, #1
1287. bgt sgemm_kernel_L1_M16_22
1288. sgemm_kernel_L1_M16_40:
1289. ands counterL , origK, #7 // counterL = counterL % 8
1290. ble sgemm_kernel_L1_M16_100
1291. sgemm_kernel_L1_M16_42:
1292. KERNEL16x1_SUB
1293. subs counterL, counterL, #1
1294. bgt sgemm_kernel_L1_M16_42
1295. sgemm_kernel_L1_M16_100:
1296. SAVE16x1
1297. sgemm_kernel_L1_M16_END:
1298. subs counterI, counterI, #1
1299. bgt sgemm_kernel_L1_M16_20
1300. //------------------------------------------------------------------------------
// M remainder, 8-row tile.
1301. sgemm_kernel_L1_M8_BEGIN:
1302. mov counterI, origM
1303. tst counterI , #15
1304. ble sgemm_kernel_L1_END
1305. tst counterI, #8
1306. ble sgemm_kernel_L1_M4_BEGIN
1307. sgemm_kernel_L1_M8_20:
1308. INIT8x1
1309. mov pB, origPB
1310. asr counterL , origK, #3 // counterL = counterL / 8
1311. cmp counterL , #0
1312. ble sgemm_kernel_L1_M8_40
1313. .align 5
1314. sgemm_kernel_L1_M8_22:
1315. KERNEL8x1_SUB
1316. KERNEL8x1_SUB
1317. KERNEL8x1_SUB
1318. KERNEL8x1_SUB
1319. KERNEL8x1_SUB
1320. KERNEL8x1_SUB
1321. KERNEL8x1_SUB
1322. KERNEL8x1_SUB
1323. subs counterL, counterL, #1
1324. bgt sgemm_kernel_L1_M8_22
1325. sgemm_kernel_L1_M8_40:
1326. ands counterL , origK, #7 // counterL = counterL % 8
1327. ble sgemm_kernel_L1_M8_100
1328. sgemm_kernel_L1_M8_42:
1329. KERNEL8x1_SUB
1330. subs counterL, counterL, #1
1331. bgt sgemm_kernel_L1_M8_42
1332. sgemm_kernel_L1_M8_100:
1333. SAVE8x1
1334. sgemm_kernel_L1_M8_END:
1335. //------------------------------------------------------------------------------
// M remainder, 4-row tile.
1336. sgemm_kernel_L1_M4_BEGIN:
1337. mov counterI, origM
1338. tst counterI , #7
1339. ble sgemm_kernel_L1_END
1340. tst counterI, #4
1341. ble sgemm_kernel_L1_M2_BEGIN
1342. sgemm_kernel_L1_M4_20:
1343. INIT4x1
1344. mov pB, origPB
1345. asr counterL , origK, #3 // counterL = counterL / 8
1346. cmp counterL , #0
1347. ble sgemm_kernel_L1_M4_40
1348. .align 5
1349. sgemm_kernel_L1_M4_22:
1350. KERNEL4x1_SUB
1351. KERNEL4x1_SUB
1352. KERNEL4x1_SUB
1353. KERNEL4x1_SUB
1354. KERNEL4x1_SUB
1355. KERNEL4x1_SUB
1356. KERNEL4x1_SUB
1357. KERNEL4x1_SUB
1358. subs counterL, counterL, #1
1359. bgt sgemm_kernel_L1_M4_22
1360. sgemm_kernel_L1_M4_40:
1361. ands counterL , origK, #7 // counterL = counterL % 8
1362. ble sgemm_kernel_L1_M4_100
1363. sgemm_kernel_L1_M4_42:
1364. KERNEL4x1_SUB
1365. subs counterL, counterL, #1
1366. bgt sgemm_kernel_L1_M4_42
1367. sgemm_kernel_L1_M4_100:
1368. SAVE4x1
1369. sgemm_kernel_L1_M4_END:
1370. //------------------------------------------------------------------------------
// M remainder, 2-row tile.
1371. sgemm_kernel_L1_M2_BEGIN:
1372. mov counterI, origM
1373. tst counterI , #3
1374. ble sgemm_kernel_L1_END
1375. tst counterI, #2 // counterI = counterI / 2
1376. ble sgemm_kernel_L1_M1_BEGIN
1377. sgemm_kernel_L1_M2_20:
1378. INIT2x1
1379. mov pB, origPB
1380. asr counterL , origK, #3 // counterL = counterL / 8
1381. cmp counterL , #0
1382. ble sgemm_kernel_L1_M2_40
1383. sgemm_kernel_L1_M2_22:
1384. KERNEL2x1_SUB
1385. KERNEL2x1_SUB
1386. KERNEL2x1_SUB
1387. KERNEL2x1_SUB
1388. KERNEL2x1_SUB
1389. KERNEL2x1_SUB
1390. KERNEL2x1_SUB
1391. KERNEL2x1_SUB
1392. subs counterL, counterL, #1
1393. bgt sgemm_kernel_L1_M2_22
1394. sgemm_kernel_L1_M2_40:
1395. ands counterL , origK, #7 // counterL = counterL % 8
1396. ble sgemm_kernel_L1_M2_100
1397. sgemm_kernel_L1_M2_42:
1398. KERNEL2x1_SUB
1399. subs counterL, counterL, #1
1400. bgt sgemm_kernel_L1_M2_42
1401. sgemm_kernel_L1_M2_100:
1402. SAVE2x1
1403. sgemm_kernel_L1_M2_END:
// M remainder, final single element (1x1).
1404. sgemm_kernel_L1_M1_BEGIN:
1405. tst counterI, #1 // counterI = counterI % 2
1406. ble sgemm_kernel_L1_END
1407. sgemm_kernel_L1_M1_20:
1408. INIT1x1
1409. mov pB, origPB
1410. asr counterL , origK, #3 // counterL = counterL / 8
1411. cmp counterL , #0
1412. ble sgemm_kernel_L1_M1_40
1413. sgemm_kernel_L1_M1_22:
1414. KERNEL1x1_SUB
1415. KERNEL1x1_SUB
1416. KERNEL1x1_SUB
1417. KERNEL1x1_SUB
1418. KERNEL1x1_SUB
1419. KERNEL1x1_SUB
1420. KERNEL1x1_SUB
1421. KERNEL1x1_SUB
1422. subs counterL, counterL, #1
1423. bgt sgemm_kernel_L1_M1_22
1424. sgemm_kernel_L1_M1_40:
1425. ands counterL , origK, #7 // counterL = counterL % 8
1426. ble sgemm_kernel_L1_M1_100
1427. sgemm_kernel_L1_M1_42:
1428. KERNEL1x1_SUB
1429. subs counterL, counterL, #1
1430. bgt sgemm_kernel_L1_M1_42
1431. sgemm_kernel_L1_M1_100:
1432. SAVE1x1
1433. sgemm_kernel_L1_END:
// Common exit: restore the registers spilled in the prologue (same layout,
// same offsets), release the stack frame and return 0 in x0.
1434. sgemm_kernel_L999:
1435. mov x0, #0 // set return value
1436. ldp d8, d9, [sp, #(0 * 16)]
1437. ldp d10, d11, [sp, #(1 * 16)]
1438. ldp d12, d13, [sp, #(2 * 16)]
1439. ldp d14, d15, [sp, #(3 * 16)]
1440. ldp d16, d17, [sp, #(4 * 16)]
1441. ldp x18, x19, [sp, #(5 * 16)]
1442. ldp x20, x21, [sp, #(6 * 16)]
1443. ldp x22, x23, [sp, #(7 * 16)]
1444. ldp x24, x25, [sp, #(8 * 16)]
1445. ldp x26, x27, [sp, #(9 * 16)]
1446. ldr x28, [sp, #(10 * 16)]
1447. add sp, sp, #(11*16)
1448. ret
1449. EPILOGUE