sgemm_kernel_8x8_cortexa53.S

  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
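// Per the AArch64 procedure call standard the integer and pointer arguments
// arrive in x0-x6 and the scalar alpha in s0; the defines below give those
// registers the names used throughout the kernel.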
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define temp x16
  48. #define alpha0 s10
  49. #define alphaV0 v10.s[0]
  50. #define alpha1 s11
  51. #define alphaV1 v11.s[0]
  52. #define alpha2 s14
  53. #define alphaV2 v14.s[0]
  54. #define alpha3 s15
  55. #define alphaV3 v15.s[0]
  56. #define A_PRE_SIZE 640
  57. #define B_PRE_SIZE 224
  58. #define C_PRE_SIZE 96
  59. // 00 origM
  60. // 01 origN
  61. // 02 origK
  62. // 03 origPA
  63. // 04 origPB
  64. // 05 pC
  65. // 06 origLDC -> LDC
  66. // 07 offset
  67. // 08 counterL
  68. // 09 counterI
  69. // 10 counterJ
  70. // 11 pB
  71. // 12 pCRow0
  72. // 13 pCRow1
  73. // 14 pCRow2
  74. // 15 pA
  75. // 16 temp
  76. // 17
  77. // 18 must save
  78. // 19 must save
  79. // 20 must save pA0_2, pA0_3
  80. // 21 must save pA0_6, pA0_7
  81. // 22 must save pA1_2, pA1_3
  82. // 23 must save pA1_6, pA1_7
  83. // 24 must save pB0_2, pB0_3
  84. // 25 must save pB0_6, pB0_7
  85. // 26 must save pB1_2, pB1_3
  86. // 27 must save pB1_6, pB1_7
  87. // 28 must save
  88. // 29 frame
  89. // 30 link
  90. // 31 sp
  91. //v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
  92. //v01 pA0_4, pA0_5, pA0_6, pA0_7
  93. //v02 pA1_0, pA1_1, pA1_2, pA1_3
  94. //v03 pA1_4, pA1_5, pA1_6, pA1_7
  95. //v04 pB0_0, pB0_1, pB0_2, pB0_3
  96. //v05 pB0_4, pB0_5, pB0_6, pB0_7
  97. //v06 pB1_0, pB1_1, pB1_2, pB1_3
  98. //v07 pB1_4, pB1_5, pB1_6, pB1_7
  99. //v08 must save
  100. //v09 must save
  101. //v10 must save ALPHA0
  102. //v11 must save ALPHA1
  103. //v12 must save
  104. //v13 must save
  105. //v14 must save ALPHA2
  106. //v15 must save ALPHA3
  107. //v16 must save C00, C01, C02, C03
  108. //v17 must save C04, C05, C06, C07
  109. //v18 C08, C09, C10, C11
  110. //v19 C12, C13, C14, C15
  111. //v20 C16, C17, C18, C19
  112. //v21 C20, C21, C22, C23
  113. //v22 C24, C25, C26, C27
  114. //v23 C28, C29, C30, C31
  115. //v24 C32, C33, C34, C35
  116. //v25 C36, C37, C38, C39
  117. //v26 C40, C41, C42, C43
  118. //v27 C44, C45, C46, C47
  119. //v28 C48, C49, C50, C51
  120. //v29 C52, C53, C54, C55
  121. //v30 C56, C57, C58, C59
  122. //v31 C60, C61, C62, C63
  123. /*******************************************************************************
  124. * Macro definitions
  125. *******************************************************************************/
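// Each tile size M x N uses up to three macro families:
//   INITMxN     - clear the accumulator registers for an MxN tile of C
//   KERNELMxN_* - one k-step of the update; _I/_M1/_M2/_E form a
//                 software-pipelined loop, _SUB is the plain single-step
//                 version used for the K remainder
//   SAVEMxN     - load the C tile, accumulate alpha * (A*B), store it back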
  126. .macro INIT8x8
  127. fmov s16, wzr
  128. fmov s17, wzr
  129. fmov s18, s16
  130. fmov s19, s17
  131. fmov s20, wzr
  132. fmov s21, s16
  133. fmov s22, s17
  134. fmov s23, s18
  135. fmov s24, wzr
  136. fmov s25, s16
  137. fmov s26, s17
  138. fmov s27, s18
  139. fmov s28, wzr
  140. fmov s29, s16
  141. fmov s30, s17
  142. fmov s31, s18
  143. .endm
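// The 8x8 kernel is software pipelined for the in-order Cortex-A53:
// KERNEL8x8_I starts the pipeline (fmul into fresh accumulators) while it
// loads the next A/B panel; KERNEL8x8_M1 and KERNEL8x8_M2 then alternate
// between the register sets v0/v1/v4/v5 and v2/v3/v6/v7. The upper halves
// of the incoming vectors are fetched with integer ldr into x20-x27 and
// inserted with fmov, so the scalar load pipe runs in parallel with the
// NEON fmla stream. KERNEL8x8_E drains the pipeline without issuing loads.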
  144. .macro KERNEL8x8_I
  145. ldp q0, q1, [pA], #32
  146. ldp q4, q5, [pB], #32
  147. ldr d2, [pA], #8
  148. ldr d6, [pB], #8
  149. ldr d3, [pA, #8]
  150. ldr d7, [pB, #8]
  151. ldr x22, [pA], #16
  152. fmul v16.4s, v0.4s, v4.s[0]
  153. ldr x26, [pB], #16
  154. fmul v17.4s, v1.4s, v4.s[0]
  155. ldr x23, [pA], #8
  156. fmul v18.4s, v0.4s, v4.s[1]
  157. ldr x27, [pB], #8
  158. fmul v19.4s, v1.4s, v4.s[1]
  159. fmul v20.4s, v0.4s, v4.s[2]
  160. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  161. fmul v21.4s, v1.4s, v4.s[2]
  162. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  163. fmul v22.4s, v0.4s, v4.s[3]
  164. fmul v23.4s, v1.4s, v4.s[3]
  165. fmul v24.4s, v0.4s, v5.s[0]
  166. fmul v25.4s, v1.4s, v5.s[0]
  167. fmul v26.4s, v0.4s, v5.s[1]
  168. fmul v27.4s, v1.4s, v5.s[1]
  169. fmul v28.4s, v0.4s, v5.s[2]
  170. fmul v29.4s, v1.4s, v5.s[2]
  171. fmul v30.4s, v0.4s, v5.s[3]
  172. fmul v31.4s, v1.4s, v5.s[3]
  173. .endm
  174. .macro KERNEL8x8_M1
  175. ldr d2, [pA], #8
  176. fmov v0.d[1], x20
  177. ldr d6, [pB], #8
  178. fmov v4.d[1], x24
  179. ldr d3, [pA, #8]
  180. fmov v1.d[1], x21
  181. ldr d7, [pB, #8]
  182. fmov v5.d[1], x25
  183. fmla v16.4s, v0.4s, v4.s[0]
  184. ldr x22, [pA], #16
  185. fmla v17.4s, v1.4s, v4.s[0]
  186. ldr x26, [pB], #16
  187. fmla v18.4s, v0.4s, v4.s[1]
  188. ldr x23, [pA], #8
  189. fmla v19.4s, v1.4s, v4.s[1]
  190. ldr x27, [pB], #8
  191. fmla v20.4s, v0.4s, v4.s[2]
  192. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  193. fmla v21.4s, v1.4s, v4.s[2]
  194. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  195. fmla v22.4s, v0.4s, v4.s[3]
  196. fmla v23.4s, v1.4s, v4.s[3]
  197. fmla v24.4s, v0.4s, v5.s[0]
  198. fmla v25.4s, v1.4s, v5.s[0]
  199. fmla v26.4s, v0.4s, v5.s[1]
  200. fmla v27.4s, v1.4s, v5.s[1]
  201. fmla v28.4s, v0.4s, v5.s[2]
  202. fmla v29.4s, v1.4s, v5.s[2]
  203. fmla v30.4s, v0.4s, v5.s[3]
  204. fmla v31.4s, v1.4s, v5.s[3]
  205. .endm
  206. .macro KERNEL8x8_M2
  207. ldr d0, [pA], #8
  208. fmov v2.d[1], x22
  209. ldr d4, [pB], #8
  210. fmov v6.d[1], x26
  211. ldr d1, [pA, #8]
  212. fmov v3.d[1], x23
  213. ldr d5, [pB, #8]
  214. fmov v7.d[1], x27
  215. fmla v16.4s, v2.4s, v6.s[0]
  216. ldr x20, [pA], #16
  217. fmla v17.4s, v3.4s, v6.s[0]
  218. ldr x24, [pB], #16
  219. fmla v18.4s, v2.4s, v6.s[1]
  220. ldr x21, [pA], #8
  221. fmla v19.4s, v3.4s, v6.s[1]
  222. ldr x25, [pB], #8
  223. fmla v20.4s, v2.4s, v6.s[2]
  224. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  225. fmla v21.4s, v3.4s, v6.s[2]
  226. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  227. fmla v22.4s, v2.4s, v6.s[3]
  228. fmla v23.4s, v3.4s, v6.s[3]
  229. fmla v24.4s, v2.4s, v7.s[0]
  230. fmla v25.4s, v3.4s, v7.s[0]
  231. fmla v26.4s, v2.4s, v7.s[1]
  232. fmla v27.4s, v3.4s, v7.s[1]
  233. fmla v28.4s, v2.4s, v7.s[2]
  234. fmla v29.4s, v3.4s, v7.s[2]
  235. fmla v30.4s, v2.4s, v7.s[3]
  236. fmla v31.4s, v3.4s, v7.s[3]
  237. .endm
  238. .macro KERNEL8x8_E
  239. fmov v2.d[1], x22
  240. fmov v6.d[1], x26
  241. fmov v3.d[1], x23
  242. fmov v7.d[1], x27
  243. fmla v16.4s, v2.4s, v6.s[0]
  244. fmla v17.4s, v3.4s, v6.s[0]
  245. fmla v18.4s, v2.4s, v6.s[1]
  246. fmla v19.4s, v3.4s, v6.s[1]
  247. fmla v20.4s, v2.4s, v6.s[2]
  248. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  249. fmla v21.4s, v3.4s, v6.s[2]
  250. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  251. fmla v22.4s, v2.4s, v6.s[3]
  252. fmla v23.4s, v3.4s, v6.s[3]
  253. fmla v24.4s, v2.4s, v7.s[0]
  254. fmla v25.4s, v3.4s, v7.s[0]
  255. fmla v26.4s, v2.4s, v7.s[1]
  256. fmla v27.4s, v3.4s, v7.s[1]
  257. fmla v28.4s, v2.4s, v7.s[2]
  258. fmla v29.4s, v3.4s, v7.s[2]
  259. fmla v30.4s, v2.4s, v7.s[3]
  260. fmla v31.4s, v3.4s, v7.s[3]
  261. .endm
  262. .macro KERNEL8x8_SUB
  263. ldp q0, q1, [pA], #32
  264. ldp q4, q5, [pB], #32
  265. fmla v16.4s, v0.4s, v4.s[0]
  266. fmla v17.4s, v1.4s, v4.s[0]
  267. fmla v18.4s, v0.4s, v4.s[1]
  268. fmla v19.4s, v1.4s, v4.s[1]
  269. fmla v20.4s, v0.4s, v4.s[2]
  270. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  271. fmla v21.4s, v1.4s, v4.s[2]
  272. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  273. fmla v22.4s, v0.4s, v4.s[3]
  274. fmla v23.4s, v1.4s, v4.s[3]
  275. fmla v24.4s, v0.4s, v5.s[0]
  276. fmla v25.4s, v1.4s, v5.s[0]
  277. fmla v26.4s, v0.4s, v5.s[1]
  278. fmla v27.4s, v1.4s, v5.s[1]
  279. fmla v28.4s, v0.4s, v5.s[2]
  280. fmla v29.4s, v1.4s, v5.s[2]
  281. fmla v30.4s, v0.4s, v5.s[3]
  282. fmla v31.4s, v1.4s, v5.s[3]
  283. .endm
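// SAVE8x8 writes back the finished 8x8 tile: pCRow1/pCRow2 step through the
// tile one LDC (byte) stride at a time, the existing 8 floats of C are
// loaded at each step, the accumulators are added in scaled by alpha (held
// in v10/v11/v14/v15 and applied as an fmla by-element operand), and the
// result is stored. pCRow0 then advances by 32 bytes (8 floats).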
  284. .macro SAVE8x8
  285. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  286. add pCRow1, pCRow0, LDC
  287. ldp q0, q1, [pCRow0]
  288. fmla v0.4s, v16.4s, alphaV0
  289. fmla v1.4s, v17.4s, alphaV1
  290. stp q0, q1, [pCRow0]
  291. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  292. add pCRow2, pCRow1, LDC
  293. ldp q2, q3, [pCRow1]
  294. fmla v2.4s, v18.4s, alphaV2
  295. fmla v3.4s, v19.4s, alphaV3
  296. stp q2, q3, [pCRow1]
  297. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  298. add pCRow1, pCRow2, LDC
  299. ldp q4, q5, [pCRow2]
  300. fmla v4.4s, v20.4s, alphaV0
  301. fmla v5.4s, v21.4s, alphaV1
  302. stp q4, q5, [pCRow2]
  303. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  304. add pCRow2, pCRow1, LDC
  305. ldp q6, q7, [pCRow1]
  306. fmla v6.4s, v22.4s, alphaV2
  307. fmla v7.4s, v23.4s, alphaV3
  308. stp q6, q7, [pCRow1]
  309. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  310. add pCRow1, pCRow2, LDC
  311. ldp q0, q1, [pCRow2]
  312. fmla v0.4s, v24.4s, alphaV0
  313. fmla v1.4s, v25.4s, alphaV1
  314. stp q0, q1, [pCRow2]
  315. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  316. add pCRow2, pCRow1, LDC
  317. ldp q2, q3, [pCRow1]
  318. fmla v2.4s, v26.4s, alphaV2
  319. fmla v3.4s, v27.4s, alphaV3
  320. stp q2, q3, [pCRow1]
  321. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  322. add pCRow1, pCRow2, LDC
  323. ldp q4, q5, [pCRow2]
  324. fmla v4.4s, v28.4s, alphaV0
  325. fmla v5.4s, v29.4s, alphaV1
  326. stp q4, q5, [pCRow2]
  327. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  328. ldp q6, q7, [pCRow1]
  329. fmla v6.4s, v30.4s, alphaV2
  330. fmla v7.4s, v31.4s, alphaV3
  331. stp q6, q7, [pCRow1]
  332. add pCRow0, pCRow0, #32
  333. .endm
  334. /******************************************************************************/
  335. .macro INIT4x8
  336. fmov s16, wzr
  337. fmov s18, wzr
  338. fmov s20, wzr
  339. fmov s22, s16
  340. fmov s24, wzr
  341. fmov s26, s16
  342. fmov s28, s18
  343. fmov s30, s20
  344. .endm
  345. .macro KERNEL4x8_I
  346. ldr q0, [pA], #16
  347. ldp q4, q5, [pB], #32
  348. ldr d2, [pA], #8
  349. ldr d6, [pB], #8
  350. ldr d7, [pB, #8]
  351. ldr x22, [pA], #8
  352. fmul v16.4s, v0.4s, v4.s[0]
  353. ldr x26, [pB], #16
  354. fmul v18.4s, v0.4s, v4.s[1]
  355. ldr x27, [pB], #8
  356. fmul v20.4s, v0.4s, v4.s[2]
  357. fmul v22.4s, v0.4s, v4.s[3]
  358. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  359. fmul v24.4s, v0.4s, v5.s[0]
  360. fmul v26.4s, v0.4s, v5.s[1]
  361. fmul v28.4s, v0.4s, v5.s[2]
  362. fmul v30.4s, v0.4s, v5.s[3]
  363. .endm
  364. .macro KERNEL4x8_M1
  365. ldr d2, [pA], #8
  366. fmov v0.d[1], x20
  367. ldr d6, [pB], #8
  368. fmov v4.d[1], x24
  369. ldr d7, [pB, #8]
  370. fmov v5.d[1], x25
  371. ldr x22, [pA], #8
  372. fmla v16.4s, v0.4s, v4.s[0]
  373. ldr x26, [pB], #16
  374. fmla v18.4s, v0.4s, v4.s[1]
  375. ldr x27, [pB], #8
  376. fmla v20.4s, v0.4s, v4.s[2]
  377. fmla v22.4s, v0.4s, v4.s[3]
  378. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  379. fmla v24.4s, v0.4s, v5.s[0]
  380. fmla v26.4s, v0.4s, v5.s[1]
  381. fmla v28.4s, v0.4s, v5.s[2]
  382. fmla v30.4s, v0.4s, v5.s[3]
  383. .endm
  384. .macro KERNEL4x8_M2
  385. ldr d0, [pA], #8
  386. fmov v2.d[1], x22
  387. ldr d4, [pB], #8
  388. fmov v6.d[1], x26
  389. ldr d5, [pB, #8]
  390. fmov v7.d[1], x27
  391. ldr x20, [pA], #8
  392. fmla v16.4s, v2.4s, v6.s[0]
  393. ldr x24, [pB], #16
  394. fmla v18.4s, v2.4s, v6.s[1]
  395. ldr x25, [pB], #8
  396. fmla v20.4s, v2.4s, v6.s[2]
  397. fmla v22.4s, v2.4s, v6.s[3]
  398. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  399. fmla v24.4s, v2.4s, v7.s[0]
  400. fmla v26.4s, v2.4s, v7.s[1]
  401. fmla v28.4s, v2.4s, v7.s[2]
  402. fmla v30.4s, v2.4s, v7.s[3]
  403. .endm
  404. .macro KERNEL4x8_E
  405. fmov v2.d[1], x22
  406. fmov v6.d[1], x26
  407. fmov v7.d[1], x27
  408. fmla v16.4s, v2.4s, v6.s[0]
  409. fmla v18.4s, v2.4s, v6.s[1]
  410. fmla v20.4s, v2.4s, v6.s[2]
  411. fmla v22.4s, v2.4s, v6.s[3]
  412. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  413. fmla v24.4s, v2.4s, v7.s[0]
  414. fmla v26.4s, v2.4s, v7.s[1]
  415. fmla v28.4s, v2.4s, v7.s[2]
  416. fmla v30.4s, v2.4s, v7.s[3]
  417. .endm
  418. .macro KERNEL4x8_SUB
  419. ldr q0, [pA], #16
  420. ldp q4, q5, [pB], #32
  421. fmla v16.4s, v0.4s, v4.s[0]
  422. fmla v18.4s, v0.4s, v4.s[1]
  423. fmla v20.4s, v0.4s, v4.s[2]
  424. fmla v22.4s, v0.4s, v4.s[3]
  425. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  426. fmla v24.4s, v0.4s, v5.s[0]
  427. fmla v26.4s, v0.4s, v5.s[1]
  428. fmla v28.4s, v0.4s, v5.s[2]
  429. fmla v30.4s, v0.4s, v5.s[3]
  430. .endm
  431. .macro SAVE4x8
  432. add pCRow1, pCRow0, LDC
  433. ldr q0, [pCRow0]
  434. fmla v0.4s, v16.4s, alphaV0
  435. str q0, [pCRow0]
  436. add pCRow2, pCRow1, LDC
  437. ldr q2, [pCRow1]
  438. fmla v2.4s, v18.4s, alphaV2
  439. str q2, [pCRow1]
  440. add pCRow1, pCRow2, LDC
  441. ldr q4, [pCRow2]
  442. fmla v4.4s, v20.4s, alphaV0
  443. str q4, [pCRow2]
  444. add pCRow2, pCRow1, LDC
  445. ldr q6, [pCRow1]
  446. fmla v6.4s, v22.4s, alphaV2
  447. str q6, [pCRow1]
  448. add pCRow1, pCRow2, LDC
  449. ldr q0, [pCRow2]
  450. fmla v0.4s, v24.4s, alphaV0
  451. str q0, [pCRow2]
  452. add pCRow2, pCRow1, LDC
  453. ldr q2, [pCRow1]
  454. fmla v2.4s, v26.4s, alphaV2
  455. str q2, [pCRow1]
  456. add pCRow1, pCRow2, LDC
  457. ldr q4, [pCRow2]
  458. fmla v4.4s, v28.4s, alphaV0
  459. str q4, [pCRow2]
  460. ldr q6, [pCRow1]
  461. fmla v6.4s, v30.4s, alphaV2
  462. str q6, [pCRow1]
  463. add pCRow0, pCRow0, #16
  464. .endm
  465. /******************************************************************************/
  466. .macro INIT2x8
  467. fmov s16, wzr
  468. fmov s18, wzr
  469. fmov s20, wzr
  470. fmov s22, s16
  471. fmov s24, wzr
  472. fmov s26, s16
  473. fmov s28, s18
  474. fmov s30, s20
  475. .endm
  476. .macro KERNEL2x8_SUB
  477. ldr d0, [pA], #8
  478. ldp q4, q5, [pB], #32
  479. fmla v16.2s, v0.2s, v4.s[0]
  480. fmla v18.2s, v0.2s, v4.s[1]
  481. fmla v20.2s, v0.2s, v4.s[2]
  482. fmla v22.2s, v0.2s, v4.s[3]
  483. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  484. fmla v24.2s, v0.2s, v5.s[0]
  485. fmla v26.2s, v0.2s, v5.s[1]
  486. fmla v28.2s, v0.2s, v5.s[2]
  487. fmla v30.2s, v0.2s, v5.s[3]
  488. .endm
  489. .macro SAVE2x8
  490. add pCRow1, pCRow0, LDC
  491. ldr d0, [pCRow0]
  492. fmla v0.2s, v16.2s, alphaV0
  493. str d0, [pCRow0]
  494. add pCRow2, pCRow1, LDC
  495. ldr d2, [pCRow1]
  496. fmla v2.2s, v18.2s, alphaV2
  497. str d2, [pCRow1]
  498. add pCRow1, pCRow2, LDC
  499. ldr d4, [pCRow2]
  500. fmla v4.2s, v20.2s, alphaV0
  501. str d4, [pCRow2]
  502. add pCRow2, pCRow1, LDC
  503. ldr d6, [pCRow1]
  504. fmla v6.2s, v22.2s, alphaV2
  505. str d6, [pCRow1]
  506. add pCRow1, pCRow2, LDC
  507. ldr d0, [pCRow2]
  508. fmla v0.2s, v24.2s, alphaV0
  509. str d0, [pCRow2]
  510. add pCRow2, pCRow1, LDC
  511. ldr d2, [pCRow1]
  512. fmla v2.2s, v26.2s, alphaV2
  513. str d2, [pCRow1]
  514. add pCRow1, pCRow2, LDC
  515. ldr d4, [pCRow2]
  516. fmla v4.2s, v28.2s, alphaV0
  517. str d4, [pCRow2]
  518. ldr d6, [pCRow1]
  519. fmla v6.2s, v30.2s, alphaV2
  520. str d6, [pCRow1]
  521. add pCRow0, pCRow0, #8
  522. .endm
  523. /******************************************************************************/
  524. .macro INIT1x8
  525. fmov s16, wzr
  526. fmov s18, wzr
  527. fmov s20, wzr
  528. fmov s22, s16
  529. fmov s24, wzr
  530. fmov s26, s16
  531. fmov s28, s18
  532. fmov s30, s20
  533. .endm
  534. .macro KERNEL1x8_SUB
  535. ldp q4, q5, [pB], #32
  536. ldr s0, [pA], #4
  537. fmla s16, s0, v4.s[0]
  538. fmla s18, s0, v4.s[1]
  539. fmla s20, s0, v4.s[2]
  540. fmla s22, s0, v4.s[3]
  541. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  542. fmla s24, s0, v5.s[0]
  543. fmla s26, s0, v5.s[1]
  544. fmla s28, s0, v5.s[2]
  545. fmla s30, s0, v5.s[3]
  546. .endm
  547. .macro SAVE1x8
  548. add pCRow1, pCRow0, LDC
  549. ldr s0, [pCRow0]
  550. fmla s0, s16, alphaV0
  551. str s0, [pCRow0]
  552. add pCRow2, pCRow1, LDC
  553. ldr s2, [pCRow1]
  554. fmla s2, s18, alphaV2
  555. str s2, [pCRow1]
  556. add pCRow1, pCRow2, LDC
  557. ldr s4, [pCRow2]
  558. fmla s4, s20, alphaV0
  559. str s4, [pCRow2]
  560. add pCRow2, pCRow1, LDC
  561. ldr s6, [pCRow1]
  562. fmla s6, s22, alphaV2
  563. str s6, [pCRow1]
  564. add pCRow1, pCRow2, LDC
  565. ldr s0, [pCRow2]
  566. fmla s0, s24, alphaV0
  567. str s0, [pCRow2]
  568. add pCRow2, pCRow1, LDC
  569. ldr s2, [pCRow1]
  570. fmla s2, s26, alphaV2
  571. str s2, [pCRow1]
  572. add pCRow1, pCRow2, LDC
  573. ldr s4, [pCRow2]
  574. fmla s4, s28, alphaV0
  575. str s4, [pCRow2]
  576. ldr s6, [pCRow1]
  577. fmla s6, s30, alphaV2
  578. str s6, [pCRow1]
  579. add pCRow0, pCRow0, #4
  580. .endm
  581. /******************************************************************************/
  582. .macro INIT8x4
  583. fmov s16, wzr
  584. fmov s17, wzr
  585. fmov s18, wzr
  586. fmov s19, s16
  587. fmov s20, wzr
  588. fmov s21, s16
  589. fmov s22, wzr
  590. fmov s23, s16
  591. .endm
  592. .macro KERNEL8x4_I
  593. ldp q0, q1, [pA], #32
  594. ldr q4, [pB], #16
  595. ldr d2, [pA], #8
  596. ldr d6, [pB], #8
  597. ldr d3, [pA, #8]
  598. fmul v16.4s, v0.4s, v4.s[0]
  599. ldr x22, [pA], #16
  600. fmul v17.4s, v1.4s, v4.s[0]
  601. ldr x26, [pB], #8
  602. fmul v18.4s, v0.4s, v4.s[1]
  603. ldr x23, [pA], #8
  604. fmul v19.4s, v1.4s, v4.s[1]
  605. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  606. fmul v20.4s, v0.4s, v4.s[2]
  607. fmul v21.4s, v1.4s, v4.s[2]
  608. fmul v22.4s, v0.4s, v4.s[3]
  609. fmul v23.4s, v1.4s, v4.s[3]
  610. .endm
  611. .macro KERNEL8x4_M1
  612. ldr d2, [pA], #8
  613. fmov v0.d[1], x20
  614. ldr d6, [pB], #8
  615. fmov v4.d[1], x24
  616. ldr d3, [pA, #8]
  617. fmov v1.d[1], x21
  618. ldr x22, [pA], #16
  619. fmla v16.4s, v0.4s, v4.s[0]
  620. ldr x26, [pB], #8
  621. fmla v17.4s, v1.4s, v4.s[0]
  622. ldr x23, [pA], #8
  623. fmla v18.4s, v0.4s, v4.s[1]
  624. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  625. fmla v19.4s, v1.4s, v4.s[1]
  626. fmla v20.4s, v0.4s, v4.s[2]
  627. fmla v21.4s, v1.4s, v4.s[2]
  628. fmla v22.4s, v0.4s, v4.s[3]
  629. fmla v23.4s, v1.4s, v4.s[3]
  630. .endm
  631. .macro KERNEL8x4_M2
  632. ldr d0, [pA], #8
  633. fmov v2.d[1], x22
  634. ldr d4, [pB], #8
  635. fmov v6.d[1], x26
  636. ldr d1, [pA, #8]
  637. fmov v3.d[1], x23
  638. ldr x20, [pA], #16
  639. fmla v16.4s, v2.4s, v6.s[0]
  640. ldr x24, [pB], #8
  641. fmla v17.4s, v3.4s, v6.s[0]
  642. ldr x21, [pA], #8
  643. fmla v18.4s, v2.4s, v6.s[1]
  644. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  645. fmla v19.4s, v3.4s, v6.s[1]
  646. fmla v20.4s, v2.4s, v6.s[2]
  647. fmla v21.4s, v3.4s, v6.s[2]
  648. fmla v22.4s, v2.4s, v6.s[3]
  649. fmla v23.4s, v3.4s, v6.s[3]
  650. .endm
  651. .macro KERNEL8x4_E
  652. fmov v2.d[1], x22
  653. fmov v6.d[1], x26
  654. fmov v3.d[1], x23
  655. fmla v16.4s, v2.4s, v6.s[0]
  656. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  657. fmla v17.4s, v3.4s, v6.s[0]
  658. fmla v18.4s, v2.4s, v6.s[1]
  659. fmla v19.4s, v3.4s, v6.s[1]
  660. fmla v20.4s, v2.4s, v6.s[2]
  661. fmla v21.4s, v3.4s, v6.s[2]
  662. fmla v22.4s, v2.4s, v6.s[3]
  663. fmla v23.4s, v3.4s, v6.s[3]
  664. .endm
  665. .macro KERNEL8x4_SUB
  666. ldp q0, q1, [pA], #32
  667. ldr q4, [pB], #16
  668. fmla v16.4s, v0.4s, v4.s[0]
  669. fmla v17.4s, v1.4s, v4.s[0]
  670. fmla v18.4s, v0.4s, v4.s[1]
  671. fmla v19.4s, v1.4s, v4.s[1]
  672. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  673. fmla v20.4s, v0.4s, v4.s[2]
  674. fmla v21.4s, v1.4s, v4.s[2]
  675. fmla v22.4s, v0.4s, v4.s[3]
  676. fmla v23.4s, v1.4s, v4.s[3]
  677. .endm
  678. .macro SAVE8x4
  679. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  680. add pCRow1, pCRow0, LDC
  681. ldp q0, q1, [pCRow0]
  682. fmla v0.4s, v16.4s, alphaV0
  683. fmla v1.4s, v17.4s, alphaV1
  684. stp q0, q1, [pCRow0]
  685. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  686. add pCRow2, pCRow1, LDC
  687. ldp q4, q5, [pCRow1]
  688. fmla v4.4s, v18.4s, alphaV0
  689. fmla v5.4s, v19.4s, alphaV1
  690. stp q4, q5, [pCRow1]
  691. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  692. add pCRow1, pCRow2, LDC
  693. ldp q0, q1, [pCRow2]
  694. fmla v0.4s, v20.4s, alphaV0
  695. fmla v1.4s, v21.4s, alphaV1
  696. stp q0, q1, [pCRow2]
  697. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  698. ldp q4, q5, [pCRow1]
  699. fmla v4.4s, v22.4s, alphaV0
  700. fmla v5.4s, v23.4s, alphaV1
  701. stp q4, q5, [pCRow1]
  702. add pCRow0, pCRow0, #32
  703. .endm
  704. /******************************************************************************/
  705. .macro INIT4x4
  706. fmov s16, wzr
  707. fmov s18, wzr
  708. fmov s20, wzr
  709. fmov s22, wzr
  710. .endm
  711. .macro KERNEL4x4_I
  712. ldr q0, [pA], #16
  713. ldr q4, [pB], #16
  714. ldr d2, [pA], #8
  715. ldr d6, [pB], #8
  716. fmul v16.4s, v0.4s, v4.s[0]
  717. ldr x22, [pA], #8
  718. fmul v18.4s, v0.4s, v4.s[1]
  719. ldr x26, [pB], #8
  720. fmul v20.4s, v0.4s, v4.s[2]
  721. fmul v22.4s, v0.4s, v4.s[3]
  722. .endm
  723. .macro KERNEL4x4_M1
  724. ldr d2, [pA], #8
  725. fmov v0.d[1], x20
  726. ldr d6, [pB], #8
  727. fmov v4.d[1], x24
  728. ldr x22, [pA], #8
  729. ldr x26, [pB], #8
  730. fmla v16.4s, v0.4s, v4.s[0]
  731. fmla v18.4s, v0.4s, v4.s[1]
  732. fmla v20.4s, v0.4s, v4.s[2]
  733. fmla v22.4s, v0.4s, v4.s[3]
  734. .endm
  735. .macro KERNEL4x4_M2
  736. ldr d0, [pA], #8
  737. fmov v2.d[1], x22
  738. ldr d4, [pB], #8
  739. fmov v6.d[1], x26
  740. ldr x20, [pA], #8
  741. ldr x24, [pB], #8
  742. fmla v16.4s, v2.4s, v6.s[0]
  743. fmla v18.4s, v2.4s, v6.s[1]
  744. fmla v20.4s, v2.4s, v6.s[2]
  745. fmla v22.4s, v2.4s, v6.s[3]
  746. .endm
  747. .macro KERNEL4x4_E
  748. fmov v2.d[1], x22
  749. fmov v6.d[1], x26
  750. fmla v16.4s, v2.4s, v6.s[0]
  751. fmla v18.4s, v2.4s, v6.s[1]
  752. fmla v20.4s, v2.4s, v6.s[2]
  753. fmla v22.4s, v2.4s, v6.s[3]
  754. .endm
  755. .macro KERNEL4x4_SUB
  756. ldr q0, [pA], #16
  757. ldr q4, [pB], #16
  758. fmla v16.4s, v0.4s, v4.s[0]
  759. fmla v18.4s, v0.4s, v4.s[1]
  760. fmla v20.4s, v0.4s, v4.s[2]
  761. fmla v22.4s, v0.4s, v4.s[3]
  762. .endm
  763. .macro SAVE4x4
  764. ldr q0, [pCRow0]
  765. fmla v0.4s, v16.4s, alphaV0
  766. str q0, [pCRow0]
  767. add pCRow1, pCRow0, LDC
  768. ldr q1, [pCRow1]
  769. fmla v1.4s, v18.4s, alphaV2
  770. str q1, [pCRow1]
  771. add pCRow2, pCRow1, LDC
  772. ldr q2, [pCRow2]
  773. fmla v2.4s, v20.4s, alphaV0
  774. str q2, [pCRow2]
  775. add pCRow1, pCRow2, LDC
  776. ldr q3, [pCRow1]
  777. fmla v3.4s, v22.4s, alphaV2
  778. str q3, [pCRow1]
  779. add pCRow0, pCRow0, #16
  780. .endm
  781. /******************************************************************************/
  782. .macro INIT2x4
  783. fmov s16, wzr
  784. fmov s18, wzr
  785. fmov s20, wzr
  786. fmov s22, s16
  787. .endm
  788. .macro KERNEL2x4_SUB
  789. ldr d0, [pA], #8
  790. ldr q4, [pB], #16
  791. fmla v16.2s, v0.2s, v4.s[0]
  792. fmla v18.2s, v0.2s, v4.s[1]
  793. fmla v20.2s, v0.2s, v4.s[2]
  794. fmla v22.2s, v0.2s, v4.s[3]
  795. .endm
  796. .macro SAVE2x4
  797. ldr d8, [pCRow0]
  798. fmla v8.2s, v16.2s, alphaV0
  799. str d8, [pCRow0]
  800. add pCRow1, pCRow0, LDC
  801. ldr d12, [pCRow1]
  802. fmla v12.2s, v18.2s, alphaV1
  803. str d12, [pCRow1]
  804. add pCRow2, pCRow1, LDC
  805. ldr d8, [pCRow2]
  806. fmla v8.2s, v20.2s, alphaV2
  807. str d8, [pCRow2]
  808. add pCRow1, pCRow2, LDC
  809. ldr d12, [pCRow1]
  810. fmla v12.2s, v22.2s, alphaV3
  811. str d12, [pCRow1]
  812. add pCRow0, pCRow0, #8
  813. .endm
  814. /******************************************************************************/
  815. .macro INIT1x4
  816. fmov s16, wzr
  817. fmov s20, s16
  818. .endm
  819. .macro KERNEL1x4_SUB
  820. ldr s0, [pA]
  821. add pA, pA, #4
  822. ld1 {v8.2s, v9.2s}, [pB]
  823. add pB, pB, #16
  824. fmla v16.2s, v8.2s, v0.s[0]
  825. fmla v20.2s, v9.2s, v0.s[0]
  826. .endm
  827. .macro SAVE1x4
  828. add pCRow1, pCRow0, LDC
  829. ld1 {v8.s}[0], [pCRow0]
  830. ld1 {v8.s}[1], [pCRow1]
  831. fmla v8.2s, v16.2s, alphaV0
  832. st1 {v8.s}[0], [pCRow0]
  833. st1 {v8.s}[1], [pCRow1]
  834. add pCRow2, pCRow1, LDC
  835. add pCRow1, pCRow2, LDC
  836. ld1 {v12.s}[0], [pCRow2]
  837. ld1 {v12.s}[1], [pCRow1]
  838. fmla v12.2s, v20.2s, alphaV1
  839. st1 {v12.s}[0], [pCRow2]
  840. st1 {v12.s}[1], [pCRow1]
  841. add pCRow0, pCRow0, #4
  842. .endm
  843. /******************************************************************************/
  844. .macro INIT8x2
  845. fmov s16, wzr
  846. fmov s17, s16
  847. fmov s18, s17
  848. fmov s19, s16
  849. .endm
  850. .macro KERNEL8x2_SUB
  851. ldp q0, q1, [pA], #32
  852. ldr d4, [pB], #8
  853. fmla v16.4s, v0.4s, v4.s[0]
  854. fmla v17.4s, v1.4s, v4.s[0]
  855. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  856. fmla v18.4s, v0.4s, v4.s[1]
  857. fmla v19.4s, v1.4s, v4.s[1]
  858. .endm
  859. .macro SAVE8x2
  860. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  861. add pCRow1, pCRow0, LDC
  862. ldp q0, q1, [pCRow0]
  863. fmla v0.4s, v16.4s, alphaV0
  864. fmla v1.4s, v17.4s, alphaV1
  865. stp q0, q1, [pCRow0]
  866. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  867. add pCRow2, pCRow1, LDC
  868. ldp q4, q5, [pCRow1]
  869. fmla v4.4s, v18.4s, alphaV0
  870. fmla v5.4s, v19.4s, alphaV1
  871. stp q4, q5, [pCRow1]
  872. add pCRow0, pCRow0, #32
  873. .endm
  874. /******************************************************************************/
  875. .macro INIT4x2
  876. fmov s16, wzr
  877. fmov s17, s16
  878. fmov s20, s17
  879. fmov s21, s16
  880. .endm
  881. .macro KERNEL4x2_SUB
  882. ld1 {v8.2s}, [pB]
  883. add pB, pB, #8
  884. ld1 {v0.2s, v1.2s}, [pA]
  885. add pA, pA, #16
  886. fmla v16.2s, v0.2s, v8.s[0]
  887. fmla v17.2s, v1.2s, v8.s[0]
  888. fmla v20.2s, v0.2s, v8.s[1]
  889. fmla v21.2s, v1.2s, v8.s[1]
  890. .endm
  891. .macro SAVE4x2
  892. ld1 {v8.2s, v9.2s}, [pCRow0]
  893. fmla v8.2s, v16.2s, alphaV0
  894. fmla v9.2s, v17.2s, alphaV1
  895. st1 {v8.2s, v9.2s}, [pCRow0]
  896. add pCRow1, pCRow0, LDC
  897. ld1 {v12.2s, v13.2s}, [pCRow1]
  898. fmla v12.2s, v20.2s, alphaV2
  899. fmla v13.2s, v21.2s, alphaV3
  900. st1 {v12.2s, v13.2s}, [pCRow1]
  901. add pCRow0, pCRow0, #16
  902. .endm
  903. /******************************************************************************/
  904. .macro INIT2x2
  905. fmov s16, wzr
  906. fmov s20, s16
  907. .endm
  908. .macro KERNEL2x2_SUB
  909. ld1 {v8.2s}, [pB]
  910. add pB, pB, #8
  911. ld1 {v0.2s}, [pA]
  912. add pA, pA, #8
  913. fmla v16.2s, v0.2s, v8.s[0]
  914. fmla v20.2s, v0.2s, v8.s[1]
  915. .endm
  916. .macro SAVE2x2
  917. ld1 {v8.2s}, [pCRow0]
  918. fmla v8.2s, v16.2s, alphaV0
  919. st1 {v8.2s}, [pCRow0]
  920. add pCRow1 , pCRow0, LDC
  921. ld1 {v12.2s}, [pCRow1]
  922. fmla v12.2s, v20.2s, alphaV1
  923. st1 {v12.2s}, [pCRow1]
  924. add pCRow0, pCRow0, #8
  925. .endm
  926. /******************************************************************************/
  927. .macro INIT1x2
  928. fmov s16, wzr
  929. .endm
  930. .macro KERNEL1x2_SUB
  931. ld1 {v8.2s} , [pB]
  932. add pB , pB, #8
  933. ldr s0 , [pA]
  934. add pA, pA, #4
  935. fmla v16.2s, v8.2s, v0.s[0]
  936. .endm
  937. .macro SAVE1x2
  938. add pCRow1 , pCRow0, LDC
  939. ld1 {v8.s}[0], [pCRow0]
  940. ld1 {v8.s}[1], [pCRow1]
  941. fmla v8.2s, v16.2s, alphaV0
  942. st1 {v8.s}[0], [pCRow0]
  943. st1 {v8.s}[1], [pCRow1]
  944. add pCRow0, pCRow0, #4
  945. .endm
  946. /******************************************************************************/
  947. .macro INIT8x1
  948. fmov s16, wzr
  949. fmov s17, wzr
  950. .endm
  951. .macro KERNEL8x1_SUB
  952. ldr s4, [pB], #4
  953. ldp q0, q1, [pA], #32
  954. fmla v16.4s, v0.4s, v4.s[0]
  955. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  956. fmla v17.4s, v1.4s, v4.s[0]
  957. .endm
  958. .macro SAVE8x1
  959. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  960. ldp q0, q1, [pCRow0]
  961. fmla v0.4s, v16.4s, alphaV0
  962. fmla v1.4s, v17.4s, alphaV1
  963. stp q0, q1, [pCRow0]
  964. add pCRow0, pCRow0, #32
  965. .endm
  966. /******************************************************************************/
  967. .macro INIT4x1
  968. fmov s16, wzr
  969. fmov s17, s16
  970. .endm
  971. .macro KERNEL4x1_SUB
  972. ldr s8, [pB]
  973. add pB , pB, #4
  974. ld1 {v0.2s, v1.2s}, [pA]
  975. add pA , pA, #16
  976. fmla v16.2s, v0.2s, v8.s[0]
  977. fmla v17.2s, v1.2s, v8.s[0]
  978. .endm
  979. .macro SAVE4x1
  980. ld1 {v8.2s, v9.2s}, [pCRow0]
  981. fmla v8.2s, v16.2s, alphaV0
  982. fmla v9.2s, v17.2s, alphaV1
  983. st1 {v8.2s, v9.2s}, [pCRow0]
  984. add pCRow0, pCRow0, #16
  985. .endm
  986. /******************************************************************************/
  987. .macro INIT2x1
  988. fmov s16, wzr
  989. .endm
  990. .macro KERNEL2x1_SUB
  991. ldr s8, [pB]
  992. add pB , pB, #4
  993. ld1 {v0.2s}, [pA]
  994. add pA , pA, #8
  995. fmla v16.2s, v0.2s, v8.s[0]
  996. .endm
  997. .macro SAVE2x1
  998. ld1 {v8.2s}, [pCRow0]
  999. fmla v8.2s, v16.2s, alphaV0
  1000. st1 {v8.2s}, [pCRow0]
  1001. add pCRow0, pCRow0, #8
  1002. .endm
  1003. /******************************************************************************/
  1004. .macro INIT1x1
  1005. fmov s16, wzr
  1006. .endm
  1007. .macro KERNEL1x1_SUB
  1008. ldr s8, [pB]
  1009. add pB , pB, #4
  1010. ldr s0, [pA]
  1011. add pA , pA, #4
  1012. fmadd s16, s0, s8, s16
  1013. .endm
  1014. .macro SAVE1x1
  1015. ldr s8, [pCRow0]
  1016. fmla s8, s16, alphaV0
  1017. str s8, [pCRow0]
  1018. add pCRow0, pCRow0, #4
  1019. .endm
  1020. /*******************************************************************************
  1021. * End of macro definitions
  1022. *******************************************************************************/
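// Driver code: save the callee-saved registers, copy alpha into
// s10/s11/s14/s15, convert LDC to a byte stride, then loop over N in blocks
// of 8/4/2/1 and, within each, over M in blocks of 8/4/2/1. K is processed
// in unrolled chunks, with the remainder (origK & 7 or origK & 1) handled by
// the corresponding *_SUB macros.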
  1023. PROLOGUE
  1024. .Lsgemm_kernel_begin:
  1025. .align 5
  1026. add sp, sp, #-(11 * 16)
  1027. stp d8, d9, [sp, #(0 * 16)]
  1028. stp d10, d11, [sp, #(1 * 16)]
  1029. stp d12, d13, [sp, #(2 * 16)]
  1030. stp d14, d15, [sp, #(3 * 16)]
  1031. stp d16, d17, [sp, #(4 * 16)]
  1032. stp x18, x19, [sp, #(5 * 16)]
  1033. stp x20, x21, [sp, #(6 * 16)]
  1034. stp x22, x23, [sp, #(7 * 16)]
  1035. stp x24, x25, [sp, #(8 * 16)]
  1036. stp x26, x27, [sp, #(9 * 16)]
  1037. str x28, [sp, #(10 * 16)]
  1038. fmov alpha0, s0
  1039. fmov alpha1, s0
  1040. fmov alpha2, s0
  1041. fmov alpha3, s0
  1042. lsl LDC, LDC, #2 // ldc = ldc * 4
  1043. mov pB, origPB
  1044. mov counterJ, origN
  1045. asr counterJ, counterJ, #3 // J = J / 8
  1046. cmp counterJ, #0
  1047. ble .Lsgemm_kernel_L4_BEGIN
  1048. /******************************************************************************/
  1049. /******************************************************************************/
  1050. .Lsgemm_kernel_L8_BEGIN:
  1051. mov pCRow0, pC // pCRow0 = C
  1052. add pC, pC, LDC, lsl #3
  1053. mov pA, origPA // pA = start of A array
  1054. /******************************************************************************/
  1055. .Lsgemm_kernel_L8_M8_BEGIN:
  1056. mov counterI, origM
  1057. asr counterI, counterI, #3 // counterI = counterI / 8
  1058. cmp counterI, #0
  1059. ble .Lsgemm_kernel_L8_M4_BEGIN
  1060. .Lsgemm_kernel_L8_M8_20:
  1061. mov pB, origPB
  1062. asr counterL , origK, #3 // L = K / 8
  1063. cmp counterL , #2 // is there at least 16 to do?
  1064. blt .Lsgemm_kernel_L8_M8_32
  1065. KERNEL8x8_I // do one in the K
  1066. KERNEL8x8_M2 // do another in the K
  1067. KERNEL8x8_M1
  1068. KERNEL8x8_M2
  1069. KERNEL8x8_M1
  1070. KERNEL8x8_M2
  1071. KERNEL8x8_M1
  1072. KERNEL8x8_M2
  1073. subs counterL, counterL, #2
  1074. ble .Lsgemm_kernel_L8_M8_22a
  1075. .align 5
  1076. .Lsgemm_kernel_L8_M8_22:
  1077. KERNEL8x8_M1
  1078. KERNEL8x8_M2
  1079. KERNEL8x8_M1
  1080. KERNEL8x8_M2
  1081. KERNEL8x8_M1
  1082. KERNEL8x8_M2
  1083. KERNEL8x8_M1
  1084. KERNEL8x8_M2
  1085. subs counterL, counterL, #1
  1086. bgt .Lsgemm_kernel_L8_M8_22
  1087. .Lsgemm_kernel_L8_M8_22a:
  1088. KERNEL8x8_M1
  1089. KERNEL8x8_M2
  1090. KERNEL8x8_M1
  1091. KERNEL8x8_M2
  1092. KERNEL8x8_M1
  1093. KERNEL8x8_M2
  1094. KERNEL8x8_M1
  1095. KERNEL8x8_E
  1096. b .Lsgemm_kernel_L8_M8_44
  1097. .Lsgemm_kernel_L8_M8_32:
  1098. tst counterL, #1
  1099. ble .Lsgemm_kernel_L8_M8_40
  1100. KERNEL8x8_I
  1101. KERNEL8x8_M2
  1102. KERNEL8x8_M1
  1103. KERNEL8x8_M2
  1104. KERNEL8x8_M1
  1105. KERNEL8x8_M2
  1106. KERNEL8x8_M1
  1107. KERNEL8x8_E
  1108. b .Lsgemm_kernel_L8_M8_44
  1109. .Lsgemm_kernel_L8_M8_40:
  1110. INIT8x8
  1111. .Lsgemm_kernel_L8_M8_44:
  1112. ands counterL , origK, #7
  1113. ble .Lsgemm_kernel_L8_M8_100
  1114. .Lsgemm_kernel_L8_M8_46:
  1115. KERNEL8x8_SUB
  1116. subs counterL, counterL, 1
  1117. bgt .Lsgemm_kernel_L8_M8_46
  1118. .Lsgemm_kernel_L8_M8_100:
  1119. SAVE8x8
  1120. .Lsgemm_kernel_L8_M8_END:
  1121. subs counterI, counterI, #1
  1122. bne .Lsgemm_kernel_L8_M8_20
  1123. /******************************************************************************/
  1124. .Lsgemm_kernel_L8_M4_BEGIN:
  1125. mov counterI, origM
  1126. tst counterI , #7
  1127. ble .Lsgemm_kernel_L8_END
  1128. tst counterI, #4
  1129. ble .Lsgemm_kernel_L8_M2_BEGIN
  1130. .Lsgemm_kernel_L8_M4_20:
  1131. mov pB, origPB
  1132. asr counterL , origK, #1 // L = K / 2
  1133. cmp counterL , #2 // is there at least 4 to do?
  1134. blt .Lsgemm_kernel_L8_M4_32
  1135. KERNEL4x8_I // do one in the K
  1136. KERNEL4x8_M2 // do another in the K
  1137. subs counterL, counterL, #2
  1138. ble .Lsgemm_kernel_L8_M4_22a
  1139. .align 5
  1140. .Lsgemm_kernel_L8_M4_22:
  1141. KERNEL4x8_M1
  1142. KERNEL4x8_M2
  1143. subs counterL, counterL, #1
  1144. bgt .Lsgemm_kernel_L8_M4_22
  1145. .Lsgemm_kernel_L8_M4_22a:
  1146. KERNEL4x8_M1
  1147. KERNEL4x8_E
  1148. b .Lsgemm_kernel_L8_M4_44
  1149. .Lsgemm_kernel_L8_M4_32:
  1150. tst counterL, #1
  1151. ble .Lsgemm_kernel_L8_M4_40
  1152. KERNEL4x8_I
  1153. KERNEL4x8_E
  1154. b .Lsgemm_kernel_L8_M4_44
  1155. .Lsgemm_kernel_L8_M4_40:
  1156. INIT4x8
  1157. .Lsgemm_kernel_L8_M4_44:
  1158. ands counterL , origK, #1
  1159. ble .Lsgemm_kernel_L8_M4_100
  1160. .Lsgemm_kernel_L8_M4_46:
  1161. KERNEL4x8_SUB
  1162. .Lsgemm_kernel_L8_M4_100:
  1163. SAVE4x8
  1164. .Lsgemm_kernel_L8_M4_END:
  1165. /******************************************************************************/
  1166. .Lsgemm_kernel_L8_M2_BEGIN:
  1167. mov counterI, origM
  1168. tst counterI , #3
  1169. ble .Lsgemm_kernel_L8_END
  1170. tst counterI, #2 // counterI = counterI / 2
  1171. ble .Lsgemm_kernel_L8_M1_BEGIN
  1172. .Lsgemm_kernel_L8_M2_20:
  1173. INIT2x8
  1174. mov pB, origPB
  1175. asr counterL , origK, #3 // counterL = counterL / 8
  1176. cmp counterL , #0
  1177. ble .Lsgemm_kernel_L8_M2_40
  1178. .Lsgemm_kernel_L8_M2_22:
  1179. KERNEL2x8_SUB
  1180. KERNEL2x8_SUB
  1181. KERNEL2x8_SUB
  1182. KERNEL2x8_SUB
  1183. KERNEL2x8_SUB
  1184. KERNEL2x8_SUB
  1185. KERNEL2x8_SUB
  1186. KERNEL2x8_SUB
  1187. subs counterL, counterL, #1
  1188. bgt .Lsgemm_kernel_L8_M2_22
  1189. .Lsgemm_kernel_L8_M2_40:
  1190. ands counterL , origK, #7 // counterL = counterL % 8
  1191. ble .Lsgemm_kernel_L8_M2_100
  1192. .Lsgemm_kernel_L8_M2_42:
  1193. KERNEL2x8_SUB
  1194. subs counterL, counterL, #1
  1195. bgt .Lsgemm_kernel_L8_M2_42
  1196. .Lsgemm_kernel_L8_M2_100:
  1197. SAVE2x8
  1198. .Lsgemm_kernel_L8_M2_END:
  1199. /******************************************************************************/
  1200. .Lsgemm_kernel_L8_M1_BEGIN:
  1201. tst counterI, #1 // counterI = counterI % 2
  1202. ble .Lsgemm_kernel_L8_END
  1203. .Lsgemm_kernel_L8_M1_20:
  1204. INIT1x8
  1205. mov pB, origPB
  1206. asr counterL , origK, #3 // counterL = counterL / 8
  1207. cmp counterL , #0
  1208. ble .Lsgemm_kernel_L8_M1_40
  1209. .Lsgemm_kernel_L8_M1_22:
  1210. KERNEL1x8_SUB
  1211. KERNEL1x8_SUB
  1212. KERNEL1x8_SUB
  1213. KERNEL1x8_SUB
  1214. KERNEL1x8_SUB
  1215. KERNEL1x8_SUB
  1216. KERNEL1x8_SUB
  1217. KERNEL1x8_SUB
  1218. subs counterL, counterL, #1
  1219. bgt .Lsgemm_kernel_L8_M1_22
  1220. .Lsgemm_kernel_L8_M1_40:
  1221. ands counterL , origK, #7 // counterL = counterL % 8
  1222. ble .Lsgemm_kernel_L8_M1_100
  1223. .Lsgemm_kernel_L8_M1_42:
  1224. KERNEL1x8_SUB
  1225. subs counterL, counterL, #1
  1226. bgt .Lsgemm_kernel_L8_M1_42
  1227. .Lsgemm_kernel_L8_M1_100:
  1228. SAVE1x8
  1229. .Lsgemm_kernel_L8_END:
  1230. lsl temp, origK, #5 // B = B + K * 4 * 8
  1231. add origPB, origPB, temp
  1232. subs counterJ, counterJ , #1 // j--
  1233. bgt .Lsgemm_kernel_L8_BEGIN
  1234. /******************************************************************************/
  1235. /******************************************************************************/
  1236. .Lsgemm_kernel_L4_BEGIN:
  1237. mov counterJ , origN
  1238. tst counterJ , #7
  1239. ble .Lsgemm_kernel_L999
  1240. tst counterJ , #4
  1241. ble .Lsgemm_kernel_L2_BEGIN
  1242. mov pCRow0, pC // pCRow0 = pC
  1243. add pC,pC,LDC, lsl #2
  1244. mov pA, origPA // pA = A
  1245. /******************************************************************************/
  1246. .Lsgemm_kernel_L4_M8_BEGIN:
  1247. mov counterI, origM
  1248. asr counterI, counterI, #3 // counterI = counterI / 8
  1249. cmp counterI, #0
  1250. ble .Lsgemm_kernel_L4_M4_BEGIN
  1251. .Lsgemm_kernel_L4_M8_20:
  1252. mov pB, origPB
  1253. asr counterL , origK, #1 // L = K / 2
  1254. cmp counterL , #2 // is there at least 4 to do?
  1255. blt .Lsgemm_kernel_L4_M8_32
  1256. KERNEL8x4_I // do one in the K
  1257. KERNEL8x4_M2 // do another in the K
  1258. subs counterL, counterL, #2
  1259. ble .Lsgemm_kernel_L4_M8_22a
  1260. .align 5
  1261. .Lsgemm_kernel_L4_M8_22:
  1262. KERNEL8x4_M1
  1263. KERNEL8x4_M2
  1264. subs counterL, counterL, #1
  1265. bgt .Lsgemm_kernel_L4_M8_22
  1266. .Lsgemm_kernel_L4_M8_22a:
  1267. KERNEL8x4_M1
  1268. KERNEL8x4_E
  1269. b .Lsgemm_kernel_L4_M8_44
  1270. .Lsgemm_kernel_L4_M8_32:
  1271. tst counterL, #1
  1272. ble .Lsgemm_kernel_L4_M8_40
  1273. KERNEL8x4_I
  1274. KERNEL8x4_E
  1275. b .Lsgemm_kernel_L4_M8_44
  1276. .Lsgemm_kernel_L4_M8_40:
  1277. INIT8x4
  1278. .Lsgemm_kernel_L4_M8_44:
  1279. ands counterL , origK, #1
  1280. ble .Lsgemm_kernel_L4_M8_100
  1281. .Lsgemm_kernel_L4_M8_46:
  1282. KERNEL8x4_SUB
  1283. .Lsgemm_kernel_L4_M8_100:
  1284. SAVE8x4
  1285. .Lsgemm_kernel_L4_M8_END:
  1286. subs counterI, counterI, #1
  1287. bne .Lsgemm_kernel_L4_M8_20
  1288. /******************************************************************************/
  1289. .Lsgemm_kernel_L4_M4_BEGIN:
  1290. mov counterI, origM
  1291. tst counterI , #7
  1292. ble .Lsgemm_kernel_L4_END
  1293. tst counterI, #4
  1294. ble .Lsgemm_kernel_L4_M2_BEGIN
  1295. .Lsgemm_kernel_L4_M4_20:
  1296. mov pB, origPB
  1297. asr counterL , origK, #1 // L = K / 2
  1298. cmp counterL , #2 // is there at least 4 to do?
  1299. blt .Lsgemm_kernel_L4_M4_32
  1300. KERNEL4x4_I // do one in the K
  1301. KERNEL4x4_M2 // do another in the K
  1302. subs counterL, counterL, #2
  1303. ble .Lsgemm_kernel_L4_M4_22a
  1304. .align 5
  1305. .Lsgemm_kernel_L4_M4_22:
  1306. KERNEL4x4_M1
  1307. KERNEL4x4_M2
  1308. subs counterL, counterL, #1
  1309. bgt .Lsgemm_kernel_L4_M4_22
  1310. .Lsgemm_kernel_L4_M4_22a:
  1311. KERNEL4x4_M1
  1312. KERNEL4x4_E
  1313. b .Lsgemm_kernel_L4_M4_44
  1314. .Lsgemm_kernel_L4_M4_32:
  1315. tst counterL, #1
  1316. ble .Lsgemm_kernel_L4_M4_40
  1317. KERNEL4x4_I
  1318. KERNEL4x4_E
  1319. b .Lsgemm_kernel_L4_M4_44
  1320. .Lsgemm_kernel_L4_M4_40:
  1321. INIT4x4
  1322. .Lsgemm_kernel_L4_M4_44:
  1323. ands counterL , origK, #1
  1324. ble .Lsgemm_kernel_L4_M4_100
  1325. .Lsgemm_kernel_L4_M4_46:
  1326. KERNEL4x4_SUB
  1327. .Lsgemm_kernel_L4_M4_100:
  1328. SAVE4x4
  1329. .Lsgemm_kernel_L4_M4_END:
  1330. /******************************************************************************/
  1331. .Lsgemm_kernel_L4_M2_BEGIN:
  1332. mov counterI, origM
  1333. tst counterI , #3
  1334. ble .Lsgemm_kernel_L4_END
  1335. tst counterI, #2 // counterI = counterI / 2
  1336. ble .Lsgemm_kernel_L4_M1_BEGIN
  1337. .Lsgemm_kernel_L4_M2_20:
  1338. INIT2x4
  1339. mov pB, origPB
  1340. asr counterL , origK, #3 // counterL = counterL / 8
  1341. cmp counterL , #0
  1342. ble .Lsgemm_kernel_L4_M2_40
  1343. .Lsgemm_kernel_L4_M2_22:
  1344. KERNEL2x4_SUB
  1345. KERNEL2x4_SUB
  1346. KERNEL2x4_SUB
  1347. KERNEL2x4_SUB
  1348. KERNEL2x4_SUB
  1349. KERNEL2x4_SUB
  1350. KERNEL2x4_SUB
  1351. KERNEL2x4_SUB
  1352. subs counterL, counterL, #1
  1353. bgt .Lsgemm_kernel_L4_M2_22
  1354. .Lsgemm_kernel_L4_M2_40:
  1355. ands counterL , origK, #7 // counterL = counterL % 8
  1356. ble .Lsgemm_kernel_L4_M2_100
  1357. .Lsgemm_kernel_L4_M2_42:
  1358. KERNEL2x4_SUB
  1359. subs counterL, counterL, #1
  1360. bgt .Lsgemm_kernel_L4_M2_42
  1361. .Lsgemm_kernel_L4_M2_100:
  1362. SAVE2x4
  1363. .Lsgemm_kernel_L4_M2_END:
  1364. /******************************************************************************/
  1365. .Lsgemm_kernel_L4_M1_BEGIN:
  1366. tst counterI, #1 // counterI = counterI % 2
  1367. ble .Lsgemm_kernel_L4_END
  1368. .Lsgemm_kernel_L4_M1_20:
  1369. INIT1x4
  1370. mov pB, origPB
  1371. asr counterL , origK, #3 // counterL = counterL / 8
  1372. cmp counterL , #0
  1373. ble .Lsgemm_kernel_L4_M1_40
  1374. .Lsgemm_kernel_L4_M1_22:
  1375. KERNEL1x4_SUB
  1376. KERNEL1x4_SUB
  1377. KERNEL1x4_SUB
  1378. KERNEL1x4_SUB
  1379. KERNEL1x4_SUB
  1380. KERNEL1x4_SUB
  1381. KERNEL1x4_SUB
  1382. KERNEL1x4_SUB
  1383. subs counterL, counterL, #1
  1384. bgt .Lsgemm_kernel_L4_M1_22
  1385. .Lsgemm_kernel_L4_M1_40:
  1386. ands counterL , origK, #7 // counterL = counterL % 8
  1387. ble .Lsgemm_kernel_L4_M1_100
  1388. .Lsgemm_kernel_L4_M1_42:
  1389. KERNEL1x4_SUB
  1390. subs counterL, counterL, #1
  1391. bgt .Lsgemm_kernel_L4_M1_42
  1392. .Lsgemm_kernel_L4_M1_100:
  1393. SAVE1x4
  1394. .Lsgemm_kernel_L4_END:
  1395. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1396. /******************************************************************************/
  1397. /******************************************************************************/
  1398. .Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
  1399. mov counterJ , origN
  1400. tst counterJ , #3
  1401. ble .Lsgemm_kernel_L999
  1402. tst counterJ , #2
  1403. ble .Lsgemm_kernel_L1_BEGIN
  1404. mov pCRow0, pC // pCRow0 = pC
  1405. add pC,pC,LDC, lsl #1
  1406. mov pA, origPA // pA = A
  1407. /******************************************************************************/
  1408. .Lsgemm_kernel_L2_M8_BEGIN:
  1409. mov counterI, origM
  1410. asr counterI, counterI, #3 // counterI = counterI / 8
  1411. cmp counterI,#0
  1412. ble .Lsgemm_kernel_L2_M4_BEGIN
  1413. .Lsgemm_kernel_L2_M8_20:
  1414. INIT8x2
  1415. mov pB, origPB
  1416. asr counterL , origK, #3 // counterL = counterL / 8
  1417. cmp counterL,#0
  1418. ble .Lsgemm_kernel_L2_M8_40
  1419. .align 5
  1420. .Lsgemm_kernel_L2_M8_22:
  1421. KERNEL8x2_SUB
  1422. KERNEL8x2_SUB
  1423. KERNEL8x2_SUB
  1424. KERNEL8x2_SUB
  1425. KERNEL8x2_SUB
  1426. KERNEL8x2_SUB
  1427. KERNEL8x2_SUB
  1428. KERNEL8x2_SUB
  1429. subs counterL, counterL, #1
  1430. bgt .Lsgemm_kernel_L2_M8_22
  1431. .Lsgemm_kernel_L2_M8_40:
  1432. ands counterL , origK, #7 // counterL = counterL % 8
  1433. ble .Lsgemm_kernel_L2_M8_100
  1434. .Lsgemm_kernel_L2_M8_42:
  1435. KERNEL8x2_SUB
  1436. subs counterL, counterL, #1
  1437. bgt .Lsgemm_kernel_L2_M8_42
  1438. .Lsgemm_kernel_L2_M8_100:
  1439. SAVE8x2
  1440. .Lsgemm_kernel_L2_M8_END:
  1441. subs counterI, counterI, #1
  1442. bgt .Lsgemm_kernel_L2_M8_20
  1443. /******************************************************************************/
  1444. .Lsgemm_kernel_L2_M4_BEGIN:
  1445. mov counterI, origM
  1446. tst counterI , #7
  1447. ble .Lsgemm_kernel_L2_END
  1448. tst counterI, #4
  1449. ble .Lsgemm_kernel_L2_M2_BEGIN
  1450. .Lsgemm_kernel_L2_M4_20:
  1451. INIT4x2
  1452. mov pB, origPB
  1453. asr counterL , origK, #3 // counterL = counterL / 8
  1454. cmp counterL,#0
  1455. ble .Lsgemm_kernel_L2_M4_40
  1456. .align 5
  1457. .Lsgemm_kernel_L2_M4_22:
  1458. KERNEL4x2_SUB
  1459. KERNEL4x2_SUB
  1460. KERNEL4x2_SUB
  1461. KERNEL4x2_SUB
  1462. KERNEL4x2_SUB
  1463. KERNEL4x2_SUB
  1464. KERNEL4x2_SUB
  1465. KERNEL4x2_SUB
  1466. subs counterL, counterL, #1
  1467. bgt .Lsgemm_kernel_L2_M4_22
  1468. .Lsgemm_kernel_L2_M4_40:
  1469. ands counterL , origK, #7 // counterL = counterL % 8
  1470. ble .Lsgemm_kernel_L2_M4_100
  1471. .Lsgemm_kernel_L2_M4_42:
  1472. KERNEL4x2_SUB
  1473. subs counterL, counterL, #1
  1474. bgt .Lsgemm_kernel_L2_M4_42
  1475. .Lsgemm_kernel_L2_M4_100:
  1476. SAVE4x2
  1477. .Lsgemm_kernel_L2_M4_END:
  1478. /******************************************************************************/
  1479. .Lsgemm_kernel_L2_M2_BEGIN:
  1480. mov counterI, origM
  1481. tst counterI , #3
  1482. ble .Lsgemm_kernel_L2_END
  1483. tst counterI, #2 // counterI = counterI / 2
  1484. ble .Lsgemm_kernel_L2_M1_BEGIN
  1485. .Lsgemm_kernel_L2_M2_20:
  1486. INIT2x2
  1487. mov pB, origPB
  1488. asr counterL , origK, #3 // counterL = counterL / 8
  1489. cmp counterL,#0
  1490. ble .Lsgemm_kernel_L2_M2_40
  1491. .Lsgemm_kernel_L2_M2_22:
  1492. KERNEL2x2_SUB
  1493. KERNEL2x2_SUB
  1494. KERNEL2x2_SUB
  1495. KERNEL2x2_SUB
  1496. KERNEL2x2_SUB
  1497. KERNEL2x2_SUB
  1498. KERNEL2x2_SUB
  1499. KERNEL2x2_SUB
  1500. subs counterL, counterL, #1
  1501. bgt .Lsgemm_kernel_L2_M2_22
  1502. .Lsgemm_kernel_L2_M2_40:
  1503. ands counterL , origK, #7 // counterL = counterL % 8
  1504. ble .Lsgemm_kernel_L2_M2_100
  1505. .Lsgemm_kernel_L2_M2_42:
  1506. KERNEL2x2_SUB
  1507. subs counterL, counterL, #1
  1508. bgt .Lsgemm_kernel_L2_M2_42
  1509. .Lsgemm_kernel_L2_M2_100:
  1510. SAVE2x2
  1511. .Lsgemm_kernel_L2_M2_END:
  1512. /******************************************************************************/
  1513. .Lsgemm_kernel_L2_M1_BEGIN:
  1514. tst counterI, #1 // counterI = counterI % 2
  1515. ble .Lsgemm_kernel_L2_END
  1516. .Lsgemm_kernel_L2_M1_20:
  1517. INIT1x2
  1518. mov pB, origPB
  1519. asr counterL , origK, #3 // counterL = counterL / 8
  1520. cmp counterL, #0
  1521. ble .Lsgemm_kernel_L2_M1_40
  1522. .Lsgemm_kernel_L2_M1_22:
  1523. KERNEL1x2_SUB
  1524. KERNEL1x2_SUB
  1525. KERNEL1x2_SUB
  1526. KERNEL1x2_SUB
  1527. KERNEL1x2_SUB
  1528. KERNEL1x2_SUB
  1529. KERNEL1x2_SUB
  1530. KERNEL1x2_SUB
  1531. subs counterL, counterL, #1
  1532. bgt .Lsgemm_kernel_L2_M1_22
  1533. .Lsgemm_kernel_L2_M1_40:
  1534. ands counterL , origK, #7 // counterL = counterL % 8
  1535. ble .Lsgemm_kernel_L2_M1_100
  1536. .Lsgemm_kernel_L2_M1_42:
  1537. KERNEL1x2_SUB
  1538. subs counterL, counterL, #1
  1539. bgt .Lsgemm_kernel_L2_M1_42
  1540. .Lsgemm_kernel_L2_M1_100:
  1541. SAVE1x2
  1542. .Lsgemm_kernel_L2_END:
  1543. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1544. /******************************************************************************/
  1545. /******************************************************************************/
  1546. .Lsgemm_kernel_L1_BEGIN:
  1547. mov counterJ , origN
  1548. tst counterJ , #1
  1549. ble .Lsgemm_kernel_L999 // done
  1550. mov pCRow0, pC // pCRow0 = C
  1551. add pC , pC , LDC // Update pC to point to next
  1552. mov pA, origPA // pA = A
  1553. /******************************************************************************/
  1554. .Lsgemm_kernel_L1_M8_BEGIN:
  1555. mov counterI, origM
  1556. asr counterI, counterI, #3
  1557. cmp counterI, #0
  1558. ble .Lsgemm_kernel_L1_M4_BEGIN
  1559. .Lsgemm_kernel_L1_M8_20:
  1560. INIT8x1
  1561. mov pB, origPB
  1562. asr counterL , origK, #3 // counterL = counterL / 8
  1563. cmp counterL , #0
  1564. ble .Lsgemm_kernel_L1_M8_40
  1565. .align 5
  1566. .Lsgemm_kernel_L1_M8_22:
  1567. KERNEL8x1_SUB
  1568. KERNEL8x1_SUB
  1569. KERNEL8x1_SUB
  1570. KERNEL8x1_SUB
  1571. KERNEL8x1_SUB
  1572. KERNEL8x1_SUB
  1573. KERNEL8x1_SUB
  1574. KERNEL8x1_SUB
  1575. subs counterL, counterL, #1
  1576. bgt .Lsgemm_kernel_L1_M8_22
  1577. .Lsgemm_kernel_L1_M8_40:
  1578. ands counterL , origK, #7 // counterL = counterL % 8
  1579. ble .Lsgemm_kernel_L1_M8_100
  1580. .Lsgemm_kernel_L1_M8_42:
  1581. KERNEL8x1_SUB
  1582. subs counterL, counterL, #1
  1583. bgt .Lsgemm_kernel_L1_M8_42
  1584. .Lsgemm_kernel_L1_M8_100:
  1585. SAVE8x1
  1586. .Lsgemm_kernel_L1_M8_END:
  1587. subs counterI, counterI, #1
  1588. bgt .Lsgemm_kernel_L1_M8_20
  1589. /******************************************************************************/
  1590. .Lsgemm_kernel_L1_M4_BEGIN:
  1591. mov counterI, origM
  1592. tst counterI , #7
  1593. ble .Lsgemm_kernel_L1_END
  1594. tst counterI, #4
  1595. ble .Lsgemm_kernel_L1_M2_BEGIN
  1596. .Lsgemm_kernel_L1_M4_20:
  1597. INIT4x1
  1598. mov pB, origPB
  1599. asr counterL , origK, #3 // counterL = counterL / 8
  1600. cmp counterL , #0
  1601. ble .Lsgemm_kernel_L1_M4_40
  1602. .align 5
  1603. .Lsgemm_kernel_L1_M4_22:
  1604. KERNEL4x1_SUB
  1605. KERNEL4x1_SUB
  1606. KERNEL4x1_SUB
  1607. KERNEL4x1_SUB
  1608. KERNEL4x1_SUB
  1609. KERNEL4x1_SUB
  1610. KERNEL4x1_SUB
  1611. KERNEL4x1_SUB
  1612. subs counterL, counterL, #1
  1613. bgt .Lsgemm_kernel_L1_M4_22
  1614. .Lsgemm_kernel_L1_M4_40:
  1615. ands counterL , origK, #7 // counterL = counterL % 8
  1616. ble .Lsgemm_kernel_L1_M4_100
  1617. .Lsgemm_kernel_L1_M4_42:
  1618. KERNEL4x1_SUB
  1619. subs counterL, counterL, #1
  1620. bgt .Lsgemm_kernel_L1_M4_42
  1621. .Lsgemm_kernel_L1_M4_100:
  1622. SAVE4x1
  1623. .Lsgemm_kernel_L1_M4_END:
  1624. /******************************************************************************/
  1625. .Lsgemm_kernel_L1_M2_BEGIN:
  1626. mov counterI, origM
  1627. tst counterI , #3
  1628. ble .Lsgemm_kernel_L1_END
  1629. tst counterI, #2 // counterI = counterI / 2
  1630. ble .Lsgemm_kernel_L1_M1_BEGIN
  1631. .Lsgemm_kernel_L1_M2_20:
  1632. INIT2x1
  1633. mov pB, origPB
  1634. asr counterL , origK, #3 // counterL = counterL / 8
  1635. cmp counterL , #0
  1636. ble .Lsgemm_kernel_L1_M2_40
  1637. .Lsgemm_kernel_L1_M2_22:
  1638. KERNEL2x1_SUB
  1639. KERNEL2x1_SUB
  1640. KERNEL2x1_SUB
  1641. KERNEL2x1_SUB
  1642. KERNEL2x1_SUB
  1643. KERNEL2x1_SUB
  1644. KERNEL2x1_SUB
  1645. KERNEL2x1_SUB
  1646. subs counterL, counterL, #1
  1647. bgt .Lsgemm_kernel_L1_M2_22
  1648. .Lsgemm_kernel_L1_M2_40:
  1649. ands counterL , origK, #7 // counterL = counterL % 8
  1650. ble .Lsgemm_kernel_L1_M2_100
  1651. .Lsgemm_kernel_L1_M2_42:
  1652. KERNEL2x1_SUB
  1653. subs counterL, counterL, #1
  1654. bgt .Lsgemm_kernel_L1_M2_42
  1655. .Lsgemm_kernel_L1_M2_100:
  1656. SAVE2x1
  1657. .Lsgemm_kernel_L1_M2_END:
  1658. /******************************************************************************/
  1659. .Lsgemm_kernel_L1_M1_BEGIN:
  1660. tst counterI, #1 // counterI = counterI % 2
  1661. ble .Lsgemm_kernel_L1_END
  1662. .Lsgemm_kernel_L1_M1_20:
  1663. INIT1x1
  1664. mov pB, origPB
  1665. asr counterL , origK, #3 // counterL = counterL / 8
  1666. cmp counterL , #0
  1667. ble .Lsgemm_kernel_L1_M1_40
  1668. .Lsgemm_kernel_L1_M1_22:
  1669. KERNEL1x1_SUB
  1670. KERNEL1x1_SUB
  1671. KERNEL1x1_SUB
  1672. KERNEL1x1_SUB
  1673. KERNEL1x1_SUB
  1674. KERNEL1x1_SUB
  1675. KERNEL1x1_SUB
  1676. KERNEL1x1_SUB
  1677. subs counterL, counterL, #1
  1678. bgt .Lsgemm_kernel_L1_M1_22
  1679. .Lsgemm_kernel_L1_M1_40:
  1680. ands counterL , origK, #7 // counterL = counterL % 8
  1681. ble .Lsgemm_kernel_L1_M1_100
  1682. .Lsgemm_kernel_L1_M1_42:
  1683. KERNEL1x1_SUB
  1684. subs counterL, counterL, #1
  1685. bgt .Lsgemm_kernel_L1_M1_42
  1686. .Lsgemm_kernel_L1_M1_100:
  1687. SAVE1x1
  1688. .Lsgemm_kernel_L1_END:
  1689. /******************************************************************************/
  1690. .Lsgemm_kernel_L999:
  1691. mov x0, #0 // set return value
  1692. ldp d8, d9, [sp, #(0 * 16)]
  1693. ldp d10, d11, [sp, #(1 * 16)]
  1694. ldp d12, d13, [sp, #(2 * 16)]
  1695. ldp d14, d15, [sp, #(3 * 16)]
  1696. ldp d16, d17, [sp, #(4 * 16)]
  1697. ldp x18, x19, [sp, #(5 * 16)]
  1698. ldp x20, x21, [sp, #(6 * 16)]
  1699. ldp x22, x23, [sp, #(7 * 16)]
  1700. ldp x24, x25, [sp, #(8 * 16)]
  1701. ldp x26, x27, [sp, #(9 * 16)]
  1702. ldr x28, [sp, #(10 * 16)]
  1703. add sp, sp, #(11*16)
  1704. ret
  1705. EPILOGUE