You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_16x4_thunderx2t99.S 38 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081
  1. /*******************************************************************************
  2. Copyright (c) 2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  /* Kernel arguments as they arrive in integer registers (alpha arrives
     separately in s0, per the register comment above). */
  #define origM       x0
  #define origN       x1
  #define origK       x2
  #define origPA      x3
  #define origPB      x4
  #define pC          x5
  #define LDC         x6

  /* Working registers, as named: scratch, counters and running pointers. */
  #define temp        x7
  #define counterL    x8
  #define counterI    x9
  #define counterJ    x10
  #define pB          x11
  #define pCRow0      x12
  #define pCRow1      x13
  #define pCRow2      x14
  #define pCRow3      x15
  #define pA          x16

  /* alpha in three spellings: integer-register copy (alpha), scalar FP
     register (alpha0) and by-element operand used by the SAVE macros
     (alphaV0).  alpha0 and alphaV0 both name lane 0 of v10. */
  #define alpha       w17
  #define alpha0      s10
  #define alphaV0     v10.s[0]

  /* Byte offsets added to pA / pB / pCRowN by the prfm prefetches below. */
  #define A_PRE_SIZE  2560
  #define B_PRE_SIZE  224
  #define C_PRE_SIZE  160
  54. // 00 origM
  55. // 01 origN
  56. // 02 origK
  57. // 03 origPA
  58. // 04 origPB
  59. // 05 pC
  60. // 06 origLDC -> LDC
  61. // 07 temp
  62. // 08 counterL
  63. // 09 counterI
  64. // 10 counterJ
  65. // 11 pB
  66. // 12 pCRow0
  67. // 13 pCRow1
  68. // 14 pCRow2
  69. // 15 pCRow3
  70. // 16 pA
  71. // 17 alpha (w17)
  72. // 18 must save
  73. // 19 must save
  74. // 20 must save
  75. // 21 must save
  76. // 22 must save
  77. // 23 must save
  78. // 24 must save
  79. // 25 must save
  80. // 26 must save
  81. // 27 must save
  82. // 28 must save
  83. // 29 frame
  84. // 30 link
  85. // 31 sp
  86. //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
  87. //v01 pA0_04, pA0_05, pA0_06, pA0_07
  88. //v02 pA0_08, pA0_09, pA0_10, pA0_11
  89. //v03 pA0_12, pA0_13, pA0_14, pA0_15
  90. //v04 pA1_00, pA1_01, pA1_02, pA1_03
  91. //v05 pA1_04, pA1_05, pA1_06, pA1_07
  92. //v06 pA1_08, pA1_09, pA1_10, pA1_11
  93. //v07 pA1_12, pA1_13, pA1_14, pA1_15
  94. //v08 must save pB00
  95. //v09 must save pB01
  96. //v10 must save pB02, alpha (alpha0 = s10, alphaV0 = v10.s[0])
  97. //v11 must save pB03
  98. //v12 must save pB10
  99. //v13 must save pB11
  100. //v14 must save pB12
  101. //v15 must save pB13
  102. //v16 must save C00, C01, C02, C03
  103. //v17 must save C04, C05, C06, C07
  104. //v18 C08, C09, C10, C11
  105. //v19 C12, C13, C14, C15
  106. //v20 C16, C17, C18, C19
  107. //v21 C20, C21, C22, C23
  108. //v22 C24, C25, C26, C27
  109. //v23 C28, C29, C30, C31
  110. //v24 C32, C33, C34, C35
  111. //v25 C36, C37, C38, C39
  112. //v26 C40, C41, C42, C43
  113. //v27 C44, C45, C46, C47
  114. //v28 C48, C49, C50, C51
  115. //v29 C52, C53, C54, C55
  116. //v30 C56, C57, C58, C59
  117. //v31 C60, C61, C62, C63
  118. /*******************************************************************************
  119. * Macro definitions
  120. *******************************************************************************/
  /* INIT16x4: clear the sixteen accumulators v16-v31.  A write to an S
     register zeroes the rest of the vector register, so every fmov below
     clears a full 128-bit register. */
  .macro INIT16x4
        // zeros seeded from the integer zero register
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s20, wzr
        fmov    s24, wzr
        fmov    s28, wzr
        // remaining accumulators copy an already-cleared register
        fmov    s18, s16
        fmov    s19, s17
        fmov    s21, s16
        fmov    s22, s17
        fmov    s23, s18
        fmov    s25, s16
        fmov    s26, s17
        fmov    s27, s18
        fmov    s29, s16
        fmov    s30, s17
        fmov    s31, s18
  .endm
  /* KERNEL16x4_I: first step of the pipelined 16x4 loop.
     - Initialises v16-v31 with fmul (no prior accumulator value is read)
       from the values in v0-v3 (64 bytes at pA) and v8 (16 bytes at pB).
     - Preloads the operands of the following step into v4-v7 and v12.
     - Advances pA by 128 bytes and pB by 32 bytes.
     Instruction order is kept exactly as scheduled by the original author. */
  .macro KERNEL16x4_I
        ldur    q0, [pA]
        ldur    q1, [pA, #16]
        ldur    q8, [pB]

        fmul    v16.4s, v0.4s, v8.s[0]
        fmul    v20.4s, v0.4s, v8.s[1]
        fmul    v24.4s, v0.4s, v8.s[2]
        fmul    v28.4s, v0.4s, v8.s[3]

        ldur    q2, [pA, #32]
        ldur    q3, [pA, #48]

        fmul    v17.4s, v1.4s, v8.s[0]
        fmul    v21.4s, v1.4s, v8.s[1]

        ldur    q4, [pA, #64]           // v4-v7: operands for the next step
        ldur    q5, [pA, #80]

        fmul    v25.4s, v1.4s, v8.s[2]
        fmul    v29.4s, v1.4s, v8.s[3]

        ldur    q12, [pB, #16]          // v12: pB values for the next step

        fmul    v18.4s, v2.4s, v8.s[0]
        fmul    v22.4s, v2.4s, v8.s[1]
        fmul    v19.4s, v3.4s, v8.s[0]
        fmul    v23.4s, v3.4s, v8.s[1]

        ldur    q6, [pA, #96]
        ldur    q7, [pA, #112]

        add     pB, pB, #32
        add     pA, pA, #128

        fmul    v26.4s, v2.4s, v8.s[2]
        fmul    v30.4s, v2.4s, v8.s[3]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        fmul    v27.4s, v3.4s, v8.s[2]
        fmul    v31.4s, v3.4s, v8.s[3]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  .endm
  /* KERNEL16x4_M1: pipelined step that accumulates from v0-v3 / v8 into
     v16-v31 while loading the next operands into v4-v7 (64 bytes at pA)
     and v12 (16 bytes at pB).  Advances pA by 64 and pB by 16 bytes.
     Instruction order is kept exactly as in the original schedule. */
  .macro KERNEL16x4_M1
        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v1.4s, v8.s[0]

        ldur    q4, [pA]
        ldur    q5, [pA, #16]

        fmla    v18.4s, v2.4s, v8.s[0]
        fmla    v19.4s, v3.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v8.s[1]
        fmla    v21.4s, v1.4s, v8.s[1]

        ldur    q12, [pB]

        fmla    v22.4s, v2.4s, v8.s[1]
        fmla    v23.4s, v3.4s, v8.s[1]

        add     pB, pB, #16

        fmla    v24.4s, v0.4s, v8.s[2]
        fmla    v25.4s, v1.4s, v8.s[2]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

        fmla    v26.4s, v2.4s, v8.s[2]
        fmla    v27.4s, v3.4s, v8.s[2]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        fmla    v28.4s, v0.4s, v8.s[3]
        fmla    v29.4s, v1.4s, v8.s[3]

        ldur    q6, [pA, #32]
        ldur    q7, [pA, #48]

        add     pA, pA, #64

        fmla    v30.4s, v2.4s, v8.s[3]
        fmla    v31.4s, v3.4s, v8.s[3]
  .endm
  /* KERNEL16x4_M2: counterpart of KERNEL16x4_M1.  Accumulates from
     v4-v7 / v12 into v16-v31 while reloading v0-v3 (64 bytes at pA) and
     v8 (16 bytes at pB).  Advances pA by 64 and pB by 16 bytes.
     Instruction order is kept exactly as in the original schedule. */
  .macro KERNEL16x4_M2
        fmla    v16.4s, v4.4s, v12.s[0]
        fmla    v17.4s, v5.4s, v12.s[0]

        ldur    q0, [pA]
        ldur    q1, [pA, #16]

        fmla    v18.4s, v6.4s, v12.s[0]
        fmla    v19.4s, v7.4s, v12.s[0]
        fmla    v20.4s, v4.4s, v12.s[1]
        fmla    v21.4s, v5.4s, v12.s[1]

        ldur    q8, [pB]

        fmla    v22.4s, v6.4s, v12.s[1]
        fmla    v23.4s, v7.4s, v12.s[1]

        add     pB, pB, #16

        fmla    v24.4s, v4.4s, v12.s[2]
        fmla    v25.4s, v5.4s, v12.s[2]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        fmla    v26.4s, v6.4s, v12.s[2]
        fmla    v27.4s, v7.4s, v12.s[2]

        ldur    q2, [pA, #32]
        ldur    q3, [pA, #48]

        add     pA, pA, #64

        fmla    v28.4s, v4.4s, v12.s[3]
        fmla    v29.4s, v5.4s, v12.s[3]
        fmla    v30.4s, v6.4s, v12.s[3]
        fmla    v31.4s, v7.4s, v12.s[3]
  .endm
  /* KERNEL16x4_E: pipeline drain.  Accumulates the operands already held
     in v4-v7 / v12 into v16-v31; performs no loads and leaves pA and pB
     unchanged (only a prefetch relative to pB is issued). */
  .macro KERNEL16x4_E
        fmla    v16.4s, v4.4s, v12.s[0]
        fmla    v20.4s, v4.4s, v12.s[1]
        fmla    v24.4s, v4.4s, v12.s[2]
        fmla    v28.4s, v4.4s, v12.s[3]

        fmla    v17.4s, v5.4s, v12.s[0]
        fmla    v21.4s, v5.4s, v12.s[1]
        fmla    v25.4s, v5.4s, v12.s[2]
        fmla    v29.4s, v5.4s, v12.s[3]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        fmla    v18.4s, v6.4s, v12.s[0]
        fmla    v22.4s, v6.4s, v12.s[1]
        fmla    v26.4s, v6.4s, v12.s[2]
        fmla    v30.4s, v6.4s, v12.s[3]

        fmla    v19.4s, v7.4s, v12.s[0]
        fmla    v23.4s, v7.4s, v12.s[1]
        fmla    v27.4s, v7.4s, v12.s[2]
        fmla    v31.4s, v7.4s, v12.s[3]
  .endm
  /* KERNEL16x4_SUB: one self-contained (non-pipelined) step.  Loads
     v0-v3 from 64 bytes at pA and v8 from 16 bytes at pB, accumulates
     into v16-v31, and advances pA by 64 and pB by 16 bytes.
     Instruction order is kept exactly as in the original schedule. */
  .macro KERNEL16x4_SUB
        ldur    q0, [pA]
        ldur    q1, [pA, #16]
        ldur    q8, [pB]

        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v8.s[1]

        add     pB, pB, #16

        fmla    v24.4s, v0.4s, v8.s[2]
        fmla    v28.4s, v0.4s, v8.s[3]

        ldur    q2, [pA, #32]
        ldur    q3, [pA, #48]

        add     pA, pA, #64

        fmla    v17.4s, v1.4s, v8.s[0]
        fmla    v21.4s, v1.4s, v8.s[1]
        fmla    v25.4s, v1.4s, v8.s[2]
        fmla    v29.4s, v1.4s, v8.s[3]

        fmla    v18.4s, v2.4s, v8.s[0]
        fmla    v22.4s, v2.4s, v8.s[1]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        fmla    v19.4s, v3.4s, v8.s[0]
        fmla    v23.4s, v3.4s, v8.s[1]

        fmla    v26.4s, v2.4s, v8.s[2]
        fmla    v30.4s, v2.4s, v8.s[3]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        fmla    v27.4s, v3.4s, v8.s[2]
        fmla    v31.4s, v3.4s, v8.s[3]
  .endm
  /* SAVE16x4: for each of pCRow0..pCRow3, read 64 bytes, add
     alpha * accumulator (v16-v19, v20-v23, v24-v27, v28-v31 respectively),
     write the 64 bytes back, then advance every row pointer by 64 bytes.
     v0-v7 are used as staging registers and are reused across rows, so the
     load/compute/store order below must be preserved. */
  .macro SAVE16x4
        fmov    alpha0, alpha           // move the w17 copy of alpha into s10

        prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
        prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
        prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
        prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

        // rows 0 and 1 are loaded up front
        ldur    q0, [pCRow0]
        ldur    q1, [pCRow0, #16]
        ldur    q2, [pCRow0, #32]
        ldur    q3, [pCRow0, #48]

        ldur    q4, [pCRow1]
        ldur    q5, [pCRow1, #16]
        ldur    q6, [pCRow1, #32]
        ldur    q7, [pCRow1, #48]

        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV0
        stp     q0, q1, [pCRow0]

        fmla    v2.4s, v18.4s, alphaV0
        fmla    v3.4s, v19.4s, alphaV0
        stp     q2, q3, [pCRow0, #32]

        // v0/v1 are free again: start loading row 2
        ldur    q0, [pCRow2]
        ldur    q1, [pCRow2, #16]

        fmla    v4.4s, v20.4s, alphaV0
        fmla    v5.4s, v21.4s, alphaV0
        stp     q4, q5, [pCRow1]

        ldur    q2, [pCRow2, #32]
        ldur    q3, [pCRow2, #48]

        fmla    v6.4s, v22.4s, alphaV0
        fmla    v7.4s, v23.4s, alphaV0
        stp     q6, q7, [pCRow1, #32]

        // v4/v5 are free again: start loading row 3
        ldur    q4, [pCRow3]
        ldur    q5, [pCRow3, #16]

        fmla    v0.4s, v24.4s, alphaV0
        fmla    v1.4s, v25.4s, alphaV0
        stp     q0, q1, [pCRow2]

        ldur    q6, [pCRow3, #32]
        ldur    q7, [pCRow3, #48]

        fmla    v2.4s, v26.4s, alphaV0
        fmla    v3.4s, v27.4s, alphaV0
        stp     q2, q3, [pCRow2, #32]

        fmla    v4.4s, v28.4s, alphaV0
        fmla    v5.4s, v29.4s, alphaV0
        stp     q4, q5, [pCRow3]

        fmla    v6.4s, v30.4s, alphaV0
        fmla    v7.4s, v31.4s, alphaV0
        stp     q6, q7, [pCRow3, #32]

        add     pCRow0, pCRow0, #64
        add     pCRow1, pCRow1, #64
        add     pCRow2, pCRow2, #64
        add     pCRow3, pCRow3, #64
  .endm
  321. /******************************************************************************/
  /* INIT8x4: clear the eight accumulators used by the 8x4 macros
     (v16, v17, v20, v21, v24, v25, v28, v29).  Each S-register write
     zeroes the whole vector register. */
  .macro INIT8x4
        // zeros seeded from the integer zero register
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s20, wzr
        fmov    s24, wzr
        fmov    s28, wzr
        // copies of the already-cleared s16
        fmov    s21, s16
        fmov    s25, s16
        fmov    s29, s16
  .endm
  /* KERNEL8x4_I: first step of the pipelined 8x4 loop.  Loads four scalars
     from pB into s8-s11 and 32 bytes from pA into v0/v1, initialises the
     eight accumulators with fmul, then preloads the next step operands
     into s12-s15 and v4/v5.  Net advance: pA 64 bytes, pB 32 bytes.
     Note: s10 is overwritten here, which is the same register as alpha0. */
  .macro KERNEL8x4_I
        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8
        ldr     q0, [pA], #16
        ldr     q1, [pA], #16

        fmul    v16.4s, v0.4s, v8.s[0]
        fmul    v17.4s, v1.4s, v8.s[0]
        fmul    v20.4s, v0.4s, v9.s[0]
        fmul    v21.4s, v1.4s, v9.s[0]
        fmul    v24.4s, v0.4s, v10.s[0]
        fmul    v25.4s, v1.4s, v10.s[0]
        fmul    v28.4s, v0.4s, v11.s[0]
        fmul    v29.4s, v1.4s, v11.s[0]

        // operands for the following step
        ldp     s12, s13, [pB], #8
        ldp     s14, s15, [pB], #8
        ldr     q4, [pA], #16
        ldr     q5, [pA], #16
  .endm
  /* KERNEL8x4_M1: accumulates from v0/v1 and s8-s11 into the 8x4
     accumulators, then loads the next operands into s12-s15 (16 bytes at
     pB) and v4/v5 (32 bytes at pA), post-incrementing both pointers. */
  .macro KERNEL8x4_M1
        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v1.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v9.s[0]
        fmla    v21.4s, v1.4s, v9.s[0]
        fmla    v24.4s, v0.4s, v10.s[0]
        fmla    v25.4s, v1.4s, v10.s[0]
        fmla    v28.4s, v0.4s, v11.s[0]
        fmla    v29.4s, v1.4s, v11.s[0]

        ldp     s12, s13, [pB], #8
        ldp     s14, s15, [pB], #8
        ldr     q4, [pA], #16
        ldr     q5, [pA], #16
  .endm
  /* KERNEL8x4_M2: accumulates from v4/v5 and s12-s15 into the 8x4
     accumulators, then reloads s8-s11 (16 bytes at pB) and v0/v1
     (32 bytes at pA), post-incrementing both pointers. */
  .macro KERNEL8x4_M2
        fmla    v16.4s, v4.4s, v12.s[0]
        fmla    v17.4s, v5.4s, v12.s[0]
        fmla    v20.4s, v4.4s, v13.s[0]
        fmla    v21.4s, v5.4s, v13.s[0]
        fmla    v24.4s, v4.4s, v14.s[0]
        fmla    v25.4s, v5.4s, v14.s[0]
        fmla    v28.4s, v4.4s, v15.s[0]
        fmla    v29.4s, v5.4s, v15.s[0]

        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8
        ldr     q0, [pA], #16
        ldr     q1, [pA], #16
  .endm
  /* KERNEL8x4_E: pipeline drain.  Accumulates the operands already held in
     v4/v5 and s12-s15; performs no loads and does not move pA or pB. */
  .macro KERNEL8x4_E
        fmla    v16.4s, v4.4s, v12.s[0]
        fmla    v17.4s, v5.4s, v12.s[0]
        fmla    v20.4s, v4.4s, v13.s[0]
        fmla    v21.4s, v5.4s, v13.s[0]
        fmla    v24.4s, v4.4s, v14.s[0]
        fmla    v25.4s, v5.4s, v14.s[0]
        fmla    v28.4s, v4.4s, v15.s[0]
        fmla    v29.4s, v5.4s, v15.s[0]
  .endm
  /* KERNEL8x4_SUB: one self-contained 8x4 step.  Reads 32 bytes at pA into
     v0/v1 and four scalars at pB into s8-s11 (both pointers
     post-incremented), then accumulates into the 8x4 accumulators.
     s10 is overwritten here, which is the same register as alpha0. */
  .macro KERNEL8x4_SUB
        ldr     q0, [pA], #16
        ldr     q1, [pA], #16
        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8

        // products with v0
        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v9.s[0]
        fmla    v24.4s, v0.4s, v10.s[0]
        fmla    v28.4s, v0.4s, v11.s[0]

        // products with v1
        fmla    v17.4s, v1.4s, v8.s[0]
        fmla    v21.4s, v1.4s, v9.s[0]
        fmla    v25.4s, v1.4s, v10.s[0]
        fmla    v29.4s, v1.4s, v11.s[0]
  .endm
  /* SAVE8x4: for each of pCRow0..pCRow3, read 32 bytes, add
     alpha * accumulator pair, write back, and advance the row pointer by
     32 bytes (done here by the post-indexed store). */
  .macro SAVE8x4
        fmov    alpha0, alpha           // s10 was used as a pB operand; reload alpha

        ldp     q0, q1, [pCRow0]
        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV0
        stp     q0, q1, [pCRow0], #32

        ldp     q2, q3, [pCRow1]
        fmla    v2.4s, v20.4s, alphaV0
        fmla    v3.4s, v21.4s, alphaV0
        stp     q2, q3, [pCRow1], #32

        ldp     q4, q5, [pCRow2]
        fmla    v4.4s, v24.4s, alphaV0
        fmla    v5.4s, v25.4s, alphaV0
        stp     q4, q5, [pCRow2], #32

        ldp     q6, q7, [pCRow3]
        fmla    v6.4s, v28.4s, alphaV0
        fmla    v7.4s, v29.4s, alphaV0
        stp     q6, q7, [pCRow3], #32
  .endm
  425. /******************************************************************************/
  /* INIT4x4: clear the four accumulators v16, v20, v24 and v28.  Each
     S-register write zeroes the whole vector register. */
  .macro INIT4x4
        fmov    s16, wzr
        fmov    s20, wzr
        fmov    s24, wzr
        fmov    s28, wzr
  .endm
  /* KERNEL4x4_I: first step of the pipelined 4x4 loop.  Loads s8-s11 from
     pB and v0 from pA, initialises the four accumulators with fmul, then
     preloads s12-s15 and v1 for the following step.
     Net advance: pA 32 bytes, pB 32 bytes. */
  .macro KERNEL4x4_I
        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8
        ldr     q0, [pA], #16

        fmul    v16.4s, v0.4s, v8.s[0]
        fmul    v20.4s, v0.4s, v9.s[0]
        fmul    v24.4s, v0.4s, v10.s[0]
        fmul    v28.4s, v0.4s, v11.s[0]

        // operands for the following step
        ldp     s12, s13, [pB], #8
        ldp     s14, s15, [pB], #8
        ldr     q1, [pA], #16
  .endm
  /* KERNEL4x4_M1: accumulates from v0 and s8-s11, then loads the next
     operands into s12-s15 (16 bytes at pB) and v1 (16 bytes at pA),
     post-incrementing both pointers. */
  .macro KERNEL4x4_M1
        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v9.s[0]
        fmla    v24.4s, v0.4s, v10.s[0]
        fmla    v28.4s, v0.4s, v11.s[0]

        ldp     s12, s13, [pB], #8
        ldp     s14, s15, [pB], #8
        ldr     q1, [pA], #16
  .endm
  /* KERNEL4x4_M2: accumulates from v1 and s12-s15, then reloads s8-s11
     (16 bytes at pB) and v0 (16 bytes at pA), post-incrementing both
     pointers. */
  .macro KERNEL4x4_M2
        fmla    v16.4s, v1.4s, v12.s[0]
        fmla    v20.4s, v1.4s, v13.s[0]
        fmla    v24.4s, v1.4s, v14.s[0]
        fmla    v28.4s, v1.4s, v15.s[0]

        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8
        ldr     q0, [pA], #16
  .endm
  /* KERNEL4x4_E: pipeline drain.  Accumulates the operands already held in
     v1 and s12-s15; performs no loads and does not move pA or pB. */
  .macro KERNEL4x4_E
        fmla    v16.4s, v1.4s, v12.s[0]
        fmla    v20.4s, v1.4s, v13.s[0]
        fmla    v24.4s, v1.4s, v14.s[0]
        fmla    v28.4s, v1.4s, v15.s[0]
  .endm
  /* KERNEL4x4_SUB: one self-contained 4x4 step.  Reads 16 bytes at pA into
     v0 and four scalars at pB into s8-s11 (both pointers post-incremented)
     and accumulates into v16, v20, v24, v28. */
  .macro KERNEL4x4_SUB
        ldr     q0, [pA], #16
        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8

        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v20.4s, v0.4s, v9.s[0]
        fmla    v24.4s, v0.4s, v10.s[0]
        fmla    v28.4s, v0.4s, v11.s[0]
  .endm
  /* SAVE4x4: for each of pCRow0..pCRow3, read 16 bytes, add
     alpha * accumulator, write back, and advance the row pointer by
     16 bytes (done here by the post-indexed store). */
  .macro SAVE4x4
        fmov    alpha0, alpha           // s10 was used as a pB operand; reload alpha

        ldr     q0, [pCRow0]
        fmla    v0.4s, v16.4s, alphaV0
        str     q0, [pCRow0], #16

        ldr     q1, [pCRow1]
        fmla    v1.4s, v20.4s, alphaV0
        str     q1, [pCRow1], #16

        ldr     q2, [pCRow2]
        fmla    v2.4s, v24.4s, alphaV0
        str     q2, [pCRow2], #16

        ldr     q3, [pCRow3]
        fmla    v3.4s, v28.4s, alphaV0
        str     q3, [pCRow3], #16
  .endm
  496. /******************************************************************************/
  /* INIT2x4: clear accumulators v16, v20, v24 and v28.  v16 is zeroed from
     wzr; the others copy an already-cleared register. */
  .macro INIT2x4
        fmov    s16, wzr
        fmov    s20, s16
        fmov    s28, s16
        fmov    s24, s20                // s20 is zero at this point
  .endm
  /* KERNEL2x4_SUB: one 2x4 step.  Reads 8 bytes at pA into d0 and four
     scalars at pB into s8-s11 (both pointers post-incremented) and
     accumulates into the low two lanes of v16, v20, v24, v28. */
  .macro KERNEL2x4_SUB
        ldr     d0, [pA], #8
        ldp     s8, s9, [pB], #8
        ldp     s10, s11, [pB], #8

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v20.2s, v0.2s, v9.s[0]
        fmla    v24.2s, v0.2s, v10.s[0]
        fmla    v28.2s, v0.2s, v11.s[0]
  .endm
  /* SAVE2x4: for each of pCRow0..pCRow3, read 8 bytes, add
     alpha * accumulator (low two lanes), write back, and advance the row
     pointer by 8 bytes (done here by the post-indexed store).
     d0 and d1 alternate as staging registers. */
  .macro SAVE2x4
        fmov    alpha0, alpha           // s10 was used as a pB operand; reload alpha

        ldr     d0, [pCRow0]
        fmla    v0.2s, v16.2s, alphaV0
        str     d0, [pCRow0], #8

        ldr     d1, [pCRow1]
        fmla    v1.2s, v20.2s, alphaV0
        str     d1, [pCRow1], #8

        ldr     d0, [pCRow2]
        fmla    v0.2s, v24.2s, alphaV0
        str     d0, [pCRow2], #8

        ldr     d1, [pCRow3]
        fmla    v1.2s, v28.2s, alphaV0
        str     d1, [pCRow3], #8
  .endm
  531. /******************************************************************************/
  /* INIT1x4: clear the two accumulators v16 and v20 (v20 copies the
     already-cleared s16). */
  .macro INIT1x4
        fmov    s16, wzr
        fmov    s20, s16
  .endm
  /* KERNEL1x4_SUB: one 1x4 step.  Reads one float at pA into s0 and four
     floats at pB into v8.2s / v9.2s (both pointers post-incremented), then
     scales the pB values by s0 and accumulates into v16.2s and v20.2s. */
  .macro KERNEL1x4_SUB
        ldr     s0, [pA], #4
        ld1     {v8.2s, v9.2s}, [pB], #16

        fmla    v16.2s, v8.2s, v0.s[0]
        fmla    v20.2s, v9.2s, v0.s[0]
  .endm
  /* SAVE1x4: updates one float in each of the four rows.  Lanes 0/1 of v8
     gather the values at pCRow0/pCRow1 (accumulator v16); lanes 0/1 of v12
     gather pCRow2/pCRow3 (accumulator v20).  Each row pointer advances by
     4 bytes via the post-indexed lane store. */
  .macro SAVE1x4
        fmov    alpha0, alpha           // place alpha in lane 0 of v10

        ld1     {v8.s}[0], [pCRow0]
        ld1     {v8.s}[1], [pCRow1]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.s}[0], [pCRow0], #4
        st1     {v8.s}[1], [pCRow1], #4

        ld1     {v12.s}[0], [pCRow2]
        ld1     {v12.s}[1], [pCRow3]
        fmla    v12.2s, v20.2s, alphaV0
        st1     {v12.s}[0], [pCRow2], #4
        st1     {v12.s}[1], [pCRow3], #4
  .endm
  561. /******************************************************************************/
  /* INIT16x2: clear the eight accumulators v16-v23.  Each S-register write
     zeroes the whole vector register. */
  .macro INIT16x2
        // zeros seeded from the integer zero register
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s18, wzr
        fmov    s20, wzr
        fmov    s22, wzr
        // copies of the already-cleared s16
        fmov    s19, s16
        fmov    s21, s16
        fmov    s23, s16
  .endm
  /* KERNEL16x2_SUB: one 16x2 step.  Reads two floats at pB into v8.2s and
     64 consecutive bytes at pA into v0-v3 (pB advances 8 bytes, pA 64),
     then accumulates into v16-v19 (lane 0 of v8) and v20-v23 (lane 1). */
  .macro KERNEL16x2_SUB
        ld1     {v8.2s}, [pB], #8
        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64

        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v1.4s, v8.s[0]
        fmla    v18.4s, v2.4s, v8.s[0]
        fmla    v19.4s, v3.4s, v8.s[0]

        fmla    v20.4s, v0.4s, v8.s[1]
        fmla    v21.4s, v1.4s, v8.s[1]
        fmla    v22.4s, v2.4s, v8.s[1]
        fmla    v23.4s, v3.4s, v8.s[1]
  .endm
  /* SAVE16x2: updates 64 bytes at pCRow0 with alpha * v16-v19 and 64 bytes
     at pCRow0 + LDC with alpha * v20-v23.  On exit pCRow1 holds the
     second-row address (entry pCRow0 + LDC) and pCRow0 has advanced by
     64 bytes. */
  .macro SAVE16x2
        fmov    alpha0, alpha           // place alpha in lane 0 of v10
        add     pCRow1, pCRow0, LDC     // second row address

        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV0
        fmla    v2.4s, v18.4s, alphaV0
        fmla    v3.4s, v19.4s, alphaV0
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64

        ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
        fmla    v4.4s, v20.4s, alphaV0
        fmla    v5.4s, v21.4s, alphaV0
        fmla    v6.4s, v22.4s, alphaV0
        fmla    v7.4s, v23.4s, alphaV0
        st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  .endm
  609. /******************************************************************************/
  /* INIT8x2: clear accumulators v16, v17, v20 and v21.  v16 is zeroed from
     wzr; the others copy an already-cleared register. */
  .macro INIT8x2
        fmov    s16, wzr
        fmov    s17, s16
        fmov    s21, s16
        fmov    s20, s17                // s17 is zero at this point
  .endm
  /* KERNEL8x2_SUB: one 8x2 step.  Reads two floats at pB into v8.2s and
     32 consecutive bytes at pA into v0/v1 (pB advances 8 bytes, pA 32),
     then accumulates into v16/v17 (lane 0 of v8) and v20/v21 (lane 1). */
  .macro KERNEL8x2_SUB
        ld1     {v8.2s}, [pB], #8
        ld1     {v0.4s, v1.4s}, [pA], #32

        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v1.4s, v8.s[0]

        fmla    v20.4s, v0.4s, v8.s[1]
        fmla    v21.4s, v1.4s, v8.s[1]
  .endm
  /* SAVE8x2: updates 32 bytes at pCRow0 with alpha * v16/v17 and 32 bytes
     at pCRow0 + LDC with alpha * v20/v21.  On exit pCRow1 = entry
     pCRow0 + LDC, pCRow2 = pCRow1 + LDC, and pCRow0 has advanced by
     32 bytes. */
  .macro SAVE8x2
        fmov    alpha0, alpha           // place alpha in lane 0 of v10
        add     pCRow1, pCRow0, LDC     // second row address

        ld1     {v0.4s, v1.4s}, [pCRow0]
        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV0
        st1     {v0.4s, v1.4s}, [pCRow0], #32

        // NOTE(review): pCRow2 is written but not read inside this macro;
        // kept because a consumer may exist outside it - confirm.
        add     pCRow2, pCRow1, LDC

        ld1     {v4.4s, v5.4s}, [pCRow1]
        fmla    v4.4s, v20.4s, alphaV0
        fmla    v5.4s, v21.4s, alphaV0
        st1     {v4.4s, v5.4s}, [pCRow1]
  .endm
  642. /******************************************************************************/
  /* INIT4x2: clear accumulators v16, v17, v20 and v21.  v16 is zeroed from
     wzr; the others copy an already-cleared register. */
  .macro INIT4x2
        fmov    s16, wzr
        fmov    s17, s16
        fmov    s21, s16
        fmov    s20, s17                // s17 is zero at this point
  .endm
  /* KERNEL4x2_SUB: one 4x2 step.  Reads two floats at pB into v8.2s and
     16 bytes at pA split across v0.2s / v1.2s (pB advances 8 bytes,
     pA 16), then accumulates into v16/v17 (lane 0 of v8) and v20/v21
     (lane 1) using two-lane operations. */
  .macro KERNEL4x2_SUB
        ld1     {v8.2s}, [pB], #8
        ld1     {v0.2s, v1.2s}, [pA], #16

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v17.2s, v1.2s, v8.s[0]

        fmla    v20.2s, v0.2s, v8.s[1]
        fmla    v21.2s, v1.2s, v8.s[1]
  .endm
  /* SAVE4x2: updates 16 bytes at pCRow0 with alpha * v16/v17 and 16 bytes
     at pCRow0 + LDC with alpha * v20/v21.  On exit pCRow1 = entry
     pCRow0 + LDC and pCRow0 has advanced by 16 bytes. */
  .macro SAVE4x2
        fmov    alpha0, alpha           // place alpha in lane 0 of v10
        add     pCRow1, pCRow0, LDC     // second row address

        ld1     {v8.2s, v9.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        fmla    v9.2s, v17.2s, alphaV0
        st1     {v8.2s, v9.2s}, [pCRow0], #16

        ld1     {v12.2s, v13.2s}, [pCRow1]
        fmla    v12.2s, v20.2s, alphaV0
        fmla    v13.2s, v21.2s, alphaV0
        st1     {v12.2s, v13.2s}, [pCRow1]
  .endm
  672. /******************************************************************************/
  /* INIT2x2: clear the two accumulators v16 and v20 (v20 copies the
     already-cleared s16). */
  .macro INIT2x2
        fmov    s16, wzr
        fmov    s20, s16
  .endm
  /* KERNEL2x2_SUB: one 2x2 step.  Reads two floats at pB into v8.2s and
     two floats at pA into v0.2s (each pointer advances 8 bytes), then
     accumulates into v16.2s (lane 0 of v8) and v20.2s (lane 1). */
  .macro KERNEL2x2_SUB
        ld1     {v8.2s}, [pB], #8
        ld1     {v0.2s}, [pA], #8

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v20.2s, v0.2s, v8.s[1]
  .endm
  /* SAVE2x2: updates 8 bytes at pCRow0 with alpha * v16 and 8 bytes at
     pCRow0 + LDC with alpha * v20.  On exit pCRow1 = entry pCRow0 + LDC
     and pCRow0 has advanced by 8 bytes. */
  .macro SAVE2x2
        fmov    alpha0, alpha           // place alpha in lane 0 of v10
        add     pCRow1, pCRow0, LDC     // second row address

        ld1     {v8.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.2s}, [pCRow0], #8

        ld1     {v12.2s}, [pCRow1]
        fmla    v12.2s, v20.2s, alphaV0
        st1     {v12.2s}, [pCRow1]
  .endm
  696. /******************************************************************************/
  /* INIT1x2: clear the single accumulator v16 (the S-register write zeroes
     the whole vector register). */
  .macro INIT1x2
        fmov    s16, wzr
  .endm
  /* KERNEL1x2_SUB: one 1x2 step.  Reads two floats at pB into v8.2s and
     one float at pA into s0 (pB advances 8 bytes, pA 4), then scales the
     pB pair by s0 and accumulates into v16.2s. */
  .macro KERNEL1x2_SUB
        ld1     {v8.2s}, [pB], #8
        ldr     s0, [pA], #4

        fmla    v16.2s, v8.2s, v0.s[0]
  .endm
  /* SAVE1x2: updates one float at pCRow0 and one at pCRow0 + LDC.  The two
     values are gathered into lanes 0/1 of v8, combined with
     alpha * v16.2s, and scattered back.  On exit pCRow1 = entry
     pCRow0 + LDC and pCRow0 has advanced by 4 bytes. */
  .macro SAVE1x2
        fmov    alpha0, alpha           // place alpha in lane 0 of v10
        add     pCRow1, pCRow0, LDC     // second row address

        ld1     {v8.s}[0], [pCRow0]
        ld1     {v8.s}[1], [pCRow1]
        fmla    v8.2s, v16.2s, alphaV0
        st1     {v8.s}[0], [pCRow0], #4
        st1     {v8.s}[1], [pCRow1]
  .endm
  717. /******************************************************************************/
  /* INIT16x1: clear the four accumulators v16-v19.  Each S-register write
     zeroes the whole vector register. */
  .macro INIT16x1
        fmov    s16, wzr
        fmov    s17, wzr
        fmov    s18, wzr
        fmov    s19, s16                // copy of the already-cleared s16
  .endm
  /* KERNEL16x1_SUB: one 16x1 step.  Reads one float at pB into s8 and
     64 consecutive bytes at pA into v0-v3 (pB advances 4 bytes, pA 64),
     then accumulates into v16-v19. */
  .macro KERNEL16x1_SUB
        ldr     s8, [pB], #4
        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64

        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v1.4s, v8.s[0]
        fmla    v18.4s, v2.4s, v8.s[0]
        fmla    v19.4s, v3.4s, v8.s[0]
  .endm
  /* SAVE16x1: updates 64 bytes at pCRow0 with alpha * v16-v19 and advances
     pCRow0 by 64 bytes (via the post-indexed store). */
  .macro SAVE16x1
        fmov    alpha0, alpha           // place alpha in lane 0 of v10

        ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV0
        fmla    v2.4s, v18.4s, alphaV0
        fmla    v3.4s, v19.4s, alphaV0
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64
  .endm
  750. /******************************************************************************/
  /* INIT8x1: clear the two accumulators v16 and v17 (each S-register write
     zeroes the whole vector register). */
  .macro INIT8x1
        fmov    s16, wzr
        fmov    s17, wzr
  .endm
  /* KERNEL8x1_SUB: one 8x1 step.  Reads one float at pB into s8 and
     32 consecutive bytes at pA into v0/v1 (pB advances 4 bytes, pA 32),
     then accumulates into v16/v17. */
  .macro KERNEL8x1_SUB
        ldr     s8, [pB], #4
        ld1     {v0.4s, v1.4s}, [pA], #32

        fmla    v16.4s, v0.4s, v8.s[0]
        fmla    v17.4s, v1.4s, v8.s[0]
  .endm
  /* SAVE8x1: updates 32 bytes at pCRow0 with alpha * v16/v17 and advances
     pCRow0 by 32 bytes (via the post-indexed store). */
  .macro SAVE8x1
        fmov    alpha0, alpha           // place alpha in lane 0 of v10

        ld1     {v0.4s, v1.4s}, [pCRow0]
        fmla    v0.4s, v16.4s, alphaV0
        fmla    v1.4s, v17.4s, alphaV0
        st1     {v0.4s, v1.4s}, [pCRow0], #32
  .endm
  773. /******************************************************************************/
  /* INIT4x1: clear the two accumulators v16 and v17 (v17 copies the
     already-cleared s16). */
  .macro INIT4x1
        fmov    s16, wzr
        fmov    s17, s16
  .endm
  /* KERNEL4x1_SUB: one 4x1 step.  Reads one float at pB into s8 and
     16 bytes at pA split across v0.2s / v1.2s (pB advances 4 bytes,
     pA 16), then accumulates into v16.2s / v17.2s. */
  .macro KERNEL4x1_SUB
        ldr     s8, [pB], #4
        ld1     {v0.2s, v1.2s}, [pA], #16

        fmla    v16.2s, v0.2s, v8.s[0]
        fmla    v17.2s, v1.2s, v8.s[0]
  .endm
  /* SAVE4x1: updates 16 bytes at pCRow0 with alpha * v16.2s / v17.2s and
     advances pCRow0 by 16 bytes (via the post-indexed store). */
  .macro SAVE4x1
        fmov    alpha0, alpha           // place alpha in lane 0 of v10

        ld1     {v8.2s, v9.2s}, [pCRow0]
        fmla    v8.2s, v16.2s, alphaV0
        fmla    v9.2s, v17.2s, alphaV0
        st1     {v8.2s, v9.2s}, [pCRow0], #16
  .endm
  794. /******************************************************************************/
  // INIT2x1: clear the single accumulator (v16) used by the 2x1 tile; the scalar
  // fmov also zeroes the rest of v16.
  795. .macro INIT2x1
  796. fmov s16, wzr
  797. .endm
  // KERNEL2x1_SUB: one K step of the 2x1 tile.
  // Reads 1 float from pB into s8 and 2 floats from pA into v0.2s, then accumulates
  // v16.2s += v0.2s * v8.s[0].
  // Advances pB by 4 bytes and pA by 8 bytes. Overwrites v0, v8.
  798. .macro KERNEL2x1_SUB
  799. ldr s8, [pB]
  800. add pB , pB, #4
  801. ld1 {v0.2s}, [pA]
  802. add pA , pA, #8
  803. fmla v16.2s, v0.2s, v8.s[0]
  804. .endm
  // SAVE2x1: write back 2 floats of C.
  // Loads 2 floats from pCRow0 into v8.2s, adds the accumulator v16.2s scaled by
  // alphaV0, stores the result and advances pCRow0 by 8 bytes.
  // v8 is used as a temporary. alpha0 / alphaV0 are aliases defined outside this chunk.
  805. .macro SAVE2x1
  806. fmov alpha0, alpha
  807. ld1 {v8.2s}, [pCRow0]
  808. fmla v8.2s, v16.2s, alphaV0
  809. st1 {v8.2s}, [pCRow0]
  810. add pCRow0, pCRow0, #8
  811. .endm
  812. /******************************************************************************/
  // INIT1x1: clear the scalar accumulator s16 used by the 1x1 tile.
  813. .macro INIT1x1
  814. fmov s16, wzr
  815. .endm
  // KERNEL1x1_SUB: one K step of the 1x1 tile.
  // Reads 1 float from pB into s8 and 1 float from pA into s0, then computes
  // s16 = s0 * s8 + s16 with a scalar fused multiply-add.
  // Advances pA and pB by 4 bytes each. Overwrites s0, s8.
  816. .macro KERNEL1x1_SUB
  817. ldr s8, [pB]
  818. add pB , pB, #4
  819. ldr s0, [pA]
  820. add pA , pA, #4
  821. fmadd s16, s0, s8, s16
  822. .endm
  // SAVE1x1: write back a single float of C.
  // Loads the float at pCRow0 into s8, adds the accumulator s16 scaled by alphaV0
  // (scalar by-element fmla), stores it and advances pCRow0 by 4 bytes.
  // s8 is used as a temporary. alpha0 / alphaV0 are aliases defined outside this chunk.
  823. .macro SAVE1x1
  824. fmov alpha0, alpha
  825. ldr s8, [pCRow0]
  826. fmla s8, s16, alphaV0
  827. str s8, [pCRow0]
  828. add pCRow0, pCRow0, #4
  829. .endm
  // Unrolling helpers for the 16x4 tile.
  // _x1 expands to one KERNEL16x4_M1 followed by one KERNEL16x4_M2 (both defined outside
  // this chunk). Every larger macro expands the previous one twice, so _xN emits N
  // M1/M2 pairs, i.e. 2 * N kernel macros of straight-line code.
  830. .macro KERNEL16x4_M1_M2_x1
  831. KERNEL16x4_M1
  832. KERNEL16x4_M2
  833. .endm
  // 2 pairs
  834. .macro KERNEL16x4_M1_M2_x2
  835. KERNEL16x4_M1_M2_x1
  836. KERNEL16x4_M1_M2_x1
  837. .endm
  // 4 pairs
  838. .macro KERNEL16x4_M1_M2_x4
  839. KERNEL16x4_M1_M2_x2
  840. KERNEL16x4_M1_M2_x2
  841. .endm
  // 8 pairs
  842. .macro KERNEL16x4_M1_M2_x8
  843. KERNEL16x4_M1_M2_x4
  844. KERNEL16x4_M1_M2_x4
  845. .endm
  // 16 pairs
  846. .macro KERNEL16x4_M1_M2_x16
  847. KERNEL16x4_M1_M2_x8
  848. KERNEL16x4_M1_M2_x8
  849. .endm
  // 32 pairs
  850. .macro KERNEL16x4_M1_M2_x32
  851. KERNEL16x4_M1_M2_x16
  852. KERNEL16x4_M1_M2_x16
  853. .endm
  // 64 pairs
  854. .macro KERNEL16x4_M1_M2_x64
  855. KERNEL16x4_M1_M2_x32
  856. KERNEL16x4_M1_M2_x32
  857. .endm
  858. /*******************************************************************************
  859. * End of macro definitions
  860. *******************************************************************************/
  // Function entry and one-time setup.
  // PROLOGUE is a macro defined outside this chunk (presumably it emits the function
  // symbol and its directives -- TODO confirm).
  861. PROLOGUE
  862. .Lsgemm_kernel_begin:
  863. .align 5
  // Reserve 11 * 16 = 176 bytes of stack (sp stays 16-byte aligned) and spill registers.
  // NOTE(review): AAPCS64 only requires preserving d8-d15 and x19-x28; d16/d17 and x18
  // are spilled here as well -- looks harmless but unnecessary, confirm before changing.
  864. add sp, sp, #-(11 * 16)
  865. stp d8, d9, [sp, #(0 * 16)]
  866. stp d10, d11, [sp, #(1 * 16)]
  867. stp d12, d13, [sp, #(2 * 16)]
  868. stp d14, d15, [sp, #(3 * 16)]
  869. stp d16, d17, [sp, #(4 * 16)]
  870. stp x18, x19, [sp, #(5 * 16)]
  871. stp x20, x21, [sp, #(6 * 16)]
  872. stp x22, x23, [sp, #(7 * 16)]
  873. stp x24, x25, [sp, #(8 * 16)]
  874. stp x26, x27, [sp, #(9 * 16)]
  875. str x28, [sp, #(10 * 16)]
  // Prefetch the start of the B and A buffers.
  876. prfm PLDL1KEEP, [origPB]
  877. prfm PLDL1KEEP, [origPA]
  // Copy the scalar passed in s0 into the alpha alias; v0 is reused as scratch by the
  // kernel and save macros, so it cannot stay there.
  878. fmov alpha, s0
  879. lsl LDC, LDC, #2 // ldc = ldc * 4 (elements -> bytes, 4-byte floats)
  880. mov pB, origPB
  // Outer loop count over N in groups of 4; go straight to the N-remainder code if N < 4.
  881. mov counterJ, origN
  882. asr counterJ, counterJ, #2 // J = N / 4
  883. cmp counterJ, #0
  884. ble .Lsgemm_kernel_L2_BEGIN
  885. /******************************************************************************/
  // ---- N block of 4: one pass per counterJ iteration ----
  // pCRow0..pCRow3 are four C pointers LDC bytes apart; pC is moved past all four so the
  // next pass starts 4 * LDC bytes further on. pA restarts at the beginning of A.
  886. .Lsgemm_kernel_L4_BEGIN:
  887. mov pCRow0, pC
  888. add pCRow1, pCRow0, LDC
  889. add pCRow2, pCRow1, LDC
  890. add pCRow3, pCRow2, LDC
  891. add pC, pCRow3, LDC
  892. mov pA, origPA // pA = start of A array
  // 16x4 tiles: counterI = M / 16 of them; skipped entirely when M < 16.
  893. .Lsgemm_kernel_L4_M16_BEGIN:
  894. mov counterI, origM
  895. asr counterI, counterI, #4 // counterI = M / 16
  896. cmp counterI, #0
  897. ble .Lsgemm_kernel_L4_M8_BEGIN
  898. .align 5
  // Per-tile entry: B is re-read from its start for every tile, pA keeps advancing.
  // The K loop is counted in units of 16 kernel macros (counterL = K / 16).
  // KERNEL16x4_I/_M1/_M2/_E are defined outside this chunk; presumably _I starts the
  // software pipeline, _M1/_M2 are the alternating steady-state steps and _E drains it,
  // each covering one K step -- TODO confirm.
  899. .Lsgemm_kernel_L4_M16_20:
  900. mov pB, origPB
  901. asr counterL , origK, #4 // L = K / 16
  902. cmp counterL , #2
  903. blt .Lsgemm_kernel_L4_M16_32
  // Head unit: I + M2 + 7 M1/M2 pairs = 16 macros.
  904. KERNEL16x4_I
  905. KERNEL16x4_M2
  906. KERNEL16x4_M1_M2_x4
  907. KERNEL16x4_M1_M2_x2
  908. KERNEL16x4_M1_M2_x1
  // Head and tail account for two units; the loop below runs the remaining counterL - 2.
  909. subs counterL, counterL, #2
  910. ble .Lsgemm_kernel_L4_M16_22a
  911. .align 5
  // Steady state: 8 pairs = 16 macros per pass.
  912. .Lsgemm_kernel_L4_M16_22:
  913. KERNEL16x4_M1_M2_x8
  914. subs counterL, counterL, #1
  915. bgt .Lsgemm_kernel_L4_M16_22
  916. .align 5
  // Tail unit: 7 pairs + M1 + E = 16 macros.
  917. .Lsgemm_kernel_L4_M16_22a:
  918. KERNEL16x4_M1_M2_x4
  919. KERNEL16x4_M1_M2_x2
  920. KERNEL16x4_M1_M2_x1
  921. KERNEL16x4_M1
  922. KERNEL16x4_E
  923. b .Lsgemm_kernel_L4_M16_44
  924. .align 5
  // Reached with counterL < 2. Bit 0 set means exactly one 16-macro unit remains;
  // otherwise only the accumulators are initialised. After tst with mask #1 the result
  // is never negative and V is cleared, so ble is taken exactly when the bit is clear.
  925. .Lsgemm_kernel_L4_M16_32:
  926. tst counterL, #1
  927. ble .Lsgemm_kernel_L4_M16_40
  // Single unit: I + M2 + 6 pairs + M1 + E = 16 macros.
  928. KERNEL16x4_I
  929. KERNEL16x4_M2
  930. KERNEL16x4_M1_M2_x4
  931. KERNEL16x4_M1_M2_x2
  932. KERNEL16x4_M1
  933. KERNEL16x4_E
  934. b .Lsgemm_kernel_L4_M16_44
  935. .Lsgemm_kernel_L4_M16_40:
  936. INIT16x4
  // Remainder: K % 16 steps, one KERNEL16x4_SUB each.
  937. .Lsgemm_kernel_L4_M16_44:
  938. ands counterL , origK, #15
  939. ble .Lsgemm_kernel_L4_M16_100
  940. .align 5
  941. .Lsgemm_kernel_L4_M16_46:
  942. KERNEL16x4_SUB
  943. subs counterL, counterL, #1
  944. bne .Lsgemm_kernel_L4_M16_46
  // Prefetch at the current A position and the start of B, then run SAVE16x4 (defined
  // outside this chunk; presumably writes the 16x4 tile back to C).
  945. .Lsgemm_kernel_L4_M16_100:
  946. prfm PLDL1KEEP, [pA]
  947. prfm PLDL1KEEP, [pA, #64]
  948. prfm PLDL1KEEP, [origPB]
  949. SAVE16x4
  // Next 16x4 tile until counterI reaches zero.
  950. .Lsgemm_kernel_L4_M16_END:
  951. subs counterI, counterI, #1
  952. bne .Lsgemm_kernel_L4_M16_20
  953. //------------------------------------------------------------------------------
  // 8x4 tile of the N-block-of-4 pass: executed at most once, when bit 3 of M is set.
  // If M % 16 == 0 there is no M remainder at all and the pass ends.
  // (After tst the ble acts as "branch if zero": the masked result is never negative.)
  954. .Lsgemm_kernel_L4_M8_BEGIN:
  955. mov counterI, origM
  956. tst counterI , #15
  957. ble .Lsgemm_kernel_L4_END
  958. tst counterI, #8
  959. ble .Lsgemm_kernel_L4_M4_BEGIN
  // K loop counted in pairs of kernel macros (counterL = K / 2). KERNEL8x4_I/_M1/_M2/_E
  // are defined outside this chunk; presumably pipeline start, two alternating
  // steady-state steps and drain -- TODO confirm.
  960. .Lsgemm_kernel_L4_M8_20:
  961. mov pB, origPB
  962. asr counterL , origK, #1 // L = K / 2
  963. cmp counterL , #2 // is there at least 4 to do?
  964. blt .Lsgemm_kernel_L4_M8_32
  965. KERNEL8x4_I // do one in the K
  966. KERNEL8x4_M2 // do another in the K
  // Head pair above and tail pair at _22a cover two units; loop the remaining counterL - 2.
  967. subs counterL, counterL, #2
  968. ble .Lsgemm_kernel_L4_M8_22a
  969. .align 5
  970. .Lsgemm_kernel_L4_M8_22:
  971. KERNEL8x4_M1
  972. KERNEL8x4_M2
  973. subs counterL, counterL, #1
  974. bgt .Lsgemm_kernel_L4_M8_22
  // Tail pair.
  975. .Lsgemm_kernel_L4_M8_22a:
  976. KERNEL8x4_M1
  977. KERNEL8x4_E
  978. b .Lsgemm_kernel_L4_M8_44
  // counterL < 2 here: one I/E pair if bit 0 is set, otherwise just INIT8x4.
  979. .Lsgemm_kernel_L4_M8_32:
  980. tst counterL, #1
  981. ble .Lsgemm_kernel_L4_M8_40
  982. KERNEL8x4_I
  983. KERNEL8x4_E
  984. b .Lsgemm_kernel_L4_M8_44
  985. .Lsgemm_kernel_L4_M8_40:
  986. INIT8x4
  // Odd K: one extra single step.
  987. .Lsgemm_kernel_L4_M8_44:
  988. ands counterL , origK, #1
  989. ble .Lsgemm_kernel_L4_M8_100
  990. .Lsgemm_kernel_L4_M8_46:
  991. KERNEL8x4_SUB
  992. .Lsgemm_kernel_L4_M8_100:
  993. SAVE8x4
  994. .Lsgemm_kernel_L4_M8_END:
  995. //------------------------------------------------------------------------------
  // 4x4 tile of the N-block-of-4 pass: executed at most once, when bit 2 of M is set.
  // If M % 8 == 0 nothing smaller than 8 remains and the pass ends.
  // (After tst the ble acts as "branch if zero": the masked result is never negative.)
  996. .Lsgemm_kernel_L4_M4_BEGIN:
  997. mov counterI, origM
  998. tst counterI , #7
  999. ble .Lsgemm_kernel_L4_END
  1000. tst counterI, #4
  1001. ble .Lsgemm_kernel_L4_M2_BEGIN
  // K loop counted in pairs of kernel macros (counterL = K / 2). KERNEL4x4_I/_M1/_M2/_E
  // are defined outside this chunk; presumably pipeline start, two alternating
  // steady-state steps and drain -- TODO confirm.
  1002. .Lsgemm_kernel_L4_M4_20:
  1003. mov pB, origPB
  1004. asr counterL , origK, #1 // L = K / 2
  1005. cmp counterL , #2 // is there at least 4 to do?
  1006. blt .Lsgemm_kernel_L4_M4_32
  1007. KERNEL4x4_I // do one in the K
  1008. KERNEL4x4_M2 // do another in the K
  // Head pair above and tail pair at _22a cover two units; loop the remaining counterL - 2.
  1009. subs counterL, counterL, #2
  1010. ble .Lsgemm_kernel_L4_M4_22a
  1011. .align 5
  1012. .Lsgemm_kernel_L4_M4_22:
  1013. KERNEL4x4_M1
  1014. KERNEL4x4_M2
  1015. subs counterL, counterL, #1
  1016. bgt .Lsgemm_kernel_L4_M4_22
  // Tail pair.
  1017. .Lsgemm_kernel_L4_M4_22a:
  1018. KERNEL4x4_M1
  1019. KERNEL4x4_E
  1020. b .Lsgemm_kernel_L4_M4_44
  // counterL < 2 here: one I/E pair if bit 0 is set, otherwise just INIT4x4.
  1021. .Lsgemm_kernel_L4_M4_32:
  1022. tst counterL, #1
  1023. ble .Lsgemm_kernel_L4_M4_40
  1024. KERNEL4x4_I
  1025. KERNEL4x4_E
  1026. b .Lsgemm_kernel_L4_M4_44
  1027. .Lsgemm_kernel_L4_M4_40:
  1028. INIT4x4
  // Odd K: one extra single step.
  1029. .Lsgemm_kernel_L4_M4_44:
  1030. ands counterL , origK, #1
  1031. ble .Lsgemm_kernel_L4_M4_100
  1032. .Lsgemm_kernel_L4_M4_46:
  1033. KERNEL4x4_SUB
  1034. .Lsgemm_kernel_L4_M4_100:
  1035. SAVE4x4
  1036. .Lsgemm_kernel_L4_M4_END:
  1037. //------------------------------------------------------------------------------
  // 2x4 tile of the N-block-of-4 pass: executed at most once, when bit 1 of M is set.
  // If M % 4 == 0 nothing smaller than 4 remains and the pass ends.
  // (After tst the ble acts as "branch if zero": the masked result is never negative.)
  1038. .Lsgemm_kernel_L4_M2_BEGIN:
  1039. mov counterI, origM
  1040. tst counterI , #3
  1041. ble .Lsgemm_kernel_L4_END
  1042. tst counterI, #2 // is bit 1 of M set?
  1043. ble .Lsgemm_kernel_L4_M1_BEGIN
  // Simple (non-pipelined) K loop: accumulators are cleared first, then K / 8 passes of
  // eight unrolled steps followed by K % 8 single steps.
  1044. .Lsgemm_kernel_L4_M2_20:
  1045. INIT2x4
  1046. mov pB, origPB
  1047. asr counterL , origK, #3 // counterL = K / 8
  1048. cmp counterL , #0
  1049. ble .Lsgemm_kernel_L4_M2_40
  1050. .Lsgemm_kernel_L4_M2_22:
  1051. KERNEL2x4_SUB
  1052. KERNEL2x4_SUB
  1053. KERNEL2x4_SUB
  1054. KERNEL2x4_SUB
  1055. KERNEL2x4_SUB
  1056. KERNEL2x4_SUB
  1057. KERNEL2x4_SUB
  1058. KERNEL2x4_SUB
  1059. subs counterL, counterL, #1
  1060. bgt .Lsgemm_kernel_L4_M2_22
  1061. .Lsgemm_kernel_L4_M2_40:
  1062. ands counterL , origK, #7 // counterL = K % 8
  1063. ble .Lsgemm_kernel_L4_M2_100
  1064. .Lsgemm_kernel_L4_M2_42:
  1065. KERNEL2x4_SUB
  1066. subs counterL, counterL, #1
  1067. bgt .Lsgemm_kernel_L4_M2_42
  1068. .Lsgemm_kernel_L4_M2_100:
  1069. SAVE2x4
  1070. .Lsgemm_kernel_L4_M2_END:
  // 1x4 tile of the N-block-of-4 pass: executed when M is odd.
  // counterI was loaded with origM at .Lsgemm_kernel_L4_M2_BEGIN; nothing visible in
  // between rewrites it.
  1071. .Lsgemm_kernel_L4_M1_BEGIN:
  1072. tst counterI, #1 // is M odd?
  1073. ble .Lsgemm_kernel_L4_END
  // Simple K loop: clear accumulators, K / 8 passes of eight unrolled steps, then
  // K % 8 single steps.
  1074. .Lsgemm_kernel_L4_M1_20:
  1075. INIT1x4
  1076. mov pB, origPB
  1077. asr counterL , origK, #3 // counterL = K / 8
  1078. cmp counterL , #0
  1079. ble .Lsgemm_kernel_L4_M1_40
  1080. .Lsgemm_kernel_L4_M1_22:
  1081. KERNEL1x4_SUB
  1082. KERNEL1x4_SUB
  1083. KERNEL1x4_SUB
  1084. KERNEL1x4_SUB
  1085. KERNEL1x4_SUB
  1086. KERNEL1x4_SUB
  1087. KERNEL1x4_SUB
  1088. KERNEL1x4_SUB
  1089. subs counterL, counterL, #1
  1090. bgt .Lsgemm_kernel_L4_M1_22
  1091. .Lsgemm_kernel_L4_M1_40:
  1092. ands counterL , origK, #7 // counterL = K % 8
  1093. ble .Lsgemm_kernel_L4_M1_100
  1094. .Lsgemm_kernel_L4_M1_42:
  1095. KERNEL1x4_SUB
  1096. subs counterL, counterL, #1
  1097. bgt .Lsgemm_kernel_L4_M1_42
  1098. .Lsgemm_kernel_L4_M1_100:
  1099. SAVE1x4
  // End of one N-block-of-4 pass: step origPB past the B data this pass read
  // (K * 4 floats = K << 4 bytes) and repeat while counterJ is still positive.
  1100. .Lsgemm_kernel_L4_END:
  1101. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1102. subs counterJ, counterJ , #1 // j--
  1103. bgt .Lsgemm_kernel_L4_BEGIN
  1104. /******************************************************************************/
  // ---- N remainder, block of 2 ----
  // Done entirely if N % 4 == 0; skipped if bit 1 of N is clear. Otherwise pCRow0 takes
  // the current C position and pC is advanced by 2 * LDC bytes for what follows.
  // (After tst the ble acts as "branch if zero": the masked result is never negative.)
  1105. .Lsgemm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  1106. mov counterJ , origN
  1107. tst counterJ , #3
  1108. ble .Lsgemm_kernel_L999
  1109. tst counterJ , #2
  1110. ble .Lsgemm_kernel_L1_BEGIN
  1111. mov pCRow0, pC // pCRow0 = pC
  1112. add pC,pC,LDC, lsl #1
  1113. mov pA, origPA // pA = A
  // 16x2 tiles: counterI = M / 16 of them; skipped when M < 16.
  1114. .Lsgemm_kernel_L2_M16_BEGIN:
  1115. mov counterI, origM
  1116. asr counterI, counterI, #4 // counterI = M / 16
  1117. cmp counterI,#0
  1118. ble .Lsgemm_kernel_L2_M8_BEGIN
  // Per tile: clear accumulators, restart B, K / 8 passes of eight unrolled steps, then
  // K % 8 single steps, then write back.
  1119. .Lsgemm_kernel_L2_M16_20:
  1120. INIT16x2
  1121. mov pB, origPB
  1122. asr counterL , origK, #3 // counterL = K / 8
  1123. cmp counterL,#0
  1124. ble .Lsgemm_kernel_L2_M16_40
  1125. .align 5
  1126. .Lsgemm_kernel_L2_M16_22:
  1127. KERNEL16x2_SUB
  1128. KERNEL16x2_SUB
  1129. KERNEL16x2_SUB
  1130. KERNEL16x2_SUB
  1131. KERNEL16x2_SUB
  1132. KERNEL16x2_SUB
  1133. KERNEL16x2_SUB
  1134. KERNEL16x2_SUB
  1135. subs counterL, counterL, #1
  1136. bgt .Lsgemm_kernel_L2_M16_22
  1137. .Lsgemm_kernel_L2_M16_40:
  1138. ands counterL , origK, #7 // counterL = K % 8
  1139. ble .Lsgemm_kernel_L2_M16_100
  1140. .Lsgemm_kernel_L2_M16_42:
  1141. KERNEL16x2_SUB
  1142. subs counterL, counterL, #1
  1143. bgt .Lsgemm_kernel_L2_M16_42
  1144. .Lsgemm_kernel_L2_M16_100:
  1145. SAVE16x2
  // Next 16x2 tile while counterI is still positive.
  1146. .Lsgemm_kernel_L2_M16_END:
  1147. subs counterI, counterI, #1
  1148. bgt .Lsgemm_kernel_L2_M16_20
  1149. //------------------------------------------------------------------------------
  // 8x2 tile of the N-block-of-2 pass: executed at most once, when bit 3 of M is set.
  // If M % 16 == 0 there is no M remainder and the pass ends.
  1150. .Lsgemm_kernel_L2_M8_BEGIN:
  1151. mov counterI, origM
  1152. tst counterI , #15
  1153. ble .Lsgemm_kernel_L2_END
  1154. tst counterI, #8
  1155. ble .Lsgemm_kernel_L2_M4_BEGIN
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1156. .Lsgemm_kernel_L2_M8_20:
  1157. INIT8x2
  1158. mov pB, origPB
  1159. asr counterL , origK, #3 // counterL = K / 8
  1160. cmp counterL,#0
  1161. ble .Lsgemm_kernel_L2_M8_40
  1162. .align 5
  1163. .Lsgemm_kernel_L2_M8_22:
  1164. KERNEL8x2_SUB
  1165. KERNEL8x2_SUB
  1166. KERNEL8x2_SUB
  1167. KERNEL8x2_SUB
  1168. KERNEL8x2_SUB
  1169. KERNEL8x2_SUB
  1170. KERNEL8x2_SUB
  1171. KERNEL8x2_SUB
  1172. subs counterL, counterL, #1
  1173. bgt .Lsgemm_kernel_L2_M8_22
  1174. .Lsgemm_kernel_L2_M8_40:
  1175. ands counterL , origK, #7 // counterL = K % 8
  1176. ble .Lsgemm_kernel_L2_M8_100
  1177. .Lsgemm_kernel_L2_M8_42:
  1178. KERNEL8x2_SUB
  1179. subs counterL, counterL, #1
  1180. bgt .Lsgemm_kernel_L2_M8_42
  1181. .Lsgemm_kernel_L2_M8_100:
  1182. SAVE8x2
  1183. .Lsgemm_kernel_L2_M8_END:
  1184. //------------------------------------------------------------------------------
  // 4x2 tile of the N-block-of-2 pass: executed at most once, when bit 2 of M is set.
  // If M % 8 == 0 nothing smaller than 8 remains and the pass ends.
  1185. .Lsgemm_kernel_L2_M4_BEGIN:
  1186. mov counterI, origM
  1187. tst counterI , #7
  1188. ble .Lsgemm_kernel_L2_END
  1189. tst counterI, #4
  1190. ble .Lsgemm_kernel_L2_M2_BEGIN
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1191. .Lsgemm_kernel_L2_M4_20:
  1192. INIT4x2
  1193. mov pB, origPB
  1194. asr counterL , origK, #3 // counterL = K / 8
  1195. cmp counterL,#0
  1196. ble .Lsgemm_kernel_L2_M4_40
  1197. .align 5
  1198. .Lsgemm_kernel_L2_M4_22:
  1199. KERNEL4x2_SUB
  1200. KERNEL4x2_SUB
  1201. KERNEL4x2_SUB
  1202. KERNEL4x2_SUB
  1203. KERNEL4x2_SUB
  1204. KERNEL4x2_SUB
  1205. KERNEL4x2_SUB
  1206. KERNEL4x2_SUB
  1207. subs counterL, counterL, #1
  1208. bgt .Lsgemm_kernel_L2_M4_22
  1209. .Lsgemm_kernel_L2_M4_40:
  1210. ands counterL , origK, #7 // counterL = K % 8
  1211. ble .Lsgemm_kernel_L2_M4_100
  1212. .Lsgemm_kernel_L2_M4_42:
  1213. KERNEL4x2_SUB
  1214. subs counterL, counterL, #1
  1215. bgt .Lsgemm_kernel_L2_M4_42
  1216. .Lsgemm_kernel_L2_M4_100:
  1217. SAVE4x2
  1218. .Lsgemm_kernel_L2_M4_END:
  1219. //------------------------------------------------------------------------------
  // 2x2 tile of the N-block-of-2 pass: executed at most once, when bit 1 of M is set.
  // If M % 4 == 0 nothing smaller than 4 remains and the pass ends.
  1220. .Lsgemm_kernel_L2_M2_BEGIN:
  1221. mov counterI, origM
  1222. tst counterI , #3
  1223. ble .Lsgemm_kernel_L2_END
  1224. tst counterI, #2 // is bit 1 of M set?
  1225. ble .Lsgemm_kernel_L2_M1_BEGIN
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1226. .Lsgemm_kernel_L2_M2_20:
  1227. INIT2x2
  1228. mov pB, origPB
  1229. asr counterL , origK, #3 // counterL = K / 8
  1230. cmp counterL,#0
  1231. ble .Lsgemm_kernel_L2_M2_40
  1232. .Lsgemm_kernel_L2_M2_22:
  1233. KERNEL2x2_SUB
  1234. KERNEL2x2_SUB
  1235. KERNEL2x2_SUB
  1236. KERNEL2x2_SUB
  1237. KERNEL2x2_SUB
  1238. KERNEL2x2_SUB
  1239. KERNEL2x2_SUB
  1240. KERNEL2x2_SUB
  1241. subs counterL, counterL, #1
  1242. bgt .Lsgemm_kernel_L2_M2_22
  1243. .Lsgemm_kernel_L2_M2_40:
  1244. ands counterL , origK, #7 // counterL = K % 8
  1245. ble .Lsgemm_kernel_L2_M2_100
  1246. .Lsgemm_kernel_L2_M2_42:
  1247. KERNEL2x2_SUB
  1248. subs counterL, counterL, #1
  1249. bgt .Lsgemm_kernel_L2_M2_42
  1250. .Lsgemm_kernel_L2_M2_100:
  1251. SAVE2x2
  1252. .Lsgemm_kernel_L2_M2_END:
  // 1x2 tile of the N-block-of-2 pass: executed when M is odd.
  // counterI was loaded with origM at .Lsgemm_kernel_L2_M2_BEGIN; nothing visible in
  // between rewrites it.
  1253. .Lsgemm_kernel_L2_M1_BEGIN:
  1254. tst counterI, #1 // is M odd?
  1255. ble .Lsgemm_kernel_L2_END
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1256. .Lsgemm_kernel_L2_M1_20:
  1257. INIT1x2
  1258. mov pB, origPB
  1259. asr counterL , origK, #3 // counterL = K / 8
  1260. cmp counterL, #0
  1261. ble .Lsgemm_kernel_L2_M1_40
  1262. .Lsgemm_kernel_L2_M1_22:
  1263. KERNEL1x2_SUB
  1264. KERNEL1x2_SUB
  1265. KERNEL1x2_SUB
  1266. KERNEL1x2_SUB
  1267. KERNEL1x2_SUB
  1268. KERNEL1x2_SUB
  1269. KERNEL1x2_SUB
  1270. KERNEL1x2_SUB
  1271. subs counterL, counterL, #1
  1272. bgt .Lsgemm_kernel_L2_M1_22
  1273. .Lsgemm_kernel_L2_M1_40:
  1274. ands counterL , origK, #7 // counterL = K % 8
  1275. ble .Lsgemm_kernel_L2_M1_100
  1276. .Lsgemm_kernel_L2_M1_42:
  1277. KERNEL1x2_SUB
  1278. subs counterL, counterL, #1
  1279. bgt .Lsgemm_kernel_L2_M1_42
  1280. .Lsgemm_kernel_L2_M1_100:
  1281. SAVE1x2
  // End of the N-block-of-2 pass: step origPB past the B data this pass read
  // (K * 2 floats = K << 3 bytes).
  1282. .Lsgemm_kernel_L2_END:
  1283. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1284. /******************************************************************************/
  // ---- N remainder, block of 1: only when N is odd ----
  // pCRow0 takes the current C position, pC is advanced by LDC bytes, and pA restarts
  // at the beginning of A.
  // (After tst the ble acts as "branch if zero": the masked result is never negative.)
  1285. .Lsgemm_kernel_L1_BEGIN:
  1286. mov counterJ , origN
  1287. tst counterJ , #1
  1288. ble .Lsgemm_kernel_L999 // done
  1289. mov pCRow0, pC // pCRow0 = C
  1290. add pC , pC , LDC // Update pC to point to next
  1291. mov pA, origPA // pA = A
  // 16x1 tiles: counterI = M / 16 of them; skipped when M < 16.
  1292. .Lsgemm_kernel_L1_M16_BEGIN:
  1293. mov counterI, origM
  1294. asr counterI, counterI, #4 // counterI = M / 16
  1295. cmp counterI, #0
  1296. ble .Lsgemm_kernel_L1_M8_BEGIN
  // Per tile: clear accumulators, restart B, K / 8 passes of eight unrolled steps, then
  // K % 8 single steps, then write back.
  1297. .Lsgemm_kernel_L1_M16_20:
  1298. INIT16x1
  1299. mov pB, origPB
  1300. asr counterL , origK, #3 // counterL = K / 8
  1301. cmp counterL , #0
  1302. ble .Lsgemm_kernel_L1_M16_40
  1303. .align 5
  1304. .Lsgemm_kernel_L1_M16_22:
  1305. KERNEL16x1_SUB
  1306. KERNEL16x1_SUB
  1307. KERNEL16x1_SUB
  1308. KERNEL16x1_SUB
  1309. KERNEL16x1_SUB
  1310. KERNEL16x1_SUB
  1311. KERNEL16x1_SUB
  1312. KERNEL16x1_SUB
  1313. subs counterL, counterL, #1
  1314. bgt .Lsgemm_kernel_L1_M16_22
  1315. .Lsgemm_kernel_L1_M16_40:
  1316. ands counterL , origK, #7 // counterL = K % 8
  1317. ble .Lsgemm_kernel_L1_M16_100
  1318. .Lsgemm_kernel_L1_M16_42:
  1319. KERNEL16x1_SUB
  1320. subs counterL, counterL, #1
  1321. bgt .Lsgemm_kernel_L1_M16_42
  1322. .Lsgemm_kernel_L1_M16_100:
  1323. SAVE16x1
  // Next 16x1 tile while counterI is still positive.
  1324. .Lsgemm_kernel_L1_M16_END:
  1325. subs counterI, counterI, #1
  1326. bgt .Lsgemm_kernel_L1_M16_20
  1327. //------------------------------------------------------------------------------
  // 8x1 tile of the N-block-of-1 pass: executed at most once, when bit 3 of M is set.
  // If M % 16 == 0 there is no M remainder and the pass ends.
  1328. .Lsgemm_kernel_L1_M8_BEGIN:
  1329. mov counterI, origM
  1330. tst counterI , #15
  1331. ble .Lsgemm_kernel_L1_END
  1332. tst counterI, #8
  1333. ble .Lsgemm_kernel_L1_M4_BEGIN
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1334. .Lsgemm_kernel_L1_M8_20:
  1335. INIT8x1
  1336. mov pB, origPB
  1337. asr counterL , origK, #3 // counterL = K / 8
  1338. cmp counterL , #0
  1339. ble .Lsgemm_kernel_L1_M8_40
  1340. .align 5
  1341. .Lsgemm_kernel_L1_M8_22:
  1342. KERNEL8x1_SUB
  1343. KERNEL8x1_SUB
  1344. KERNEL8x1_SUB
  1345. KERNEL8x1_SUB
  1346. KERNEL8x1_SUB
  1347. KERNEL8x1_SUB
  1348. KERNEL8x1_SUB
  1349. KERNEL8x1_SUB
  1350. subs counterL, counterL, #1
  1351. bgt .Lsgemm_kernel_L1_M8_22
  1352. .Lsgemm_kernel_L1_M8_40:
  1353. ands counterL , origK, #7 // counterL = K % 8
  1354. ble .Lsgemm_kernel_L1_M8_100
  1355. .Lsgemm_kernel_L1_M8_42:
  1356. KERNEL8x1_SUB
  1357. subs counterL, counterL, #1
  1358. bgt .Lsgemm_kernel_L1_M8_42
  1359. .Lsgemm_kernel_L1_M8_100:
  1360. SAVE8x1
  1361. .Lsgemm_kernel_L1_M8_END:
  1362. //------------------------------------------------------------------------------
  // 4x1 tile of the N-block-of-1 pass: executed at most once, when bit 2 of M is set.
  // If M % 8 == 0 nothing smaller than 8 remains and the pass ends.
  1363. .Lsgemm_kernel_L1_M4_BEGIN:
  1364. mov counterI, origM
  1365. tst counterI , #7
  1366. ble .Lsgemm_kernel_L1_END
  1367. tst counterI, #4
  1368. ble .Lsgemm_kernel_L1_M2_BEGIN
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1369. .Lsgemm_kernel_L1_M4_20:
  1370. INIT4x1
  1371. mov pB, origPB
  1372. asr counterL , origK, #3 // counterL = K / 8
  1373. cmp counterL , #0
  1374. ble .Lsgemm_kernel_L1_M4_40
  1375. .align 5
  1376. .Lsgemm_kernel_L1_M4_22:
  1377. KERNEL4x1_SUB
  1378. KERNEL4x1_SUB
  1379. KERNEL4x1_SUB
  1380. KERNEL4x1_SUB
  1381. KERNEL4x1_SUB
  1382. KERNEL4x1_SUB
  1383. KERNEL4x1_SUB
  1384. KERNEL4x1_SUB
  1385. subs counterL, counterL, #1
  1386. bgt .Lsgemm_kernel_L1_M4_22
  1387. .Lsgemm_kernel_L1_M4_40:
  1388. ands counterL , origK, #7 // counterL = K % 8
  1389. ble .Lsgemm_kernel_L1_M4_100
  1390. .Lsgemm_kernel_L1_M4_42:
  1391. KERNEL4x1_SUB
  1392. subs counterL, counterL, #1
  1393. bgt .Lsgemm_kernel_L1_M4_42
  1394. .Lsgemm_kernel_L1_M4_100:
  1395. SAVE4x1
  1396. .Lsgemm_kernel_L1_M4_END:
  1397. //------------------------------------------------------------------------------
  // 2x1 tile of the N-block-of-1 pass: executed at most once, when bit 1 of M is set.
  // If M % 4 == 0 nothing smaller than 4 remains and the pass ends.
  1398. .Lsgemm_kernel_L1_M2_BEGIN:
  1399. mov counterI, origM
  1400. tst counterI , #3
  1401. ble .Lsgemm_kernel_L1_END
  1402. tst counterI, #2 // is bit 1 of M set?
  1403. ble .Lsgemm_kernel_L1_M1_BEGIN
  // Clear accumulators, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1404. .Lsgemm_kernel_L1_M2_20:
  1405. INIT2x1
  1406. mov pB, origPB
  1407. asr counterL , origK, #3 // counterL = K / 8
  1408. cmp counterL , #0
  1409. ble .Lsgemm_kernel_L1_M2_40
  1410. .Lsgemm_kernel_L1_M2_22:
  1411. KERNEL2x1_SUB
  1412. KERNEL2x1_SUB
  1413. KERNEL2x1_SUB
  1414. KERNEL2x1_SUB
  1415. KERNEL2x1_SUB
  1416. KERNEL2x1_SUB
  1417. KERNEL2x1_SUB
  1418. KERNEL2x1_SUB
  1419. subs counterL, counterL, #1
  1420. bgt .Lsgemm_kernel_L1_M2_22
  1421. .Lsgemm_kernel_L1_M2_40:
  1422. ands counterL , origK, #7 // counterL = K % 8
  1423. ble .Lsgemm_kernel_L1_M2_100
  1424. .Lsgemm_kernel_L1_M2_42:
  1425. KERNEL2x1_SUB
  1426. subs counterL, counterL, #1
  1427. bgt .Lsgemm_kernel_L1_M2_42
  1428. .Lsgemm_kernel_L1_M2_100:
  1429. SAVE2x1
  1430. .Lsgemm_kernel_L1_M2_END:
  // 1x1 tile of the N-block-of-1 pass: executed when M is odd.
  // counterI was loaded with origM at .Lsgemm_kernel_L1_M2_BEGIN; nothing visible in
  // between rewrites it.
  1431. .Lsgemm_kernel_L1_M1_BEGIN:
  1432. tst counterI, #1 // is M odd?
  1433. ble .Lsgemm_kernel_L1_END
  // Clear the accumulator, restart B, K / 8 passes of eight unrolled steps, then K % 8
  // single steps, then write back.
  1434. .Lsgemm_kernel_L1_M1_20:
  1435. INIT1x1
  1436. mov pB, origPB
  1437. asr counterL , origK, #3 // counterL = K / 8
  1438. cmp counterL , #0
  1439. ble .Lsgemm_kernel_L1_M1_40
  1440. .Lsgemm_kernel_L1_M1_22:
  1441. KERNEL1x1_SUB
  1442. KERNEL1x1_SUB
  1443. KERNEL1x1_SUB
  1444. KERNEL1x1_SUB
  1445. KERNEL1x1_SUB
  1446. KERNEL1x1_SUB
  1447. KERNEL1x1_SUB
  1448. KERNEL1x1_SUB
  1449. subs counterL, counterL, #1
  1450. bgt .Lsgemm_kernel_L1_M1_22
  1451. .Lsgemm_kernel_L1_M1_40:
  1452. ands counterL , origK, #7 // counterL = K % 8
  1453. ble .Lsgemm_kernel_L1_M1_100
  1454. .Lsgemm_kernel_L1_M1_42:
  1455. KERNEL1x1_SUB
  1456. subs counterL, counterL, #1
  1457. bgt .Lsgemm_kernel_L1_M1_42
  1458. .Lsgemm_kernel_L1_M1_100:
  1459. SAVE1x1
  // Common exit: return 0 in x0, reload every register spilled at function entry from
  // the same stack slots, release the 176-byte frame and return.
  // EPILOGUE is a macro defined outside this chunk.
  1460. .Lsgemm_kernel_L1_END:
  1461. .Lsgemm_kernel_L999:
  1462. mov x0, #0 // set return value
  1463. ldp d8, d9, [sp, #(0 * 16)]
  1464. ldp d10, d11, [sp, #(1 * 16)]
  1465. ldp d12, d13, [sp, #(2 * 16)]
  1466. ldp d14, d15, [sp, #(3 * 16)]
  1467. ldp d16, d17, [sp, #(4 * 16)]
  1468. ldp x18, x19, [sp, #(5 * 16)]
  1469. ldp x20, x21, [sp, #(6 * 16)]
  1470. ldp x22, x23, [sp, #(7 * 16)]
  1471. ldp x24, x25, [sp, #(8 * 16)]
  1472. ldp x26, x27, [sp, #(9 * 16)]
  1473. ldr x28, [sp, #(10 * 16)]
  1474. add sp, sp, #(11*16)
  1475. ret
  1476. EPILOGUE