You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_8x4_thunderx2t99.S 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  48. #define alpha x17
  49. #define alpha0 d10
  50. #define alphaV0 v10.d[0]
  51. #define A_PRE_SIZE x20
  52. #define B_PRE_SIZE x21
  53. #define C_PRE_SIZE x22
  54. #define A_PRE_SIZE_64 x23
  55. #define B_PRE_SIZE_64 x24
  56. // 00 origM
  57. // 01 origN
  58. // 02 origK
  59. // 03 origPA
  60. // 04 origPB
  61. // 05 pC
  62. // 06 origLDC -> LDC
  63. // 07 temp
  64. // 08 counterL
  65. // 09 counterI
  66. // 10 counterJ
  67. // 11 pB
  68. // 12 pCRow0
  69. // 13 pCRow1
  70. // 14 pCRow2
  71. // 15 pCRow3
  72. // 16 pA
  73. // 17
  74. // 18 must save
  75. // 19 must save
  76. // 20 must save
  77. // 21 must save
  78. // 22 must save
  79. // 23 must save
  80. // 24 must save
  81. // 25 must save
  82. // 26 must save
  83. // 27 must save
  84. // 28 must save
  85. // 29 frame
  86. // 30 link
  87. // 31 sp
  88. //v00 ALPHA -> pA0_0, pA0_1
  89. //v01 pA0_2, pA0_3
  90. //v02 pA0_4, pA0_5
  91. //v03 pA0_6, pA0_7
  92. //v04 pA1_0, pA1_1
  93. //v05 pA1_2, pA1_3
  94. //v06 pA1_4, pA1_5
  95. //v07 pA1_6, pA1_7
  96. //v08 must save pB0_0
  97. //v09 must save pB0_1
  98. //v10 must save pB0_2 --> ALPHA0
  99. //v11 must save pB0_3
  100. //v12 must save pB1_0
  101. //v13 must save pB1_1
  102. //v14 must save pB1_2
  103. //v15 must save pB1_3
  104. //v16 must save C00, C01
  105. //v17 must save C02, C03
  106. //v18 C04, C05
  107. //v19 C06, C07
  108. //v20 C10, C11
  109. //v21 C12, C13
  110. //v22 C14, C15
  111. //v23 C16, C17
  112. //v24 C20, C21
  113. //v25 C22, C23
  114. //v26 C24, C25
  115. //v27 C26, C27
  116. //v28 C30, C31
  117. //v29 C32, C33
  118. //v30 C34, C35
  119. //v31 C36, C37
  120. /*******************************************************************************
  121. * Macro definitions
  122. *******************************************************************************/
  // INIT8x4: zero the 16 accumulator registers v16-v31 that hold the
  // 8x4 block of C. Writing dN zeroes the full 128-bit vN. The mix of
  // "fmov dN, xzr" and copies from an already-zeroed dN spreads the
  // zeroing over different execution pipes instead of serializing on xzr.
  123. .macro INIT8x4
  124. fmov d16, xzr
  125. fmov d17, xzr
  126. fmov d18, d16
  127. fmov d19, xzr
  128. fmov d20, xzr
  129. fmov d21, d16
  130. fmov d22, d17
  131. fmov d23, d18
  132. fmov d24, xzr
  133. fmov d25, d16
  134. fmov d26, d17
  135. fmov d27, d18
  136. fmov d28, xzr
  137. fmov d29, d16
  138. fmov d30, d17
  139. fmov d31, d18
  140. .endm
  // KERNEL8x4_I: first iteration of the software-pipelined 8x4 loop.
  // Initializes the accumulators with fmul (instead of fmla), so no
  // prior INIT8x4 is needed on this path. Loads TWO k-steps of data:
  //   step 0: A columns in v0-v3 (8 doubles), B row in v8/v9 (4 doubles)
  //   step 1 (preload for the following M2): A in v4-v7, B in v12/v13
  // and advances pA/pB past both steps.
  141. .macro KERNEL8x4_I
  142. ldp q0, q1, [pA]
  143. ldp q8, q9, [pB]
  144. ldp q2, q3, [pA, #32]
  145. ldp q4, q5, [pA, #64]
  146. ldp q12, q13, [pB, #32]
  147. ldp q6, q7, [pA, #96]
  148. fmul v16.2d, v0.2d, v8.d[0]
  149. fmul v20.2d, v0.2d, v8.d[1]
  150. fmul v17.2d, v1.2d, v8.d[0]
  151. fmul v21.2d, v1.2d, v8.d[1]
  152. add pA, pA, #128 // 2 k-steps * 8 doubles * 8 bytes
  153. add pB, pB, #64 // 2 k-steps * 4 doubles * 8 bytes
  154. fmul v24.2d, v0.2d, v9.d[0]
  155. fmul v28.2d, v0.2d, v9.d[1]
  156. fmul v25.2d, v1.2d, v9.d[0]
  157. fmul v29.2d, v1.2d, v9.d[1]
  158. prfm PLDL1KEEP, [pA, A_PRE_SIZE] // A_PRE_SIZE = 3584 (set in prologue)
  159. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
  160. fmul v18.2d, v2.2d, v8.d[0]
  161. fmul v22.2d, v2.2d, v8.d[1]
  162. fmul v26.2d, v2.2d, v9.d[0]
  163. fmul v30.2d, v2.2d, v9.d[1]
  164. fmul v19.2d, v3.2d, v8.d[0]
  165. fmul v27.2d, v3.2d, v9.d[0]
  166. fmul v31.2d, v3.2d, v9.d[1]
  167. fmul v23.2d, v3.2d, v8.d[1]
  168. .endm
  // KERNEL8x4_M1_M2: fused M1+M2 pair — processes TWO k-steps per
  // expansion. First half accumulates from the previously loaded
  // v0-v3 / v8-v9 while preloading v4-v7 / v12-v13; second half
  // accumulates from those while reloading v0-v3 / v8-v9 for the next
  // pair. Loads and FMAs are interleaved to hide memory latency.
  169. .macro KERNEL8x4_M1_M2
  170. ldp q12, q13, [pB]
  171. ldp q4, q5, [pA]
  172. ldp q6, q7, [pA, #32]
  173. fmla v16.2d, v0.2d, v8.d[0]
  174. fmla v20.2d, v0.2d, v8.d[1]
  175. fmla v24.2d, v0.2d, v9.d[0]
  176. fmla v28.2d, v0.2d, v9.d[1]
  177. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
  178. fmla v17.2d, v1.2d, v8.d[0]
  179. fmla v25.2d, v1.2d, v9.d[0]
  180. fmla v21.2d, v1.2d, v8.d[1]
  181. fmla v29.2d, v1.2d, v9.d[1]
  182. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
  183. fmla v18.2d, v2.2d, v8.d[0]
  184. fmla v22.2d, v2.2d, v8.d[1]
  185. fmla v26.2d, v2.2d, v9.d[0]
  186. fmla v30.2d, v2.2d, v9.d[1]
  187. prfm PLDL1KEEP, [pA, #3840] // NOTE(review): magic offset = A_PRE_SIZE+256; consider a named define
  188. fmla v19.2d, v3.2d, v8.d[0]
  189. fmla v23.2d, v3.2d, v8.d[1]
  190. fmla v27.2d, v3.2d, v9.d[0]
  191. fmla v31.2d, v3.2d, v9.d[1]
  192. ldp q8, q9, [pB, #32] // reload B for the next k-step pair
  193. ldp q0, q1, [pA, #64] // reload A for the next k-step pair
  194. ldp q2, q3, [pA, #96]
  195. fmla v16.2d, v4.2d, v12.d[0]
  196. fmla v20.2d, v4.2d, v12.d[1]
  197. fmla v24.2d, v4.2d, v13.d[0]
  198. fmla v28.2d, v4.2d, v13.d[1]
  199. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
  200. fmla v17.2d, v5.2d, v12.d[0]
  201. fmla v25.2d, v5.2d, v13.d[0]
  202. fmla v21.2d, v5.2d, v12.d[1]
  203. fmla v29.2d, v5.2d, v13.d[1]
  204. fmla v18.2d, v6.2d, v12.d[0]
  205. fmla v22.2d, v6.2d, v12.d[1]
  206. fmla v26.2d, v6.2d, v13.d[0]
  207. fmla v30.2d, v6.2d, v13.d[1]
  208. add pB, pB, #64 // advance past the 2 k-steps just consumed
  209. add pA, pA, #128
  210. fmla v19.2d, v7.2d, v12.d[0]
  211. fmla v23.2d, v7.2d, v12.d[1]
  212. fmla v27.2d, v7.2d, v13.d[0]
  213. fmla v31.2d, v7.2d, v13.d[1]
  214. .endm
  // KERNEL8x4_M1: pipelined "odd" step. Accumulates one k-step from the
  // previously loaded v0-v3 (A) and v8/v9 (B) while preloading the next
  // k-step into v4-v7 / v12-v13 for a following M2 or E. Advances
  // pointers by ONE k-step (64 bytes of A, 32 bytes of B).
  215. .macro KERNEL8x4_M1
  216. ldp q12, q13, [pB]
  217. ldp q4, q5, [pA]
  218. ldp q6, q7, [pA, #32]
  219. fmla v16.2d, v0.2d, v8.d[0]
  220. fmla v20.2d, v0.2d, v8.d[1]
  221. fmla v24.2d, v0.2d, v9.d[0]
  222. fmla v28.2d, v0.2d, v9.d[1]
  223. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
  224. fmla v17.2d, v1.2d, v8.d[0]
  225. fmla v25.2d, v1.2d, v9.d[0]
  226. fmla v21.2d, v1.2d, v8.d[1]
  227. fmla v29.2d, v1.2d, v9.d[1]
  228. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
  229. fmla v18.2d, v2.2d, v8.d[0]
  230. fmla v22.2d, v2.2d, v8.d[1]
  231. fmla v26.2d, v2.2d, v9.d[0]
  232. fmla v30.2d, v2.2d, v9.d[1]
  233. add pB, pB, #32
  234. add pA, pA, #64
  235. fmla v19.2d, v3.2d, v8.d[0]
  236. fmla v23.2d, v3.2d, v8.d[1]
  237. fmla v27.2d, v3.2d, v9.d[0]
  238. fmla v31.2d, v3.2d, v9.d[1]
  239. .endm
  // KERNEL8x4_M2: pipelined "even" step — mirror image of M1. Consumes
  // the preloaded v4-v7 / v12-v13 while reloading v0-v3 / v8-v9 for the
  // next M1. Advances pointers by one k-step.
  240. .macro KERNEL8x4_M2
  241. ldp q8, q9, [pB]
  242. ldp q0, q1, [pA]
  243. ldp q2, q3, [pA, #32]
  244. fmla v16.2d, v4.2d, v12.d[0]
  245. fmla v20.2d, v4.2d, v12.d[1]
  246. fmla v24.2d, v4.2d, v13.d[0]
  247. fmla v28.2d, v4.2d, v13.d[1]
  248. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
  249. fmla v17.2d, v5.2d, v12.d[0]
  250. fmla v25.2d, v5.2d, v13.d[0]
  251. fmla v21.2d, v5.2d, v12.d[1]
  252. fmla v29.2d, v5.2d, v13.d[1]
  253. fmla v18.2d, v6.2d, v12.d[0]
  254. fmla v22.2d, v6.2d, v12.d[1]
  255. fmla v26.2d, v6.2d, v13.d[0]
  256. fmla v30.2d, v6.2d, v13.d[1]
  257. add pB, pB, #32
  258. add pA, pA, #64
  259. fmla v19.2d, v7.2d, v12.d[0]
  260. fmla v23.2d, v7.2d, v12.d[1]
  261. fmla v27.2d, v7.2d, v13.d[0]
  262. fmla v31.2d, v7.2d, v13.d[1]
  263. .endm
  // KERNEL8x4_E: drains the pipeline — accumulates the final preloaded
  // k-step (v4-v7 / v12-v13) without issuing any further loads or
  // pointer updates. Always the last step after an M1.
  264. .macro KERNEL8x4_E
  265. fmla v16.2d, v4.2d, v12.d[0]
  266. fmla v20.2d, v4.2d, v12.d[1]
  267. fmla v24.2d, v4.2d, v13.d[0]
  268. fmla v28.2d, v4.2d, v13.d[1]
  269. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
  270. fmla v17.2d, v5.2d, v12.d[0]
  271. fmla v25.2d, v5.2d, v13.d[0]
  272. fmla v21.2d, v5.2d, v12.d[1]
  273. fmla v29.2d, v5.2d, v13.d[1]
  274. fmla v18.2d, v6.2d, v12.d[0]
  275. fmla v22.2d, v6.2d, v12.d[1]
  276. fmla v26.2d, v6.2d, v13.d[0]
  277. fmla v30.2d, v6.2d, v13.d[1]
  278. fmla v19.2d, v7.2d, v12.d[0]
  279. fmla v23.2d, v7.2d, v12.d[1]
  280. fmla v27.2d, v7.2d, v13.d[0]
  281. fmla v31.2d, v7.2d, v13.d[1]
  282. .endm
  // KERNEL8x4_SUB: self-contained single k-step for the K%128 remainder
  // loop — loads its own A (v0-v3) and B (v8/v9), accumulates into
  // v16-v31, and advances the pointers. Requires accumulators already
  // initialized (INIT8x4 or the pipelined path).
  283. .macro KERNEL8x4_SUB
  284. ldp q0, q1, [pA]
  285. ldp q8, q9, [pB]
  286. ldp q2, q3, [pA, #32]
  287. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
  288. fmla v16.2d, v0.2d, v8.d[0]
  289. fmla v20.2d, v0.2d, v8.d[1]
  290. fmla v17.2d, v1.2d, v8.d[0]
  291. fmla v21.2d, v1.2d, v8.d[1]
  292. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
  293. fmla v24.2d, v0.2d, v9.d[0]
  294. fmla v28.2d, v0.2d, v9.d[1]
  295. fmla v25.2d, v1.2d, v9.d[0]
  296. fmla v29.2d, v1.2d, v9.d[1]
  297. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
  298. fmla v18.2d, v2.2d, v8.d[0]
  299. fmla v22.2d, v2.2d, v8.d[1]
  300. fmla v26.2d, v2.2d, v9.d[0]
  301. fmla v30.2d, v2.2d, v9.d[1]
  302. add pB, pB, #32
  303. add pA, pA, #64
  304. fmla v19.2d, v3.2d, v8.d[0]
  305. fmla v27.2d, v3.2d, v9.d[0]
  306. fmla v31.2d, v3.2d, v9.d[1]
  307. fmla v23.2d, v3.2d, v8.d[1]
  308. .endm
  // SAVE8x4: writes the 8x4 result tile back to C as C += alpha * acc.
  // alpha lives in x17 across the kernel (v10 is free for reuse); it is
  // moved into d10/alphaV0 here for the by-element fmla. Loads of the
  // next row's C values are interleaved with the multiply-add/store of
  // the previous row to overlap memory and FP work. Each pCRowN is
  // advanced by 64 bytes (8 doubles) for the next tile in the row.
  309. .macro SAVE8x4
  310. fmov alpha0, alpha
  311. ldr q0, [pCRow0]
  312. ldr q1, [pCRow0, #16]
  313. ldr q2, [pCRow0, #32]
  314. ldr q3, [pCRow0, #48]
  315. ldr q4, [pCRow1]
  316. ldr q5, [pCRow1, #16]
  317. ldr q6, [pCRow1, #32]
  318. ldr q7, [pCRow1, #48]
  319. fmla v0.2d, v16.2d, alphaV0
  320. fmla v1.2d, v17.2d, alphaV0
  321. stp q0, q1, [pCRow0]
  322. fmla v2.2d, v18.2d, alphaV0
  323. fmla v3.2d, v19.2d, alphaV0
  324. stp q2, q3, [pCRow0, #32]
  325. ldr q0, [pCRow2] // row 2 C values reuse v0-v3 after row 0 is stored
  326. ldr q1, [pCRow2, #16]
  327. fmla v4.2d, v20.2d, alphaV0
  328. fmla v5.2d, v21.2d, alphaV0
  329. stp q4, q5, [pCRow1]
  330. ldr q2, [pCRow2, #32]
  331. ldr q3, [pCRow2, #48]
  332. fmla v6.2d, v22.2d, alphaV0
  333. fmla v7.2d, v23.2d, alphaV0
  334. stp q6, q7, [pCRow1, #32]
  335. ldr q4, [pCRow3] // row 3 C values reuse v4-v7
  336. ldr q5, [pCRow3, #16]
  337. fmla v0.2d, v24.2d, alphaV0
  338. fmla v1.2d, v25.2d, alphaV0
  339. stp q0, q1, [pCRow2]
  340. ldr q6, [pCRow3, #32]
  341. ldr q7, [pCRow3, #48]
  342. fmla v2.2d, v26.2d, alphaV0
  343. fmla v3.2d, v27.2d, alphaV0
  344. stp q2, q3, [pCRow2, #32]
  345. fmla v4.2d, v28.2d, alphaV0
  346. fmla v5.2d, v29.2d, alphaV0
  347. stp q4, q5, [pCRow3]
  348. fmla v6.2d, v30.2d, alphaV0
  349. fmla v7.2d, v31.2d, alphaV0
  350. stp q6, q7, [pCRow3, #32]
  351. add pCRow0, pCRow0, #64
  352. add pCRow1, pCRow1, #64
  353. add pCRow2, pCRow2, #64
  354. add pCRow3, pCRow3, #64
  355. .endm
  356. /******************************************************************************/
  // 4x4 tile macros (M remainder when 4 <= M%8): accumulators are
  // v16/v17 (col 0), v20/v21 (col 1), v24/v25 (col 2), v28/v29 (col 3).
  357. .macro INIT4x4
  358. fmov d16, xzr
  359. fmov d17, d16
  360. fmov d20, d17
  361. fmov d21, d16
  362. fmov d24, d17
  363. fmov d25, d16
  364. fmov d28, d17
  365. fmov d29, d16
  366. .endm
  // KERNEL4x4_SUB: one k-step — 4 doubles of A (v0/v1) times 4 doubles
  // of B (v8/v9), accumulated by B element. The fmla order alternates
  // v0/v1 destinations to avoid back-to-back dependences on one register.
  367. .macro KERNEL4x4_SUB
  368. ld1 {v8.2d, v9.2d}, [pB]
  369. add pB, pB, #32
  370. ld1 {v0.2d, v1.2d}, [pA]
  371. add pA, pA, #32
  372. fmla v16.2d, v0.2d, v8.d[0]
  373. fmla v29.2d, v1.2d, v9.d[1]
  374. fmla v20.2d, v0.2d, v8.d[1]
  375. fmla v25.2d, v1.2d, v9.d[0]
  376. fmla v24.2d, v0.2d, v9.d[0]
  377. fmla v21.2d, v1.2d, v8.d[1]
  378. fmla v28.2d, v0.2d, v9.d[1]
  379. fmla v17.2d, v1.2d, v8.d[0]
  380. .endm
  // SAVE4x4: C += alpha * acc for each of the four C rows; prefetches
  // ahead in C (C_PRE_SIZE = 128) and advances each row pointer by
  // 32 bytes (4 doubles).
  381. .macro SAVE4x4
  382. fmov alpha0, alpha
  383. ld1 {v8.2d, v9.2d}, [pCRow0]
  384. fmla v8.2d, v16.2d, alphaV0
  385. fmla v9.2d, v17.2d, alphaV0
  386. st1 {v8.2d, v9.2d}, [pCRow0]
  387. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  388. add pCRow0, pCRow0, #32
  389. ld1 {v12.2d, v13.2d}, [pCRow1]
  390. fmla v12.2d, v20.2d, alphaV0
  391. fmla v13.2d, v21.2d, alphaV0
  392. st1 {v12.2d, v13.2d}, [pCRow1]
  393. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  394. add pCRow1, pCRow1, #32
  395. ld1 {v8.2d, v9.2d}, [pCRow2]
  396. fmla v8.2d, v24.2d, alphaV0
  397. fmla v9.2d, v25.2d, alphaV0
  398. st1 {v8.2d, v9.2d}, [pCRow2]
  399. prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
  400. add pCRow2, pCRow2, #32
  401. ld1 {v12.2d, v13.2d}, [pCRow3]
  402. fmla v12.2d, v28.2d, alphaV0
  403. fmla v13.2d, v29.2d, alphaV0
  404. st1 {v12.2d, v13.2d}, [pCRow3]
  405. prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE]
  406. add pCRow3, pCRow3, #32
  407. .endm
  408. /******************************************************************************/
  // 2x4 tile macros (M remainder 2): one v-register accumulator per
  // column of B — v16, v20, v24, v28.
  409. .macro INIT2x4
  410. fmov d16, xzr
  411. fmov d20, d16
  412. fmov d24, d20
  413. fmov d28, d16
  414. .endm
  // KERNEL2x4_SUB: one k-step — 2 doubles of A (v0) times 4 of B (v8/v9).
  415. .macro KERNEL2x4_SUB
  416. ld1 {v8.2d, v9.2d}, [pB]
  417. add pB, pB, #32
  418. ld1 {v0.2d}, [pA]
  419. add pA, pA, #16
  420. fmla v16.2d, v0.2d, v8.d[0]
  421. fmla v20.2d, v0.2d, v8.d[1]
  422. fmla v24.2d, v0.2d, v9.d[0]
  423. fmla v28.2d, v0.2d, v9.d[1]
  424. .endm
  // SAVE2x4: C += alpha * acc, 2 doubles per C row; advance each row by 16 bytes.
  425. .macro SAVE2x4
  426. fmov alpha0, alpha
  427. ld1 {v8.2d}, [pCRow0]
  428. fmla v8.2d, v16.2d, alphaV0
  429. st1 {v8.2d}, [pCRow0]
  430. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  431. add pCRow0, pCRow0, #16
  432. ld1 {v12.2d}, [pCRow1]
  433. fmla v12.2d, v20.2d, alphaV0
  434. st1 {v12.2d}, [pCRow1]
  435. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  436. add pCRow1, pCRow1, #16
  437. ld1 {v8.2d}, [pCRow2]
  438. fmla v8.2d, v24.2d, alphaV0
  439. st1 {v8.2d}, [pCRow2]
  440. prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
  441. add pCRow2, pCRow2, #16
  442. ld1 {v12.2d}, [pCRow3]
  443. fmla v12.2d, v28.2d, alphaV0
  444. st1 {v12.2d}, [pCRow3]
  445. prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE]
  446. add pCRow3, pCRow3, #16
  447. .endm
  448. /******************************************************************************/
  // 1x4 tile macros (M remainder 1): the roles flip — the single A value
  // is broadcast (v0.d[0]) against vectors of B, so v16 holds C[0,0:2]
  // and v20 holds C[0,2:4].
  449. .macro INIT1x4
  450. fmov d16, xzr
  451. fmov d20, d16
  452. .endm
  453. .macro KERNEL1x4_SUB
  454. ldr d0, [pA]
  455. add pA, pA, #8
  456. ld1 {v8.2d, v9.2d}, [pB]
  457. add pB, pB, #32
  458. fmla v16.2d, v8.2d, v0.d[0]
  459. fmla v20.2d, v9.2d, v0.d[0]
  460. .endm
  // SAVE1x4: each accumulator lane belongs to a different C row, so C is
  // gathered/scattered one lane at a time across pCRow0..pCRow3.
  461. .macro SAVE1x4
  462. fmov alpha0, alpha
  463. ld1 {v8.d}[0], [pCRow0]
  464. ld1 {v8.d}[1], [pCRow1]
  465. fmla v8.2d, v16.2d, alphaV0
  466. st1 {v8.d}[0], [pCRow0]
  467. st1 {v8.d}[1], [pCRow1]
  468. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  469. add pCRow0, pCRow0, #8
  470. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  471. add pCRow1, pCRow1, #8
  472. ld1 {v12.d}[0], [pCRow2]
  473. ld1 {v12.d}[1], [pCRow3]
  474. fmla v12.2d, v20.2d, alphaV0
  475. st1 {v12.d}[0], [pCRow2]
  476. st1 {v12.d}[1], [pCRow3]
  477. prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
  478. add pCRow2, pCRow2, #8
  479. prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE]
  480. add pCRow3, pCRow3, #8
  481. .endm
  482. /******************************************************************************/
  // 8x2 tile macros (N remainder 2, full M tile of 8): accumulators
  // v16-v19 (B column 0) and v20-v23 (B column 1).
  483. .macro INIT8x2
  484. fmov d16, xzr
  485. fmov d17, xzr
  486. fmov d18, d16
  487. fmov d19, d17
  488. fmov d20, xzr
  489. fmov d21, d16
  490. fmov d22, d17
  491. fmov d23, d18
  492. .endm
  // KERNEL8x2_SUB: one k-step — 8 doubles of A (v0-v3) times 2 of B (v8).
  493. .macro KERNEL8x2_SUB
  494. ld1 {v0.2d, v1.2d}, [pA]
  495. add pA, pA, #32
  496. ld1 {v8.2d}, [pB]
  497. add pB, pB, #16
  498. ld1 {v2.2d, v3.2d}, [pA]
  499. add pA, pA, #32
  500. fmla v16.2d, v0.2d, v8.d[0]
  501. fmla v17.2d, v1.2d, v8.d[0]
  502. fmla v18.2d, v2.2d, v8.d[0]
  503. fmla v19.2d, v3.2d, v8.d[0]
  504. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
  505. fmla v20.2d, v0.2d, v8.d[1]
  506. fmla v21.2d, v1.2d, v8.d[1]
  507. fmla v22.2d, v2.2d, v8.d[1]
  508. fmla v23.2d, v3.2d, v8.d[1]
  509. .endm
  // SAVE8x2: C += alpha * acc for two C rows of 8 doubles each.
  510. .macro SAVE8x2
  511. fmov alpha0, alpha
  512. ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  513. fmla v0.2d, v16.2d, alphaV0
  514. fmla v1.2d, v17.2d, alphaV0
  515. fmla v2.2d, v18.2d, alphaV0
  516. fmla v3.2d, v19.2d, alphaV0
  517. st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  518. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  519. add pCRow0, pCRow0, #64
  520. ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  521. fmla v4.2d, v20.2d, alphaV0
  522. fmla v5.2d, v21.2d, alphaV0
  523. fmla v6.2d, v22.2d, alphaV0
  524. fmla v7.2d, v23.2d, alphaV0
  525. st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  526. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  527. add pCRow1, pCRow1, #64
  528. .endm
  529. /******************************************************************************/
  // 4x2 tile macros: accumulators v16/v17 (B column 0), v20/v21 (column 1).
  530. .macro INIT4x2
  531. fmov d16, xzr
  532. fmov d17, d16
  533. fmov d20, d17
  534. fmov d21, d16
  535. .endm
  // KERNEL4x2_SUB: one k-step — 4 doubles of A (v0/v1) times 2 of B (v8).
  536. .macro KERNEL4x2_SUB
  537. ld1 {v8.2d}, [pB]
  538. add pB, pB, #16
  539. ld1 {v0.2d, v1.2d}, [pA]
  540. add pA, pA, #32
  541. fmla v16.2d, v0.2d, v8.d[0]
  542. fmla v17.2d, v1.2d, v8.d[0]
  543. fmla v20.2d, v0.2d, v8.d[1]
  544. fmla v21.2d, v1.2d, v8.d[1]
  545. .endm
  // SAVE4x2: C += alpha * acc for two C rows of 4 doubles each.
  546. .macro SAVE4x2
  547. fmov alpha0, alpha
  548. ld1 {v8.2d, v9.2d}, [pCRow0]
  549. fmla v8.2d, v16.2d, alphaV0
  550. fmla v9.2d, v17.2d, alphaV0
  551. st1 {v8.2d, v9.2d}, [pCRow0]
  552. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  553. add pCRow0, pCRow0, #32
  554. ld1 {v12.2d, v13.2d}, [pCRow1]
  555. fmla v12.2d, v20.2d, alphaV0
  556. fmla v13.2d, v21.2d, alphaV0
  557. st1 {v12.2d, v13.2d}, [pCRow1]
  558. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  559. add pCRow1, pCRow1, #32
  560. .endm
  561. /******************************************************************************/
  // 2x2 tile macros: accumulators v16 (B column 0) and v20 (column 1).
  562. .macro INIT2x2
  563. fmov d16, xzr
  564. fmov d20, d16
  565. .endm
  // KERNEL2x2_SUB: one k-step — 2 doubles of A (v0) times 2 of B (v8).
  566. .macro KERNEL2x2_SUB
  567. ld1 {v8.2d}, [pB]
  568. add pB, pB, #16
  569. ld1 {v0.2d}, [pA]
  570. add pA, pA, #16
  571. fmla v16.2d, v0.2d, v8.d[0]
  572. fmla v20.2d, v0.2d, v8.d[1]
  573. .endm
  // SAVE2x2: C += alpha * acc for two C rows of 2 doubles each.
  574. .macro SAVE2x2
  575. fmov alpha0, alpha
  576. ld1 {v8.2d}, [pCRow0]
  577. fmla v8.2d, v16.2d, alphaV0
  578. st1 {v8.2d}, [pCRow0]
  579. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  580. add pCRow0, pCRow0, #16
  581. ld1 {v12.2d}, [pCRow1]
  582. fmla v12.2d, v20.2d, alphaV0
  583. st1 {v12.2d}, [pCRow1]
  584. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  585. add pCRow1, pCRow1, #16
  586. .endm
  587. /******************************************************************************/
  // 1x2 tile macros: the single A value is broadcast against the B pair,
  // so the two lanes of v16 belong to two different C rows.
  588. .macro INIT1x2
  589. fmov d16, xzr
  590. .endm
  591. .macro KERNEL1x2_SUB
  592. ld1 {v8.2d} , [pB]
  593. add pB , pB, #16
  594. ldr d0 , [pA]
  595. add pA, pA, #8
  596. fmla v16.2d, v8.2d, v0.d[0]
  597. .endm
  // SAVE1x2: lane 0 goes to row pCRow0, lane 1 to row pCRow1.
  598. .macro SAVE1x2
  599. fmov alpha0, alpha
  600. ld1 {v8.d}[0], [pCRow0]
  601. ld1 {v8.d}[1], [pCRow1]
  602. fmla v8.2d, v16.2d, alphaV0
  603. st1 {v8.d}[0], [pCRow0]
  604. st1 {v8.d}[1], [pCRow1]
  605. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  606. add pCRow0, pCRow0, #8
  607. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  608. add pCRow1, pCRow1, #8
  609. .endm
  610. /******************************************************************************/
  // 8x1 tile macros (N remainder 1, full M tile): one accumulator pair
  // per 4 doubles of the single output row — v16-v19.
  611. .macro INIT8x1
  612. fmov d16, xzr
  613. fmov d17, xzr
  614. fmov d18, d16
  615. fmov d19, d17
  616. .endm
  // KERNEL8x1_SUB: one k-step — 8 doubles of A (v0-v3) times one B value (d8).
  617. .macro KERNEL8x1_SUB
  618. ld1 {v0.2d, v1.2d}, [pA]
  619. add pA , pA, #32
  620. ldr d8, [pB]
  621. add pB , pB, #8
  622. ld1 {v2.2d, v3.2d}, [pA]
  623. add pA, pA, #32
  624. fmla v16.2d, v0.2d, v8.d[0]
  625. fmla v17.2d, v1.2d, v8.d[0]
  626. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
  627. fmla v18.2d, v2.2d, v8.d[0]
  628. fmla v19.2d, v3.2d, v8.d[0]
  629. .endm
  // SAVE8x1: C += alpha * acc for one C row of 8 doubles.
  630. .macro SAVE8x1
  631. fmov alpha0, alpha
  632. ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  633. fmla v0.2d, v16.2d, alphaV0
  634. fmla v1.2d, v17.2d, alphaV0
  635. fmla v2.2d, v18.2d, alphaV0
  636. fmla v3.2d, v19.2d, alphaV0
  637. st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  638. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  639. add pCRow0, pCRow0, #64
  640. .endm
  641. /******************************************************************************/
  // 4x1 tile macros: accumulators v16/v17 for one output row of 4 doubles.
  642. .macro INIT4x1
  643. fmov d16, xzr
  644. fmov d17, d16
  645. .endm
  // KERNEL4x1_SUB: one k-step — 4 doubles of A (v0/v1) times one B value (d8).
  646. .macro KERNEL4x1_SUB
  647. ldr d8, [pB]
  648. add pB , pB, #8
  649. ld1 {v0.2d, v1.2d}, [pA]
  650. add pA , pA, #32
  651. fmla v16.2d, v0.2d, v8.d[0]
  652. fmla v17.2d, v1.2d, v8.d[0]
  653. .endm
  // SAVE4x1: C += alpha * acc for one C row of 4 doubles.
  654. .macro SAVE4x1
  655. fmov alpha0, alpha
  656. ld1 {v8.2d, v9.2d}, [pCRow0]
  657. fmla v8.2d, v16.2d, alphaV0
  658. fmla v9.2d, v17.2d, alphaV0
  659. st1 {v8.2d, v9.2d}, [pCRow0]
  660. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  661. add pCRow0, pCRow0, #32
  662. .endm
  663. /******************************************************************************/
  // 2x1 tile macros: single vector accumulator v16 for 2 doubles of C.
  664. .macro INIT2x1
  665. fmov d16, xzr
  666. .endm
  // KERNEL2x1_SUB: one k-step — 2 doubles of A (v0) times one B value (d8).
  667. .macro KERNEL2x1_SUB
  668. ldr d8, [pB]
  669. add pB , pB, #8
  670. ld1 {v0.2d}, [pA]
  671. add pA , pA, #16
  672. fmla v16.2d, v0.2d, v8.d[0]
  673. .endm
  // SAVE2x1: C += alpha * acc for one C row of 2 doubles.
  674. .macro SAVE2x1
  675. fmov alpha0, alpha
  676. ld1 {v8.2d}, [pCRow0]
  677. fmla v8.2d, v16.2d, alphaV0
  678. st1 {v8.2d}, [pCRow0]
  679. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  680. add pCRow0, pCRow0, #16
  681. .endm
  682. /******************************************************************************/
  // 1x1 tile macros: pure scalar tail — one C element in d16.
  683. .macro INIT1x1
  684. fmov d16, xzr
  685. .endm
  // KERNEL1x1_SUB: d16 += A[k] * B[k] using scalar fused multiply-add.
  686. .macro KERNEL1x1_SUB
  687. ldr d8, [pB]
  688. add pB , pB, #8
  689. ldr d0, [pA]
  690. add pA , pA, #8
  691. fmadd d16, d0, d8, d16
  692. .endm
  // SAVE1x1: C[0] += alpha * acc (scalar fmadd), advance C by one double.
  693. .macro SAVE1x1
  694. fmov alpha0, alpha
  695. ldr d8, [pCRow0]
  696. fmadd d8, d16, alpha0, d8
  697. str d8, [pCRow0]
  698. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  699. add pCRow0, pCRow0, #8
  700. .endm
  // Unroll wrappers: KERNEL8x4_M1_M2_xN expands to N copies of
  // KERNEL8x4_M1_M2, built by doubling at each level. Since each
  // M1_M2 covers 2 k-steps, _xN covers 2*N k iterations (_x64 = 128,
  // matching the main loop's K/128 trip count).
  701. .macro KERNEL8x4_M1_M2_x1
  702. KERNEL8x4_M1_M2
  703. .endm
  704. .macro KERNEL8x4_M1_M2_x2
  705. KERNEL8x4_M1_M2_x1
  706. KERNEL8x4_M1_M2_x1
  707. .endm
  708. .macro KERNEL8x4_M1_M2_x4
  709. KERNEL8x4_M1_M2_x2
  710. KERNEL8x4_M1_M2_x2
  711. .endm
  712. .macro KERNEL8x4_M1_M2_x8
  713. KERNEL8x4_M1_M2_x4
  714. KERNEL8x4_M1_M2_x4
  715. .endm
  716. .macro KERNEL8x4_M1_M2_x16
  717. KERNEL8x4_M1_M2_x8
  718. KERNEL8x4_M1_M2_x8
  719. .endm
  720. .macro KERNEL8x4_M1_M2_x32
  721. KERNEL8x4_M1_M2_x16
  722. KERNEL8x4_M1_M2_x16
  723. .endm
  724. .macro KERNEL8x4_M1_M2_x64
  725. KERNEL8x4_M1_M2_x32
  726. KERNEL8x4_M1_M2_x32
  727. .endm
  728. /*******************************************************************************
  729. * End of macro definitions
  730. *******************************************************************************/
  731. PROLOGUE
  732. .align 5
  733. add sp, sp, #-(11 * 16)
  734. stp d8, d9, [sp, #(0 * 16)]
  735. stp d10, d11, [sp, #(1 * 16)]
  736. stp d12, d13, [sp, #(2 * 16)]
  737. stp d14, d15, [sp, #(3 * 16)]
  738. stp d16, d17, [sp, #(4 * 16)]
  739. stp x18, x19, [sp, #(5 * 16)]
  740. stp x20, x21, [sp, #(6 * 16)]
  741. stp x22, x23, [sp, #(7 * 16)]
  742. stp x24, x25, [sp, #(8 * 16)]
  743. stp x26, x27, [sp, #(9 * 16)]
  744. str x28, [sp, #(10 * 16)]
  745. prfm PLDL1KEEP, [origPB]
  746. prfm PLDL1KEEP, [origPA]
  747. mov A_PRE_SIZE, #3584
  748. mov B_PRE_SIZE, #512
  749. mov C_PRE_SIZE, #128
  750. add A_PRE_SIZE_64, A_PRE_SIZE, #64
  751. add B_PRE_SIZE_64, B_PRE_SIZE, #64
  752. fmov alpha, d0
  753. lsl LDC, LDC, #3 // ldc = ldc * 8
  754. mov pB, origPB
  755. mov counterJ, origN
  756. asr counterJ, counterJ, #2 // J = J / 4
  757. cmp counterJ, #0
  758. ble .Ldgemm_kernel_L2_BEGIN
  759. /******************************************************************************/
  760. .align 5
  761. .Ldgemm_kernel_L4_BEGIN:
  762. mov pCRow0, pC
  763. add pCRow1, pCRow0, LDC
  764. add pCRow2, pCRow1, LDC
  765. add pCRow3, pCRow2, LDC
  766. add pC, pCRow3, LDC
  767. mov pA, origPA // pA = start of A array
  768. .Ldgemm_kernel_L4_M8_BEGIN:
  769. mov counterI, origM
  770. asr counterI, counterI, #3 // counterI = counterI / 8
  771. cmp counterI, #0
  772. ble .Ldgemm_kernel_L4_M4_BEGIN
  773. .align 5
  774. .Ldgemm_kernel_L4_M8_20:
  775. mov pB, origPB
  776. asr counterL , origK, #7 // L = K / 128
  777. cmp counterL , #2 // is there at least 2 (each unit = 128 K iterations) to do?
  778. blt .Ldgemm_kernel_L4_M8_32
  779. KERNEL8x4_I
  780. KERNEL8x4_M2
  781. KERNEL8x4_M1_M2_x32
  782. KERNEL8x4_M1_M2_x16
  783. KERNEL8x4_M1_M2_x8
  784. KERNEL8x4_M1_M2_x4
  785. KERNEL8x4_M1_M2_x2
  786. KERNEL8x4_M1_M2_x1
  787. subs counterL, counterL, #2 // subtract 2
  788. ble .Ldgemm_kernel_L4_M8_22a
  789. .align 5
  790. .Ldgemm_kernel_L4_M8_22:
  791. KERNEL8x4_M1_M2_x64
  792. subs counterL, counterL, #1
  793. bgt .Ldgemm_kernel_L4_M8_22
  794. .align 5
  795. .Ldgemm_kernel_L4_M8_22a:
  796. KERNEL8x4_M1_M2_x32
  797. KERNEL8x4_M1_M2_x16
  798. KERNEL8x4_M1_M2_x8
  799. KERNEL8x4_M1_M2_x4
  800. KERNEL8x4_M1_M2_x2
  801. KERNEL8x4_M1_M2_x1
  802. KERNEL8x4_M1
  803. KERNEL8x4_E
  804. b .Ldgemm_kernel_L4_M8_44
  805. .align 5
  806. .Ldgemm_kernel_L4_M8_32:
  807. tst counterL, #1
  808. ble .Ldgemm_kernel_L4_M8_40
  809. KERNEL8x4_I
  810. KERNEL8x4_M2
  811. KERNEL8x4_M1_M2_x32
  812. KERNEL8x4_M1_M2_x16
  813. KERNEL8x4_M1_M2_x8
  814. KERNEL8x4_M1_M2_x4
  815. KERNEL8x4_M1_M2_x2
  816. KERNEL8x4_M1
  817. KERNEL8x4_E
  818. b .Ldgemm_kernel_L4_M8_44
  819. .Ldgemm_kernel_L4_M8_40:
  820. INIT8x4
  821. .Ldgemm_kernel_L4_M8_44:
  822. ands counterL , origK, #127
  823. ble .Ldgemm_kernel_L4_M8_100
  824. .align 5
  825. .Ldgemm_kernel_L4_M8_46:
  826. KERNEL8x4_SUB
  827. subs counterL, counterL, #1
  828. bne .Ldgemm_kernel_L4_M8_46
  829. .Ldgemm_kernel_L4_M8_100:
  830. prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
  831. prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
  832. prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
  833. prfm PLDL2KEEP, [pCRow3, C_PRE_SIZE]
  834. prfm PLDL1KEEP, [pA]
  835. prfm PLDL1KEEP, [pA, #64]
  836. prfm PLDL1KEEP, [origPB]
  837. SAVE8x4
  838. .Ldgemm_kernel_L4_M8_END:
  839. subs counterI, counterI, #1
  840. bne .Ldgemm_kernel_L4_M8_20
// M remainder for the 4-column panel: one 4x4 tile if bit 2 of M is set.
841. .Ldgemm_kernel_L4_M4_BEGIN:
842. mov counterI, origM
843. tst counterI , #7
844. ble .Ldgemm_kernel_L4_END // M % 8 == 0: panel finished
845. tst counterI, #4
846. ble .Ldgemm_kernel_L4_M2_BEGIN // bit 2 clear: no 4-row tile
847. .Ldgemm_kernel_L4_M4_20:
848. INIT4x4 // zero the 4x4 accumulators
849. mov pB, origPB
850. asr counterL , origK, #3 // counterL = K / 8
851. cmp counterL , #0
852. ble .Ldgemm_kernel_L4_M4_40
853. .align 5
// Main K loop, unrolled x8 with A/B prefetches interleaved between steps.
854. .Ldgemm_kernel_L4_M4_22:
855. KERNEL4x4_SUB
856. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
857. KERNEL4x4_SUB
858. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
859. KERNEL4x4_SUB
860. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
861. KERNEL4x4_SUB
862. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
863. KERNEL4x4_SUB
864. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
865. KERNEL4x4_SUB
866. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
867. KERNEL4x4_SUB
868. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
869. KERNEL4x4_SUB
870. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
871. subs counterL, counterL, #1
872. bgt .Ldgemm_kernel_L4_M4_22
873. .Ldgemm_kernel_L4_M4_40:
874. ands counterL , origK, #7 // counterL = K % 8
875. ble .Ldgemm_kernel_L4_M4_100
// Tail: remaining K iterations one at a time.
876. .Ldgemm_kernel_L4_M4_42:
877. KERNEL4x4_SUB
878. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
879. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
880. subs counterL, counterL, #1
881. bgt .Ldgemm_kernel_L4_M4_42
882. .Ldgemm_kernel_L4_M4_100:
883. SAVE4x4 // write the finished 4x4 tile back to C
884. .Ldgemm_kernel_L4_M4_END:
// M remainder: one 2x4 tile if bit 1 of M is set.
885. .Ldgemm_kernel_L4_M2_BEGIN:
886. mov counterI, origM
887. tst counterI , #3
888. ble .Ldgemm_kernel_L4_END // M % 4 == 0: panel finished
889. tst counterI, #2 // bit 1 of M set? handle a 2-row tile
890. ble .Ldgemm_kernel_L4_M1_BEGIN
891. .Ldgemm_kernel_L4_M2_20:
892. INIT2x4 // zero the 2x4 accumulators
893. mov pB, origPB
894. asr counterL , origK, #3 // counterL = K / 8
895. cmp counterL , #0
896. ble .Ldgemm_kernel_L4_M2_40
897. .align 5
// Main K loop, unrolled x8 with interleaved prefetches.
898. .Ldgemm_kernel_L4_M2_22:
899. KERNEL2x4_SUB
900. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
901. KERNEL2x4_SUB
902. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
903. KERNEL2x4_SUB
904. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
905. KERNEL2x4_SUB
906. KERNEL2x4_SUB
907. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
908. KERNEL2x4_SUB
909. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
910. KERNEL2x4_SUB
911. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
912. KERNEL2x4_SUB
913. subs counterL, counterL, #1
914. bgt .Ldgemm_kernel_L4_M2_22
915. .Ldgemm_kernel_L4_M2_40:
916. ands counterL , origK, #7 // counterL = K % 8
917. ble .Ldgemm_kernel_L4_M2_100
// Prefetch ahead in A before running the scalar tail.
918. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
919. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
920. .Ldgemm_kernel_L4_M2_42:
921. KERNEL2x4_SUB
922. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
923. subs counterL, counterL, #1
924. bgt .Ldgemm_kernel_L4_M2_42
925. .Ldgemm_kernel_L4_M2_100:
926. SAVE2x4 // write the finished 2x4 tile back to C
927. .Ldgemm_kernel_L4_M2_END:
// M remainder: one final 1x4 row if M is odd.
928. .Ldgemm_kernel_L4_M1_BEGIN:
929. tst counterI, #1 // M odd? (counterI still holds origM here)
930. ble .Ldgemm_kernel_L4_END
931. .Ldgemm_kernel_L4_M1_20:
932. INIT1x4 // zero the 1x4 accumulators
933. mov pB, origPB
934. asr counterL , origK, #3 // counterL = K / 8
935. cmp counterL , #0
936. ble .Ldgemm_kernel_L4_M1_40
937. .align 5
// Main K loop, unrolled x8 with interleaved prefetches.
938. .Ldgemm_kernel_L4_M1_22:
939. KERNEL1x4_SUB
940. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
941. KERNEL1x4_SUB
942. KERNEL1x4_SUB
943. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
944. KERNEL1x4_SUB
945. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
946. KERNEL1x4_SUB
947. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
948. KERNEL1x4_SUB
949. KERNEL1x4_SUB
950. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
951. KERNEL1x4_SUB
952. subs counterL, counterL, #1
953. bgt .Ldgemm_kernel_L4_M1_22
954. .Ldgemm_kernel_L4_M1_40:
955. ands counterL , origK, #7 // counterL = K % 8
956. ble .Ldgemm_kernel_L4_M1_100
957. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
// Tail: remaining K iterations one at a time.
958. .Ldgemm_kernel_L4_M1_42:
959. KERNEL1x4_SUB
960. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
961. subs counterL, counterL, #1
962. bgt .Ldgemm_kernel_L4_M1_42
963. .Ldgemm_kernel_L4_M1_100:
964. SAVE1x4 // write the finished 1x4 row back to C
965. .Ldgemm_kernel_L4_END:
// Advance B past the consumed 4-column panel: K * 4 doubles = K * 32 bytes.
966. lsl temp, origK, #5
967. add origPB, origPB, temp // B = B + K * 4 * 8
968. subs counterJ, counterJ , #1 // j--
969. bgt .Ldgemm_kernel_L4_BEGIN // next 4-column panel
970. /******************************************************************************/
// N remainder: handle a 2-column panel if bit 1 of N is set.
971. .Ldgemm_kernel_L2_BEGIN: // N % 4 remainder (fewer than 4 columns left)
972. mov counterJ , origN
973. tst counterJ , #3
974. ble .Ldgemm_kernel_L999 // N % 4 == 0: all columns done
975. tst counterJ , #2
976. ble .Ldgemm_kernel_L1_BEGIN // bit 1 clear: no 2-column panel
// Two C column pointers for this panel; advance pC past them.
977. mov pCRow0, pC
978. add pCRow1, pCRow0, LDC
979. add pC, pCRow1, LDC
980. mov pA, origPA // pA = A
981. .Ldgemm_kernel_L2_M8_BEGIN:
982. mov counterI, origM
983. asr counterI, counterI, #3 // counterI = M / 8
984. cmp counterI, #0
985. ble .Ldgemm_kernel_L2_M4_BEGIN
986. .align 5
987. .Ldgemm_kernel_L2_M8_20:
988. INIT8x2 // zero the 8x2 accumulators
989. mov pB, origPB
990. asr counterL , origK, #3 // counterL = K / 8
991. cmp counterL,#0
992. ble .Ldgemm_kernel_L2_M8_40
993. .align 5
// Main K loop, unrolled x8.
994. .Ldgemm_kernel_L2_M8_22:
995. KERNEL8x2_SUB
996. KERNEL8x2_SUB
997. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
998. KERNEL8x2_SUB
999. KERNEL8x2_SUB
1000. KERNEL8x2_SUB
1001. KERNEL8x2_SUB
1002. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1003. KERNEL8x2_SUB
1004. KERNEL8x2_SUB
1005. subs counterL, counterL, #1
1006. bgt .Ldgemm_kernel_L2_M8_22
1007. .Ldgemm_kernel_L2_M8_40:
1008. ands counterL , origK, #7 // counterL = K % 8
1009. ble .Ldgemm_kernel_L2_M8_100
1010. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1011. prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
// Tail: remaining K iterations one at a time.
1012. .Ldgemm_kernel_L2_M8_42:
1013. KERNEL8x2_SUB
1014. subs counterL, counterL, #1
1015. bgt .Ldgemm_kernel_L2_M8_42
1016. .Ldgemm_kernel_L2_M8_100:
1017. SAVE8x2 // write the finished 8x2 tile back to C
1018. .Ldgemm_kernel_L2_M8_END:
1019. subs counterI, counterI, #1
1020. bgt .Ldgemm_kernel_L2_M8_20 // next 8-row tile
// M remainder for the 2-column panel: one 4x2 tile if bit 2 of M is set.
1021. .Ldgemm_kernel_L2_M4_BEGIN:
1022. mov counterI, origM
1023. tst counterI , #7
1024. ble .Ldgemm_kernel_L2_END // M % 8 == 0: panel finished
1025. tst counterI, #4 // bit 2 of M set? handle a 4-row tile
1026. ble .Ldgemm_kernel_L2_M2_BEGIN
1027. .Ldgemm_kernel_L2_M4_20:
1028. INIT4x2 // zero the 4x2 accumulators
1029. mov pB, origPB
1030. asr counterL , origK, #3 // counterL = K / 8
1031. cmp counterL,#0
1032. ble .Ldgemm_kernel_L2_M4_40
1033. .align 5
// Main K loop, unrolled x8 with interleaved prefetches.
1034. .Ldgemm_kernel_L2_M4_22:
1035. KERNEL4x2_SUB
1036. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1037. KERNEL4x2_SUB
1038. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1039. KERNEL4x2_SUB
1040. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1041. KERNEL4x2_SUB
1042. KERNEL4x2_SUB
1043. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1044. KERNEL4x2_SUB
1045. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1046. KERNEL4x2_SUB
1047. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1048. KERNEL4x2_SUB
1049. subs counterL, counterL, #1
1050. bgt .Ldgemm_kernel_L2_M4_22
1051. .Ldgemm_kernel_L2_M4_40:
1052. ands counterL , origK, #7 // counterL = K % 8
1053. ble .Ldgemm_kernel_L2_M4_100
1054. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1055. prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
// Tail: remaining K iterations one at a time.
1056. .Ldgemm_kernel_L2_M4_42:
1057. KERNEL4x2_SUB
1058. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1059. subs counterL, counterL, #1
1060. bgt .Ldgemm_kernel_L2_M4_42
1061. .Ldgemm_kernel_L2_M4_100:
1062. SAVE4x2 // write the finished 4x2 tile back to C
1063. .Ldgemm_kernel_L2_M4_END:
// M remainder: one 2x2 tile if bit 1 of M is set.
1064. .Ldgemm_kernel_L2_M2_BEGIN:
1065. mov counterI, origM
1066. tst counterI , #3
1067. ble .Ldgemm_kernel_L2_END // M % 4 == 0: panel finished
1068. tst counterI, #2 // bit 1 of M set? handle a 2-row tile
1069. ble .Ldgemm_kernel_L2_M1_BEGIN
1070. .Ldgemm_kernel_L2_M2_20:
1071. INIT2x2 // zero the 2x2 accumulators
1072. mov pB, origPB
1073. asr counterL , origK, #3 // counterL = K / 8
1074. cmp counterL,#0
1075. ble .Ldgemm_kernel_L2_M2_40
// Main K loop, unrolled x8.
1076. .Ldgemm_kernel_L2_M2_22:
1077. KERNEL2x2_SUB
1078. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1079. KERNEL2x2_SUB
1080. KERNEL2x2_SUB
1081. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1082. KERNEL2x2_SUB
1083. KERNEL2x2_SUB
1084. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1085. KERNEL2x2_SUB
1086. KERNEL2x2_SUB
1087. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1088. KERNEL2x2_SUB
1089. subs counterL, counterL, #1
1090. bgt .Ldgemm_kernel_L2_M2_22
// After the unrolled loop (skipped entirely when K < 8), prefetch ahead
// in A and B before the scalar tail.
1091. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1092. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
1093. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1094. prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
1095. .Ldgemm_kernel_L2_M2_40:
1096. ands counterL , origK, #7 // counterL = K % 8
1097. ble .Ldgemm_kernel_L2_M2_100
// Tail: remaining K iterations one at a time.
1098. .Ldgemm_kernel_L2_M2_42:
1099. KERNEL2x2_SUB
1100. subs counterL, counterL, #1
1101. bgt .Ldgemm_kernel_L2_M2_42
1102. .Ldgemm_kernel_L2_M2_100:
1103. SAVE2x2 // write the finished 2x2 tile back to C
1104. .Ldgemm_kernel_L2_M2_END:
// M remainder: one final 1x2 row if M is odd, then finish the 2-column panel.
1105. .Ldgemm_kernel_L2_M1_BEGIN:
1106. tst counterI, #1 // M odd? (counterI still holds origM here)
1107. ble .Ldgemm_kernel_L2_END
1108. .Ldgemm_kernel_L2_M1_20:
1109. INIT1x2 // zero the 1x2 accumulators
1110. mov pB, origPB
1111. asr counterL , origK, #3 // counterL = K / 8
1112. cmp counterL, #0
1113. ble .Ldgemm_kernel_L2_M1_40
// Main K loop, unrolled x8.
1114. .Ldgemm_kernel_L2_M1_22:
1115. KERNEL1x2_SUB
1116. KERNEL1x2_SUB
1117. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1118. KERNEL1x2_SUB
1119. KERNEL1x2_SUB
1120. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1121. KERNEL1x2_SUB
1122. KERNEL1x2_SUB
1123. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1124. KERNEL1x2_SUB
1125. KERNEL1x2_SUB
1126. subs counterL, counterL, #1
1127. bgt .Ldgemm_kernel_L2_M1_22
// Post-loop prefetches (skipped when K < 8) ahead of the scalar tail.
1128. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1129. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1130. prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
1131. .Ldgemm_kernel_L2_M1_40:
1132. ands counterL , origK, #7 // counterL = K % 8
1133. ble .Ldgemm_kernel_L2_M1_100
// Tail: remaining K iterations one at a time.
1134. .Ldgemm_kernel_L2_M1_42:
1135. KERNEL1x2_SUB
1136. subs counterL, counterL, #1
1137. bgt .Ldgemm_kernel_L2_M1_42
1138. .Ldgemm_kernel_L2_M1_100:
1139. SAVE1x2 // write the finished 1x2 row back to C
1140. .Ldgemm_kernel_L2_END:
// Advance B past the consumed 2-column panel: K * 2 doubles = K * 16 bytes.
1141. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
1142. /******************************************************************************/
// N remainder: handle the final single column if N is odd.
1143. .Ldgemm_kernel_L1_BEGIN:
1144. mov counterJ , origN
1145. tst counterJ , #1
1146. ble .Ldgemm_kernel_L999 // N even: done
1147. mov pCRow0, pC // pCRow0 = C
1148. add pC , pC , LDC // Update pC to point to next
1149. mov pA, origPA // pA = A
1150. .Ldgemm_kernel_L1_M8_BEGIN:
1151. mov counterI, origM
1152. asr counterI, counterI, #3 // counterI = M / 8
1153. cmp counterI, #0
1154. ble .Ldgemm_kernel_L1_M4_BEGIN
1155. .align 5
1156. .Ldgemm_kernel_L1_M8_20:
1157. INIT8x1 // zero the 8x1 accumulators
1158. mov pB, origPB
1159. asr counterL , origK, #3 // counterL = K / 8
1160. cmp counterL , #0
1161. ble .Ldgemm_kernel_L1_M8_40
1162. .align 5
// Main K loop, unrolled x8.
1163. .Ldgemm_kernel_L1_M8_22:
1164. KERNEL8x1_SUB
1165. KERNEL8x1_SUB
1166. KERNEL8x1_SUB
1167. KERNEL8x1_SUB
1168. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1169. KERNEL8x1_SUB
1170. KERNEL8x1_SUB
1171. KERNEL8x1_SUB
1172. KERNEL8x1_SUB
1173. subs counterL, counterL, #1
1174. bgt .Ldgemm_kernel_L1_M8_22
1175. .Ldgemm_kernel_L1_M8_40:
1176. ands counterL , origK, #7 // counterL = K % 8
1177. ble .Ldgemm_kernel_L1_M8_100
1178. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
// Tail: remaining K iterations one at a time.
1179. .Ldgemm_kernel_L1_M8_42:
1180. KERNEL8x1_SUB
1181. subs counterL, counterL, #1
1182. bgt .Ldgemm_kernel_L1_M8_42
1183. .Ldgemm_kernel_L1_M8_100:
1184. SAVE8x1 // write the finished 8x1 tile back to C
1185. .Ldgemm_kernel_L1_M8_END:
1186. subs counterI, counterI, #1
1187. bgt .Ldgemm_kernel_L1_M8_20 // next 8-row tile
// M remainder for the single column: one 4x1 tile if bit 2 of M is set.
1188. .Ldgemm_kernel_L1_M4_BEGIN:
1189. mov counterI, origM
1190. tst counterI , #7
1191. ble .Ldgemm_kernel_L1_END // M % 8 == 0: column finished
1192. tst counterI, #4 // bit 2 of M set? handle a 4-row tile
1193. ble .Ldgemm_kernel_L1_M2_BEGIN
1194. .Ldgemm_kernel_L1_M4_20:
1195. INIT4x1 // zero the 4x1 accumulators
1196. mov pB, origPB
1197. asr counterL , origK, #3 // counterL = K / 8
1198. cmp counterL , #0
1199. ble .Ldgemm_kernel_L1_M4_40
1200. .align 5
// Main K loop, unrolled x8 with interleaved prefetches.
1201. .Ldgemm_kernel_L1_M4_22:
1202. KERNEL4x1_SUB
1203. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1204. KERNEL4x1_SUB
1205. KERNEL4x1_SUB
1206. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1207. KERNEL4x1_SUB
1208. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1209. KERNEL4x1_SUB
1210. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1211. KERNEL4x1_SUB
1212. KERNEL4x1_SUB
1213. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1214. KERNEL4x1_SUB
1215. subs counterL, counterL, #1
1216. bgt .Ldgemm_kernel_L1_M4_22
1217. .Ldgemm_kernel_L1_M4_40:
1218. ands counterL , origK, #7 // counterL = K % 8
1219. ble .Ldgemm_kernel_L1_M4_100
1220. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
// Tail: remaining K iterations one at a time.
1221. .Ldgemm_kernel_L1_M4_42:
1222. KERNEL4x1_SUB
1223. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1224. subs counterL, counterL, #1
1225. bgt .Ldgemm_kernel_L1_M4_42
1226. .Ldgemm_kernel_L1_M4_100:
1227. SAVE4x1 // write the finished 4x1 tile back to C
1228. .Ldgemm_kernel_L1_M4_END:
// M remainder: one 2x1 tile if bit 1 of M is set.
1229. .Ldgemm_kernel_L1_M2_BEGIN:
1230. mov counterI, origM
1231. tst counterI , #3
1232. ble .Ldgemm_kernel_L1_END // M % 4 == 0: column finished
1233. tst counterI, #2 // bit 1 of M set? handle a 2-row tile
1234. ble .Ldgemm_kernel_L1_M1_BEGIN
1235. .Ldgemm_kernel_L1_M2_20:
1236. INIT2x1 // zero the 2x1 accumulators
1237. mov pB, origPB
1238. asr counterL , origK, #3 // counterL = K / 8
1239. cmp counterL , #0
1240. ble .Ldgemm_kernel_L1_M2_40
// Main K loop, unrolled x8.
1241. .Ldgemm_kernel_L1_M2_22:
1242. KERNEL2x1_SUB
1243. KERNEL2x1_SUB
1244. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1245. KERNEL2x1_SUB
1246. KERNEL2x1_SUB
1247. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1248. KERNEL2x1_SUB
1249. KERNEL2x1_SUB
1250. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1251. KERNEL2x1_SUB
1252. KERNEL2x1_SUB
1253. subs counterL, counterL, #1
1254. bgt .Ldgemm_kernel_L1_M2_22
// Post-loop prefetches (skipped when K < 8) ahead of the scalar tail.
1255. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1256. prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
1257. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1258. .Ldgemm_kernel_L1_M2_40:
1259. ands counterL , origK, #7 // counterL = K % 8
1260. ble .Ldgemm_kernel_L1_M2_100
// Tail: remaining K iterations one at a time.
1261. .Ldgemm_kernel_L1_M2_42:
1262. KERNEL2x1_SUB
1263. subs counterL, counterL, #1
1264. bgt .Ldgemm_kernel_L1_M2_42
1265. .Ldgemm_kernel_L1_M2_100:
1266. SAVE2x1 // write the finished 2x1 tile back to C
1267. .Ldgemm_kernel_L1_M2_END:
// M remainder: the final 1x1 element if M is odd.
1268. .Ldgemm_kernel_L1_M1_BEGIN:
1269. tst counterI, #1 // M odd? (counterI still holds origM here)
1270. ble .Ldgemm_kernel_L1_END
1271. .Ldgemm_kernel_L1_M1_20:
1272. INIT1x1 // zero the scalar accumulator
1273. mov pB, origPB
1274. asr counterL , origK, #3 // counterL = K / 8
1275. cmp counterL , #0
1276. ble .Ldgemm_kernel_L1_M1_40
// Main K loop, unrolled x8.
1277. .Ldgemm_kernel_L1_M1_22:
1278. KERNEL1x1_SUB
1279. KERNEL1x1_SUB
1280. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1281. KERNEL1x1_SUB
1282. KERNEL1x1_SUB
1283. KERNEL1x1_SUB
1284. KERNEL1x1_SUB
1285. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
1286. KERNEL1x1_SUB
1287. KERNEL1x1_SUB
1288. subs counterL, counterL, #1
1289. bgt .Ldgemm_kernel_L1_M1_22
1290. .Ldgemm_kernel_L1_M1_40:
1291. ands counterL , origK, #7 // counterL = K % 8
1292. ble .Ldgemm_kernel_L1_M1_100
1293. prfm PLDL1KEEP, [pA, A_PRE_SIZE]
1294. prfm PLDL1KEEP, [pB, B_PRE_SIZE]
// Tail: remaining K iterations one at a time.
1295. .Ldgemm_kernel_L1_M1_42:
1296. KERNEL1x1_SUB
1297. subs counterL, counterL, #1
1298. bgt .Ldgemm_kernel_L1_M1_42
1299. .Ldgemm_kernel_L1_M1_100:
1300. SAVE1x1 // write the final element back to C
1301. .Ldgemm_kernel_L1_END:
// Function exit: restore saved registers and return 0.
// Restore order/offsets presumably mirror the prologue's stp/str sequence,
// which is earlier in the file — not visible here; TODO confirm against it.
1302. .Ldgemm_kernel_L999:
1303. mov x0, #0 // set return value
// d8-d15 are callee-saved (low 64 bits) under AAPCS64; d16/d17 are saved
// here as well to match whatever the prologue stored.
1304. ldp d8, d9, [sp, #(0 * 16)]
1305. ldp d10, d11, [sp, #(1 * 16)]
1306. ldp d12, d13, [sp, #(2 * 16)]
1307. ldp d14, d15, [sp, #(3 * 16)]
1308. ldp d16, d17, [sp, #(4 * 16)]
// NOTE(review): x18 is the AAPCS64 platform-reserved register; it is
// restored here because it was saved in the prologue — verify this is
// intended on targets where x18 is owned by the platform.
1309. ldp x18, x19, [sp, #(5 * 16)]
1310. ldp x20, x21, [sp, #(6 * 16)]
1311. ldp x22, x23, [sp, #(7 * 16)]
1312. ldp x24, x25, [sp, #(8 * 16)]
1313. ldp x26, x27, [sp, #(9 * 16)]
1314. ldr x28, [sp, #(10 * 16)]
1315. add sp, sp, #(11*16) // pop the 176-byte spill frame
1316. ret
1317. EPILOGUE