You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_8x4_thunderx2t99.S 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175
  1. /*******************************************************************************
  2. Copyright (c) 2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
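  29. /* x0 x1 x2 s0 x3 x4 x5 x6 */
  30. /* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc) */
  30. /* NOTE(review): this kernel keeps separate alphaR/alphaI values, so alpha presumably arrives as two floats (s0, s1) - confirm against the function prologue. */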
  /* Incoming arguments, named after the CNAME signature comment. */
  #define origM           x0
  #define origN           x1
  #define origK           x2
  #define origPA          x3
  #define origPB          x4
  #define pC              x5
  #define LDC             x6

  /* Scratch register and loop counters. */
  #define temp            x7
  #define counterL        x8
  #define counterI        x9
  #define counterJ        x10

  /* Working pointers: packed B, four C row pointers, packed A. */
  #define pB              x11
  #define pCRow0          x12
  #define pCRow1          x13
  #define pCRow2          x14
  #define pCRow3          x15
  #define pA              x16

  /*
   * alpha is held in general registers (alphaR, alphaI) and moved by the
   * SAVE macros into s10 / s11, where it is used as lane 0 of v10 / v11.
   */
  #define alphaR          w17
  #define alphaI          w19
  #define alpha0_R        s10
  #define alphaV0_R       v10.s[0]
  #define alpha0_I        s11
  #define alphaV0_I       v11.s[0]

  /* Byte offsets added to pA, pB and pCRowN by the prfm instructions. */
  #define A_PRE_SIZE      2560
  #define B_PRE_SIZE      448
  #define C_PRE_SIZE      128
  /*
   * Select the accumulate instruction for each of the four partial products
   * of a complex multiply a*b, where a comes from A and b from B:
   *
   *   OP_rr : a_r * b_r  -> real accumulator
   *   OP_ii : a_i * b_i  -> real accumulator
   *   OP_ri : a_r * b_i  -> imaginary accumulator
   *   OP_ir : a_i * b_r  -> imaginary accumulator
   *
   * fmla adds the product, fmls subtracts it.  The four groups give:
   *   NN/NT/TN/TT : a * b
   *   NR/NC/TR/TC : a * conj(b)
   *   RN/RT/CN/CT : conj(a) * b
   *   RR/RC/CR/CC : conj(a) * conj(b)
   *
   * Exactly one variant macro must be defined by the build; otherwise the
   * OP_* names would reach the assembler unexpanded, so fail early with a
   * clear message instead.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define OP_rr           fmla
  #define OP_ii           fmls
  #define OP_ri           fmla
  #define OP_ir           fmla
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define OP_rr           fmla
  #define OP_ii           fmla
  #define OP_ri           fmls
  #define OP_ir           fmla
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define OP_rr           fmla
  #define OP_ii           fmla
  #define OP_ri           fmla
  #define OP_ir           fmls
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define OP_rr           fmla
  #define OP_ii           fmls
  #define OP_ri           fmls
  #define OP_ir           fmls
  #else
  #error "cgemm kernel: no transpose/conjugate variant macro (NN, NT, ..., CC) is defined"
  #endif
  78. // 00 origM
  79. // 01 origN
  80. // 02 origK
  81. // 03 origPA
  82. // 04 origPB
  83. // 05 pC
  84. // 06 origLDC -> LDC
  85. // 07 offset -> temp
  86. // 08 counterL
  87. // 09 counterI
  88. // 10 counterJ
  89. // 11 pB
  90. // 12 pCRow0
  91. // 13 pCRow1
  92. // 14 pCRow2
  93. // 15 pCRow3
  94. // 16 pA
  95. // 17 alphaR (used as w17)
  96. // 18 must save
  97. // 19 must save, alphaI (used as w19)
  98. // 20 must save
  99. // 21 must save
  100. // 22 must save
  101. // 23 must save
  102. // 24 must save
  103. // 25 must save
  104. // 26 must save
  105. // 27 must save
  106. // 28 must save
  107. // 29 frame
  108. // 30 link
  109. // 31 sp
  110. //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
  111. //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
  112. //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
  113. //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
  114. //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
  115. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
  116. //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
  117. //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
  118. //v08 must save pB0_00_R, pB0_01_R
  119. //v09 must save pB0_00_I, pB0_01_I
  120. //v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R
  121. //v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I
  122. //v12 must save pB1_00_R, pB1_01_R
  123. //v13 must save pB1_00_I, pB1_01_I
  124. //v14 must save pB1_02_R, pB1_03_R
  125. //v15 must save pB1_02_I, pB1_03_I
  126. //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
  127. //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
  128. //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
  129. //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
  130. //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
  131. //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
  132. //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
  133. //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
  134. //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
  135. //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
  136. //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
  137. //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
  138. //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
  139. //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
  140. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
  141. //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
  142. /*******************************************************************************
  143. * Macro definitions
  144. *******************************************************************************/
  /*
   * INIT8x4
   * Clear the sixteen accumulators v16..v31 used by the 8x4 tile.
   * Each fmov writes the scalar view sN; on AArch64 a scalar write also
   * zeroes the remaining bits of the vector register.  Some registers are
   * cleared from wzr, the others by copying an already-cleared register.
   * Clobbers: v16-v31.
   */
  .macro INIT8x4
          fmov    s16, wzr
          fmov    s17, wzr
          fmov    s18, wzr
          fmov    s19, s16

          fmov    s20, wzr
          fmov    s21, s16
          fmov    s22, s17
          fmov    s23, s18

          fmov    s24, wzr
          fmov    s25, s16
          fmov    s26, s17
          fmov    s27, s18

          fmov    s28, wzr
          fmov    s29, s16
          fmov    s30, s17
          fmov    s31, s18
  .endm
  /*
   * KERNEL8x4_I
   * Opening step of the 8x4 tile: the accumulators v16..v31 are written
   * (fmul, or eor + fmls where the first imaginary product is subtracted)
   * instead of being accumulated into, so no prior INIT is needed.
   *
   * Operands consumed here : A in v0/v1 and v2/v3 (ld2 splits each pair into
   *                          the two interleaved streams), B in v8 and v10
   *                          (s[0],s[1] = first complex, s[2],s[3] = second).
   * Operands only loaded   : v4/v5, v6/v7, v12, v14 (not read in this macro).
   * Pointer side effects   : pA advances 128 bytes, pB advances 64 bytes.
   *
   * The eor/fmls branch is taken for the variants in which OP_ri is fmls.
   */
  .macro KERNEL8x4_I
          ldr     q8, [pB]
          add     pB, pB, #16
          ld2     {v0.4s, v1.4s}, [pA]
          add     pA, pA, #32
          ld2     {v2.4s, v3.4s}, [pA]
          add     pA, pA, #32

          // v16/v17 : A elements 0-3 times first B value
          fmul    v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v8.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v17.16b, v17.16b, v17.16b
          fmls    v17.4s, v0.4s, v8.s[1]
  #else
          fmul    v17.4s, v0.4s, v8.s[1]
  #endif
          OP_ir   v17.4s, v1.4s, v8.s[0]

          ldr     q10, [pB]
          add     pB, pB, #16

          // v18/v19 : A elements 4-7 times first B value
          fmul    v18.4s, v2.4s, v8.s[0]
          OP_ii   v18.4s, v3.4s, v8.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v19.16b, v19.16b, v19.16b
          fmls    v19.4s, v2.4s, v8.s[1]
  #else
          fmul    v19.4s, v2.4s, v8.s[1]
  #endif
          OP_ir   v19.4s, v3.4s, v8.s[0]

          ldr     q12, [pB]
          add     pB, pB, #16

          // v20/v21 : A elements 0-3 times second B value
          fmul    v20.4s, v0.4s, v8.s[2]
          OP_ii   v20.4s, v1.4s, v8.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v21.16b, v21.16b, v21.16b
          fmls    v21.4s, v0.4s, v8.s[3]
  #else
          fmul    v21.4s, v0.4s, v8.s[3]
  #endif
          OP_ir   v21.4s, v1.4s, v8.s[2]

          ldr     q14, [pB]
          add     pB, pB, #16

          // v22/v23 : A elements 4-7 times second B value
          fmul    v22.4s, v2.4s, v8.s[2]
          OP_ii   v22.4s, v3.4s, v8.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v23.16b, v23.16b, v23.16b
          fmls    v23.4s, v2.4s, v8.s[3]
  #else
          fmul    v23.4s, v2.4s, v8.s[3]
  #endif
          OP_ir   v23.4s, v3.4s, v8.s[2]

          ld2     {v4.4s, v5.4s}, [pA]
          add     pA, pA, #32

          // v24/v25 : A elements 0-3 times third B value
          fmul    v24.4s, v0.4s, v10.s[0]
          OP_ii   v24.4s, v1.4s, v10.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v25.16b, v25.16b, v25.16b
          fmls    v25.4s, v0.4s, v10.s[1]
  #else
          fmul    v25.4s, v0.4s, v10.s[1]
  #endif
          OP_ir   v25.4s, v1.4s, v10.s[0]

          ld2     {v6.4s, v7.4s}, [pA]
          add     pA, pA, #32

          // v26/v27 : A elements 4-7 times third B value
          fmul    v26.4s, v2.4s, v10.s[0]
          OP_ii   v26.4s, v3.4s, v10.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v27.16b, v27.16b, v27.16b
          fmls    v27.4s, v2.4s, v10.s[1]
  #else
          fmul    v27.4s, v2.4s, v10.s[1]
  #endif
          OP_ir   v27.4s, v3.4s, v10.s[0]

          prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

          // v28/v29 : A elements 0-3 times fourth B value
          fmul    v28.4s, v0.4s, v10.s[2]
          OP_ii   v28.4s, v1.4s, v10.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v29.16b, v29.16b, v29.16b
          fmls    v29.4s, v0.4s, v10.s[3]
  #else
          fmul    v29.4s, v0.4s, v10.s[3]
  #endif
          OP_ir   v29.4s, v1.4s, v10.s[2]

          prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

          // v30/v31 : A elements 4-7 times fourth B value
          fmul    v30.4s, v2.4s, v10.s[2]
          OP_ii   v30.4s, v3.4s, v10.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v31.16b, v31.16b, v31.16b
          fmls    v31.4s, v2.4s, v10.s[3]
  #else
          fmul    v31.4s, v2.4s, v10.s[3]
  #endif
          OP_ir   v31.4s, v3.4s, v10.s[2]
  .endm
  /*
   * KERNEL8x4_M1
   * One accumulation step of the 8x4 tile using the first operand set:
   *   reads  A in v0-v3 and B in v8, v10;
   *   loads  A into v4-v7 and B into v12, v14 (the set KERNEL8x4_M2 and
   *          KERNEL8x4_E read).
   * pA advances 64 bytes, pB advances 32 bytes.  Two prefetches are issued
   * ahead of pA.
   */
  .macro KERNEL8x4_M1
          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v8.s[1]
          OP_ri   v17.4s, v0.4s, v8.s[1]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          ldr     q12, [pB]
          add     pB, pB, #16

          OP_rr   v18.4s, v2.4s, v8.s[0]
          OP_ii   v18.4s, v3.4s, v8.s[1]
          OP_ri   v19.4s, v2.4s, v8.s[1]
          OP_ir   v19.4s, v3.4s, v8.s[0]

          ld2     {v4.4s, v5.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v20.4s, v0.4s, v8.s[2]
          OP_ii   v20.4s, v1.4s, v8.s[3]
          OP_ri   v21.4s, v0.4s, v8.s[3]
          OP_ir   v21.4s, v1.4s, v8.s[2]

          ld2     {v6.4s, v7.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v22.4s, v2.4s, v8.s[2]
          OP_ii   v22.4s, v3.4s, v8.s[3]
          OP_ri   v23.4s, v2.4s, v8.s[3]
          OP_ir   v23.4s, v3.4s, v8.s[2]

          ldr     q14, [pB]
          add     pB, pB, #16

          OP_rr   v24.4s, v0.4s, v10.s[0]
          OP_ii   v24.4s, v1.4s, v10.s[1]
          OP_ri   v25.4s, v0.4s, v10.s[1]
          OP_ir   v25.4s, v1.4s, v10.s[0]

          prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

          OP_rr   v26.4s, v2.4s, v10.s[0]
          OP_ii   v26.4s, v3.4s, v10.s[1]
          OP_ri   v27.4s, v2.4s, v10.s[1]
          OP_ir   v27.4s, v3.4s, v10.s[0]

          prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

          OP_rr   v28.4s, v0.4s, v10.s[2]
          OP_ii   v28.4s, v1.4s, v10.s[3]
          OP_ri   v29.4s, v0.4s, v10.s[3]
          OP_ir   v29.4s, v1.4s, v10.s[2]

          OP_rr   v30.4s, v2.4s, v10.s[2]
          OP_ii   v30.4s, v3.4s, v10.s[3]
          OP_ri   v31.4s, v2.4s, v10.s[3]
          OP_ir   v31.4s, v3.4s, v10.s[2]
  .endm
  /*
   * KERNEL8x4_M2
   * One accumulation step of the 8x4 tile using the second operand set:
   *   reads  A in v4-v7 and B in v12, v14;
   *   loads  A into v0-v3 and B into v8, v10 (the set KERNEL8x4_M1 reads).
   * pA advances 64 bytes, pB advances 32 bytes.  One prefetch is issued
   * ahead of pB.
   */
  .macro KERNEL8x4_M2
          OP_rr   v16.4s, v4.4s, v12.s[0]
          OP_ii   v16.4s, v5.4s, v12.s[1]
          OP_ri   v17.4s, v4.4s, v12.s[1]
          OP_ir   v17.4s, v5.4s, v12.s[0]

          ldr     q8, [pB]
          add     pB, pB, #16

          OP_rr   v18.4s, v6.4s, v12.s[0]
          OP_ii   v18.4s, v7.4s, v12.s[1]
          OP_ri   v19.4s, v6.4s, v12.s[1]
          OP_ir   v19.4s, v7.4s, v12.s[0]

          ld2     {v0.4s, v1.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v20.4s, v4.4s, v12.s[2]
          OP_ii   v20.4s, v5.4s, v12.s[3]
          OP_ri   v21.4s, v4.4s, v12.s[3]
          OP_ir   v21.4s, v5.4s, v12.s[2]

          ld2     {v2.4s, v3.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v22.4s, v6.4s, v12.s[2]
          OP_ii   v22.4s, v7.4s, v12.s[3]
          OP_ri   v23.4s, v6.4s, v12.s[3]
          OP_ir   v23.4s, v7.4s, v12.s[2]

          ldr     q10, [pB]
          add     pB, pB, #16

          OP_rr   v24.4s, v4.4s, v14.s[0]
          OP_ii   v24.4s, v5.4s, v14.s[1]
          OP_ri   v25.4s, v4.4s, v14.s[1]
          OP_ir   v25.4s, v5.4s, v14.s[0]

          prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

          OP_rr   v26.4s, v6.4s, v14.s[0]
          OP_ii   v26.4s, v7.4s, v14.s[1]
          OP_ri   v27.4s, v6.4s, v14.s[1]
          OP_ir   v27.4s, v7.4s, v14.s[0]

          OP_rr   v28.4s, v4.4s, v14.s[2]
          OP_ii   v28.4s, v5.4s, v14.s[3]
          OP_ri   v29.4s, v4.4s, v14.s[3]
          OP_ir   v29.4s, v5.4s, v14.s[2]

          OP_rr   v30.4s, v6.4s, v14.s[2]
          OP_ii   v30.4s, v7.4s, v14.s[3]
          OP_ri   v31.4s, v6.4s, v14.s[3]
          OP_ir   v31.4s, v7.4s, v14.s[2]
  .endm
  /*
   * KERNEL8x4_E
   * Accumulation step of the 8x4 tile that loads nothing: it reads A in
   * v4-v7 and B in v12, v14 and leaves pA and pB unchanged.  The only memory
   * access is one prefetch ahead of pB.
   */
  .macro KERNEL8x4_E
          OP_rr   v16.4s, v4.4s, v12.s[0]
          OP_ii   v16.4s, v5.4s, v12.s[1]
          OP_ri   v17.4s, v4.4s, v12.s[1]
          OP_ir   v17.4s, v5.4s, v12.s[0]

          OP_rr   v18.4s, v6.4s, v12.s[0]
          OP_ii   v18.4s, v7.4s, v12.s[1]
          OP_ri   v19.4s, v6.4s, v12.s[1]
          OP_ir   v19.4s, v7.4s, v12.s[0]

          OP_rr   v20.4s, v4.4s, v12.s[2]
          OP_ii   v20.4s, v5.4s, v12.s[3]
          OP_ri   v21.4s, v4.4s, v12.s[3]
          OP_ir   v21.4s, v5.4s, v12.s[2]

          prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

          OP_rr   v22.4s, v6.4s, v12.s[2]
          OP_ii   v22.4s, v7.4s, v12.s[3]
          OP_ri   v23.4s, v6.4s, v12.s[3]
          OP_ir   v23.4s, v7.4s, v12.s[2]

          OP_rr   v24.4s, v4.4s, v14.s[0]
          OP_ii   v24.4s, v5.4s, v14.s[1]
          OP_ri   v25.4s, v4.4s, v14.s[1]
          OP_ir   v25.4s, v5.4s, v14.s[0]

          OP_rr   v26.4s, v6.4s, v14.s[0]
          OP_ii   v26.4s, v7.4s, v14.s[1]
          OP_ri   v27.4s, v6.4s, v14.s[1]
          OP_ir   v27.4s, v7.4s, v14.s[0]

          OP_rr   v28.4s, v4.4s, v14.s[2]
          OP_ii   v28.4s, v5.4s, v14.s[3]
          OP_ri   v29.4s, v4.4s, v14.s[3]
          OP_ir   v29.4s, v5.4s, v14.s[2]

          OP_rr   v30.4s, v6.4s, v14.s[2]
          OP_ii   v30.4s, v7.4s, v14.s[3]
          OP_ri   v31.4s, v6.4s, v14.s[3]
          OP_ir   v31.4s, v7.4s, v14.s[2]
  .endm
  /*
   * KERNEL8x4_SUB
   * Self-contained accumulation step of the 8x4 tile: loads its own A
   * (v0-v3) and B (v8, v10) operands and accumulates into v16..v31.
   * pA advances 64 bytes, pB advances 32 bytes.  Prefetches are issued
   * ahead of both pA and pB.
   */
  .macro KERNEL8x4_SUB
          ldr     q8, [pB]
          add     pB, pB, #16
          ld2     {v0.4s, v1.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v8.s[1]
          OP_ri   v17.4s, v0.4s, v8.s[1]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          ld2     {v2.4s, v3.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v20.4s, v0.4s, v8.s[2]
          OP_ii   v20.4s, v1.4s, v8.s[3]
          OP_ri   v21.4s, v0.4s, v8.s[3]
          OP_ir   v21.4s, v1.4s, v8.s[2]

          ldr     q10, [pB]
          add     pB, pB, #16

          OP_rr   v18.4s, v2.4s, v8.s[0]
          OP_ii   v18.4s, v3.4s, v8.s[1]
          OP_ri   v19.4s, v2.4s, v8.s[1]
          OP_ir   v19.4s, v3.4s, v8.s[0]

          prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

          OP_rr   v22.4s, v2.4s, v8.s[2]
          OP_ii   v22.4s, v3.4s, v8.s[3]
          OP_ri   v23.4s, v2.4s, v8.s[3]
          OP_ir   v23.4s, v3.4s, v8.s[2]

          prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

          OP_rr   v24.4s, v0.4s, v10.s[0]
          OP_ii   v24.4s, v1.4s, v10.s[1]
          OP_ri   v25.4s, v0.4s, v10.s[1]
          OP_ir   v25.4s, v1.4s, v10.s[0]

          prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

          OP_rr   v26.4s, v2.4s, v10.s[0]
          OP_ii   v26.4s, v3.4s, v10.s[1]
          OP_ri   v27.4s, v2.4s, v10.s[1]
          OP_ir   v27.4s, v3.4s, v10.s[0]

          OP_rr   v28.4s, v0.4s, v10.s[2]
          OP_ii   v28.4s, v1.4s, v10.s[3]
          OP_ri   v29.4s, v0.4s, v10.s[3]
          OP_ir   v29.4s, v1.4s, v10.s[2]

          OP_rr   v30.4s, v2.4s, v10.s[2]
          OP_ii   v30.4s, v3.4s, v10.s[3]
          OP_ri   v31.4s, v2.4s, v10.s[3]
          OP_ir   v31.4s, v3.4s, v10.s[2]
  .endm
  /*
   * SAVE8x4
   * Scale the 8x4 accumulator tile by the complex alpha and add it into C.
   * For every accumulator pair (acc_r, acc_i) and the C values (c_r, c_i)
   * loaded with ld2:
   *     c_r += acc_r * alpha_r - acc_i * alpha_i
   *     c_i += acc_r * alpha_i + acc_i * alpha_r
   *
   * Accumulator to destination mapping:
   *     v16..v19 -> pCRow0,  v20..v23 -> pCRow1,
   *     v24..v27 -> pCRow2,  v28..v31 -> pCRow3
   * Each pCRowN is advanced by 64 bytes (two 32-byte stores).
   * Clobbers: v0-v7, v10, v11 (alpha is moved into s10 / s11 first).
   */
  .macro SAVE8x4
          fmov    alpha0_R, alphaR
          fmov    alpha0_I, alphaI

          // ---- pCRow0 : accumulators v16..v19 ----
          prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

          ld2     {v0.4s, v1.4s}, [pCRow0]
          fmla    v0.4s, v16.4s, alphaV0_R
          fmls    v0.4s, v17.4s, alphaV0_I
          fmla    v1.4s, v16.4s, alphaV0_I
          fmla    v1.4s, v17.4s, alphaV0_R
          st2     {v0.4s, v1.4s}, [pCRow0]
          add     pCRow0, pCRow0, #32

          ld2     {v2.4s, v3.4s}, [pCRow0]
          fmla    v2.4s, v18.4s, alphaV0_R
          fmls    v2.4s, v19.4s, alphaV0_I
          fmla    v3.4s, v18.4s, alphaV0_I
          fmla    v3.4s, v19.4s, alphaV0_R
          st2     {v2.4s, v3.4s}, [pCRow0]
          add     pCRow0, pCRow0, #32

          // ---- pCRow1 : accumulators v20..v23 ----
          prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v20.4s, alphaV0_R
          fmls    v4.4s, v21.4s, alphaV0_I
          fmla    v5.4s, v20.4s, alphaV0_I
          fmla    v5.4s, v21.4s, alphaV0_R
          st2     {v4.4s, v5.4s}, [pCRow1]
          add     pCRow1, pCRow1, #32

          ld2     {v6.4s, v7.4s}, [pCRow1]
          fmla    v6.4s, v22.4s, alphaV0_R
          fmls    v6.4s, v23.4s, alphaV0_I
          fmla    v7.4s, v22.4s, alphaV0_I
          fmla    v7.4s, v23.4s, alphaV0_R
          st2     {v6.4s, v7.4s}, [pCRow1]
          add     pCRow1, pCRow1, #32

          // ---- pCRow2 : accumulators v24..v27 ----
          prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

          ld2     {v0.4s, v1.4s}, [pCRow2]
          fmla    v0.4s, v24.4s, alphaV0_R
          fmls    v0.4s, v25.4s, alphaV0_I
          fmla    v1.4s, v24.4s, alphaV0_I
          fmla    v1.4s, v25.4s, alphaV0_R
          st2     {v0.4s, v1.4s}, [pCRow2]
          add     pCRow2, pCRow2, #32

          ld2     {v2.4s, v3.4s}, [pCRow2]
          fmla    v2.4s, v26.4s, alphaV0_R
          fmls    v2.4s, v27.4s, alphaV0_I
          fmla    v3.4s, v26.4s, alphaV0_I
          fmla    v3.4s, v27.4s, alphaV0_R
          st2     {v2.4s, v3.4s}, [pCRow2]
          add     pCRow2, pCRow2, #32

          // ---- pCRow3 : accumulators v28..v31 ----
          prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

          ld2     {v4.4s, v5.4s}, [pCRow3]
          fmla    v4.4s, v28.4s, alphaV0_R
          fmls    v4.4s, v29.4s, alphaV0_I
          fmla    v5.4s, v28.4s, alphaV0_I
          fmla    v5.4s, v29.4s, alphaV0_R
          st2     {v4.4s, v5.4s}, [pCRow3]
          add     pCRow3, pCRow3, #32

          ld2     {v6.4s, v7.4s}, [pCRow3]
          fmla    v6.4s, v30.4s, alphaV0_R
          fmls    v6.4s, v31.4s, alphaV0_I
          fmla    v7.4s, v30.4s, alphaV0_I
          fmla    v7.4s, v31.4s, alphaV0_R
          st2     {v6.4s, v7.4s}, [pCRow3]
          add     pCRow3, pCRow3, #32
  .endm
  494. /******************************************************************************/
  /*
   * INIT4x4
   * Clear the eight accumulators of the 4x4 tile: v16/v17, v20/v21,
   * v24/v25 and v28/v29.  s16 is cleared from wzr and the rest are copied
   * from an already-cleared register; on AArch64 a scalar write also zeroes
   * the remaining bits of the vector register.
   */
  .macro INIT4x4
          fmov    s16, wzr
          fmov    s17, s16

          fmov    s20, s17
          fmov    s21, s16

          fmov    s24, s17
          fmov    s25, s16

          fmov    s28, s17
          fmov    s29, s16
  .endm
  /*
   * KERNEL4x4_I
   * Opening step of the 4x4 tile: the accumulators are written (fmul, or
   * eor + fmls where the first imaginary product is subtracted) instead of
   * being accumulated into.
   *
   * Operands consumed here : A in v0/v1, B in v8/v9 (both split by ld2 into
   *                          their two interleaved streams; lane n of v8/v9
   *                          belongs to B value n).
   * Operands only loaded   : A in v4/v5 and B in v12/v13, fetched at the end
   *                          and not read in this macro.
   * pA and pB each advance 64 bytes.
   */
  .macro KERNEL4x4_I
          ld2     {v8.4s, v9.4s}, [pB]
          add     pB, pB, #32
          ld2     {v0.4s, v1.4s}, [pA]
          add     pA, pA, #32

          // v16/v17 : B value 0
          fmul    v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v17.16b, v17.16b, v17.16b
          fmls    v17.4s, v0.4s, v9.s[0]
  #else
          fmul    v17.4s, v0.4s, v9.s[0]
  #endif
          OP_ir   v17.4s, v1.4s, v8.s[0]

          // v20/v21 : B value 1
          fmul    v20.4s, v0.4s, v8.s[1]
          OP_ii   v20.4s, v1.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v21.16b, v21.16b, v21.16b
          fmls    v21.4s, v0.4s, v9.s[1]
  #else
          fmul    v21.4s, v0.4s, v9.s[1]
  #endif
          OP_ir   v21.4s, v1.4s, v8.s[1]

          // v24/v25 : B value 2
          fmul    v24.4s, v0.4s, v8.s[2]
          OP_ii   v24.4s, v1.4s, v9.s[2]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v25.16b, v25.16b, v25.16b
          fmls    v25.4s, v0.4s, v9.s[2]
  #else
          fmul    v25.4s, v0.4s, v9.s[2]
  #endif
          OP_ir   v25.4s, v1.4s, v8.s[2]

          // v28/v29 : B value 3
          fmul    v28.4s, v0.4s, v8.s[3]
          OP_ii   v28.4s, v1.4s, v9.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
          eor     v29.16b, v29.16b, v29.16b
          fmls    v29.4s, v0.4s, v9.s[3]
  #else
          fmul    v29.4s, v0.4s, v9.s[3]
  #endif
          OP_ir   v29.4s, v1.4s, v8.s[3]

          // Operands for the second register set (not used above).
          ld2     {v12.4s, v13.4s}, [pB]
          add     pB, pB, #32
          ld2     {v4.4s, v5.4s}, [pA]
          add     pA, pA, #32
  .endm
  /*
   * KERNEL4x4_M1
   * One accumulation step of the 4x4 tile using the first operand set:
   *   reads  A in v0/v1 and B in v8/v9;
   *   loads  A into v4/v5 and B into v12/v13 for the following step.
   * pA and pB each advance 32 bytes; one prefetch is issued 512 bytes
   * ahead of pA.
   */
  .macro KERNEL4x4_M1
          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v9.s[0]
          OP_ri   v17.4s, v0.4s, v9.s[0]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          ld2     {v12.4s, v13.4s}, [pB]          // B for the next step
          add     pB, pB, #32

          OP_rr   v20.4s, v0.4s, v8.s[1]
          OP_ii   v20.4s, v1.4s, v9.s[1]
          OP_ri   v21.4s, v0.4s, v9.s[1]
          OP_ir   v21.4s, v1.4s, v8.s[1]

          ld2     {v4.4s, v5.4s}, [pA]            // A for the next step
          add     pA, pA, #32

          OP_rr   v24.4s, v0.4s, v8.s[2]
          OP_ii   v24.4s, v1.4s, v9.s[2]
          OP_ri   v25.4s, v0.4s, v9.s[2]
          OP_ir   v25.4s, v1.4s, v8.s[2]

          prfm    PLDL1KEEP, [pA, #512]

          OP_rr   v28.4s, v0.4s, v8.s[3]
          OP_ii   v28.4s, v1.4s, v9.s[3]
          OP_ri   v29.4s, v0.4s, v9.s[3]
          OP_ir   v29.4s, v1.4s, v8.s[3]
  .endm
  /*
   * KERNEL4x4_M2
   * One accumulation step of the 4x4 tile using the second operand set:
   *   reads  A in v4/v5 and B in v12/v13;
   *   loads  A into v0/v1 and B into v8/v9 for the following step.
   * pA and pB each advance 32 bytes; one prefetch is issued 512 bytes
   * ahead of pB.
   */
  .macro KERNEL4x4_M2
          OP_rr   v16.4s, v4.4s, v12.s[0]
          OP_ii   v16.4s, v5.4s, v13.s[0]
          OP_ri   v17.4s, v4.4s, v13.s[0]
          OP_ir   v17.4s, v5.4s, v12.s[0]

          ld2     {v8.4s, v9.4s}, [pB]            // B for the next step
          add     pB, pB, #32

          OP_rr   v20.4s, v4.4s, v12.s[1]
          OP_ii   v20.4s, v5.4s, v13.s[1]
          OP_ri   v21.4s, v4.4s, v13.s[1]
          OP_ir   v21.4s, v5.4s, v12.s[1]

          ld2     {v0.4s, v1.4s}, [pA]            // A for the next step
          add     pA, pA, #32

          OP_rr   v24.4s, v4.4s, v12.s[2]
          OP_ii   v24.4s, v5.4s, v13.s[2]
          OP_ri   v25.4s, v4.4s, v13.s[2]
          OP_ir   v25.4s, v5.4s, v12.s[2]

          prfm    PLDL1KEEP, [pB, #512]

          OP_rr   v28.4s, v4.4s, v12.s[3]
          OP_ii   v28.4s, v5.4s, v13.s[3]
          OP_ri   v29.4s, v4.4s, v13.s[3]
          OP_ir   v29.4s, v5.4s, v12.s[3]
  .endm
  /*
   * KERNEL4x4_E
   * Accumulation step of the 4x4 tile that performs no loads: reads A in
   * v4/v5 and B in v12/v13, accumulates into v16/v17, v20/v21, v24/v25 and
   * v28/v29, and leaves pA and pB unchanged.
   */
  .macro KERNEL4x4_E
          OP_rr   v16.4s, v4.4s, v12.s[0]
          OP_ii   v16.4s, v5.4s, v13.s[0]
          OP_ri   v17.4s, v4.4s, v13.s[0]
          OP_ir   v17.4s, v5.4s, v12.s[0]

          OP_rr   v20.4s, v4.4s, v12.s[1]
          OP_ii   v20.4s, v5.4s, v13.s[1]
          OP_ri   v21.4s, v4.4s, v13.s[1]
          OP_ir   v21.4s, v5.4s, v12.s[1]

          OP_rr   v24.4s, v4.4s, v12.s[2]
          OP_ii   v24.4s, v5.4s, v13.s[2]
          OP_ri   v25.4s, v4.4s, v13.s[2]
          OP_ir   v25.4s, v5.4s, v12.s[2]

          OP_rr   v28.4s, v4.4s, v12.s[3]
          OP_ii   v28.4s, v5.4s, v13.s[3]
          OP_ri   v29.4s, v4.4s, v13.s[3]
          OP_ir   v29.4s, v5.4s, v12.s[3]
  .endm
  /*
   * KERNEL4x4_SUB
   * Self-contained accumulation step of the 4x4 tile: loads B into v8/v9
   * and A into v0/v1, then accumulates into v16/v17, v20/v21, v24/v25 and
   * v28/v29.  pA and pB each advance 32 bytes.
   */
  .macro KERNEL4x4_SUB
          ld2     {v8.4s, v9.4s}, [pB]
          add     pB, pB, #32
          ld2     {v0.4s, v1.4s}, [pA]
          add     pA, pA, #32

          OP_rr   v16.4s, v0.4s, v8.s[0]
          OP_ii   v16.4s, v1.4s, v9.s[0]
          OP_ri   v17.4s, v0.4s, v9.s[0]
          OP_ir   v17.4s, v1.4s, v8.s[0]

          OP_rr   v20.4s, v0.4s, v8.s[1]
          OP_ii   v20.4s, v1.4s, v9.s[1]
          OP_ri   v21.4s, v0.4s, v9.s[1]
          OP_ir   v21.4s, v1.4s, v8.s[1]

          OP_rr   v24.4s, v0.4s, v8.s[2]
          OP_ii   v24.4s, v1.4s, v9.s[2]
          OP_ri   v25.4s, v0.4s, v9.s[2]
          OP_ir   v25.4s, v1.4s, v8.s[2]

          OP_rr   v28.4s, v0.4s, v8.s[3]
          OP_ii   v28.4s, v1.4s, v9.s[3]
          OP_ri   v29.4s, v0.4s, v9.s[3]
          OP_ir   v29.4s, v1.4s, v8.s[3]
  .endm
  /*
   * SAVE4x4
   * Scale the 4x4 accumulator tile by the complex alpha and add it into C:
   *     c_r += acc_r * alpha_r - acc_i * alpha_i
   *     c_i += acc_r * alpha_i + acc_i * alpha_r
   *
   * pCRow1 is used as a walking pointer: it starts at pCRow0 and is stepped
   * by LDC between the four 32-byte stores (v16/v17, v20/v21, v24/v25,
   * v28/v29 in that order).  pCRow0 itself advances by 32 bytes at the end.
   * Clobbers: v0, v1, v4, v5, v10, v11, pCRow1.
   */
  .macro SAVE4x4
          fmov    alpha0_R, alphaR
          fmov    alpha0_I, alphaI

          mov     pCRow1, pCRow0

          // first line of C : v16/v17
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v16.4s, alphaV0_R
          fmls    v0.4s, v17.4s, alphaV0_I
          fmla    v1.4s, v16.4s, alphaV0_I
          fmla    v1.4s, v17.4s, alphaV0_R
          st2     {v0.4s, v1.4s}, [pCRow1]

          add     pCRow1, pCRow1, LDC

          // second line of C : v20/v21
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v20.4s, alphaV0_R
          fmls    v4.4s, v21.4s, alphaV0_I
          fmla    v5.4s, v20.4s, alphaV0_I
          fmla    v5.4s, v21.4s, alphaV0_R
          st2     {v4.4s, v5.4s}, [pCRow1]

          add     pCRow1, pCRow1, LDC

          // third line of C : v24/v25
          ld2     {v0.4s, v1.4s}, [pCRow1]
          fmla    v0.4s, v24.4s, alphaV0_R
          fmls    v0.4s, v25.4s, alphaV0_I
          fmla    v1.4s, v24.4s, alphaV0_I
          fmla    v1.4s, v25.4s, alphaV0_R
          st2     {v0.4s, v1.4s}, [pCRow1]

          add     pCRow1, pCRow1, LDC

          // fourth line of C : v28/v29
          ld2     {v4.4s, v5.4s}, [pCRow1]
          fmla    v4.4s, v28.4s, alphaV0_R
          fmls    v4.4s, v29.4s, alphaV0_I
          fmla    v5.4s, v28.4s, alphaV0_I
          fmla    v5.4s, v29.4s, alphaV0_R
          st2     {v4.4s, v5.4s}, [pCRow1]

          add     pCRow0, pCRow0, #32
  .endm
  674. /******************************************************************************/
  /*
   * INIT2x4
   * Clear the eight accumulators of the 2x4 tile: v16/v17, v20/v21,
   * v24/v25 and v28/v29.  s16 and s17 are cleared from wzr and the others
   * are copied from them; on AArch64 a scalar write also zeroes the
   * remaining bits of the vector register.
   */
  .macro INIT2x4
          fmov    s16, wzr
          fmov    s17, wzr

          fmov    s20, s16
          fmov    s21, s17

          fmov    s24, s16
          fmov    s25, s17

          fmov    s28, s16
          fmov    s29, s17
  .endm
  /*
   * KERNEL2x4_SUB
   * Self-contained accumulation step of the 2x4 tile: loads B into v8/v9
   * (four lanes each) and A into v0/v1 (two lanes each), then accumulates
   * into the low two lanes of v16/v17, v20/v21, v24/v25 and v28/v29.
   * pB advances 32 bytes, pA advances 16 bytes.
   */
  .macro KERNEL2x4_SUB
          ld2     {v8.4s, v9.4s}, [pB]
          add     pB, pB, #32
          ld2     {v0.2s, v1.2s}, [pA]
          add     pA, pA, #16

          OP_rr   v16.2s, v0.2s, v8.s[0]
          OP_ii   v16.2s, v1.2s, v9.s[0]
          OP_ri   v17.2s, v0.2s, v9.s[0]
          OP_ir   v17.2s, v1.2s, v8.s[0]

          OP_rr   v20.2s, v0.2s, v8.s[1]
          OP_ii   v20.2s, v1.2s, v9.s[1]
          OP_ri   v21.2s, v0.2s, v9.s[1]
          OP_ir   v21.2s, v1.2s, v8.s[1]

          OP_rr   v24.2s, v0.2s, v8.s[2]
          OP_ii   v24.2s, v1.2s, v9.s[2]
          OP_ri   v25.2s, v0.2s, v9.s[2]
          OP_ir   v25.2s, v1.2s, v8.s[2]

          OP_rr   v28.2s, v0.2s, v8.s[3]
          OP_ii   v28.2s, v1.2s, v9.s[3]
          OP_ri   v29.2s, v0.2s, v9.s[3]
          OP_ir   v29.2s, v1.2s, v8.s[3]
  .endm
  707. .macro SAVE2x4
  708. fmov alpha0_R, alphaR
  709. fmov alpha0_I, alphaI
  710. mov pCRow1, pCRow0
  711. ld2 {v0.2s, v1.2s}, [pCRow1]
  712. fmla v0.2s, v16.2s, alphaV0_R
  713. fmls v0.2s, v17.2s, alphaV0_I
  714. fmla v1.2s, v16.2s, alphaV0_I
  715. fmla v1.2s, v17.2s, alphaV0_R
  716. st2 {v0.2s, v1.2s}, [pCRow1]
  717. add pCRow1, pCRow1, LDC
  718. ld2 {v4.2s, v5.2s}, [pCRow1]
  719. fmla v4.2s, v20.2s, alphaV0_R
  720. fmls v4.2s, v21.2s, alphaV0_I
  721. fmla v5.2s, v20.2s, alphaV0_I
  722. fmla v5.2s, v21.2s, alphaV0_R
  723. st2 {v4.2s, v5.2s}, [pCRow1]
  724. add pCRow1, pCRow1, LDC
  725. ld2 {v0.2s, v1.2s}, [pCRow1]
  726. fmla v0.2s, v24.2s, alphaV0_R
  727. fmls v0.2s, v25.2s, alphaV0_I
  728. fmla v1.2s, v24.2s, alphaV0_I
  729. fmla v1.2s, v25.2s, alphaV0_R
  730. st2 {v0.2s, v1.2s}, [pCRow1]
  731. add pCRow1, pCRow1, LDC
  732. ld2 {v4.2s, v5.2s}, [pCRow1]
  733. fmla v4.2s, v28.2s, alphaV0_R
  734. fmls v4.2s, v29.2s, alphaV0_I
  735. fmla v5.2s, v28.2s, alphaV0_I
  736. fmla v5.2s, v29.2s, alphaV0_R
  737. st2 {v4.2s, v5.2s}, [pCRow1]
  738. add pCRow0, pCRow0, #16
  739. .endm
  740. /******************************************************************************/
  741. .macro INIT1x4
  742. fmov s16, wzr
  743. fmov s17, wzr
  744. fmov s20, s16
  745. fmov s21, s17
  746. fmov s24, s16
  747. fmov s25, s17
  748. fmov s28, s16
  749. fmov s29, s17
  750. .endm
  751. .macro KERNEL1x4_SUB
  752. ld2 {v8.4s, v9.4s}, [pB]
  753. add pB, pB, #32
  754. ld2 {v0.s, v1.s}[0], [pA]
  755. add pA, pA, #8
  756. OP_rr s16, s0, v8.s[0]
  757. OP_ii s16, s1, v9.s[0]
  758. OP_ri s17, s0, v9.s[0]
  759. OP_ir s17, s1, v8.s[0]
  760. OP_rr s20, s0, v8.s[1]
  761. OP_ii s20, s1, v9.s[1]
  762. OP_ri s21, s0, v9.s[1]
  763. OP_ir s21, s1, v8.s[1]
  764. OP_rr s24, s0, v8.s[2]
  765. OP_ii s24, s1, v9.s[2]
  766. OP_ri s25, s0, v9.s[2]
  767. OP_ir s25, s1, v8.s[2]
  768. OP_rr s28, s0, v8.s[3]
  769. OP_ii s28, s1, v9.s[3]
  770. OP_ri s29, s0, v9.s[3]
  771. OP_ir s29, s1, v8.s[3]
  772. .endm
  773. .macro SAVE1x4
  774. fmov alpha0_R, alphaR
  775. fmov alpha0_I, alphaI
  776. mov pCRow1, pCRow0
  777. ld2 {v0.s, v1.s}[0], [pCRow1]
  778. fmla s0, s16, alphaV0_R
  779. fmls s0, s17, alphaV0_I
  780. fmla s1, s16, alphaV0_I
  781. fmla s1, s17, alphaV0_R
  782. st2 {v0.s, v1.s}[0], [pCRow1]
  783. add pCRow1, pCRow1, LDC
  784. ld2 {v4.s, v5.s}[0], [pCRow1]
  785. fmla s4, s20, alphaV0_R
  786. fmls s4, s21, alphaV0_I
  787. fmla s5, s20, alphaV0_I
  788. fmla s5, s21, alphaV0_R
  789. st2 {v4.s, v5.s}[0], [pCRow1]
  790. add pCRow1, pCRow1, LDC
  791. ld2 {v0.s, v1.s}[0], [pCRow1]
  792. fmla s0, s24, alphaV0_R
  793. fmls s0, s25, alphaV0_I
  794. fmla s1, s24, alphaV0_I
  795. fmla s1, s25, alphaV0_R
  796. st2 {v0.s, v1.s}[0], [pCRow1]
  797. add pCRow1, pCRow1, LDC
  798. ld2 {v4.s, v5.s}[0], [pCRow1]
  799. fmla s4, s28, alphaV0_R
  800. fmls s4, s29, alphaV0_I
  801. fmla s5, s28, alphaV0_I
  802. fmla s5, s29, alphaV0_R
  803. st2 {v4.s, v5.s}[0], [pCRow1]
  804. add pCRow0, pCRow0, #8
  805. .endm
  806. /******************************************************************************/
  807. .macro INIT8x2
  808. fmov s16, wzr
  809. fmov s17, wzr
  810. fmov s18, wzr
  811. fmov s19, s16
  812. fmov s20, wzr
  813. fmov s21, s16
  814. fmov s22, s17
  815. fmov s23, s18
  816. .endm
  817. .macro KERNEL8x2_SUB
  818. ld2 {v8.2s, v9.2s}, [pB]
  819. add pB, pB, #16
  820. ld2 {v0.4s, v1.4s}, [pA]
  821. add pA, pA, #32
  822. ld2 {v2.4s, v3.4s}, [pA]
  823. add pA, pA, #32
  824. OP_rr v16.4s, v0.4s, v8.s[0]
  825. OP_ii v16.4s, v1.4s, v9.s[0]
  826. OP_ri v17.4s, v0.4s, v9.s[0]
  827. OP_ir v17.4s, v1.4s, v8.s[0]
  828. OP_rr v18.4s, v2.4s, v8.s[0]
  829. OP_ii v18.4s, v3.4s, v9.s[0]
  830. OP_ri v19.4s, v2.4s, v9.s[0]
  831. OP_ir v19.4s, v3.4s, v8.s[0]
  832. OP_rr v20.4s, v0.4s, v8.s[1]
  833. OP_ii v20.4s, v1.4s, v9.s[1]
  834. OP_ri v21.4s, v0.4s, v9.s[1]
  835. OP_ir v21.4s, v1.4s, v8.s[1]
  836. OP_rr v22.4s, v2.4s, v8.s[1]
  837. OP_ii v22.4s, v3.4s, v9.s[1]
  838. OP_ri v23.4s, v2.4s, v9.s[1]
  839. OP_ir v23.4s, v3.4s, v8.s[1]
  840. .endm
  841. .macro SAVE8x2
  842. fmov alpha0_R, alphaR
  843. fmov alpha0_I, alphaI
  844. mov pCRow1, pCRow0
  845. ld2 {v0.4s, v1.4s}, [pCRow1]
  846. fmla v0.4s, v16.4s, alphaV0_R
  847. fmls v0.4s, v17.4s, alphaV0_I
  848. fmla v1.4s, v16.4s, alphaV0_I
  849. fmla v1.4s, v17.4s, alphaV0_R
  850. st2 {v0.4s, v1.4s}, [pCRow1]
  851. add pCRow2, pCRow1, #32
  852. ld2 {v2.4s, v3.4s}, [pCRow2]
  853. fmla v2.4s, v18.4s, alphaV0_R
  854. fmls v2.4s, v19.4s, alphaV0_I
  855. fmla v3.4s, v18.4s, alphaV0_I
  856. fmla v3.4s, v19.4s, alphaV0_R
  857. st2 {v2.4s, v3.4s}, [pCRow2]
  858. add pCRow1, pCRow1, LDC
  859. ld2 {v4.4s, v5.4s}, [pCRow1]
  860. fmla v4.4s, v20.4s, alphaV0_R
  861. fmls v4.4s, v21.4s, alphaV0_I
  862. fmla v5.4s, v20.4s, alphaV0_I
  863. fmla v5.4s, v21.4s, alphaV0_R
  864. st2 {v4.4s, v5.4s}, [pCRow1]
  865. add pCRow2, pCRow1, #32
  866. ld2 {v6.4s, v7.4s}, [pCRow2]
  867. fmla v6.4s, v22.4s, alphaV0_R
  868. fmls v6.4s, v23.4s, alphaV0_I
  869. fmla v7.4s, v22.4s, alphaV0_I
  870. fmla v7.4s, v23.4s, alphaV0_R
  871. st2 {v6.4s, v7.4s}, [pCRow2]
  872. add pCRow0, pCRow0, #64
  873. .endm
  874. /******************************************************************************/
  875. .macro INIT4x2
  876. fmov s16, wzr
  877. fmov s17, wzr
  878. fmov s20, s16
  879. fmov s21, s17
  880. .endm
  881. .macro KERNEL4x2_SUB
  882. ld2 {v8.2s, v9.2s}, [pB]
  883. add pB, pB, #16
  884. ld2 {v0.4s, v1.4s}, [pA]
  885. add pA, pA, #32
  886. OP_rr v16.4s, v0.4s, v8.s[0]
  887. OP_ii v16.4s, v1.4s, v9.s[0]
  888. OP_ri v17.4s, v0.4s, v9.s[0]
  889. OP_ir v17.4s, v1.4s, v8.s[0]
  890. OP_rr v20.4s, v0.4s, v8.s[1]
  891. OP_ii v20.4s, v1.4s, v9.s[1]
  892. OP_ri v21.4s, v0.4s, v9.s[1]
  893. OP_ir v21.4s, v1.4s, v8.s[1]
  894. .endm
  895. .macro SAVE4x2
  896. fmov alpha0_R, alphaR
  897. fmov alpha0_I, alphaI
  898. mov pCRow1, pCRow0
  899. ld2 {v0.4s, v1.4s}, [pCRow1]
  900. fmla v0.4s, v16.4s, alphaV0_R
  901. fmls v0.4s, v17.4s, alphaV0_I
  902. fmla v1.4s, v16.4s, alphaV0_I
  903. fmla v1.4s, v17.4s, alphaV0_R
  904. st2 {v0.4s, v1.4s}, [pCRow1]
  905. add pCRow1, pCRow1, LDC
  906. ld2 {v4.4s, v5.4s}, [pCRow1]
  907. fmla v4.4s, v20.4s, alphaV0_R
  908. fmls v4.4s, v21.4s, alphaV0_I
  909. fmla v5.4s, v20.4s, alphaV0_I
  910. fmla v5.4s, v21.4s, alphaV0_R
  911. st2 {v4.4s, v5.4s}, [pCRow1]
  912. add pCRow0, pCRow0, #32
  913. .endm
  914. /******************************************************************************/
  915. .macro INIT2x2
  916. fmov s16, wzr
  917. fmov s17, wzr
  918. fmov s20, s16
  919. fmov s21, s17
  920. .endm
  921. .macro KERNEL2x2_SUB
  922. ld2 {v8.2s, v9.2s}, [pB]
  923. add pB, pB, #16
  924. ld2 {v0.2s, v1.2s}, [pA]
  925. add pA, pA, #16
  926. OP_rr v16.2s, v0.2s, v8.s[0]
  927. OP_ii v16.2s, v1.2s, v9.s[0]
  928. OP_ri v17.2s, v0.2s, v9.s[0]
  929. OP_ir v17.2s, v1.2s, v8.s[0]
  930. OP_rr v20.2s, v0.2s, v8.s[1]
  931. OP_ii v20.2s, v1.2s, v9.s[1]
  932. OP_ri v21.2s, v0.2s, v9.s[1]
  933. OP_ir v21.2s, v1.2s, v8.s[1]
  934. .endm
  935. .macro SAVE2x2
  936. fmov alpha0_R, alphaR
  937. fmov alpha0_I, alphaI
  938. mov pCRow1, pCRow0
  939. ld2 {v0.2s, v1.2s}, [pCRow1]
  940. fmla v0.2s, v16.2s, alphaV0_R
  941. fmls v0.2s, v17.2s, alphaV0_I
  942. fmla v1.2s, v16.2s, alphaV0_I
  943. fmla v1.2s, v17.2s, alphaV0_R
  944. st2 {v0.2s, v1.2s}, [pCRow1]
  945. add pCRow1, pCRow1, LDC
  946. ld2 {v4.2s, v5.2s}, [pCRow1]
  947. fmla v4.2s, v20.2s, alphaV0_R
  948. fmls v4.2s, v21.2s, alphaV0_I
  949. fmla v5.2s, v20.2s, alphaV0_I
  950. fmla v5.2s, v21.2s, alphaV0_R
  951. st2 {v4.2s, v5.2s}, [pCRow1]
  952. add pCRow0, pCRow0, #16
  953. .endm
  954. /******************************************************************************/
  955. .macro INIT1x2
  956. fmov s16, wzr
  957. fmov s17, wzr
  958. fmov s20, wzr
  959. fmov s21, wzr
  960. .endm
  961. .macro KERNEL1x2_SUB
  962. ld2 {v8.2s, v9.2s}, [pB]
  963. add pB, pB, #16
  964. ld2 {v0.s, v1.s}[0], [pA]
  965. add pA, pA, #8
  966. OP_rr s16, s0, v8.s[0]
  967. OP_ii s16, s1, v9.s[0]
  968. OP_ri s17, s0, v9.s[0]
  969. OP_ir s17, s1, v8.s[0]
  970. OP_rr s20, s0, v8.s[1]
  971. OP_ii s20, s1, v9.s[1]
  972. OP_ri s21, s0, v9.s[1]
  973. OP_ir s21, s1, v8.s[1]
  974. .endm
  975. .macro SAVE1x2
  976. fmov alpha0_R, alphaR
  977. fmov alpha0_I, alphaI
  978. mov pCRow1, pCRow0
  979. ld2 {v0.s, v1.s}[0], [pCRow1]
  980. fmla s0, s16, alphaV0_R
  981. fmls s0, s17, alphaV0_I
  982. fmla s1, s16, alphaV0_I
  983. fmla s1, s17, alphaV0_R
  984. st2 {v0.s, v1.s}[0], [pCRow1]
  985. add pCRow1, pCRow1, LDC
  986. ld2 {v4.s, v5.s}[0], [pCRow1]
  987. fmla s4, s20, alphaV0_R
  988. fmls s4, s21, alphaV0_I
  989. fmla s5, s20, alphaV0_I
  990. fmla s5, s21, alphaV0_R
  991. st2 {v4.s, v5.s}[0], [pCRow1]
  992. add pCRow0, pCRow0, #8
  993. .endm
  994. /******************************************************************************/
  995. .macro INIT8x1
  996. fmov s16, wzr
  997. fmov s17, wzr
  998. fmov s18, wzr
  999. fmov s19, s16
  1000. .endm
  1001. .macro KERNEL8x1_SUB
  1002. ld1 {v8.2s}, [pB]
  1003. add pB, pB, #8
  1004. ld2 {v0.4s, v1.4s}, [pA]
  1005. add pA, pA, #32
  1006. ld2 {v2.4s, v3.4s}, [pA]
  1007. add pA, pA, #32
  1008. OP_rr v16.4s, v0.4s, v8.s[0]
  1009. OP_ii v16.4s, v1.4s, v8.s[1]
  1010. OP_ri v17.4s, v0.4s, v8.s[1]
  1011. OP_ir v17.4s, v1.4s, v8.s[0]
  1012. OP_rr v18.4s, v2.4s, v8.s[0]
  1013. OP_ii v18.4s, v3.4s, v8.s[1]
  1014. OP_ri v19.4s, v2.4s, v8.s[1]
  1015. OP_ir v19.4s, v3.4s, v8.s[0]
  1016. .endm
  1017. .macro SAVE8x1
  1018. fmov alpha0_R, alphaR
  1019. fmov alpha0_I, alphaI
  1020. mov pCRow1, pCRow0
  1021. ld2 {v0.4s, v1.4s}, [pCRow1]
  1022. fmla v0.4s, v16.4s, alphaV0_R
  1023. fmls v0.4s, v17.4s, alphaV0_I
  1024. fmla v1.4s, v16.4s, alphaV0_I
  1025. fmla v1.4s, v17.4s, alphaV0_R
  1026. st2 {v0.4s, v1.4s}, [pCRow1]
  1027. add pCRow1, pCRow1, #32
  1028. ld2 {v2.4s, v3.4s}, [pCRow1]
  1029. fmla v2.4s, v18.4s, alphaV0_R
  1030. fmls v2.4s, v19.4s, alphaV0_I
  1031. fmla v3.4s, v18.4s, alphaV0_I
  1032. fmla v3.4s, v19.4s, alphaV0_R
  1033. st2 {v2.4s, v3.4s}, [pCRow1]
  1034. add pCRow0, pCRow0, #64
  1035. .endm
  1036. /******************************************************************************/
  1037. .macro INIT4x1
  1038. fmov s16, wzr
  1039. fmov s17, s16
  1040. .endm
  1041. .macro KERNEL4x1_SUB
  1042. ld2 {v8.s, v9.s}[0], [pB]
  1043. add pB, pB, #8
  1044. ld2 {v0.4s, v1.4s}, [pA]
  1045. add pA, pA, #32
  1046. OP_rr v16.4s, v0.4s, v8.s[0]
  1047. OP_ii v16.4s, v1.4s, v9.s[0]
  1048. OP_ri v17.4s, v0.4s, v9.s[0]
  1049. OP_ir v17.4s, v1.4s, v8.s[0]
  1050. .endm
  1051. .macro SAVE4x1
  1052. fmov alpha0_R, alphaR
  1053. fmov alpha0_I, alphaI
  1054. mov pCRow1, pCRow0
  1055. ld2 {v0.4s, v1.4s}, [pCRow1]
  1056. fmla v0.4s, v16.4s, alphaV0_R
  1057. fmls v0.4s, v17.4s, alphaV0_I
  1058. fmla v1.4s, v16.4s, alphaV0_I
  1059. fmla v1.4s, v17.4s, alphaV0_R
  1060. st2 {v0.4s, v1.4s}, [pCRow1]
  1061. add pCRow0, pCRow0, #32
  1062. .endm
  1063. /******************************************************************************/
  1064. .macro INIT2x1
  1065. fmov s16, wzr
  1066. fmov s17, wzr
  1067. .endm
  1068. .macro KERNEL2x1_SUB
  1069. ld2 {v8.s, v9.s}[0], [pB]
  1070. add pB, pB, #8
  1071. ld2 {v0.2s, v1.2s}, [pA]
  1072. add pA, pA, #16
  1073. OP_rr v16.2s, v0.2s, v8.s[0]
  1074. OP_ii v16.2s, v1.2s, v9.s[0]
  1075. OP_ri v17.2s, v0.2s, v9.s[0]
  1076. OP_ir v17.2s, v1.2s, v8.s[0]
  1077. .endm
  1078. .macro SAVE2x1
  1079. fmov alpha0_R, alphaR
  1080. fmov alpha0_I, alphaI
  1081. mov pCRow1, pCRow0
  1082. ld2 {v0.2s, v1.2s}, [pCRow1]
  1083. fmla v0.2s, v16.2s, alphaV0_R
  1084. fmls v0.2s, v17.2s, alphaV0_I
  1085. fmla v1.2s, v16.2s, alphaV0_I
  1086. fmla v1.2s, v17.2s, alphaV0_R
  1087. st2 {v0.2s, v1.2s}, [pCRow1]
  1088. add pCRow0, pCRow0, #16
  1089. .endm
  1090. /******************************************************************************/
  1091. .macro INIT1x1
  1092. fmov s16, wzr
  1093. fmov s17, wzr
  1094. .endm
  1095. .macro KERNEL1x1_SUB
  1096. ld2 {v8.s, v9.s}[0], [pB]
  1097. add pB, pB, #8
  1098. ld2 {v0.s, v1.s}[0], [pA]
  1099. add pA, pA, #8
  1100. OP_rr s16, s0, v8.s[0]
  1101. OP_ii s16, s1, v9.s[0]
  1102. OP_ri s17, s0, v9.s[0]
  1103. OP_ir s17, s1, v8.s[0]
  1104. .endm
  1105. .macro SAVE1x1
  1106. fmov alpha0_R, alphaR
  1107. fmov alpha0_I, alphaI
  1108. mov pCRow1, pCRow0
  1109. ld2 {v0.s, v1.s}[0], [pCRow1]
  1110. fmla s0, s16, alphaV0_R
  1111. fmls s0, s17, alphaV0_I
  1112. fmla s1, s16, alphaV0_I
  1113. fmla s1, s17, alphaV0_R
  1114. st2 {v0.s, v1.s}[0], [pCRow1]
  1115. add pCRow0, pCRow0, #8
  1116. .endm
  1117. .macro KERNEL8x4_M1_M2_x1
  1118. KERNEL8x4_M1
  1119. KERNEL8x4_M2
  1120. .endm
  1121. .macro KERNEL8x4_M1_M2_x2
  1122. KERNEL8x4_M1_M2_x1
  1123. KERNEL8x4_M1_M2_x1
  1124. .endm
  1125. .macro KERNEL8x4_M1_M2_x4
  1126. KERNEL8x4_M1_M2_x2
  1127. KERNEL8x4_M1_M2_x2
  1128. .endm
  1129. .macro KERNEL8x4_M1_M2_x8
  1130. KERNEL8x4_M1_M2_x4
  1131. KERNEL8x4_M1_M2_x4
  1132. .endm
  1133. .macro KERNEL8x4_M1_M2_x16
  1134. KERNEL8x4_M1_M2_x8
  1135. KERNEL8x4_M1_M2_x8
  1136. .endm
  1137. /*******************************************************************************
  1138. * End of macro definitions
  1139. *******************************************************************************/
  1140. PROLOGUE
  1141. .align 5
  1142. add sp, sp, #-(11 * 16)
  1143. stp d8, d9, [sp, #(0 * 16)]
  1144. stp d10, d11, [sp, #(1 * 16)]
  1145. stp d12, d13, [sp, #(2 * 16)]
  1146. stp d14, d15, [sp, #(3 * 16)]
  1147. stp d16, d17, [sp, #(4 * 16)]
  1148. stp x18, x19, [sp, #(5 * 16)]
  1149. stp x20, x21, [sp, #(6 * 16)]
  1150. stp x22, x23, [sp, #(7 * 16)]
  1151. stp x24, x25, [sp, #(8 * 16)]
  1152. stp x26, x27, [sp, #(9 * 16)]
  1153. str x28, [sp, #(10 * 16)]
  1154. prfm PLDL1KEEP, [origPB]
  1155. prfm PLDL1KEEP, [origPA]
  1156. fmov alphaR, s0
  1157. fmov alphaI, s1
  1158. lsl LDC, LDC, #3 // ldc = ldc * 8
  1159. mov pB, origPB
  1160. mov counterJ, origN
  1161. asr counterJ, counterJ, #2 // J = J / 4
  1162. cmp counterJ, #0
  1163. ble .Lcgemm_kernel_L2_BEGIN
  1164. /******************************************************************************/
  1165. .Lcgemm_kernel_L4_BEGIN:
  1166. mov pCRow0, pC
  1167. add pCRow1, pCRow0, LDC
  1168. add pCRow2, pCRow1, LDC
  1169. add pCRow3, pCRow2, LDC
  1170. add pC, pCRow3, LDC
  1171. mov pA, origPA // pA = start of A array
  1172. .Lcgemm_kernel_L4_M8_BEGIN:
  1173. mov counterI, origM
  1174. asr counterI, counterI, #3 // counterI = counterI / 8
  1175. cmp counterI, #0
  1176. ble .Lcgemm_kernel_L4_M4_BEGIN
  1177. .align 5
  1178. .Lcgemm_kernel_L4_M8_20:
  1179. mov pB, origPB
  1180. asr counterL , origK, #5 // origK / 32
  1181. cmp counterL , #2
  1182. blt .Lcgemm_kernel_L4_M8_32
  1183. KERNEL8x4_I
  1184. KERNEL8x4_M2
  1185. KERNEL8x4_M1_M2_x1
  1186. KERNEL8x4_M1_M2_x2
  1187. KERNEL8x4_M1_M2_x4
  1188. KERNEL8x4_M1_M2_x8
  1189. subs counterL, counterL, #2 // subtract 2
  1190. ble .Lcgemm_kernel_L4_M8_22a
  1191. .align 5
  1192. .Lcgemm_kernel_L4_M8_22:
  1193. KERNEL8x4_M1_M2_x16
  1194. subs counterL, counterL, #1
  1195. bgt .Lcgemm_kernel_L4_M8_22
  1196. .align 5
  1197. .Lcgemm_kernel_L4_M8_22a:
  1198. KERNEL8x4_M1_M2_x8
  1199. KERNEL8x4_M1_M2_x4
  1200. KERNEL8x4_M1_M2_x2
  1201. KERNEL8x4_M1_M2_x1
  1202. KERNEL8x4_M1
  1203. KERNEL8x4_E
  1204. b .Lcgemm_kernel_L4_M8_44
  1205. .align 5
  1206. .Lcgemm_kernel_L4_M8_32:
  1207. tst counterL, #1
  1208. ble .Lcgemm_kernel_L4_M8_40
  1209. KERNEL8x4_I
  1210. KERNEL8x4_M2
  1211. KERNEL8x4_M1_M2_x8
  1212. KERNEL8x4_M1_M2_x4
  1213. KERNEL8x4_M1_M2_x2
  1214. KERNEL8x4_M1
  1215. KERNEL8x4_E
  1216. b .Lcgemm_kernel_L4_M8_44
  1217. .Lcgemm_kernel_L4_M8_40:
  1218. INIT8x4
  1219. .Lcgemm_kernel_L4_M8_44:
  1220. ands counterL , origK, #31
  1221. ble .Lcgemm_kernel_L4_M8_100
  1222. .align 5
  1223. .Lcgemm_kernel_L4_M8_46:
  1224. KERNEL8x4_SUB
  1225. subs counterL, counterL, #1
  1226. bne .Lcgemm_kernel_L4_M8_46
  1227. .Lcgemm_kernel_L4_M8_100:
  1228. prfm PLDL1KEEP, [pA]
  1229. prfm PLDL1KEEP, [pA, #64]
  1230. prfm PLDL1KEEP, [origPB]
  1231. SAVE8x4
  1232. .Lcgemm_kernel_L4_M8_END:
  1233. subs counterI, counterI, #1
  1234. bne .Lcgemm_kernel_L4_M8_20
  1235. .Lcgemm_kernel_L4_M4_BEGIN:
  1236. mov counterI, origM
  1237. tst counterI , #7
  1238. ble .Lcgemm_kernel_L4_END
  1239. tst counterI, #4
  1240. ble .Lcgemm_kernel_L4_M2_BEGIN
  1241. .Lcgemm_kernel_L4_M4_20:
  1242. mov pB, origPB
  1243. asr counterL , origK, #1 // L = K / 2
  1244. cmp counterL , #2 // is there at least 4 to do?
  1245. blt .Lcgemm_kernel_L4_M4_32
  1246. KERNEL4x4_I // do one in the K
  1247. KERNEL4x4_M2 // do another in the K
  1248. subs counterL, counterL, #2
  1249. ble .Lcgemm_kernel_L4_M4_22a
  1250. .align 5
  1251. .Lcgemm_kernel_L4_M4_22:
  1252. KERNEL4x4_M1
  1253. KERNEL4x4_M2
  1254. subs counterL, counterL, #1
  1255. bgt .Lcgemm_kernel_L4_M4_22
  1256. .Lcgemm_kernel_L4_M4_22a:
  1257. KERNEL4x4_M1
  1258. KERNEL4x4_E
  1259. b .Lcgemm_kernel_L4_M4_44
  1260. .Lcgemm_kernel_L4_M4_32:
  1261. tst counterL, #1
  1262. ble .Lcgemm_kernel_L4_M4_40
  1263. KERNEL4x4_I
  1264. KERNEL4x4_E
  1265. b .Lcgemm_kernel_L4_M4_44
  1266. .Lcgemm_kernel_L4_M4_40:
  1267. INIT4x4
  1268. .Lcgemm_kernel_L4_M4_44:
  1269. ands counterL , origK, #1
  1270. ble .Lcgemm_kernel_L4_M4_100
  1271. .Lcgemm_kernel_L4_M4_46:
  1272. KERNEL4x4_SUB
  1273. .Lcgemm_kernel_L4_M4_100:
  1274. SAVE4x4
  1275. .Lcgemm_kernel_L4_M4_END:
  1276. .Lcgemm_kernel_L4_M2_BEGIN:
  1277. mov counterI, origM
  1278. tst counterI , #3
  1279. ble .Lcgemm_kernel_L4_END
  1280. tst counterI, #2 // counterI = counterI / 2
  1281. ble .Lcgemm_kernel_L4_M1_BEGIN
  1282. .Lcgemm_kernel_L4_M2_20:
  1283. INIT2x4
  1284. mov pB, origPB
  1285. asr counterL , origK, #3 // counterL = counterL / 8
  1286. cmp counterL , #0
  1287. ble .Lcgemm_kernel_L4_M2_40
  1288. .Lcgemm_kernel_L4_M2_22:
  1289. KERNEL2x4_SUB
  1290. KERNEL2x4_SUB
  1291. KERNEL2x4_SUB
  1292. KERNEL2x4_SUB
  1293. KERNEL2x4_SUB
  1294. KERNEL2x4_SUB
  1295. KERNEL2x4_SUB
  1296. KERNEL2x4_SUB
  1297. subs counterL, counterL, #1
  1298. bgt .Lcgemm_kernel_L4_M2_22
  1299. .Lcgemm_kernel_L4_M2_40:
  1300. ands counterL , origK, #7 // counterL = counterL % 8
  1301. ble .Lcgemm_kernel_L4_M2_100
  1302. .Lcgemm_kernel_L4_M2_42:
  1303. KERNEL2x4_SUB
  1304. subs counterL, counterL, #1
  1305. bgt .Lcgemm_kernel_L4_M2_42
  1306. .Lcgemm_kernel_L4_M2_100:
  1307. SAVE2x4
  1308. .Lcgemm_kernel_L4_M2_END:
  1309. .Lcgemm_kernel_L4_M1_BEGIN:
  1310. tst counterI, #1 // counterI = counterI % 2
  1311. ble .Lcgemm_kernel_L4_END
  1312. .Lcgemm_kernel_L4_M1_20:
  1313. INIT1x4
  1314. mov pB, origPB
  1315. asr counterL , origK, #3 // counterL = counterL / 8
  1316. cmp counterL , #0
  1317. ble .Lcgemm_kernel_L4_M1_40
  1318. .Lcgemm_kernel_L4_M1_22:
  1319. KERNEL1x4_SUB
  1320. KERNEL1x4_SUB
  1321. KERNEL1x4_SUB
  1322. KERNEL1x4_SUB
  1323. KERNEL1x4_SUB
  1324. KERNEL1x4_SUB
  1325. KERNEL1x4_SUB
  1326. KERNEL1x4_SUB
  1327. subs counterL, counterL, #1
  1328. bgt .Lcgemm_kernel_L4_M1_22
  1329. .Lcgemm_kernel_L4_M1_40:
  1330. ands counterL , origK, #7 // counterL = counterL % 8
  1331. ble .Lcgemm_kernel_L4_M1_100
  1332. .Lcgemm_kernel_L4_M1_42:
  1333. KERNEL1x4_SUB
  1334. subs counterL, counterL, #1
  1335. bgt .Lcgemm_kernel_L4_M1_42
  1336. .Lcgemm_kernel_L4_M1_100:
  1337. SAVE1x4
  1338. .Lcgemm_kernel_L4_END:
  1339. lsl temp, origK, #5
  1340. add origPB, origPB, temp // B = B + K * 4 * 8
  1341. subs counterJ, counterJ , #1 // j--
  1342. bgt .Lcgemm_kernel_L4_BEGIN
  1343. /******************************************************************************/
  1344. .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
  1345. mov counterJ , origN
  1346. tst counterJ , #3
  1347. ble .Lcgemm_kernel_L999 // error, N was less than 4?
  1348. tst counterJ , #2
  1349. ble .Lcgemm_kernel_L1_BEGIN
  1350. mov pCRow0, pC // pCRow0 = pC
  1351. add pC,pC,LDC, lsl #1
  1352. mov pA, origPA // pA = A
  1353. .Lcgemm_kernel_L2_M8_BEGIN:
  1354. mov counterI, origM
  1355. asr counterI, counterI, #3 // counterI = counterI / 8
  1356. cmp counterI, #0
  1357. ble .Lcgemm_kernel_L2_M4_BEGIN
  1358. .Lcgemm_kernel_L2_M8_20:
  1359. INIT8x2
  1360. mov pB, origPB
  1361. asr counterL , origK, #3 // counterL = counterL / 8
  1362. cmp counterL,#0
  1363. ble .Lcgemm_kernel_L2_M8_40
  1364. .align 5
  1365. .Lcgemm_kernel_L2_M8_22:
  1366. KERNEL8x2_SUB
  1367. KERNEL8x2_SUB
  1368. KERNEL8x2_SUB
  1369. KERNEL8x2_SUB
  1370. KERNEL8x2_SUB
  1371. KERNEL8x2_SUB
  1372. KERNEL8x2_SUB
  1373. KERNEL8x2_SUB
  1374. subs counterL, counterL, #1
  1375. bgt .Lcgemm_kernel_L2_M8_22
  1376. .Lcgemm_kernel_L2_M8_40:
  1377. ands counterL , origK, #7 // counterL = counterL % 8
  1378. ble .Lcgemm_kernel_L2_M8_100
  1379. .Lcgemm_kernel_L2_M8_42:
  1380. KERNEL8x2_SUB
  1381. subs counterL, counterL, #1
  1382. bgt .Lcgemm_kernel_L2_M8_42
  1383. .Lcgemm_kernel_L2_M8_100:
  1384. SAVE8x2
  1385. .Lcgemm_kernel_L2_M8_END:
  1386. subs counterI, counterI, #1
  1387. bgt .Lcgemm_kernel_L2_M8_20
  1388. .Lcgemm_kernel_L2_M4_BEGIN:
  1389. mov counterI, origM
  1390. tst counterI , #7
  1391. ble .Lcgemm_kernel_L2_END
  1392. tst counterI, #4 // counterI = counterI / 2
  1393. ble .Lcgemm_kernel_L2_M2_BEGIN
  1394. .Lcgemm_kernel_L2_M4_20:
  1395. INIT4x2
  1396. mov pB, origPB
  1397. asr counterL , origK, #3 // counterL = counterL / 8
  1398. cmp counterL,#0
  1399. ble .Lcgemm_kernel_L2_M4_40
  1400. .align 5
  1401. .Lcgemm_kernel_L2_M4_22:
  1402. KERNEL4x2_SUB
  1403. KERNEL4x2_SUB
  1404. KERNEL4x2_SUB
  1405. KERNEL4x2_SUB
  1406. KERNEL4x2_SUB
  1407. KERNEL4x2_SUB
  1408. KERNEL4x2_SUB
  1409. KERNEL4x2_SUB
  1410. subs counterL, counterL, #1
  1411. bgt .Lcgemm_kernel_L2_M4_22
  1412. .Lcgemm_kernel_L2_M4_40:
  1413. ands counterL , origK, #7 // counterL = counterL % 8
  1414. ble .Lcgemm_kernel_L2_M4_100
  1415. .Lcgemm_kernel_L2_M4_42:
  1416. KERNEL4x2_SUB
  1417. subs counterL, counterL, #1
  1418. bgt .Lcgemm_kernel_L2_M4_42
  1419. .Lcgemm_kernel_L2_M4_100:
  1420. SAVE4x2
  1421. .Lcgemm_kernel_L2_M4_END:
  1422. .Lcgemm_kernel_L2_M2_BEGIN:
  1423. mov counterI, origM
  1424. tst counterI , #3
  1425. ble .Lcgemm_kernel_L2_END
  1426. tst counterI, #2 // counterI = counterI / 2
  1427. ble .Lcgemm_kernel_L2_M1_BEGIN
  1428. .Lcgemm_kernel_L2_M2_20:
  1429. INIT2x2
  1430. mov pB, origPB
  1431. asr counterL , origK, #3 // counterL = counterL / 8
  1432. cmp counterL,#0
  1433. ble .Lcgemm_kernel_L2_M2_40
  1434. .Lcgemm_kernel_L2_M2_22:
  1435. KERNEL2x2_SUB
  1436. KERNEL2x2_SUB
  1437. KERNEL2x2_SUB
  1438. KERNEL2x2_SUB
  1439. KERNEL2x2_SUB
  1440. KERNEL2x2_SUB
  1441. KERNEL2x2_SUB
  1442. KERNEL2x2_SUB
  1443. subs counterL, counterL, #1
  1444. bgt .Lcgemm_kernel_L2_M2_22
  1445. .Lcgemm_kernel_L2_M2_40:
  1446. ands counterL , origK, #7 // counterL = counterL % 8
  1447. ble .Lcgemm_kernel_L2_M2_100
  1448. .Lcgemm_kernel_L2_M2_42:
  1449. KERNEL2x2_SUB
  1450. subs counterL, counterL, #1
  1451. bgt .Lcgemm_kernel_L2_M2_42
  1452. .Lcgemm_kernel_L2_M2_100:
  1453. SAVE2x2
  1454. .Lcgemm_kernel_L2_M2_END:
  1455. .Lcgemm_kernel_L2_M1_BEGIN:
  1456. tst counterI, #1 // counterI = counterI % 2
  1457. ble .Lcgemm_kernel_L2_END
  1458. .Lcgemm_kernel_L2_M1_20:
  1459. INIT1x2
  1460. mov pB, origPB
  1461. asr counterL , origK, #3 // counterL = counterL / 8
  1462. cmp counterL, #0
  1463. ble .Lcgemm_kernel_L2_M1_40
  1464. .Lcgemm_kernel_L2_M1_22:
  1465. KERNEL1x2_SUB
  1466. KERNEL1x2_SUB
  1467. KERNEL1x2_SUB
  1468. KERNEL1x2_SUB
  1469. KERNEL1x2_SUB
  1470. KERNEL1x2_SUB
  1471. KERNEL1x2_SUB
  1472. KERNEL1x2_SUB
  1473. subs counterL, counterL, #1
  1474. bgt .Lcgemm_kernel_L2_M1_22
  1475. .Lcgemm_kernel_L2_M1_40:
  1476. ands counterL , origK, #7 // counterL = counterL % 8
  1477. ble .Lcgemm_kernel_L2_M1_100
  1478. .Lcgemm_kernel_L2_M1_42:
  1479. KERNEL1x2_SUB
  1480. subs counterL, counterL, #1
  1481. bgt .Lcgemm_kernel_L2_M1_42
  1482. .Lcgemm_kernel_L2_M1_100:
  1483. SAVE1x2
  1484. .Lcgemm_kernel_L2_END:
  1485. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1486. /******************************************************************************/
  1487. .Lcgemm_kernel_L1_BEGIN:
  1488. mov counterJ , origN
  1489. tst counterJ , #1
  1490. ble .Lcgemm_kernel_L999 // done
  1491. mov pCRow0, pC // pCRow0 = C
  1492. add pC , pC , LDC // Update pC to point to next
  1493. mov pA, origPA // pA = A
  1494. .Lcgemm_kernel_L1_M8_BEGIN:
  1495. mov counterI, origM
  1496. asr counterI, counterI, #3 // counterI = counterI / 8
  1497. cmp counterI, #0
  1498. ble .Lcgemm_kernel_L1_M4_BEGIN
  1499. .Lcgemm_kernel_L1_M8_20:
  1500. INIT8x1
  1501. mov pB, origPB
  1502. asr counterL , origK, #3 // counterL = counterL / 8
  1503. cmp counterL , #0
  1504. ble .Lcgemm_kernel_L1_M8_40
  1505. .align 5
  1506. .Lcgemm_kernel_L1_M8_22:
  1507. KERNEL8x1_SUB
  1508. KERNEL8x1_SUB
  1509. KERNEL8x1_SUB
  1510. KERNEL8x1_SUB
  1511. KERNEL8x1_SUB
  1512. KERNEL8x1_SUB
  1513. KERNEL8x1_SUB
  1514. KERNEL8x1_SUB
  1515. subs counterL, counterL, #1
  1516. bgt .Lcgemm_kernel_L1_M8_22
  1517. .Lcgemm_kernel_L1_M8_40:
  1518. ands counterL , origK, #7 // counterL = counterL % 8
  1519. ble .Lcgemm_kernel_L1_M8_100
  1520. .Lcgemm_kernel_L1_M8_42:
  1521. KERNEL8x1_SUB
  1522. subs counterL, counterL, #1
  1523. bgt .Lcgemm_kernel_L1_M8_42
  1524. .Lcgemm_kernel_L1_M8_100:
  1525. SAVE8x1
  1526. .Lcgemm_kernel_L1_M8_END:
  1527. subs counterI, counterI, #1
  1528. bgt .Lcgemm_kernel_L1_M8_20
  1529. .Lcgemm_kernel_L1_M4_BEGIN:
  1530. mov counterI, origM
  1531. tst counterI , #7
  1532. ble .Lcgemm_kernel_L1_END
  1533. tst counterI, #4 // counterI = counterI / 2
  1534. ble .Lcgemm_kernel_L1_M2_BEGIN
  1535. .Lcgemm_kernel_L1_M4_20:
  1536. INIT4x1
  1537. mov pB, origPB
  1538. asr counterL , origK, #3 // counterL = counterL / 8
  1539. cmp counterL , #0
  1540. ble .Lcgemm_kernel_L1_M4_40
  1541. .align 5
  1542. .Lcgemm_kernel_L1_M4_22:
  1543. KERNEL4x1_SUB
  1544. KERNEL4x1_SUB
  1545. KERNEL4x1_SUB
  1546. KERNEL4x1_SUB
  1547. KERNEL4x1_SUB
  1548. KERNEL4x1_SUB
  1549. KERNEL4x1_SUB
  1550. KERNEL4x1_SUB
  1551. subs counterL, counterL, #1
  1552. bgt .Lcgemm_kernel_L1_M4_22
  1553. .Lcgemm_kernel_L1_M4_40:
  1554. ands counterL , origK, #7 // counterL = counterL % 8
  1555. ble .Lcgemm_kernel_L1_M4_100
  1556. .Lcgemm_kernel_L1_M4_42:
  1557. KERNEL4x1_SUB
  1558. subs counterL, counterL, #1
  1559. bgt .Lcgemm_kernel_L1_M4_42
  1560. .Lcgemm_kernel_L1_M4_100:
  1561. SAVE4x1
  1562. .Lcgemm_kernel_L1_M4_END:
  1563. .Lcgemm_kernel_L1_M2_BEGIN:
  1564. mov counterI, origM
  1565. tst counterI , #3
  1566. ble .Lcgemm_kernel_L1_END
  1567. tst counterI, #2 // counterI = counterI / 2
  1568. ble .Lcgemm_kernel_L1_M1_BEGIN
  1569. .Lcgemm_kernel_L1_M2_20:
  1570. INIT2x1
  1571. mov pB, origPB
  1572. asr counterL , origK, #3 // counterL = counterL / 8
  1573. cmp counterL , #0
  1574. ble .Lcgemm_kernel_L1_M2_40
  1575. .Lcgemm_kernel_L1_M2_22:
  1576. KERNEL2x1_SUB
  1577. KERNEL2x1_SUB
  1578. KERNEL2x1_SUB
  1579. KERNEL2x1_SUB
  1580. KERNEL2x1_SUB
  1581. KERNEL2x1_SUB
  1582. KERNEL2x1_SUB
  1583. KERNEL2x1_SUB
  1584. subs counterL, counterL, #1
  1585. bgt .Lcgemm_kernel_L1_M2_22
  1586. .Lcgemm_kernel_L1_M2_40:
  1587. ands counterL , origK, #7 // counterL = counterL % 8
  1588. ble .Lcgemm_kernel_L1_M2_100
  1589. .Lcgemm_kernel_L1_M2_42:
  1590. KERNEL2x1_SUB
  1591. subs counterL, counterL, #1
  1592. bgt .Lcgemm_kernel_L1_M2_42
  1593. .Lcgemm_kernel_L1_M2_100:
  1594. SAVE2x1
  1595. .Lcgemm_kernel_L1_M2_END:
  1596. .Lcgemm_kernel_L1_M1_BEGIN:
  1597. tst counterI, #1 // counterI = counterI % 2
  1598. ble .Lcgemm_kernel_L1_END
  1599. .Lcgemm_kernel_L1_M1_20:
  1600. INIT1x1
  1601. mov pB, origPB
  1602. asr counterL , origK, #3 // counterL = counterL / 8
  1603. cmp counterL , #0
  1604. ble .Lcgemm_kernel_L1_M1_40
  1605. .Lcgemm_kernel_L1_M1_22:
  1606. KERNEL1x1_SUB
  1607. KERNEL1x1_SUB
  1608. KERNEL1x1_SUB
  1609. KERNEL1x1_SUB
  1610. KERNEL1x1_SUB
  1611. KERNEL1x1_SUB
  1612. KERNEL1x1_SUB
  1613. KERNEL1x1_SUB
  1614. subs counterL, counterL, #1
  1615. bgt .Lcgemm_kernel_L1_M1_22
  1616. .Lcgemm_kernel_L1_M1_40:
  1617. ands counterL , origK, #7 // counterL = counterL % 8
  1618. ble .Lcgemm_kernel_L1_M1_100
  1619. .Lcgemm_kernel_L1_M1_42:
  1620. KERNEL1x1_SUB
  1621. subs counterL, counterL, #1
  1622. bgt .Lcgemm_kernel_L1_M1_42
  1623. .Lcgemm_kernel_L1_M1_100:
  1624. SAVE1x1
  1625. .Lcgemm_kernel_L1_END:
  1626. .Lcgemm_kernel_L999:
  1627. mov x0, #0 // set return value
  1628. ldp d8, d9, [sp, #(0 * 16)]
  1629. ldp d10, d11, [sp, #(1 * 16)]
  1630. ldp d12, d13, [sp, #(2 * 16)]
  1631. ldp d14, d15, [sp, #(3 * 16)]
  1632. ldp d16, d17, [sp, #(4 * 16)]
  1633. ldp x18, x19, [sp, #(5 * 16)]
  1634. ldp x20, x21, [sp, #(6 * 16)]
  1635. ldp x22, x23, [sp, #(7 * 16)]
  1636. ldp x24, x25, [sp, #(8 * 16)]
  1637. ldp x26, x27, [sp, #(9 * 16)]
  1638. ldr x28, [sp, #(10 * 16)]
  1639. add sp, sp, #(11*16)
  1640. ret
  1641. EPILOGUE