  /* zgemm_kernel_4x4_thunderx2t99.S */
  /*******************************************************************************
  Copyright (c) 2015, The OpenBLAS Project
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *******************************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /* X0 X1 X2 s0 X3 x4 x5 x6 */
  /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  /* NOTE(review): the two comment lines above describe a single FLOAT alpha in
     s0, while this file keeps separate real and imaginary alpha values in
     alphaR and alphaI. Confirm the actual argument registers against the
     function prologue. */

  /* Incoming arguments. */
  #define origM x0
  #define origN x1
  #define origK x2
  #define origPA x3
  #define origPB x4
  #define pC x5
  #define LDC x6

  /* Scratch register and loop counters. */
  #define temp x7
  #define counterL x8
  #define counterI x9
  #define counterJ x10

  /* Running pointers into B, the four C vectors of the current tile, and A. */
  #define pB x11
  #define pCRow0 x12
  #define pCRow1 x13
  #define pCRow2 x14
  #define pCRow3 x15
  #define pA x16

  /* Alpha kept in general registers; the SAVE macros fmov these into
     alpha0_R / alpha0_I before use.
     NOTE(review): x18 is the platform register under AAPCS64 and is reserved
     on some operating systems. Confirm this kernel is only built where x18 is
     free, or move alphaI to a register the prologue saves. */
  #define alphaR x17
  #define alphaI x18

  /* Alpha as seen by the FP/SIMD unit: scalar view for fmov, lane view for the
     by-element fmla/fmls in the SAVE macros. These alias v10 and v11. */
  #define alpha0_R d10
  #define alphaV0_R v10.d[0]
  #define alpha0_I d11
  #define alphaV0_I v11.d[0]

  /* Prefetch distances in bytes, used as offsets from pA, pB and pCRowN. */
  #define A_PRE_SIZE 3584
  #define B_PRE_SIZE 512
  #define C_PRE_SIZE 128

  /* OP_xy selects fmla or fmls for each of the four partial products of a
     complex multiply, according to the variant macro the file is built with:
       OP_rr : A real * B real  -> real accumulator
       OP_ii : A imag * B imag  -> real accumulator
       OP_ri : A real * B imag  -> imaginary accumulator
       OP_ir : A imag * B real  -> imaginary accumulator
     If none of the variant macros is defined, OP_* stay undefined. */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define OP_rr fmla
  #define OP_ii fmls
  #define OP_ri fmla
  #define OP_ir fmla
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define OP_rr fmla
  #define OP_ii fmla
  #define OP_ri fmls
  #define OP_ir fmla
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define OP_rr fmla
  #define OP_ii fmla
  #define OP_ri fmla
  #define OP_ir fmls
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define OP_rr fmla
  #define OP_ii fmls
  #define OP_ri fmls
  #define OP_ir fmls
  #endif
  // 00 origM
  // 01 origN
  // 02 origK
  // 03 origPA
  // 04 origPB
  // 05 pC
  // 06 origLDC -> LDC
  // 07 offset -> temp
  // 08 counterL
  // 09 counterI
  // 10 counterJ
  // 11 pB
  // 12 pCRow0
  // 13 pCRow1
  // 14 pCRow2
  // 15 pCRow3
  // 16 pA
  // 17 alpha_save_R
  // 18 must save alpha_save_I
  // 19 must save
  // 20 must save
  // 21 must save
  // 22 must save
  // 23 must save
  // 24 must save
  // 25 must save
  // 26 must save
  // 27 must save
  // 28 must save
  // 29 frame
  // 30 link
  // 31 sp
  //v00 ALPHA_R -> pA00_R, pA01_R
  //v01 ALPHA_I -> pA00_I, pA01_I
  //v02 pA02_R, pA03_R
  //v03 pA02_I, pA03_I
  //v04 pA10_R, pA11_R
  //v05 pA10_I, pA11_I
  //v06 pA12_R, pA13_R
  //v07 pA12_I, pA13_I
  //v08 must save pB00_R, pB01_R
  //v09 must save pB00_I, pB01_I
  //v10 must save pB02_R, pB03_R OR ALPHA0_R
  //v11 must save pB02_I, pB03_I OR ALPHA0_I
  //v12 must save pB10_R, pB11_R
  //v13 must save pB10_I, pB11_I
  //v14 must save pB12_R, pB13_R OR ALPHA1_R
  //v15 must save pB12_I, pB13_I OR ALPHA1_I
  //v16 must save pC00_R, pC01_R
  //v17 must save pC00_I, pC01_I
  //v18 pC02_R, pC03_R
  //v19 pC02_I, pC03_I
  //v20 pC10_R, pC11_R
  //v21 pC10_I, pC11_I
  //v22 pC12_R, pC13_R
  //v23 pC12_I, pC13_I
  //v24 pC20_R, pC21_R
  //v25 pC20_I, pC21_I
  //v26 pC22_R, pC23_R
  //v27 pC22_I, pC23_I
  //v28 pC30_R, pC31_R
  //v29 pC30_I, pC31_I
  //v30 pC32_R, pC33_R
  //v31 pC32_I, pC33_I
  /*******************************************************************************
  * Macro definitions
  *******************************************************************************/

  // INIT4x4: clear the sixteen accumulators v16-v31 used by the 4x4 kernels.
  // d16 is zeroed from xzr and the remaining registers are copied from d16 or
  // d17. A scalar d-register write also clears the upper 64 bits of the vector
  // register, so each full 128-bit accumulator ends up zero.
  // Clobbers: v16-v31.
  .macro INIT4x4
        fmov    d16, xzr
        fmov    d17, d16
        fmov    d18, d17
        fmov    d19, d16
        fmov    d20, d17
        fmov    d21, d16
        fmov    d22, d17
        fmov    d23, d16
        fmov    d24, d17
        fmov    d25, d16
        fmov    d26, d17
        fmov    d27, d16
        fmov    d28, d17
        fmov    d29, d16
        fmov    d30, d17
        fmov    d31, d16
  .endm
  // KERNEL4x4_I: first step of the software-pipelined 4x4 loop.
  //
  // Loads four complex A elements (ld2 splits them into real parts v0/v2 and
  // imaginary parts v1/v3) and four complex B elements (q8-q11, real in d[0],
  // imaginary in d[1]). The accumulators v16-v31 are written with fmul rather
  // than accumulated into, so no INIT4x4 is required beforehand.
  // It also loads the next set of operands (v4-v7 and v12-v15), which
  // KERNEL4x4_M2 and KERNEL4x4_E consume.
  //
  // In the variants where OP_ri is fmls, the imaginary accumulator cannot be
  // started with a plain fmul, so it is cleared with eor and then fmls is used.
  //
  // Pointer effect: pA and pB each advance by 128 bytes.
  // Clobbers: v0-v31, pA, pB.
  .macro KERNEL4x4_I
        ldr     q8, [pB]
        ldr     q9, [pB, #16]
        add     pB, pB, #32
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 0
        fmul    v16.2d, v0.2d, v8.d[0]
        OP_ii   v16.2d, v1.2d, v8.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
        fmls    v17.2d, v0.2d, v8.d[1]
  #else
        fmul    v17.2d, v0.2d, v8.d[1]
  #endif
        OP_ir   v17.2d, v1.2d, v8.d[0]

        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 1
        fmul    v20.2d, v0.2d, v9.d[0]
        OP_ii   v20.2d, v1.2d, v9.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
        fmls    v21.2d, v0.2d, v9.d[1]
  #else
        fmul    v21.2d, v0.2d, v9.d[1]
  #endif
        OP_ir   v21.2d, v1.2d, v9.d[0]

        ldr     q10, [pB]
        ldr     q11, [pB, #16]
        add     pB, pB, #32

        // A elements 2-3 times B element 1
        fmul    v22.2d, v2.2d, v9.d[0]
        OP_ii   v22.2d, v3.2d, v9.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
        fmls    v23.2d, v2.2d, v9.d[1]
  #else
        fmul    v23.2d, v2.2d, v9.d[1]
  #endif
        OP_ir   v23.2d, v3.2d, v9.d[0]

        // Next-step B elements 0-1
        ldr     q12, [pB]
        ldr     q13, [pB, #16]
        add     pB, pB, #32

        // A elements 2-3 times B element 0
        fmul    v18.2d, v2.2d, v8.d[0]
        OP_ii   v18.2d, v3.2d, v8.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
        fmls    v19.2d, v2.2d, v8.d[1]
  #else
        fmul    v19.2d, v2.2d, v8.d[1]
  #endif
        OP_ir   v19.2d, v3.2d, v8.d[0]

        // Next-step A elements 0-1
        ld2     {v4.2d, v5.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 2
        fmul    v24.2d, v0.2d, v10.d[0]
        OP_ii   v24.2d, v1.2d, v10.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
        fmls    v25.2d, v0.2d, v10.d[1]
  #else
        fmul    v25.2d, v0.2d, v10.d[1]
  #endif
        OP_ir   v25.2d, v1.2d, v10.d[0]

        // Next-step A elements 2-3
        ld2     {v6.2d, v7.2d}, [pA]
        add     pA, pA, #32

        // A elements 2-3 times B element 2
        fmul    v26.2d, v2.2d, v10.d[0]
        OP_ii   v26.2d, v3.2d, v10.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
        fmls    v27.2d, v2.2d, v10.d[1]
  #else
        fmul    v27.2d, v2.2d, v10.d[1]
  #endif
        OP_ir   v27.2d, v3.2d, v10.d[0]

        // Next-step B elements 2-3
        ldr     q14, [pB]
        ldr     q15, [pB, #16]
        add     pB, pB, #32

        // A elements 0-1 times B element 3
        fmul    v28.2d, v0.2d, v11.d[0]
        OP_ii   v28.2d, v1.2d, v11.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
        fmls    v29.2d, v0.2d, v11.d[1]
  #else
        fmul    v29.2d, v0.2d, v11.d[1]
  #endif
        OP_ir   v29.2d, v1.2d, v11.d[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        // A elements 2-3 times B element 3
        fmul    v30.2d, v2.2d, v11.d[0]
        OP_ii   v30.2d, v3.2d, v11.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
        fmls    v31.2d, v2.2d, v11.d[1]
  #else
        fmul    v31.2d, v2.2d, v11.d[1]
  #endif
        OP_ir   v31.2d, v3.2d, v11.d[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  .endm
  // KERNEL4x4_M1: pipelined middle step, first half.
  //
  // Accumulates into v16-v31 using the A operands in v0-v3 and the B operands
  // in v8-v11, while loading the operands for the following step into v4-v7
  // (from pA) and v12-v15 (from pB). Prefetches ahead on the A stream.
  //
  // Pointer effect: pA and pB each advance by 64 bytes.
  // Clobbers: v4-v7, v12-v15, v16-v31, pA, pB.
  .macro KERNEL4x4_M1
        // A elements 0-1 times B element 0
        OP_rr   v16.2d, v0.2d, v8.d[0]
        OP_ii   v16.2d, v1.2d, v8.d[1]
        OP_ri   v17.2d, v0.2d, v8.d[1]
        OP_ir   v17.2d, v1.2d, v8.d[0]

        ldr     q12, [pB]
        ldr     q13, [pB, #16]
        add     pB, pB, #32

        // A elements 2-3 times B element 0
        OP_rr   v18.2d, v2.2d, v8.d[0]
        OP_ii   v18.2d, v3.2d, v8.d[1]
        OP_ri   v19.2d, v2.2d, v8.d[1]
        OP_ir   v19.2d, v3.2d, v8.d[0]

        ld2     {v4.2d, v5.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 1
        OP_rr   v20.2d, v0.2d, v9.d[0]
        OP_ii   v20.2d, v1.2d, v9.d[1]
        OP_ri   v21.2d, v0.2d, v9.d[1]
        OP_ir   v21.2d, v1.2d, v9.d[0]

        ld2     {v6.2d, v7.2d}, [pA]
        add     pA, pA, #32

        // A elements 2-3 times B element 1
        OP_rr   v22.2d, v2.2d, v9.d[0]
        OP_ii   v22.2d, v3.2d, v9.d[1]
        OP_ri   v23.2d, v2.2d, v9.d[1]
        OP_ir   v23.2d, v3.2d, v9.d[0]

        ldr     q14, [pB]
        ldr     q15, [pB, #16]
        add     pB, pB, #32

        // A elements 0-1 times B element 2
        OP_rr   v24.2d, v0.2d, v10.d[0]
        OP_ii   v24.2d, v1.2d, v10.d[1]
        OP_ri   v25.2d, v0.2d, v10.d[1]
        OP_ir   v25.2d, v1.2d, v10.d[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        // A elements 2-3 times B element 2
        OP_rr   v26.2d, v2.2d, v10.d[0]
        OP_ii   v26.2d, v3.2d, v10.d[1]
        OP_ri   v27.2d, v2.2d, v10.d[1]
        OP_ir   v27.2d, v3.2d, v10.d[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

        // A elements 0-1 times B element 3
        OP_rr   v28.2d, v0.2d, v11.d[0]
        OP_ii   v28.2d, v1.2d, v11.d[1]
        OP_ri   v29.2d, v0.2d, v11.d[1]
        OP_ir   v29.2d, v1.2d, v11.d[0]

        // A elements 2-3 times B element 3
        OP_rr   v30.2d, v2.2d, v11.d[0]
        OP_ii   v30.2d, v3.2d, v11.d[1]
        OP_ri   v31.2d, v2.2d, v11.d[1]
        OP_ir   v31.2d, v3.2d, v11.d[0]
  .endm
  // KERNEL4x4_M2: pipelined middle step, second half.
  //
  // Accumulates into v16-v31 using the A operands in v4-v7 and the B operands
  // in v12-v15, while loading the operands for the following step into v0-v3
  // (from pA) and v8-v11 (from pB). Prefetches ahead on the B stream.
  //
  // Pointer effect: pA and pB each advance by 64 bytes.
  // Clobbers: v0-v3, v8-v11, v16-v31, pA, pB.
  .macro KERNEL4x4_M2
        // A elements 0-1 times B element 0
        OP_rr   v16.2d, v4.2d, v12.d[0]
        OP_ii   v16.2d, v5.2d, v12.d[1]
        OP_ri   v17.2d, v4.2d, v12.d[1]
        OP_ir   v17.2d, v5.2d, v12.d[0]

        ldr     q8, [pB]
        ldr     q9, [pB, #16]
        add     pB, pB, #32

        // A elements 2-3 times B element 0
        OP_rr   v18.2d, v6.2d, v12.d[0]
        OP_ii   v18.2d, v7.2d, v12.d[1]
        OP_ri   v19.2d, v6.2d, v12.d[1]
        OP_ir   v19.2d, v7.2d, v12.d[0]

        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 1
        OP_rr   v20.2d, v4.2d, v13.d[0]
        OP_ii   v20.2d, v5.2d, v13.d[1]
        OP_ri   v21.2d, v4.2d, v13.d[1]
        OP_ir   v21.2d, v5.2d, v13.d[0]

        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32

        // A elements 2-3 times B element 1
        OP_rr   v22.2d, v6.2d, v13.d[0]
        OP_ii   v22.2d, v7.2d, v13.d[1]
        OP_ri   v23.2d, v6.2d, v13.d[1]
        OP_ir   v23.2d, v7.2d, v13.d[0]

        ldr     q10, [pB]
        ldr     q11, [pB, #16]
        add     pB, pB, #32

        // A elements 0-1 times B element 2
        OP_rr   v24.2d, v4.2d, v14.d[0]
        OP_ii   v24.2d, v5.2d, v14.d[1]
        OP_ri   v25.2d, v4.2d, v14.d[1]
        OP_ir   v25.2d, v5.2d, v14.d[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        // A elements 2-3 times B element 2
        OP_rr   v26.2d, v6.2d, v14.d[0]
        OP_ii   v26.2d, v7.2d, v14.d[1]
        OP_ri   v27.2d, v6.2d, v14.d[1]
        OP_ir   v27.2d, v7.2d, v14.d[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE+64]

        // A elements 0-1 times B element 3
        OP_rr   v28.2d, v4.2d, v15.d[0]
        OP_ii   v28.2d, v5.2d, v15.d[1]
        OP_ri   v29.2d, v4.2d, v15.d[1]
        OP_ir   v29.2d, v5.2d, v15.d[0]

        // A elements 2-3 times B element 3
        OP_rr   v30.2d, v6.2d, v15.d[0]
        OP_ii   v30.2d, v7.2d, v15.d[1]
        OP_ri   v31.2d, v6.2d, v15.d[1]
        OP_ir   v31.2d, v7.2d, v15.d[0]
  .endm
  // KERNEL4x4_E: final step of the software-pipelined 4x4 loop.
  //
  // Accumulates into v16-v31 using the operands already held in v4-v7 (A) and
  // v12-v15 (B). It issues no loads and does not move pA or pB; it only
  // prefetches ahead on the B stream.
  //
  // Clobbers: v16-v31.
  .macro KERNEL4x4_E
        // A elements 0-1 times B element 0
        OP_rr   v16.2d, v4.2d, v12.d[0]
        OP_ii   v16.2d, v5.2d, v12.d[1]
        OP_ri   v17.2d, v4.2d, v12.d[1]
        OP_ir   v17.2d, v5.2d, v12.d[0]

        // A elements 2-3 times B element 0
        OP_rr   v18.2d, v6.2d, v12.d[0]
        OP_ii   v18.2d, v7.2d, v12.d[1]
        OP_ri   v19.2d, v6.2d, v12.d[1]
        OP_ir   v19.2d, v7.2d, v12.d[0]

        // A elements 0-1 times B element 1
        OP_rr   v20.2d, v4.2d, v13.d[0]
        OP_ii   v20.2d, v5.2d, v13.d[1]
        OP_ri   v21.2d, v4.2d, v13.d[1]
        OP_ir   v21.2d, v5.2d, v13.d[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        // A elements 2-3 times B element 1
        OP_rr   v22.2d, v6.2d, v13.d[0]
        OP_ii   v22.2d, v7.2d, v13.d[1]
        OP_ri   v23.2d, v6.2d, v13.d[1]
        OP_ir   v23.2d, v7.2d, v13.d[0]

        // A elements 0-1 times B element 2
        OP_rr   v24.2d, v4.2d, v14.d[0]
        OP_ii   v24.2d, v5.2d, v14.d[1]
        OP_ri   v25.2d, v4.2d, v14.d[1]
        OP_ir   v25.2d, v5.2d, v14.d[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE+64]

        // A elements 2-3 times B element 2
        OP_rr   v26.2d, v6.2d, v14.d[0]
        OP_ii   v26.2d, v7.2d, v14.d[1]
        OP_ri   v27.2d, v6.2d, v14.d[1]
        OP_ir   v27.2d, v7.2d, v14.d[0]

        // A elements 0-1 times B element 3
        OP_rr   v28.2d, v4.2d, v15.d[0]
        OP_ii   v28.2d, v5.2d, v15.d[1]
        OP_ri   v29.2d, v4.2d, v15.d[1]
        OP_ir   v29.2d, v5.2d, v15.d[0]

        // A elements 2-3 times B element 3
        OP_rr   v30.2d, v6.2d, v15.d[0]
        OP_ii   v30.2d, v7.2d, v15.d[1]
        OP_ri   v31.2d, v6.2d, v15.d[1]
        OP_ir   v31.2d, v7.2d, v15.d[0]
  .endm
  // KERNEL4x4_SUB: one self-contained (non-pipelined) 4x4 step.
  //
  // Loads four complex A elements into v0-v3 and four complex B elements into
  // q8-q11, then accumulates all sixteen products into v16-v31. Because every
  // product is accumulated, v16-v31 must hold valid values on entry (for
  // example after INIT4x4).
  //
  // Pointer effect: pA and pB each advance by 64 bytes.
  // Clobbers: v0-v3, v8-v11, v16-v31, pA, pB.
  .macro KERNEL4x4_SUB
        ldr     q8, [pB]
        ldr     q9, [pB, #16]
        add     pB, pB, #32
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 0
        OP_rr   v16.2d, v0.2d, v8.d[0]
        OP_ii   v16.2d, v1.2d, v8.d[1]
        OP_ri   v17.2d, v0.2d, v8.d[1]
        OP_ir   v17.2d, v1.2d, v8.d[0]

        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 1
        OP_rr   v20.2d, v0.2d, v9.d[0]
        OP_ii   v20.2d, v1.2d, v9.d[1]
        OP_ri   v21.2d, v0.2d, v9.d[1]
        OP_ir   v21.2d, v1.2d, v9.d[0]

        ldr     q10, [pB]
        ldr     q11, [pB, #16]
        add     pB, pB, #32

        // A elements 2-3 times B element 0
        OP_rr   v18.2d, v2.2d, v8.d[0]
        OP_ii   v18.2d, v3.2d, v8.d[1]
        OP_ri   v19.2d, v2.2d, v8.d[1]
        OP_ir   v19.2d, v3.2d, v8.d[0]

        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

        // A elements 2-3 times B element 1
        OP_rr   v22.2d, v2.2d, v9.d[0]
        OP_ii   v22.2d, v3.2d, v9.d[1]
        OP_ri   v23.2d, v2.2d, v9.d[1]
        OP_ir   v23.2d, v3.2d, v9.d[0]

        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

        // A elements 0-1 times B element 2
        OP_rr   v24.2d, v0.2d, v10.d[0]
        OP_ii   v24.2d, v1.2d, v10.d[1]
        OP_ri   v25.2d, v0.2d, v10.d[1]
        OP_ir   v25.2d, v1.2d, v10.d[0]

        // A elements 2-3 times B element 2
        OP_rr   v26.2d, v2.2d, v10.d[0]
        OP_ii   v26.2d, v3.2d, v10.d[1]
        OP_ri   v27.2d, v2.2d, v10.d[1]
        OP_ir   v27.2d, v3.2d, v10.d[0]

        // A elements 0-1 times B element 3
        OP_rr   v28.2d, v0.2d, v11.d[0]
        OP_ii   v28.2d, v1.2d, v11.d[1]
        OP_ri   v29.2d, v0.2d, v11.d[1]
        OP_ir   v29.2d, v1.2d, v11.d[0]

        // A elements 2-3 times B element 3
        OP_rr   v30.2d, v2.2d, v11.d[0]
        OP_ii   v30.2d, v3.2d, v11.d[1]
        OP_ri   v31.2d, v2.2d, v11.d[1]
        OP_ir   v31.2d, v3.2d, v11.d[0]
  .endm
  // SAVE4x4: fold the 4x4 accumulators into C, scaled by the complex alpha.
  //
  // For every pair of C elements (ld2 splits real and imaginary parts):
  //     C_real += acc_real * alpha_R - acc_imag * alpha_I
  //     C_imag += acc_real * alpha_I + acc_imag * alpha_R
  // Accumulators v16-v19 go to pCRow0, v20-v23 to pCRow1, v24-v27 to pCRow2
  // and v28-v31 to pCRow3.
  //
  // Pointer effect: pCRow0..pCRow3 each advance by 64 bytes.
  // Clobbers: v0-v7, v10, v11 (alpha0_R / alpha0_I), pCRow0-pCRow3.
  .macro SAVE4x4
        // Move alpha from general registers into lanes usable by fmla/fmls.
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]

        ld2     {v0.2d, v1.2d}, [pCRow0]
        fmla    v0.2d, v16.2d, alphaV0_R
        fmls    v0.2d, v17.2d, alphaV0_I
        fmla    v1.2d, v16.2d, alphaV0_I
        fmla    v1.2d, v17.2d, alphaV0_R
        st2     {v0.2d, v1.2d}, [pCRow0]

        add     pCRow0, pCRow0, #32

        ld2     {v2.2d, v3.2d}, [pCRow0]
        fmla    v2.2d, v18.2d, alphaV0_R
        fmls    v2.2d, v19.2d, alphaV0_I
        fmla    v3.2d, v18.2d, alphaV0_I
        fmla    v3.2d, v19.2d, alphaV0_R
        st2     {v2.2d, v3.2d}, [pCRow0]

        add     pCRow0, pCRow0, #32
        prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]

        ld2     {v4.2d, v5.2d}, [pCRow1]
        fmla    v4.2d, v20.2d, alphaV0_R
        fmls    v4.2d, v21.2d, alphaV0_I
        fmla    v5.2d, v20.2d, alphaV0_I
        fmla    v5.2d, v21.2d, alphaV0_R
        st2     {v4.2d, v5.2d}, [pCRow1]

        add     pCRow1, pCRow1, #32

        ld2     {v6.2d, v7.2d}, [pCRow1]
        fmla    v6.2d, v22.2d, alphaV0_R
        fmls    v6.2d, v23.2d, alphaV0_I
        fmla    v7.2d, v22.2d, alphaV0_I
        fmla    v7.2d, v23.2d, alphaV0_R
        st2     {v6.2d, v7.2d}, [pCRow1]

        add     pCRow1, pCRow1, #32
        prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]

        ld2     {v0.2d, v1.2d}, [pCRow2]
        fmla    v0.2d, v24.2d, alphaV0_R
        fmls    v0.2d, v25.2d, alphaV0_I
        fmla    v1.2d, v24.2d, alphaV0_I
        fmla    v1.2d, v25.2d, alphaV0_R
        st2     {v0.2d, v1.2d}, [pCRow2]

        add     pCRow2, pCRow2, #32

        ld2     {v2.2d, v3.2d}, [pCRow2]
        fmla    v2.2d, v26.2d, alphaV0_R
        fmls    v2.2d, v27.2d, alphaV0_I
        fmla    v3.2d, v26.2d, alphaV0_I
        fmla    v3.2d, v27.2d, alphaV0_R
        st2     {v2.2d, v3.2d}, [pCRow2]

        add     pCRow2, pCRow2, #32
        prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]

        ld2     {v4.2d, v5.2d}, [pCRow3]
        fmla    v4.2d, v28.2d, alphaV0_R
        fmls    v4.2d, v29.2d, alphaV0_I
        fmla    v5.2d, v28.2d, alphaV0_I
        fmla    v5.2d, v29.2d, alphaV0_R
        st2     {v4.2d, v5.2d}, [pCRow3]

        add     pCRow3, pCRow3, #32

        ld2     {v6.2d, v7.2d}, [pCRow3]
        fmla    v6.2d, v30.2d, alphaV0_R
        fmls    v6.2d, v31.2d, alphaV0_I
        fmla    v7.2d, v30.2d, alphaV0_I
        fmla    v7.2d, v31.2d, alphaV0_R
        st2     {v6.2d, v7.2d}, [pCRow3]

        add     pCRow3, pCRow3, #32
  .endm
  /******************************************************************************/

  // INIT2x4: clear the eight accumulators used by KERNEL2x4_SUB
  // (v16/v17, v20/v21, v24/v25, v28/v29).
  // Clobbers: v16, v17, v20, v21, v24, v25, v28, v29.
  .macro INIT2x4
        fmov    d16, xzr
        fmov    d17, xzr
        fmov    d20, d16
        fmov    d21, d17
        fmov    d24, d16
        fmov    d25, d17
        fmov    d28, d16
        fmov    d29, d17
  .endm
  // KERNEL2x4_SUB: one step for two A elements against four B elements.
  //
  // B is loaded with ld2, so v8/v10 hold real parts and v9/v11 hold imaginary
  // parts (two B elements per register pair). A is loaded the same way into
  // v0 (real) and v1 (imaginary). Products are accumulated, so the
  // accumulators must be initialised first (for example with INIT2x4).
  //
  // Pointer effect: pB advances by 64 bytes, pA by 32 bytes.
  // Clobbers: v0, v1, v8-v11, v16, v17, v20, v21, v24, v25, v28, v29, pA, pB.
  .macro KERNEL2x4_SUB
        ld2     {v8.2d, v9.2d}, [pB]
        add     pB, pB, #32
        ld2     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32

        // B element 0
        OP_rr   v16.2d, v0.2d, v8.d[0]
        OP_ii   v16.2d, v1.2d, v9.d[0]
        OP_ri   v17.2d, v0.2d, v9.d[0]
        OP_ir   v17.2d, v1.2d, v8.d[0]

        // B element 1
        OP_rr   v20.2d, v0.2d, v8.d[1]
        OP_ii   v20.2d, v1.2d, v9.d[1]
        OP_ri   v21.2d, v0.2d, v9.d[1]
        OP_ir   v21.2d, v1.2d, v8.d[1]

        // B element 2
        OP_rr   v24.2d, v0.2d, v10.d[0]
        OP_ii   v24.2d, v1.2d, v11.d[0]
        OP_ri   v25.2d, v0.2d, v11.d[0]
        OP_ir   v25.2d, v1.2d, v10.d[0]

        // B element 3
        OP_rr   v28.2d, v0.2d, v10.d[1]
        OP_ii   v28.2d, v1.2d, v11.d[1]
        OP_ri   v29.2d, v0.2d, v11.d[1]
        OP_ir   v29.2d, v1.2d, v10.d[1]
  .endm
  // SAVE2x4: fold the 2x4 accumulators into C, scaled by the complex alpha.
  //
  // Starts at pCRow0 and steps through four C vectors by adding LDC to a
  // working pointer (pCRow1). LDC is added unscaled, so it is used here as a
  // byte stride. For each pair of C elements:
  //     C_real += acc_real * alpha_R - acc_imag * alpha_I
  //     C_imag += acc_real * alpha_I + acc_imag * alpha_R
  //
  // Pointer effect: pCRow0 advances by 32 bytes.
  // Clobbers: v0, v1, v4, v5, v10, v11 (alpha0_R / alpha0_I), pCRow1.
  .macro SAVE2x4
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        mov     pCRow1, pCRow0

        ld2     {v0.2d, v1.2d}, [pCRow1]
        fmla    v0.2d, v16.2d, alphaV0_R
        fmls    v0.2d, v17.2d, alphaV0_I
        fmla    v1.2d, v16.2d, alphaV0_I
        fmla    v1.2d, v17.2d, alphaV0_R
        st2     {v0.2d, v1.2d}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.2d, v5.2d}, [pCRow1]
        fmla    v4.2d, v20.2d, alphaV0_R
        fmls    v4.2d, v21.2d, alphaV0_I
        fmla    v5.2d, v20.2d, alphaV0_I
        fmla    v5.2d, v21.2d, alphaV0_R
        st2     {v4.2d, v5.2d}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v0.2d, v1.2d}, [pCRow1]
        fmla    v0.2d, v24.2d, alphaV0_R
        fmls    v0.2d, v25.2d, alphaV0_I
        fmla    v1.2d, v24.2d, alphaV0_I
        fmla    v1.2d, v25.2d, alphaV0_R
        st2     {v0.2d, v1.2d}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.2d, v5.2d}, [pCRow1]
        fmla    v4.2d, v28.2d, alphaV0_R
        fmls    v4.2d, v29.2d, alphaV0_I
        fmla    v5.2d, v28.2d, alphaV0_I
        fmla    v5.2d, v29.2d, alphaV0_R
        st2     {v4.2d, v5.2d}, [pCRow1]

        add     pCRow0, pCRow0, #32
  .endm
  /******************************************************************************/

  // INIT1x4: clear the eight accumulators used by KERNEL1x4_SUB
  // (d16/d17, d20/d21, d24/d25, d28/d29).
  // Clobbers: v16, v17, v20, v21, v24, v25, v28, v29.
  .macro INIT1x4
        fmov    d16, xzr
        fmov    d17, xzr
        fmov    d20, d16
        fmov    d21, d17
        fmov    d24, d16
        fmov    d25, d17
        fmov    d28, d16
        fmov    d29, d17
  .endm
  // KERNEL1x4_SUB: one step for a single A element against four B elements.
  //
  // B is loaded with ld2 (real parts in v8/v10, imaginary parts in v9/v11).
  // The single A element is loaded into lane 0 of v0 (real) and v1 (imaginary)
  // and all arithmetic is scalar on d registers. Products are accumulated, so
  // the accumulators must be initialised first (for example with INIT1x4).
  //
  // Pointer effect: pB advances by 64 bytes, pA by 16 bytes.
  // Clobbers: v0, v1, v8-v11, v16, v17, v20, v21, v24, v25, v28, v29, pA, pB.
  .macro KERNEL1x4_SUB
        ld2     {v8.2d, v9.2d}, [pB]
        add     pB, pB, #32
        ld2     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
        ld2     {v0.d, v1.d}[0], [pA]
        add     pA, pA, #16

        // B element 0
        OP_rr   d16, d0, v8.d[0]
        OP_ii   d16, d1, v9.d[0]
        OP_ri   d17, d0, v9.d[0]
        OP_ir   d17, d1, v8.d[0]

        // B element 1
        OP_rr   d20, d0, v8.d[1]
        OP_ii   d20, d1, v9.d[1]
        OP_ri   d21, d0, v9.d[1]
        OP_ir   d21, d1, v8.d[1]

        // B element 2
        OP_rr   d24, d0, v10.d[0]
        OP_ii   d24, d1, v11.d[0]
        OP_ri   d25, d0, v11.d[0]
        OP_ir   d25, d1, v10.d[0]

        // B element 3
        OP_rr   d28, d0, v10.d[1]
        OP_ii   d28, d1, v11.d[1]
        OP_ri   d29, d0, v11.d[1]
        OP_ir   d29, d1, v10.d[1]
  .endm
  // SAVE1x4: fold the 1x4 accumulators into C, scaled by the complex alpha.
  //
  // Starts at pCRow0 and steps through four C vectors by adding LDC (used
  // unscaled, as a byte stride) to the working pointer pCRow1. One complex
  // element is updated per vector, using scalar arithmetic:
  //     C_real += acc_real * alpha_R - acc_imag * alpha_I
  //     C_imag += acc_real * alpha_I + acc_imag * alpha_R
  //
  // Pointer effect: pCRow0 advances by 16 bytes.
  // Clobbers: v0, v1, v4, v5, v10, v11 (alpha0_R / alpha0_I), pCRow1.
  .macro SAVE1x4
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        mov     pCRow1, pCRow0

        ld2     {v0.d, v1.d}[0], [pCRow1]
        fmla    d0, d16, alphaV0_R
        fmls    d0, d17, alphaV0_I
        fmla    d1, d16, alphaV0_I
        fmla    d1, d17, alphaV0_R
        st2     {v0.d, v1.d}[0], [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.d, v5.d}[0], [pCRow1]
        fmla    d4, d20, alphaV0_R
        fmls    d4, d21, alphaV0_I
        fmla    d5, d20, alphaV0_I
        fmla    d5, d21, alphaV0_R
        st2     {v4.d, v5.d}[0], [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v0.d, v1.d}[0], [pCRow1]
        fmla    d0, d24, alphaV0_R
        fmls    d0, d25, alphaV0_I
        fmla    d1, d24, alphaV0_I
        fmla    d1, d25, alphaV0_R
        st2     {v0.d, v1.d}[0], [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.d, v5.d}[0], [pCRow1]
        fmla    d4, d28, alphaV0_R
        fmls    d4, d29, alphaV0_I
        fmla    d5, d28, alphaV0_I
        fmla    d5, d29, alphaV0_R
        st2     {v4.d, v5.d}[0], [pCRow1]

        add     pCRow0, pCRow0, #16
  .endm
  /******************************************************************************/

  // INIT4x2: clear the eight accumulators v16-v23 used by KERNEL4x2_SUB.
  // Clobbers: v16-v23.
  .macro INIT4x2
        fmov    d16, xzr
        fmov    d17, xzr
        fmov    d18, d16
        fmov    d19, d17
        fmov    d20, d16
        fmov    d21, d17
        fmov    d22, d16
        fmov    d23, d17
  .endm
  // KERNEL4x2_SUB: one step for four A elements against two B elements.
  //
  // B is loaded with ld2 into v8 (real parts) and v9 (imaginary parts). A is
  // loaded with ld2 into v0/v2 (real parts) and v1/v3 (imaginary parts).
  // Products are accumulated into v16-v23, so the accumulators must be
  // initialised first (for example with INIT4x2).
  //
  // Pointer effect: pB advances by 32 bytes, pA by 64 bytes.
  // Clobbers: v0-v3, v8, v9, v16-v23, pA, pB.
  .macro KERNEL4x2_SUB
        ld2     {v8.2d, v9.2d}, [pB]
        add     pB, pB, #32
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32

        // A elements 0-1 times B element 0
        OP_rr   v16.2d, v0.2d, v8.d[0]
        OP_ii   v16.2d, v1.2d, v9.d[0]
        OP_ri   v17.2d, v0.2d, v9.d[0]
        OP_ir   v17.2d, v1.2d, v8.d[0]

        // A elements 2-3 times B element 0
        OP_rr   v18.2d, v2.2d, v8.d[0]
        OP_ii   v18.2d, v3.2d, v9.d[0]
        OP_ri   v19.2d, v2.2d, v9.d[0]
        OP_ir   v19.2d, v3.2d, v8.d[0]

        // A elements 0-1 times B element 1
        OP_rr   v20.2d, v0.2d, v8.d[1]
        OP_ii   v20.2d, v1.2d, v9.d[1]
        OP_ri   v21.2d, v0.2d, v9.d[1]
        OP_ir   v21.2d, v1.2d, v8.d[1]

        // A elements 2-3 times B element 1
        OP_rr   v22.2d, v2.2d, v8.d[1]
        OP_ii   v22.2d, v3.2d, v9.d[1]
        OP_ri   v23.2d, v2.2d, v9.d[1]
        OP_ir   v23.2d, v3.2d, v8.d[1]
  .endm
  // SAVE4x2: fold the 4x2 accumulators into C, scaled by the complex alpha.
  //
  // Two C vectors are updated: the one at pCRow0 and the one LDC bytes after
  // it (LDC is added unscaled). Each vector receives four complex elements,
  // handled as two ld2/st2 groups 32 bytes apart:
  //     C_real += acc_real * alpha_R - acc_imag * alpha_I
  //     C_imag += acc_real * alpha_I + acc_imag * alpha_R
  //
  // Pointer effect: pCRow0 advances by 64 bytes.
  // Clobbers: v0-v7, v10, v11 (alpha0_R / alpha0_I), pCRow1, pCRow2.
  .macro SAVE4x2
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        mov     pCRow1, pCRow0

        ld2     {v0.2d, v1.2d}, [pCRow1]
        fmla    v0.2d, v16.2d, alphaV0_R
        fmls    v0.2d, v17.2d, alphaV0_I
        fmla    v1.2d, v16.2d, alphaV0_I
        fmla    v1.2d, v17.2d, alphaV0_R
        st2     {v0.2d, v1.2d}, [pCRow1]

        // pCRow2 is used as a temporary for the second half of the vector.
        add     pCRow2, pCRow1, #32

        ld2     {v2.2d, v3.2d}, [pCRow2]
        fmla    v2.2d, v18.2d, alphaV0_R
        fmls    v2.2d, v19.2d, alphaV0_I
        fmla    v3.2d, v18.2d, alphaV0_I
        fmla    v3.2d, v19.2d, alphaV0_R
        st2     {v2.2d, v3.2d}, [pCRow2]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.2d, v5.2d}, [pCRow1]
        fmla    v4.2d, v20.2d, alphaV0_R
        fmls    v4.2d, v21.2d, alphaV0_I
        fmla    v5.2d, v20.2d, alphaV0_I
        fmla    v5.2d, v21.2d, alphaV0_R
        st2     {v4.2d, v5.2d}, [pCRow1]

        add     pCRow2, pCRow1, #32

        ld2     {v6.2d, v7.2d}, [pCRow2]
        fmla    v6.2d, v22.2d, alphaV0_R
        fmls    v6.2d, v23.2d, alphaV0_I
        fmla    v7.2d, v22.2d, alphaV0_I
        fmla    v7.2d, v23.2d, alphaV0_R
        st2     {v6.2d, v7.2d}, [pCRow2]

        add     pCRow0, pCRow0, #64
  .endm
  /******************************************************************************/

  // INIT2x2: clear the four accumulators v16, v17, v20, v21 used by
  // KERNEL2x2_SUB.
  // Clobbers: v16, v17, v20, v21.
  .macro INIT2x2
        fmov    d16, xzr
        fmov    d17, xzr
        fmov    d20, d16
        fmov    d21, d17
  .endm
  // KERNEL2x2_SUB: one step for two A elements against two B elements.
  //
  // Both operands are loaded with ld2: v8/v0 receive real parts and v9/v1
  // receive imaginary parts. Products are accumulated, so the accumulators
  // must be initialised first (for example with INIT2x2).
  //
  // Pointer effect: pA and pB each advance by 32 bytes.
  // Clobbers: v0, v1, v8, v9, v16, v17, v20, v21, pA, pB.
  .macro KERNEL2x2_SUB
        ld2     {v8.2d, v9.2d}, [pB]
        add     pB, pB, #32
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32

        // B element 0
        OP_rr   v16.2d, v0.2d, v8.d[0]
        OP_ii   v16.2d, v1.2d, v9.d[0]
        OP_ri   v17.2d, v0.2d, v9.d[0]
        OP_ir   v17.2d, v1.2d, v8.d[0]

        // B element 1
        OP_rr   v20.2d, v0.2d, v8.d[1]
        OP_ii   v20.2d, v1.2d, v9.d[1]
        OP_ri   v21.2d, v0.2d, v9.d[1]
        OP_ir   v21.2d, v1.2d, v8.d[1]
  .endm
  // SAVE2x2: fold the 2x2 accumulators into C, scaled by the complex alpha.
  //
  // Updates two complex elements at pCRow0 and two more LDC bytes after it
  // (LDC is added unscaled):
  //     C_real += acc_real * alpha_R - acc_imag * alpha_I
  //     C_imag += acc_real * alpha_I + acc_imag * alpha_R
  //
  // Pointer effect: pCRow0 advances by 32 bytes.
  // Clobbers: v0, v1, v4, v5, v10, v11 (alpha0_R / alpha0_I), pCRow1.
  .macro SAVE2x2
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        mov     pCRow1, pCRow0

        ld2     {v0.2d, v1.2d}, [pCRow1]
        fmla    v0.2d, v16.2d, alphaV0_R
        fmls    v0.2d, v17.2d, alphaV0_I
        fmla    v1.2d, v16.2d, alphaV0_I
        fmla    v1.2d, v17.2d, alphaV0_R
        st2     {v0.2d, v1.2d}, [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.2d, v5.2d}, [pCRow1]
        fmla    v4.2d, v20.2d, alphaV0_R
        fmls    v4.2d, v21.2d, alphaV0_I
        fmla    v5.2d, v20.2d, alphaV0_I
        fmla    v5.2d, v21.2d, alphaV0_R
        st2     {v4.2d, v5.2d}, [pCRow1]

        add     pCRow0, pCRow0, #32
  .endm
  /******************************************************************************/

  // INIT1x2: clear the four accumulators d16, d17, d20, d21 used by
  // KERNEL1x2_SUB. Each one is zeroed directly from xzr.
  // Clobbers: v16, v17, v20, v21.
  .macro INIT1x2
        fmov    d16, xzr
        fmov    d17, xzr
        fmov    d20, xzr
        fmov    d21, xzr
  .endm
  // KERNEL1x2_SUB: one step for a single A element against two B elements.
  //
  // B is loaded with ld2 into v8 (real parts) and v9 (imaginary parts). The
  // single A element goes to lane 0 of v0 (real) and v1 (imaginary), and all
  // arithmetic is scalar. Products are accumulated, so the accumulators must
  // be initialised first (for example with INIT1x2).
  //
  // Pointer effect: pB advances by 32 bytes, pA by 16 bytes.
  // Clobbers: v0, v1, v8, v9, v16, v17, v20, v21, pA, pB.
  .macro KERNEL1x2_SUB
        ld2     {v8.2d, v9.2d}, [pB]
        add     pB, pB, #32
        ld2     {v0.d, v1.d}[0], [pA]
        add     pA, pA, #16

        // B element 0
        OP_rr   d16, d0, v8.d[0]
        OP_ii   d16, d1, v9.d[0]
        OP_ri   d17, d0, v9.d[0]
        OP_ir   d17, d1, v8.d[0]

        // B element 1
        OP_rr   d20, d0, v8.d[1]
        OP_ii   d20, d1, v9.d[1]
        OP_ri   d21, d0, v9.d[1]
        OP_ir   d21, d1, v8.d[1]
  .endm
  // SAVE1x2: fold the 1x2 accumulators into C, scaled by the complex alpha.
  //
  // Updates one complex element at pCRow0 and one more LDC bytes after it
  // (LDC is added unscaled), using scalar arithmetic:
  //     C_real += acc_real * alpha_R - acc_imag * alpha_I
  //     C_imag += acc_real * alpha_I + acc_imag * alpha_R
  //
  // Pointer effect: pCRow0 advances by 16 bytes.
  // Clobbers: v0, v1, v4, v5, v10, v11 (alpha0_R / alpha0_I), pCRow1.
  .macro SAVE1x2
        fmov    alpha0_R, alphaR
        fmov    alpha0_I, alphaI

        mov     pCRow1, pCRow0

        ld2     {v0.d, v1.d}[0], [pCRow1]
        fmla    d0, d16, alphaV0_R
        fmls    d0, d17, alphaV0_I
        fmla    d1, d16, alphaV0_I
        fmla    d1, d17, alphaV0_R
        st2     {v0.d, v1.d}[0], [pCRow1]

        add     pCRow1, pCRow1, LDC

        ld2     {v4.d, v5.d}[0], [pCRow1]
        fmla    d4, d20, alphaV0_R
        fmls    d4, d21, alphaV0_I
        fmla    d5, d20, alphaV0_I
        fmla    d5, d21, alphaV0_R
        st2     {v4.d, v5.d}[0], [pCRow1]

        add     pCRow0, pCRow0, #16
  .endm
  789. /******************************************************************************/
  // INIT4x1: clear accumulators v16-v19 used by KERNEL4x1_SUB / SAVE4x1.
  // d16 is zeroed from xzr and the rest are copied from already-zeroed registers;
  // a scalar write to dN also clears the upper half of vN, so the full vectors are zero.
  790. .macro INIT4x1
  791. fmov d16, xzr
  792. fmov d17, d16
  793. fmov d18, d16
  794. fmov d19, d17
  795. .endm
  // KERNEL4x1_SUB: one K-step of the 4x1 tile (four elements from pA, one from pB).
  // Accumulators: v16/v17 for the first two A elements, v18/v19 for the last two.
  // OP_rr/OP_ii/OP_ri/OP_ir are defined outside this excerpt - presumably
  // fmla/fmls chosen per conjugation variant; TODO confirm.
  // Side effects: pA += 64, pB += 16; overwrites v0-v3 and lane 0 of v8/v9.
  796. .macro KERNEL4x1_SUB
  797. ld2 {v8.d, v9.d}[0], [pB] // two doubles: first -> v8 lane 0, second -> v9 lane 0
  798. add pB, pB, #16
  799. ld2 {v0.2d, v1.2d}, [pA] // de-interleave 4 doubles: even-indexed -> v0, odd-indexed -> v1
  800. add pA, pA, #32
  801. ld2 {v2.2d, v3.2d}, [pA] // next 4 doubles, de-interleaved into v2/v3
  802. add pA, pA, #32
  803. OP_rr v16.2d, v0.2d, v8.d[0] // v16 <- op(v0 * v8[0])
  804. OP_ii v16.2d, v1.2d, v9.d[0] // v16 <- op(v1 * v9[0])
  805. OP_ri v17.2d, v0.2d, v9.d[0] // v17 <- op(v0 * v9[0])
  806. OP_ir v17.2d, v1.2d, v8.d[0] // v17 <- op(v1 * v8[0])
  807. OP_rr v18.2d, v2.2d, v8.d[0] // same four products for the second A pair
  808. OP_ii v18.2d, v3.2d, v9.d[0]
  809. OP_ri v19.2d, v2.2d, v9.d[0]
  810. OP_ir v19.2d, v3.2d, v8.d[0]
  811. .endm
  // SAVE4x1: add the alpha-scaled 4x1 accumulators into a single column of C.
  // v16/v17 update the 32 bytes at [pCRow0]; v18/v19 update the next 32 bytes.
  // With (x, y) the de-interleaved halves loaded from C:
  //   x += acc_a * alphaV0_R - acc_b * alphaV0_I
  //   y += acc_a * alphaV0_I + acc_b * alphaV0_R
  // alphaV0_R/alphaV0_I are aliases defined outside this excerpt - presumably the
  // lane-0 vector views of alpha0_R/alpha0_I set below; TODO confirm.
  // Side effects: pCRow0 += 64; overwrites pCRow1, pCRow2, v0-v3, alpha0_R, alpha0_I.
  812. .macro SAVE4x1
  813. fmov alpha0_R, alphaR // refresh the working copies of alpha
  814. fmov alpha0_I, alphaI
  815. mov pCRow1, pCRow0
  816. ld2 {v0.2d, v1.2d}, [pCRow1] // de-interleave the first two C elements
  817. fmla v0.2d, v16.2d, alphaV0_R
  818. fmls v0.2d, v17.2d, alphaV0_I
  819. fmla v1.2d, v16.2d, alphaV0_I
  820. fmla v1.2d, v17.2d, alphaV0_R
  821. st2 {v0.2d, v1.2d}, [pCRow1]
  822. add pCRow2, pCRow1, #32 // pCRow2 serves as a second pointer within the same column here
  823. ld2 {v2.2d, v3.2d}, [pCRow2]
  824. fmla v2.2d, v18.2d, alphaV0_R
  825. fmls v2.2d, v19.2d, alphaV0_I
  826. fmla v3.2d, v18.2d, alphaV0_I
  827. fmla v3.2d, v19.2d, alphaV0_R
  828. st2 {v2.2d, v3.2d}, [pCRow2]
  829. add pCRow0, pCRow0, #64 // advance the tile start by the 64 bytes written
  830. .endm
  831. /******************************************************************************/
  // INIT2x1: clear accumulators v16/v17 used by KERNEL2x1_SUB / SAVE2x1.
  // Writing dN from xzr also clears the upper half of vN.
  832. .macro INIT2x1
  833. fmov d16, xzr
  834. fmov d17, xzr
  835. .endm
  // KERNEL2x1_SUB: one K-step of the 2x1 tile (two elements from pA, one from pB).
  // Accumulates into v16/v17. OP_rr/OP_ii/OP_ri/OP_ir are defined outside this
  // excerpt - presumably fmla/fmls chosen per conjugation variant; TODO confirm.
  // Side effects: pA += 32, pB += 16; overwrites v0, v1 and lane 0 of v8/v9.
  836. .macro KERNEL2x1_SUB
  837. ld2 {v8.d, v9.d}[0], [pB] // two doubles: first -> v8 lane 0, second -> v9 lane 0
  838. add pB, pB, #16
  839. ld2 {v0.2d, v1.2d}, [pA] // de-interleave 4 doubles: even-indexed -> v0, odd-indexed -> v1
  840. add pA, pA, #32
  841. OP_rr v16.2d, v0.2d, v8.d[0] // v16 <- op(v0 * v8[0])
  842. OP_ii v16.2d, v1.2d, v9.d[0] // v16 <- op(v1 * v9[0])
  843. OP_ri v17.2d, v0.2d, v9.d[0] // v17 <- op(v0 * v9[0])
  844. OP_ir v17.2d, v1.2d, v8.d[0] // v17 <- op(v1 * v8[0])
  845. .endm
  // SAVE2x1: add the alpha-scaled 2x1 accumulators (v16/v17) into C at [pCRow0].
  // With (x, y) the de-interleaved halves loaded from C:
  //   x += v16 * alphaV0_R - v17 * alphaV0_I
  //   y += v16 * alphaV0_I + v17 * alphaV0_R
  // alphaV0_R/alphaV0_I are aliases defined outside this excerpt - presumably the
  // lane-0 vector views of alpha0_R/alpha0_I set below; TODO confirm.
  // Side effects: pCRow0 += 32; overwrites pCRow1, v0, v1, alpha0_R, alpha0_I.
  846. .macro SAVE2x1
  847. fmov alpha0_R, alphaR // refresh the working copies of alpha
  848. fmov alpha0_I, alphaI
  849. mov pCRow1, pCRow0
  850. ld2 {v0.2d, v1.2d}, [pCRow1] // de-interleave two C elements
  851. fmla v0.2d, v16.2d, alphaV0_R
  852. fmls v0.2d, v17.2d, alphaV0_I
  853. fmla v1.2d, v16.2d, alphaV0_I
  854. fmla v1.2d, v17.2d, alphaV0_R
  855. st2 {v0.2d, v1.2d}, [pCRow1] // re-interleave and store
  856. add pCRow0, pCRow0, #32 // advance the tile start by the 32 bytes written
  857. .endm
  858. /******************************************************************************/
  // INIT1x1: clear the scalar accumulators d16/d17 used by KERNEL1x1_SUB / SAVE1x1.
  859. .macro INIT1x1
  860. fmov d16, xzr
  861. fmov d17, xzr
  862. .endm
  // KERNEL1x1_SUB: one K-step of the 1x1 tile (one element from pA, one from pB).
  // Accumulates into scalars d16/d17. OP_rr/OP_ii/OP_ri/OP_ir are defined outside
  // this excerpt - presumably fmla/fmls chosen per conjugation variant; TODO confirm.
  // Side effects: pA += 16, pB += 16; overwrites lane 0 of v0, v1, v8, v9.
  863. .macro KERNEL1x1_SUB
  864. ld2 {v8.d, v9.d}[0], [pB] // two doubles: first -> v8 lane 0, second -> v9 lane 0
  865. add pB, pB, #16
  866. ld2 {v0.d, v1.d}[0], [pA] // two doubles: first -> v0 lane 0, second -> v1 lane 0
  867. add pA, pA, #16
  868. OP_rr d16, d0, v8.d[0] // d16 <- op(d0 * v8[0])
  869. OP_ii d16, d1, v9.d[0] // d16 <- op(d1 * v9[0])
  870. OP_ri d17, d0, v9.d[0] // d17 <- op(d0 * v9[0])
  871. OP_ir d17, d1, v8.d[0] // d17 <- op(d1 * v8[0])
  872. .endm
  // SAVE1x1: add the alpha-scaled 1x1 accumulators (d16/d17) into C at [pCRow0].
  // With (x, y) the pair of doubles loaded from C:
  //   x += d16 * alphaV0_R - d17 * alphaV0_I
  //   y += d16 * alphaV0_I + d17 * alphaV0_R
  // alphaV0_R/alphaV0_I are aliases defined outside this excerpt - presumably the
  // lane-0 vector views of alpha0_R/alpha0_I set below; TODO confirm.
  // Side effects: pCRow0 += 16; overwrites pCRow1, v0, v1, alpha0_R, alpha0_I.
  873. .macro SAVE1x1
  874. fmov alpha0_R, alphaR // refresh the working copies of alpha
  875. fmov alpha0_I, alphaI
  876. mov pCRow1, pCRow0
  877. ld2 {v0.d, v1.d}[0], [pCRow1] // load one pair of doubles from C
  878. fmla d0, d16, alphaV0_R
  879. fmls d0, d17, alphaV0_I
  880. fmla d1, d16, alphaV0_I
  881. fmla d1, d17, alphaV0_R
  882. st2 {v0.d, v1.d}[0], [pCRow1] // store the updated pair back
  883. add pCRow0, pCRow0, #16 // advance the tile start by the 16 bytes written
  884. .endm
  885. /*******************************************************************************
  886. * End of macro definitions
  887. *******************************************************************************/
  // Kernel entry, register save, and the main loop over blocks of 4 columns of C.
  // Loop structure: J = origN / 4 column blocks; inside each, origM / 4 row tiles
  // handled by the 4x4 kernels, then a 2-row and a 1-row remainder tile.
  // pA runs continuously across the row tiles of one column block; pB is reset to
  // origPB for every tile. The KERNEL4x4_*, KERNEL2x4_SUB, KERNEL1x4_SUB, INIT*x4 and
  // SAVE*x4 macros and all register aliases are defined outside this excerpt.
  888. PROLOGUE
  889. .align 5
  890. add sp, sp, #-(11 * 16) // reserve a 176-byte save area
  891. stp d8, d9, [sp, #(0 * 16)]
  892. stp d10, d11, [sp, #(1 * 16)]
  893. stp d12, d13, [sp, #(2 * 16)]
  894. stp d14, d15, [sp, #(3 * 16)]
  895. stp d16, d17, [sp, #(4 * 16)] // NOTE(review): d16/d17 are caller-saved under AAPCS64; saving them is harmless but not required
  896. stp x18, x19, [sp, #(5 * 16)] // NOTE(review): x18 is platform-reserved on some OSes; here it is only saved/restored
  897. stp x20, x21, [sp, #(6 * 16)]
  898. stp x22, x23, [sp, #(7 * 16)]
  899. stp x24, x25, [sp, #(8 * 16)]
  900. stp x26, x27, [sp, #(9 * 16)]
  901. str x28, [sp, #(10 * 16)]
  902. prfm PLDL1KEEP, [origPB] // prefetch the start of B into L1
  903. prfm PLDL1KEEP, [origPA] // prefetch the start of A into L1
  904. fmov alphaR, d0 // alpha arrives in d0/d1; keep copies for the SAVE macros
  905. fmov alphaI, d1
  906. lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
  907. mov pB, origPB
  908. mov counterJ, origN
  909. asr counterJ, counterJ, #2 // J = J / 4
  910. cmp counterJ, #0
  911. ble .Lzgemm_kernel_L2_BEGIN // no full block of 4 columns: go handle the N remainder
  // ---- one block of 4 columns of C ----
  912. .Lzgemm_kernel_L4_BEGIN:
  913. mov pCRow0, pC // pCRow0..pCRow3 = the four columns of this block
  914. add pCRow1, pCRow0, LDC
  915. add pCRow2, pCRow1, LDC
  916. add pCRow3, pCRow2, LDC
  917. add pC, pCRow3, LDC // pC = first column of the next block
  918. mov pA, origPA // pA = start of A array
  919. .Lzgemm_kernel_L4_M4_BEGIN:
  920. mov counterI, origM
  921. asr counterI, counterI, #2 // counterI = counterI / 4
  922. cmp counterI, #0
  923. ble .Lzgemm_kernel_L4_M2_BEGIN
  924. .align 5
  // ---- 4x4 tile: K is consumed in blocks of 8 steps (counterL = K / 8) ----
  925. .Lzgemm_kernel_L4_M4_20:
  926. mov pB, origPB
  927. asr counterL , origK, #3
  928. cmp counterL , #2
  929. blt .Lzgemm_kernel_L4_M4_32 // fewer than two 8-step blocks: short path
  // First 8-step block; KERNEL4x4_I starts the sequence (presumably it also
  // initialises the accumulators, since no INIT4x4 precedes it - TODO confirm).
  930. KERNEL4x4_I
  931. KERNEL4x4_M2
  932. KERNEL4x4_M1
  933. KERNEL4x4_M2
  934. KERNEL4x4_M1
  935. KERNEL4x4_M2
  936. KERNEL4x4_M1
  937. KERNEL4x4_M2
  938. subs counterL, counterL, #2 // subtract 2
  939. ble .Lzgemm_kernel_L4_M4_22a // exactly two blocks: only the closing block remains
  940. .align 5
  // Middle 8-step blocks, executed (K/8 - 2) times.
  941. .Lzgemm_kernel_L4_M4_22:
  942. KERNEL4x4_M1
  943. KERNEL4x4_M2
  944. KERNEL4x4_M1
  945. KERNEL4x4_M2
  946. KERNEL4x4_M1
  947. KERNEL4x4_M2
  948. KERNEL4x4_M1
  949. KERNEL4x4_M2
  950. subs counterL, counterL, #1
  951. bgt .Lzgemm_kernel_L4_M4_22
  952. .align 5
  // Closing 8-step block; ends with KERNEL4x4_E instead of KERNEL4x4_M2.
  953. .Lzgemm_kernel_L4_M4_22a:
  954. KERNEL4x4_M1
  955. KERNEL4x4_M2
  956. KERNEL4x4_M1
  957. KERNEL4x4_M2
  958. KERNEL4x4_M1
  959. KERNEL4x4_M2
  960. KERNEL4x4_M1
  961. KERNEL4x4_E
  962. b .Lzgemm_kernel_L4_M4_44
  963. .align 5
  // Short path: counterL (K / 8) is below 2 here.
  964. .Lzgemm_kernel_L4_M4_32:
  965. tst counterL, #1
  966. ble .Lzgemm_kernel_L4_M4_40 // bit 0 clear: no full 8-step block at all
  // Exactly one 8-step block: opening and closing kernels in the same sequence.
  967. KERNEL4x4_I
  968. KERNEL4x4_M2
  969. KERNEL4x4_M1
  970. KERNEL4x4_M2
  971. KERNEL4x4_M1
  972. KERNEL4x4_M2
  973. KERNEL4x4_M1
  974. KERNEL4x4_E
  975. b .Lzgemm_kernel_L4_M4_44
  976. .Lzgemm_kernel_L4_M4_40:
  977. INIT4x4 // no 8-step block ran, so set up the accumulators explicitly
  978. .Lzgemm_kernel_L4_M4_44:
  979. ands counterL , origK, #7 // counterL = K % 8 leftover steps
  980. ble .Lzgemm_kernel_L4_M4_100 // masked value is never negative, so this is taken only when it is zero
  981. .align 5
  982. .Lzgemm_kernel_L4_M4_46:
  983. KERNEL4x4_SUB // one leftover K-step
  984. subs counterL, counterL, #1
  985. bne .Lzgemm_kernel_L4_M4_46
  986. .Lzgemm_kernel_L4_M4_100:
  987. prfm PLDL1KEEP, [pA] // prefetch the A data of the next tile
  988. prfm PLDL1KEEP, [pA, #64]
  989. prfm PLDL1KEEP, [origPB] // prefetch the start of B, reused by the next tile
  990. SAVE4x4
  991. .Lzgemm_kernel_L4_M4_END:
  992. subs counterI, counterI, #1
  993. bne .Lzgemm_kernel_L4_M4_20
  // ---- M remainder for this column block ----
  994. .Lzgemm_kernel_L4_M2_BEGIN:
  995. mov counterI, origM
  996. tst counterI , #3 // any rows left over after the 4-row tiles?
  997. ble .Lzgemm_kernel_L4_END // masked value is never negative: taken only when M % 4 == 0
  998. tst counterI, #2 // bit 1 of M set -> a 2-row tile is needed
  999. ble .Lzgemm_kernel_L4_M1_BEGIN
  1000. .Lzgemm_kernel_L4_M2_20:
  1001. INIT2x4
  1002. mov pB, origPB
  1003. asr counterL , origK, #3 // counterL = K / 8
  1004. cmp counterL , #0
  1005. ble .Lzgemm_kernel_L4_M2_40
  1006. .Lzgemm_kernel_L4_M2_22:
  1007. KERNEL2x4_SUB
  1008. KERNEL2x4_SUB
  1009. KERNEL2x4_SUB
  1010. KERNEL2x4_SUB
  1011. KERNEL2x4_SUB
  1012. KERNEL2x4_SUB
  1013. KERNEL2x4_SUB
  1014. KERNEL2x4_SUB
  1015. subs counterL, counterL, #1
  1016. bgt .Lzgemm_kernel_L4_M2_22
  1017. .Lzgemm_kernel_L4_M2_40:
  1018. ands counterL , origK, #7 // counterL = K % 8
  1019. ble .Lzgemm_kernel_L4_M2_100
  1020. .Lzgemm_kernel_L4_M2_42:
  1021. KERNEL2x4_SUB
  1022. subs counterL, counterL, #1
  1023. bgt .Lzgemm_kernel_L4_M2_42
  1024. .Lzgemm_kernel_L4_M2_100:
  1025. SAVE2x4
  1026. .Lzgemm_kernel_L4_M2_END:
  1027. .Lzgemm_kernel_L4_M1_BEGIN:
  1028. tst counterI, #1 // bit 0 of M set -> a 1-row tile is needed
  1029. ble .Lzgemm_kernel_L4_END
  1030. .Lzgemm_kernel_L4_M1_20:
  1031. INIT1x4
  1032. mov pB, origPB
  1033. asr counterL , origK, #3 // counterL = K / 8
  1034. cmp counterL , #0
  1035. ble .Lzgemm_kernel_L4_M1_40
  1036. .Lzgemm_kernel_L4_M1_22:
  1037. KERNEL1x4_SUB
  1038. KERNEL1x4_SUB
  1039. KERNEL1x4_SUB
  1040. KERNEL1x4_SUB
  1041. KERNEL1x4_SUB
  1042. KERNEL1x4_SUB
  1043. KERNEL1x4_SUB
  1044. KERNEL1x4_SUB
  1045. subs counterL, counterL, #1
  1046. bgt .Lzgemm_kernel_L4_M1_22
  1047. .Lzgemm_kernel_L4_M1_40:
  1048. ands counterL , origK, #7 // counterL = K % 8
  1049. ble .Lzgemm_kernel_L4_M1_100
  1050. .Lzgemm_kernel_L4_M1_42:
  1051. KERNEL1x4_SUB
  1052. subs counterL, counterL, #1
  1053. bgt .Lzgemm_kernel_L4_M1_42
  1054. .Lzgemm_kernel_L4_M1_100:
  1055. SAVE1x4
  1056. .Lzgemm_kernel_L4_END:
  1057. lsl temp, origK, #6 // temp = K * 64 bytes
  1058. add origPB, origPB, temp // B = B + K * 4 * 8 * 2
  1059. subs counterJ, counterJ , #1 // j--
  1060. bgt .Lzgemm_kernel_L4_BEGIN
  1061. /******************************************************************************/
  // N remainder, part 1: a pair of columns (taken when bit 1 of origN is set).
  // Same tiling as the 4-column loop - 4-row tiles, then 2-row and 1-row
  // remainders - using the *x2 macros. Runs at most once, then falls through to
  // the single-column section after advancing origPB past the two packed columns.
  1062. .Lzgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
  1063. mov counterJ , origN
  1064. tst counterJ , #3 // any columns left over after the 4-column blocks?
  1065. ble .Lzgemm_kernel_L999 // masked value is never negative: taken only when N % 4 == 0
  1066. tst counterJ , #2 // bit 1 of N set -> a 2-column block is needed
  1067. ble .Lzgemm_kernel_L1_BEGIN
  1068. mov pCRow0, pC // pCRow0 = pC
  1069. add pC,pC,LDC, lsl #1 // pC += 2 * LDC: skip the two columns handled here
  1070. mov pA, origPA // pA = A
  1071. .Lzgemm_kernel_L2_M4_BEGIN:
  1072. mov counterI, origM
  1073. asr counterI, counterI, #2 // counterI = counterI / 4
  1074. cmp counterI,#0
  1075. ble .Lzgemm_kernel_L2_M2_BEGIN
  // ---- 4x2 tile ----
  1076. .Lzgemm_kernel_L2_M4_20:
  1077. INIT4x2
  1078. mov pB, origPB
  1079. asr counterL , origK, #3 // counterL = K / 8
  1080. cmp counterL,#0
  1081. ble .Lzgemm_kernel_L2_M4_40
  1082. .align 5
  1083. .Lzgemm_kernel_L2_M4_22:
  1084. KERNEL4x2_SUB
  1085. KERNEL4x2_SUB
  1086. KERNEL4x2_SUB
  1087. KERNEL4x2_SUB
  1088. KERNEL4x2_SUB
  1089. KERNEL4x2_SUB
  1090. KERNEL4x2_SUB
  1091. KERNEL4x2_SUB
  1092. subs counterL, counterL, #1
  1093. bgt .Lzgemm_kernel_L2_M4_22
  1094. .Lzgemm_kernel_L2_M4_40:
  1095. ands counterL , origK, #7 // counterL = K % 8
  1096. ble .Lzgemm_kernel_L2_M4_100
  1097. .Lzgemm_kernel_L2_M4_42:
  1098. KERNEL4x2_SUB
  1099. subs counterL, counterL, #1
  1100. bgt .Lzgemm_kernel_L2_M4_42
  1101. .Lzgemm_kernel_L2_M4_100:
  1102. SAVE4x2
  1103. .Lzgemm_kernel_L2_M4_END:
  1104. subs counterI, counterI, #1
  1105. bgt .Lzgemm_kernel_L2_M4_20
  // ---- M remainder for the column pair ----
  1106. .Lzgemm_kernel_L2_M2_BEGIN:
  1107. mov counterI, origM
  1108. tst counterI , #3 // any rows left over after the 4-row tiles?
  1109. ble .Lzgemm_kernel_L2_END
  1110. tst counterI, #2 // bit 1 of M set -> a 2-row tile is needed
  1111. ble .Lzgemm_kernel_L2_M1_BEGIN
  1112. .Lzgemm_kernel_L2_M2_20:
  1113. INIT2x2
  1114. mov pB, origPB
  1115. asr counterL , origK, #3 // counterL = K / 8
  1116. cmp counterL,#0
  1117. ble .Lzgemm_kernel_L2_M2_40
  1118. .Lzgemm_kernel_L2_M2_22:
  1119. KERNEL2x2_SUB
  1120. KERNEL2x2_SUB
  1121. KERNEL2x2_SUB
  1122. KERNEL2x2_SUB
  1123. KERNEL2x2_SUB
  1124. KERNEL2x2_SUB
  1125. KERNEL2x2_SUB
  1126. KERNEL2x2_SUB
  1127. subs counterL, counterL, #1
  1128. bgt .Lzgemm_kernel_L2_M2_22
  1129. .Lzgemm_kernel_L2_M2_40:
  1130. ands counterL , origK, #7 // counterL = K % 8
  1131. ble .Lzgemm_kernel_L2_M2_100
  1132. .Lzgemm_kernel_L2_M2_42:
  1133. KERNEL2x2_SUB
  1134. subs counterL, counterL, #1
  1135. bgt .Lzgemm_kernel_L2_M2_42
  1136. .Lzgemm_kernel_L2_M2_100:
  1137. SAVE2x2
  1138. .Lzgemm_kernel_L2_M2_END:
  1139. .Lzgemm_kernel_L2_M1_BEGIN:
  1140. tst counterI, #1 // bit 0 of M set -> a 1-row tile is needed
  1141. ble .Lzgemm_kernel_L2_END
  1142. .Lzgemm_kernel_L2_M1_20:
  1143. INIT1x2
  1144. mov pB, origPB
  1145. asr counterL , origK, #3 // counterL = K / 8
  1146. cmp counterL, #0
  1147. ble .Lzgemm_kernel_L2_M1_40
  1148. .Lzgemm_kernel_L2_M1_22:
  1149. KERNEL1x2_SUB
  1150. KERNEL1x2_SUB
  1151. KERNEL1x2_SUB
  1152. KERNEL1x2_SUB
  1153. KERNEL1x2_SUB
  1154. KERNEL1x2_SUB
  1155. KERNEL1x2_SUB
  1156. KERNEL1x2_SUB
  1157. subs counterL, counterL, #1
  1158. bgt .Lzgemm_kernel_L2_M1_22
  1159. .Lzgemm_kernel_L2_M1_40:
  1160. ands counterL , origK, #7 // counterL = K % 8
  1161. ble .Lzgemm_kernel_L2_M1_100
  1162. .Lzgemm_kernel_L2_M1_42:
  1163. KERNEL1x2_SUB
  1164. subs counterL, counterL, #1
  1165. bgt .Lzgemm_kernel_L2_M1_42
  1166. .Lzgemm_kernel_L2_M1_100:
  1167. SAVE1x2
  1168. .Lzgemm_kernel_L2_END:
  1169. lsl temp, origK, #5 // temp = K * 32 bytes
  1170. add origPB, origPB, temp // B = B + K * 2 * 8 * 2
  1171. /******************************************************************************/
  // N remainder, part 2: a single column (taken when bit 0 of origN is set),
  // followed by the common exit path that restores registers and returns 0.
  // Same tiling as the wider sections - 4-row tiles, then 2-row and 1-row
  // remainders - using the *x1 macros.
  1172. .Lzgemm_kernel_L1_BEGIN:
  1173. mov counterJ , origN
  1174. tst counterJ , #1 // bit 0 of N set -> one last column is needed
  1175. ble .Lzgemm_kernel_L999 // done
  1176. mov pCRow0, pC // pCRow0 = C
  1177. add pC , pC , LDC // Update pC to point to next
  1178. mov pA, origPA // pA = A
  1179. .Lzgemm_kernel_L1_M4_BEGIN:
  1180. mov counterI, origM
  1181. asr counterI, counterI, #2 // counterI = counterI / 4
  1182. cmp counterI, #0
  1183. ble .Lzgemm_kernel_L1_M2_BEGIN
  // ---- 4x1 tile ----
  1184. .Lzgemm_kernel_L1_M4_20:
  1185. INIT4x1
  1186. mov pB, origPB
  1187. asr counterL , origK, #3 // counterL = K / 8
  1188. cmp counterL , #0
  1189. ble .Lzgemm_kernel_L1_M4_40
  1190. .align 5
  1191. .Lzgemm_kernel_L1_M4_22:
  1192. KERNEL4x1_SUB
  1193. KERNEL4x1_SUB
  1194. KERNEL4x1_SUB
  1195. KERNEL4x1_SUB
  1196. KERNEL4x1_SUB
  1197. KERNEL4x1_SUB
  1198. KERNEL4x1_SUB
  1199. KERNEL4x1_SUB
  1200. subs counterL, counterL, #1
  1201. bgt .Lzgemm_kernel_L1_M4_22
  1202. .Lzgemm_kernel_L1_M4_40:
  1203. ands counterL , origK, #7 // counterL = K % 8
  1204. ble .Lzgemm_kernel_L1_M4_100
  1205. .Lzgemm_kernel_L1_M4_42:
  1206. KERNEL4x1_SUB
  1207. subs counterL, counterL, #1
  1208. bgt .Lzgemm_kernel_L1_M4_42
  1209. .Lzgemm_kernel_L1_M4_100:
  1210. SAVE4x1
  1211. .Lzgemm_kernel_L1_M4_END:
  1212. subs counterI, counterI, #1
  1213. bgt .Lzgemm_kernel_L1_M4_20
  // ---- M remainder for the single column ----
  1214. .Lzgemm_kernel_L1_M2_BEGIN:
  1215. mov counterI, origM
  1216. tst counterI , #3 // any rows left over after the 4-row tiles?
  1217. ble .Lzgemm_kernel_L1_END
  1218. tst counterI, #2 // bit 1 of M set -> a 2-row tile is needed
  1219. ble .Lzgemm_kernel_L1_M1_BEGIN
  1220. .Lzgemm_kernel_L1_M2_20:
  1221. INIT2x1
  1222. mov pB, origPB
  1223. asr counterL , origK, #3 // counterL = K / 8
  1224. cmp counterL , #0
  1225. ble .Lzgemm_kernel_L1_M2_40
  1226. .Lzgemm_kernel_L1_M2_22:
  1227. KERNEL2x1_SUB
  1228. KERNEL2x1_SUB
  1229. KERNEL2x1_SUB
  1230. KERNEL2x1_SUB
  1231. KERNEL2x1_SUB
  1232. KERNEL2x1_SUB
  1233. KERNEL2x1_SUB
  1234. KERNEL2x1_SUB
  1235. subs counterL, counterL, #1
  1236. bgt .Lzgemm_kernel_L1_M2_22
  1237. .Lzgemm_kernel_L1_M2_40:
  1238. ands counterL , origK, #7 // counterL = K % 8
  1239. ble .Lzgemm_kernel_L1_M2_100
  1240. .Lzgemm_kernel_L1_M2_42:
  1241. KERNEL2x1_SUB
  1242. subs counterL, counterL, #1
  1243. bgt .Lzgemm_kernel_L1_M2_42
  1244. .Lzgemm_kernel_L1_M2_100:
  1245. SAVE2x1
  1246. .Lzgemm_kernel_L1_M2_END:
  1247. .Lzgemm_kernel_L1_M1_BEGIN:
  1248. tst counterI, #1 // bit 0 of M set -> a 1-row tile is needed
  1249. ble .Lzgemm_kernel_L1_END
  1250. .Lzgemm_kernel_L1_M1_20:
  1251. INIT1x1
  1252. mov pB, origPB
  1253. asr counterL , origK, #3 // counterL = K / 8
  1254. cmp counterL , #0
  1255. ble .Lzgemm_kernel_L1_M1_40
  1256. .Lzgemm_kernel_L1_M1_22:
  1257. KERNEL1x1_SUB
  1258. KERNEL1x1_SUB
  1259. KERNEL1x1_SUB
  1260. KERNEL1x1_SUB
  1261. KERNEL1x1_SUB
  1262. KERNEL1x1_SUB
  1263. KERNEL1x1_SUB
  1264. KERNEL1x1_SUB
  1265. subs counterL, counterL, #1
  1266. bgt .Lzgemm_kernel_L1_M1_22
  1267. .Lzgemm_kernel_L1_M1_40:
  1268. ands counterL , origK, #7 // counterL = K % 8
  1269. ble .Lzgemm_kernel_L1_M1_100
  1270. .Lzgemm_kernel_L1_M1_42:
  1271. KERNEL1x1_SUB
  1272. subs counterL, counterL, #1
  1273. bgt .Lzgemm_kernel_L1_M1_42
  1274. .Lzgemm_kernel_L1_M1_100:
  1275. SAVE1x1
  1276. .Lzgemm_kernel_L1_END:
  // ---- common exit: restore everything saved at entry, in the same slots ----
  1277. .Lzgemm_kernel_L999:
  1278. mov x0, #0 // set return value
  1279. ldp d8, d9, [sp, #(0 * 16)]
  1280. ldp d10, d11, [sp, #(1 * 16)]
  1281. ldp d12, d13, [sp, #(2 * 16)]
  1282. ldp d14, d15, [sp, #(3 * 16)]
  1283. ldp d16, d17, [sp, #(4 * 16)]
  1284. ldp x18, x19, [sp, #(5 * 16)]
  1285. ldp x20, x21, [sp, #(6 * 16)]
  1286. ldp x22, x23, [sp, #(7 * 16)]
  1287. ldp x24, x25, [sp, #(8 * 16)]
  1288. ldp x26, x27, [sp, #(9 * 16)]
  1289. ldr x28, [sp, #(10 * 16)]
  1290. add sp, sp, #(11*16) // release the 176-byte save area
  1291. ret
  1292. EPILOGUE