
ztrmm_kernel_4x4.S 40 kB

  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 d0 d1 X3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  48. #define alphaR x17
  49. #define alphaI x22
  50. #define temp x19
  51. #define tempOffset x20
  52. #define tempK x21
  53. #define alpha0_R d10
  54. #define alphaV0_R v10.d[0]
  55. #define alpha0_I d11
  56. #define alphaV0_I v11.d[0]
  57. #define A_PRE_SIZE 2560
  58. #define B_PRE_SIZE 448
  59. #define C_PRE_SIZE 128
  60. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  61. #define OP_rr fmla
  62. #define OP_ii fmls
  63. #define OP_ri fmla
  64. #define OP_ir fmla
  65. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  66. #define OP_rr fmla
  67. #define OP_ii fmla
  68. #define OP_ri fmls
  69. #define OP_ir fmla
  70. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  71. #define OP_rr fmla
  72. #define OP_ii fmla
  73. #define OP_ri fmla
  74. #define OP_ir fmls
  75. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  76. #define OP_rr fmla
  77. #define OP_ii fmls
  78. #define OP_ri fmls
  79. #define OP_ir fmls
  80. #endif
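// The OP_* macros choose the sign pattern for the complex product
// (a_r + i*a_i)(b_r + i*b_i) = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r):
// each partial product is mapped to fmla or fmls depending on which of the
// two operands is conjugated for the given transpose/conjugate combination
// (N/T codes versus R/C codes).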
  81. // 00 origM
  82. // 01 origN
  83. // 02 origK
  84. // 03 origPA
  85. // 04 origPB
  86. // 05 pC
  87. // 06 origLDC -> LDC
  88. // 07 offset -> temp
  89. // 08 counterL
  90. // 09 counterI
  91. // 10 counterJ
  92. // 11 pB
  93. // 12 pCRow0
  94. // 13 pCRow1
  95. // 14 pCRow2
  96. // 15 pCRow3
  97. // 16 pA
  98. // 17 alpha_save_R
  99. // 18 must save
  100. // 19 must save temp
  101. // 20 must save tempOffset
  102. // 21 must save tempK
  103. // 22 must save alpha_save_I
  104. // 23 must save
  105. // 24 must save
  106. // 25 must save
  107. // 26 must save
  108. // 27 must save
  109. // 28 must save
  110. // 29 frame
  111. // 30 link
  112. // 31 sp
  113. //v00 ALPHA_R -> pA00_R, pA01_R
  114. //v01 ALPHA_I -> pA00_I, pA01_I
  115. //v02 pA02_R, pA03_R
  116. //v03 pA02_I, pA03_I
  117. //v04 pA10_R, pA11_R
  118. //v05 pA10_I, pA11_I
  119. //v06 pA12_R, pA13_R
  120. //v07 pA12_I, pA13_I
  121. //v08 must save pB00_R, pB01_R
  122. //v09 must save pB00_I, pB01_I
  123. //v10 must save pB02_R, pB03_R OR ALPHA0_R
  124. //v11 must save pB02_I, pB03_I OR ALPHA0_I
  125. //v12 must save pB10_R, pB11_R
  126. //v13 must save pB10_I, pB11_I
  127. //v14 must save pB12_R, pB13_R OR ALPHA1_R
  128. //v15 must save pB12_I, pB13_I OR ALPHA1_I
  129. //v16 must save pC00_R, pC01_R
  130. //v17 must save pC00_I, pC01_I
  131. //v18 pC02_R, pC03_R
  132. //v19 pC02_I, pC03_I
  133. //v20 pC10_R, pC11_R
  134. //v21 pC10_I, pC11_I
  135. //v22 pC12_R, pC13_R
  136. //v23 pC12_I, pC13_I
  137. //v24 pC20_R, pC21_R
  138. //v25 pC20_I, pC21_I
  139. //v26 pC22_R, pC23_R
  140. //v27 pC22_I, pC23_I
  141. //v28 pC30_R, pC31_R
  142. //v29 pC30_I, pC31_I
  143. //v30 pC32_R, pC33_R
  144. //v31 pC32_I, pC33_I
  145. /*******************************************************************************
  146. * Macro definitions
  147. *******************************************************************************/
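// INIT4x4 clears the sixteen accumulators v16-v31 (a real/imaginary pair of
// 2-wide vectors for each 2x1 sub-block of the 4x4 tile); the chained fmov
// copies simply propagate the zero first written into d16.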
  148. .macro INIT4x4
  149. fmov d16, xzr
  150. fmov d17, d16
  151. fmov d18, d17
  152. fmov d19, d16
  153. fmov d20, d17
  154. fmov d21, d16
  155. fmov d22, d17
  156. fmov d23, d16
  157. fmov d24, d17
  158. fmov d25, d16
  159. fmov d26, d17
  160. fmov d27, d16
  161. fmov d28, d17
  162. fmov d29, d16
  163. fmov d30, d17
  164. fmov d31, d16
  165. .endm
  166. .macro KERNEL4x4_I
  167. ld2 {v8.2d, v9.2d}, [pB]
  168. add pB, pB, #32
  169. ld2 {v0.2d, v1.2d}, [pA]
  170. add pA, pA, #32
  171. fmul v16.2d, v0.2d, v8.d[0]
  172. OP_ii v16.2d, v1.2d, v9.d[0]
  173. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  174. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  175. eor v17.16b, v17.16b, v17.16b
  176. fmls v17.2d, v0.2d, v9.d[0]
  177. #else
  178. fmul v17.2d, v0.2d, v9.d[0]
  179. #endif
  180. OP_ir v17.2d, v1.2d, v8.d[0]
  181. ld2 {v2.2d, v3.2d}, [pA]
  182. add pA, pA, #32
  183. fmul v20.2d, v0.2d, v8.d[1]
  184. OP_ii v20.2d, v1.2d, v9.d[1]
  185. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  186. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  187. eor v21.16b, v21.16b, v21.16b
  188. fmls v21.2d, v0.2d, v9.d[1]
  189. #else
  190. fmul v21.2d, v0.2d, v9.d[1]
  191. #endif
  192. OP_ir v21.2d, v1.2d, v8.d[1]
  193. ld2 {v10.2d, v11.2d}, [pB]
  194. add pB, pB, #32
  195. fmul v22.2d, v2.2d, v8.d[1]
  196. OP_ii v22.2d, v3.2d, v9.d[1]
  197. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  198. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  199. eor v23.16b, v23.16b, v23.16b
  200. fmls v23.2d, v2.2d, v9.d[1]
  201. #else
  202. fmul v23.2d, v2.2d, v9.d[1]
  203. #endif
  204. OP_ir v23.2d, v3.2d, v8.d[1]
  205. ld2 {v12.2d, v13.2d}, [pB]
  206. add pB, pB, #32
  207. fmul v18.2d, v2.2d, v8.d[0]
  208. OP_ii v18.2d, v3.2d, v9.d[0]
  209. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  210. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  211. eor v19.16b, v19.16b, v19.16b
  212. fmls v19.2d, v2.2d, v9.d[0]
  213. #else
  214. fmul v19.2d, v2.2d, v9.d[0]
  215. #endif
  216. OP_ir v19.2d, v3.2d, v8.d[0]
  217. ld2 {v4.2d, v5.2d} , [pA]
  218. add pA, pA, #32
  219. fmul v24.2d, v0.2d, v10.d[0]
  220. OP_ii v24.2d, v1.2d, v11.d[0]
  221. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  222. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  223. eor v25.16b, v25.16b, v25.16b
  224. fmls v25.2d, v0.2d, v11.d[0]
  225. #else
  226. fmul v25.2d, v0.2d, v11.d[0]
  227. #endif
  228. OP_ir v25.2d, v1.2d, v10.d[0]
  229. ld2 {v6.2d, v7.2d} , [pA]
  230. add pA, pA, #32
  231. fmul v26.2d, v2.2d, v10.d[0]
  232. OP_ii v26.2d, v3.2d, v11.d[0]
  233. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  234. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  235. eor v27.16b, v27.16b, v27.16b
  236. fmls v27.2d, v2.2d, v11.d[0]
  237. #else
  238. fmul v27.2d, v2.2d, v11.d[0]
  239. #endif
  240. OP_ir v27.2d, v3.2d, v10.d[0]
  241. ld2 {v14.2d, v15.2d}, [pB]
  242. add pB, pB, #32
  243. fmul v28.2d, v0.2d, v10.d[1]
  244. OP_ii v28.2d, v1.2d, v11.d[1]
  245. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  246. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  247. eor v29.16b, v29.16b, v29.16b
  248. fmls v29.2d, v0.2d, v11.d[1]
  249. #else
  250. fmul v29.2d, v0.2d, v11.d[1]
  251. #endif
  252. OP_ir v29.2d, v1.2d, v10.d[1]
  253. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  254. fmul v30.2d, v2.2d, v10.d[1]
  255. OP_ii v30.2d, v3.2d, v11.d[1]
  256. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  257. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  258. eor v31.16b, v31.16b, v31.16b
  259. fmls v31.2d, v2.2d, v11.d[1]
  260. #else
  261. fmul v31.2d, v2.2d, v11.d[1]
  262. #endif
  263. OP_ir v31.2d, v3.2d, v10.d[1]
  264. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  265. .endm
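// The 4x4 inner loop is software pipelined: KERNEL4x4_I loads the first A/B
// panels and starts the accumulators with fmul, KERNEL4x4_M1 and KERNEL4x4_M2
// then alternate between the two operand banks (v0-v3/v8-v11 and
// v4-v7/v12-v15), each consuming one bank while loading and prefetching the
// other, and KERNEL4x4_E drains the final bank without issuing further loads.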
  266. .macro KERNEL4x4_M1
  267. OP_rr v16.2d, v0.2d, v8.d[0]
  268. OP_ii v16.2d, v1.2d, v9.d[0]
  269. OP_ri v17.2d, v0.2d, v9.d[0]
  270. OP_ir v17.2d, v1.2d, v8.d[0]
  271. ld2 {v12.2d, v13.2d}, [pB]
  272. add pB, pB, #32
  273. OP_rr v18.2d, v2.2d, v8.d[0]
  274. OP_ii v18.2d, v3.2d, v9.d[0]
  275. OP_ri v19.2d, v2.2d, v9.d[0]
  276. OP_ir v19.2d, v3.2d, v8.d[0]
  277. ld2 {v4.2d, v5.2d} , [pA]
  278. add pA, pA, #32
  279. OP_rr v20.2d, v0.2d, v8.d[1]
  280. OP_ii v20.2d, v1.2d, v9.d[1]
  281. OP_ri v21.2d, v0.2d, v9.d[1]
  282. OP_ir v21.2d, v1.2d, v8.d[1]
  283. ld2 {v6.2d, v7.2d} , [pA]
  284. add pA, pA, #32
  285. OP_rr v22.2d, v2.2d, v8.d[1]
  286. OP_ii v22.2d, v3.2d, v9.d[1]
  287. OP_ri v23.2d, v2.2d, v9.d[1]
  288. OP_ir v23.2d, v3.2d, v8.d[1]
  289. ld2 {v14.2d, v15.2d}, [pB]
  290. add pB, pB, #32
  291. OP_rr v24.2d, v0.2d, v10.d[0]
  292. OP_ii v24.2d, v1.2d, v11.d[0]
  293. OP_ri v25.2d, v0.2d, v11.d[0]
  294. OP_ir v25.2d, v1.2d, v10.d[0]
  295. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  296. OP_rr v26.2d, v2.2d, v10.d[0]
  297. OP_ii v26.2d, v3.2d, v11.d[0]
  298. OP_ri v27.2d, v2.2d, v11.d[0]
  299. OP_ir v27.2d, v3.2d, v10.d[0]
  300. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  301. OP_rr v28.2d, v0.2d, v10.d[1]
  302. OP_ii v28.2d, v1.2d, v11.d[1]
  303. OP_ri v29.2d, v0.2d, v11.d[1]
  304. OP_ir v29.2d, v1.2d, v10.d[1]
  305. OP_rr v30.2d, v2.2d, v10.d[1]
  306. OP_ii v30.2d, v3.2d, v11.d[1]
  307. OP_ri v31.2d, v2.2d, v11.d[1]
  308. OP_ir v31.2d, v3.2d, v10.d[1]
  309. .endm
  310. .macro KERNEL4x4_M2
  311. OP_rr v16.2d, v4.2d, v12.d[0]
  312. OP_ii v16.2d, v5.2d, v13.d[0]
  313. OP_ri v17.2d, v4.2d, v13.d[0]
  314. OP_ir v17.2d, v5.2d, v12.d[0]
  315. ld2 {v8.2d, v9.2d}, [pB]
  316. add pB, pB, #32
  317. OP_rr v18.2d, v6.2d, v12.d[0]
  318. OP_ii v18.2d, v7.2d, v13.d[0]
  319. OP_ri v19.2d, v6.2d, v13.d[0]
  320. OP_ir v19.2d, v7.2d, v12.d[0]
  321. ld2 {v0.2d, v1.2d}, [pA]
  322. add pA, pA, #32
  323. OP_rr v20.2d, v4.2d, v12.d[1]
  324. OP_ii v20.2d, v5.2d, v13.d[1]
  325. OP_ri v21.2d, v4.2d, v13.d[1]
  326. OP_ir v21.2d, v5.2d, v12.d[1]
  327. ld2 {v2.2d, v3.2d}, [pA]
  328. add pA, pA, #32
  329. OP_rr v22.2d, v6.2d, v12.d[1]
  330. OP_ii v22.2d, v7.2d, v13.d[1]
  331. OP_ri v23.2d, v6.2d, v13.d[1]
  332. OP_ir v23.2d, v7.2d, v12.d[1]
  333. ld2 {v10.2d, v11.2d}, [pB]
  334. add pB, pB, #32
  335. OP_rr v24.2d, v4.2d, v14.d[0]
  336. OP_ii v24.2d, v5.2d, v15.d[0]
  337. OP_ri v25.2d, v4.2d, v15.d[0]
  338. OP_ir v25.2d, v5.2d, v14.d[0]
  339. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  340. OP_rr v26.2d, v6.2d, v14.d[0]
  341. OP_ii v26.2d, v7.2d, v15.d[0]
  342. OP_ri v27.2d, v6.2d, v15.d[0]
  343. OP_ir v27.2d, v7.2d, v14.d[0]
  344. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  345. OP_rr v28.2d, v4.2d, v14.d[1]
  346. OP_ii v28.2d, v5.2d, v15.d[1]
  347. OP_ri v29.2d, v4.2d, v15.d[1]
  348. OP_ir v29.2d, v5.2d, v14.d[1]
  349. OP_rr v30.2d, v6.2d, v14.d[1]
  350. OP_ii v30.2d, v7.2d, v15.d[1]
  351. OP_ri v31.2d, v6.2d, v15.d[1]
  352. OP_ir v31.2d, v7.2d, v14.d[1]
  353. .endm
  354. .macro KERNEL4x4_E
  355. OP_rr v16.2d, v4.2d, v12.d[0]
  356. OP_ii v16.2d, v5.2d, v13.d[0]
  357. OP_ri v17.2d, v4.2d, v13.d[0]
  358. OP_ir v17.2d, v5.2d, v12.d[0]
  359. OP_rr v18.2d, v6.2d, v12.d[0]
  360. OP_ii v18.2d, v7.2d, v13.d[0]
  361. OP_ri v19.2d, v6.2d, v13.d[0]
  362. OP_ir v19.2d, v7.2d, v12.d[0]
  363. OP_rr v20.2d, v4.2d, v12.d[1]
  364. OP_ii v20.2d, v5.2d, v13.d[1]
  365. OP_ri v21.2d, v4.2d, v13.d[1]
  366. OP_ir v21.2d, v5.2d, v12.d[1]
  367. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  368. OP_rr v22.2d, v6.2d, v12.d[1]
  369. OP_ii v22.2d, v7.2d, v13.d[1]
  370. OP_ri v23.2d, v6.2d, v13.d[1]
  371. OP_ir v23.2d, v7.2d, v12.d[1]
  372. OP_rr v24.2d, v4.2d, v14.d[0]
  373. OP_ii v24.2d, v5.2d, v15.d[0]
  374. OP_ri v25.2d, v4.2d, v15.d[0]
  375. OP_ir v25.2d, v5.2d, v14.d[0]
  376. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  377. OP_rr v26.2d, v6.2d, v14.d[0]
  378. OP_ii v26.2d, v7.2d, v15.d[0]
  379. OP_ri v27.2d, v6.2d, v15.d[0]
  380. OP_ir v27.2d, v7.2d, v14.d[0]
  381. OP_rr v28.2d, v4.2d, v14.d[1]
  382. OP_ii v28.2d, v5.2d, v15.d[1]
  383. OP_ri v29.2d, v4.2d, v15.d[1]
  384. OP_ir v29.2d, v5.2d, v14.d[1]
  385. OP_rr v30.2d, v6.2d, v14.d[1]
  386. OP_ii v30.2d, v7.2d, v15.d[1]
  387. OP_ri v31.2d, v6.2d, v15.d[1]
  388. OP_ir v31.2d, v7.2d, v14.d[1]
  389. .endm
  390. .macro KERNEL4x4_SUB
  391. ld2 {v8.2d, v9.2d}, [pB]
  392. add pB, pB, #32
  393. ld2 {v0.2d, v1.2d}, [pA]
  394. add pA, pA, #32
  395. OP_rr v16.2d, v0.2d, v8.d[0]
  396. OP_ii v16.2d, v1.2d, v9.d[0]
  397. OP_ri v17.2d, v0.2d, v9.d[0]
  398. OP_ir v17.2d, v1.2d, v8.d[0]
  399. ld2 {v2.2d, v3.2d}, [pA]
  400. add pA, pA, #32
  401. OP_rr v20.2d, v0.2d, v8.d[1]
  402. OP_ii v20.2d, v1.2d, v9.d[1]
  403. OP_ri v21.2d, v0.2d, v9.d[1]
  404. OP_ir v21.2d, v1.2d, v8.d[1]
  405. ld2 {v10.2d, v11.2d}, [pB]
  406. add pB, pB, #32
  407. OP_rr v18.2d, v2.2d, v8.d[0]
  408. OP_ii v18.2d, v3.2d, v9.d[0]
  409. OP_ri v19.2d, v2.2d, v9.d[0]
  410. OP_ir v19.2d, v3.2d, v8.d[0]
  411. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  412. OP_rr v22.2d, v2.2d, v8.d[1]
  413. OP_ii v22.2d, v3.2d, v9.d[1]
  414. OP_ri v23.2d, v2.2d, v9.d[1]
  415. OP_ir v23.2d, v3.2d, v8.d[1]
  416. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  417. OP_rr v24.2d, v0.2d, v10.d[0]
  418. OP_ii v24.2d, v1.2d, v11.d[0]
  419. OP_ri v25.2d, v0.2d, v11.d[0]
  420. OP_ir v25.2d, v1.2d, v10.d[0]
  421. OP_rr v26.2d, v2.2d, v10.d[0]
  422. OP_ii v26.2d, v3.2d, v11.d[0]
  423. OP_ri v27.2d, v2.2d, v11.d[0]
  424. OP_ir v27.2d, v3.2d, v10.d[0]
  425. OP_rr v28.2d, v0.2d, v10.d[1]
  426. OP_ii v28.2d, v1.2d, v11.d[1]
  427. OP_ri v29.2d, v0.2d, v11.d[1]
  428. OP_ir v29.2d, v1.2d, v10.d[1]
  429. OP_rr v30.2d, v2.2d, v10.d[1]
  430. OP_ii v30.2d, v3.2d, v11.d[1]
  431. OP_ri v31.2d, v2.2d, v11.d[1]
  432. OP_ir v31.2d, v3.2d, v10.d[1]
  433. .endm
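// SAVE4x4 scales the accumulators by the complex alpha and stores the tile:
//   C_real = alpha_R*acc_R - alpha_I*acc_I
//   C_imag = alpha_R*acc_I + alpha_I*acc_R
// Being a TRMM kernel, it overwrites C directly and never loads the old C.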
  434. .macro SAVE4x4
  435. fmov alpha0_R, alphaR
  436. fmov alpha0_I, alphaI
  437. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  438. fmul v0.2d, v16.2d, alphaV0_R
  439. fmls v0.2d, v17.2d, alphaV0_I
  440. fmul v1.2d, v16.2d, alphaV0_I
  441. fmla v1.2d, v17.2d, alphaV0_R
  442. st2 {v0.2d, v1.2d}, [pCRow0]
  443. add pCRow0, pCRow0, #32
  444. fmul v2.2d, v18.2d, alphaV0_R
  445. fmls v2.2d, v19.2d, alphaV0_I
  446. fmul v3.2d, v18.2d, alphaV0_I
  447. fmla v3.2d, v19.2d, alphaV0_R
  448. st2 {v2.2d, v3.2d}, [pCRow0]
  449. add pCRow0, pCRow0, #32
  450. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  451. fmul v4.2d, v20.2d, alphaV0_R
  452. fmls v4.2d, v21.2d, alphaV0_I
  453. fmul v5.2d, v20.2d, alphaV0_I
  454. fmla v5.2d, v21.2d, alphaV0_R
  455. st2 {v4.2d, v5.2d}, [pCRow1]
  456. add pCRow1, pCRow1, #32
  457. fmul v6.2d, v22.2d, alphaV0_R
  458. fmls v6.2d, v23.2d, alphaV0_I
  459. fmul v7.2d, v22.2d, alphaV0_I
  460. fmla v7.2d, v23.2d, alphaV0_R
  461. st2 {v6.2d, v7.2d}, [pCRow1]
  462. add pCRow1, pCRow1, #32
  463. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  464. fmul v0.2d, v24.2d, alphaV0_R
  465. fmls v0.2d, v25.2d, alphaV0_I
  466. fmul v1.2d, v24.2d, alphaV0_I
  467. fmla v1.2d, v25.2d, alphaV0_R
  468. st2 {v0.2d, v1.2d}, [pCRow2]
  469. add pCRow2, pCRow2, #32
  470. fmul v2.2d, v26.2d, alphaV0_R
  471. fmls v2.2d, v27.2d, alphaV0_I
  472. fmul v3.2d, v26.2d, alphaV0_I
  473. fmla v3.2d, v27.2d, alphaV0_R
  474. st2 {v2.2d, v3.2d}, [pCRow2]
  475. add pCRow2, pCRow2, #32
  476. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  477. fmul v4.2d, v28.2d, alphaV0_R
  478. fmls v4.2d, v29.2d, alphaV0_I
  479. fmul v5.2d, v28.2d, alphaV0_I
  480. fmla v5.2d, v29.2d, alphaV0_R
  481. st2 {v4.2d, v5.2d}, [pCRow3]
  482. add pCRow3, pCRow3, #32
  483. fmul v6.2d, v30.2d, alphaV0_R
  484. fmls v6.2d, v31.2d, alphaV0_I
  485. fmul v7.2d, v30.2d, alphaV0_I
  486. fmla v7.2d, v31.2d, alphaV0_R
  487. st2 {v6.2d, v7.2d}, [pCRow3]
  488. add pCRow3, pCRow3, #32
  489. .endm
  490. /******************************************************************************/
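// The remaining macro families (2x4, 1x4, 4x2, 2x2, 1x2, 4x1, 2x1, 1x1)
// handle the edge tiles when M or N is not a multiple of 4; they follow the
// same load/accumulate/scale pattern as the 4x4 case, without pipelining.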
  491. .macro INIT2x4
  492. fmov d16, xzr
  493. fmov d17, xzr
  494. fmov d20, d16
  495. fmov d21, d17
  496. fmov d24, d16
  497. fmov d25, d17
  498. fmov d28, d16
  499. fmov d29, d17
  500. .endm
  501. .macro KERNEL2x4_SUB
  502. ld2 {v8.2d, v9.2d}, [pB]
  503. add pB, pB, #32
  504. ld2 {v10.2d, v11.2d}, [pB]
  505. add pB, pB, #32
  506. ld2 {v0.2d, v1.2d}, [pA]
  507. add pA, pA, #32
  508. OP_rr v16.2d, v0.2d, v8.d[0]
  509. OP_ii v16.2d, v1.2d, v9.d[0]
  510. OP_ri v17.2d, v0.2d, v9.d[0]
  511. OP_ir v17.2d, v1.2d, v8.d[0]
  512. OP_rr v20.2d, v0.2d, v8.d[1]
  513. OP_ii v20.2d, v1.2d, v9.d[1]
  514. OP_ri v21.2d, v0.2d, v9.d[1]
  515. OP_ir v21.2d, v1.2d, v8.d[1]
  516. OP_rr v24.2d, v0.2d, v10.d[0]
  517. OP_ii v24.2d, v1.2d, v11.d[0]
  518. OP_ri v25.2d, v0.2d, v11.d[0]
  519. OP_ir v25.2d, v1.2d, v10.d[0]
  520. OP_rr v28.2d, v0.2d, v10.d[1]
  521. OP_ii v28.2d, v1.2d, v11.d[1]
  522. OP_ri v29.2d, v0.2d, v11.d[1]
  523. OP_ir v29.2d, v1.2d, v10.d[1]
  524. .endm
  525. .macro SAVE2x4
  526. fmov alpha0_R, alphaR
  527. fmov alpha0_I, alphaI
  528. mov pCRow1, pCRow0
  529. fmul v0.2d, v16.2d, alphaV0_R
  530. fmls v0.2d, v17.2d, alphaV0_I
  531. fmul v1.2d, v16.2d, alphaV0_I
  532. fmla v1.2d, v17.2d, alphaV0_R
  533. st2 {v0.2d, v1.2d}, [pCRow1]
  534. add pCRow1, pCRow1, LDC
  535. fmul v4.2d, v20.2d, alphaV0_R
  536. fmls v4.2d, v21.2d, alphaV0_I
  537. fmul v5.2d, v20.2d, alphaV0_I
  538. fmla v5.2d, v21.2d, alphaV0_R
  539. st2 {v4.2d, v5.2d}, [pCRow1]
  540. add pCRow1, pCRow1, LDC
  541. fmul v0.2d, v24.2d, alphaV0_R
  542. fmls v0.2d, v25.2d, alphaV0_I
  543. fmul v1.2d, v24.2d, alphaV0_I
  544. fmla v1.2d, v25.2d, alphaV0_R
  545. st2 {v0.2d, v1.2d}, [pCRow1]
  546. add pCRow1, pCRow1, LDC
  547. fmul v4.2d, v28.2d, alphaV0_R
  548. fmls v4.2d, v29.2d, alphaV0_I
  549. fmul v5.2d, v28.2d, alphaV0_I
  550. fmla v5.2d, v29.2d, alphaV0_R
  551. st2 {v4.2d, v5.2d}, [pCRow1]
  552. add pCRow0, pCRow0, #32
  553. .endm
  554. /******************************************************************************/
  555. .macro INIT1x4
  556. fmov d16, xzr
  557. fmov d17, xzr
  558. fmov d20, d16
  559. fmov d21, d17
  560. fmov d24, d16
  561. fmov d25, d17
  562. fmov d28, d16
  563. fmov d29, d17
  564. .endm
  565. .macro KERNEL1x4_SUB
  566. ld2 {v8.2d, v9.2d}, [pB]
  567. add pB, pB, #32
  568. ld2 {v10.2d, v11.2d}, [pB]
  569. add pB, pB, #32
  570. ld2 {v0.d, v1.d}[0], [pA]
  571. add pA, pA, #16
  572. OP_rr d16, d0, v8.d[0]
  573. OP_ii d16, d1, v9.d[0]
  574. OP_ri d17, d0, v9.d[0]
  575. OP_ir d17, d1, v8.d[0]
  576. OP_rr d20, d0, v8.d[1]
  577. OP_ii d20, d1, v9.d[1]
  578. OP_ri d21, d0, v9.d[1]
  579. OP_ir d21, d1, v8.d[1]
  580. OP_rr d24, d0, v10.d[0]
  581. OP_ii d24, d1, v11.d[0]
  582. OP_ri d25, d0, v11.d[0]
  583. OP_ir d25, d1, v10.d[0]
  584. OP_rr d28, d0, v10.d[1]
  585. OP_ii d28, d1, v11.d[1]
  586. OP_ri d29, d0, v11.d[1]
  587. OP_ir d29, d1, v10.d[1]
  588. .endm
  589. .macro SAVE1x4
  590. fmov alpha0_R, alphaR
  591. fmov alpha0_I, alphaI
  592. mov pCRow1, pCRow0
  593. fmul d0, d16, alphaV0_R
  594. fmls d0, d17, alphaV0_I
  595. fmul d1, d16, alphaV0_I
  596. fmla d1, d17, alphaV0_R
  597. st2 {v0.d, v1.d}[0], [pCRow1]
  598. add pCRow1, pCRow1, LDC
  599. fmul d4, d20, alphaV0_R
  600. fmls d4, d21, alphaV0_I
  601. fmul d5, d20, alphaV0_I
  602. fmla d5, d21, alphaV0_R
  603. st2 {v4.d, v5.d}[0], [pCRow1]
  604. add pCRow1, pCRow1, LDC
  605. fmul d0, d24, alphaV0_R
  606. fmls d0, d25, alphaV0_I
  607. fmul d1, d24, alphaV0_I
  608. fmla d1, d25, alphaV0_R
  609. st2 {v0.d, v1.d}[0], [pCRow1]
  610. add pCRow1, pCRow1, LDC
  611. fmul d4, d28, alphaV0_R
  612. fmls d4, d29, alphaV0_I
  613. fmul d5, d28, alphaV0_I
  614. fmla d5, d29, alphaV0_R
  615. st2 {v4.d, v5.d}[0], [pCRow1]
  616. add pCRow0, pCRow0, #16
  617. .endm
  618. /******************************************************************************/
  619. .macro INIT4x2
  620. fmov d16, xzr
  621. fmov d17, xzr
  622. fmov d18, d16
  623. fmov d19, d17
  624. fmov d20, d16
  625. fmov d21, d17
  626. fmov d22, d16
  627. fmov d23, d17
  628. .endm
  629. .macro KERNEL4x2_SUB
  630. ld2 {v8.2d, v9.2d}, [pB]
  631. add pB, pB, #32
  632. ld2 {v0.2d, v1.2d}, [pA]
  633. add pA, pA, #32
  634. ld2 {v2.2d, v3.2d}, [pA]
  635. add pA, pA, #32
  636. OP_rr v16.2d, v0.2d, v8.d[0]
  637. OP_ii v16.2d, v1.2d, v9.d[0]
  638. OP_ri v17.2d, v0.2d, v9.d[0]
  639. OP_ir v17.2d, v1.2d, v8.d[0]
  640. OP_rr v18.2d, v2.2d, v8.d[0]
  641. OP_ii v18.2d, v3.2d, v9.d[0]
  642. OP_ri v19.2d, v2.2d, v9.d[0]
  643. OP_ir v19.2d, v3.2d, v8.d[0]
  644. OP_rr v20.2d, v0.2d, v8.d[1]
  645. OP_ii v20.2d, v1.2d, v9.d[1]
  646. OP_ri v21.2d, v0.2d, v9.d[1]
  647. OP_ir v21.2d, v1.2d, v8.d[1]
  648. OP_rr v22.2d, v2.2d, v8.d[1]
  649. OP_ii v22.2d, v3.2d, v9.d[1]
  650. OP_ri v23.2d, v2.2d, v9.d[1]
  651. OP_ir v23.2d, v3.2d, v8.d[1]
  652. .endm
  653. .macro SAVE4x2
  654. fmov alpha0_R, alphaR
  655. fmov alpha0_I, alphaI
  656. mov pCRow1, pCRow0
  657. fmul v0.2d, v16.2d, alphaV0_R
  658. fmls v0.2d, v17.2d, alphaV0_I
  659. fmul v1.2d, v16.2d, alphaV0_I
  660. fmla v1.2d, v17.2d, alphaV0_R
  661. st2 {v0.2d, v1.2d}, [pCRow1]
  662. add pCRow2, pCRow1, #32
  663. fmul v2.2d, v18.2d, alphaV0_R
  664. fmls v2.2d, v19.2d, alphaV0_I
  665. fmul v3.2d, v18.2d, alphaV0_I
  666. fmla v3.2d, v19.2d, alphaV0_R
  667. st2 {v2.2d, v3.2d}, [pCRow2]
  668. add pCRow1, pCRow1, LDC
  669. fmul v4.2d, v20.2d, alphaV0_R
  670. fmls v4.2d, v21.2d, alphaV0_I
  671. fmul v5.2d, v20.2d, alphaV0_I
  672. fmla v5.2d, v21.2d, alphaV0_R
  673. st2 {v4.2d, v5.2d}, [pCRow1]
  674. add pCRow2, pCRow1, #32
  675. fmul v6.2d, v22.2d, alphaV0_R
  676. fmls v6.2d, v23.2d, alphaV0_I
  677. fmul v7.2d, v22.2d, alphaV0_I
  678. fmla v7.2d, v23.2d, alphaV0_R
  679. st2 {v6.2d, v7.2d}, [pCRow2]
  680. add pCRow0, pCRow0, #64
  681. .endm
  682. /******************************************************************************/
  683. .macro INIT2x2
  684. fmov d16, xzr
  685. fmov d17, xzr
  686. fmov d20, d16
  687. fmov d21, d17
  688. .endm
  689. .macro KERNEL2x2_SUB
  690. ld2 {v8.2d, v9.2d}, [pB]
  691. add pB, pB, #32
  692. ld2 {v0.2d, v1.2d}, [pA]
  693. add pA, pA, #32
  694. OP_rr v16.2d, v0.2d, v8.d[0]
  695. OP_ii v16.2d, v1.2d, v9.d[0]
  696. OP_ri v17.2d, v0.2d, v9.d[0]
  697. OP_ir v17.2d, v1.2d, v8.d[0]
  698. OP_rr v20.2d, v0.2d, v8.d[1]
  699. OP_ii v20.2d, v1.2d, v9.d[1]
  700. OP_ri v21.2d, v0.2d, v9.d[1]
  701. OP_ir v21.2d, v1.2d, v8.d[1]
  702. .endm
  703. .macro SAVE2x2
  704. fmov alpha0_R, alphaR
  705. fmov alpha0_I, alphaI
  706. mov pCRow1, pCRow0
  707. fmul v0.2d, v16.2d, alphaV0_R
  708. fmls v0.2d, v17.2d, alphaV0_I
  709. fmul v1.2d, v16.2d, alphaV0_I
  710. fmla v1.2d, v17.2d, alphaV0_R
  711. st2 {v0.2d, v1.2d}, [pCRow1]
  712. add pCRow1, pCRow1, LDC
  713. fmul v4.2d, v20.2d, alphaV0_R
  714. fmls v4.2d, v21.2d, alphaV0_I
  715. fmul v5.2d, v20.2d, alphaV0_I
  716. fmla v5.2d, v21.2d, alphaV0_R
  717. st2 {v4.2d, v5.2d}, [pCRow1]
  718. add pCRow0, pCRow0, #32
  719. .endm
  720. /******************************************************************************/
  721. .macro INIT1x2
  722. fmov d16, xzr
  723. fmov d17, xzr
  724. fmov d20, xzr
  725. fmov d21, xzr
  726. .endm
  727. .macro KERNEL1x2_SUB
  728. ld2 {v8.2d, v9.2d}, [pB]
  729. add pB, pB, #32
  730. ld2 {v0.d, v1.d}[0], [pA]
  731. add pA, pA, #16
  732. OP_rr d16, d0, v8.d[0]
  733. OP_ii d16, d1, v9.d[0]
  734. OP_ri d17, d0, v9.d[0]
  735. OP_ir d17, d1, v8.d[0]
  736. OP_rr d20, d0, v8.d[1]
  737. OP_ii d20, d1, v9.d[1]
  738. OP_ri d21, d0, v9.d[1]
  739. OP_ir d21, d1, v8.d[1]
  740. .endm
  741. .macro SAVE1x2
  742. fmov alpha0_R, alphaR
  743. fmov alpha0_I, alphaI
  744. mov pCRow1, pCRow0
  745. fmul d0, d16, alphaV0_R
  746. fmls d0, d17, alphaV0_I
  747. fmul d1, d16, alphaV0_I
  748. fmla d1, d17, alphaV0_R
  749. st2 {v0.d, v1.d}[0], [pCRow1]
  750. add pCRow1, pCRow1, LDC
  751. fmul d4, d20, alphaV0_R
  752. fmls d4, d21, alphaV0_I
  753. fmul d5, d20, alphaV0_I
  754. fmla d5, d21, alphaV0_R
  755. st2 {v4.d, v5.d}[0], [pCRow1]
  756. add pCRow0, pCRow0, #16
  757. .endm
  758. /******************************************************************************/
  759. .macro INIT4x1
  760. fmov d16, xzr
  761. fmov d17, d16
  762. fmov d18, d16
  763. fmov d19, d17
  764. .endm
  765. .macro KERNEL4x1_SUB
  766. ld2 {v8.d, v9.d}[0], [pB]
  767. add pB, pB, #16
  768. ld2 {v0.2d, v1.2d}, [pA]
  769. add pA, pA, #32
  770. ld2 {v2.2d, v3.2d}, [pA]
  771. add pA, pA, #32
  772. OP_rr v16.2d, v0.2d, v8.d[0]
  773. OP_ii v16.2d, v1.2d, v9.d[0]
  774. OP_ri v17.2d, v0.2d, v9.d[0]
  775. OP_ir v17.2d, v1.2d, v8.d[0]
  776. OP_rr v18.2d, v2.2d, v8.d[0]
  777. OP_ii v18.2d, v3.2d, v9.d[0]
  778. OP_ri v19.2d, v2.2d, v9.d[0]
  779. OP_ir v19.2d, v3.2d, v8.d[0]
  780. .endm
  781. .macro SAVE4x1
  782. fmov alpha0_R, alphaR
  783. fmov alpha0_I, alphaI
  784. mov pCRow1, pCRow0
  785. fmul v0.2d, v16.2d, alphaV0_R
  786. fmls v0.2d, v17.2d, alphaV0_I
  787. fmul v1.2d, v16.2d, alphaV0_I
  788. fmla v1.2d, v17.2d, alphaV0_R
  789. st2 {v0.2d, v1.2d}, [pCRow1]
  790. add pCRow2, pCRow1, #32
  791. fmul v2.2d, v18.2d, alphaV0_R
  792. fmls v2.2d, v19.2d, alphaV0_I
  793. fmul v3.2d, v18.2d, alphaV0_I
  794. fmla v3.2d, v19.2d, alphaV0_R
  795. st2 {v2.2d, v3.2d}, [pCRow2]
  796. add pCRow0, pCRow0, #64
  797. .endm
  798. /******************************************************************************/
  799. .macro INIT2x1
  800. fmov d16, xzr
  801. fmov d17, xzr
  802. .endm
  803. .macro KERNEL2x1_SUB
  804. ld2 {v8.d, v9.d}[0], [pB]
  805. add pB, pB, #16
  806. ld2 {v0.2d, v1.2d}, [pA]
  807. add pA, pA, #32
  808. OP_rr v16.2d, v0.2d, v8.d[0]
  809. OP_ii v16.2d, v1.2d, v9.d[0]
  810. OP_ri v17.2d, v0.2d, v9.d[0]
  811. OP_ir v17.2d, v1.2d, v8.d[0]
  812. .endm
  813. .macro SAVE2x1
  814. fmov alpha0_R, alphaR
  815. fmov alpha0_I, alphaI
  816. mov pCRow1, pCRow0
  817. fmul v0.2d, v16.2d, alphaV0_R
  818. fmls v0.2d, v17.2d, alphaV0_I
  819. fmul v1.2d, v16.2d, alphaV0_I
  820. fmla v1.2d, v17.2d, alphaV0_R
  821. st2 {v0.2d, v1.2d}, [pCRow1]
  822. add pCRow0, pCRow0, #32
  823. .endm
  824. /******************************************************************************/
  825. .macro INIT1x1
  826. fmov d16, xzr
  827. fmov d17, xzr
  828. .endm
  829. .macro KERNEL1x1_SUB
  830. ld2 {v8.d, v9.d}[0], [pB]
  831. add pB, pB, #16
  832. ld2 {v0.d, v1.d}[0], [pA]
  833. add pA, pA, #16
  834. OP_rr d16, d0, v8.d[0]
  835. OP_ii d16, d1, v9.d[0]
  836. OP_ri d17, d0, v9.d[0]
  837. OP_ir d17, d1, v8.d[0]
  838. .endm
  839. .macro SAVE1x1
  840. fmov alpha0_R, alphaR
  841. fmov alpha0_I, alphaI
  842. mov pCRow1, pCRow0
  843. fmul d0, d16, alphaV0_R
  844. fmls d0, d17, alphaV0_I
  845. fmul d1, d16, alphaV0_I
  846. fmla d1, d17, alphaV0_R
  847. st2 {v0.d, v1.d}[0], [pCRow1]
  848. add pCRow0, pCRow0, #16
  849. .endm
  850. /*******************************************************************************
  851. * End of macro definitions
  852. *******************************************************************************/
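// Driver code: counterJ walks N in blocks of 4 columns (then 2, then 1),
// counterI walks M in blocks of 4 rows (then 2, then 1), and the K loop is
// unrolled by a factor of 8. tempOffset/tempK implement the TRMM offset
// handling: they adjust the packed A/B start pointers and the effective K
// length for the triangular operand, depending on the LEFT/TRANSA variant.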
  853. PROLOGUE
  854. .align 5
  855. add sp, sp, #-(11 * 16)
  856. stp d8, d9, [sp, #(0 * 16)]
  857. stp d10, d11, [sp, #(1 * 16)]
  858. stp d12, d13, [sp, #(2 * 16)]
  859. stp d14, d15, [sp, #(3 * 16)]
  860. stp d16, d17, [sp, #(4 * 16)]
  861. stp x18, x19, [sp, #(5 * 16)]
  862. stp x20, x21, [sp, #(6 * 16)]
  863. stp x22, x23, [sp, #(7 * 16)]
  864. stp x24, x25, [sp, #(8 * 16)]
  865. stp x26, x27, [sp, #(9 * 16)]
  866. str x28, [sp, #(10 * 16)]
  867. prfm PLDL1KEEP, [origPB]
  868. prfm PLDL1KEEP, [origPA]
  869. fmov alphaR, d0
  870. fmov alphaI, d1
  871. lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
  872. #if !defined(LEFT)
  873. neg tempOffset, offset
  874. #endif
  875. mov pB, origPB
  876. mov counterJ, origN
  877. asr counterJ, counterJ, #2 // J = J / 4
  878. cmp counterJ, #0
  879. ble .Lztrmm_kernel_L2_BEGIN
  880. .Lztrmm_kernel_L4_BEGIN:
  881. mov pCRow0, pC
  882. add pCRow1, pCRow0, LDC
  883. add pCRow2, pCRow1, LDC
  884. add pCRow3, pCRow2, LDC
  885. add pC, pCRow3, LDC
  886. #if defined(LEFT)
  887. mov tempOffset, offset
  888. #endif
  889. mov pA, origPA // pA = start of A array
  890. .Lztrmm_kernel_L4_M4_BEGIN:
  891. mov counterI, origM
  892. asr counterI, counterI, #2 // counterI = counterI / 4
  893. cmp counterI, #0
  894. ble .Lztrmm_kernel_L4_M2_BEGIN
  895. .align 5
  896. .Lztrmm_kernel_L4_M4_20:
  897. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  898. mov pB, origPB
  899. #else
  900. mov pB, origPB
  901. lsl temp, tempOffset, #6
  902. add pB, pB, temp
  903. add pA, pA, temp
  904. #endif
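// The shift #6 converts an element offset into bytes: 64 bytes is one k-step
// of a packed 4-wide panel (4 complex doubles), so both pA and pB advance by
// tempOffset such steps here.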
  905. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  906. sub tempK, origK, tempOffset
  907. #elif defined(LEFT)
  908. add tempK, tempOffset, #4
  909. #else
  910. add tempK, tempOffset, #4
  911. #endif
  912. asr counterL , tempK, #3
  913. cmp counterL , #2
  914. blt .Lztrmm_kernel_L4_M4_32
  915. KERNEL4x4_I
  916. KERNEL4x4_M2
  917. KERNEL4x4_M1
  918. KERNEL4x4_M2
  919. KERNEL4x4_M1
  920. KERNEL4x4_M2
  921. KERNEL4x4_M1
  922. KERNEL4x4_M2
  923. subs counterL, counterL, #2
  924. ble .Lztrmm_kernel_L4_M4_22a
  925. .align 5
  926. .Lztrmm_kernel_L4_M4_22:
  927. KERNEL4x4_M1
  928. KERNEL4x4_M2
  929. KERNEL4x4_M1
  930. KERNEL4x4_M2
  931. KERNEL4x4_M1
  932. KERNEL4x4_M2
  933. KERNEL4x4_M1
  934. KERNEL4x4_M2
  935. subs counterL, counterL, #1
  936. bgt .Lztrmm_kernel_L4_M4_22
  937. .align 5
  938. .Lztrmm_kernel_L4_M4_22a:
  939. KERNEL4x4_M1
  940. KERNEL4x4_M2
  941. KERNEL4x4_M1
  942. KERNEL4x4_M2
  943. KERNEL4x4_M1
  944. KERNEL4x4_M2
  945. KERNEL4x4_M1
  946. KERNEL4x4_E
  947. b .Lztrmm_kernel_L4_M4_44
  948. .align 5
  949. .Lztrmm_kernel_L4_M4_32:
  950. tst counterL, #1
  951. ble .Lztrmm_kernel_L4_M4_40
  952. KERNEL4x4_I
  953. KERNEL4x4_M2
  954. KERNEL4x4_M1
  955. KERNEL4x4_M2
  956. KERNEL4x4_M1
  957. KERNEL4x4_M2
  958. KERNEL4x4_M1
  959. KERNEL4x4_E
  960. b .Lztrmm_kernel_L4_M4_44
  961. .Lztrmm_kernel_L4_M4_40:
  962. INIT4x4
  963. .Lztrmm_kernel_L4_M4_44:
  964. ands counterL , tempK, #7
  965. ble .Lztrmm_kernel_L4_M4_100
  966. .align 5
  967. .Lztrmm_kernel_L4_M4_46:
  968. KERNEL4x4_SUB
  969. subs counterL, counterL, #1
  970. bne .Lztrmm_kernel_L4_M4_46
  971. .Lztrmm_kernel_L4_M4_100:
  972. SAVE4x4
  973. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  974. sub tempK, origK, tempOffset
  975. #if defined(LEFT)
  976. sub tempK, tempK, #4
  977. #else
  978. sub tempK, tempK, #4
  979. #endif
  980. lsl temp, tempK, #6
  981. add pA, pA, temp
  982. add pB, pB, temp
  983. #endif
  984. #if defined(LEFT)
  985. add tempOffset, tempOffset, #4
  986. #endif
  987. prfm PLDL1KEEP, [pA]
  988. prfm PLDL1KEEP, [pA, #64]
  989. prfm PLDL1KEEP, [origPB]
  990. .Lztrmm_kernel_L4_M4_END:
  991. subs counterI, counterI, #1
  992. bne .Lztrmm_kernel_L4_M4_20
  993. .Lztrmm_kernel_L4_M2_BEGIN:
  994. mov counterI, origM
  995. tst counterI , #3
  996. ble .Lztrmm_kernel_L4_END
  997. tst counterI, #2 // is there an M remainder of at least 2?
  998. ble .Lztrmm_kernel_L4_M1_BEGIN
  999. .Lztrmm_kernel_L4_M2_20:
  1000. INIT2x4
  1001. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1002. mov pB, origPB
  1003. #else
  1004. mov pB, origPB
  1005. lsl temp, tempOffset, #5
  1006. add pA, pA, temp
  1007. lsl temp, tempOffset, #6
  1008. add pB, pB, temp
  1009. #endif
  1010. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1011. sub tempK, origK, tempOffset
  1012. #elif defined(LEFT)
  1013. add tempK, tempOffset, #2
  1014. #else
  1015. add tempK, tempOffset, #4
  1016. #endif
  1017. asr counterL , tempK, #3 // counterL = counterL / 8
  1018. cmp counterL , #0
  1019. ble .Lztrmm_kernel_L4_M2_40
  1020. .Lztrmm_kernel_L4_M2_22:
  1021. KERNEL2x4_SUB
  1022. KERNEL2x4_SUB
  1023. KERNEL2x4_SUB
  1024. KERNEL2x4_SUB
  1025. KERNEL2x4_SUB
  1026. KERNEL2x4_SUB
  1027. KERNEL2x4_SUB
  1028. KERNEL2x4_SUB
  1029. subs counterL, counterL, #1
  1030. bgt .Lztrmm_kernel_L4_M2_22
  1031. .Lztrmm_kernel_L4_M2_40:
  1032. ands counterL , tempK, #7 // counterL = counterL % 8
  1033. ble .Lztrmm_kernel_L4_M2_100
  1034. .Lztrmm_kernel_L4_M2_42:
  1035. KERNEL2x4_SUB
  1036. subs counterL, counterL, #1
  1037. bgt .Lztrmm_kernel_L4_M2_42
  1038. .Lztrmm_kernel_L4_M2_100:
  1039. SAVE2x4
  1040. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1041. sub tempK, origK, tempOffset
  1042. #if defined(LEFT)
  1043. sub tempK, tempK, #2
  1044. #else
  1045. sub tempK, tempK, #4
  1046. #endif
  1047. lsl temp, tempK, #5
  1048. add pA, pA, temp
  1049. lsl temp, tempK, #6
  1050. add pB, pB, temp
  1051. #endif
  1052. #if defined(LEFT)
  1053. add tempOffset, tempOffset, #2
  1054. #endif
  1055. .Lztrmm_kernel_L4_M2_END:
  1056. .Lztrmm_kernel_L4_M1_BEGIN:
  1057. tst counterI, #1 // is there an M remainder of 1?
  1058. ble .Lztrmm_kernel_L4_END
  1059. .Lztrmm_kernel_L4_M1_20:
  1060. INIT1x4
  1061. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1062. mov pB, origPB
  1063. #else
  1064. mov pB, origPB
  1065. lsl temp, tempOffset, #6
  1066. add pB, pB, temp
  1067. lsl temp, tempOffset, #4
  1068. add pA, pA, temp
  1069. #endif
  1070. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1071. sub tempK, origK, tempOffset
  1072. #elif defined(LEFT)
  1073. add tempK, tempOffset, #1
  1074. #else
  1075. add tempK, tempOffset, #4
  1076. #endif
  1077. asr counterL , tempK, #3 // counterL = counterL / 8
  1078. cmp counterL , #0
  1079. ble .Lztrmm_kernel_L4_M1_40
  1080. .Lztrmm_kernel_L4_M1_22:
  1081. KERNEL1x4_SUB
  1082. KERNEL1x4_SUB
  1083. KERNEL1x4_SUB
  1084. KERNEL1x4_SUB
  1085. KERNEL1x4_SUB
  1086. KERNEL1x4_SUB
  1087. KERNEL1x4_SUB
  1088. KERNEL1x4_SUB
  1089. subs counterL, counterL, #1
  1090. bgt .Lztrmm_kernel_L4_M1_22
  1091. .Lztrmm_kernel_L4_M1_40:
  1092. ands counterL , tempK, #7 // counterL = counterL % 8
  1093. ble .Lztrmm_kernel_L4_M1_100
  1094. .Lztrmm_kernel_L4_M1_42:
  1095. KERNEL1x4_SUB
  1096. subs counterL, counterL, #1
  1097. bgt .Lztrmm_kernel_L4_M1_42
  1098. .Lztrmm_kernel_L4_M1_100:
  1099. SAVE1x4
  1100. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1101. sub tempK, origK, tempOffset
  1102. #if defined(LEFT)
  1103. sub tempK, tempK, #1
  1104. #else
  1105. sub tempK, tempK, #4
  1106. #endif
  1107. lsl temp, tempK, #4
  1108. add pA, pA, temp
  1109. lsl temp, tempK, #6
  1110. add pB, pB, temp
  1111. #endif
  1112. #if defined(LEFT)
  1113. add tempOffset, tempOffset, #1
  1114. #endif
  1115. .Lztrmm_kernel_L4_END:
  1116. lsl temp, origK, #6
  1117. add origPB, origPB, temp // B = B + K * 4 * 8 * 2
  1118. #if !defined(LEFT)
  1119. add tempOffset, tempOffset, #4
  1120. #endif
  1121. subs counterJ, counterJ , #1 // j--
  1122. bgt .Lztrmm_kernel_L4_BEGIN
  1123. /******************************************************************************/
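// N-remainder columns: the 2-column and 1-column paths below reuse the same
// offset logic with narrower packed-B strides (lsl #5 = 2 complex doubles per
// k-step, lsl #4 = 1).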
  1124. .Lztrmm_kernel_L2_BEGIN: // fewer than 4 columns left in the N direction
  1125. mov counterJ , origN
  1126. tst counterJ , #3
  1127. ble .Lztrmm_kernel_L999 // N is a multiple of 4, nothing left to do
  1128. tst counterJ , #2
  1129. ble .Lztrmm_kernel_L1_BEGIN
  1130. mov pCRow0, pC // pCRow0 = pC
  1131. add pC,pC,LDC, lsl #1
  1132. #if defined(LEFT)
  1133. mov tempOffset, offset
  1134. #endif
  1135. mov pA, origPA // pA = A
  1136. .Lztrmm_kernel_L2_M4_BEGIN:
  1137. mov counterI, origM
  1138. asr counterI, counterI, #2 // counterI = counterI / 4
  1139. cmp counterI,#0
  1140. ble .Lztrmm_kernel_L2_M2_BEGIN
  1141. .Lztrmm_kernel_L2_M4_20:
  1142. INIT4x2
  1143. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1144. mov pB, origPB
  1145. #else
  1146. mov pB, origPB
  1147. lsl temp, tempOffset, #5
  1148. add pB, pB, temp
  1149. lsl temp, tempOffset, #6
  1150. add pA, pA, temp
  1151. #endif
  1152. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1153. sub tempK, origK, tempOffset
  1154. #elif defined(LEFT)
  1155. add tempK, tempOffset, #4
  1156. #else
  1157. add tempK, tempOffset, #2
  1158. #endif
  1159. asr counterL , tempK, #3 // counterL = counterL / 8
  1160. cmp counterL,#0
  1161. ble .Lztrmm_kernel_L2_M4_40
  1162. .align 5
  1163. .Lztrmm_kernel_L2_M4_22:
  1164. KERNEL4x2_SUB
  1165. KERNEL4x2_SUB
  1166. KERNEL4x2_SUB
  1167. KERNEL4x2_SUB
  1168. KERNEL4x2_SUB
  1169. KERNEL4x2_SUB
  1170. KERNEL4x2_SUB
  1171. KERNEL4x2_SUB
  1172. subs counterL, counterL, #1
  1173. bgt .Lztrmm_kernel_L2_M4_22
  1174. .Lztrmm_kernel_L2_M4_40:
  1175. ands counterL , tempK, #7 // counterL = counterL % 8
  1176. ble .Lztrmm_kernel_L2_M4_100
  1177. .Lztrmm_kernel_L2_M4_42:
  1178. KERNEL4x2_SUB
  1179. subs counterL, counterL, #1
  1180. bgt .Lztrmm_kernel_L2_M4_42
  1181. .Lztrmm_kernel_L2_M4_100:
  1182. SAVE4x2
  1183. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1184. sub tempK, origK, tempOffset
  1185. #if defined(LEFT)
  1186. sub tempK, tempK, #4
  1187. #else
  1188. sub tempK, tempK, #2
  1189. #endif
  1190. lsl temp, tempK, #6
  1191. add pA, pA, temp
  1192. lsl temp, tempK, #5
  1193. add pB, pB, temp
  1194. #endif
  1195. #if defined(LEFT)
  1196. add tempOffset, tempOffset, #4
  1197. #endif
  1198. .Lztrmm_kernel_L2_M4_END:
  1199. subs counterI, counterI, #1
  1200. bgt .Lztrmm_kernel_L2_M4_20
  1201. .Lztrmm_kernel_L2_M2_BEGIN:
  1202. mov counterI, origM
  1203. tst counterI , #3
  1204. ble .Lztrmm_kernel_L2_END
  1205. tst counterI, #2 // is there an M remainder of at least 2?
  1206. ble .Lztrmm_kernel_L2_M1_BEGIN
  1207. .Lztrmm_kernel_L2_M2_20:
  1208. INIT2x2
  1209. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1210. mov pB, origPB
  1211. #else
  1212. mov pB, origPB
  1213. lsl temp, tempOffset, #5
  1214. add pB, pB, temp
  1215. lsl temp, tempOffset, #5
  1216. add pA, pA, temp
  1217. #endif
  1218. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1219. sub tempK, origK, tempOffset
  1220. #elif defined(LEFT)
  1221. add tempK, tempOffset, #2
  1222. #else
  1223. add tempK, tempOffset, #2
  1224. #endif
  1225. asr counterL , tempK, #3 // counterL = counterL / 8
  1226. cmp counterL,#0
  1227. ble .Lztrmm_kernel_L2_M2_40
  1228. .Lztrmm_kernel_L2_M2_22:
  1229. KERNEL2x2_SUB
  1230. KERNEL2x2_SUB
  1231. KERNEL2x2_SUB
  1232. KERNEL2x2_SUB
  1233. KERNEL2x2_SUB
  1234. KERNEL2x2_SUB
  1235. KERNEL2x2_SUB
  1236. KERNEL2x2_SUB
  1237. subs counterL, counterL, #1
  1238. bgt .Lztrmm_kernel_L2_M2_22
  1239. .Lztrmm_kernel_L2_M2_40:
  1240. ands counterL , tempK, #7 // counterL = counterL % 8
  1241. ble .Lztrmm_kernel_L2_M2_100
  1242. .Lztrmm_kernel_L2_M2_42:
  1243. KERNEL2x2_SUB
  1244. subs counterL, counterL, #1
  1245. bgt .Lztrmm_kernel_L2_M2_42
  1246. .Lztrmm_kernel_L2_M2_100:
  1247. SAVE2x2
  1248. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1249. sub tempK, origK, tempOffset
  1250. #if defined(LEFT)
  1251. sub tempK, tempK, #2
  1252. #else
  1253. sub tempK, tempK, #2
  1254. #endif
  1255. lsl temp, tempK, #5
  1256. add pA, pA, temp
  1257. lsl temp, tempK, #5
  1258. add pB, pB, temp
  1259. #endif
  1260. #if defined(LEFT)
  1261. add tempOffset, tempOffset, #2
  1262. #endif
  1263. .Lztrmm_kernel_L2_M2_END:
  1264. .Lztrmm_kernel_L2_M1_BEGIN:
  1265. tst counterI, #1 // is there an M remainder of 1?
  1266. ble .Lztrmm_kernel_L2_END
  1267. .Lztrmm_kernel_L2_M1_20:
  1268. INIT1x2
  1269. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1270. mov pB, origPB
  1271. #else
  1272. mov pB, origPB
  1273. lsl temp, tempOffset, #5
  1274. add pB, pB, temp
  1275. lsl temp, tempOffset, #4
  1276. add pA, pA, temp
  1277. #endif
  1278. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1279. sub tempK, origK, tempOffset
  1280. #elif defined(LEFT)
  1281. add tempK, tempOffset, #1
  1282. #else
  1283. add tempK, tempOffset, #2
  1284. #endif
  1285. asr counterL , tempK, #3 // counterL = counterL / 8
  1286. cmp counterL, #0
  1287. ble .Lztrmm_kernel_L2_M1_40
  1288. .Lztrmm_kernel_L2_M1_22:
  1289. KERNEL1x2_SUB
  1290. KERNEL1x2_SUB
  1291. KERNEL1x2_SUB
  1292. KERNEL1x2_SUB
  1293. KERNEL1x2_SUB
  1294. KERNEL1x2_SUB
  1295. KERNEL1x2_SUB
  1296. KERNEL1x2_SUB
  1297. subs counterL, counterL, #1
  1298. bgt .Lztrmm_kernel_L2_M1_22
  1299. .Lztrmm_kernel_L2_M1_40:
  1300. ands counterL , tempK, #7 // counterL = counterL % 8
  1301. ble .Lztrmm_kernel_L2_M1_100
  1302. .Lztrmm_kernel_L2_M1_42:
  1303. KERNEL1x2_SUB
  1304. subs counterL, counterL, #1
  1305. bgt .Lztrmm_kernel_L2_M1_42
  1306. .Lztrmm_kernel_L2_M1_100:
  1307. SAVE1x2
  1308. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1309. sub tempK, origK, tempOffset
  1310. #if defined(LEFT)
  1311. sub tempK, tempK, #1
  1312. #else
  1313. sub tempK, tempK, #2
  1314. #endif
  1315. lsl temp, tempK, #4
  1316. add pA, pA, temp
  1317. lsl temp, tempK, #5
  1318. add pB, pB, temp
  1319. #endif
  1320. #if defined(LEFT)
  1321. add tempOffset, tempOffset, #1
  1322. #endif
  1323. .Lztrmm_kernel_L2_END:
  1324. #if !defined(LEFT)
  1325. add tempOffset, tempOffset, #2
  1326. #endif
  1327. lsl temp, origK, #5
  1328. add origPB, origPB, temp // B = B + K * 2 * 8 * 2
  1329. /******************************************************************************/
  1330. .Lztrmm_kernel_L1_BEGIN:
  1331. mov counterJ , origN
  1332. tst counterJ , #1
  1333. ble .Lztrmm_kernel_L999 // done
  1334. mov pCRow0, pC // pCRow0 = C
  1335. add pC , pC , LDC // advance pC past this column of C
  1336. #if defined(LEFT)
  1337. mov tempOffset, offset
  1338. #endif
  1339. mov pA, origPA // pA = A
  1340. .Lztrmm_kernel_L1_M4_BEGIN:
  1341. mov counterI, origM
  1342. asr counterI, counterI, #2 // counterI = counterI / 4
  1343. cmp counterI, #0
  1344. ble .Lztrmm_kernel_L1_M2_BEGIN
  1345. .Lztrmm_kernel_L1_M4_20:
  1346. INIT4x1
  1347. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1348. mov pB, origPB
  1349. #else
  1350. mov pB, origPB
  1351. lsl temp, tempOffset, #4
  1352. add pB, pB, temp
  1353. lsl temp, tempOffset, #6
  1354. add pA, pA, temp
  1355. #endif
  1356. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1357. sub tempK, origK, tempOffset
  1358. #elif defined(LEFT)
  1359. add tempK, tempOffset, #4
  1360. #else
  1361. add tempK, tempOffset, #1
  1362. #endif
  1363. asr counterL , tempK, #3 // counterL = counterL / 8
  1364. cmp counterL , #0
  1365. ble .Lztrmm_kernel_L1_M4_40
  1366. .align 5
  1367. .Lztrmm_kernel_L1_M4_22:
  1368. KERNEL4x1_SUB
  1369. KERNEL4x1_SUB
  1370. KERNEL4x1_SUB
  1371. KERNEL4x1_SUB
  1372. KERNEL4x1_SUB
  1373. KERNEL4x1_SUB
  1374. KERNEL4x1_SUB
  1375. KERNEL4x1_SUB
  1376. subs counterL, counterL, #1
  1377. bgt .Lztrmm_kernel_L1_M4_22
  1378. .Lztrmm_kernel_L1_M4_40:
  1379. ands counterL , tempK, #7 // counterL = counterL % 8
  1380. ble .Lztrmm_kernel_L1_M4_100
  1381. .Lztrmm_kernel_L1_M4_42:
  1382. KERNEL4x1_SUB
  1383. subs counterL, counterL, #1
  1384. bgt .Lztrmm_kernel_L1_M4_42
  1385. .Lztrmm_kernel_L1_M4_100:
  1386. SAVE4x1
  1387. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1388. sub tempK, origK, tempOffset
  1389. #if defined(LEFT)
  1390. sub tempK, tempK, #4
  1391. #else
  1392. sub tempK, tempK, #1
  1393. #endif
  1394. lsl temp, tempK, #6
  1395. add pA, pA, temp
  1396. lsl temp, tempK, #4
  1397. add pB, pB, temp
  1398. #endif
  1399. #if defined(LEFT)
  1400. add tempOffset, tempOffset, #4
  1401. #endif
  1402. .Lztrmm_kernel_L1_M4_END:
  1403. subs counterI, counterI, #1
  1404. bgt .Lztrmm_kernel_L1_M4_20
  1405. .Lztrmm_kernel_L1_M2_BEGIN:
  1406. mov counterI, origM
  1407. tst counterI , #3
  1408. ble .Lztrmm_kernel_L1_END
  1409. tst counterI, #2 // is there an M remainder of at least 2?
  1410. ble .Lztrmm_kernel_L1_M1_BEGIN
  1411. .Lztrmm_kernel_L1_M2_20:
  1412. INIT2x1
  1413. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1414. mov pB, origPB
  1415. #else
  1416. mov pB, origPB
  1417. lsl temp, tempOffset, #4
  1418. add pB, pB, temp
  1419. lsl temp, tempOffset, #5
  1420. add pA, pA, temp
  1421. #endif
  1422. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1423. sub tempK, origK, tempOffset
  1424. #elif defined(LEFT)
  1425. add tempK, tempOffset, #2
  1426. #else
  1427. add tempK, tempOffset, #1
  1428. #endif
  1429. asr counterL , tempK, #3 // counterL = counterL / 8
  1430. cmp counterL , #0
  1431. ble .Lztrmm_kernel_L1_M2_40
  1432. .Lztrmm_kernel_L1_M2_22:
  1433. KERNEL2x1_SUB
  1434. KERNEL2x1_SUB
  1435. KERNEL2x1_SUB
  1436. KERNEL2x1_SUB
  1437. KERNEL2x1_SUB
  1438. KERNEL2x1_SUB
  1439. KERNEL2x1_SUB
  1440. KERNEL2x1_SUB
  1441. subs counterL, counterL, #1
  1442. bgt .Lztrmm_kernel_L1_M2_22
  1443. .Lztrmm_kernel_L1_M2_40:
  1444. ands counterL , tempK, #7 // counterL = counterL % 8
  1445. ble .Lztrmm_kernel_L1_M2_100
  1446. .Lztrmm_kernel_L1_M2_42:
  1447. KERNEL2x1_SUB
  1448. subs counterL, counterL, #1
  1449. bgt .Lztrmm_kernel_L1_M2_42
  1450. .Lztrmm_kernel_L1_M2_100:
  1451. SAVE2x1
  1452. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1453. sub tempK, origK, tempOffset
  1454. #if defined(LEFT)
  1455. sub tempK, tempK, #2
  1456. #else
  1457. sub tempK, tempK, #1
  1458. #endif
  1459. lsl temp, tempK, #5
  1460. add pA, pA, temp
  1461. lsl temp, tempK, #4
  1462. add pB, pB, temp
  1463. #endif
  1464. #if defined(LEFT)
  1465. add tempOffset, tempOffset, #2
  1466. #endif
  1467. .Lztrmm_kernel_L1_M2_END:
  1468. .Lztrmm_kernel_L1_M1_BEGIN:
  1469. tst counterI, #1 // is there an M remainder of 1?
  1470. ble .Lztrmm_kernel_L1_END
  1471. .Lztrmm_kernel_L1_M1_20:
  1472. INIT1x1
  1473. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1474. mov pB, origPB
  1475. #else
  1476. mov pB, origPB
  1477. lsl temp, tempOffset, #4
  1478. add pB, pB, temp
  1479. lsl temp, tempOffset, #4
  1480. add pA, pA, temp
  1481. #endif
  1482. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1483. sub tempK, origK, tempOffset
  1484. #elif defined(LEFT)
  1485. add tempK, tempOffset, #1
  1486. #else
  1487. add tempK, tempOffset, #1
  1488. #endif
  1489. asr counterL , tempK, #3 // counterL = counterL / 8
  1490. cmp counterL , #0
  1491. ble .Lztrmm_kernel_L1_M1_40
  1492. .Lztrmm_kernel_L1_M1_22:
  1493. KERNEL1x1_SUB
  1494. KERNEL1x1_SUB
  1495. KERNEL1x1_SUB
  1496. KERNEL1x1_SUB
  1497. KERNEL1x1_SUB
  1498. KERNEL1x1_SUB
  1499. KERNEL1x1_SUB
  1500. KERNEL1x1_SUB
  1501. subs counterL, counterL, #1
  1502. bgt .Lztrmm_kernel_L1_M1_22
  1503. .Lztrmm_kernel_L1_M1_40:
  1504. ands counterL , tempK, #7 // counterL = counterL % 8
  1505. ble .Lztrmm_kernel_L1_M1_100
  1506. .Lztrmm_kernel_L1_M1_42:
  1507. KERNEL1x1_SUB
  1508. subs counterL, counterL, #1
  1509. bgt .Lztrmm_kernel_L1_M1_42
  1510. .Lztrmm_kernel_L1_M1_100:
  1511. SAVE1x1
  1512. .Lztrmm_kernel_L1_END:
  1513. .Lztrmm_kernel_L999:
  1514. mov x0, #0 // set return value
  1515. ldp d8, d9, [sp, #(0 * 16)]
  1516. ldp d10, d11, [sp, #(1 * 16)]
  1517. ldp d12, d13, [sp, #(2 * 16)]
  1518. ldp d14, d15, [sp, #(3 * 16)]
  1519. ldp d16, d17, [sp, #(4 * 16)]
  1520. ldp x18, x19, [sp, #(5 * 16)]
  1521. ldp x20, x21, [sp, #(6 * 16)]
  1522. ldp x22, x23, [sp, #(7 * 16)]
  1523. ldp x24, x25, [sp, #(8 * 16)]
  1524. ldp x26, x27, [sp, #(9 * 16)]
  1525. ldr x28, [sp, #(10 * 16)]
  1526. add sp, sp, #(11*16)
  1527. ret
  1528. EPILOGUE