
dtrmm_kernel_8x4.S

/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"

/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
/* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc, BLASLONG offset) */

#define origM x0
#define origN x1
#define origK x2
#define origPA x3
#define origPB x4
#define pC x5
#define LDC x6
#define offset x7
#define counterL x8
#define counterI x9
#define counterJ x10
#define pB x11
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pCRow3 x15
#define pA x16
#define alpha x17
//#define temp x18
#define tempOffset x19
#define tempK x20
#define temp x21

#define alpha0 d10
#define alphaV0 v10.d[0]

#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pCRow3
// 16 pA
// 17 alpha
// 18 must save (platform register, unused here)
// 19 must save tempOffset
// 20 must save tempK
// 21 must save temp
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
//v00 ALPHA -> pA0_0, pA0_1
//v01 pA0_2, pA0_3
//v02 pA0_4, pA0_5
//v03 pA0_6, pA0_7
//v04 pA1_0, pA1_1
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
//v19 C06, C07
//v20 C10, C11
//v21 C12, C13
//v22 C14, C15
//v23 C16, C17
//v24 C20, C21
//v25 C22, C23
//v26 C24, C25
//v27 C26, C27
//v28 C30, C31
//v29 C32, C33
//v30 C34, C35
//v31 C36, C37
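
/* For reference, a hedged C-level sketch (illustrative names only, not part of
   the original source) of what one 8x4 micro-tile below computes from the
   packed panels pA (8 doubles per k step) and pB (4 doubles per k step).
   As in a TRMM kernel, the scaled result overwrites C instead of accumulating
   into it:

       double acc[4][8] = {0};
       for (BLASLONG k = 0; k < tempK; k++)
           for (int j = 0; j < 4; j++)
               for (int i = 0; i < 8; i++)
                   acc[j][i] += pA[8*k + i] * pB[4*k + j];
       for (int j = 0; j < 4; j++)
           for (int i = 0; i < 8; i++)
               C[j*ldc + i] = alpha * acc[j][i];

   Each accumulator vNN listed above holds one adjacent pair of these doubles. */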
/*******************************************************************************
* Macro definitions
*******************************************************************************/
.macro INIT8x4
fmov d16, xzr
fmov d17, xzr
fmov d18, d16
fmov d19, xzr
fmov d20, xzr
fmov d21, d16
fmov d22, d17
fmov d23, d18
fmov d24, xzr
fmov d25, d16
fmov d26, d17
fmov d27, d18
fmov d28, xzr
fmov d29, d16
fmov d30, d17
fmov d31, d18
.endm
.macro KERNEL8x4_I
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.d[0]
fmul v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmul v17.2d, v1.2d, v8.d[0]
fmul v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmul v24.2d, v0.2d, v10.d[0]
fmul v28.2d, v0.2d, v11.d[0]
ldp q4, q5, [pA], #32
fmul v25.2d, v1.2d, v10.d[0]
fmul v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmul v18.2d, v2.2d, v8.d[0]
fmul v22.2d, v2.2d, v9.d[0]
ldp d14, d15, [pB], #16
fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v21.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB], #16
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [pA], #32
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
ldp q0, q1, [pA], #32
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
ldp d8, d9, [pB], #16
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
ldp d10, d11, [pB], #16
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
ldp q2, q3, [pA], #32
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
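
/* Descriptive note (added, not in the original source): the four macros above
   form a software-pipelined pair of register sets for the main K loop.
   KERNEL8x4_I loads both A/B register halves and starts the accumulators with
   fmul from v0-v3/v8-v11; KERNEL8x4_M1 computes with v0-v3/v8-v11 while
   loading the next v4-v7/v12-v15; KERNEL8x4_M2 does the opposite; and
   KERNEL8x4_E drains the pipeline from v4-v7/v12-v15 without loading more
   data. KERNEL8x4_SUB below is the unpipelined single-step version used for
   the K remainder. */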
.macro KERNEL8x4_SUB
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
fmov alpha0, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV0
stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV0
stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.2d, v24.2d, alphaV0
fmul v1.2d, v25.2d, alphaV0
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v2.2d, v26.2d, alphaV0
fmul v3.2d, v27.2d, alphaV0
stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0
fmul v5.2d, v29.2d, alphaV0
stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v6.2d, v30.2d, alphaV0
fmul v7.2d, v31.2d, alphaV0
stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
.macro INIT4x4
fmov d16, xzr
fmov d17, d16
fmov d20, d17
fmov d21, d16
fmov d24, d17
fmov d25, d16
fmov d28, d17
fmov d29, d16
.endm
.macro KERNEL4x4_SUB
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.d[0]
fmla v29.2d, v1.2d, v9.d[1]
fmla v20.2d, v0.2d, v8.d[1]
fmla v25.2d, v1.2d, v9.d[0]
fmla v24.2d, v0.2d, v9.d[0]
fmla v21.2d, v1.2d, v8.d[1]
fmla v28.2d, v0.2d, v9.d[1]
fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
fmul v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV0
fmul v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
.endm
/******************************************************************************/
.macro INIT2x4
fmov d16, xzr
fmov d20, d16
fmov d24, d20
fmov d28, d16
.endm
.macro KERNEL2x4_SUB
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v0.2d}, [pA]
add pA, pA, #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v24.2d, v0.2d, v9.d[0]
fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT1x4
fmov d16, xzr
fmov d20, d16
.endm
.macro KERNEL1x4_SUB
ldr d0, [pA]
add pA, pA, #8
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
fmla v16.2d, v8.2d, v0.d[0]
fmla v20.2d, v9.2d, v0.d[0]
.endm
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
.macro INIT8x2
fmov d16, xzr
fmov d17, xzr
fmov d18, d16
fmov d19, d17
fmov d20, xzr
fmov d21, d16
fmov d22, d17
fmov d23, d18
.endm
.macro KERNEL8x2_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d}, [pB]
add pB, pB, #16
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV0
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
.endm
/******************************************************************************/
.macro INIT4x2
fmov d16, xzr
fmov d17, d16
fmov d20, d17
fmov d21, d16
.endm
.macro KERNEL4x2_SUB
ld1 {v8.2d}, [pB]
add pB, pB, #16
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
.endm
/******************************************************************************/
.macro INIT2x2
fmov d16, xzr
fmov d20, d16
.endm
.macro KERNEL2x2_SUB
ld1 {v8.2d}, [pB]
add pB, pB, #16
ld1 {v0.2d}, [pA]
add pA, pA, #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT1x2
fmov d16, xzr
.endm
.macro KERNEL1x2_SUB
ld1 {v8.2d}, [pB]
add pB, pB, #16
ldr d0, [pA]
add pA, pA, #8
fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
.macro INIT8x1
fmov d16, xzr
fmov d17, xzr
fmov d18, d16
fmov d19, d17
.endm
.macro KERNEL8x1_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ldr d8, [pB]
add pB, pB, #8
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
fmov alpha0, alpha
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
.endm
/******************************************************************************/
.macro INIT4x1
fmov d16, xzr
fmov d17, d16
.endm
.macro KERNEL4x1_SUB
ldr d8, [pB]
add pB, pB, #8
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
.endm
/******************************************************************************/
.macro INIT2x1
fmov d16, xzr
.endm
.macro KERNEL2x1_SUB
ldr d8, [pB]
add pB, pB, #8
ld1 {v0.2d}, [pA]
add pA, pA, #16
fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow0, pCRow0, #16
.endm
/******************************************************************************/
.macro INIT1x1
fmov d16, xzr
.endm
.macro KERNEL1x1_SUB
ldr d8, [pB]
add pB, pB, #8
ldr d0, [pA]
add pA, pA, #8
fmadd d16, d0, d8, d16
.endm
.macro SAVE1x1
fmov alpha0, alpha
fmul d8, d16, alpha0
str d8, [pCRow0]
add pCRow0, pCRow0, #8
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
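
/* Overall control flow of the kernel body below, shown as a hedged C-like
   sketch (added for orientation; names are the labels and registers defined
   above, not a literal implementation):

       for (counterJ = N/4; counterJ > 0; counterJ--) {   // .Ldtrmm_kernel_L4_BEGIN
           for (counterI = M/8; counterI > 0; counterI--) // .Ldtrmm_kernel_L4_M8_20
               8x4 tile: pipelined K/8 loop plus K%8 tail;
           then 4x4, 2x4, 1x4 edge tiles for the M remainder;
       }
       if (N & 2) { 8x2, 4x2, 2x2, 1x2 tiles }             // .Ldtrmm_kernel_L2_BEGIN
       if (N & 1) { 8x1, 4x1, 2x1, 1x1 tiles }             // .Ldtrmm_kernel_L1_BEGIN
*/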
PROLOGUE
.align 5
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
#if !defined(LEFT)
neg tempOffset, offset
#endif
mov pB, origPB
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
#endif
mov pA, origPA // pA = start of A array
.Ldtrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble .Ldtrmm_kernel_L4_M4_BEGIN
.align 5
.Ldtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #6
add pA, pA, temp
lsl temp, tempOffset, #5
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #8
#else
add tempK, tempOffset, #4
#endif
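/* Rough C-level restatement (illustrative only) of the triangular-offset
   handling just above for an 8x4 tile:

       #if (LEFT && !TRANSA) || (!LEFT && TRANSA)
           tempK = K - tempOffset;     // trailing part of the panel
       #elif LEFT
           tempK = tempOffset + 8;     // leading part plus this tile's 8 rows
       #else
           tempK = tempOffset + 4;     // leading part plus this tile's 4 columns
       #endif

   In the complementary branch above, pA and pB are advanced past tempOffset
   packed k-steps (8 doubles per step for A, 4 per step for B) so the multiply
   starts at the triangular boundary. */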
asr counterL, tempK, #3 // counterL = tempK / 8
cmp counterL, #2 // at least 2 unrolled blocks of 8 to do?
blt .Ldtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble .Ldtrmm_kernel_L4_M8_22a
.align 5
.Ldtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M8_22
.align 5
.Ldtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b .Ldtrmm_kernel_L4_M8_44
.align 5
.Ldtrmm_kernel_L4_M8_32:
tst counterL, #1
ble .Ldtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b .Ldtrmm_kernel_L4_M8_44
.Ldtrmm_kernel_L4_M8_40:
INIT8x4
.Ldtrmm_kernel_L4_M8_44:
ands counterL, tempK, #7
ble .Ldtrmm_kernel_L4_M8_100
.align 5
.Ldtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne .Ldtrmm_kernel_L4_M8_46
.Ldtrmm_kernel_L4_M8_100:
SAVE8x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #8
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #6
add pA, pA, temp
lsl temp, tempK, #5
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne .Ldtrmm_kernel_L4_M8_20
.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI, #7
ble .Ldtrmm_kernel_L4_END
tst counterI, #4
ble .Ldtrmm_kernel_L4_M2_BEGIN
.Ldtrmm_kernel_L4_M4_20:
INIT4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #5
add pB, pB, temp
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #4
#else
add tempK, tempOffset, #4
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L4_M4_40
.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M4_22
.Ldtrmm_kernel_L4_M4_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L4_M4_100
.Ldtrmm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M4_42
.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #4
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #5
add pA, pA, temp
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
.Ldtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI, #3
ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // is there a 2-row block left?
ble .Ldtrmm_kernel_L4_M1_BEGIN
.Ldtrmm_kernel_L4_M2_20:
INIT2x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pA, pA, temp
lsl temp, tempOffset, #5
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #2
#else
add tempK, tempOffset, #4
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L4_M2_40
.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M2_22
.Ldtrmm_kernel_L4_M2_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L4_M2_100
.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M2_42
.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #2
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #4
add pA, pA, temp
lsl temp, tempK, #5
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
.Ldtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble .Ldtrmm_kernel_L4_END
.Ldtrmm_kernel_L4_M1_20:
INIT1x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #5
add pB, pB, temp
lsl temp, tempOffset, #3
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #1
#else
add tempK, tempOffset, #4
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L4_M1_40
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M1_22
.Ldtrmm_kernel_L4_M1_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L4_M1_100
.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L4_M1_42
.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #1
#else
sub tempK, tempK, #4
#endif
lsl temp, tempK, #3
add pA, pA, temp
lsl temp, tempK, #5
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
#if !defined(LEFT)
add tempOffset, tempOffset, #4
#endif
subs counterJ, counterJ, #1 // j--
bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
.Ldtrmm_kernel_L2_BEGIN: // handle the remaining N mod 4 columns
mov counterJ, origN
tst counterJ, #3
ble .Ldtrmm_kernel_L999 // nothing left if N is a multiple of 4
tst counterJ, #2
ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
add pC, pC, LDC, lsl #1
#if defined(LEFT)
mov tempOffset, offset
#endif
mov pA, origPA // pA = A
.Ldtrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble .Ldtrmm_kernel_L2_M4_BEGIN
.Ldtrmm_kernel_L2_M8_20:
INIT8x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #6
add pA, pA, temp
lsl temp, tempOffset, #4
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #8
#else
add tempK, tempOffset, #2
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L2_M8_40
.align 5
.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M8_22
.Ldtrmm_kernel_L2_M8_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L2_M8_100
.Ldtrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M8_42
.Ldtrmm_kernel_L2_M8_100:
SAVE8x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #8
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #6
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
.Ldtrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
bgt .Ldtrmm_kernel_L2_M8_20
.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI, #7
ble .Ldtrmm_kernel_L2_END
tst counterI, #4 // is there a 4-row block left?
ble .Ldtrmm_kernel_L2_M2_BEGIN
.Ldtrmm_kernel_L2_M4_20:
INIT4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pB, pB, temp
lsl temp, tempOffset, #5
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #4
#else
add tempK, tempOffset, #2
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L2_M4_40
.align 5
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M4_22
.Ldtrmm_kernel_L2_M4_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L2_M4_100
.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M4_42
.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #4
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #5
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
.Ldtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI, #3
ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // is there a 2-row block left?
ble .Ldtrmm_kernel_L2_M1_BEGIN
.Ldtrmm_kernel_L2_M2_20:
INIT2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pB, pB, temp
lsl temp, tempOffset, #4
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #2
#else
add tempK, tempOffset, #2
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L2_M2_40
.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M2_22
.Ldtrmm_kernel_L2_M2_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L2_M2_100
.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M2_42
.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #2
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #4
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
.Ldtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble .Ldtrmm_kernel_L2_END
.Ldtrmm_kernel_L2_M1_20:
INIT1x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #4
add pB, pB, temp
lsl temp, tempOffset, #3
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #1
#else
add tempK, tempOffset, #2
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L2_M1_40
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M1_22
.Ldtrmm_kernel_L2_M1_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L2_M1_100
.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L2_M1_42
.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #1
#else
sub tempK, tempK, #2
#endif
lsl temp, tempK, #3
add pA, pA, temp
lsl temp, tempK, #4
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
.Ldtrmm_kernel_L1_BEGIN:
mov counterJ, origN
tst counterJ, #1
ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC // update pC to point to the next column
#if defined(LEFT)
mov tempOffset, offset
#endif
mov pA, origPA // pA = A
.Ldtrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
ble .Ldtrmm_kernel_L1_M4_BEGIN
.Ldtrmm_kernel_L1_M8_20:
INIT8x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #6
add pA, pA, temp
lsl temp, tempOffset, #3
add pB, pB, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #8
#else
add tempK, tempOffset, #1
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L1_M8_40
.align 5
.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M8_22
.Ldtrmm_kernel_L1_M8_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L1_M8_100
.Ldtrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M8_42
.Ldtrmm_kernel_L1_M8_100:
SAVE8x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #8
#else
sub tempK, tempK, #1
#endif
lsl temp, tempK, #6
add pA, pA, temp
lsl temp, tempK, #3
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
.Ldtrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
bgt .Ldtrmm_kernel_L1_M8_20
.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI, #7
ble .Ldtrmm_kernel_L1_END
tst counterI, #4 // is there a 4-row block left?
ble .Ldtrmm_kernel_L1_M2_BEGIN
.Ldtrmm_kernel_L1_M4_20:
INIT4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pB, pB, temp
lsl temp, tempOffset, #5
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #4
#else
add tempK, tempOffset, #1
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L1_M4_40
.align 5
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M4_22
.Ldtrmm_kernel_L1_M4_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L1_M4_100
.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M4_42
.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #4
#else
sub tempK, tempK, #1
#endif
lsl temp, tempK, #5
add pA, pA, temp
lsl temp, tempK, #3
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
.Ldtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI, #3
ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // is there a 2-row block left?
ble .Ldtrmm_kernel_L1_M1_BEGIN
.Ldtrmm_kernel_L1_M2_20:
INIT2x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pB, pB, temp
lsl temp, tempOffset, #4
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #2
#else
add tempK, tempOffset, #1
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L1_M2_40
.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M2_22
.Ldtrmm_kernel_L1_M2_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L1_M2_100
.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M2_42
.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
sub tempK, origK, tempOffset
#if defined(LEFT)
sub tempK, tempK, #2
#else
sub tempK, tempK, #1
#endif
lsl temp, tempK, #4
add pA, pA, temp
lsl temp, tempK, #3
add pB, pB, temp
#endif
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
.Ldtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
ble .Ldtrmm_kernel_L1_END
.Ldtrmm_kernel_L1_M1_20:
INIT1x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#else
mov pB, origPB
lsl temp, tempOffset, #3
add pB, pB, temp
lsl temp, tempOffset, #3
add pA, pA, temp
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub tempK, origK, tempOffset
#elif defined(LEFT)
add tempK, tempOffset, #1
#else
add tempK, tempOffset, #1
#endif
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL, #0
ble .Ldtrmm_kernel_L1_M1_40
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M1_22
.Ldtrmm_kernel_L1_M1_40:
ands counterL, tempK, #7 // counterL = counterL % 8
ble .Ldtrmm_kernel_L1_M1_100
.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
bgt .Ldtrmm_kernel_L1_M1_42
.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
.Ldtrmm_kernel_L1_END:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11 * 16)
ret
EPILOGUE