You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ztrmm_kernel_4x4.S 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
/* X0 X1 X2 d0 d1 X3 x4 x5 x6 x7 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define alpha_save_R x16
  48. #define alpha_save_I x17
  49. #define temp x18
  50. #define tempOffset x19
  51. #define tempK x20
  52. #define alpha0_R d10
  53. #define alphaV0_R v10.d[0]
  54. #define alpha0_I d11
  55. #define alphaV0_I v11.d[0]
  56. #define alpha1_R d14
  57. #define alphaV1_R v14.d[0]
  58. #define alpha1_I d15
  59. #define alphaV1_I v15.d[0]
  60. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  61. #define OP_rr fmla
  62. #define OP_ii fmls
  63. #define OP_ri fmla
  64. #define OP_ir fmla
  65. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  66. #define OP_rr fmla
  67. #define OP_ii fmla
  68. #define OP_ri fmls
  69. #define OP_ir fmla
  70. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  71. #define OP_rr fmla
  72. #define OP_ii fmla
  73. #define OP_ri fmla
  74. #define OP_ir fmls
  75. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  76. #define OP_rr fmla
  77. #define OP_ii fmls
  78. #define OP_ri fmls
  79. #define OP_ir fmls
  80. #endif
  81. // 00 origM
  82. // 01 origN
  83. // 02 origK
  84. // 03 origPA
  85. // 04 origPB
  86. // 05 pC
  87. // 06 origLDC -> LDC
  88. // 07 offset
  89. // 08 counterL
  90. // 09 counterI
  91. // 10 counterJ
  92. // 11 pB
  93. // 12 pCRow0
  94. // 13 pCRow1
  95. // 14 pCRow2
  96. // 15 pA
  97. // 16 alpha_save_R
  98. // 17 alpha_save_I
  99. // 18 must save temp
  100. // 19 must save tempOffset
  101. // 20 must save tempK
  102. // 21 must save
  103. // 22 must save
  104. // 23 must save
  105. // 24 must save
  106. // 25 must save
  107. // 26 must save
  108. // 27 must save
  109. // 28 must save
  110. // 29 frame
  111. // 30 link
  112. // 31 sp
  113. //v00 ALPHA_R -> pA00_R, pA01_R
  114. //v01 ALPHA_I -> pA00_I, pA01_I
  115. //v02 pA02_R, pA03_R
  116. //v03 pA02_I, pA03_I
  117. //v04 pA10_R, pA11_R
  118. //v05 pA10_I, pA11_I
  119. //v06 pA12_R, pA13_R
  120. //v07 pA12_I, pA13_I
  121. //v08 must save pB00_R, pB01_R
  122. //v09 must save pB00_I, pB01_I
  123. //v10 must save pB02_R, pB03_R OR ALPHA0_R
  124. //v11 must save pB02_I, pB03_I OR ALPHA0_I
  125. //v12 must save pB10_R, pB11_R
  126. //v13 must save pB10_I, pB11_I
  127. //v14 must save pB12_R, pB13_R OR ALPHA1_R
//v15 must save pB12_I, pB13_I OR ALPHA1_I
  129. //v16 must save pC00_R, pC01_R
  130. //v17 must save pC00_I, pC01_I
  131. //v18 pC02_R, pC03_R
  132. //v19 pC02_I, pC03_I
  133. //v20 pC10_R, pC11_R
  134. //v21 pC10_I, pC11_I
  135. //v22 pC12_R, pC13_R
  136. //v23 pC12_I, pC13_I
  137. //v24 pC20_R, pC21_R
  138. //v25 pC20_I, pC21_I
  139. //v26 pC22_R, pC23_R
  140. //v27 pC22_I, pC23_I
  141. //v28 pC30_R, pC31_R
  142. //v29 pC30_I, pC31_I
  143. //v30 pC32_R, pC33_R
  144. //v31 pC32_I, pC33_I
  145. /*******************************************************************************
  146. * Macro definitions
  147. *******************************************************************************/
  148. .macro INIT4x4
  149. fmov d16, xzr
  150. fmov d17, d16
  151. fmov d18, d17
  152. fmov d19, d16
  153. fmov d20, d17
  154. fmov d21, d16
  155. fmov d22, d17
  156. fmov d23, d16
  157. fmov d24, d17
  158. fmov d25, d16
  159. fmov d26, d17
  160. fmov d27, d16
  161. fmov d28, d17
  162. fmov d29, d16
  163. fmov d30, d17
  164. fmov d31, d16
  165. .endm
  166. .macro KERNEL4x4_I
  167. ld2 {v8.2d, v9.2d}, [pB]
  168. add pB, pB, #32
  169. ld2 {v10.2d, v11.2d}, [pB]
  170. add pB, pB, #32
  171. ld2 {v0.2d, v1.2d}, [pA]
  172. add pA, pA, #32
  173. ld2 {v2.2d, v3.2d}, [pA]
  174. add pA, pA, #32
  175. fmul v16.2d, v0.2d, v8.d[0]
  176. OP_ii v16.2d, v1.2d, v9.d[0]
  177. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  178. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  179. eor v17.16b, v17.16b, v17.16b
  180. fmls v17.2d, v0.2d, v9.d[0]
  181. #else
  182. fmul v17.2d, v0.2d, v9.d[0]
  183. #endif
  184. OP_ir v17.2d, v1.2d, v8.d[0]
  185. fmul v18.2d, v2.2d, v8.d[0]
  186. OP_ii v18.2d, v3.2d, v9.d[0]
  187. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  188. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  189. eor v19.16b, v19.16b, v19.16b
  190. fmls v19.2d, v2.2d, v9.d[0]
  191. #else
  192. fmul v19.2d, v2.2d, v9.d[0]
  193. #endif
  194. OP_ir v19.2d, v3.2d, v8.d[0]
  195. fmul v20.2d, v0.2d, v8.d[1]
  196. OP_ii v20.2d, v1.2d, v9.d[1]
  197. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  198. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  199. eor v21.16b, v21.16b, v21.16b
  200. fmls v21.2d, v0.2d, v9.d[1]
  201. #else
  202. fmul v21.2d, v0.2d, v9.d[1]
  203. #endif
  204. OP_ir v21.2d, v1.2d, v8.d[1]
  205. fmul v22.2d, v2.2d, v8.d[1]
  206. OP_ii v22.2d, v3.2d, v9.d[1]
  207. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  208. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  209. eor v23.16b, v23.16b, v23.16b
  210. fmls v23.2d, v2.2d, v9.d[1]
  211. #else
  212. fmul v23.2d, v2.2d, v9.d[1]
  213. #endif
  214. OP_ir v23.2d, v3.2d, v8.d[1]
  215. fmul v24.2d, v0.2d, v10.d[0]
  216. OP_ii v24.2d, v1.2d, v11.d[0]
  217. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  218. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  219. eor v25.16b, v25.16b, v25.16b
  220. fmls v25.2d, v0.2d, v11.d[0]
  221. #else
  222. fmul v25.2d, v0.2d, v11.d[0]
  223. #endif
  224. OP_ir v25.2d, v1.2d, v10.d[0]
  225. fmul v26.2d, v2.2d, v10.d[0]
  226. OP_ii v26.2d, v3.2d, v11.d[0]
  227. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  228. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  229. eor v27.16b, v27.16b, v27.16b
  230. fmls v27.2d, v2.2d, v11.d[0]
  231. #else
  232. fmul v27.2d, v2.2d, v11.d[0]
  233. #endif
  234. OP_ir v27.2d, v3.2d, v10.d[0]
  235. fmul v28.2d, v0.2d, v10.d[1]
  236. OP_ii v28.2d, v1.2d, v11.d[1]
  237. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  238. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  239. eor v29.16b, v29.16b, v29.16b
  240. fmls v29.2d, v0.2d, v11.d[1]
  241. #else
  242. fmul v29.2d, v0.2d, v11.d[1]
  243. #endif
  244. OP_ir v29.2d, v1.2d, v10.d[1]
  245. fmul v30.2d, v2.2d, v10.d[1]
  246. OP_ii v30.2d, v3.2d, v11.d[1]
  247. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  248. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  249. eor v31.16b, v31.16b, v31.16b
  250. fmls v31.2d, v2.2d, v11.d[1]
  251. #else
  252. fmul v31.2d, v2.2d, v11.d[1]
  253. #endif
  254. OP_ir v31.2d, v3.2d, v10.d[1]
  255. ld2 {v12.2d, v13.2d}, [pB]
  256. add pB, pB, #32
  257. ld2 {v14.2d, v15.2d}, [pB]
  258. add pB, pB, #32
  259. ld2 {v4.2d, v5.2d} , [pA]
  260. add pA, pA, #32
  261. ld2 {v6.2d, v7.2d} , [pA]
  262. add pA, pA, #32
  263. .endm
  264. .macro KERNEL4x4_M1
  265. OP_rr v16.2d, v0.2d, v8.d[0]
  266. OP_ii v16.2d, v1.2d, v9.d[0]
  267. OP_ri v17.2d, v0.2d, v9.d[0]
  268. OP_ir v17.2d, v1.2d, v8.d[0]
  269. ld2 {v12.2d, v13.2d}, [pB] // For next round
  270. add pB, pB, #32
  271. OP_rr v18.2d, v2.2d, v8.d[0]
  272. OP_ii v18.2d, v3.2d, v9.d[0]
  273. OP_ri v19.2d, v2.2d, v9.d[0]
  274. OP_ir v19.2d, v3.2d, v8.d[0]
  275. ld2 {v14.2d, v15.2d}, [pB] // For next round
  276. add pB, pB, #32
  277. OP_rr v20.2d, v0.2d, v8.d[1]
  278. OP_ii v20.2d, v1.2d, v9.d[1]
  279. OP_ri v21.2d, v0.2d, v9.d[1]
  280. OP_ir v21.2d, v1.2d, v8.d[1]
  281. ld2 {v4.2d, v5.2d} , [pA] // For next round
  282. add pA, pA, #32
  283. OP_rr v22.2d, v2.2d, v8.d[1]
  284. OP_ii v22.2d, v3.2d, v9.d[1]
  285. OP_ri v23.2d, v2.2d, v9.d[1]
  286. OP_ir v23.2d, v3.2d, v8.d[1]
  287. ld2 {v6.2d, v7.2d} , [pA] // For next round
  288. add pA, pA, #32
  289. OP_rr v24.2d, v0.2d, v10.d[0]
  290. OP_ii v24.2d, v1.2d, v11.d[0]
  291. OP_ri v25.2d, v0.2d, v11.d[0]
  292. OP_ir v25.2d, v1.2d, v10.d[0]
  293. prfm PLDL1KEEP, [pA, #512]
  294. OP_rr v26.2d, v2.2d, v10.d[0]
  295. OP_ii v26.2d, v3.2d, v11.d[0]
  296. OP_ri v27.2d, v2.2d, v11.d[0]
  297. OP_ir v27.2d, v3.2d, v10.d[0]
  298. prfm PLDL1KEEP, [pB, #512]
  299. OP_rr v28.2d, v0.2d, v10.d[1]
  300. OP_ii v28.2d, v1.2d, v11.d[1]
  301. OP_ri v29.2d, v0.2d, v11.d[1]
  302. OP_ir v29.2d, v1.2d, v10.d[1]
  303. OP_rr v30.2d, v2.2d, v10.d[1]
  304. OP_ii v30.2d, v3.2d, v11.d[1]
  305. OP_ri v31.2d, v2.2d, v11.d[1]
  306. OP_ir v31.2d, v3.2d, v10.d[1]
  307. .endm
  308. .macro KERNEL4x4_M2
  309. OP_rr v16.2d, v4.2d, v12.d[0]
  310. OP_ii v16.2d, v5.2d, v13.d[0]
  311. OP_ri v17.2d, v4.2d, v13.d[0]
  312. OP_ir v17.2d, v5.2d, v12.d[0]
  313. ld2 {v8.2d, v9.2d}, [pB] // For next round
  314. add pB, pB, #32
  315. OP_rr v18.2d, v6.2d, v12.d[0]
  316. OP_ii v18.2d, v7.2d, v13.d[0]
  317. OP_ri v19.2d, v6.2d, v13.d[0]
  318. OP_ir v19.2d, v7.2d, v12.d[0]
  319. ld2 {v10.2d, v11.2d}, [pB] // For next round
  320. add pB, pB, #32
  321. OP_rr v20.2d, v4.2d, v12.d[1]
  322. OP_ii v20.2d, v5.2d, v13.d[1]
  323. OP_ri v21.2d, v4.2d, v13.d[1]
  324. OP_ir v21.2d, v5.2d, v12.d[1]
  325. ld2 {v0.2d, v1.2d}, [pA] // For next round
  326. add pA, pA, #32
  327. OP_rr v22.2d, v6.2d, v12.d[1]
  328. OP_ii v22.2d, v7.2d, v13.d[1]
  329. OP_ri v23.2d, v6.2d, v13.d[1]
  330. OP_ir v23.2d, v7.2d, v12.d[1]
  331. ld2 {v2.2d, v3.2d}, [pA] // For next round
  332. add pA, pA, #32
  333. OP_rr v24.2d, v4.2d, v14.d[0]
  334. OP_ii v24.2d, v5.2d, v15.d[0]
  335. OP_ri v25.2d, v4.2d, v15.d[0]
  336. OP_ir v25.2d, v5.2d, v14.d[0]
  337. prfm PLDL1KEEP, [pA, #512]
  338. OP_rr v26.2d, v6.2d, v14.d[0]
  339. OP_ii v26.2d, v7.2d, v15.d[0]
  340. OP_ri v27.2d, v6.2d, v15.d[0]
  341. OP_ir v27.2d, v7.2d, v14.d[0]
  342. prfm PLDL1KEEP, [pB, #512]
  343. OP_rr v28.2d, v4.2d, v14.d[1]
  344. OP_ii v28.2d, v5.2d, v15.d[1]
  345. OP_ri v29.2d, v4.2d, v15.d[1]
  346. OP_ir v29.2d, v5.2d, v14.d[1]
  347. OP_rr v30.2d, v6.2d, v14.d[1]
  348. OP_ii v30.2d, v7.2d, v15.d[1]
  349. OP_ri v31.2d, v6.2d, v15.d[1]
  350. OP_ir v31.2d, v7.2d, v14.d[1]
  351. .endm
  352. .macro KERNEL4x4_E
  353. OP_rr v16.2d, v4.2d, v12.d[0]
  354. OP_ii v16.2d, v5.2d, v13.d[0]
  355. OP_ri v17.2d, v4.2d, v13.d[0]
  356. OP_ir v17.2d, v5.2d, v12.d[0]
  357. OP_rr v18.2d, v6.2d, v12.d[0]
  358. OP_ii v18.2d, v7.2d, v13.d[0]
  359. OP_ri v19.2d, v6.2d, v13.d[0]
  360. OP_ir v19.2d, v7.2d, v12.d[0]
  361. OP_rr v20.2d, v4.2d, v12.d[1]
  362. OP_ii v20.2d, v5.2d, v13.d[1]
  363. OP_ri v21.2d, v4.2d, v13.d[1]
  364. OP_ir v21.2d, v5.2d, v12.d[1]
  365. OP_rr v22.2d, v6.2d, v12.d[1]
  366. OP_ii v22.2d, v7.2d, v13.d[1]
  367. OP_ri v23.2d, v6.2d, v13.d[1]
  368. OP_ir v23.2d, v7.2d, v12.d[1]
  369. OP_rr v24.2d, v4.2d, v14.d[0]
  370. OP_ii v24.2d, v5.2d, v15.d[0]
  371. OP_ri v25.2d, v4.2d, v15.d[0]
  372. OP_ir v25.2d, v5.2d, v14.d[0]
  373. OP_rr v26.2d, v6.2d, v14.d[0]
  374. OP_ii v26.2d, v7.2d, v15.d[0]
  375. OP_ri v27.2d, v6.2d, v15.d[0]
  376. OP_ir v27.2d, v7.2d, v14.d[0]
  377. OP_rr v28.2d, v4.2d, v14.d[1]
  378. OP_ii v28.2d, v5.2d, v15.d[1]
  379. OP_ri v29.2d, v4.2d, v15.d[1]
  380. OP_ir v29.2d, v5.2d, v14.d[1]
  381. OP_rr v30.2d, v6.2d, v14.d[1]
  382. OP_ii v30.2d, v7.2d, v15.d[1]
  383. OP_ri v31.2d, v6.2d, v15.d[1]
  384. OP_ir v31.2d, v7.2d, v14.d[1]
  385. .endm
  386. .macro KERNEL4x4_SUB
  387. ld2 {v8.2d, v9.2d}, [pB]
  388. add pB, pB, #32
  389. ld2 {v10.2d, v11.2d}, [pB]
  390. add pB, pB, #32
  391. ld2 {v0.2d, v1.2d}, [pA]
  392. add pA, pA, #32
  393. ld2 {v2.2d, v3.2d}, [pA]
  394. add pA, pA, #32
  395. OP_rr v16.2d, v0.2d, v8.d[0]
  396. OP_ii v16.2d, v1.2d, v9.d[0]
  397. OP_ri v17.2d, v0.2d, v9.d[0]
  398. OP_ir v17.2d, v1.2d, v8.d[0]
  399. OP_rr v18.2d, v2.2d, v8.d[0]
  400. OP_ii v18.2d, v3.2d, v9.d[0]
  401. OP_ri v19.2d, v2.2d, v9.d[0]
  402. OP_ir v19.2d, v3.2d, v8.d[0]
  403. OP_rr v20.2d, v0.2d, v8.d[1]
  404. OP_ii v20.2d, v1.2d, v9.d[1]
  405. OP_ri v21.2d, v0.2d, v9.d[1]
  406. OP_ir v21.2d, v1.2d, v8.d[1]
  407. OP_rr v22.2d, v2.2d, v8.d[1]
  408. OP_ii v22.2d, v3.2d, v9.d[1]
  409. OP_ri v23.2d, v2.2d, v9.d[1]
  410. OP_ir v23.2d, v3.2d, v8.d[1]
  411. OP_rr v24.2d, v0.2d, v10.d[0]
  412. OP_ii v24.2d, v1.2d, v11.d[0]
  413. OP_ri v25.2d, v0.2d, v11.d[0]
  414. OP_ir v25.2d, v1.2d, v10.d[0]
  415. OP_rr v26.2d, v2.2d, v10.d[0]
  416. OP_ii v26.2d, v3.2d, v11.d[0]
  417. OP_ri v27.2d, v2.2d, v11.d[0]
  418. OP_ir v27.2d, v3.2d, v10.d[0]
  419. OP_rr v28.2d, v0.2d, v10.d[1]
  420. OP_ii v28.2d, v1.2d, v11.d[1]
  421. OP_ri v29.2d, v0.2d, v11.d[1]
  422. OP_ir v29.2d, v1.2d, v10.d[1]
  423. OP_rr v30.2d, v2.2d, v10.d[1]
  424. OP_ii v30.2d, v3.2d, v11.d[1]
  425. OP_ri v31.2d, v2.2d, v11.d[1]
  426. OP_ir v31.2d, v3.2d, v10.d[1]
  427. .endm
  428. .macro SAVE4x4
  429. fmov alpha0_R, alpha_save_R
  430. fmov alpha0_I, alpha_save_I
  431. fmov alpha1_R, alpha0_R
  432. fmov alpha1_I, alpha0_I
  433. mov pCRow1, pCRow0
  434. fmul v0.2d, v16.2d, alphaV0_R
  435. fmls v0.2d, v17.2d, alphaV0_I
  436. fmul v1.2d, v16.2d, alphaV1_I
  437. fmla v1.2d, v17.2d, alphaV1_R
  438. st2 {v0.2d, v1.2d}, [pCRow1]
  439. add pCRow2, pCRow1, #32
  440. fmul v2.2d, v18.2d, alphaV0_R
  441. fmls v2.2d, v19.2d, alphaV0_I
  442. fmul v3.2d, v18.2d, alphaV1_I
  443. fmla v3.2d, v19.2d, alphaV1_R
  444. st2 {v2.2d, v3.2d}, [pCRow2]
  445. add pCRow1, pCRow1, LDC
  446. fmul v4.2d, v20.2d, alphaV0_R
  447. fmls v4.2d, v21.2d, alphaV0_I
  448. fmul v5.2d, v20.2d, alphaV1_I
  449. fmla v5.2d, v21.2d, alphaV1_R
  450. st2 {v4.2d, v5.2d}, [pCRow1]
  451. add pCRow2, pCRow1, #32
  452. fmul v6.2d, v22.2d, alphaV0_R
  453. fmls v6.2d, v23.2d, alphaV0_I
  454. fmul v7.2d, v22.2d, alphaV1_I
  455. fmla v7.2d, v23.2d, alphaV1_R
  456. st2 {v6.2d, v7.2d}, [pCRow2]
  457. add pCRow1, pCRow1, LDC
  458. fmul v0.2d, v24.2d, alphaV0_R
  459. fmls v0.2d, v25.2d, alphaV0_I
  460. fmul v1.2d, v24.2d, alphaV1_I
  461. fmla v1.2d, v25.2d, alphaV1_R
  462. st2 {v0.2d, v1.2d}, [pCRow1]
  463. add pCRow2, pCRow1, #32
  464. fmul v2.2d, v26.2d, alphaV0_R
  465. fmls v2.2d, v27.2d, alphaV0_I
  466. fmul v3.2d, v26.2d, alphaV1_I
  467. fmla v3.2d, v27.2d, alphaV1_R
  468. st2 {v2.2d, v3.2d}, [pCRow2]
  469. add pCRow1, pCRow1, LDC
  470. fmul v4.2d, v28.2d, alphaV0_R
  471. fmls v4.2d, v29.2d, alphaV0_I
  472. fmul v5.2d, v28.2d, alphaV1_I
  473. fmla v5.2d, v29.2d, alphaV1_R
  474. st2 {v4.2d, v5.2d}, [pCRow1]
  475. add pCRow2, pCRow1, #32
  476. fmul v6.2d, v30.2d, alphaV0_R
  477. fmls v6.2d, v31.2d, alphaV0_I
  478. fmul v7.2d, v30.2d, alphaV1_I
  479. fmla v7.2d, v31.2d, alphaV1_R
  480. st2 {v6.2d, v7.2d}, [pCRow2]
  481. add pCRow0, pCRow0, #64
  482. .endm
  483. /******************************************************************************/
  484. .macro INIT2x4
  485. fmov d16, xzr
  486. fmov d17, xzr
  487. fmov d20, d16
  488. fmov d21, d17
  489. fmov d24, d16
  490. fmov d25, d17
  491. fmov d28, d16
  492. fmov d29, d17
  493. .endm
  494. .macro KERNEL2x4_SUB
  495. ld2 {v8.2d, v9.2d}, [pB]
  496. add pB, pB, #32
  497. ld2 {v10.2d, v11.2d}, [pB]
  498. add pB, pB, #32
  499. ld2 {v0.2d, v1.2d}, [pA]
  500. add pA, pA, #32
  501. OP_rr v16.2d, v0.2d, v8.d[0]
  502. OP_ii v16.2d, v1.2d, v9.d[0]
  503. OP_ri v17.2d, v0.2d, v9.d[0]
  504. OP_ir v17.2d, v1.2d, v8.d[0]
  505. OP_rr v20.2d, v0.2d, v8.d[1]
  506. OP_ii v20.2d, v1.2d, v9.d[1]
  507. OP_ri v21.2d, v0.2d, v9.d[1]
  508. OP_ir v21.2d, v1.2d, v8.d[1]
  509. OP_rr v24.2d, v0.2d, v10.d[0]
  510. OP_ii v24.2d, v1.2d, v11.d[0]
  511. OP_ri v25.2d, v0.2d, v11.d[0]
  512. OP_ir v25.2d, v1.2d, v10.d[0]
  513. OP_rr v28.2d, v0.2d, v10.d[1]
  514. OP_ii v28.2d, v1.2d, v11.d[1]
  515. OP_ri v29.2d, v0.2d, v11.d[1]
  516. OP_ir v29.2d, v1.2d, v10.d[1]
  517. .endm
  518. .macro SAVE2x4
  519. fmov alpha0_R, alpha_save_R
  520. fmov alpha0_I, alpha_save_I
  521. fmov alpha1_R, alpha0_R
  522. fmov alpha1_I, alpha0_I
  523. mov pCRow1, pCRow0
  524. fmul v0.2d, v16.2d, alphaV0_R
  525. fmls v0.2d, v17.2d, alphaV0_I
  526. fmul v1.2d, v16.2d, alphaV1_I
  527. fmla v1.2d, v17.2d, alphaV1_R
  528. st2 {v0.2d, v1.2d}, [pCRow1]
  529. add pCRow1, pCRow1, LDC
  530. fmul v4.2d, v20.2d, alphaV0_R
  531. fmls v4.2d, v21.2d, alphaV0_I
  532. fmul v5.2d, v20.2d, alphaV1_I
  533. fmla v5.2d, v21.2d, alphaV1_R
  534. st2 {v4.2d, v5.2d}, [pCRow1]
  535. add pCRow1, pCRow1, LDC
  536. fmul v0.2d, v24.2d, alphaV0_R
  537. fmls v0.2d, v25.2d, alphaV0_I
  538. fmul v1.2d, v24.2d, alphaV1_I
  539. fmla v1.2d, v25.2d, alphaV1_R
  540. st2 {v0.2d, v1.2d}, [pCRow1]
  541. add pCRow1, pCRow1, LDC
  542. fmul v4.2d, v28.2d, alphaV0_R
  543. fmls v4.2d, v29.2d, alphaV0_I
  544. fmul v5.2d, v28.2d, alphaV1_I
  545. fmla v5.2d, v29.2d, alphaV1_R
  546. st2 {v4.2d, v5.2d}, [pCRow1]
  547. add pCRow0, pCRow0, #32
  548. .endm
  549. /******************************************************************************/
  550. .macro INIT1x4
  551. fmov d16, xzr
  552. fmov d17, xzr
  553. fmov d20, d16
  554. fmov d21, d17
  555. fmov d24, d16
  556. fmov d25, d17
  557. fmov d28, d16
  558. fmov d29, d17
  559. .endm
  560. .macro KERNEL1x4_SUB
  561. ld2 {v8.2d, v9.2d}, [pB]
  562. add pB, pB, #32
  563. ld2 {v10.2d, v11.2d}, [pB]
  564. add pB, pB, #32
  565. ld2 {v0.d, v1.d}[0], [pA]
  566. add pA, pA, #16
  567. OP_rr d16, d0, v8.d[0]
  568. OP_ii d16, d1, v9.d[0]
  569. OP_ri d17, d0, v9.d[0]
  570. OP_ir d17, d1, v8.d[0]
  571. OP_rr d20, d0, v8.d[1]
  572. OP_ii d20, d1, v9.d[1]
  573. OP_ri d21, d0, v9.d[1]
  574. OP_ir d21, d1, v8.d[1]
  575. OP_rr d24, d0, v10.d[0]
  576. OP_ii d24, d1, v11.d[0]
  577. OP_ri d25, d0, v11.d[0]
  578. OP_ir d25, d1, v10.d[0]
  579. OP_rr d28, d0, v10.d[1]
  580. OP_ii d28, d1, v11.d[1]
  581. OP_ri d29, d0, v11.d[1]
  582. OP_ir d29, d1, v10.d[1]
  583. .endm
  584. .macro SAVE1x4
  585. fmov alpha0_R, alpha_save_R
  586. fmov alpha0_I, alpha_save_I
  587. fmov alpha1_R, alpha0_R
  588. fmov alpha1_I, alpha0_I
  589. mov pCRow1, pCRow0
  590. fmul d0, d16, alphaV0_R
  591. fmls d0, d17, alphaV0_I
  592. fmul d1, d16, alphaV1_I
  593. fmla d1, d17, alphaV1_R
  594. st2 {v0.d, v1.d}[0], [pCRow1]
  595. add pCRow1, pCRow1, LDC
  596. fmul d4, d20, alphaV0_R
  597. fmls d4, d21, alphaV0_I
  598. fmul d5, d20, alphaV1_I
  599. fmla d5, d21, alphaV1_R
  600. st2 {v4.d, v5.d}[0], [pCRow1]
  601. add pCRow1, pCRow1, LDC
  602. fmul d0, d24, alphaV0_R
  603. fmls d0, d25, alphaV0_I
  604. fmul d1, d24, alphaV1_I
  605. fmla d1, d25, alphaV1_R
  606. st2 {v0.d, v1.d}[0], [pCRow1]
  607. add pCRow1, pCRow1, LDC
  608. fmul d4, d28, alphaV0_R
  609. fmls d4, d29, alphaV0_I
  610. fmul d5, d28, alphaV1_I
  611. fmla d5, d29, alphaV1_R
  612. st2 {v4.d, v5.d}[0], [pCRow1]
  613. add pCRow0, pCRow0, #16
  614. .endm
  615. /******************************************************************************/
  616. .macro INIT4x2
  617. fmov d16, xzr
  618. fmov d17, xzr
  619. fmov d18, d16
  620. fmov d19, d17
  621. fmov d20, d16
  622. fmov d21, d17
  623. fmov d22, d16
  624. fmov d23, d17
  625. .endm
  626. .macro KERNEL4x2_SUB
  627. ld2 {v8.2d, v9.2d}, [pB]
  628. add pB, pB, #32
  629. ld2 {v0.2d, v1.2d}, [pA]
  630. add pA, pA, #32
  631. ld2 {v2.2d, v3.2d}, [pA]
  632. add pA, pA, #32
  633. OP_rr v16.2d, v0.2d, v8.d[0]
  634. OP_ii v16.2d, v1.2d, v9.d[0]
  635. OP_ri v17.2d, v0.2d, v9.d[0]
  636. OP_ir v17.2d, v1.2d, v8.d[0]
  637. OP_rr v18.2d, v2.2d, v8.d[0]
  638. OP_ii v18.2d, v3.2d, v9.d[0]
  639. OP_ri v19.2d, v2.2d, v9.d[0]
  640. OP_ir v19.2d, v3.2d, v8.d[0]
  641. OP_rr v20.2d, v0.2d, v8.d[1]
  642. OP_ii v20.2d, v1.2d, v9.d[1]
  643. OP_ri v21.2d, v0.2d, v9.d[1]
  644. OP_ir v21.2d, v1.2d, v8.d[1]
  645. OP_rr v22.2d, v2.2d, v8.d[1]
  646. OP_ii v22.2d, v3.2d, v9.d[1]
  647. OP_ri v23.2d, v2.2d, v9.d[1]
  648. OP_ir v23.2d, v3.2d, v8.d[1]
  649. .endm
  650. .macro SAVE4x2
  651. fmov alpha0_R, alpha_save_R
  652. fmov alpha0_I, alpha_save_I
  653. fmov alpha1_R, alpha0_R
  654. fmov alpha1_I, alpha0_I
  655. mov pCRow1, pCRow0
  656. fmul v0.2d, v16.2d, alphaV0_R
  657. fmls v0.2d, v17.2d, alphaV0_I
  658. fmul v1.2d, v16.2d, alphaV1_I
  659. fmla v1.2d, v17.2d, alphaV1_R
  660. st2 {v0.2d, v1.2d}, [pCRow1]
  661. add pCRow2, pCRow1, #32
  662. fmul v2.2d, v18.2d, alphaV0_R
  663. fmls v2.2d, v19.2d, alphaV0_I
  664. fmul v3.2d, v18.2d, alphaV1_I
  665. fmla v3.2d, v19.2d, alphaV1_R
  666. st2 {v2.2d, v3.2d}, [pCRow2]
  667. add pCRow1, pCRow1, LDC
  668. fmul v4.2d, v20.2d, alphaV0_R
  669. fmls v4.2d, v21.2d, alphaV0_I
  670. fmul v5.2d, v20.2d, alphaV1_I
  671. fmla v5.2d, v21.2d, alphaV1_R
  672. st2 {v4.2d, v5.2d}, [pCRow1]
  673. add pCRow2, pCRow1, #32
  674. fmul v6.2d, v22.2d, alphaV0_R
  675. fmls v6.2d, v23.2d, alphaV0_I
  676. fmul v7.2d, v22.2d, alphaV1_I
  677. fmla v7.2d, v23.2d, alphaV1_R
  678. st2 {v6.2d, v7.2d}, [pCRow2]
  679. add pCRow0, pCRow0, #64
  680. .endm
  681. /******************************************************************************/
  682. .macro INIT2x2
  683. fmov d16, xzr
  684. fmov d17, xzr
  685. fmov d20, d16
  686. fmov d21, d17
  687. .endm
  688. .macro KERNEL2x2_SUB
  689. ld2 {v8.2d, v9.2d}, [pB]
  690. add pB, pB, #32
  691. ld2 {v0.2d, v1.2d}, [pA]
  692. add pA, pA, #32
  693. OP_rr v16.2d, v0.2d, v8.d[0]
  694. OP_ii v16.2d, v1.2d, v9.d[0]
  695. OP_ri v17.2d, v0.2d, v9.d[0]
  696. OP_ir v17.2d, v1.2d, v8.d[0]
  697. OP_rr v20.2d, v0.2d, v8.d[1]
  698. OP_ii v20.2d, v1.2d, v9.d[1]
  699. OP_ri v21.2d, v0.2d, v9.d[1]
  700. OP_ir v21.2d, v1.2d, v8.d[1]
  701. .endm
  702. .macro SAVE2x2
  703. fmov alpha0_R, alpha_save_R
  704. fmov alpha0_I, alpha_save_I
  705. fmov alpha1_R, alpha0_R
  706. fmov alpha1_I, alpha0_I
  707. mov pCRow1, pCRow0
  708. fmul v0.2d, v16.2d, alphaV0_R
  709. fmls v0.2d, v17.2d, alphaV0_I
  710. fmul v1.2d, v16.2d, alphaV1_I
  711. fmla v1.2d, v17.2d, alphaV1_R
  712. st2 {v0.2d, v1.2d}, [pCRow1]
  713. add pCRow1, pCRow1, LDC
  714. fmul v4.2d, v20.2d, alphaV0_R
  715. fmls v4.2d, v21.2d, alphaV0_I
  716. fmul v5.2d, v20.2d, alphaV1_I
  717. fmla v5.2d, v21.2d, alphaV1_R
  718. st2 {v4.2d, v5.2d}, [pCRow1]
  719. add pCRow0, pCRow0, #32
  720. .endm
  721. /******************************************************************************/
  722. .macro INIT1x2
  723. fmov d16, xzr
  724. fmov d17, xzr
  725. fmov d20, xzr
  726. fmov d21, xzr
  727. .endm
  728. .macro KERNEL1x2_SUB
  729. ld2 {v8.2d, v9.2d}, [pB]
  730. add pB, pB, #32
  731. ld2 {v0.d, v1.d}[0], [pA]
  732. add pA, pA, #16
  733. OP_rr d16, d0, v8.d[0]
  734. OP_ii d16, d1, v9.d[0]
  735. OP_ri d17, d0, v9.d[0]
  736. OP_ir d17, d1, v8.d[0]
  737. OP_rr d20, d0, v8.d[1]
  738. OP_ii d20, d1, v9.d[1]
  739. OP_ri d21, d0, v9.d[1]
  740. OP_ir d21, d1, v8.d[1]
  741. .endm
// Scale the 1x2 accumulators by complex alpha and store one element to each of
// two consecutive columns of C (TRMM: C is overwritten, not accumulated).
.macro SAVE1x2
	fmov	alpha0_R, alpha_save_R
	fmov	alpha0_I, alpha_save_I
	fmov	alpha1_R, alpha0_R
	fmov	alpha1_I, alpha0_I

	mov	pCRow1, pCRow0

	// Column 0: complex multiply acc * alpha, then interleaved store of {re, im}
	fmul	d0, d16, alphaV0_R
	fmls	d0, d17, alphaV0_I
	fmul	d1, d16, alphaV1_I
	fmla	d1, d17, alphaV1_R
	st2	{v0.d, v1.d}[0], [pCRow1]

	add	pCRow1, pCRow1, LDC		// next column of C

	// Column 1
	fmul	d4, d20, alphaV0_R
	fmls	d4, d21, alphaV0_I
	fmul	d5, d20, alphaV1_I
	fmla	d5, d21, alphaV1_R
	st2	{v4.d, v5.d}[0], [pCRow1]

	add	pCRow0, pCRow0, #16		// advance C by 1 complex double
.endm
  761. /******************************************************************************/
  762. .macro INIT4x1
  763. fmov d16, xzr
  764. fmov d17, d16
  765. fmov d18, d16
  766. fmov d19, d17
  767. .endm
// One K-iteration of the 4x1 complex-double micro-kernel: four elements of A
// against one element of B.
// OP_* macros are conjugation-dependent FMA forms defined earlier in the file.
.macro KERNEL4x1_SUB
	ld2	{v8.d, v9.d}[0], [pB]		// one B value: v8.d[0] = re, v9.d[0] = im
	add	pB, pB, #16
	ld2	{v0.2d, v1.2d}, [pA]		// A rows 0-1, de-interleaved re/im
	add	pA, pA, #32
	ld2	{v2.2d, v3.2d}, [pA]		// A rows 2-3
	add	pA, pA, #32

	// Rows 0-1
	OP_rr	v16.2d, v0.2d, v8.d[0]
	OP_ii	v16.2d, v1.2d, v9.d[0]
	OP_ri	v17.2d, v0.2d, v9.d[0]
	OP_ir	v17.2d, v1.2d, v8.d[0]

	// Rows 2-3
	OP_rr	v18.2d, v2.2d, v8.d[0]
	OP_ii	v18.2d, v3.2d, v9.d[0]
	OP_ri	v19.2d, v2.2d, v9.d[0]
	OP_ir	v19.2d, v3.2d, v8.d[0]
.endm
// Scale the 4x1 accumulators by complex alpha and store four consecutive
// elements into one column of C (rows 0-1 at pCRow1, rows 2-3 at pCRow1+32).
.macro SAVE4x1
	fmov	alpha0_R, alpha_save_R
	fmov	alpha0_I, alpha_save_I
	fmov	alpha1_R, alpha0_R
	fmov	alpha1_I, alpha0_I

	mov	pCRow1, pCRow0

	// Rows 0-1: complex multiply acc * alpha; st2 re-interleaves re/im pairs
	fmul	v0.2d, v16.2d, alphaV0_R
	fmls	v0.2d, v17.2d, alphaV0_I
	fmul	v1.2d, v16.2d, alphaV1_I
	fmla	v1.2d, v17.2d, alphaV1_R
	st2	{v0.2d, v1.2d}, [pCRow1]

	add	pCRow2, pCRow1, #32		// rows 2-3 live 2 complex doubles further down

	// Rows 2-3
	fmul	v2.2d, v18.2d, alphaV0_R
	fmls	v2.2d, v19.2d, alphaV0_I
	fmul	v3.2d, v18.2d, alphaV1_I
	fmla	v3.2d, v19.2d, alphaV1_R
	st2	{v2.2d, v3.2d}, [pCRow2]

	add	pCRow0, pCRow0, #64		// advance C by 4 complex doubles
.endm
  803. /******************************************************************************/
  804. .macro INIT2x1
  805. fmov d16, xzr
  806. fmov d17, xzr
  807. .endm
// One K-iteration of the 2x1 complex-double micro-kernel: two elements of A
// against one element of B.
// OP_* macros are conjugation-dependent FMA forms defined earlier in the file.
.macro KERNEL2x1_SUB
	ld2	{v8.d, v9.d}[0], [pB]		// one B value: re in v8.d[0], im in v9.d[0]
	add	pB, pB, #16
	ld2	{v0.2d, v1.2d}, [pA]		// two A values, de-interleaved re/im
	add	pA, pA, #32

	OP_rr	v16.2d, v0.2d, v8.d[0]
	OP_ii	v16.2d, v1.2d, v9.d[0]
	OP_ri	v17.2d, v0.2d, v9.d[0]
	OP_ir	v17.2d, v1.2d, v8.d[0]
.endm
// Scale the 2x1 accumulators by complex alpha and store two consecutive
// elements into one column of C (TRMM: C is overwritten, not accumulated).
.macro SAVE2x1
	fmov	alpha0_R, alpha_save_R
	fmov	alpha0_I, alpha_save_I
	fmov	alpha1_R, alpha0_R
	fmov	alpha1_I, alpha0_I

	mov	pCRow1, pCRow0

	// Complex multiply acc * alpha; st2 interleaves {re, im} pairs on store
	fmul	v0.2d, v16.2d, alphaV0_R
	fmls	v0.2d, v17.2d, alphaV0_I
	fmul	v1.2d, v16.2d, alphaV1_I
	fmla	v1.2d, v17.2d, alphaV1_R
	st2	{v0.2d, v1.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32		// advance C by 2 complex doubles
.endm
  831. /******************************************************************************/
  832. .macro INIT1x1
  833. fmov d16, xzr
  834. fmov d17, xzr
  835. .endm
// One K-iteration of the 1x1 complex-double micro-kernel: a single scalar
// complex multiply-accumulate.
// OP_* macros are conjugation-dependent FMA forms defined earlier in the file.
.macro KERNEL1x1_SUB
	ld2	{v8.d, v9.d}[0], [pB]		// B: re in v8.d[0], im in v9.d[0]
	add	pB, pB, #16
	ld2	{v0.d, v1.d}[0], [pA]		// A: re in d0, im in d1
	add	pA, pA, #16

	OP_rr	d16, d0, v8.d[0]
	OP_ii	d16, d1, v9.d[0]
	OP_ri	d17, d0, v9.d[0]
	OP_ir	d17, d1, v8.d[0]
.endm
// Scale the 1x1 accumulator by complex alpha and store one element of C.
.macro SAVE1x1
	fmov	alpha0_R, alpha_save_R
	fmov	alpha0_I, alpha_save_I
	fmov	alpha1_R, alpha0_R
	fmov	alpha1_I, alpha0_I

	mov	pCRow1, pCRow0

	// out.re = acc.re*alpha.re - acc.im*alpha.im
	// out.im = acc.re*alpha.im + acc.im*alpha.re
	fmul	d0, d16, alphaV0_R
	fmls	d0, d17, alphaV0_I
	fmul	d1, d16, alphaV1_I
	fmla	d1, d17, alphaV1_R
	st2	{v0.d, v1.d}[0], [pCRow1]

	add	pCRow0, pCRow0, #16		// advance C by 1 complex double
.endm
  859. /*******************************************************************************
  860. * End of macro definitions
  861. *******************************************************************************/
	PROLOGUE

	.align 5
	// Reserve 11*16 bytes and spill registers that must survive this function.
	// NOTE(review): d16/d17 are not callee-saved under AAPCS64 (only d8-d15 are),
	// so that pair is saved unnecessarily — harmless. x18 is platform-reserved on
	// some OSes; it is saved/restored here but confirm it is never written.
	add	sp, sp, #-(11 * 16)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]

	// Stash complex alpha (arrives in d0/d1) in dedicated registers so the
	// SAVE macros can reload it after the kernels clobber d0/d1.
	fmov	alpha_save_R, d0
	fmov	alpha_save_I, d1

	// Convert LDC from elements to bytes: one complex double = 2*8 bytes.
	lsl	LDC, LDC, #4			// ldc = ldc * 2 * 8

#if !defined(LEFT)
	neg	tempOffset, offset		// right-side TRMM starts from -offset
#endif

	mov	pB, origPB

	// Outer loop over N in blocks of 4 columns.
	mov	counterJ, origN
	asr	counterJ, counterJ, #2		// J = J / 4
	cmp	counterJ, #0
	ble	ztrmm_kernel_L2_BEGIN		// fewer than 4 columns: go to N%4 handling
// ---- N-block of 4 columns: loop over M in blocks of 4 rows -----------------
ztrmm_kernel_L4_BEGIN:
	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pC, LDC, lsl #2		// pC += 4 columns

#if defined(LEFT)
	mov	tempOffset, offset		// left-side TRMM: restart offset per column block
#endif

	mov	pA, origPA			// pA = start of A array

ztrmm_kernel_L4_M4_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI, #0
	ble	ztrmm_kernel_L4_M2_BEGIN

ztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip the triangle part: both panels advance tempOffset * 64 bytes
	// (4 complex doubles per K step for a 4-wide panel).
	mov	pB, origPB
	lsl	temp, tempOffset, #6
	add	pB, pB, temp
	add	pA, pA, temp
#endif

	// tempK = number of K iterations for this tile (TRMM-dependent).
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #4
#endif

	// Software-pipelined 4x4 kernel: I = prime, M1/M2 = steady state, E = drain.
	asr	counterL , tempK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
	blt	ztrmm_kernel_L4_M4_32

	KERNEL4x4_I				// do one in the K
	KERNEL4x4_M2				// do another in the K

	subs	counterL, counterL, #2
	ble	ztrmm_kernel_L4_M4_22a
	.align 5

ztrmm_kernel_L4_M4_22:
	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L4_M4_22

ztrmm_kernel_L4_M4_22a:
	KERNEL4x4_M1
	KERNEL4x4_E				// drain the pipeline

	b	ztrmm_kernel_L4_M4_44

ztrmm_kernel_L4_M4_32:
	// Fewer than 4 K-iterations: handle 2 (I+E) or 0, then the K%2 tail.
	tst	counterL, #1
	ble	ztrmm_kernel_L4_M4_40

	KERNEL4x4_I
	KERNEL4x4_E

	b	ztrmm_kernel_L4_M4_44

ztrmm_kernel_L4_M4_40:
	INIT4x4

ztrmm_kernel_L4_M4_44:
	ands	counterL , tempK, #1		// K % 2 tail
	ble	ztrmm_kernel_L4_M4_100

ztrmm_kernel_L4_M4_46:
	KERNEL4x4_SUB

ztrmm_kernel_L4_M4_100:
	SAVE4x4

	// TRMM bookkeeping: advance pA/pB past the K iterations not consumed above.
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #4
#endif
	lsl	temp, tempK, #6			// 64 bytes per remaining K step
	add	pA, pA, temp
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4	// next 4 rows start 4 deeper in the triangle
#endif

ztrmm_kernel_L4_M4_END:
	subs	counterI, counterI, #1
	bne	ztrmm_kernel_L4_M4_20
// ---- N-block of 4: leftover M rows, 2-row tile ------------------------------
ztrmm_kernel_L4_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3			// no M remainder at all?
	ble	ztrmm_kernel_L4_END

	tst	counterI, #2			// counterI = counterI / 2
	ble	ztrmm_kernel_L4_M1_BEGIN

ztrmm_kernel_L4_M2_20:
	INIT2x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: pA advances 32 bytes/K (2-wide panel), pB 64 bytes/K (4-wide).
	mov	pB, origPB
	lsl	temp, tempOffset, #5
	add	pA, pA, temp
	lsl	temp, tempOffset, #6
	add	pB, pB, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #4
#endif

	// Simple 8x-unrolled K loop (no software pipelining for the edge tiles).
	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	ztrmm_kernel_L4_M2_40

ztrmm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L4_M2_22

ztrmm_kernel_L4_M2_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L4_M2_100

ztrmm_kernel_L4_M2_42:
	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L4_M2_42

ztrmm_kernel_L4_M2_100:
	SAVE2x4

	// TRMM bookkeeping for the 2x4 tile (panel strides 32 / 64 bytes per K).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #4
#endif
	lsl	temp, tempK, #5
	add	pA, pA, temp
	lsl	temp, tempK, #6
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

ztrmm_kernel_L4_M2_END:
// ---- N-block of 4: final single M row ---------------------------------------
ztrmm_kernel_L4_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
	ble	ztrmm_kernel_L4_END

ztrmm_kernel_L4_M1_20:
	INIT1x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: pB advances 64 bytes/K (4-wide), pA 16 bytes/K (1-wide).
	mov	pB, origPB
	lsl	temp, tempOffset, #6
	add	pB, pB, temp
	lsl	temp, tempOffset, #4
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #4
#endif

	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	ztrmm_kernel_L4_M1_40

ztrmm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L4_M1_22

ztrmm_kernel_L4_M1_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L4_M1_100

ztrmm_kernel_L4_M1_42:
	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L4_M1_42

ztrmm_kernel_L4_M1_100:
	SAVE1x4

	// TRMM bookkeeping for the 1x4 tile (panel strides 16 / 64 bytes per K).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #4
#endif
	lsl	temp, tempK, #4
	add	pA, pA, temp
	lsl	temp, tempK, #6
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif

ztrmm_kernel_L4_END:
	// Advance B to the next 4-column panel and step the right-side offset.
	lsl	temp, origK, #6
	add	origPB, origPB, temp		// B = B + K * 4 * 8 * 2
#if !defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

	subs	counterJ, counterJ , #1		// j--
	bgt	ztrmm_kernel_L4_BEGIN
  1094. /******************************************************************************/
  1095. ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
  1096. mov counterJ , origN
  1097. tst counterJ , #3
  1098. ble ztrmm_kernel_L999 // error, N was less than 4?
  1099. tst counterJ , #2
  1100. ble ztrmm_kernel_L1_BEGIN
  1101. mov pCRow0, pC // pCRow0 = pC
  1102. add pC,pC,LDC, lsl #1
  1103. #if defined(LEFT)
  1104. mov tempOffset, offset
  1105. #endif
  1106. mov pA, origPA // pA = A
  1107. ztrmm_kernel_L2_M4_BEGIN:
  1108. mov counterI, origM
  1109. asr counterI, counterI, #2 // counterI = counterI / 4
  1110. cmp counterI,#0
  1111. ble ztrmm_kernel_L2_M2_BEGIN
  1112. ztrmm_kernel_L2_M4_20:
  1113. INIT4x2
  1114. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1115. mov pB, origPB
  1116. #else
  1117. mov pB, origPB
  1118. lsl temp, tempOffset, #5
  1119. add pB, pB, temp
  1120. lsl temp, tempOffset, #6
  1121. add pA, pA, temp
  1122. #endif
  1123. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1124. sub tempK, origK, tempOffset
  1125. #elif defined(LEFT)
  1126. add tempK, tempOffset, #4
  1127. #else
  1128. add tempK, tempOffset, #2
  1129. #endif
  1130. asr counterL , tempK, #3 // counterL = counterL / 8
  1131. cmp counterL,#0
  1132. ble ztrmm_kernel_L2_M4_40
  1133. .align 5
  1134. ztrmm_kernel_L2_M4_22:
  1135. KERNEL4x2_SUB
  1136. KERNEL4x2_SUB
  1137. KERNEL4x2_SUB
  1138. KERNEL4x2_SUB
  1139. KERNEL4x2_SUB
  1140. KERNEL4x2_SUB
  1141. KERNEL4x2_SUB
  1142. KERNEL4x2_SUB
  1143. subs counterL, counterL, #1
  1144. bgt ztrmm_kernel_L2_M4_22
  1145. ztrmm_kernel_L2_M4_40:
  1146. ands counterL , tempK, #7 // counterL = counterL % 8
  1147. ble ztrmm_kernel_L2_M4_100
  1148. ztrmm_kernel_L2_M4_42:
  1149. KERNEL4x2_SUB
  1150. subs counterL, counterL, #1
  1151. bgt ztrmm_kernel_L2_M4_42
  1152. ztrmm_kernel_L2_M4_100:
  1153. SAVE4x2
  1154. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1155. sub tempK, origK, tempOffset
  1156. #if defined(LEFT)
  1157. sub tempK, tempK, #4
  1158. #else
  1159. sub tempK, tempK, #2
  1160. #endif
  1161. lsl temp, tempK, #6
  1162. add pA, pA, temp
  1163. lsl temp, tempK, #5
  1164. add pB, pB, temp
  1165. #endif
  1166. #if defined(LEFT)
  1167. add tempOffset, tempOffset, #4
  1168. #endif
  1169. ztrmm_kernel_L2_M4_END:
  1170. subs counterI, counterI, #1
  1171. bgt ztrmm_kernel_L2_M4_20
// ---- N-remainder of 2: 2-row tile ------------------------------------------
ztrmm_kernel_L2_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
	ble	ztrmm_kernel_L2_END

	tst	counterI, #2			// counterI = counterI / 2
	ble	ztrmm_kernel_L2_M1_BEGIN

ztrmm_kernel_L2_M2_20:
	INIT2x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: both panels are 2 wide, 32 bytes per K step.
	mov	pB, origPB
	lsl	temp, tempOffset, #5
	add	pB, pB, temp
	lsl	temp, tempOffset, #5
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #2
#endif

	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL,#0
	ble	ztrmm_kernel_L2_M2_40

ztrmm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L2_M2_22

ztrmm_kernel_L2_M2_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L2_M2_100

ztrmm_kernel_L2_M2_42:
	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L2_M2_42

ztrmm_kernel_L2_M2_100:
	SAVE2x2

	// TRMM bookkeeping for the 2x2 tile (both panel strides 32 bytes per K).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #2
#endif
	lsl	temp, tempK, #5
	add	pA, pA, temp
	lsl	temp, tempK, #5
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

ztrmm_kernel_L2_M2_END:
// ---- N-remainder of 2: final single M row -----------------------------------
ztrmm_kernel_L2_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
	ble	ztrmm_kernel_L2_END

ztrmm_kernel_L2_M1_20:
	INIT1x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: pB advances 32 bytes/K (2-wide), pA 16 bytes/K (1-wide).
	mov	pB, origPB
	lsl	temp, tempOffset, #5
	add	pB, pB, temp
	lsl	temp, tempOffset, #4
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #2
#endif

	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL, #0
	ble	ztrmm_kernel_L2_M1_40

ztrmm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L2_M1_22

ztrmm_kernel_L2_M1_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L2_M1_100

ztrmm_kernel_L2_M1_42:
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L2_M1_42

ztrmm_kernel_L2_M1_100:
	SAVE1x2

	// TRMM bookkeeping for the 1x2 tile (panel strides 16 / 32 bytes per K).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #2
#endif
	lsl	temp, tempK, #4
	add	pA, pA, temp
	lsl	temp, tempK, #5
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif

ztrmm_kernel_L2_END:
	// Step the right-side offset and advance B past the 2-column panel.
#if !defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif
	lsl	temp, origK, #5
	add	origPB, origPB, temp		// B = B + K * 2 * 8 * 2
  1300. /******************************************************************************/
// ---- N-remainder of 1 column ------------------------------------------------
ztrmm_kernel_L1_BEGIN:
	mov	counterJ , origN
	tst	counterJ , #1
	ble	ztrmm_kernel_L999		// done

	mov	pCRow0, pC			// pCRow0 = C
	add	pC , pC , LDC			// Update pC to point to next

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// pA = A

ztrmm_kernel_L1_M4_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI, #0
	ble	ztrmm_kernel_L1_M2_BEGIN

ztrmm_kernel_L1_M4_20:
	INIT4x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: pB advances 16 bytes/K (1-wide), pA 64 bytes/K (4-wide).
	mov	pB, origPB
	lsl	temp, tempOffset, #4
	add	pB, pB, temp
	lsl	temp, tempOffset, #6
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #1
#endif

	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	ztrmm_kernel_L1_M4_40
	.align 5

ztrmm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L1_M4_22

ztrmm_kernel_L1_M4_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L1_M4_100

ztrmm_kernel_L1_M4_42:
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L1_M4_42

ztrmm_kernel_L1_M4_100:
	SAVE4x1

	// TRMM bookkeeping for the 4x1 tile (panel strides 64 / 16 bytes per K).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #1
#endif
	lsl	temp, tempK, #6
	add	pA, pA, temp
	lsl	temp, tempK, #4
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

ztrmm_kernel_L1_M4_END:
	subs	counterI, counterI, #1
	bgt	ztrmm_kernel_L1_M4_20
// ---- N-remainder of 1: 2-row tile ------------------------------------------
ztrmm_kernel_L1_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
	ble	ztrmm_kernel_L1_END

	tst	counterI, #2			// counterI = counterI / 2
	ble	ztrmm_kernel_L1_M1_BEGIN

ztrmm_kernel_L1_M2_20:
	INIT2x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: pB advances 16 bytes/K (1-wide), pA 32 bytes/K (2-wide).
	mov	pB, origPB
	lsl	temp, tempOffset, #4
	add	pB, pB, temp
	lsl	temp, tempOffset, #5
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #1
#endif

	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	ztrmm_kernel_L1_M2_40

ztrmm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L1_M2_22

ztrmm_kernel_L1_M2_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L1_M2_100

ztrmm_kernel_L1_M2_42:
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L1_M2_42

ztrmm_kernel_L1_M2_100:
	SAVE2x1

	// TRMM bookkeeping for the 2x1 tile (panel strides 32 / 16 bytes per K).
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #1
#endif
	lsl	temp, tempK, #5
	add	pA, pA, temp
	lsl	temp, tempK, #4
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

ztrmm_kernel_L1_M2_END:
// ---- N-remainder of 1: final 1x1 tile ---------------------------------------
// This is the last tile of the last column, so no TRMM pointer/offset
// bookkeeping follows SAVE1x1 (nothing else consumes pA/pB/tempOffset).
ztrmm_kernel_L1_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
	ble	ztrmm_kernel_L1_END

ztrmm_kernel_L1_M1_20:
	INIT1x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	// Skip triangle: both panels are 1 wide, 16 bytes per K step.
	mov	pB, origPB
	lsl	temp, tempOffset, #4
	add	pB, pB, temp
	lsl	temp, tempOffset, #4
	add	pA, pA, temp
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #1
#endif

	asr	counterL , tempK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	ztrmm_kernel_L1_M1_40

ztrmm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L1_M1_22

ztrmm_kernel_L1_M1_40:
	ands	counterL , tempK, #7		// counterL = counterL % 8
	ble	ztrmm_kernel_L1_M1_100

ztrmm_kernel_L1_M1_42:
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	ztrmm_kernel_L1_M1_42

ztrmm_kernel_L1_M1_100:
	SAVE1x1
ztrmm_kernel_L1_END:

ztrmm_kernel_L999:
	// Restore saved registers (mirror of the prologue) and return 0.
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11*16)		// pop the 176-byte spill area
	ret

	EPILOGUE