You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_sve_v1x4.S 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc,BLASLONG offset) */
  /* Incoming arguments, in the register order given by the prototype comment above. */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  /* Loop counters and working pointers. */
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  /* lanes is written by cntp and holds the number of active lanes of p1. */
  48. #define lanes x17
  /* alphaR/alphaI carry the bits of s0/s1 on their way to alphaz_R/alphaz_I. */
  49. #define alphaR w19
  50. #define alphaI w20
  51. #define temp x21
  52. #define tempOffset x22
  53. #define tempK x23
  /* alpha broadcast across all .s lanes; read by the SAVE macros. */
  54. #define alphaz_R z6.s
  55. #define alphaz_I z7.s
  /* alpha0_R/alpha0_I are not referenced by the code visible in this file. */
  56. #define alpha0_R s6
  57. #define alpha0_I s7
  /* Byte offsets added to pA / pB / pCRowN in the prfm prefetch hints. */
  58. #define A_PRE_SIZE 2560
  59. #define B_PRE_SIZE 448
  60. #define C_PRE_SIZE 128
  /*
   * OP_xy is the multiply-accumulate used for (x part of A) * (y part of B),
   * where r = real and i = imaginary. OP_rr and OP_ii update the real
   * accumulator, OP_ri and OP_ir the imaginary accumulator. The build-time
   * variant macro (NN, NR, RN, RR, ...) decides which of them subtract (fmls).
   */
  61. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  62. #define OP_rr fmla
  63. #define OP_ii fmls
  64. #define OP_ri fmla
  65. #define OP_ir fmla
  66. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  67. #define OP_rr fmla
  68. #define OP_ii fmla
  69. #define OP_ri fmls
  70. #define OP_ir fmla
  71. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  72. #define OP_rr fmla
  73. #define OP_ii fmla
  74. #define OP_ri fmla
  75. #define OP_ir fmls
  76. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  77. #define OP_rr fmla
  78. #define OP_ii fmls
  79. #define OP_ri fmls
  80. #define OP_ir fmls
  81. #endif
  82. // 00 origM
  83. // 01 origN
  84. // 02 origK
  85. // 03 origPA
  86. // 04 origPB
  87. // 05 pC
  88. // 06 origLDC -> LDC
  89. // 07 offset
  90. // 08 counterL
  91. // 09 counterI
  92. // 10 counterJ
  93. // 11 pB
  94. // 12 pCRow0
  95. // 13 pCRow1
  96. // 14 pCRow2
  97. // 15 pCRow3
  98. // 16 pA
  99. // 17 lanes
  100. // 18 must save (not otherwise used)
  101. // 19 must save alphaR
  102. // 20 must save alphaI
  103. // 21 must save temp
  104. // 22 must save tempOffset
  105. // 23 must save tempK
  106. // 24 must save
  107. // 25 must save
  108. // 26 must save
  109. // 27 must save
  110. // 28 must save
  111. // 29 frame
  112. // 30 link
  113. // 31 sp
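  114. //v00 pA real parts (k step held in z0/z1)
  115. //v01 pA imaginary parts (k step held in z0/z1)
  116. //v02 pA real parts (k step held in z2/z3, pipelined v1x4 kernels only)
  117. //v03 pA imaginary parts (k step held in z2/z3, pipelined v1x4 kernels only)
  118. //v04 not used
  119. //v05 not used
  120. //v06 alpha real part, broadcast (alphaz_R)
  121. //v07 alpha imaginary part, broadcast (alphaz_I)
  122. //v08 must save pB value 0, real part (broadcast)
  123. //v09 must save pB value 0, imaginary part (broadcast)
  124. //v10 must save pB value 1, real part (broadcast)
  125. //v11 must save pB value 1, imaginary part (broadcast)
  126. //v12 must save pB value 2, real part (broadcast)
  127. //v13 must save pB value 2, imaginary part (broadcast)
  128. //v14 must save pB value 3, real part (broadcast)
  129. //v15 must save pB value 3, imaginary part (broadcast)
  130. //v16 accumulator, real part, for pCRow0
  131. //v17 accumulator, imaginary part, for pCRow0
  132. //v18 accumulator, real part, for pCRow1
  133. //v19 accumulator, imaginary part, for pCRow1
  134. //v20 accumulator, real part, for pCRow2
  135. //v21 accumulator, imaginary part, for pCRow2
  136. //v22 accumulator, real part, for pCRow3
  137. //v23 accumulator, imaginary part, for pCRow3
  138. //v24 alpha-scaled result, real part, stored to pCRow0
  139. //v25 alpha-scaled result, imaginary part, stored to pCRow0
  140. //v26 alpha-scaled result, real part, stored to pCRow1
  141. //v27 alpha-scaled result, imaginary part, stored to pCRow1
  142. //v28 alpha-scaled result, real part, stored to pCRow2
  143. //v29 alpha-scaled result, imaginary part, stored to pCRow2
  144. //v30 alpha-scaled result, real part, stored to pCRow3
  145. //v31 alpha-scaled result, imaginary part, stored to pCRow3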
  146. /*******************************************************************************
  147. * Macro definitions
  148. *******************************************************************************/
  // INITv1x4: clear the eight accumulators z16-z23 (real/imaginary pairs for
  // pCRow0..pCRow3). "mov zN.s, #0" is the immediate-move spelling of the
  // original "dup zN.s, #0".
  149. .macro INITv1x4
  150. mov z16.s, #0 // pCRow0 real
  151. mov z17.s, #0 // pCRow0 imaginary
  152. mov z18.s, #0 // pCRow1 real
  153. mov z19.s, #0 // pCRow1 imaginary
  154. mov z20.s, #0 // pCRow2 real
  155. mov z21.s, #0 // pCRow2 imaginary
  156. mov z22.s, #0 // pCRow3 real
  157. mov z23.s, #0 // pCRow3 imaginary
  158. .endm
  // KERNELv1x4_I: opening step of the pipelined 1x4 kernel.
  // Loads two consecutive A vectors (z0/z1, then z2/z3) and the first eight B
  // scalars (z8-z15), accumulates A(z0/z1) * B into z16-z23, and reloads
  // z8-z15 with the next eight B scalars as each register is finished with.
  // On exit z2/z3 and z8-z15 hold the operands consumed by KERNELv1x4_M2 or
  // KERNELv1x4_E. pA advances by 2 * lanes * 8 bytes, pB by 64 bytes.
  // The inline #if blocks choose fmls for exactly the variants where OP_ri is
  // defined as fmls, and fmla otherwise.
  // NOTE(review): the "#eor ..." lines look like disabled leftovers (leading
  // '#'); confirm the toolchain ignores them.
  159. .macro KERNELv1x4_I
  160. ld2w {z0.s, z1.s}, p1/z, [pA]
  161. add pA, pA, lanes, lsl #3 // pA += lanes*2*4
  162. ld2w {z2.s, z3.s}, p1/z, [pA] // next one
  163. add pA, pA, lanes, lsl #3 // pA += lanes*2*4
  164. ld1rw z8.s, p0/z, [pB]
  165. ld1rw z9.s, p0/z, [pB, 4]
  166. ld1rw z10.s, p0/z, [pB, 8]
  167. ld1rw z11.s, p0/z, [pB, 12]
  168. ld1rw z12.s, p0/z, [pB, 16]
  169. ld1rw z13.s, p0/z, [pB, 20]
  170. ld1rw z14.s, p0/z, [pB, 24]
  171. ld1rw z15.s, p0/z, [pB, 28]
  172. add pB, pB, 32
  173. fmla z16.s, p1/m, z0.s, z8.s
  174. OP_ir z17.s, p1/m, z1.s, z8.s
  175. ld1rw z8.s, p0/z, [pB] // z8 is done for this step: fetch the next step's value
  176. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  177. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  178. #eor z17.16b, z17.16b, z17.16b
  179. fmls z17.s, p1/m, z0.s, z9.s
  180. #else
  181. fmla z17.s, p1/m, z0.s, z9.s
  182. #endif
  183. OP_ii z16.s, p1/m, z1.s, z9.s
  184. ld1rw z9.s, p0/z, [pB, 4]
  185. fmla z18.s, p1/m, z0.s, z10.s
  186. OP_ir z19.s, p1/m, z1.s, z10.s
  187. ld1rw z10.s, p0/z, [pB, 8]
  188. OP_ii z18.s, p1/m, z1.s, z11.s
  189. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  190. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  191. #eor z19.16b, z21.16b, z21.16b
  192. fmls z19.s, p1/m, z0.s, z11.s
  193. #else
  194. fmla z19.s, p1/m, z0.s, z11.s
  195. #endif
  196. ld1rw z11.s, p0/z, [pB, 12]
  197. fmla z20.s, p1/m, z0.s, z12.s
  198. OP_ir z21.s, p1/m, z1.s, z12.s
  199. ld1rw z12.s, p0/z, [pB, 16]
  200. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  201. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  202. #eor z21.16b, z23.16b, z23.16b
  203. fmls z21.s, p1/m, z0.s, z13.s
  204. #else
  205. fmla z21.s, p1/m, z0.s, z13.s
  206. #endif
  207. OP_ii z20.s, p1/m, z1.s, z13.s
  208. ld1rw z13.s, p0/z, [pB, 20]
  209. fmla z22.s, p1/m, z0.s, z14.s
  210. OP_ir z23.s, p1/m, z1.s, z14.s
  211. ld1rw z14.s, p0/z, [pB, 24]
  212. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  213. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  214. #eor z23.16b, z19.16b, z19.16b
  215. fmls z23.s, p1/m, z0.s, z15.s
  216. #else
  217. fmla z23.s, p1/m, z0.s, z15.s
  218. #endif
  219. OP_ii z22.s, p1/m, z1.s, z15.s
  220. ld1rw z15.s, p0/z, [pB, 28]
  221. add pB, pB, 32
  222. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  223. .endm
  // KERNELv1x4_M1: middle pipeline step that consumes A in z0/z1.
  // Accumulates A(z0/z1) * B(z8-z15) into z16-z23 while loading the next A
  // vector into z2/z3 and reloading each B register right after its last use.
  // Advances pA by lanes * 8 bytes and pB by 32 bytes; meant to be followed by
  // KERNELv1x4_M2 or KERNELv1x4_E, which read z2/z3.
  224. .macro KERNELv1x4_M1
  225. ld2w {z2.s, z3.s}, p1/z, [pA]
  226. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4
  227. OP_rr z16.s, p1/m, z0.s, z8.s
  228. OP_ir z17.s, p1/m, z1.s, z8.s
  229. ld1rw z8.s, p0/z, [pB]
  230. OP_ii z16.s, p1/m, z1.s, z9.s
  231. OP_ri z17.s, p1/m, z0.s, z9.s
  232. ld1rw z9.s, p0/z, [pB, 4]
  233. OP_rr z18.s, p1/m, z0.s, z10.s
  234. OP_ir z19.s, p1/m, z1.s, z10.s
  235. ld1rw z10.s, p0/z, [pB, 8]
  236. OP_ii z18.s, p1/m, z1.s, z11.s
  237. OP_ri z19.s, p1/m, z0.s, z11.s
  238. ld1rw z11.s, p0/z, [pB, 12]
  239. OP_rr z20.s, p1/m, z0.s, z12.s
  240. OP_ir z21.s, p1/m, z1.s, z12.s
  241. ld1rw z12.s, p0/z, [pB, 16]
  242. OP_ii z20.s, p1/m, z1.s, z13.s
  243. OP_ri z21.s, p1/m, z0.s, z13.s
  244. ld1rw z13.s, p0/z, [pB, 20]
  245. OP_rr z22.s, p1/m, z0.s, z14.s
  246. OP_ir z23.s, p1/m, z1.s, z14.s
  247. ld1rw z14.s, p0/z, [pB, 24]
  248. OP_ii z22.s, p1/m, z1.s, z15.s
  249. OP_ri z23.s, p1/m, z0.s, z15.s
  250. ld1rw z15.s, p0/z, [pB, 28]
  251. add pB, pB, 32
  252. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  253. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  254. .endm
  // KERNELv1x4_M2: middle pipeline step that consumes A in z2/z3.
  // Mirror image of KERNELv1x4_M1: accumulates A(z2/z3) * B(z8-z15) into
  // z16-z23 while loading the next A vector into z0/z1 and reloading each B
  // register right after its last use. Advances pA by lanes * 8 bytes and pB
  // by 32 bytes; this step issues the B-side prefetch hints.
  255. .macro KERNELv1x4_M2
  256. ld2w {z0.s, z1.s}, p1/z, [pA]
  257. add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4
  258. OP_rr z16.s, p1/m, z2.s, z8.s
  259. OP_ir z17.s, p1/m, z3.s, z8.s
  260. ld1rw z8.s, p0/z, [pB]
  261. OP_ii z16.s, p1/m, z3.s, z9.s
  262. OP_ri z17.s, p1/m, z2.s, z9.s
  263. ld1rw z9.s, p0/z, [pB, 4]
  264. OP_rr z18.s, p1/m, z2.s, z10.s
  265. OP_ir z19.s, p1/m, z3.s, z10.s
  266. ld1rw z10.s, p0/z, [pB, 8]
  267. OP_ii z18.s, p1/m, z3.s, z11.s
  268. OP_ri z19.s, p1/m, z2.s, z11.s
  269. ld1rw z11.s, p0/z, [pB, 12]
  270. OP_rr z20.s, p1/m, z2.s, z12.s
  271. OP_ir z21.s, p1/m, z3.s, z12.s
  272. ld1rw z12.s, p0/z, [pB, 16]
  273. OP_ii z20.s, p1/m, z3.s, z13.s
  274. OP_ri z21.s, p1/m, z2.s, z13.s
  275. ld1rw z13.s, p0/z, [pB, 20]
  276. OP_rr z22.s, p1/m, z2.s, z14.s
  277. OP_ir z23.s, p1/m, z3.s, z14.s
  278. ld1rw z14.s, p0/z, [pB, 24]
  279. OP_ii z22.s, p1/m, z3.s, z15.s
  280. OP_ri z23.s, p1/m, z2.s, z15.s
  281. ld1rw z15.s, p0/z, [pB, 28]
  282. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  283. add pB, pB, 32
  284. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  285. .endm
  // KERNELv1x4_E: closing step of the pipelined 1x4 kernel.
  // Accumulates A(z2/z3) * B(z8-z15) into z16-z23 and loads nothing further;
  // pA and pB are left unchanged. Only prefetch hints touch memory.
  286. .macro KERNELv1x4_E
  287. OP_rr z16.s, p1/m, z2.s, z8.s
  288. OP_ir z17.s, p1/m, z3.s, z8.s
  289. OP_ii z16.s, p1/m, z3.s, z9.s
  290. OP_ri z17.s, p1/m, z2.s, z9.s
  291. OP_rr z18.s, p1/m, z2.s, z10.s
  292. OP_ir z19.s, p1/m, z3.s, z10.s
  293. OP_ii z18.s, p1/m, z3.s, z11.s
  294. OP_ri z19.s, p1/m, z2.s, z11.s
  295. OP_rr z20.s, p1/m, z2.s, z12.s
  296. OP_ir z21.s, p1/m, z3.s, z12.s
  297. OP_ii z20.s, p1/m, z3.s, z13.s
  298. OP_ri z21.s, p1/m, z2.s, z13.s
  299. OP_rr z22.s, p1/m, z2.s, z14.s
  300. OP_ir z23.s, p1/m, z3.s, z14.s
  301. OP_ii z22.s, p1/m, z3.s, z15.s
  302. OP_ri z23.s, p1/m, z2.s, z15.s
  303. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  304. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  305. .endm
  // KERNELv1x4_SUB: one self-contained (non-pipelined) 1x4 step.
  // Loads one A vector into z0/z1 and eight B scalars into z8-z15, then
  // accumulates into z16-z23. Advances pA by lanes * 8 bytes and pB by 32
  // bytes. Used for the tempK % 8 remainder iterations.
  306. .macro KERNELv1x4_SUB
  307. ld2w {z0.s, z1.s}, p1/z, [pA]
  308. add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
  309. ld1rw z8.s, p0/z, [pB]
  310. ld1rw z9.s, p0/z, [pB, 4]
  311. ld1rw z10.s, p0/z, [pB, 8]
  312. ld1rw z11.s, p0/z, [pB, 12]
  313. OP_rr z16.s, p1/m, z0.s, z8.s
  314. OP_ir z17.s, p1/m, z1.s, z8.s
  315. OP_ii z16.s, p1/m, z1.s, z9.s
  316. OP_ri z17.s, p1/m, z0.s, z9.s
  317. ld1rw z12.s, p0/z, [pB, 16]
  318. ld1rw z13.s, p0/z, [pB, 20]
  319. ld1rw z14.s, p0/z, [pB, 24]
  320. ld1rw z15.s, p0/z, [pB, 28]
  321. OP_rr z18.s, p1/m, z0.s, z10.s
  322. OP_ir z19.s, p1/m, z1.s, z10.s
  323. OP_ii z18.s, p1/m, z1.s, z11.s
  324. OP_ri z19.s, p1/m, z0.s, z11.s
  325. add pB, pB, 32
  326. OP_rr z20.s, p1/m, z0.s, z12.s
  327. OP_ir z21.s, p1/m, z1.s, z12.s
  328. OP_ii z20.s, p1/m, z1.s, z13.s
  329. OP_ri z21.s, p1/m, z0.s, z13.s
  330. OP_rr z22.s, p1/m, z0.s, z14.s
  331. OP_ir z23.s, p1/m, z1.s, z14.s
  332. OP_ii z22.s, p1/m, z1.s, z15.s
  333. OP_ri z23.s, p1/m, z0.s, z15.s
  334. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  335. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  336. .endm
  // SAVEv1x4: scale the four accumulator pairs by alpha and store them.
  // For each pCRowN the complex product alpha * acc is formed as
  //   real = acc_r * alpha_r - acc_i * alpha_i
  //   imag = acc_r * alpha_i + acc_i * alpha_r
  // in a zeroed register pair, stored interleaved with st2w under p1, and the
  // row pointer is advanced by lanes * 8 bytes. C is only written, never read.
  // Clobbers z24-z31. Each of the four row pointers now gets one prefetch
  // hint; previously pCRow3 was hinted twice and pCRow2 not at all.
  337. .macro SAVEv1x4
  338. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  339. eor z24.d, z16.d, z16.d // x ^ x = 0: clear the result pair
  340. eor z25.d, z16.d, z16.d
  341. fmla z24.s, p1/m, z16.s, alphaz_R
  342. fmls z24.s, p1/m, z17.s, alphaz_I
  343. fmla z25.s, p1/m, z16.s, alphaz_I
  344. fmla z25.s, p1/m, z17.s, alphaz_R
  345. st2w {z24.s, z25.s}, p1, [pCRow0]
  346. add pCRow0, pCRow0, lanes, lsl #3
  347. eor z26.d, z16.d, z16.d
  348. eor z27.d, z16.d, z16.d
  349. fmla z26.s, p1/m, z18.s, alphaz_R
  350. fmls z26.s, p1/m, z19.s, alphaz_I
  351. fmla z27.s, p1/m, z18.s, alphaz_I
  352. fmla z27.s, p1/m, z19.s, alphaz_R
  353. st2w {z26.s, z27.s}, p1, [pCRow1]
  354. add pCRow1, pCRow1, lanes, lsl #3
  355. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  356. eor z28.d, z16.d, z16.d
  357. eor z29.d, z16.d, z16.d
  358. fmla z28.s, p1/m, z20.s, alphaz_R
  359. fmls z28.s, p1/m, z21.s, alphaz_I
  360. fmla z29.s, p1/m, z20.s, alphaz_I
  361. fmla z29.s, p1/m, z21.s, alphaz_R
  362. st2w {z28.s, z29.s}, p1, [pCRow2]
  363. add pCRow2, pCRow2, lanes, lsl #3
  364. eor z30.d, z16.d, z16.d
  365. eor z31.d, z16.d, z16.d
  366. fmla z30.s, p1/m, z22.s, alphaz_R
  367. fmls z30.s, p1/m, z23.s, alphaz_I
  368. fmla z31.s, p1/m, z22.s, alphaz_I
  369. fmla z31.s, p1/m, z23.s, alphaz_R
  370. st2w {z30.s, z31.s}, p1, [pCRow3]
  371. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] // was a second pCRow3 hint
  372. add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
  373. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  374. .endm
  375. /******************************************************************************/
  // INITv1x2: clear the four accumulators z16-z19 (real/imaginary pairs for
  // pCRow0 and pCRow1). "mov zN.s, #0" is the immediate-move spelling of the
  // original "dup zN.s, #0".
  376. .macro INITv1x2
  377. mov z16.s, #0 // pCRow0 real
  378. mov z17.s, #0 // pCRow0 imaginary
  379. mov z18.s, #0 // pCRow1 real
  380. mov z19.s, #0 // pCRow1 imaginary
  381. .endm
  // KERNELv1x2_SUB: one self-contained 1x2 step.
  // Loads one A vector into z0/z1 and four B scalars into z8-z11, then
  // accumulates into z16-z19. Advances pA by lanes * 8 bytes and pB by 16 bytes.
  382. .macro KERNELv1x2_SUB
  383. ld2w {z0.s, z1.s}, p1/z, [pA]
  384. add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
  385. ld1rw z8.s, p0/z, [pB]
  386. ld1rw z9.s, p0/z, [pB, 4]
  387. ld1rw z10.s, p0/z, [pB, 8]
  388. ld1rw z11.s, p0/z, [pB, 12]
  389. OP_rr z16.s, p1/m, z0.s, z8.s
  390. OP_ir z17.s, p1/m, z1.s, z8.s
  391. OP_ii z16.s, p1/m, z1.s, z9.s
  392. OP_ri z17.s, p1/m, z0.s, z9.s
  393. OP_rr z18.s, p1/m, z0.s, z10.s
  394. OP_ir z19.s, p1/m, z1.s, z10.s
  395. OP_ii z18.s, p1/m, z1.s, z11.s
  396. OP_ri z19.s, p1/m, z0.s, z11.s
  397. add pB, pB, 16
  398. .endm
  // SAVEv1x2: scale the two accumulator pairs (z16/z17, z18/z19) by alpha and
  // store them through pCRow0 and pCRow1, advancing each pointer by
  // lanes * 8 bytes. Same arithmetic as SAVEv1x4:
  //   real = acc_r * alpha_r - acc_i * alpha_i
  //   imag = acc_r * alpha_i + acc_i * alpha_r
  // Clobbers z24-z27. The final prefetch hint used to go through pCRow2, which
  // the 2-wide path never sets up; it now uses the advanced pCRow0 instead.
  399. .macro SAVEv1x2
  400. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  401. eor z24.d, z16.d, z16.d // x ^ x = 0: clear the result pair
  402. eor z25.d, z16.d, z16.d
  403. fmla z24.s, p1/m, z16.s, alphaz_R
  404. fmls z24.s, p1/m, z17.s, alphaz_I
  405. fmla z25.s, p1/m, z16.s, alphaz_I
  406. fmla z25.s, p1/m, z17.s, alphaz_R
  407. st2w {z24.s, z25.s}, p1, [pCRow0]
  408. add pCRow0, pCRow0, lanes, lsl #3
  409. eor z26.d, z16.d, z16.d
  410. eor z27.d, z16.d, z16.d
  411. fmla z26.s, p1/m, z18.s, alphaz_R
  412. fmls z26.s, p1/m, z19.s, alphaz_I
  413. fmla z27.s, p1/m, z18.s, alphaz_I
  414. fmla z27.s, p1/m, z19.s, alphaz_R
  415. st2w {z26.s, z27.s}, p1, [pCRow1]
  416. add pCRow1, pCRow1, lanes, lsl #3
  417. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  418. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] // was pCRow2, which is stale here
  419. .endm
  420. /******************************************************************************/
  // INITv1x1: clear the accumulator pair z16/z17 used for pCRow0.
  // "mov zN.s, #0" is the immediate-move spelling of the original "dup".
  421. .macro INITv1x1
  422. mov z16.s, #0 // real
  423. mov z17.s, #0 // imaginary
  424. .endm
  // KERNELv1x1_SUB: one self-contained 1x1 step.
  // Loads one A vector into z0/z1 and two B scalars into z8/z9, then
  // accumulates into z16/z17. Advances pA by lanes * 8 bytes and pB by 8 bytes.
  425. .macro KERNELv1x1_SUB
  426. ld2w {z0.s, z1.s}, p1/z, [pA]
  427. add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4
  428. ld1rw z8.s, p0/z, [pB]
  429. ld1rw z9.s, p0/z, [pB, 4]
  430. add pB, pB, 8
  431. OP_rr z16.s, p1/m, z0.s, z8.s
  432. OP_ir z17.s, p1/m, z1.s, z8.s
  433. OP_ii z16.s, p1/m, z1.s, z9.s
  434. OP_ri z17.s, p1/m, z0.s, z9.s
  435. .endm
  // SAVEv1x1: scale the accumulator pair z16/z17 by alpha and store it through
  // pCRow0, then advance pCRow0 by lanes * 8 bytes.
  //   real = acc_r * alpha_r - acc_i * alpha_i
  //   imag = acc_r * alpha_i + acc_i * alpha_r
  // Clobbers z24/z25. The final prefetch hint used to go through pCRow3, which
  // the 1-wide path never sets up; it now uses the advanced pCRow0 instead.
  436. .macro SAVEv1x1
  437. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  438. eor z24.d, z16.d, z16.d // x ^ x = 0: clear the result pair
  439. eor z25.d, z16.d, z16.d
  440. fmla z24.s, p1/m, z16.s, alphaz_R
  441. fmls z24.s, p1/m, z17.s, alphaz_I
  442. fmla z25.s, p1/m, z16.s, alphaz_I
  443. fmla z25.s, p1/m, z17.s, alphaz_R
  444. st2w {z24.s, z25.s}, p1, [pCRow0]
  445. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 * 4
  446. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] // was pCRow3, which is stale here
  447. .endm
  448. /******************************************************************************/
  449. /*******************************************************************************
  450. * End of macro definitions
  451. *******************************************************************************/
  // Function entry: reserve 11 * 16 bytes of stack, save d8-d17 and x18-x28,
  // broadcast alpha (s0/s1) into z6/z7, convert LDC from elements to bytes,
  // build the all-true predicate p0, and start the loop over groups of four in N.
  452. PROLOGUE
  453. .align 5
  454. add sp, sp, #-(11 * 16)
  455. stp d8, d9, [sp, #(0 * 16)]
  456. stp d10, d11, [sp, #(1 * 16)]
  457. stp d12, d13, [sp, #(2 * 16)]
  458. stp d14, d15, [sp, #(3 * 16)]
  459. stp d16, d17, [sp, #(4 * 16)]
  460. stp x18, x19, [sp, #(5 * 16)]
  461. stp x20, x21, [sp, #(6 * 16)]
  462. stp x22, x23, [sp, #(7 * 16)]
  463. stp x24, x25, [sp, #(8 * 16)]
  464. stp x26, x27, [sp, #(9 * 16)]
  465. str x28, [sp, #(10 * 16)]
  466. prfm PLDL1KEEP, [origPB]
  467. prfm PLDL1KEEP, [origPA]
  468. fmov alphaR, s0 // move the bits of s0 into w19 ...
  469. dup alphaz_R, alphaR // ... and replicate them into every .s lane of z6
  470. fmov alphaI, s1 // same for s1 -> w20 -> z7
  471. dup alphaz_I, alphaI
  472. lsl LDC, LDC, #3 // ldc = ldc * 2 * 4
  473. ptrue p0.s // create true predicate
  474. #if !defined(LEFT)
  475. neg tempOffset, offset
  476. #endif
  477. mov pB, origPB
  478. // Loop over N
  479. mov counterJ, origN
  480. asr counterJ, counterJ, #2 // J = J / 4
  481. cmp counterJ, #0
  482. ble .Lctrmm_kernel_L2_BEGIN // fewer than 4 in N: skip the 4-wide section
  483. /******************************************************************************/
  // 4-wide section: one pass of this block handles four C pointers
  // (pCRow0..pCRow3, LDC bytes apart) for every vector-sized slice of M.
  // counterJ counts the remaining groups of four.
  484. .Lctrmm_kernel_L4_BEGIN:
  485. mov pCRow0, pC
  486. add pCRow1, pCRow0, LDC
  487. add pCRow2, pCRow1, LDC
  488. add pCRow3, pCRow2, LDC
  489. add pC, pCRow3, LDC // pC now points past the four rows handled here
  490. #if defined(LEFT)
  491. mov tempOffset, offset
  492. #endif
  493. mov pA, origPA // pA = start of A array
  494. .Lctrmm_kernel_L4_Mv1_BEGIN:
  495. /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
  496. mov counterI, #0
  497. whilelt p1.s, counterI, origM
  498. cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
  499. .align 5
  500. .Lctrmm_kernel_L4_Mv1_20:
  501. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  502. mov pB, origPB
  503. #else
  504. mov pB, origPB
  505. mul temp, tempOffset, lanes
  506. add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
  507. lsl temp, tempOffset, #5 // tempOffset * 32 bytes (4 complex floats per k step)
  508. add pB, pB, temp
  509. #endif
  510. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  511. sub tempK, origK, tempOffset
  512. #elif defined(LEFT)
  513. add tempK, tempOffset, lanes
  514. #else
  515. add tempK, tempOffset, #4
  516. #endif
  517. INITv1x4 // fill with zeros
  518. asr counterL , tempK, #3 // counterL = tempK / 8 (blocks of eight k steps)
  519. cmp counterL , #2
  520. blt .Lctrmm_kernel_L4_Mv1_32 // fewer than two blocks: no steady-state loop
  // First block of eight: _I primes the pipeline, then M2/M1 alternate.
  521. KERNELv1x4_I
  522. KERNELv1x4_M2
  523. KERNELv1x4_M1
  524. KERNELv1x4_M2
  525. KERNELv1x4_M1
  526. KERNELv1x4_M2
  527. KERNELv1x4_M1
  528. KERNELv1x4_M2
  529. subs counterL, counterL, #2 // subtract 2
  530. ble .Lctrmm_kernel_L4_Mv1_22a
  531. .align 5
  // Steady state: eight pipelined k steps per iteration.
  532. .Lctrmm_kernel_L4_Mv1_22:
  533. KERNELv1x4_M1
  534. KERNELv1x4_M2
  535. KERNELv1x4_M1
  536. KERNELv1x4_M2
  537. KERNELv1x4_M1
  538. KERNELv1x4_M2
  539. KERNELv1x4_M1
  540. KERNELv1x4_M2
  541. subs counterL, counterL, #1
  542. bgt .Lctrmm_kernel_L4_Mv1_22
  543. .align 5
  // Last block of eight: ends with _E, which consumes without loading.
  544. .Lctrmm_kernel_L4_Mv1_22a:
  545. KERNELv1x4_M1
  546. KERNELv1x4_M2
  547. KERNELv1x4_M1
  548. KERNELv1x4_M2
  549. KERNELv1x4_M1
  550. KERNELv1x4_M2
  551. KERNELv1x4_M1
  552. KERNELv1x4_E
  553. b .Lctrmm_kernel_L4_Mv1_44
  554. .align 5
  // counterL < 2 here: run a single block of eight if bit 0 is set, else none.
  555. .Lctrmm_kernel_L4_Mv1_32:
  556. tst counterL, #1
  557. ble .Lctrmm_kernel_L4_Mv1_40
  558. KERNELv1x4_I
  559. KERNELv1x4_M2
  560. KERNELv1x4_M1
  561. KERNELv1x4_M2
  562. KERNELv1x4_M1
  563. KERNELv1x4_M2
  564. KERNELv1x4_M1
  565. KERNELv1x4_E
  566. b .Lctrmm_kernel_L4_Mv1_44
  567. .Lctrmm_kernel_L4_Mv1_40:
  568. INITv1x4
  // Remainder: tempK % 8 single steps.
  569. .Lctrmm_kernel_L4_Mv1_44:
  570. ands counterL , tempK, #7
  571. ble .Lctrmm_kernel_L4_Mv1_100
  572. .align 5
  573. .Lctrmm_kernel_L4_Mv1_46:
  574. KERNELv1x4_SUB
  575. subs counterL, counterL, #1
  576. bne .Lctrmm_kernel_L4_Mv1_46
  577. .Lctrmm_kernel_L4_Mv1_100:
  578. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  579. sub tempK, origK, tempOffset
  580. #if defined(LEFT)
  581. sub tempK, tempK, lanes
  582. #else
  583. sub tempK, tempK, #4
  584. #endif
  585. mul temp, tempK, lanes
  586. add pA, pA, temp, lsl #3 // add tempK*lanes*4*2
  587. lsl temp, tempK, #5 // tempK * 32 bytes
  588. add pB, pB, temp
  589. #endif
  590. #if defined(LEFT)
  591. add tempOffset, tempOffset, lanes
  592. #endif
  593. prfm PLDL1KEEP, [pA]
  594. prfm PLDL1KEEP, [pA, #64]
  595. prfm PLDL1KEEP, [origPB]
  596. SAVEv1x4
  597. .Lctrmm_kernel_L4_Mv1_END:
  598. incw counterI // advance by the number of .s elements in a vector
  599. whilelt p1.s, counterI, origM //SVE instruction
  600. cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension
  601. b.any .Lctrmm_kernel_L4_Mv1_20 // flags come from whilelt: loop while any lane is active
  602. .Lctrmm_kernel_L4_END:
  603. lsl temp, origK, #5
  604. add origPB, origPB, temp // B = B + K * 4 * 4 * 2 (32 bytes per k step)
  605. #if !defined(LEFT)
  606. add tempOffset, tempOffset, #4
  607. #endif
  608. subs counterJ, counterJ , #1 // j--
  609. bgt .Lctrmm_kernel_L4_BEGIN
  610. /******************************************************************************/
  // 2-wide section: runs once when bit 1 of origN is set, using pCRow0 and
  // pCRow1 only. Exits straight to L999 when origN has no remainder mod 4.
  611. .Lctrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  612. mov counterJ , origN
  613. tst counterJ , #3
  614. ble .Lctrmm_kernel_L999
  615. tst counterJ , #2
  616. ble .Lctrmm_kernel_L1_BEGIN
  617. mov pCRow0, pC // pCRow0 = pC
  618. add pCRow1, pCRow0, LDC
  619. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  620. #if defined(LEFT)
  621. mov tempOffset, offset
  622. #endif
  623. mov pA, origPA // pA = A
  624. .Lctrmm_kernel_L2_Mv1_BEGIN:
  625. mov counterI, #0
  626. whilelt p1.s, counterI, origM //SVE instruction
  627. cntp lanes, p0, p1.s // lanes = number of active lanes in p1
  628. .Lctrmm_kernel_L2_Mv1_20:
  629. INITv1x2
  630. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  631. mov pB, origPB
  632. #else
  633. mov pB, origPB
  634. mul temp, tempOffset, lanes
  635. add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
  636. lsl temp, tempOffset, #4 // tempOffset * 16 bytes (2 complex floats per k step)
  637. add pB, pB, temp
  638. #endif
  639. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  640. sub tempK, origK, tempOffset
  641. #elif defined(LEFT)
  642. add tempK, tempOffset, lanes
  643. #else
  644. add tempK, tempOffset, #2
  645. #endif
  646. asr counterL , tempK, #3 // counterL = counterL / 8
  647. cmp counterL,#0
  648. ble .Lctrmm_kernel_L2_Mv1_40
  649. .align 5
  // Eight k steps per iteration.
  650. .Lctrmm_kernel_L2_Mv1_22:
  651. KERNELv1x2_SUB
  652. KERNELv1x2_SUB
  653. KERNELv1x2_SUB
  654. KERNELv1x2_SUB
  655. KERNELv1x2_SUB
  656. KERNELv1x2_SUB
  657. KERNELv1x2_SUB
  658. KERNELv1x2_SUB
  659. subs counterL, counterL, #1
  660. bgt .Lctrmm_kernel_L2_Mv1_22
  661. .Lctrmm_kernel_L2_Mv1_40:
  662. ands counterL , tempK, #7 // counterL = counterL % 8
  663. ble .Lctrmm_kernel_L2_Mv1_100
  664. .Lctrmm_kernel_L2_Mv1_42:
  665. KERNELv1x2_SUB
  666. subs counterL, counterL, #1
  667. bgt .Lctrmm_kernel_L2_Mv1_42
  668. .Lctrmm_kernel_L2_Mv1_100:
  669. SAVEv1x2
  670. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  671. sub tempK, origK, tempOffset
  672. #if defined(LEFT)
  673. sub tempK, tempK, lanes
  674. #else
  675. sub tempK, tempK, #2
  676. #endif
  677. mul temp, tempK, lanes
  678. add pA, pA, temp, lsl #3 // add tempK*lanes*4*2
  679. lsl temp, tempK, #4 // tempK * 16 bytes
  680. add pB, pB, temp
  681. #endif
  682. #if defined(LEFT)
  683. add tempOffset, tempOffset, lanes
  684. #endif
  685. .Lctrmm_kernel_L2_Mv1_END:
  686. incw counterI // advance by the number of .s elements in a vector
  687. whilelt p1.s, counterI, origM //SVE instruction
  688. cntp lanes, p0, p1.s
  689. b.any .Lctrmm_kernel_L2_Mv1_20 // flags come from whilelt: loop while any lane is active
  690. .Lctrmm_kernel_L2_END:
  691. #if !defined(LEFT)
  692. add tempOffset, tempOffset, #2
  693. #endif
  694. lsl temp, origK, #4
  695. add origPB, origPB, temp // B = B + K * 2 * 4 * 2 (16 bytes per k step)
  696. /******************************************************************************/
  // 1-wide section: runs once when bit 0 of origN is set, using pCRow0 only.
  697. .Lctrmm_kernel_L1_BEGIN:
  698. mov counterJ , origN
  699. tst counterJ , #1
  700. ble .Lctrmm_kernel_L999 // done
  701. mov pCRow0, pC // pCRow0 = C
  702. add pC , pC , LDC // Update pC to point to next
  703. #if defined(LEFT)
  704. mov tempOffset, offset
  705. #endif
  706. mov pA, origPA // pA = A
  707. .Lctrmm_kernel_L1_Mv1_BEGIN:
  708. mov counterI, #0
  709. whilelt p1.s, counterI, origM //SVE instruction
  710. cntp lanes, p0, p1.s // lanes = number of active lanes in p1
  711. .Lctrmm_kernel_L1_Mv1_20:
  712. INITv1x1
  713. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  714. mov pB, origPB
  715. #else
  716. mov pB, origPB
  717. mul temp, tempOffset, lanes
  718. add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2
  719. lsl temp, tempOffset, #3 // tempOffset * 8 bytes (1 complex float per k step)
  720. add pB, pB, temp
  721. #endif
  722. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  723. sub tempK, origK, tempOffset
  724. #elif defined(LEFT)
  725. add tempK, tempOffset, lanes
  726. #else
  727. add tempK, tempOffset, #1
  728. #endif
  729. asr counterL , tempK, #3 // counterL = counterL / 8
  730. cmp counterL , #0
  731. ble .Lctrmm_kernel_L1_Mv1_40
  732. .align 5
  // Eight k steps per iteration.
  733. .Lctrmm_kernel_L1_Mv1_22:
  734. KERNELv1x1_SUB
  735. KERNELv1x1_SUB
  736. KERNELv1x1_SUB
  737. KERNELv1x1_SUB
  738. KERNELv1x1_SUB
  739. KERNELv1x1_SUB
  740. KERNELv1x1_SUB
  741. KERNELv1x1_SUB
  742. subs counterL, counterL, #1
  743. bgt .Lctrmm_kernel_L1_Mv1_22
  744. .Lctrmm_kernel_L1_Mv1_40:
  745. ands counterL , tempK, #7 // counterL = counterL % 8
  746. ble .Lctrmm_kernel_L1_Mv1_100
  747. .Lctrmm_kernel_L1_Mv1_42:
  748. KERNELv1x1_SUB
  749. subs counterL, counterL, #1
  750. bgt .Lctrmm_kernel_L1_Mv1_42
  751. .Lctrmm_kernel_L1_Mv1_100:
  752. SAVEv1x1
  753. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  754. sub tempK, origK, tempOffset
  755. #if defined(LEFT)
  756. sub tempK, tempK, lanes
  757. #else
  758. sub tempK, tempK, #1
  759. #endif
  760. mul temp, tempK, lanes
  761. add pA, pA, temp, lsl #3 // add tempK*lanes*4*2
  762. lsl temp, tempK, #3 // tempK * 8 bytes
  763. add pB, pB, temp
  764. #endif
  765. #if defined(LEFT)
  766. add tempOffset, tempOffset, lanes
  767. #endif
  768. .Lctrmm_kernel_L1_Mv1_END:
  769. incw counterI // advance by the number of .s elements in a vector
  770. whilelt p1.s, counterI, origM //SVE instruction
  771. cntp lanes, p0, p1.s
  772. b.any .Lctrmm_kernel_L1_Mv1_20 // flags come from whilelt: loop while any lane is active
  773. .Lctrmm_kernel_L1_END:
  774. /******************************************************************************/
  // Common exit: return 0 in x0, reload every register stored at entry from
  // the same stack slots, release the 11 * 16 byte frame and return.
  775. .Lctrmm_kernel_L999:
  776. mov x0, #0 // set return value
  777. ldp d8, d9, [sp, #(0 * 16)]
  778. ldp d10, d11, [sp, #(1 * 16)]
  779. ldp d12, d13, [sp, #(2 * 16)]
  780. ldp d14, d15, [sp, #(3 * 16)]
  781. ldp d16, d17, [sp, #(4 * 16)]
  782. ldp x18, x19, [sp, #(5 * 16)]
  783. ldp x20, x21, [sp, #(6 * 16)]
  784. ldp x22, x23, [sp, #(7 * 16)]
  785. ldp x24, x25, [sp, #(8 * 16)]
  786. ldp x26, x27, [sp, #(9 * 16)]
  787. ldr x28, [sp, #(10 * 16)]
  788. add sp, sp, #(11*16)
  789. ret
  790. EPILOGUE