You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ctrmm_kernel_8x4.S 50 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Argument registers, in parameter order: x0 x1 x2 s0 s1 x3 x4 x5 x6 x7 */
  30. /* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT alpha1, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0 /* argument bm */
  32. #define origN x1 /* argument bn */
  33. #define origK x2 /* argument bk */
  34. #define origPA x3 /* argument ba */
  35. #define origPB x4 /* argument bb */
  36. #define pC x5 /* argument C */
  37. #define LDC x6 /* argument ldc; the SAVE macros add it to a C pointer as a byte offset */
  38. #define offset x7 /* argument offset */
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11 /* B read pointer, advanced by the KERNEL macros after each load */
  43. #define pCRow0 x12 /* C write pointer, advanced by each SAVE macro */
  44. #define pCRow1 x13 /* SAVE-macro temporary: current C position, stepped by LDC */
  45. #define pCRow2 x14 /* SAVE-macro temporary: pCRow1 + 32 bytes */
  46. #define pA x15 /* A read pointer, advanced by the KERNEL macros after each load */
  47. #define temp x16
  48. #define tempOffset x17
  49. #define tempK x18 /* NOTE(review): x18 is platform-reserved on some AArch64 ABIs - confirm target platforms */
  50. #define alpha0_R s10 /* scalar view of v10 */
  51. #define alphaV0_R v10.s[0] /* lane-0 view of v10; multiplies the real accumulator for the real output */
  52. #define alpha0_I s11 /* scalar view of v11 */
  53. #define alphaV0_I v11.s[0] /* lane-0 view of v11; multiplies the imaginary accumulator for the real output */
  54. #define alpha1_R s14 /* scalar view of v14 */
  55. #define alphaV1_R v14.s[0] /* lane-0 view of v14; multiplies the imaginary accumulator for the imaginary output */
  56. #define alpha1_I s15 /* scalar view of v15 */
  57. #define alphaV1_I v15.s[0] /* lane-0 view of v15; multiplies the real accumulator for the imaginary output */
  58. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* re += Ar*Br - Ai*Bi, im += Ar*Bi + Ai*Br: sign pattern of A*B */
  59. #define OP_rr fmla
  60. #define OP_ii fmls
  61. #define OP_ri fmla
  62. #define OP_ir fmla
  63. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) /* re += Ar*Br + Ai*Bi, im += Ai*Br - Ar*Bi: sign pattern of A*conj(B) */
  64. #define OP_rr fmla
  65. #define OP_ii fmla
  66. #define OP_ri fmls
  67. #define OP_ir fmla
  68. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) /* re += Ar*Br + Ai*Bi, im += Ar*Bi - Ai*Br: sign pattern of conj(A)*B */
  69. #define OP_rr fmla
  70. #define OP_ii fmla
  71. #define OP_ri fmla
  72. #define OP_ir fmls
  73. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) /* re += Ar*Br - Ai*Bi, im += -Ar*Bi - Ai*Br: sign pattern of conj(A)*conj(B) */
  74. #define OP_rr fmla
  75. #define OP_ii fmls
  76. #define OP_ri fmls
  77. #define OP_ir fmls
  78. #endif
  79. // 00 origM
  80. // 01 origN
  81. // 02 origK
  82. // 03 origPA
  83. // 04 origPB
  84. // 05 pC
  85. // 06 origLDC -> LDC
  86. // 07 offset
  87. // 08 counterL
  88. // 09 counterI
  89. // 10 counterJ
  90. // 11 pB
  91. // 12 pCRow0
  92. // 13 pCRow1
  93. // 14 pCRow2
  94. // 15 pA
  95. // 16 temp
  96. // 17 tempOffset
  97. // 18 must save tempK
  98. // 19 must save
  99. // 20 must save
  100. // 21 must save
  101. // 22 must save
  102. // 23 must save
  103. // 24 must save
  104. // 25 must save
  105. // 26 must save
  106. // 27 must save
  107. // 28 must save
  108. // 29 frame
  109. // 30 link
  110. // 31 sp
  111. //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
  112. //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
  113. //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
  114. //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
  115. //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
  116. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
  117. //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
  118. //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
  119. //v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
  120. //v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
  121. //v10 must save ALPHA0_R
  122. //v11 must save ALPHA0_I
  123. //v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
  124. //v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
  125. //v14 must save ALPHA1_R
  126. //v15 must save ALPHA1_I
  127. //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
  128. //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
  129. //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
  130. //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
  131. //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
  132. //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
  133. //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
  134. //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
  135. //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
  136. //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
  137. //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
  138. //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
  139. //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
  140. //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
  141. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
  142. //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
  143. /*******************************************************************************
  144. * Macro definitions
  145. *******************************************************************************/
  146. .macro INIT8x4 /* clear all sixteen accumulators v16-v31 of the 8x4 tile */
  147. movi v16.4s, #0
  148. movi v17.4s, #0
  149. movi v18.4s, #0
  150. movi v19.4s, #0
  151. movi v20.4s, #0
  152. movi v21.4s, #0
  153. movi v22.4s, #0
  154. movi v23.4s, #0
  155. movi v24.4s, #0
  156. movi v25.4s, #0
  157. movi v26.4s, #0
  158. movi v27.4s, #0
  159. movi v28.4s, #0
  160. movi v29.4s, #0
  161. movi v30.4s, #0
  162. movi v31.4s, #0
  163. .endm
  164. .macro KERNEL8x4_I /* 8x4 step that overwrites the accumulators (fmul start) and preloads v4-v7, v12, v13; presumably the first step of the pipelined loop */
  165. ld2 {v8.4s, v9.4s}, [pB] // de-interleave four B values: v8 = real parts, v9 = imaginary parts
  166. add pB, pB, #32
  167. ld2 {v0.4s, v1.4s}, [pA] // A elements 0-3: v0 = real parts, v1 = imaginary parts
  168. add pA, pA, #32
  169. ld2 {v2.4s, v3.4s}, [pA] // A elements 4-7: v2 = real parts, v3 = imaginary parts
  170. add pA, pA, #32
  171. fmul v16.4s, v0.4s, v8.s[0] // B element 0, A elements 0-3: real accumulator starts as Ar*Br
  172. OP_ii v16.4s, v1.4s, v9.s[0]
  173. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  174. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  175. eor v17.16b, v17.16b, v17.16b // variants that subtract Ar*Bi: clear first, then fmls gives the negated product
  176. fmls v17.4s, v0.4s, v9.s[0]
  177. #else
  178. fmul v17.4s, v0.4s, v9.s[0] // remaining variants: imaginary accumulator starts as Ar*Bi
  179. #endif
  180. OP_ir v17.4s, v1.4s, v8.s[0]
  181. fmul v18.4s, v2.4s, v8.s[0] // B element 0, A elements 4-7
  182. OP_ii v18.4s, v3.4s, v9.s[0]
  183. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  184. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  185. eor v19.16b, v19.16b, v19.16b
  186. fmls v19.4s, v2.4s, v9.s[0]
  187. #else
  188. fmul v19.4s, v2.4s, v9.s[0]
  189. #endif
  190. OP_ir v19.4s, v3.4s, v8.s[0]
  191. fmul v20.4s, v0.4s, v8.s[1] // B element 1, A elements 0-3
  192. OP_ii v20.4s, v1.4s, v9.s[1]
  193. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  194. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  195. eor v21.16b, v21.16b, v21.16b
  196. fmls v21.4s, v0.4s, v9.s[1]
  197. #else
  198. fmul v21.4s, v0.4s, v9.s[1]
  199. #endif
  200. OP_ir v21.4s, v1.4s, v8.s[1]
  201. fmul v22.4s, v2.4s, v8.s[1] // B element 1, A elements 4-7
  202. OP_ii v22.4s, v3.4s, v9.s[1]
  203. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  204. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  205. eor v23.16b, v23.16b, v23.16b
  206. fmls v23.4s, v2.4s, v9.s[1]
  207. #else
  208. fmul v23.4s, v2.4s, v9.s[1]
  209. #endif
  210. OP_ir v23.4s, v3.4s, v8.s[1]
  211. fmul v24.4s, v0.4s, v8.s[2] // B element 2, A elements 0-3
  212. OP_ii v24.4s, v1.4s, v9.s[2]
  213. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  214. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  215. eor v25.16b, v25.16b, v25.16b
  216. fmls v25.4s, v0.4s, v9.s[2]
  217. #else
  218. fmul v25.4s, v0.4s, v9.s[2]
  219. #endif
  220. OP_ir v25.4s, v1.4s, v8.s[2]
  221. fmul v26.4s, v2.4s, v8.s[2] // B element 2, A elements 4-7
  222. OP_ii v26.4s, v3.4s, v9.s[2]
  223. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  224. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  225. eor v27.16b, v27.16b, v27.16b
  226. fmls v27.4s, v2.4s, v9.s[2]
  227. #else
  228. fmul v27.4s, v2.4s, v9.s[2]
  229. #endif
  230. OP_ir v27.4s, v3.4s, v8.s[2]
  231. fmul v28.4s, v0.4s, v8.s[3] // B element 3, A elements 0-3
  232. OP_ii v28.4s, v1.4s, v9.s[3]
  233. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  234. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  235. eor v29.16b, v29.16b, v29.16b
  236. fmls v29.4s, v0.4s, v9.s[3]
  237. #else
  238. fmul v29.4s, v0.4s, v9.s[3]
  239. #endif
  240. OP_ir v29.4s, v1.4s, v8.s[3]
  241. fmul v30.4s, v2.4s, v8.s[3] // B element 3, A elements 4-7
  242. OP_ii v30.4s, v3.4s, v9.s[3]
  243. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  244. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  245. eor v31.16b, v31.16b, v31.16b
  246. fmls v31.4s, v2.4s, v9.s[3]
  247. #else
  248. fmul v31.4s, v2.4s, v9.s[3]
  249. #endif
  250. OP_ir v31.4s, v3.4s, v8.s[3]
  251. ld2 {v12.4s, v13.4s}, [pB] // preload the second operand set: B into v12 (real) and v13 (imaginary)
  252. add pB, pB, #32
  253. ld2 {v4.4s, v5.4s}, [pA] // preload A elements 0-3 into v4 (real) and v5 (imaginary)
  254. add pA, pA, #32
  255. ld2 {v6.4s, v7.4s}, [pA] // preload A elements 4-7 into v6 (real) and v7 (imaginary)
  256. add pA, pA, #32
  257. .endm
  258. .macro KERNEL8x4_M1 /* accumulate one 8x4 step from v0-v3 and v8, v9, then load the other operand set v4-v7 and v12, v13 */
  259. OP_rr v16.4s, v0.4s, v8.s[0] // B element 0, A elements 0-3: v16 real, v17 imaginary
  260. OP_ii v16.4s, v1.4s, v9.s[0]
  261. OP_ri v17.4s, v0.4s, v9.s[0]
  262. OP_ir v17.4s, v1.4s, v8.s[0]
  263. OP_rr v18.4s, v2.4s, v8.s[0] // B element 0, A elements 4-7: v18 real, v19 imaginary
  264. OP_ii v18.4s, v3.4s, v9.s[0]
  265. OP_ri v19.4s, v2.4s, v9.s[0]
  266. OP_ir v19.4s, v3.4s, v8.s[0]
  267. OP_rr v20.4s, v0.4s, v8.s[1] // B element 1, A elements 0-3
  268. OP_ii v20.4s, v1.4s, v9.s[1]
  269. OP_ri v21.4s, v0.4s, v9.s[1]
  270. OP_ir v21.4s, v1.4s, v8.s[1]
  271. OP_rr v22.4s, v2.4s, v8.s[1] // B element 1, A elements 4-7
  272. OP_ii v22.4s, v3.4s, v9.s[1]
  273. OP_ri v23.4s, v2.4s, v9.s[1]
  274. OP_ir v23.4s, v3.4s, v8.s[1]
  275. OP_rr v24.4s, v0.4s, v8.s[2] // B element 2, A elements 0-3
  276. OP_ii v24.4s, v1.4s, v9.s[2]
  277. OP_ri v25.4s, v0.4s, v9.s[2]
  278. OP_ir v25.4s, v1.4s, v8.s[2]
  279. OP_rr v26.4s, v2.4s, v8.s[2] // B element 2, A elements 4-7
  280. OP_ii v26.4s, v3.4s, v9.s[2]
  281. OP_ri v27.4s, v2.4s, v9.s[2]
  282. OP_ir v27.4s, v3.4s, v8.s[2]
  283. OP_rr v28.4s, v0.4s, v8.s[3] // B element 3, A elements 0-3
  284. OP_ii v28.4s, v1.4s, v9.s[3]
  285. OP_ri v29.4s, v0.4s, v9.s[3]
  286. OP_ir v29.4s, v1.4s, v8.s[3]
  287. OP_rr v30.4s, v2.4s, v8.s[3] // B element 3, A elements 4-7
  288. OP_ii v30.4s, v3.4s, v9.s[3]
  289. OP_ri v31.4s, v2.4s, v9.s[3]
  290. OP_ir v31.4s, v3.4s, v8.s[3]
  291. ld2 {v12.4s, v13.4s}, [pB] // For next round: B real parts into v12, imaginary parts into v13
  292. add pB, pB, #32
  293. ld2 {v4.4s, v5.4s}, [pA] // For next round: A elements 0-3 into v4 (real) and v5 (imaginary)
  294. add pA, pA, #32
  295. ld2 {v6.4s, v7.4s}, [pA] // For next round: A elements 4-7 into v6 (real) and v7 (imaginary)
  296. add pA, pA, #32
  297. .endm
  298. .macro KERNEL8x4_M2 /* accumulate one 8x4 step from v4-v7 and v12, v13, then reload the first operand set v0-v3 and v8, v9 */
  299. OP_rr v16.4s, v4.4s, v12.s[0] // B element 0, A elements 0-3: v16 real, v17 imaginary
  300. OP_ii v16.4s, v5.4s, v13.s[0]
  301. OP_ri v17.4s, v4.4s, v13.s[0]
  302. OP_ir v17.4s, v5.4s, v12.s[0]
  303. OP_rr v18.4s, v6.4s, v12.s[0] // B element 0, A elements 4-7: v18 real, v19 imaginary
  304. OP_ii v18.4s, v7.4s, v13.s[0]
  305. OP_ri v19.4s, v6.4s, v13.s[0]
  306. OP_ir v19.4s, v7.4s, v12.s[0]
  307. OP_rr v20.4s, v4.4s, v12.s[1] // B element 1, A elements 0-3
  308. OP_ii v20.4s, v5.4s, v13.s[1]
  309. OP_ri v21.4s, v4.4s, v13.s[1]
  310. OP_ir v21.4s, v5.4s, v12.s[1]
  311. OP_rr v22.4s, v6.4s, v12.s[1] // B element 1, A elements 4-7
  312. OP_ii v22.4s, v7.4s, v13.s[1]
  313. OP_ri v23.4s, v6.4s, v13.s[1]
  314. OP_ir v23.4s, v7.4s, v12.s[1]
  315. OP_rr v24.4s, v4.4s, v12.s[2] // B element 2, A elements 0-3
  316. OP_ii v24.4s, v5.4s, v13.s[2]
  317. OP_ri v25.4s, v4.4s, v13.s[2]
  318. OP_ir v25.4s, v5.4s, v12.s[2]
  319. OP_rr v26.4s, v6.4s, v12.s[2] // B element 2, A elements 4-7
  320. OP_ii v26.4s, v7.4s, v13.s[2]
  321. OP_ri v27.4s, v6.4s, v13.s[2]
  322. OP_ir v27.4s, v7.4s, v12.s[2]
  323. OP_rr v28.4s, v4.4s, v12.s[3] // B element 3, A elements 0-3
  324. OP_ii v28.4s, v5.4s, v13.s[3]
  325. OP_ri v29.4s, v4.4s, v13.s[3]
  326. OP_ir v29.4s, v5.4s, v12.s[3]
  327. OP_rr v30.4s, v6.4s, v12.s[3] // B element 3, A elements 4-7
  328. OP_ii v30.4s, v7.4s, v13.s[3]
  329. OP_ri v31.4s, v6.4s, v13.s[3]
  330. OP_ir v31.4s, v7.4s, v12.s[3]
  331. ld2 {v8.4s, v9.4s}, [pB] // reload B for the following step: v8 real, v9 imaginary
  332. add pB, pB, #32
  333. ld2 {v0.4s, v1.4s}, [pA] // reload A elements 0-3: v0 real, v1 imaginary
  334. add pA, pA, #32
  335. ld2 {v2.4s, v3.4s}, [pA] // reload A elements 4-7: v2 real, v3 imaginary
  336. add pA, pA, #32
  337. .endm
  338. .macro KERNEL8x4_E /* accumulate one 8x4 step from the already loaded v4-v7 and v12, v13; performs no loads and leaves the pointers untouched */
  339. OP_rr v16.4s, v4.4s, v12.s[0] // B element 0, A elements 0-3: v16 real, v17 imaginary
  340. OP_ii v16.4s, v5.4s, v13.s[0]
  341. OP_ri v17.4s, v4.4s, v13.s[0]
  342. OP_ir v17.4s, v5.4s, v12.s[0]
  343. OP_rr v18.4s, v6.4s, v12.s[0] // B element 0, A elements 4-7: v18 real, v19 imaginary
  344. OP_ii v18.4s, v7.4s, v13.s[0]
  345. OP_ri v19.4s, v6.4s, v13.s[0]
  346. OP_ir v19.4s, v7.4s, v12.s[0]
  347. OP_rr v20.4s, v4.4s, v12.s[1] // B element 1, A elements 0-3
  348. OP_ii v20.4s, v5.4s, v13.s[1]
  349. OP_ri v21.4s, v4.4s, v13.s[1]
  350. OP_ir v21.4s, v5.4s, v12.s[1]
  351. OP_rr v22.4s, v6.4s, v12.s[1] // B element 1, A elements 4-7
  352. OP_ii v22.4s, v7.4s, v13.s[1]
  353. OP_ri v23.4s, v6.4s, v13.s[1]
  354. OP_ir v23.4s, v7.4s, v12.s[1]
  355. OP_rr v24.4s, v4.4s, v12.s[2] // B element 2, A elements 0-3
  356. OP_ii v24.4s, v5.4s, v13.s[2]
  357. OP_ri v25.4s, v4.4s, v13.s[2]
  358. OP_ir v25.4s, v5.4s, v12.s[2]
  359. OP_rr v26.4s, v6.4s, v12.s[2] // B element 2, A elements 4-7
  360. OP_ii v26.4s, v7.4s, v13.s[2]
  361. OP_ri v27.4s, v6.4s, v13.s[2]
  362. OP_ir v27.4s, v7.4s, v12.s[2]
  363. OP_rr v28.4s, v4.4s, v12.s[3] // B element 3, A elements 0-3
  364. OP_ii v28.4s, v5.4s, v13.s[3]
  365. OP_ri v29.4s, v4.4s, v13.s[3]
  366. OP_ir v29.4s, v5.4s, v12.s[3]
  367. OP_rr v30.4s, v6.4s, v12.s[3] // B element 3, A elements 4-7
  368. OP_ii v30.4s, v7.4s, v13.s[3]
  369. OP_ri v31.4s, v6.4s, v13.s[3]
  370. OP_ir v31.4s, v7.4s, v12.s[3]
  371. .endm
  372. .macro KERNEL8x4_SUB /* self-contained 8x4 step: load one operand set, then accumulate into v16-v31 (no preloading) */
  373. ld2 {v8.4s, v9.4s}, [pB] // de-interleave four B values: v8 = real parts, v9 = imaginary parts
  374. add pB, pB, #32
  375. ld2 {v0.4s, v1.4s}, [pA] // A elements 0-3: v0 = real parts, v1 = imaginary parts
  376. add pA, pA, #32
  377. ld2 {v2.4s, v3.4s}, [pA] // A elements 4-7: v2 = real parts, v3 = imaginary parts
  378. add pA, pA, #32
  379. OP_rr v16.4s, v0.4s, v8.s[0] // B element 0, A elements 0-3: v16 real, v17 imaginary
  380. OP_ii v16.4s, v1.4s, v9.s[0]
  381. OP_ri v17.4s, v0.4s, v9.s[0]
  382. OP_ir v17.4s, v1.4s, v8.s[0]
  383. OP_rr v18.4s, v2.4s, v8.s[0] // B element 0, A elements 4-7: v18 real, v19 imaginary
  384. OP_ii v18.4s, v3.4s, v9.s[0]
  385. OP_ri v19.4s, v2.4s, v9.s[0]
  386. OP_ir v19.4s, v3.4s, v8.s[0]
  387. OP_rr v20.4s, v0.4s, v8.s[1] // B element 1, A elements 0-3
  388. OP_ii v20.4s, v1.4s, v9.s[1]
  389. OP_ri v21.4s, v0.4s, v9.s[1]
  390. OP_ir v21.4s, v1.4s, v8.s[1]
  391. OP_rr v22.4s, v2.4s, v8.s[1] // B element 1, A elements 4-7
  392. OP_ii v22.4s, v3.4s, v9.s[1]
  393. OP_ri v23.4s, v2.4s, v9.s[1]
  394. OP_ir v23.4s, v3.4s, v8.s[1]
  395. OP_rr v24.4s, v0.4s, v8.s[2] // B element 2, A elements 0-3
  396. OP_ii v24.4s, v1.4s, v9.s[2]
  397. OP_ri v25.4s, v0.4s, v9.s[2]
  398. OP_ir v25.4s, v1.4s, v8.s[2]
  399. OP_rr v26.4s, v2.4s, v8.s[2] // B element 2, A elements 4-7
  400. OP_ii v26.4s, v3.4s, v9.s[2]
  401. OP_ri v27.4s, v2.4s, v9.s[2]
  402. OP_ir v27.4s, v3.4s, v8.s[2]
  403. OP_rr v28.4s, v0.4s, v8.s[3] // B element 3, A elements 0-3
  404. OP_ii v28.4s, v1.4s, v9.s[3]
  405. OP_ri v29.4s, v0.4s, v9.s[3]
  406. OP_ir v29.4s, v1.4s, v8.s[3]
  407. OP_rr v30.4s, v2.4s, v8.s[3] // B element 3, A elements 4-7
  408. OP_ii v30.4s, v3.4s, v9.s[3]
  409. OP_ri v31.4s, v2.4s, v9.s[3]
  410. OP_ir v31.4s, v3.4s, v8.s[3]
  411. .endm
  412. .macro SAVE8x4 /* multiply the 8x4 accumulators by complex alpha and store them to C; C is written without being read */
  413. mov pCRow1, pCRow0 // pCRow1 steps through four positions LDC bytes apart, starting at pCRow0
  414. fmul v0.4s, v16.4s, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  415. fmls v0.4s, v17.4s, alphaV0_I
  416. fmul v1.4s, v16.4s, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  417. fmla v1.4s, v17.4s, alphaV1_R
  418. st2 {v0.4s, v1.4s}, [pCRow1] // st2 re-interleaves real and imaginary parts: elements 0-3
  419. add pCRow2, pCRow1, #32 // elements 4-7 sit 32 bytes further on
  420. fmul v2.4s, v18.4s, alphaV0_R
  421. fmls v2.4s, v19.4s, alphaV0_I
  422. fmul v3.4s, v18.4s, alphaV1_I
  423. fmla v3.4s, v19.4s, alphaV1_R
  424. st2 {v2.4s, v3.4s}, [pCRow2]
  425. add pCRow1, pCRow1, LDC // second position: accumulators v20-v23
  426. fmul v4.4s, v20.4s, alphaV0_R
  427. fmls v4.4s, v21.4s, alphaV0_I
  428. fmul v5.4s, v20.4s, alphaV1_I
  429. fmla v5.4s, v21.4s, alphaV1_R
  430. st2 {v4.4s, v5.4s}, [pCRow1]
  431. add pCRow2, pCRow1, #32
  432. fmul v6.4s, v22.4s, alphaV0_R
  433. fmls v6.4s, v23.4s, alphaV0_I
  434. fmul v7.4s, v22.4s, alphaV1_I
  435. fmla v7.4s, v23.4s, alphaV1_R
  436. st2 {v6.4s, v7.4s}, [pCRow2]
  437. add pCRow1, pCRow1, LDC // third position: accumulators v24-v27
  438. fmul v0.4s, v24.4s, alphaV0_R
  439. fmls v0.4s, v25.4s, alphaV0_I
  440. fmul v1.4s, v24.4s, alphaV1_I
  441. fmla v1.4s, v25.4s, alphaV1_R
  442. st2 {v0.4s, v1.4s}, [pCRow1]
  443. add pCRow2, pCRow1, #32
  444. fmul v2.4s, v26.4s, alphaV0_R
  445. fmls v2.4s, v27.4s, alphaV0_I
  446. fmul v3.4s, v26.4s, alphaV1_I
  447. fmla v3.4s, v27.4s, alphaV1_R
  448. st2 {v2.4s, v3.4s}, [pCRow2]
  449. add pCRow1, pCRow1, LDC // fourth position: accumulators v28-v31
  450. fmul v4.4s, v28.4s, alphaV0_R
  451. fmls v4.4s, v29.4s, alphaV0_I
  452. fmul v5.4s, v28.4s, alphaV1_I
  453. fmla v5.4s, v29.4s, alphaV1_R
  454. st2 {v4.4s, v5.4s}, [pCRow1]
  455. add pCRow2, pCRow1, #32
  456. fmul v6.4s, v30.4s, alphaV0_R
  457. fmls v6.4s, v31.4s, alphaV0_I
  458. fmul v7.4s, v30.4s, alphaV1_I
  459. fmla v7.4s, v31.4s, alphaV1_R
  460. st2 {v6.4s, v7.4s}, [pCRow2]
  461. add pCRow0, pCRow0, #64 // advance past the eight complex singles just written (8 x 8 bytes)
  462. .endm
  463. /******************************************************************************/
  464. .macro INIT4x4 /* clear the eight accumulators of the 4x4 tile */
  465. movi v16.4s, #0
  466. movi v17.4s, #0
  467. movi v20.4s, #0
  468. movi v21.4s, #0
  469. movi v24.4s, #0
  470. movi v25.4s, #0
  471. movi v28.4s, #0
  472. movi v29.4s, #0
  473. .endm
  474. .macro KERNEL4x4_I /* 4x4 step that overwrites the accumulators (fmul start) and preloads v4, v5, v12, v13; presumably the first step of the pipelined loop */
  475. ld2 {v8.4s, v9.4s}, [pB] // de-interleave four B values: v8 = real parts, v9 = imaginary parts
  476. add pB, pB, #32
  477. ld2 {v0.4s, v1.4s}, [pA] // four A values: v0 = real parts, v1 = imaginary parts
  478. add pA, pA, #32
  479. fmul v16.4s, v0.4s, v8.s[0] // B element 0: real accumulator starts as Ar*Br
  480. OP_ii v16.4s, v1.4s, v9.s[0]
  481. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  482. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  483. eor v17.16b, v17.16b, v17.16b // variants that subtract Ar*Bi: clear first, then fmls gives the negated product
  484. fmls v17.4s, v0.4s, v9.s[0]
  485. #else
  486. fmul v17.4s, v0.4s, v9.s[0] // remaining variants: imaginary accumulator starts as Ar*Bi
  487. #endif
  488. OP_ir v17.4s, v1.4s, v8.s[0]
  489. fmul v20.4s, v0.4s, v8.s[1] // B element 1
  490. OP_ii v20.4s, v1.4s, v9.s[1]
  491. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  492. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  493. eor v21.16b, v21.16b, v21.16b
  494. fmls v21.4s, v0.4s, v9.s[1]
  495. #else
  496. fmul v21.4s, v0.4s, v9.s[1]
  497. #endif
  498. OP_ir v21.4s, v1.4s, v8.s[1]
  499. fmul v24.4s, v0.4s, v8.s[2] // B element 2
  500. OP_ii v24.4s, v1.4s, v9.s[2]
  501. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  502. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  503. eor v25.16b, v25.16b, v25.16b
  504. fmls v25.4s, v0.4s, v9.s[2]
  505. #else
  506. fmul v25.4s, v0.4s, v9.s[2]
  507. #endif
  508. OP_ir v25.4s, v1.4s, v8.s[2]
  509. fmul v28.4s, v0.4s, v8.s[3] // B element 3
  510. OP_ii v28.4s, v1.4s, v9.s[3]
  511. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  512. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  513. eor v29.16b, v29.16b, v29.16b
  514. fmls v29.4s, v0.4s, v9.s[3]
  515. #else
  516. fmul v29.4s, v0.4s, v9.s[3]
  517. #endif
  518. OP_ir v29.4s, v1.4s, v8.s[3]
  519. ld2 {v12.4s, v13.4s}, [pB] // preload the second operand set: B into v12 (real) and v13 (imaginary)
  520. add pB, pB, #32
  521. ld2 {v4.4s, v5.4s}, [pA] // preload A into v4 (real) and v5 (imaginary)
  522. add pA, pA, #32
  523. .endm
  524. .macro KERNEL4x4_M1 /* accumulate one 4x4 step from v0, v1, v8, v9 while loading the other operand set v4, v5, v12, v13 */
  525. OP_rr v16.4s, v0.4s, v8.s[0] // B element 0: v16 real, v17 imaginary
  526. OP_ii v16.4s, v1.4s, v9.s[0]
  527. OP_ri v17.4s, v0.4s, v9.s[0]
  528. OP_ir v17.4s, v1.4s, v8.s[0]
  529. ld2 {v12.4s, v13.4s}, [pB] // For next round: loads are interleaved with the arithmetic here
  530. add pB, pB, #32
  531. OP_rr v20.4s, v0.4s, v8.s[1] // B element 1
  532. OP_ii v20.4s, v1.4s, v9.s[1]
  533. OP_ri v21.4s, v0.4s, v9.s[1]
  534. OP_ir v21.4s, v1.4s, v8.s[1]
  535. ld2 {v4.4s, v5.4s}, [pA] // For next round
  536. add pA, pA, #32
  537. OP_rr v24.4s, v0.4s, v8.s[2] // B element 2
  538. OP_ii v24.4s, v1.4s, v9.s[2]
  539. OP_ri v25.4s, v0.4s, v9.s[2]
  540. OP_ir v25.4s, v1.4s, v8.s[2]
  541. prfm PLDL1KEEP, [pA, #512] // prefetch hint for the A stream, 512 bytes ahead of the read pointer
  542. OP_rr v28.4s, v0.4s, v8.s[3] // B element 3
  543. OP_ii v28.4s, v1.4s, v9.s[3]
  544. OP_ri v29.4s, v0.4s, v9.s[3]
  545. OP_ir v29.4s, v1.4s, v8.s[3]
  546. .endm
  547. .macro KERNEL4x4_M2 /* accumulate one 4x4 step from v4, v5, v12, v13 while reloading the first operand set v0, v1, v8, v9 */
  548. OP_rr v16.4s, v4.4s, v12.s[0] // B element 0: v16 real, v17 imaginary
  549. OP_ii v16.4s, v5.4s, v13.s[0]
  550. OP_ri v17.4s, v4.4s, v13.s[0]
  551. OP_ir v17.4s, v5.4s, v12.s[0]
  552. ld2 {v8.4s, v9.4s}, [pB] // For next round
  553. add pB, pB, #32
  554. OP_rr v20.4s, v4.4s, v12.s[1] // B element 1
  555. OP_ii v20.4s, v5.4s, v13.s[1]
  556. OP_ri v21.4s, v4.4s, v13.s[1]
  557. OP_ir v21.4s, v5.4s, v12.s[1]
  558. ld2 {v0.4s, v1.4s}, [pA] // For next round
  559. add pA, pA, #32
  560. OP_rr v24.4s, v4.4s, v12.s[2] // B element 2
  561. OP_ii v24.4s, v5.4s, v13.s[2]
  562. OP_ri v25.4s, v4.4s, v13.s[2]
  563. OP_ir v25.4s, v5.4s, v12.s[2]
  564. prfm PLDL1KEEP, [pB, #512] // prefetch hint for the B stream, 512 bytes ahead of the read pointer
  565. OP_rr v28.4s, v4.4s, v12.s[3] // B element 3
  566. OP_ii v28.4s, v5.4s, v13.s[3]
  567. OP_ri v29.4s, v4.4s, v13.s[3]
  568. OP_ir v29.4s, v5.4s, v12.s[3]
  569. .endm
  570. .macro KERNEL4x4_E /* accumulate one 4x4 step from the already loaded v4, v5, v12, v13; performs no loads and leaves the pointers untouched */
  571. OP_rr v16.4s, v4.4s, v12.s[0] // B element 0: v16 real, v17 imaginary
  572. OP_ii v16.4s, v5.4s, v13.s[0]
  573. OP_ri v17.4s, v4.4s, v13.s[0]
  574. OP_ir v17.4s, v5.4s, v12.s[0]
  575. OP_rr v20.4s, v4.4s, v12.s[1] // B element 1
  576. OP_ii v20.4s, v5.4s, v13.s[1]
  577. OP_ri v21.4s, v4.4s, v13.s[1]
  578. OP_ir v21.4s, v5.4s, v12.s[1]
  579. OP_rr v24.4s, v4.4s, v12.s[2] // B element 2
  580. OP_ii v24.4s, v5.4s, v13.s[2]
  581. OP_ri v25.4s, v4.4s, v13.s[2]
  582. OP_ir v25.4s, v5.4s, v12.s[2]
  583. OP_rr v28.4s, v4.4s, v12.s[3] // B element 3
  584. OP_ii v28.4s, v5.4s, v13.s[3]
  585. OP_ri v29.4s, v4.4s, v13.s[3]
  586. OP_ir v29.4s, v5.4s, v12.s[3]
  587. .endm
  588. .macro KERNEL4x4_SUB /* self-contained 4x4 step: load one operand set, then accumulate (no preloading) */
  589. ld2 {v8.4s, v9.4s}, [pB] // de-interleave four B values: v8 = real parts, v9 = imaginary parts
  590. add pB, pB, #32
  591. ld2 {v0.4s, v1.4s}, [pA] // four A values: v0 = real parts, v1 = imaginary parts
  592. add pA, pA, #32
  593. OP_rr v16.4s, v0.4s, v8.s[0] // B element 0: v16 real, v17 imaginary
  594. OP_ii v16.4s, v1.4s, v9.s[0]
  595. OP_ri v17.4s, v0.4s, v9.s[0]
  596. OP_ir v17.4s, v1.4s, v8.s[0]
  597. OP_rr v20.4s, v0.4s, v8.s[1] // B element 1
  598. OP_ii v20.4s, v1.4s, v9.s[1]
  599. OP_ri v21.4s, v0.4s, v9.s[1]
  600. OP_ir v21.4s, v1.4s, v8.s[1]
  601. OP_rr v24.4s, v0.4s, v8.s[2] // B element 2
  602. OP_ii v24.4s, v1.4s, v9.s[2]
  603. OP_ri v25.4s, v0.4s, v9.s[2]
  604. OP_ir v25.4s, v1.4s, v8.s[2]
  605. OP_rr v28.4s, v0.4s, v8.s[3] // B element 3
  606. OP_ii v28.4s, v1.4s, v9.s[3]
  607. OP_ri v29.4s, v0.4s, v9.s[3]
  608. OP_ir v29.4s, v1.4s, v8.s[3]
  609. .endm
  610. .macro SAVE4x4 /* multiply the 4x4 accumulators by complex alpha and store them to C; C is written without being read */
  611. mov pCRow1, pCRow0 // pCRow1 steps through four positions LDC bytes apart, starting at pCRow0
  612. fmul v0.4s, v16.4s, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  613. fmls v0.4s, v17.4s, alphaV0_I
  614. fmul v1.4s, v16.4s, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  615. fmla v1.4s, v17.4s, alphaV1_R
  616. st2 {v0.4s, v1.4s}, [pCRow1] // st2 re-interleaves real and imaginary parts
  617. add pCRow1, pCRow1, LDC // second position: accumulators v20, v21
  618. fmul v4.4s, v20.4s, alphaV0_R
  619. fmls v4.4s, v21.4s, alphaV0_I
  620. fmul v5.4s, v20.4s, alphaV1_I
  621. fmla v5.4s, v21.4s, alphaV1_R
  622. st2 {v4.4s, v5.4s}, [pCRow1]
  623. add pCRow1, pCRow1, LDC // third position: accumulators v24, v25
  624. fmul v0.4s, v24.4s, alphaV0_R
  625. fmls v0.4s, v25.4s, alphaV0_I
  626. fmul v1.4s, v24.4s, alphaV1_I
  627. fmla v1.4s, v25.4s, alphaV1_R
  628. st2 {v0.4s, v1.4s}, [pCRow1]
  629. add pCRow1, pCRow1, LDC // fourth position: accumulators v28, v29
  630. fmul v4.4s, v28.4s, alphaV0_R
  631. fmls v4.4s, v29.4s, alphaV0_I
  632. fmul v5.4s, v28.4s, alphaV1_I
  633. fmla v5.4s, v29.4s, alphaV1_R
  634. st2 {v4.4s, v5.4s}, [pCRow1]
  635. add pCRow0, pCRow0, #32 // advance past the four complex singles just written (4 x 8 bytes)
  636. .endm
  637. /******************************************************************************/
  638. .macro INIT2x4 /* clear the eight accumulators of the 2x4 tile */
  639. movi v16.4s, #0
  640. movi v17.4s, #0
  641. movi v20.4s, #0
  642. movi v21.4s, #0
  643. movi v24.4s, #0
  644. movi v25.4s, #0
  645. movi v28.4s, #0
  646. movi v29.4s, #0
  647. .endm
  648. .macro KERNEL2x4_SUB /* self-contained 2x4 step: two A values against four B values, using 2-lane vectors */
  649. ld2 {v8.4s, v9.4s}, [pB] // de-interleave four B values: v8 = real parts, v9 = imaginary parts
  650. add pB, pB, #32
  651. ld2 {v0.2s, v1.2s}, [pA] // two A values: v0 = real parts, v1 = imaginary parts
  652. add pA, pA, #16
  653. OP_rr v16.2s, v0.2s, v8.s[0] // B element 0: v16 real, v17 imaginary
  654. OP_ii v16.2s, v1.2s, v9.s[0]
  655. OP_ri v17.2s, v0.2s, v9.s[0]
  656. OP_ir v17.2s, v1.2s, v8.s[0]
  657. OP_rr v20.2s, v0.2s, v8.s[1] // B element 1
  658. OP_ii v20.2s, v1.2s, v9.s[1]
  659. OP_ri v21.2s, v0.2s, v9.s[1]
  660. OP_ir v21.2s, v1.2s, v8.s[1]
  661. OP_rr v24.2s, v0.2s, v8.s[2] // B element 2
  662. OP_ii v24.2s, v1.2s, v9.s[2]
  663. OP_ri v25.2s, v0.2s, v9.s[2]
  664. OP_ir v25.2s, v1.2s, v8.s[2]
  665. OP_rr v28.2s, v0.2s, v8.s[3] // B element 3
  666. OP_ii v28.2s, v1.2s, v9.s[3]
  667. OP_ri v29.2s, v0.2s, v9.s[3]
  668. OP_ir v29.2s, v1.2s, v8.s[3]
  669. .endm
  670. .macro SAVE2x4 /* multiply the 2x4 accumulators by complex alpha and store them to C; C is written without being read */
  671. mov pCRow1, pCRow0 // pCRow1 steps through four positions LDC bytes apart, starting at pCRow0
  672. fmul v0.2s, v16.2s, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  673. fmls v0.2s, v17.2s, alphaV0_I
  674. fmul v1.2s, v16.2s, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  675. fmla v1.2s, v17.2s, alphaV1_R
  676. st2 {v0.2s, v1.2s}, [pCRow1] // st2 re-interleaves real and imaginary parts
  677. add pCRow1, pCRow1, LDC // second position: accumulators v20, v21
  678. fmul v4.2s, v20.2s, alphaV0_R
  679. fmls v4.2s, v21.2s, alphaV0_I
  680. fmul v5.2s, v20.2s, alphaV1_I
  681. fmla v5.2s, v21.2s, alphaV1_R
  682. st2 {v4.2s, v5.2s}, [pCRow1]
  683. add pCRow1, pCRow1, LDC // third position: accumulators v24, v25
  684. fmul v0.2s, v24.2s, alphaV0_R
  685. fmls v0.2s, v25.2s, alphaV0_I
  686. fmul v1.2s, v24.2s, alphaV1_I
  687. fmla v1.2s, v25.2s, alphaV1_R
  688. st2 {v0.2s, v1.2s}, [pCRow1]
  689. add pCRow1, pCRow1, LDC // fourth position: accumulators v28, v29
  690. fmul v4.2s, v28.2s, alphaV0_R
  691. fmls v4.2s, v29.2s, alphaV0_I
  692. fmul v5.2s, v28.2s, alphaV1_I
  693. fmla v5.2s, v29.2s, alphaV1_R
  694. st2 {v4.2s, v5.2s}, [pCRow1]
  695. add pCRow0, pCRow0, #16 // advance past the two complex singles just written (2 x 8 bytes)
  696. .endm
  697. /******************************************************************************/
  698. .macro INIT1x4 /* clear the eight accumulators of the 1x4 tile */
  699. movi v16.4s, #0
  700. movi v17.4s, #0
  701. movi v20.4s, #0
  702. movi v21.4s, #0
  703. movi v24.4s, #0
  704. movi v25.4s, #0
  705. movi v28.4s, #0
  706. movi v29.4s, #0
  707. .endm
  708. .macro KERNEL1x4_SUB /* self-contained 1x4 step: one A value against four B values, using scalar by-element operations */
  709. ld2 {v8.4s, v9.4s}, [pB] // de-interleave four B values: v8 = real parts, v9 = imaginary parts
  710. add pB, pB, #32
  711. ld2 {v0.s, v1.s}[0], [pA] // one A value: real part into lane 0 of v0, imaginary part into lane 0 of v1
  712. add pA, pA, #8
  713. OP_rr s16, s0, v8.s[0] // B element 0: s16 real, s17 imaginary
  714. OP_ii s16, s1, v9.s[0]
  715. OP_ri s17, s0, v9.s[0]
  716. OP_ir s17, s1, v8.s[0]
  717. OP_rr s20, s0, v8.s[1] // B element 1
  718. OP_ii s20, s1, v9.s[1]
  719. OP_ri s21, s0, v9.s[1]
  720. OP_ir s21, s1, v8.s[1]
  721. OP_rr s24, s0, v8.s[2] // B element 2
  722. OP_ii s24, s1, v9.s[2]
  723. OP_ri s25, s0, v9.s[2]
  724. OP_ir s25, s1, v8.s[2]
  725. OP_rr s28, s0, v8.s[3] // B element 3
  726. OP_ii s28, s1, v9.s[3]
  727. OP_ri s29, s0, v9.s[3]
  728. OP_ir s29, s1, v8.s[3]
  729. .endm
  730. .macro SAVE1x4 /* multiply the 1x4 accumulators by complex alpha and store them to C; C is written without being read */
  731. mov pCRow1, pCRow0 // pCRow1 steps through four positions LDC bytes apart, starting at pCRow0
  732. fmul s0, s16, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  733. fmls s0, s17, alphaV0_I
  734. fmul s1, s16, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  735. fmla s1, s17, alphaV1_R
  736. st2 {v0.s, v1.s}[0], [pCRow1] // single-lane st2 writes one real and imaginary pair
  737. add pCRow1, pCRow1, LDC // second position: accumulators s20, s21
  738. fmul s4, s20, alphaV0_R
  739. fmls s4, s21, alphaV0_I
  740. fmul s5, s20, alphaV1_I
  741. fmla s5, s21, alphaV1_R
  742. st2 {v4.s, v5.s}[0], [pCRow1]
  743. add pCRow1, pCRow1, LDC // third position: accumulators s24, s25
  744. fmul s0, s24, alphaV0_R
  745. fmls s0, s25, alphaV0_I
  746. fmul s1, s24, alphaV1_I
  747. fmla s1, s25, alphaV1_R
  748. st2 {v0.s, v1.s}[0], [pCRow1]
  749. add pCRow1, pCRow1, LDC // fourth position: accumulators s28, s29
  750. fmul s4, s28, alphaV0_R
  751. fmls s4, s29, alphaV0_I
  752. fmul s5, s28, alphaV1_I
  753. fmla s5, s29, alphaV1_R
  754. st2 {v4.s, v5.s}[0], [pCRow1]
  755. add pCRow0, pCRow0, #8 // advance past the one complex single just written (8 bytes)
  756. .endm
  757. /******************************************************************************/
  758. .macro INIT8x2 /* clear the eight accumulators v16-v23 of the 8x2 tile */
  759. movi v16.4s, #0
  760. movi v17.4s, #0
  761. movi v18.4s, #0
  762. movi v19.4s, #0
  763. movi v20.4s, #0
  764. movi v21.4s, #0
  765. movi v22.4s, #0
  766. movi v23.4s, #0
  767. .endm
  768. .macro KERNEL8x2_SUB /* self-contained 8x2 step: eight A values against two B values */
  769. ld2 {v8.2s, v9.2s}, [pB] // de-interleave two B values: v8 = real parts, v9 = imaginary parts
  770. add pB, pB, #16
  771. ld2 {v0.4s, v1.4s}, [pA] // A elements 0-3: v0 = real parts, v1 = imaginary parts
  772. add pA, pA, #32
  773. ld2 {v2.4s, v3.4s}, [pA] // A elements 4-7: v2 = real parts, v3 = imaginary parts
  774. add pA, pA, #32
  775. OP_rr v16.4s, v0.4s, v8.s[0] // B element 0, A elements 0-3: v16 real, v17 imaginary
  776. OP_ii v16.4s, v1.4s, v9.s[0]
  777. OP_ri v17.4s, v0.4s, v9.s[0]
  778. OP_ir v17.4s, v1.4s, v8.s[0]
  779. OP_rr v18.4s, v2.4s, v8.s[0] // B element 0, A elements 4-7: v18 real, v19 imaginary
  780. OP_ii v18.4s, v3.4s, v9.s[0]
  781. OP_ri v19.4s, v2.4s, v9.s[0]
  782. OP_ir v19.4s, v3.4s, v8.s[0]
  783. OP_rr v20.4s, v0.4s, v8.s[1] // B element 1, A elements 0-3
  784. OP_ii v20.4s, v1.4s, v9.s[1]
  785. OP_ri v21.4s, v0.4s, v9.s[1]
  786. OP_ir v21.4s, v1.4s, v8.s[1]
  787. OP_rr v22.4s, v2.4s, v8.s[1] // B element 1, A elements 4-7
  788. OP_ii v22.4s, v3.4s, v9.s[1]
  789. OP_ri v23.4s, v2.4s, v9.s[1]
  790. OP_ir v23.4s, v3.4s, v8.s[1]
  791. .endm
  792. .macro SAVE8x2 /* multiply the 8x2 accumulators by complex alpha and store them to C; C is written without being read */
  793. mov pCRow1, pCRow0 // pCRow1 visits two positions LDC bytes apart, starting at pCRow0
  794. fmul v0.4s, v16.4s, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  795. fmls v0.4s, v17.4s, alphaV0_I
  796. fmul v1.4s, v16.4s, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  797. fmla v1.4s, v17.4s, alphaV1_R
  798. st2 {v0.4s, v1.4s}, [pCRow1] // st2 re-interleaves real and imaginary parts: elements 0-3
  799. add pCRow2, pCRow1, #32 // elements 4-7 sit 32 bytes further on
  800. fmul v2.4s, v18.4s, alphaV0_R
  801. fmls v2.4s, v19.4s, alphaV0_I
  802. fmul v3.4s, v18.4s, alphaV1_I
  803. fmla v3.4s, v19.4s, alphaV1_R
  804. st2 {v2.4s, v3.4s}, [pCRow2]
  805. add pCRow1, pCRow1, LDC // second position: accumulators v20-v23
  806. fmul v4.4s, v20.4s, alphaV0_R
  807. fmls v4.4s, v21.4s, alphaV0_I
  808. fmul v5.4s, v20.4s, alphaV1_I
  809. fmla v5.4s, v21.4s, alphaV1_R
  810. st2 {v4.4s, v5.4s}, [pCRow1]
  811. add pCRow2, pCRow1, #32
  812. fmul v6.4s, v22.4s, alphaV0_R
  813. fmls v6.4s, v23.4s, alphaV0_I
  814. fmul v7.4s, v22.4s, alphaV1_I
  815. fmla v7.4s, v23.4s, alphaV1_R
  816. st2 {v6.4s, v7.4s}, [pCRow2]
  817. add pCRow0, pCRow0, #64 // advance past the eight complex singles just written (8 x 8 bytes)
  818. .endm
  819. /******************************************************************************/
  820. .macro INIT4x2 /* clear the four accumulators of the 4x2 tile */
  821. movi v16.4s, #0
  822. movi v17.4s, #0
  823. movi v20.4s, #0
  824. movi v21.4s, #0
  825. .endm
  826. .macro KERNEL4x2_SUB /* self-contained 4x2 step: four A values against two B values */
  827. ld2 {v8.2s, v9.2s}, [pB] // de-interleave two B values: v8 = real parts, v9 = imaginary parts
  828. add pB, pB, #16
  829. ld2 {v0.4s, v1.4s}, [pA] // four A values: v0 = real parts, v1 = imaginary parts
  830. add pA, pA, #32
  831. OP_rr v16.4s, v0.4s, v8.s[0] // B element 0: v16 real, v17 imaginary
  832. OP_ii v16.4s, v1.4s, v9.s[0]
  833. OP_ri v17.4s, v0.4s, v9.s[0]
  834. OP_ir v17.4s, v1.4s, v8.s[0]
  835. OP_rr v20.4s, v0.4s, v8.s[1] // B element 1: v20 real, v21 imaginary
  836. OP_ii v20.4s, v1.4s, v9.s[1]
  837. OP_ri v21.4s, v0.4s, v9.s[1]
  838. OP_ir v21.4s, v1.4s, v8.s[1]
  839. .endm
  840. .macro SAVE4x2 /* multiply the 4x2 accumulators by complex alpha and store them to C; C is written without being read */
  841. mov pCRow1, pCRow0 // pCRow1 visits two positions LDC bytes apart, starting at pCRow0
  842. fmul v0.4s, v16.4s, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  843. fmls v0.4s, v17.4s, alphaV0_I
  844. fmul v1.4s, v16.4s, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  845. fmla v1.4s, v17.4s, alphaV1_R
  846. st2 {v0.4s, v1.4s}, [pCRow1] // st2 re-interleaves real and imaginary parts
  847. add pCRow1, pCRow1, LDC // second position: accumulators v20, v21
  848. fmul v4.4s, v20.4s, alphaV0_R
  849. fmls v4.4s, v21.4s, alphaV0_I
  850. fmul v5.4s, v20.4s, alphaV1_I
  851. fmla v5.4s, v21.4s, alphaV1_R
  852. st2 {v4.4s, v5.4s}, [pCRow1]
  853. add pCRow0, pCRow0, #32 // advance past the four complex singles just written (4 x 8 bytes)
  854. .endm
  855. /******************************************************************************/
  856. .macro INIT2x2 /* clear the four accumulators of the 2x2 tile */
  857. movi v16.4s, #0
  858. movi v17.4s, #0
  859. movi v20.4s, #0
  860. movi v21.4s, #0
  861. .endm
  862. .macro KERNEL2x2_SUB /* self-contained 2x2 step: two A values against two B values, using 2-lane vectors */
  863. ld2 {v8.2s, v9.2s}, [pB] // de-interleave two B values: v8 = real parts, v9 = imaginary parts
  864. add pB, pB, #16
  865. ld2 {v0.2s, v1.2s}, [pA] // two A values: v0 = real parts, v1 = imaginary parts
  866. add pA, pA, #16
  867. OP_rr v16.2s, v0.2s, v8.s[0] // B element 0: v16 real, v17 imaginary
  868. OP_ii v16.2s, v1.2s, v9.s[0]
  869. OP_ri v17.2s, v0.2s, v9.s[0]
  870. OP_ir v17.2s, v1.2s, v8.s[0]
  871. OP_rr v20.2s, v0.2s, v8.s[1] // B element 1: v20 real, v21 imaginary
  872. OP_ii v20.2s, v1.2s, v9.s[1]
  873. OP_ri v21.2s, v0.2s, v9.s[1]
  874. OP_ir v21.2s, v1.2s, v8.s[1]
  875. .endm
  876. .macro SAVE2x2 /* multiply the 2x2 accumulators by complex alpha and store them to C; C is written without being read */
  877. mov pCRow1, pCRow0 // pCRow1 visits two positions LDC bytes apart, starting at pCRow0
  878. fmul v0.2s, v16.2s, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  879. fmls v0.2s, v17.2s, alphaV0_I
  880. fmul v1.2s, v16.2s, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  881. fmla v1.2s, v17.2s, alphaV1_R
  882. st2 {v0.2s, v1.2s}, [pCRow1] // st2 re-interleaves real and imaginary parts
  883. add pCRow1, pCRow1, LDC // second position: accumulators v20, v21
  884. fmul v4.2s, v20.2s, alphaV0_R
  885. fmls v4.2s, v21.2s, alphaV0_I
  886. fmul v5.2s, v20.2s, alphaV1_I
  887. fmla v5.2s, v21.2s, alphaV1_R
  888. st2 {v4.2s, v5.2s}, [pCRow1]
  889. add pCRow0, pCRow0, #16 // advance past the two complex singles just written (2 x 8 bytes)
  890. .endm
  891. /******************************************************************************/
  892. .macro INIT1x2 /* clear the four accumulators of the 1x2 tile */
  893. movi v16.4s, #0
  894. movi v17.4s, #0
  895. movi v20.4s, #0
  896. movi v21.4s, #0
  897. .endm
  898. .macro KERNEL1x2_SUB /* self-contained 1x2 step: one A value against two B values, using scalar by-element operations */
  899. ld2 {v8.2s, v9.2s}, [pB] // de-interleave two B values: v8 = real parts, v9 = imaginary parts
  900. add pB, pB, #16
  901. ld2 {v0.s, v1.s}[0], [pA] // one A value: real part into lane 0 of v0, imaginary part into lane 0 of v1
  902. add pA, pA, #8
  903. OP_rr s16, s0, v8.s[0] // B element 0: s16 real, s17 imaginary
  904. OP_ii s16, s1, v9.s[0]
  905. OP_ri s17, s0, v9.s[0]
  906. OP_ir s17, s1, v8.s[0]
  907. OP_rr s20, s0, v8.s[1] // B element 1: s20 real, s21 imaginary
  908. OP_ii s20, s1, v9.s[1]
  909. OP_ri s21, s0, v9.s[1]
  910. OP_ir s21, s1, v8.s[1]
  911. .endm
  912. .macro SAVE1x2 /* multiply the 1x2 accumulators by complex alpha and store them to C; C is written without being read */
  913. mov pCRow1, pCRow0 // pCRow1 visits two positions LDC bytes apart, starting at pCRow0
  914. fmul s0, s16, alphaV0_R // real out = acc_re*alpha_re - acc_im*alpha_im
  915. fmls s0, s17, alphaV0_I
  916. fmul s1, s16, alphaV1_I // imaginary out = acc_re*alpha_im + acc_im*alpha_re
  917. fmla s1, s17, alphaV1_R
  918. st2 {v0.s, v1.s}[0], [pCRow1] // single-lane st2 writes one real and imaginary pair
  919. add pCRow1, pCRow1, LDC // second position: accumulators s20, s21
  920. fmul s4, s20, alphaV0_R
  921. fmls s4, s21, alphaV0_I
  922. fmul s5, s20, alphaV1_I
  923. fmla s5, s21, alphaV1_R
  924. st2 {v4.s, v5.s}[0], [pCRow1]
  925. add pCRow0, pCRow0, #8 // advance past the one complex single just written (8 bytes)
  926. .endm
  927. /******************************************************************************/
  928. .macro INIT8x1 // zero the four accumulators that KERNEL8x1_SUB adds into
  929. fmov s16, wzr // a scalar fmov from wzr clears the whole vector register
  930. fmov s17, wzr
  931. fmov s18, wzr
  932. fmov s19, s16 // copies the zero from s16; scalar write also clears the upper lanes of v19
  933. .endm
  934. .macro KERNEL8x1_SUB // one K step of the 8x1 tile: 8 values from pA times 1 value from pB, accumulated into v16..v19
  935. ld1 {v8.2s}, [pB] // 2 consecutive floats of B: v8.s[0] and v8.s[1] (presumably real and imaginary part)
  936. add pB, pB, #8 // consumed 8 bytes of B
  937. ld2 {v0.4s, v1.4s}, [pA] // de-interleave 8 floats: even ones to v0, odd ones to v1
  938. add pA, pA, #32
  939. ld2 {v2.4s, v3.4s}, [pA] // next 8 floats of A, split the same way into v2/v3
  940. add pA, pA, #32 // 64 bytes of A consumed per K step in total
  941. OP_rr v16.4s, v0.4s, v8.s[0] // OP_* are defined outside this chunk; presumably fmla/fmls chosen per conjugation variant
  942. OP_ii v16.4s, v1.4s, v8.s[1]
  943. OP_ri v17.4s, v0.4s, v8.s[1]
  944. OP_ir v17.4s, v1.4s, v8.s[0]
  945. OP_rr v18.4s, v2.4s, v8.s[0] // same four products for the second half of the A values
  946. OP_ii v18.4s, v3.4s, v8.s[1]
  947. OP_ri v19.4s, v2.4s, v8.s[1]
  948. OP_ir v19.4s, v3.4s, v8.s[0]
  949. .endm
  950. .macro SAVE8x1 // scale the 8x1 accumulators (v16/v17, v18/v19) by alpha and store 64 contiguous bytes to C; C is written, never read
  951. mov pCRow1, pCRow0 // pCRow1 is the working store pointer; pCRow0 keeps the tile base
  952. fmul v0.4s, v16.4s, alphaV0_R // v0 = v16 * alphaV0_R ...
  953. fmls v0.4s, v17.4s, alphaV0_I // ... - v17 * alphaV0_I
  954. fmul v1.4s, v16.4s, alphaV1_I // v1 = v16 * alphaV1_I ...
  955. fmla v1.4s, v17.4s, alphaV1_R // ... + v17 * alphaV1_R
  956. st2 {v0.4s, v1.4s}, [pCRow1] // interleaved store of the first 32 bytes
  957. add pCRow1, pCRow1, #32 // second half goes right behind the first, in the same line of C
  958. fmul v2.4s, v18.4s, alphaV0_R // same alpha scaling for the second accumulator pair v18/v19
  959. fmls v2.4s, v19.4s, alphaV0_I
  960. fmul v3.4s, v18.4s, alphaV1_I
  961. fmla v3.4s, v19.4s, alphaV1_R
  962. st2 {v2.4s, v3.4s}, [pCRow1]
  963. add pCRow0, pCRow0, #64 // advance the tile base past the 64 bytes written
  964. .endm
  965. /******************************************************************************/
  966. .macro INIT4x1 // zero the two accumulators that KERNEL4x1_SUB adds into
  967. fmov s16, wzr // a scalar fmov from wzr clears the whole vector register
  968. fmov s17, s16 // copies the zero from s16; scalar write also clears the upper lanes of v17
  969. .endm
  970. .macro KERNEL4x1_SUB // one K step of the 4x1 tile: 4 values from pA times 1 value from pB, accumulated into v16/v17
  971. ld2 {v8.s, v9.s}[0], [pB] // 2 floats of B into lane 0 of v8 and v9 (presumably real and imaginary part)
  972. add pB, pB, #8 // consumed 8 bytes of B
  973. ld2 {v0.4s, v1.4s}, [pA] // de-interleave 8 floats: even ones to v0, odd ones to v1
  974. add pA, pA, #32 // consumed 32 bytes of A
  975. OP_rr v16.4s, v0.4s, v8.s[0] // OP_* are defined outside this chunk; presumably fmla/fmls chosen per conjugation variant
  976. OP_ii v16.4s, v1.4s, v9.s[0]
  977. OP_ri v17.4s, v0.4s, v9.s[0]
  978. OP_ir v17.4s, v1.4s, v8.s[0]
  979. .endm
  980. .macro SAVE4x1 // scale the 4x1 accumulators (v16/v17) by alpha and store 32 bytes to C; C is written, never read
  981. mov pCRow1, pCRow0 // pCRow1 is the working store pointer; pCRow0 keeps the tile base
  982. fmul v0.4s, v16.4s, alphaV0_R // v0 = v16 * alphaV0_R ...
  983. fmls v0.4s, v17.4s, alphaV0_I // ... - v17 * alphaV0_I
  984. fmul v1.4s, v16.4s, alphaV1_I // v1 = v16 * alphaV1_I ...
  985. fmla v1.4s, v17.4s, alphaV1_R // ... + v17 * alphaV1_R
  986. st2 {v0.4s, v1.4s}, [pCRow1] // interleaved store of v0/v1 lanes (32 bytes)
  987. add pCRow0, pCRow0, #32 // advance the tile base past the 32 bytes written
  988. .endm
  989. /******************************************************************************/
  990. .macro INIT2x1 // zero the two accumulators that KERNEL2x1_SUB adds into
  991. fmov s16, wzr // a scalar fmov from wzr clears the whole vector register
  992. fmov s17, wzr
  993. .endm
  994. .macro KERNEL2x1_SUB // one K step of the 2x1 tile: 2 values from pA times 1 value from pB, accumulated into v16/v17
  995. ld2 {v8.s, v9.s}[0], [pB] // 2 floats of B into lane 0 of v8 and v9 (presumably real and imaginary part)
  996. add pB, pB, #8 // consumed 8 bytes of B
  997. ld2 {v0.2s, v1.2s}, [pA] // de-interleave 4 floats: even ones to v0, odd ones to v1
  998. add pA, pA, #16 // consumed 16 bytes of A
  999. OP_rr v16.2s, v0.2s, v8.s[0] // OP_* are defined outside this chunk; presumably fmla/fmls chosen per conjugation variant
  1000. OP_ii v16.2s, v1.2s, v9.s[0]
  1001. OP_ri v17.2s, v0.2s, v9.s[0]
  1002. OP_ir v17.2s, v1.2s, v8.s[0]
  1003. .endm
  1004. .macro SAVE2x1 // scale the 2x1 accumulators (v16/v17) by alpha and store 16 bytes to C; C is written, never read
  1005. mov pCRow1, pCRow0 // pCRow1 is the working store pointer; pCRow0 keeps the tile base
  1006. fmul v0.2s, v16.2s, alphaV0_R // v0 = v16 * alphaV0_R ...
  1007. fmls v0.2s, v17.2s, alphaV0_I // ... - v17 * alphaV0_I
  1008. fmul v1.2s, v16.2s, alphaV1_I // v1 = v16 * alphaV1_I ...
  1009. fmla v1.2s, v17.2s, alphaV1_R // ... + v17 * alphaV1_R
  1010. st2 {v0.2s, v1.2s}, [pCRow1] // interleaved store of v0/v1 lanes (16 bytes)
  1011. add pCRow0, pCRow0, #16 // advance the tile base past the 16 bytes written
  1012. .endm
  1013. /******************************************************************************/
  1014. .macro INIT1x1 // zero the two accumulators that KERNEL1x1_SUB adds into
  1015. fmov s16, wzr // a scalar fmov from wzr clears the whole vector register
  1016. fmov s17, wzr
  1017. .endm
  1018. .macro KERNEL1x1_SUB // one K step of the 1x1 tile: 1 value from pA times 1 value from pB, accumulated into s16/s17
  1019. ld2 {v8.s, v9.s}[0], [pB] // 2 floats of B into lane 0 of v8 and v9 (presumably real and imaginary part)
  1020. add pB, pB, #8 // consumed 8 bytes of B
  1021. ld2 {v0.s, v1.s}[0], [pA] // 2 floats of A into lane 0 of v0 and v1
  1022. add pA, pA, #8 // consumed 8 bytes of A
  1023. OP_rr s16, s0, v8.s[0] // OP_* are defined outside this chunk; presumably fmla/fmls chosen per conjugation variant
  1024. OP_ii s16, s1, v9.s[0]
  1025. OP_ri s17, s0, v9.s[0]
  1026. OP_ir s17, s1, v8.s[0]
  1027. .endm
  1028. .macro SAVE1x1 // scale the 1x1 accumulator pair (s16/s17) by alpha and store 8 bytes to C; C is written, never read
  1029. mov pCRow1, pCRow0 // pCRow1 is the working store pointer; pCRow0 keeps the tile base
  1030. fmul s0, s16, alphaV0_R // s0 = s16 * alphaV0_R ...
  1031. fmls s0, s17, alphaV0_I // ... - s17 * alphaV0_I
  1032. fmul s1, s16, alphaV1_I // s1 = s16 * alphaV1_I ...
  1033. fmla s1, s17, alphaV1_R // ... + s17 * alphaV1_R
  1034. st2 {v0.s, v1.s}[0], [pCRow1] // store lane 0 of v0 then v1 (8 bytes)
  1035. add pCRow0, pCRow0, #8 // advance the tile base past the 8 bytes written
  1036. .endm
  1037. /*******************************************************************************
  1038. * End of macro definitions
  1039. *******************************************************************************/
  1040. PROLOGUE
  1041. .align 5 // kernel entry: save registers, latch alpha, scale LDC, then walk N in groups of 4, 2 and 1
  1042. add sp, sp, #-(11 * 16) // reserve 176 bytes of stack for the register save area
  1043. stp d8, d9, [sp, #(0 * 16)] // d8-d15 are callee-saved under AAPCS64
  1044. stp d10, d11, [sp, #(1 * 16)]
  1045. stp d12, d13, [sp, #(2 * 16)]
  1046. stp d14, d15, [sp, #(3 * 16)]
  1047. stp d16, d17, [sp, #(4 * 16)] // d16/d17 are saved too
  1048. stp x18, x19, [sp, #(5 * 16)] // x18-x28 are saved and restored at ctrmm_kernel_L999
  1049. stp x20, x21, [sp, #(6 * 16)]
  1050. stp x22, x23, [sp, #(7 * 16)]
  1051. stp x24, x25, [sp, #(8 * 16)]
  1052. stp x26, x27, [sp, #(9 * 16)]
  1053. str x28, [sp, #(10 * 16)]
  1054. fmov alpha0_R, s0 // alpha arrives in s0 and s1 (presumably real and imaginary part)
  1055. fmov alpha0_I, s1
  1056. fmov alpha1_R, s0 // second copy of the same pair under the alpha1_* names
  1057. fmov alpha1_I, s1
  1058. lsl LDC, LDC, #3 // ldc = ldc * 8 (element count to bytes)
  1059. #if !defined(LEFT)
  1060. neg tempOffset, offset // non-LEFT builds start from the negated offset
  1061. #endif
  1062. mov pB, origPB
  1063. mov counterJ, origN
  1064. asr counterJ, counterJ, #2 // J = J / 4
  1065. cmp counterJ, #0
  1066. ble ctrmm_kernel_L2_BEGIN // no full group of 4 in N: go straight to the N remainder
  1067. /******************************************************************************/
  1068. ctrmm_kernel_L4_BEGIN: // one pass per group of 4 in N; this block handles the 8-wide tiles in M
  1069. mov pCRow0, pC // pCRow0 = C
  1070. add pC, pC, LDC, lsl #2 // pC += 4 * LDC, ready for the next group of 4
  1071. #if defined(LEFT)
  1072. mov tempOffset, offset // LEFT builds restart the offset for every group in N
  1073. #endif
  1074. mov pA, origPA // pA = start of A array
  1075. ctrmm_kernel_L4_M8_BEGIN:
  1076. mov counterI, origM
  1077. asr counterI, counterI, #3 // counterI = M / 8 (number of 8-wide tiles)
  1078. cmp counterI, #0
  1079. ble ctrmm_kernel_L4_M4_BEGIN
  1080. ctrmm_kernel_L4_M8_20:
  1081. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1082. mov pB, origPB
  1083. #else
  1084. mov pB, origPB
  1085. lsl temp, tempOffset, #6 // skip tempOffset K steps of A: 64 bytes per step for an 8-wide tile
  1086. add pA, pA, temp
  1087. lsl temp, tempOffset, #5 // skip tempOffset K steps of B: 32 bytes per step for a 4-wide tile
  1088. add pB, pB, temp
  1089. #endif
  1090. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1091. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1092. #elif defined(LEFT)
  1093. add tempK, tempOffset, #8 // K steps to run = tempOffset + 8 (M width of this tile)
  1094. #else
  1095. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (N width of this tile)
  1096. #endif
  1097. asr counterL , tempK, #1 // counterL = tempK / 2 (K steps are processed in pairs)
  1098. cmp counterL , #2 // is there at least 4 to do?
  1099. blt ctrmm_kernel_L4_M8_32
  1100. KERNEL8x4_I // do one in the K
  1101. KERNEL8x4_M2 // do another in the K
  1102. subs counterL, counterL, #2 // subtract 2: the opening pair above and the closing pair at _22a
  1103. ble ctrmm_kernel_L4_M8_22a
  1104. .align 5
  1105. ctrmm_kernel_L4_M8_22: // steady-state loop, one pair of K steps per pass
  1106. KERNEL8x4_M1
  1107. KERNEL8x4_M2
  1108. subs counterL, counterL, #1
  1109. bgt ctrmm_kernel_L4_M8_22
  1110. ctrmm_kernel_L4_M8_22a: // closing pair of the pipelined sequence
  1111. KERNEL8x4_M1
  1112. KERNEL8x4_E
  1113. b ctrmm_kernel_L4_M8_44
  1114. ctrmm_kernel_L4_M8_32: // fewer than two pairs
  1115. tst counterL, #1 // is there exactly one pair?
  1116. ble ctrmm_kernel_L4_M8_40 // after tst, ble is taken only when the tested bit is clear
  1117. KERNEL8x4_I
  1118. KERNEL8x4_E
  1119. b ctrmm_kernel_L4_M8_44
  1120. ctrmm_kernel_L4_M8_40:
  1121. INIT8x4 // no pair was run, so the accumulators still need initialising
  1122. ctrmm_kernel_L4_M8_44:
  1123. ands counterL , tempK, #1 // odd tempK leaves one K step over
  1124. ble ctrmm_kernel_L4_M8_100
  1125. ctrmm_kernel_L4_M8_46:
  1126. KERNEL8x4_SUB
  1127. ctrmm_kernel_L4_M8_100:
  1128. SAVE8x4
  1129. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1130. sub tempK, origK, tempOffset
  1131. #if defined(LEFT)
  1132. sub tempK, tempK, #8
  1133. #else
  1134. sub tempK, tempK, #4
  1135. #endif
  1136. lsl temp, tempK, #6 // step pA over the K steps that were not run (64 bytes each)
  1137. add pA, pA, temp
  1138. lsl temp, tempK, #5 // step pB over the K steps that were not run (32 bytes each)
  1139. add pB, pB, temp
  1140. #endif
  1141. #if defined(LEFT)
  1142. add tempOffset, tempOffset, #8 // LEFT: the offset grows by the M width just processed
  1143. #endif
  1144. ctrmm_kernel_L4_M8_END:
  1145. subs counterI, counterI, #1
  1146. bne ctrmm_kernel_L4_M8_20
  1147. ctrmm_kernel_L4_M4_BEGIN: // M remainder of the 4-wide N group: optional 4-wide tile
  1148. mov counterI, origM
  1149. tst counterI , #7 // any remainder of M modulo 8?
  1150. ble ctrmm_kernel_L4_END // after tst, ble is taken only when the tested bits are all clear
  1151. tst counterI, #4 // is a 4-wide tile needed?
  1152. ble ctrmm_kernel_L4_M2_BEGIN
  1153. ctrmm_kernel_L4_M4_20:
  1154. INIT4x4
  1155. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1156. mov pB, origPB
  1157. #else
  1158. mov pB, origPB
  1159. lsl temp, tempOffset, #5 // 32 bytes per K step for both A (4 wide) and B (4 wide)
  1160. add pB, pB, temp
  1161. add pA, pA, temp
  1162. #endif
  1163. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1164. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1165. #elif defined(LEFT)
  1166. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (M width of this tile)
  1167. #else
  1168. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (N width of this tile)
  1169. #endif
  1170. asr counterL , tempK, #3 // counterL = tempK / 8
  1171. cmp counterL , #0
  1172. ble ctrmm_kernel_L4_M4_40
  1173. ctrmm_kernel_L4_M4_22: // unrolled loop: 8 K steps per pass
  1174. KERNEL4x4_SUB
  1175. KERNEL4x4_SUB
  1176. KERNEL4x4_SUB
  1177. KERNEL4x4_SUB
  1178. KERNEL4x4_SUB
  1179. KERNEL4x4_SUB
  1180. KERNEL4x4_SUB
  1181. KERNEL4x4_SUB
  1182. subs counterL, counterL, #1
  1183. bgt ctrmm_kernel_L4_M4_22
  1184. ctrmm_kernel_L4_M4_40:
  1185. ands counterL , tempK, #7 // counterL = tempK % 8
  1186. ble ctrmm_kernel_L4_M4_100
  1187. ctrmm_kernel_L4_M4_42: // remainder loop: one K step per pass
  1188. KERNEL4x4_SUB
  1189. subs counterL, counterL, #1
  1190. bgt ctrmm_kernel_L4_M4_42
  1191. ctrmm_kernel_L4_M4_100:
  1192. SAVE4x4
  1193. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1194. sub tempK, origK, tempOffset
  1195. #if defined(LEFT)
  1196. sub tempK, tempK, #4
  1197. #else
  1198. sub tempK, tempK, #4
  1199. #endif
  1200. lsl temp, tempK, #5 // step pA and pB over the K steps that were not run (32 bytes each)
  1201. add pA, pA, temp
  1202. add pB, pB, temp
  1203. #endif
  1204. #if defined(LEFT)
  1205. add tempOffset, tempOffset, #4 // LEFT: the offset grows by the M width just processed
  1206. #endif
  1207. ctrmm_kernel_L4_M4_END:
  1208. ctrmm_kernel_L4_M2_BEGIN: // M remainder of the 4-wide N group: optional 2-wide tile
  1209. mov counterI, origM
  1210. tst counterI , #3 // any remainder of M modulo 4?
  1211. ble ctrmm_kernel_L4_END // after tst, ble is taken only when the tested bits are all clear
  1212. tst counterI, #2 // is a 2-wide tile needed?
  1213. ble ctrmm_kernel_L4_M1_BEGIN
  1214. ctrmm_kernel_L4_M2_20:
  1215. INIT2x4
  1216. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1217. mov pB, origPB
  1218. #else
  1219. mov pB, origPB
  1220. lsl temp, tempOffset, #4 // skip tempOffset K steps of A: 16 bytes per step for a 2-wide tile
  1221. add pA, pA, temp
  1222. lsl temp, tempOffset, #5 // skip tempOffset K steps of B: 32 bytes per step for a 4-wide tile
  1223. add pB, pB, temp
  1224. #endif
  1225. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1226. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1227. #elif defined(LEFT)
  1228. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (M width of this tile)
  1229. #else
  1230. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (N width of this tile)
  1231. #endif
  1232. asr counterL , tempK, #3 // counterL = tempK / 8
  1233. cmp counterL , #0
  1234. ble ctrmm_kernel_L4_M2_40
  1235. ctrmm_kernel_L4_M2_22: // unrolled loop: 8 K steps per pass
  1236. KERNEL2x4_SUB
  1237. KERNEL2x4_SUB
  1238. KERNEL2x4_SUB
  1239. KERNEL2x4_SUB
  1240. KERNEL2x4_SUB
  1241. KERNEL2x4_SUB
  1242. KERNEL2x4_SUB
  1243. KERNEL2x4_SUB
  1244. subs counterL, counterL, #1
  1245. bgt ctrmm_kernel_L4_M2_22
  1246. ctrmm_kernel_L4_M2_40:
  1247. ands counterL , tempK, #7 // counterL = tempK % 8
  1248. ble ctrmm_kernel_L4_M2_100
  1249. ctrmm_kernel_L4_M2_42: // remainder loop: one K step per pass
  1250. KERNEL2x4_SUB
  1251. subs counterL, counterL, #1
  1252. bgt ctrmm_kernel_L4_M2_42
  1253. ctrmm_kernel_L4_M2_100:
  1254. SAVE2x4
  1255. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1256. sub tempK, origK, tempOffset
  1257. #if defined(LEFT)
  1258. sub tempK, tempK, #2
  1259. #else
  1260. sub tempK, tempK, #4
  1261. #endif
  1262. lsl temp, tempK, #4 // step pA over the K steps that were not run (16 bytes each)
  1263. add pA, pA, temp
  1264. lsl temp, tempK, #5 // step pB over the K steps that were not run (32 bytes each)
  1265. add pB, pB, temp
  1266. #endif
  1267. #if defined(LEFT)
  1268. add tempOffset, tempOffset, #2 // LEFT: the offset grows by the M width just processed
  1269. #endif
  1270. ctrmm_kernel_L4_M2_END:
  1271. ctrmm_kernel_L4_M1_BEGIN: // M remainder of the 4-wide N group: optional 1-wide tile, then the end of the N pass
  1272. tst counterI, #1 // is bit 0 of M set (one element of M left)? counterI still holds origM here
  1273. ble ctrmm_kernel_L4_END // after tst, ble is taken only when the tested bit is clear
  1274. ctrmm_kernel_L4_M1_20:
  1275. INIT1x4
  1276. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1277. mov pB, origPB
  1278. #else
  1279. mov pB, origPB
  1280. lsl temp, tempOffset, #5 // skip tempOffset K steps of B: 32 bytes per step for a 4-wide tile
  1281. add pB, pB, temp
  1282. lsl temp, tempOffset, #3 // skip tempOffset K steps of A: 8 bytes per step for a 1-wide tile
  1283. add pA, pA, temp
  1284. #endif
  1285. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1286. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1287. #elif defined(LEFT)
  1288. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (M width of this tile)
  1289. #else
  1290. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (N width of this tile)
  1291. #endif
  1292. asr counterL , tempK, #3 // counterL = tempK / 8
  1293. cmp counterL , #0
  1294. ble ctrmm_kernel_L4_M1_40
  1295. ctrmm_kernel_L4_M1_22: // unrolled loop: 8 K steps per pass
  1296. KERNEL1x4_SUB
  1297. KERNEL1x4_SUB
  1298. KERNEL1x4_SUB
  1299. KERNEL1x4_SUB
  1300. KERNEL1x4_SUB
  1301. KERNEL1x4_SUB
  1302. KERNEL1x4_SUB
  1303. KERNEL1x4_SUB
  1304. subs counterL, counterL, #1
  1305. bgt ctrmm_kernel_L4_M1_22
  1306. ctrmm_kernel_L4_M1_40:
  1307. ands counterL , tempK, #7 // counterL = tempK % 8
  1308. ble ctrmm_kernel_L4_M1_100
  1309. ctrmm_kernel_L4_M1_42: // remainder loop: one K step per pass
  1310. KERNEL1x4_SUB
  1311. subs counterL, counterL, #1
  1312. bgt ctrmm_kernel_L4_M1_42
  1313. ctrmm_kernel_L4_M1_100:
  1314. SAVE1x4
  1315. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1316. sub tempK, origK, tempOffset
  1317. #if defined(LEFT)
  1318. sub tempK, tempK, #1
  1319. #else
  1320. sub tempK, tempK, #4
  1321. #endif
  1322. lsl temp, tempK, #3 // step pA over the K steps that were not run (8 bytes each)
  1323. add pA, pA, temp
  1324. lsl temp, tempK, #5 // step pB over the K steps that were not run (32 bytes each)
  1325. add pB, pB, temp
  1326. #endif
  1327. #if defined(LEFT)
  1328. add tempOffset, tempOffset, #1 // LEFT: the offset grows by the M width just processed
  1329. #endif
  1330. ctrmm_kernel_L4_END:
  1331. lsl temp, origK, #5
  1332. add origPB, origPB, temp // B = B + K * 4 * 8
  1333. #if !defined(LEFT)
  1334. add tempOffset, tempOffset, #4 // non-LEFT: the offset grows by the N width just processed
  1335. #endif
  1336. subs counterJ, counterJ , #1 // j--
  1337. bgt ctrmm_kernel_L4_BEGIN
  1338. /******************************************************************************/
  1339. ctrmm_kernel_L2_BEGIN: // fewer than 4 left in the N direction: handle a group of 2 if bit 1 of N is set
  1340. mov counterJ , origN
  1341. tst counterJ , #3
  1342. ble ctrmm_kernel_L999 // N is a multiple of 4: nothing left to do
  1343. tst counterJ , #2
  1344. ble ctrmm_kernel_L1_BEGIN // no group of 2: only the single leftover remains
  1345. mov pCRow0, pC // pCRow0 = pC
  1346. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  1347. #if defined(LEFT)
  1348. mov tempOffset, offset // LEFT builds restart the offset for every group in N
  1349. #endif
  1350. mov pA, origPA // pA = A
  1351. ctrmm_kernel_L2_M8_BEGIN:
  1352. mov counterI, origM
  1353. asr counterI, counterI, #3 // counterI = M / 8 (number of 8-wide tiles)
  1354. cmp counterI, #0
  1355. ble ctrmm_kernel_L2_M4_BEGIN
  1356. ctrmm_kernel_L2_M8_20:
  1357. INIT8x2
  1358. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1359. mov pB, origPB
  1360. #else
  1361. mov pB, origPB
  1362. lsl temp, tempOffset, #6 // skip tempOffset K steps of A: 64 bytes per step for an 8-wide tile
  1363. add pA, pA, temp
  1364. lsl temp, tempOffset, #4 // skip tempOffset K steps of B: 16 bytes per step for a 2-wide tile
  1365. add pB, pB, temp
  1366. #endif
  1367. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1368. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1369. #elif defined(LEFT)
  1370. add tempK, tempOffset, #8 // K steps to run = tempOffset + 8 (M width of this tile)
  1371. #else
  1372. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (N width of this tile)
  1373. #endif
  1374. asr counterL , tempK, #3 // counterL = tempK / 8
  1375. cmp counterL,#0
  1376. ble ctrmm_kernel_L2_M8_40
  1377. .align 5
  1378. ctrmm_kernel_L2_M8_22: // unrolled loop: 8 K steps per pass
  1379. KERNEL8x2_SUB
  1380. KERNEL8x2_SUB
  1381. KERNEL8x2_SUB
  1382. KERNEL8x2_SUB
  1383. KERNEL8x2_SUB
  1384. KERNEL8x2_SUB
  1385. KERNEL8x2_SUB
  1386. KERNEL8x2_SUB
  1387. subs counterL, counterL, #1
  1388. bgt ctrmm_kernel_L2_M8_22
  1389. ctrmm_kernel_L2_M8_40:
  1390. ands counterL , tempK, #7 // counterL = tempK % 8
  1391. ble ctrmm_kernel_L2_M8_100
  1392. ctrmm_kernel_L2_M8_42: // remainder loop: one K step per pass
  1393. KERNEL8x2_SUB
  1394. subs counterL, counterL, #1
  1395. bgt ctrmm_kernel_L2_M8_42
  1396. ctrmm_kernel_L2_M8_100:
  1397. SAVE8x2
  1398. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1399. sub tempK, origK, tempOffset
  1400. #if defined(LEFT)
  1401. sub tempK, tempK, #8
  1402. #else
  1403. sub tempK, tempK, #2
  1404. #endif
  1405. lsl temp, tempK, #6 // step pA over the K steps that were not run (64 bytes each)
  1406. add pA, pA, temp
  1407. lsl temp, tempK, #4 // step pB over the K steps that were not run (16 bytes each)
  1408. add pB, pB, temp
  1409. #endif
  1410. #if defined(LEFT)
  1411. add tempOffset, tempOffset, #8 // LEFT: the offset grows by the M width just processed
  1412. #endif
  1413. ctrmm_kernel_L2_M8_END:
  1414. subs counterI, counterI, #1
  1415. bgt ctrmm_kernel_L2_M8_20
  1416. ctrmm_kernel_L2_M4_BEGIN: // M remainder of the 2-wide N group: optional 4-wide tile
  1417. mov counterI, origM
  1418. tst counterI , #7 // any remainder of M modulo 8?
  1419. ble ctrmm_kernel_L2_END // after tst, ble is taken only when the tested bits are all clear
  1420. tst counterI, #4 // is a 4-wide tile needed?
  1421. ble ctrmm_kernel_L2_M2_BEGIN
  1422. ctrmm_kernel_L2_M4_20:
  1423. INIT4x2
  1424. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1425. mov pB, origPB
  1426. #else
  1427. mov pB, origPB
  1428. lsl temp, tempOffset, #4 // skip tempOffset K steps of B: 16 bytes per step for a 2-wide tile
  1429. add pB, pB, temp
  1430. lsl temp, tempOffset, #5 // skip tempOffset K steps of A: 32 bytes per step for a 4-wide tile
  1431. add pA, pA, temp
  1432. #endif
  1433. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1434. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1435. #elif defined(LEFT)
  1436. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (M width of this tile)
  1437. #else
  1438. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (N width of this tile)
  1439. #endif
  1440. asr counterL , tempK, #3 // counterL = tempK / 8
  1441. cmp counterL,#0
  1442. ble ctrmm_kernel_L2_M4_40
  1443. .align 5
  1444. ctrmm_kernel_L2_M4_22: // unrolled loop: 8 K steps per pass
  1445. KERNEL4x2_SUB
  1446. KERNEL4x2_SUB
  1447. KERNEL4x2_SUB
  1448. KERNEL4x2_SUB
  1449. KERNEL4x2_SUB
  1450. KERNEL4x2_SUB
  1451. KERNEL4x2_SUB
  1452. KERNEL4x2_SUB
  1453. subs counterL, counterL, #1
  1454. bgt ctrmm_kernel_L2_M4_22
  1455. ctrmm_kernel_L2_M4_40:
  1456. ands counterL , tempK, #7 // counterL = tempK % 8
  1457. ble ctrmm_kernel_L2_M4_100
  1458. ctrmm_kernel_L2_M4_42: // remainder loop: one K step per pass
  1459. KERNEL4x2_SUB
  1460. subs counterL, counterL, #1
  1461. bgt ctrmm_kernel_L2_M4_42
  1462. ctrmm_kernel_L2_M4_100:
  1463. SAVE4x2
  1464. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1465. sub tempK, origK, tempOffset
  1466. #if defined(LEFT)
  1467. sub tempK, tempK, #4
  1468. #else
  1469. sub tempK, tempK, #2
  1470. #endif
  1471. lsl temp, tempK, #5 // step pA over the K steps that were not run (32 bytes each)
  1472. add pA, pA, temp
  1473. lsl temp, tempK, #4 // step pB over the K steps that were not run (16 bytes each)
  1474. add pB, pB, temp
  1475. #endif
  1476. #if defined(LEFT)
  1477. add tempOffset, tempOffset, #4 // LEFT: the offset grows by the M width just processed
  1478. #endif
  1479. ctrmm_kernel_L2_M4_END:
  1480. ctrmm_kernel_L2_M2_BEGIN: // M remainder of the 2-wide N group: optional 2-wide tile
  1481. mov counterI, origM
  1482. tst counterI , #3 // any remainder of M modulo 4?
  1483. ble ctrmm_kernel_L2_END // after tst, ble is taken only when the tested bits are all clear
  1484. tst counterI, #2 // is a 2-wide tile needed?
  1485. ble ctrmm_kernel_L2_M1_BEGIN
  1486. ctrmm_kernel_L2_M2_20:
  1487. INIT2x2
  1488. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1489. mov pB, origPB
  1490. #else
  1491. mov pB, origPB
  1492. lsl temp, tempOffset, #4 // skip tempOffset K steps of B: 16 bytes per step for a 2-wide tile
  1493. add pB, pB, temp
  1494. lsl temp, tempOffset, #4 // skip tempOffset K steps of A: 16 bytes per step for a 2-wide tile
  1495. add pA, pA, temp
  1496. #endif
  1497. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1498. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1499. #elif defined(LEFT)
  1500. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (M width of this tile)
  1501. #else
  1502. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (N width of this tile)
  1503. #endif
  1504. asr counterL , tempK, #3 // counterL = tempK / 8
  1505. cmp counterL,#0
  1506. ble ctrmm_kernel_L2_M2_40
  1507. ctrmm_kernel_L2_M2_22: // unrolled loop: 8 K steps per pass
  1508. KERNEL2x2_SUB
  1509. KERNEL2x2_SUB
  1510. KERNEL2x2_SUB
  1511. KERNEL2x2_SUB
  1512. KERNEL2x2_SUB
  1513. KERNEL2x2_SUB
  1514. KERNEL2x2_SUB
  1515. KERNEL2x2_SUB
  1516. subs counterL, counterL, #1
  1517. bgt ctrmm_kernel_L2_M2_22
  1518. ctrmm_kernel_L2_M2_40:
  1519. ands counterL , tempK, #7 // counterL = tempK % 8
  1520. ble ctrmm_kernel_L2_M2_100
  1521. ctrmm_kernel_L2_M2_42: // remainder loop: one K step per pass
  1522. KERNEL2x2_SUB
  1523. subs counterL, counterL, #1
  1524. bgt ctrmm_kernel_L2_M2_42
  1525. ctrmm_kernel_L2_M2_100:
  1526. SAVE2x2
  1527. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1528. sub tempK, origK, tempOffset
  1529. #if defined(LEFT)
  1530. sub tempK, tempK, #2
  1531. #else
  1532. sub tempK, tempK, #2
  1533. #endif
  1534. lsl temp, tempK, #4 // step pA over the K steps that were not run (16 bytes each)
  1535. add pA, pA, temp
  1536. lsl temp, tempK, #4 // step pB over the K steps that were not run (16 bytes each)
  1537. add pB, pB, temp
  1538. #endif
  1539. #if defined(LEFT)
  1540. add tempOffset, tempOffset, #2 // LEFT: the offset grows by the M width just processed
  1541. #endif
  1542. ctrmm_kernel_L2_M2_END:
  1543. ctrmm_kernel_L2_M1_BEGIN: // M remainder of the 2-wide N group: optional 1-wide tile, then the end of the N pass
  1544. tst counterI, #1 // is bit 0 of M set (one element of M left)? counterI still holds origM here
  1545. ble ctrmm_kernel_L2_END // after tst, ble is taken only when the tested bit is clear
  1546. ctrmm_kernel_L2_M1_20:
  1547. INIT1x2
  1548. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1549. mov pB, origPB
  1550. #else
  1551. mov pB, origPB
  1552. lsl temp, tempOffset, #4 // skip tempOffset K steps of B: 16 bytes per step for a 2-wide tile
  1553. add pB, pB, temp
  1554. lsl temp, tempOffset, #3 // skip tempOffset K steps of A: 8 bytes per step for a 1-wide tile
  1555. add pA, pA, temp
  1556. #endif
  1557. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1558. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1559. #elif defined(LEFT)
  1560. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (M width of this tile)
  1561. #else
  1562. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (N width of this tile)
  1563. #endif
  1564. asr counterL , tempK, #3 // counterL = tempK / 8
  1565. cmp counterL, #0
  1566. ble ctrmm_kernel_L2_M1_40
  1567. ctrmm_kernel_L2_M1_22: // unrolled loop: 8 K steps per pass
  1568. KERNEL1x2_SUB
  1569. KERNEL1x2_SUB
  1570. KERNEL1x2_SUB
  1571. KERNEL1x2_SUB
  1572. KERNEL1x2_SUB
  1573. KERNEL1x2_SUB
  1574. KERNEL1x2_SUB
  1575. KERNEL1x2_SUB
  1576. subs counterL, counterL, #1
  1577. bgt ctrmm_kernel_L2_M1_22
  1578. ctrmm_kernel_L2_M1_40:
  1579. ands counterL , tempK, #7 // counterL = tempK % 8
  1580. ble ctrmm_kernel_L2_M1_100
  1581. ctrmm_kernel_L2_M1_42: // remainder loop: one K step per pass
  1582. KERNEL1x2_SUB
  1583. subs counterL, counterL, #1
  1584. bgt ctrmm_kernel_L2_M1_42
  1585. ctrmm_kernel_L2_M1_100:
  1586. SAVE1x2
  1587. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1588. sub tempK, origK, tempOffset
  1589. #if defined(LEFT)
  1590. sub tempK, tempK, #1
  1591. #else
  1592. sub tempK, tempK, #2
  1593. #endif
  1594. lsl temp, tempK, #3 // step pA over the K steps that were not run (8 bytes each)
  1595. add pA, pA, temp
  1596. lsl temp, tempK, #4 // step pB over the K steps that were not run (16 bytes each)
  1597. add pB, pB, temp
  1598. #endif
  1599. #if defined(LEFT)
  1600. add tempOffset, tempOffset, #1 // LEFT: the offset grows by the M width just processed
  1601. #endif
  1602. ctrmm_kernel_L2_END:
  1603. #if !defined(LEFT)
  1604. add tempOffset, tempOffset, #2 // non-LEFT: the offset grows by the N width just processed
  1605. #endif
  1606. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1607. /******************************************************************************/
  1608. ctrmm_kernel_L1_BEGIN: // last single item in the N direction, taken only when N is odd; this block handles the 8-wide tiles in M
  1609. mov counterJ , origN
  1610. tst counterJ , #1
  1611. ble ctrmm_kernel_L999 // done
  1612. mov pCRow0, pC // pCRow0 = C
  1613. add pC , pC , LDC // Update pC to point to next
  1614. #if defined(LEFT)
  1615. mov tempOffset, offset // LEFT builds restart the offset for every group in N
  1616. #endif
  1617. mov pA, origPA // pA = A
  1618. ctrmm_kernel_L1_M8_BEGIN:
  1619. mov counterI, origM
  1620. asr counterI, counterI, #3 // counterI = M / 8 (number of 8-wide tiles)
  1621. cmp counterI, #0
  1622. ble ctrmm_kernel_L1_M4_BEGIN
  1623. ctrmm_kernel_L1_M8_20:
  1624. INIT8x1
  1625. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1626. mov pB, origPB
  1627. #else
  1628. mov pB, origPB
  1629. lsl temp, tempOffset, #6 // skip tempOffset K steps of A: 64 bytes per step for an 8-wide tile
  1630. add pA, pA, temp
  1631. lsl temp, tempOffset, #3 // skip tempOffset K steps of B: 8 bytes per step for a 1-wide tile
  1632. add pB, pB, temp
  1633. #endif
  1634. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1635. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1636. #elif defined(LEFT)
  1637. add tempK, tempOffset, #8 // K steps to run = tempOffset + 8 (M width of this tile)
  1638. #else
  1639. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (N width of this tile)
  1640. #endif
  1641. asr counterL , tempK, #3 // counterL = tempK / 8
  1642. cmp counterL , #0
  1643. ble ctrmm_kernel_L1_M8_40
  1644. .align 5
  1645. ctrmm_kernel_L1_M8_22: // unrolled loop: 8 K steps per pass
  1646. KERNEL8x1_SUB
  1647. KERNEL8x1_SUB
  1648. KERNEL8x1_SUB
  1649. KERNEL8x1_SUB
  1650. KERNEL8x1_SUB
  1651. KERNEL8x1_SUB
  1652. KERNEL8x1_SUB
  1653. KERNEL8x1_SUB
  1654. subs counterL, counterL, #1
  1655. bgt ctrmm_kernel_L1_M8_22
  1656. ctrmm_kernel_L1_M8_40:
  1657. ands counterL , tempK, #7 // counterL = tempK % 8
  1658. ble ctrmm_kernel_L1_M8_100
  1659. ctrmm_kernel_L1_M8_42: // remainder loop: one K step per pass
  1660. KERNEL8x1_SUB
  1661. subs counterL, counterL, #1
  1662. bgt ctrmm_kernel_L1_M8_42
  1663. ctrmm_kernel_L1_M8_100:
  1664. SAVE8x1
  1665. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1666. sub tempK, origK, tempOffset
  1667. #if defined(LEFT)
  1668. sub tempK, tempK, #8
  1669. #else
  1670. sub tempK, tempK, #1
  1671. #endif
  1672. lsl temp, tempK, #6 // step pA over the K steps that were not run (64 bytes each)
  1673. add pA, pA, temp
  1674. lsl temp, tempK, #3 // step pB over the K steps that were not run (8 bytes each)
  1675. add pB, pB, temp
  1676. #endif
  1677. #if defined(LEFT)
  1678. add tempOffset, tempOffset, #8 // LEFT: the offset grows by the M width just processed
  1679. #endif
  1680. ctrmm_kernel_L1_M8_END:
  1681. subs counterI, counterI, #1
  1682. bgt ctrmm_kernel_L1_M8_20
  1683. ctrmm_kernel_L1_M4_BEGIN: // M remainder of the 1-wide N pass: optional 4-wide tile
  1684. mov counterI, origM
  1685. tst counterI , #7 // any remainder of M modulo 8?
  1686. ble ctrmm_kernel_L1_END // after tst, ble is taken only when the tested bits are all clear
  1687. tst counterI, #4 // is a 4-wide tile needed?
  1688. ble ctrmm_kernel_L1_M2_BEGIN
  1689. ctrmm_kernel_L1_M4_20:
  1690. INIT4x1
  1691. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1692. mov pB, origPB
  1693. #else
  1694. mov pB, origPB
  1695. lsl temp, tempOffset, #3 // skip tempOffset K steps of B: 8 bytes per step for a 1-wide tile
  1696. add pB, pB, temp
  1697. lsl temp, tempOffset, #5 // skip tempOffset K steps of A: 32 bytes per step for a 4-wide tile
  1698. add pA, pA, temp
  1699. #endif
  1700. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1701. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1702. #elif defined(LEFT)
  1703. add tempK, tempOffset, #4 // K steps to run = tempOffset + 4 (M width of this tile)
  1704. #else
  1705. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (N width of this tile)
  1706. #endif
  1707. asr counterL , tempK, #3 // counterL = tempK / 8
  1708. cmp counterL , #0
  1709. ble ctrmm_kernel_L1_M4_40
  1710. .align 5
  1711. ctrmm_kernel_L1_M4_22: // unrolled loop: 8 K steps per pass
  1712. KERNEL4x1_SUB
  1713. KERNEL4x1_SUB
  1714. KERNEL4x1_SUB
  1715. KERNEL4x1_SUB
  1716. KERNEL4x1_SUB
  1717. KERNEL4x1_SUB
  1718. KERNEL4x1_SUB
  1719. KERNEL4x1_SUB
  1720. subs counterL, counterL, #1
  1721. bgt ctrmm_kernel_L1_M4_22
  1722. ctrmm_kernel_L1_M4_40:
  1723. ands counterL , tempK, #7 // counterL = tempK % 8
  1724. ble ctrmm_kernel_L1_M4_100
  1725. ctrmm_kernel_L1_M4_42: // remainder loop: one K step per pass
  1726. KERNEL4x1_SUB
  1727. subs counterL, counterL, #1
  1728. bgt ctrmm_kernel_L1_M4_42
  1729. ctrmm_kernel_L1_M4_100:
  1730. SAVE4x1
  1731. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1732. sub tempK, origK, tempOffset
  1733. #if defined(LEFT)
  1734. sub tempK, tempK, #4
  1735. #else
  1736. sub tempK, tempK, #1
  1737. #endif
  1738. lsl temp, tempK, #5 // step pA over the K steps that were not run (32 bytes each)
  1739. add pA, pA, temp
  1740. lsl temp, tempK, #3 // step pB over the K steps that were not run (8 bytes each)
  1741. add pB, pB, temp
  1742. #endif
  1743. #if defined(LEFT)
  1744. add tempOffset, tempOffset, #4 // LEFT: the offset grows by the M width just processed
  1745. #endif
  1746. ctrmm_kernel_L1_M4_END:
  1747. ctrmm_kernel_L1_M2_BEGIN: // M remainder of the 1-wide N pass: optional 2-wide tile
  1748. mov counterI, origM
  1749. tst counterI , #3 // any remainder of M modulo 4?
  1750. ble ctrmm_kernel_L1_END // after tst, ble is taken only when the tested bits are all clear
  1751. tst counterI, #2 // is a 2-wide tile needed?
  1752. ble ctrmm_kernel_L1_M1_BEGIN
  1753. ctrmm_kernel_L1_M2_20:
  1754. INIT2x1
  1755. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1756. mov pB, origPB
  1757. #else
  1758. mov pB, origPB
  1759. lsl temp, tempOffset, #3 // skip tempOffset K steps of B: 8 bytes per step for a 1-wide tile
  1760. add pB, pB, temp
  1761. lsl temp, tempOffset, #4 // skip tempOffset K steps of A: 16 bytes per step for a 2-wide tile
  1762. add pA, pA, temp
  1763. #endif
  1764. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1765. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1766. #elif defined(LEFT)
  1767. add tempK, tempOffset, #2 // K steps to run = tempOffset + 2 (M width of this tile)
  1768. #else
  1769. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (N width of this tile)
  1770. #endif
  1771. asr counterL , tempK, #3 // counterL = tempK / 8
  1772. cmp counterL , #0
  1773. ble ctrmm_kernel_L1_M2_40
  1774. ctrmm_kernel_L1_M2_22: // unrolled loop: 8 K steps per pass
  1775. KERNEL2x1_SUB
  1776. KERNEL2x1_SUB
  1777. KERNEL2x1_SUB
  1778. KERNEL2x1_SUB
  1779. KERNEL2x1_SUB
  1780. KERNEL2x1_SUB
  1781. KERNEL2x1_SUB
  1782. KERNEL2x1_SUB
  1783. subs counterL, counterL, #1
  1784. bgt ctrmm_kernel_L1_M2_22
  1785. ctrmm_kernel_L1_M2_40:
  1786. ands counterL , tempK, #7 // counterL = tempK % 8
  1787. ble ctrmm_kernel_L1_M2_100
  1788. ctrmm_kernel_L1_M2_42: // remainder loop: one K step per pass
  1789. KERNEL2x1_SUB
  1790. subs counterL, counterL, #1
  1791. bgt ctrmm_kernel_L1_M2_42
  1792. ctrmm_kernel_L1_M2_100:
  1793. SAVE2x1
  1794. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1795. sub tempK, origK, tempOffset
  1796. #if defined(LEFT)
  1797. sub tempK, tempK, #2
  1798. #else
  1799. sub tempK, tempK, #1
  1800. #endif
  1801. lsl temp, tempK, #4 // step pA over the K steps that were not run (16 bytes each)
  1802. add pA, pA, temp
  1803. lsl temp, tempK, #3 // step pB over the K steps that were not run (8 bytes each)
  1804. add pB, pB, temp
  1805. #endif
  1806. #if defined(LEFT)
  1807. add tempOffset, tempOffset, #2 // LEFT: the offset grows by the M width just processed
  1808. #endif
  1809. ctrmm_kernel_L1_M2_END:
  1810. ctrmm_kernel_L1_M1_BEGIN: // M remainder of the 1-wide N pass: optional final 1x1 tile
  1811. tst counterI, #1 // is bit 0 of M set (one element of M left)? counterI still holds origM here
  1812. ble ctrmm_kernel_L1_END // after tst, ble is taken only when the tested bit is clear
  1813. ctrmm_kernel_L1_M1_20:
  1814. INIT1x1
  1815. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1816. mov pB, origPB
  1817. #else
  1818. mov pB, origPB
  1819. lsl temp, tempOffset, #3 // skip tempOffset K steps of B: 8 bytes per step for a 1-wide tile
  1820. add pB, pB, temp
  1821. lsl temp, tempOffset, #3 // skip tempOffset K steps of A: 8 bytes per step for a 1-wide tile
  1822. add pA, pA, temp
  1823. #endif
  1824. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1825. sub tempK, origK, tempOffset // K steps to run = origK - tempOffset
  1826. #elif defined(LEFT)
  1827. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (M width of this tile)
  1828. #else
  1829. add tempK, tempOffset, #1 // K steps to run = tempOffset + 1 (N width of this tile)
  1830. #endif
  1831. asr counterL , tempK, #3 // counterL = tempK / 8
  1832. cmp counterL , #0
  1833. ble ctrmm_kernel_L1_M1_40
  1834. ctrmm_kernel_L1_M1_22: // unrolled loop: 8 K steps per pass
  1835. KERNEL1x1_SUB
  1836. KERNEL1x1_SUB
  1837. KERNEL1x1_SUB
  1838. KERNEL1x1_SUB
  1839. KERNEL1x1_SUB
  1840. KERNEL1x1_SUB
  1841. KERNEL1x1_SUB
  1842. KERNEL1x1_SUB
  1843. subs counterL, counterL, #1
  1844. bgt ctrmm_kernel_L1_M1_22
  1845. ctrmm_kernel_L1_M1_40:
  1846. ands counterL , tempK, #7 // counterL = tempK % 8
  1847. ble ctrmm_kernel_L1_M1_100
  1848. ctrmm_kernel_L1_M1_42: // remainder loop: one K step per pass
  1849. KERNEL1x1_SUB
  1850. subs counterL, counterL, #1
  1851. bgt ctrmm_kernel_L1_M1_42
  1852. ctrmm_kernel_L1_M1_100:
  1853. SAVE1x1 // no pA/pB/tempOffset fix-up follows: control falls straight into the epilogue
  1854. ctrmm_kernel_L1_END:
  1855. ctrmm_kernel_L999: // common exit: restore the registers saved at entry and return 0
  1856. mov x0, #0 // set return value
  1857. ldp d8, d9, [sp, #(0 * 16)] // reload in the same slots the entry code stored to
  1858. ldp d10, d11, [sp, #(1 * 16)]
  1859. ldp d12, d13, [sp, #(2 * 16)]
  1860. ldp d14, d15, [sp, #(3 * 16)]
  1861. ldp d16, d17, [sp, #(4 * 16)]
  1862. ldp x18, x19, [sp, #(5 * 16)]
  1863. ldp x20, x21, [sp, #(6 * 16)]
  1864. ldp x22, x23, [sp, #(7 * 16)]
  1865. ldp x24, x25, [sp, #(8 * 16)]
  1866. ldp x26, x27, [sp, #(9 * 16)]
  1867. ldr x28, [sp, #(10 * 16)]
  1868. add sp, sp, #(11*16) // release the 176-byte save area
  1869. ret
  1870. EPILOGUE