You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_8x4.S 44 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  // Incoming arguments, bound to the registers listed in the prototype
  // comment above (x0, x1, x2, x3, x4, x5, x6).
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  // General-purpose working registers: a temporary, three loop counters,
  // the running B pointer, four C pointers and the running A pointer.
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  // alpha (real / imaginary) held in general-purpose registers; every SAVE
  // macro below copies them into s10 / s11 with fmov before scaling.
  // NOTE(review): x18 is reserved as a platform register on some AArch64
  // targets (e.g. Apple, Windows) -- confirm that using w18 is acceptable on
  // every supported platform; the prologue/epilogue is outside this view.
  48. #define alphaR w17
  49. #define alphaI w18
  // Scalar (sN) and lane (vN.s[0]) spellings of the same two SIMD registers.
  50. #define alpha0_R s10
  51. #define alphaV0_R v10.s[0]
  52. #define alpha0_I s11
  53. #define alphaV0_I v11.s[0]
  // Prefetch distances in bytes, used as prfm offsets from pA, pB and pCRowN.
  54. #define A_PRE_SIZE 2560
  55. #define B_PRE_SIZE 448
  56. #define C_PRE_SIZE 128
  // OP_xx selects fmla or fmls for each of the four partial products of a
  // complex multiply. With A = Ar + i*Ai (from pA) and B = Br + i*Bi (from pB)
  // the kernels accumulate
  //   real += OP_rr(Ar*Br), OP_ii(Ai*Bi)
  //   imag += OP_ri(Ar*Bi), OP_ir(Ai*Br)
  // The build variant macro decides the signs:
  // real = Ar*Br - Ai*Bi, imag = Ar*Bi + Ai*Br   (A * B)
  57. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  58. #define OP_rr fmla
  59. #define OP_ii fmls
  60. #define OP_ri fmla
  61. #define OP_ir fmla
  // real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi   (A * conj(B))
  62. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  63. #define OP_rr fmla
  64. #define OP_ii fmla
  65. #define OP_ri fmls
  66. #define OP_ir fmla
  // real = Ar*Br + Ai*Bi, imag = Ar*Bi - Ai*Br   (conj(A) * B)
  67. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  68. #define OP_rr fmla
  69. #define OP_ii fmla
  70. #define OP_ri fmla
  71. #define OP_ir fmls
  // real = Ar*Br - Ai*Bi, imag = -Ar*Bi - Ai*Br  (conj(A) * conj(B))
  72. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  73. #define OP_rr fmla
  74. #define OP_ii fmls
  75. #define OP_ri fmls
  76. #define OP_ir fmls
  77. #endif
  78. // 00 origM
  79. // 01 origN
  80. // 02 origK
  81. // 03 origPA
  82. // 04 origPB
  83. // 05 pC
  84. // 06 origLDC -> LDC
  85. // 07 offset -> temp
  86. // 08 counterL
  87. // 09 counterI
  88. // 10 counterJ
  89. // 11 pB
  90. // 12 pCRow0
  91. // 13 pCRow1
  92. // 14 pCRow2
  93. // 15 pCRow3
  94. // 16 pA
  95. // 17 alphaR
  96. // 18 must save alphaI
  97. // 19 must save
  98. // 20 must save
  99. // 21 must save
  100. // 22 must save
  101. // 23 must save
  102. // 24 must save
  103. // 25 must save
  104. // 26 must save
  105. // 27 must save
  106. // 28 must save
  107. // 29 frame
  108. // 30 link
  109. // 31 sp
  110. //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
  111. //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
  112. //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
  113. //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
  114. //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
  115. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
  116. //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
  117. //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
  118. //v08 must save pB0_00_R, pB0_01_R
  119. //v09 must save pB0_00_I, pB0_01_I
  120. //v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R
  121. //v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I
  122. //v12 must save pB1_00_R, pB1_01_R
  123. //v13 must save pB1_00_I, pB1_01_I
  124. //v14 must save pB1_02_R, pB1_03_R
  125. //v15 must save pB1_02_I, pB1_03_I
  126. //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
  127. //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
  128. //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
  129. //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
  130. //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
  131. //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
  132. //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
  133. //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
  134. //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
  135. //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
  136. //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
  137. //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
  138. //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
  139. //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
  140. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
  141. //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
  142. /*******************************************************************************
  143. * Macro definitions
  144. *******************************************************************************/
  // INIT8x4: clear the sixteen accumulators v16-v31 of the 8x4 tile.
  // Per the register map above, even-numbered registers accumulate real
  // parts and odd-numbered registers imaginary parts.
  // Each register is zeroed either from wzr or by copying a register that
  // was zeroed earlier in this macro. (A scalar fmov write is expected to
  // clear the remaining lanes of the V register -- see the Arm ISA reference.)
  145. .macro INIT8x4
  146. fmov s16, wzr
  147. fmov s17, wzr
  148. fmov s18, wzr
  149. fmov s19, s16
  150. fmov s20, wzr
  151. fmov s21, s16
  152. fmov s22, s17
  153. fmov s23, s18
  154. fmov s24, wzr
  155. fmov s25, s16
  156. fmov s26, s17
  157. fmov s27, s18
  158. fmov s28, wzr
  159. fmov s29, s16
  160. fmov s30, s17
  161. fmov s31, s18
  162. .endm
  // KERNEL8x4_I: initial step of the 8x4 tile (8 complex values of A times
  // 4 complex values of B).
  //  - Loads A into v0-v3 and B into v8-v11 and writes (not accumulates) the
  //    products into v16-v31, so no prior INIT8x4 is needed for this step.
  //  - Also loads a second set of operands into v4-v7 (A) and v12-v15 (B),
  //    the registers that KERNEL8x4_M2 / KERNEL8x4_E read.
  //  - Advances pA by 128 bytes and pB by 64 bytes in total.
  // The first product of every accumulator uses fmul. OP_rr is fmla in all
  // variants, so fmul is correct for the real part. For the imaginary part,
  // the variants listed in each #if are exactly those where OP_ri is fmls;
  // there the register is zeroed with eor and fmls is applied so that the
  // product gets the negative sign.
  // NOTE(review): the driver loop that sequences I / M1 / M2 / E is outside
  // this view.
  163. .macro KERNEL8x4_I
  // ld2 de-interleaves: real parts go to the first register, imaginary
  // parts to the second.
  164. ld2 {v8.2s, v9.2s}, [pB]
  165. add pB, pB, #16
  166. ld2 {v0.4s, v1.4s}, [pA]
  167. add pA, pA, #32
  168. ld2 {v2.4s, v3.4s}, [pA]
  169. add pA, pA, #32
  // v16/v17 = A[0..3] * B[0]
  170. fmul v16.4s, v0.4s, v8.s[0]
  171. OP_ii v16.4s, v1.4s, v9.s[0]
  172. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  173. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  174. eor v17.16b, v17.16b, v17.16b
  175. fmls v17.4s, v0.4s, v9.s[0]
  176. #else
  177. fmul v17.4s, v0.4s, v9.s[0]
  178. #endif
  179. OP_ir v17.4s, v1.4s, v8.s[0]
  180. ld2 {v10.2s, v11.2s}, [pB]
  181. add pB, pB, #16
  // v18/v19 = A[4..7] * B[0]
  182. fmul v18.4s, v2.4s, v8.s[0]
  183. OP_ii v18.4s, v3.4s, v9.s[0]
  184. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  185. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  186. eor v19.16b, v19.16b, v19.16b
  187. fmls v19.4s, v2.4s, v9.s[0]
  188. #else
  189. fmul v19.4s, v2.4s, v9.s[0]
  190. #endif
  191. OP_ir v19.4s, v3.4s, v8.s[0]
  // Operands for the following step (B, first half).
  192. ld2 {v12.2s, v13.2s}, [pB]
  193. add pB, pB, #16
  // v20/v21 = A[0..3] * B[1]
  194. fmul v20.4s, v0.4s, v8.s[1]
  195. OP_ii v20.4s, v1.4s, v9.s[1]
  196. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  197. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  198. eor v21.16b, v21.16b, v21.16b
  199. fmls v21.4s, v0.4s, v9.s[1]
  200. #else
  201. fmul v21.4s, v0.4s, v9.s[1]
  202. #endif
  203. OP_ir v21.4s, v1.4s, v8.s[1]
  // Operands for the following step (B, second half).
  204. ld2 {v14.2s, v15.2s}, [pB]
  205. add pB, pB, #16
  // v22/v23 = A[4..7] * B[1]
  206. fmul v22.4s, v2.4s, v8.s[1]
  207. OP_ii v22.4s, v3.4s, v9.s[1]
  208. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  209. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  210. eor v23.16b, v23.16b, v23.16b
  211. fmls v23.4s, v2.4s, v9.s[1]
  212. #else
  213. fmul v23.4s, v2.4s, v9.s[1]
  214. #endif
  215. OP_ir v23.4s, v3.4s, v8.s[1]
  // Operands for the following step (A, first half).
  216. ld2 {v4.4s, v5.4s}, [pA]
  217. add pA, pA, #32
  // v24/v25 = A[0..3] * B[2]
  218. fmul v24.4s, v0.4s, v10.s[0]
  219. OP_ii v24.4s, v1.4s, v11.s[0]
  220. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  221. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  222. eor v25.16b, v25.16b, v25.16b
  223. fmls v25.4s, v0.4s, v11.s[0]
  224. #else
  225. fmul v25.4s, v0.4s, v11.s[0]
  226. #endif
  227. OP_ir v25.4s, v1.4s, v10.s[0]
  // Operands for the following step (A, second half).
  228. ld2 {v6.4s, v7.4s}, [pA]
  229. add pA, pA, #32
  // v26/v27 = A[4..7] * B[2]
  230. fmul v26.4s, v2.4s, v10.s[0]
  231. OP_ii v26.4s, v3.4s, v11.s[0]
  232. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  233. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  234. eor v27.16b, v27.16b, v27.16b
  235. fmls v27.4s, v2.4s, v11.s[0]
  236. #else
  237. fmul v27.4s, v2.4s, v11.s[0]
  238. #endif
  239. OP_ir v27.4s, v3.4s, v10.s[0]
  240. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  // v28/v29 = A[0..3] * B[3]
  241. fmul v28.4s, v0.4s, v10.s[1]
  242. OP_ii v28.4s, v1.4s, v11.s[1]
  243. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  244. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  245. eor v29.16b, v29.16b, v29.16b
  246. fmls v29.4s, v0.4s, v11.s[1]
  247. #else
  248. fmul v29.4s, v0.4s, v11.s[1]
  249. #endif
  250. OP_ir v29.4s, v1.4s, v10.s[1]
  251. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  // v30/v31 = A[4..7] * B[3]
  252. fmul v30.4s, v2.4s, v10.s[1]
  253. OP_ii v30.4s, v3.4s, v11.s[1]
  254. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  255. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  256. eor v31.16b, v31.16b, v31.16b
  257. fmls v31.4s, v2.4s, v11.s[1]
  258. #else
  259. fmul v31.4s, v2.4s, v11.s[1]
  260. #endif
  261. OP_ir v31.4s, v3.4s, v10.s[1]
  262. .endm
  // KERNEL8x4_M1: one accumulate step of the 8x4 tile using the operand set
  // v0-v3 (A) / v8-v11 (B), while loading the other operand set v4-v7 (A) /
  // v12-v15 (B) that KERNEL8x4_M2 and KERNEL8x4_E consume.
  // Accumulates into v16-v31, advances pA by 64 bytes and pB by 32 bytes,
  // and prefetches two lines ahead of pA.
  263. .macro KERNEL8x4_M1
  264. OP_rr v16.4s, v0.4s, v8.s[0]
  265. OP_ii v16.4s, v1.4s, v9.s[0]
  266. OP_ri v17.4s, v0.4s, v9.s[0]
  267. OP_ir v17.4s, v1.4s, v8.s[0]
  268. ld2 {v12.2s, v13.2s}, [pB]
  269. add pB, pB, #16
  270. OP_rr v18.4s, v2.4s, v8.s[0]
  271. OP_ii v18.4s, v3.4s, v9.s[0]
  272. OP_ri v19.4s, v2.4s, v9.s[0]
  273. OP_ir v19.4s, v3.4s, v8.s[0]
  274. ld2 {v4.4s, v5.4s}, [pA]
  275. add pA, pA, #32
  276. OP_rr v20.4s, v0.4s, v8.s[1]
  277. OP_ii v20.4s, v1.4s, v9.s[1]
  278. OP_ri v21.4s, v0.4s, v9.s[1]
  279. OP_ir v21.4s, v1.4s, v8.s[1]
  280. ld2 {v6.4s, v7.4s}, [pA]
  281. add pA, pA, #32
  282. OP_rr v22.4s, v2.4s, v8.s[1]
  283. OP_ii v22.4s, v3.4s, v9.s[1]
  284. OP_ri v23.4s, v2.4s, v9.s[1]
  285. OP_ir v23.4s, v3.4s, v8.s[1]
  286. ld2 {v14.2s, v15.2s}, [pB]
  287. add pB, pB, #16
  288. OP_rr v24.4s, v0.4s, v10.s[0]
  289. OP_ii v24.4s, v1.4s, v11.s[0]
  290. OP_ri v25.4s, v0.4s, v11.s[0]
  291. OP_ir v25.4s, v1.4s, v10.s[0]
  292. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  293. OP_rr v26.4s, v2.4s, v10.s[0]
  294. OP_ii v26.4s, v3.4s, v11.s[0]
  295. OP_ri v27.4s, v2.4s, v11.s[0]
  296. OP_ir v27.4s, v3.4s, v10.s[0]
  297. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  298. OP_rr v28.4s, v0.4s, v10.s[1]
  299. OP_ii v28.4s, v1.4s, v11.s[1]
  300. OP_ri v29.4s, v0.4s, v11.s[1]
  301. OP_ir v29.4s, v1.4s, v10.s[1]
  302. OP_rr v30.4s, v2.4s, v10.s[1]
  303. OP_ii v30.4s, v3.4s, v11.s[1]
  304. OP_ri v31.4s, v2.4s, v11.s[1]
  305. OP_ir v31.4s, v3.4s, v10.s[1]
  306. .endm
  // KERNEL8x4_M2: counterpart of KERNEL8x4_M1. Accumulates into v16-v31 from
  // the operand set v4-v7 (A) / v12-v15 (B) while reloading v0-v3 (A) and
  // v8-v11 (B), the registers KERNEL8x4_M1 reads.
  // Advances pA by 64 bytes and pB by 32 bytes and prefetches ahead of pB.
  307. .macro KERNEL8x4_M2
  308. OP_rr v16.4s, v4.4s, v12.s[0]
  309. OP_ii v16.4s, v5.4s, v13.s[0]
  310. OP_ri v17.4s, v4.4s, v13.s[0]
  311. OP_ir v17.4s, v5.4s, v12.s[0]
  312. ld2 {v8.2s, v9.2s}, [pB]
  313. add pB, pB, #16
  314. OP_rr v18.4s, v6.4s, v12.s[0]
  315. OP_ii v18.4s, v7.4s, v13.s[0]
  316. OP_ri v19.4s, v6.4s, v13.s[0]
  317. OP_ir v19.4s, v7.4s, v12.s[0]
  318. ld2 {v0.4s, v1.4s}, [pA]
  319. add pA, pA, #32
  320. OP_rr v20.4s, v4.4s, v12.s[1]
  321. OP_ii v20.4s, v5.4s, v13.s[1]
  322. OP_ri v21.4s, v4.4s, v13.s[1]
  323. OP_ir v21.4s, v5.4s, v12.s[1]
  324. ld2 {v2.4s, v3.4s}, [pA]
  325. add pA, pA, #32
  326. OP_rr v22.4s, v6.4s, v12.s[1]
  327. OP_ii v22.4s, v7.4s, v13.s[1]
  328. OP_ri v23.4s, v6.4s, v13.s[1]
  329. OP_ir v23.4s, v7.4s, v12.s[1]
  330. ld2 {v10.2s, v11.2s}, [pB]
  331. add pB, pB, #16
  332. OP_rr v24.4s, v4.4s, v14.s[0]
  333. OP_ii v24.4s, v5.4s, v15.s[0]
  334. OP_ri v25.4s, v4.4s, v15.s[0]
  335. OP_ir v25.4s, v5.4s, v14.s[0]
  336. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  337. OP_rr v26.4s, v6.4s, v14.s[0]
  338. OP_ii v26.4s, v7.4s, v15.s[0]
  339. OP_ri v27.4s, v6.4s, v15.s[0]
  340. OP_ir v27.4s, v7.4s, v14.s[0]
  341. OP_rr v28.4s, v4.4s, v14.s[1]
  342. OP_ii v28.4s, v5.4s, v15.s[1]
  343. OP_ri v29.4s, v4.4s, v15.s[1]
  344. OP_ir v29.4s, v5.4s, v14.s[1]
  345. OP_rr v30.4s, v6.4s, v14.s[1]
  346. OP_ii v30.4s, v7.4s, v15.s[1]
  347. OP_ri v31.4s, v6.4s, v15.s[1]
  348. OP_ir v31.4s, v7.4s, v14.s[1]
  349. .endm
  // KERNEL8x4_E: final accumulate step of the 8x4 tile. Performs the same
  // arithmetic as KERNEL8x4_M2 on v4-v7 (A) / v12-v15 (B) but issues no
  // loads, so pA and pB are left unchanged; only a prefetch ahead of pB
  // remains.
  350. .macro KERNEL8x4_E
  351. OP_rr v16.4s, v4.4s, v12.s[0]
  352. OP_ii v16.4s, v5.4s, v13.s[0]
  353. OP_ri v17.4s, v4.4s, v13.s[0]
  354. OP_ir v17.4s, v5.4s, v12.s[0]
  355. OP_rr v18.4s, v6.4s, v12.s[0]
  356. OP_ii v18.4s, v7.4s, v13.s[0]
  357. OP_ri v19.4s, v6.4s, v13.s[0]
  358. OP_ir v19.4s, v7.4s, v12.s[0]
  359. OP_rr v20.4s, v4.4s, v12.s[1]
  360. OP_ii v20.4s, v5.4s, v13.s[1]
  361. OP_ri v21.4s, v4.4s, v13.s[1]
  362. OP_ir v21.4s, v5.4s, v12.s[1]
  363. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  364. OP_rr v22.4s, v6.4s, v12.s[1]
  365. OP_ii v22.4s, v7.4s, v13.s[1]
  366. OP_ri v23.4s, v6.4s, v13.s[1]
  367. OP_ir v23.4s, v7.4s, v12.s[1]
  368. OP_rr v24.4s, v4.4s, v14.s[0]
  369. OP_ii v24.4s, v5.4s, v15.s[0]
  370. OP_ri v25.4s, v4.4s, v15.s[0]
  371. OP_ir v25.4s, v5.4s, v14.s[0]
  372. OP_rr v26.4s, v6.4s, v14.s[0]
  373. OP_ii v26.4s, v7.4s, v15.s[0]
  374. OP_ri v27.4s, v6.4s, v15.s[0]
  375. OP_ir v27.4s, v7.4s, v14.s[0]
  376. OP_rr v28.4s, v4.4s, v14.s[1]
  377. OP_ii v28.4s, v5.4s, v15.s[1]
  378. OP_ri v29.4s, v4.4s, v15.s[1]
  379. OP_ir v29.4s, v5.4s, v14.s[1]
  380. OP_rr v30.4s, v6.4s, v14.s[1]
  381. OP_ii v30.4s, v7.4s, v15.s[1]
  382. OP_ri v31.4s, v6.4s, v15.s[1]
  383. OP_ir v31.4s, v7.4s, v14.s[1]
  384. .endm
  // KERNEL8x4_SUB: self-contained single step of the 8x4 tile. Loads 8
  // complex values of A into v0-v3 and 4 complex values of B into v8-v11,
  // accumulates all 32 products into v16-v31 (which must already hold valid
  // values), and advances pA by 64 bytes and pB by 32 bytes.
  // Unlike the I/M1/M2/E macros it does not touch v4-v7 or v12-v15.
  385. .macro KERNEL8x4_SUB
  386. ld2 {v8.2s, v9.2s}, [pB]
  387. add pB, pB, #16
  388. ld2 {v0.4s, v1.4s}, [pA]
  389. add pA, pA, #32
  390. OP_rr v16.4s, v0.4s, v8.s[0]
  391. OP_ii v16.4s, v1.4s, v9.s[0]
  392. OP_ri v17.4s, v0.4s, v9.s[0]
  393. OP_ir v17.4s, v1.4s, v8.s[0]
  394. ld2 {v2.4s, v3.4s}, [pA]
  395. add pA, pA, #32
  396. OP_rr v20.4s, v0.4s, v8.s[1]
  397. OP_ii v20.4s, v1.4s, v9.s[1]
  398. OP_ri v21.4s, v0.4s, v9.s[1]
  399. OP_ir v21.4s, v1.4s, v8.s[1]
  400. ld2 {v10.2s, v11.2s}, [pB]
  401. add pB, pB, #16
  402. OP_rr v18.4s, v2.4s, v8.s[0]
  403. OP_ii v18.4s, v3.4s, v9.s[0]
  404. OP_ri v19.4s, v2.4s, v9.s[0]
  405. OP_ir v19.4s, v3.4s, v8.s[0]
  406. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  407. OP_rr v22.4s, v2.4s, v8.s[1]
  408. OP_ii v22.4s, v3.4s, v9.s[1]
  409. OP_ri v23.4s, v2.4s, v9.s[1]
  410. OP_ir v23.4s, v3.4s, v8.s[1]
  411. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  412. OP_rr v24.4s, v0.4s, v10.s[0]
  413. OP_ii v24.4s, v1.4s, v11.s[0]
  414. OP_ri v25.4s, v0.4s, v11.s[0]
  415. OP_ir v25.4s, v1.4s, v10.s[0]
  416. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  417. OP_rr v26.4s, v2.4s, v10.s[0]
  418. OP_ii v26.4s, v3.4s, v11.s[0]
  419. OP_ri v27.4s, v2.4s, v11.s[0]
  420. OP_ir v27.4s, v3.4s, v10.s[0]
  421. OP_rr v28.4s, v0.4s, v10.s[1]
  422. OP_ii v28.4s, v1.4s, v11.s[1]
  423. OP_ri v29.4s, v0.4s, v11.s[1]
  424. OP_ir v29.4s, v1.4s, v10.s[1]
  425. OP_rr v30.4s, v2.4s, v10.s[1]
  426. OP_ii v30.4s, v3.4s, v11.s[1]
  427. OP_ri v31.4s, v2.4s, v11.s[1]
  428. OP_ir v31.4s, v3.4s, v10.s[1]
  429. .endm
  // SAVE8x4: scale the 8x4 accumulators by alpha and add them into C.
  // For every group of four complex values:
  //   C.real += acc.real * alphaR - acc.imag * alphaI
  //   C.imag += acc.real * alphaI + acc.imag * alphaR
  // Each of pCRow0..pCRow3 receives two such groups (8 complex values) and
  // ends up advanced by 64 bytes. Unlike the smaller SAVE macros, the four C
  // pointers are used as given; LDC is not referenced here.
  // Clobbers v0-v7 and v10/v11 (alpha is moved into s10/s11, replacing any
  // B operands held there).
  430. .macro SAVE8x4
  431. fmov alpha0_R, alphaR
  432. fmov alpha0_I, alphaI
  // pCRow0 <- v16/v17 and v18/v19
  433. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  434. ld2 {v0.4s, v1.4s}, [pCRow0]
  435. fmla v0.4s, v16.4s, alphaV0_R
  436. fmls v0.4s, v17.4s, alphaV0_I
  437. fmla v1.4s, v16.4s, alphaV0_I
  438. fmla v1.4s, v17.4s, alphaV0_R
  439. st2 {v0.4s, v1.4s}, [pCRow0]
  440. add pCRow0, pCRow0, #32
  441. ld2 {v2.4s, v3.4s}, [pCRow0]
  442. fmla v2.4s, v18.4s, alphaV0_R
  443. fmls v2.4s, v19.4s, alphaV0_I
  444. fmla v3.4s, v18.4s, alphaV0_I
  445. fmla v3.4s, v19.4s, alphaV0_R
  446. st2 {v2.4s, v3.4s}, [pCRow0]
  447. add pCRow0, pCRow0, #32
  // pCRow1 <- v20/v21 and v22/v23
  448. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  449. ld2 {v4.4s, v5.4s}, [pCRow1]
  450. fmla v4.4s, v20.4s, alphaV0_R
  451. fmls v4.4s, v21.4s, alphaV0_I
  452. fmla v5.4s, v20.4s, alphaV0_I
  453. fmla v5.4s, v21.4s, alphaV0_R
  454. st2 {v4.4s, v5.4s}, [pCRow1]
  455. add pCRow1, pCRow1, #32
  456. ld2 {v6.4s, v7.4s}, [pCRow1]
  457. fmla v6.4s, v22.4s, alphaV0_R
  458. fmls v6.4s, v23.4s, alphaV0_I
  459. fmla v7.4s, v22.4s, alphaV0_I
  460. fmla v7.4s, v23.4s, alphaV0_R
  461. st2 {v6.4s, v7.4s}, [pCRow1]
  462. add pCRow1, pCRow1, #32
  // pCRow2 <- v24/v25 and v26/v27
  463. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  464. ld2 {v0.4s, v1.4s}, [pCRow2]
  465. fmla v0.4s, v24.4s, alphaV0_R
  466. fmls v0.4s, v25.4s, alphaV0_I
  467. fmla v1.4s, v24.4s, alphaV0_I
  468. fmla v1.4s, v25.4s, alphaV0_R
  469. st2 {v0.4s, v1.4s}, [pCRow2]
  470. add pCRow2, pCRow2, #32
  471. ld2 {v2.4s, v3.4s}, [pCRow2]
  472. fmla v2.4s, v26.4s, alphaV0_R
  473. fmls v2.4s, v27.4s, alphaV0_I
  474. fmla v3.4s, v26.4s, alphaV0_I
  475. fmla v3.4s, v27.4s, alphaV0_R
  476. st2 {v2.4s, v3.4s}, [pCRow2]
  477. add pCRow2, pCRow2, #32
  // pCRow3 <- v28/v29 and v30/v31
  478. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  479. ld2 {v4.4s, v5.4s}, [pCRow3]
  480. fmla v4.4s, v28.4s, alphaV0_R
  481. fmls v4.4s, v29.4s, alphaV0_I
  482. fmla v5.4s, v28.4s, alphaV0_I
  483. fmla v5.4s, v29.4s, alphaV0_R
  484. st2 {v4.4s, v5.4s}, [pCRow3]
  485. add pCRow3, pCRow3, #32
  486. ld2 {v6.4s, v7.4s}, [pCRow3]
  487. fmla v6.4s, v30.4s, alphaV0_R
  488. fmls v6.4s, v31.4s, alphaV0_I
  489. fmla v7.4s, v30.4s, alphaV0_I
  490. fmla v7.4s, v31.4s, alphaV0_R
  491. st2 {v6.4s, v7.4s}, [pCRow3]
  492. add pCRow3, pCRow3, #32
  493. .endm
  494. /******************************************************************************/
  // INIT4x4: clear the eight accumulators of the 4x4 tile
  // (v16/v17, v20/v21, v24/v25, v28/v29). s16 is zeroed from wzr and every
  // other register is copied from one that is already zero.
  495. .macro INIT4x4
  496. fmov s16, wzr
  497. fmov s17, s16
  498. fmov s20, s17
  499. fmov s21, s16
  500. fmov s24, s17
  501. fmov s25, s16
  502. fmov s28, s17
  503. fmov s29, s16
  504. .endm
  // KERNEL4x4_I: initial step of the 4x4 tile (4 complex values of A times
  // 4 complex values of B).
  //  - Loads B into v8/v9 (lanes 0..3 select the B element) and A into v0/v1,
  //    then writes (not accumulates) the products into v16/v17, v20/v21,
  //    v24/v25 and v28/v29.
  //  - Finally loads a second operand set into v12/v13 (B) and v4/v5 (A),
  //    the registers KERNEL4x4_M2 / KERNEL4x4_E read.
  //  - Advances pA and pB by 64 bytes each in total.
  // As in KERNEL8x4_I, the #if lists exactly the variants where OP_ri is
  // fmls; for those the imaginary accumulator is zeroed with eor and the
  // first product is subtracted instead of using fmul.
  505. .macro KERNEL4x4_I
  506. ld2 {v8.4s, v9.4s}, [pB]
  507. add pB, pB, #32
  508. ld2 {v0.4s, v1.4s}, [pA]
  509. add pA, pA, #32
  // v16/v17 = A[0..3] * B[0]
  510. fmul v16.4s, v0.4s, v8.s[0]
  511. OP_ii v16.4s, v1.4s, v9.s[0]
  512. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  513. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  514. eor v17.16b, v17.16b, v17.16b
  515. fmls v17.4s, v0.4s, v9.s[0]
  516. #else
  517. fmul v17.4s, v0.4s, v9.s[0]
  518. #endif
  519. OP_ir v17.4s, v1.4s, v8.s[0]
  // v20/v21 = A[0..3] * B[1]
  520. fmul v20.4s, v0.4s, v8.s[1]
  521. OP_ii v20.4s, v1.4s, v9.s[1]
  522. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  523. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  524. eor v21.16b, v21.16b, v21.16b
  525. fmls v21.4s, v0.4s, v9.s[1]
  526. #else
  527. fmul v21.4s, v0.4s, v9.s[1]
  528. #endif
  529. OP_ir v21.4s, v1.4s, v8.s[1]
  // v24/v25 = A[0..3] * B[2]
  530. fmul v24.4s, v0.4s, v8.s[2]
  531. OP_ii v24.4s, v1.4s, v9.s[2]
  532. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  533. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  534. eor v25.16b, v25.16b, v25.16b
  535. fmls v25.4s, v0.4s, v9.s[2]
  536. #else
  537. fmul v25.4s, v0.4s, v9.s[2]
  538. #endif
  539. OP_ir v25.4s, v1.4s, v8.s[2]
  // v28/v29 = A[0..3] * B[3]
  540. fmul v28.4s, v0.4s, v8.s[3]
  541. OP_ii v28.4s, v1.4s, v9.s[3]
  542. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  543. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  544. eor v29.16b, v29.16b, v29.16b
  545. fmls v29.4s, v0.4s, v9.s[3]
  546. #else
  547. fmul v29.4s, v0.4s, v9.s[3]
  548. #endif
  549. OP_ir v29.4s, v1.4s, v8.s[3]
  // Operands for the following step.
  550. ld2 {v12.4s, v13.4s}, [pB]
  551. add pB, pB, #32
  552. ld2 {v4.4s, v5.4s}, [pA]
  553. add pA, pA, #32
  554. .endm
  // KERNEL4x4_M1: one accumulate step of the 4x4 tile using v0/v1 (A) and
  // v8/v9 (B), while loading v4/v5 (A) and v12/v13 (B) for the step that
  // follows. Advances pA and pB by 32 bytes each.
  // NOTE(review): the prefetch distance is a literal 512 bytes here, whereas
  // the 8x4 macros use A_PRE_SIZE -- confirm this is intentional.
  555. .macro KERNEL4x4_M1
  556. OP_rr v16.4s, v0.4s, v8.s[0]
  557. OP_ii v16.4s, v1.4s, v9.s[0]
  558. OP_ri v17.4s, v0.4s, v9.s[0]
  559. OP_ir v17.4s, v1.4s, v8.s[0]
  560. ld2 {v12.4s, v13.4s}, [pB] // For next round
  561. add pB, pB, #32
  562. OP_rr v20.4s, v0.4s, v8.s[1]
  563. OP_ii v20.4s, v1.4s, v9.s[1]
  564. OP_ri v21.4s, v0.4s, v9.s[1]
  565. OP_ir v21.4s, v1.4s, v8.s[1]
  566. ld2 {v4.4s, v5.4s}, [pA] // For next round
  567. add pA, pA, #32
  568. OP_rr v24.4s, v0.4s, v8.s[2]
  569. OP_ii v24.4s, v1.4s, v9.s[2]
  570. OP_ri v25.4s, v0.4s, v9.s[2]
  571. OP_ir v25.4s, v1.4s, v8.s[2]
  572. prfm PLDL1KEEP, [pA, #512]
  573. OP_rr v28.4s, v0.4s, v8.s[3]
  574. OP_ii v28.4s, v1.4s, v9.s[3]
  575. OP_ri v29.4s, v0.4s, v9.s[3]
  576. OP_ir v29.4s, v1.4s, v8.s[3]
  577. .endm
  // KERNEL4x4_M2: counterpart of KERNEL4x4_M1. Accumulates from v4/v5 (A)
  // and v12/v13 (B) while reloading v0/v1 (A) and v8/v9 (B). Advances pA and
  // pB by 32 bytes each and prefetches ahead of pB.
  // NOTE(review): the prefetch distance is a literal 512 bytes here, whereas
  // the 8x4 macros use B_PRE_SIZE -- confirm this is intentional.
  578. .macro KERNEL4x4_M2
  579. OP_rr v16.4s, v4.4s, v12.s[0]
  580. OP_ii v16.4s, v5.4s, v13.s[0]
  581. OP_ri v17.4s, v4.4s, v13.s[0]
  582. OP_ir v17.4s, v5.4s, v12.s[0]
  583. ld2 {v8.4s, v9.4s}, [pB] // For next round
  584. add pB, pB, #32
  585. OP_rr v20.4s, v4.4s, v12.s[1]
  586. OP_ii v20.4s, v5.4s, v13.s[1]
  587. OP_ri v21.4s, v4.4s, v13.s[1]
  588. OP_ir v21.4s, v5.4s, v12.s[1]
  589. ld2 {v0.4s, v1.4s}, [pA] // For next round
  590. add pA, pA, #32
  591. OP_rr v24.4s, v4.4s, v12.s[2]
  592. OP_ii v24.4s, v5.4s, v13.s[2]
  593. OP_ri v25.4s, v4.4s, v13.s[2]
  594. OP_ir v25.4s, v5.4s, v12.s[2]
  595. prfm PLDL1KEEP, [pB, #512]
  596. OP_rr v28.4s, v4.4s, v12.s[3]
  597. OP_ii v28.4s, v5.4s, v13.s[3]
  598. OP_ri v29.4s, v4.4s, v13.s[3]
  599. OP_ir v29.4s, v5.4s, v12.s[3]
  600. .endm
  // KERNEL4x4_E: final accumulate step of the 4x4 tile. Same arithmetic as
  // KERNEL4x4_M2 on v4/v5 (A) and v12/v13 (B), but with no loads or
  // prefetches, so pA and pB are left unchanged.
  601. .macro KERNEL4x4_E
  602. OP_rr v16.4s, v4.4s, v12.s[0]
  603. OP_ii v16.4s, v5.4s, v13.s[0]
  604. OP_ri v17.4s, v4.4s, v13.s[0]
  605. OP_ir v17.4s, v5.4s, v12.s[0]
  606. OP_rr v20.4s, v4.4s, v12.s[1]
  607. OP_ii v20.4s, v5.4s, v13.s[1]
  608. OP_ri v21.4s, v4.4s, v13.s[1]
  609. OP_ir v21.4s, v5.4s, v12.s[1]
  610. OP_rr v24.4s, v4.4s, v12.s[2]
  611. OP_ii v24.4s, v5.4s, v13.s[2]
  612. OP_ri v25.4s, v4.4s, v13.s[2]
  613. OP_ir v25.4s, v5.4s, v12.s[2]
  614. OP_rr v28.4s, v4.4s, v12.s[3]
  615. OP_ii v28.4s, v5.4s, v13.s[3]
  616. OP_ri v29.4s, v4.4s, v13.s[3]
  617. OP_ir v29.4s, v5.4s, v12.s[3]
  618. .endm
  // KERNEL4x4_SUB: self-contained single step of the 4x4 tile. Loads 4
  // complex values of B into v8/v9 and 4 of A into v0/v1, accumulates the 16
  // products into v16/v17, v20/v21, v24/v25 and v28/v29 (which must already
  // hold valid values), and advances pA and pB by 32 bytes each.
  619. .macro KERNEL4x4_SUB
  620. ld2 {v8.4s, v9.4s}, [pB]
  621. add pB, pB, #32
  622. ld2 {v0.4s, v1.4s}, [pA]
  623. add pA, pA, #32
  624. OP_rr v16.4s, v0.4s, v8.s[0]
  625. OP_ii v16.4s, v1.4s, v9.s[0]
  626. OP_ri v17.4s, v0.4s, v9.s[0]
  627. OP_ir v17.4s, v1.4s, v8.s[0]
  628. OP_rr v20.4s, v0.4s, v8.s[1]
  629. OP_ii v20.4s, v1.4s, v9.s[1]
  630. OP_ri v21.4s, v0.4s, v9.s[1]
  631. OP_ir v21.4s, v1.4s, v8.s[1]
  632. OP_rr v24.4s, v0.4s, v8.s[2]
  633. OP_ii v24.4s, v1.4s, v9.s[2]
  634. OP_ri v25.4s, v0.4s, v9.s[2]
  635. OP_ir v25.4s, v1.4s, v8.s[2]
  636. OP_rr v28.4s, v0.4s, v8.s[3]
  637. OP_ii v28.4s, v1.4s, v9.s[3]
  638. OP_ri v29.4s, v0.4s, v9.s[3]
  639. OP_ir v29.4s, v1.4s, v8.s[3]
  640. .endm
  // SAVE4x4: scale the 4x4 accumulators by alpha and add them into C.
  //   C.real += acc.real * alphaR - acc.imag * alphaI
  //   C.imag += acc.real * alphaI + acc.imag * alphaR
  // pCRow1 is used as a walking pointer: it starts at pCRow0 and is stepped
  // by LDC three times to reach the four destinations. LDC is added
  // directly to the address, so it must already be a byte stride (its
  // scaling is not visible in this section).
  // On exit pCRow0 has advanced by 32 bytes (4 complex floats).
  // Clobbers pCRow1, v0, v1, v4, v5, v10 and v11.
  641. .macro SAVE4x4
  642. fmov alpha0_R, alphaR
  643. fmov alpha0_I, alphaI
  644. mov pCRow1, pCRow0
  645. ld2 {v0.4s, v1.4s}, [pCRow1]
  646. fmla v0.4s, v16.4s, alphaV0_R
  647. fmls v0.4s, v17.4s, alphaV0_I
  648. fmla v1.4s, v16.4s, alphaV0_I
  649. fmla v1.4s, v17.4s, alphaV0_R
  650. st2 {v0.4s, v1.4s}, [pCRow1]
  651. add pCRow1, pCRow1, LDC
  652. ld2 {v4.4s, v5.4s}, [pCRow1]
  653. fmla v4.4s, v20.4s, alphaV0_R
  654. fmls v4.4s, v21.4s, alphaV0_I
  655. fmla v5.4s, v20.4s, alphaV0_I
  656. fmla v5.4s, v21.4s, alphaV0_R
  657. st2 {v4.4s, v5.4s}, [pCRow1]
  658. add pCRow1, pCRow1, LDC
  659. ld2 {v0.4s, v1.4s}, [pCRow1]
  660. fmla v0.4s, v24.4s, alphaV0_R
  661. fmls v0.4s, v25.4s, alphaV0_I
  662. fmla v1.4s, v24.4s, alphaV0_I
  663. fmla v1.4s, v25.4s, alphaV0_R
  664. st2 {v0.4s, v1.4s}, [pCRow1]
  665. add pCRow1, pCRow1, LDC
  666. ld2 {v4.4s, v5.4s}, [pCRow1]
  667. fmla v4.4s, v28.4s, alphaV0_R
  668. fmls v4.4s, v29.4s, alphaV0_I
  669. fmla v5.4s, v28.4s, alphaV0_I
  670. fmla v5.4s, v29.4s, alphaV0_R
  671. st2 {v4.4s, v5.4s}, [pCRow1]
  672. add pCRow0, pCRow0, #32
  673. .endm
  674. /******************************************************************************/
  // INIT2x4: clear the eight accumulators of the 2x4 tile
  // (v16/v17, v20/v21, v24/v25, v28/v29). s16 and s17 are zeroed from wzr;
  // the rest are copied from those two.
  675. .macro INIT2x4
  676. fmov s16, wzr
  677. fmov s17, wzr
  678. fmov s20, s16
  679. fmov s21, s17
  680. fmov s24, s16
  681. fmov s25, s17
  682. fmov s28, s16
  683. fmov s29, s17
  684. .endm
  // KERNEL2x4_SUB: single step of the 2x4 tile. Loads 4 complex values of B
  // into v8/v9 and 2 complex values of A into v0/v1, then accumulates the 8
  // products using 2-lane (.2s) arithmetic into v16/v17, v20/v21, v24/v25
  // and v28/v29. Advances pB by 32 bytes and pA by 16 bytes.
  685. .macro KERNEL2x4_SUB
  686. ld2 {v8.4s, v9.4s}, [pB]
  687. add pB, pB, #32
  688. ld2 {v0.2s, v1.2s}, [pA]
  689. add pA, pA, #16
  690. OP_rr v16.2s, v0.2s, v8.s[0]
  691. OP_ii v16.2s, v1.2s, v9.s[0]
  692. OP_ri v17.2s, v0.2s, v9.s[0]
  693. OP_ir v17.2s, v1.2s, v8.s[0]
  694. OP_rr v20.2s, v0.2s, v8.s[1]
  695. OP_ii v20.2s, v1.2s, v9.s[1]
  696. OP_ri v21.2s, v0.2s, v9.s[1]
  697. OP_ir v21.2s, v1.2s, v8.s[1]
  698. OP_rr v24.2s, v0.2s, v8.s[2]
  699. OP_ii v24.2s, v1.2s, v9.s[2]
  700. OP_ri v25.2s, v0.2s, v9.s[2]
  701. OP_ir v25.2s, v1.2s, v8.s[2]
  702. OP_rr v28.2s, v0.2s, v8.s[3]
  703. OP_ii v28.2s, v1.2s, v9.s[3]
  704. OP_ri v29.2s, v0.2s, v9.s[3]
  705. OP_ir v29.2s, v1.2s, v8.s[3]
  706. .endm
  // SAVE2x4: scale the 2x4 accumulators by alpha and add them into C.
  //   C.real += acc.real * alphaR - acc.imag * alphaI
  //   C.imag += acc.real * alphaI + acc.imag * alphaR
  // pCRow1 starts at pCRow0 and is stepped by LDC (added directly to the
  // address) between the four 2-element stores.
  // On exit pCRow0 has advanced by 16 bytes (2 complex floats).
  // Clobbers pCRow1, v0, v1, v4, v5, v10 and v11.
  707. .macro SAVE2x4
  708. fmov alpha0_R, alphaR
  709. fmov alpha0_I, alphaI
  710. mov pCRow1, pCRow0
  711. ld2 {v0.2s, v1.2s}, [pCRow1]
  712. fmla v0.2s, v16.2s, alphaV0_R
  713. fmls v0.2s, v17.2s, alphaV0_I
  714. fmla v1.2s, v16.2s, alphaV0_I
  715. fmla v1.2s, v17.2s, alphaV0_R
  716. st2 {v0.2s, v1.2s}, [pCRow1]
  717. add pCRow1, pCRow1, LDC
  718. ld2 {v4.2s, v5.2s}, [pCRow1]
  719. fmla v4.2s, v20.2s, alphaV0_R
  720. fmls v4.2s, v21.2s, alphaV0_I
  721. fmla v5.2s, v20.2s, alphaV0_I
  722. fmla v5.2s, v21.2s, alphaV0_R
  723. st2 {v4.2s, v5.2s}, [pCRow1]
  724. add pCRow1, pCRow1, LDC
  725. ld2 {v0.2s, v1.2s}, [pCRow1]
  726. fmla v0.2s, v24.2s, alphaV0_R
  727. fmls v0.2s, v25.2s, alphaV0_I
  728. fmla v1.2s, v24.2s, alphaV0_I
  729. fmla v1.2s, v25.2s, alphaV0_R
  730. st2 {v0.2s, v1.2s}, [pCRow1]
  731. add pCRow1, pCRow1, LDC
  732. ld2 {v4.2s, v5.2s}, [pCRow1]
  733. fmla v4.2s, v28.2s, alphaV0_R
  734. fmls v4.2s, v29.2s, alphaV0_I
  735. fmla v5.2s, v28.2s, alphaV0_I
  736. fmla v5.2s, v29.2s, alphaV0_R
  737. st2 {v4.2s, v5.2s}, [pCRow1]
  738. add pCRow0, pCRow0, #16
  739. .endm
  740. /******************************************************************************/
  // INIT1x4: clear the eight scalar accumulators of the 1x4 tile
  // (s16/s17, s20/s21, s24/s25, s28/s29). s16 and s17 are zeroed from wzr;
  // the rest are copied from those two.
  741. .macro INIT1x4
  742. fmov s16, wzr
  743. fmov s17, wzr
  744. fmov s20, s16
  745. fmov s21, s17
  746. fmov s24, s16
  747. fmov s25, s17
  748. fmov s28, s16
  749. fmov s29, s17
  750. .endm
  // KERNEL1x4_SUB: single step of the 1x4 tile. Loads 4 complex values of B
  // into v8/v9 and one complex value of A into lane 0 of v0/v1, then
  // accumulates the 4 products with scalar-by-element operations into
  // s16/s17, s20/s21, s24/s25 and s28/s29.
  // Advances pB by 32 bytes and pA by 8 bytes.
  751. .macro KERNEL1x4_SUB
  752. ld2 {v8.4s, v9.4s}, [pB]
  753. add pB, pB, #32
  754. ld2 {v0.s, v1.s}[0], [pA]
  755. add pA, pA, #8
  756. OP_rr s16, s0, v8.s[0]
  757. OP_ii s16, s1, v9.s[0]
  758. OP_ri s17, s0, v9.s[0]
  759. OP_ir s17, s1, v8.s[0]
  760. OP_rr s20, s0, v8.s[1]
  761. OP_ii s20, s1, v9.s[1]
  762. OP_ri s21, s0, v9.s[1]
  763. OP_ir s21, s1, v8.s[1]
  764. OP_rr s24, s0, v8.s[2]
  765. OP_ii s24, s1, v9.s[2]
  766. OP_ri s25, s0, v9.s[2]
  767. OP_ir s25, s1, v8.s[2]
  768. OP_rr s28, s0, v8.s[3]
  769. OP_ii s28, s1, v9.s[3]
  770. OP_ri s29, s0, v9.s[3]
  771. OP_ir s29, s1, v8.s[3]
  772. .endm
  // SAVE1x4: scale the 1x4 accumulators by alpha and add them into C, one
  // complex value per destination, using scalar operations.
  //   C.real += acc.real * alphaR - acc.imag * alphaI
  //   C.imag += acc.real * alphaI + acc.imag * alphaR
  // pCRow1 starts at pCRow0 and is stepped by LDC (added directly to the
  // address) between the four single-element stores.
  // On exit pCRow0 has advanced by 8 bytes (1 complex float).
  // Clobbers pCRow1, v0, v1, v4, v5, v10 and v11.
  773. .macro SAVE1x4
  774. fmov alpha0_R, alphaR
  775. fmov alpha0_I, alphaI
  776. mov pCRow1, pCRow0
  777. ld2 {v0.s, v1.s}[0], [pCRow1]
  778. fmla s0, s16, alphaV0_R
  779. fmls s0, s17, alphaV0_I
  780. fmla s1, s16, alphaV0_I
  781. fmla s1, s17, alphaV0_R
  782. st2 {v0.s, v1.s}[0], [pCRow1]
  783. add pCRow1, pCRow1, LDC
  784. ld2 {v4.s, v5.s}[0], [pCRow1]
  785. fmla s4, s20, alphaV0_R
  786. fmls s4, s21, alphaV0_I
  787. fmla s5, s20, alphaV0_I
  788. fmla s5, s21, alphaV0_R
  789. st2 {v4.s, v5.s}[0], [pCRow1]
  790. add pCRow1, pCRow1, LDC
  791. ld2 {v0.s, v1.s}[0], [pCRow1]
  792. fmla s0, s24, alphaV0_R
  793. fmls s0, s25, alphaV0_I
  794. fmla s1, s24, alphaV0_I
  795. fmla s1, s25, alphaV0_R
  796. st2 {v0.s, v1.s}[0], [pCRow1]
  797. add pCRow1, pCRow1, LDC
  798. ld2 {v4.s, v5.s}[0], [pCRow1]
  799. fmla s4, s28, alphaV0_R
  800. fmls s4, s29, alphaV0_I
  801. fmla s5, s28, alphaV0_I
  802. fmla s5, s29, alphaV0_R
  803. st2 {v4.s, v5.s}[0], [pCRow1]
  804. add pCRow0, pCRow0, #8
  805. .endm
  806. /******************************************************************************/
  // INIT8x2: clear the eight accumulators v16-v23 of the 8x2 tile, either
  // from wzr or by copying a register zeroed earlier in this macro.
  807. .macro INIT8x2
  808. fmov s16, wzr
  809. fmov s17, wzr
  810. fmov s18, wzr
  811. fmov s19, s16
  812. fmov s20, wzr
  813. fmov s21, s16
  814. fmov s22, s17
  815. fmov s23, s18
  816. .endm
  // KERNEL8x2_SUB: single step of the 8x2 tile. Loads 2 complex values of B
  // into v8/v9 and 8 complex values of A into v0-v3, then accumulates the 16
  // products into v16-v23:
  //   v16/v17 = A[0..3]*B[0], v18/v19 = A[4..7]*B[0],
  //   v20/v21 = A[0..3]*B[1], v22/v23 = A[4..7]*B[1].
  // Advances pB by 16 bytes and pA by 64 bytes.
  817. .macro KERNEL8x2_SUB
  818. ld2 {v8.2s, v9.2s}, [pB]
  819. add pB, pB, #16
  820. ld2 {v0.4s, v1.4s}, [pA]
  821. add pA, pA, #32
  822. ld2 {v2.4s, v3.4s}, [pA]
  823. add pA, pA, #32
  824. OP_rr v16.4s, v0.4s, v8.s[0]
  825. OP_ii v16.4s, v1.4s, v9.s[0]
  826. OP_ri v17.4s, v0.4s, v9.s[0]
  827. OP_ir v17.4s, v1.4s, v8.s[0]
  828. OP_rr v18.4s, v2.4s, v8.s[0]
  829. OP_ii v18.4s, v3.4s, v9.s[0]
  830. OP_ri v19.4s, v2.4s, v9.s[0]
  831. OP_ir v19.4s, v3.4s, v8.s[0]
  832. OP_rr v20.4s, v0.4s, v8.s[1]
  833. OP_ii v20.4s, v1.4s, v9.s[1]
  834. OP_ri v21.4s, v0.4s, v9.s[1]
  835. OP_ir v21.4s, v1.4s, v8.s[1]
  836. OP_rr v22.4s, v2.4s, v8.s[1]
  837. OP_ii v22.4s, v3.4s, v9.s[1]
  838. OP_ri v23.4s, v2.4s, v9.s[1]
  839. OP_ir v23.4s, v3.4s, v8.s[1]
  840. .endm
  // SAVE8x2: scale the 8x2 accumulators by alpha and add them into C.
  //   C.real += acc.real * alphaR - acc.imag * alphaI
  //   C.imag += acc.real * alphaI + acc.imag * alphaR
  // pCRow1 starts at pCRow0 and is stepped once by LDC (added directly to
  // the address); pCRow2 is set to pCRow1 + 32 to address the second group
  // of four complex values at each destination.
  // On exit pCRow0 has advanced by 64 bytes (8 complex floats).
  // Clobbers pCRow1, pCRow2, v0-v7, v10 and v11.
  841. .macro SAVE8x2
  842. fmov alpha0_R, alphaR
  843. fmov alpha0_I, alphaI
  844. mov pCRow1, pCRow0
  845. ld2 {v0.4s, v1.4s}, [pCRow1]
  846. fmla v0.4s, v16.4s, alphaV0_R
  847. fmls v0.4s, v17.4s, alphaV0_I
  848. fmla v1.4s, v16.4s, alphaV0_I
  849. fmla v1.4s, v17.4s, alphaV0_R
  850. st2 {v0.4s, v1.4s}, [pCRow1]
  851. add pCRow2, pCRow1, #32
  852. ld2 {v2.4s, v3.4s}, [pCRow2]
  853. fmla v2.4s, v18.4s, alphaV0_R
  854. fmls v2.4s, v19.4s, alphaV0_I
  855. fmla v3.4s, v18.4s, alphaV0_I
  856. fmla v3.4s, v19.4s, alphaV0_R
  857. st2 {v2.4s, v3.4s}, [pCRow2]
  858. add pCRow1, pCRow1, LDC
  859. ld2 {v4.4s, v5.4s}, [pCRow1]
  860. fmla v4.4s, v20.4s, alphaV0_R
  861. fmls v4.4s, v21.4s, alphaV0_I
  862. fmla v5.4s, v20.4s, alphaV0_I
  863. fmla v5.4s, v21.4s, alphaV0_R
  864. st2 {v4.4s, v5.4s}, [pCRow1]
  865. add pCRow2, pCRow1, #32
  866. ld2 {v6.4s, v7.4s}, [pCRow2]
  867. fmla v6.4s, v22.4s, alphaV0_R
  868. fmls v6.4s, v23.4s, alphaV0_I
  869. fmla v7.4s, v22.4s, alphaV0_I
  870. fmla v7.4s, v23.4s, alphaV0_R
  871. st2 {v6.4s, v7.4s}, [pCRow2]
  872. add pCRow0, pCRow0, #64
  873. .endm
  874. /******************************************************************************/
  // INIT4x2: clear the four accumulators of the 4x2 tile (v16/v17, v20/v21).
  875. .macro INIT4x2
  876. fmov s16, wzr
  877. fmov s17, wzr
  878. fmov s20, s16
  879. fmov s21, s17
  880. .endm
  // KERNEL4x2_SUB: single step of the 4x2 tile. Loads 2 complex values of B
  // into v8/v9 and 4 complex values of A into v0/v1, then accumulates the 8
  // products into v16/v17 (B[0]) and v20/v21 (B[1]).
  // Advances pB by 16 bytes and pA by 32 bytes.
  881. .macro KERNEL4x2_SUB
  882. ld2 {v8.2s, v9.2s}, [pB]
  883. add pB, pB, #16
  884. ld2 {v0.4s, v1.4s}, [pA]
  885. add pA, pA, #32
  886. OP_rr v16.4s, v0.4s, v8.s[0]
  887. OP_ii v16.4s, v1.4s, v9.s[0]
  888. OP_ri v17.4s, v0.4s, v9.s[0]
  889. OP_ir v17.4s, v1.4s, v8.s[0]
  890. OP_rr v20.4s, v0.4s, v8.s[1]
  891. OP_ii v20.4s, v1.4s, v9.s[1]
  892. OP_ri v21.4s, v0.4s, v9.s[1]
  893. OP_ir v21.4s, v1.4s, v8.s[1]
  894. .endm
  // SAVE4x2: scale the 4x2 accumulators by alpha and add them into C.
  //   C.real += acc.real * alphaR - acc.imag * alphaI
  //   C.imag += acc.real * alphaI + acc.imag * alphaR
  // pCRow1 starts at pCRow0 and is stepped once by LDC (added directly to
  // the address) to reach the second destination.
  // On exit pCRow0 has advanced by 32 bytes (4 complex floats).
  // Clobbers pCRow1, v0, v1, v4, v5, v10 and v11.
  895. .macro SAVE4x2
  896. fmov alpha0_R, alphaR
  897. fmov alpha0_I, alphaI
  898. mov pCRow1, pCRow0
  899. ld2 {v0.4s, v1.4s}, [pCRow1]
  900. fmla v0.4s, v16.4s, alphaV0_R
  901. fmls v0.4s, v17.4s, alphaV0_I
  902. fmla v1.4s, v16.4s, alphaV0_I
  903. fmla v1.4s, v17.4s, alphaV0_R
  904. st2 {v0.4s, v1.4s}, [pCRow1]
  905. add pCRow1, pCRow1, LDC
  906. ld2 {v4.4s, v5.4s}, [pCRow1]
  907. fmla v4.4s, v20.4s, alphaV0_R
  908. fmls v4.4s, v21.4s, alphaV0_I
  909. fmla v5.4s, v20.4s, alphaV0_I
  910. fmla v5.4s, v21.4s, alphaV0_R
  911. st2 {v4.4s, v5.4s}, [pCRow1]
  912. add pCRow0, pCRow0, #32
  913. .endm
  914. /******************************************************************************/
  // INIT2x2: clear the four accumulators of the 2x2 tile (v16/v17, v20/v21).
  915. .macro INIT2x2
  916. fmov s16, wzr
  917. fmov s17, wzr
  918. fmov s20, s16
  919. fmov s21, s17
  920. .endm
  // KERNEL2x2_SUB: single step of the 2x2 tile. Loads 2 complex values of B
  // into v8/v9 and 2 complex values of A into v0/v1, then accumulates the 4
  // products with 2-lane (.2s) arithmetic into v16/v17 (B[0]) and v20/v21
  // (B[1]). Advances pA and pB by 16 bytes each.
  921. .macro KERNEL2x2_SUB
  922. ld2 {v8.2s, v9.2s}, [pB]
  923. add pB, pB, #16
  924. ld2 {v0.2s, v1.2s}, [pA]
  925. add pA, pA, #16
  926. OP_rr v16.2s, v0.2s, v8.s[0]
  927. OP_ii v16.2s, v1.2s, v9.s[0]
  928. OP_ri v17.2s, v0.2s, v9.s[0]
  929. OP_ir v17.2s, v1.2s, v8.s[0]
  930. OP_rr v20.2s, v0.2s, v8.s[1]
  931. OP_ii v20.2s, v1.2s, v9.s[1]
  932. OP_ri v21.2s, v0.2s, v9.s[1]
  933. OP_ir v21.2s, v1.2s, v8.s[1]
  934. .endm
  // SAVE2x2: C += alpha * acc for the 2x2 tile.
  // (v16, v17) go to the line at pCRow0, (v20, v21) to the line LDC further on,
  // each scaled by the complex alpha held in alphaR/alphaI.
  // Clobbers pCRow1, v0, v1, v4, v5; pCRow0 advances by 16 bytes.
  935. .macro SAVE2x2
  936. mov pCRow1, pCRow0 // pCRow1 is the moving pointer over the two lines
  937. fmov alpha0_I, alphaI
  938. fmov alpha0_R, alphaR
  939. ld2 {v0.2s, v1.2s}, [pCRow1] // first line of C
  940. fmla v0.2s, v16.2s, alphaV0_R
  941. fmla v1.2s, v16.2s, alphaV0_I
  942. fmls v0.2s, v17.2s, alphaV0_I
  943. fmla v1.2s, v17.2s, alphaV0_R
  944. st2 {v0.2s, v1.2s}, [pCRow1]
  945. add pCRow1, pCRow1, LDC // second line of C
  946. ld2 {v4.2s, v5.2s}, [pCRow1]
  947. fmla v4.2s, v20.2s, alphaV0_R
  948. fmla v5.2s, v20.2s, alphaV0_I
  949. fmls v4.2s, v21.2s, alphaV0_I
  950. fmla v5.2s, v21.2s, alphaV0_R
  951. st2 {v4.2s, v5.2s}, [pCRow1]
  952. add pCRow0, pCRow0, #16 // step past the 2 float pairs just written
  953. .endm
  954. /******************************************************************************/
  // INIT1x2: clear the accumulators of the 1x2 tile (s16/s17 and s20/s21).
  // All four moves are independent writes of zero.
  955. .macro INIT1x2
  956. fmov s16, wzr
  957. fmov s20, wzr
  958. fmov s17, wzr
  959. fmov s21, wzr
  960. .endm
  // KERNEL1x2_SUB: one K step of the 1x2 tile, done with scalar operations.
  // Reads one float pair from pA (8 bytes, into lane 0 of v0/v1) and two pairs
  // from pB (16 bytes, split into v8/v9), advancing both pointers.
  // s16/s17 accumulate against B pair 0, s20/s21 against B pair 1; per
  // accumulator the rr-before-ii and ri-before-ir order is kept.
  961. .macro KERNEL1x2_SUB
  962. ld2 {v0.s, v1.s}[0], [pA]
  963. add pA, pA, #8
  964. ld2 {v8.2s, v9.2s}, [pB]
  965. add pB, pB, #16
  966. OP_rr s16, s0, v8.s[0] // B pair 0
  967. OP_ri s17, s0, v9.s[0]
  968. OP_rr s20, s0, v8.s[1] // B pair 1
  969. OP_ri s21, s0, v9.s[1]
  970. OP_ii s16, s1, v9.s[0] // B pair 0
  971. OP_ir s17, s1, v8.s[0]
  972. OP_ii s20, s1, v9.s[1] // B pair 1
  973. OP_ir s21, s1, v8.s[1]
  974. .endm
  // SAVE1x2: C += alpha * acc for the 1x2 tile (one float pair per C line).
  // (s16, s17) go to the pair at pCRow0, (s20, s21) to the pair LDC further on,
  // each scaled by the complex alpha held in alphaR/alphaI.
  // Clobbers pCRow1 and lane 0 of v0, v1, v4, v5; pCRow0 advances by 8 bytes.
  975. .macro SAVE1x2
  976. mov pCRow1, pCRow0 // pCRow1 is the moving pointer over the two lines
  977. fmov alpha0_I, alphaI
  978. fmov alpha0_R, alphaR
  979. ld2 {v0.s, v1.s}[0], [pCRow1] // first line of C
  980. fmla s0, s16, alphaV0_R
  981. fmla s1, s16, alphaV0_I
  982. fmls s0, s17, alphaV0_I
  983. fmla s1, s17, alphaV0_R
  984. st2 {v0.s, v1.s}[0], [pCRow1]
  985. add pCRow1, pCRow1, LDC // second line of C
  986. ld2 {v4.s, v5.s}[0], [pCRow1]
  987. fmla s4, s20, alphaV0_R
  988. fmla s5, s20, alphaV0_I
  989. fmls s4, s21, alphaV0_I
  990. fmla s5, s21, alphaV0_R
  991. st2 {v4.s, v5.s}[0], [pCRow1]
  992. add pCRow0, pCRow0, #8 // step past the single float pair just written
  993. .endm
  994. /******************************************************************************/
  // INIT8x1: clear the four accumulators of the 8x1 tile (v16..v19).
  // Each scalar fmov zeroes the whole destination vector.
  995. .macro INIT8x1
  996. fmov s16, wzr // v16 = 0
  997. fmov s19, s16 // v19 = 0 (copy of the zeroed s16)
  998. fmov s17, wzr // v17 = 0
  999. fmov s18, wzr // v18 = 0
  1000. .endm
  // KERNEL8x1_SUB: one K step of the 8x1 tile.
  // Reads 8 float pairs from pA (two 32-byte ld2 loads, split into v0/v1 and
  // v2/v3) and one float pair from pB (8 bytes, kept together in v8: lane 0 is
  // the first float, lane 1 the second). v16/v17 accumulate the first four A
  // pairs, v18/v19 the last four; per accumulator the rr-before-ii and
  // ri-before-ir order is kept.
  1001. .macro KERNEL8x1_SUB
  1002. ld2 {v0.4s, v1.4s}, [pA] // A pairs 0..3
  1003. add pA, pA, #32
  1004. ld2 {v2.4s, v3.4s}, [pA] // A pairs 4..7
  1005. add pA, pA, #32
  1006. ld1 {v8.2s}, [pB] // the single B pair, not de-interleaved
  1007. add pB, pB, #8
  1008. OP_rr v16.4s, v0.4s, v8.s[0]
  1009. OP_ri v17.4s, v0.4s, v8.s[1]
  1010. OP_rr v18.4s, v2.4s, v8.s[0]
  1011. OP_ri v19.4s, v2.4s, v8.s[1]
  1012. OP_ii v16.4s, v1.4s, v8.s[1]
  1013. OP_ir v17.4s, v1.4s, v8.s[0]
  1014. OP_ii v18.4s, v3.4s, v8.s[1]
  1015. OP_ir v19.4s, v3.4s, v8.s[0]
  1016. .endm
  // SAVE8x1: C += alpha * acc for the 8x1 tile (8 float pairs on one C line).
  // (v16, v17) update the first 32 bytes at pCRow0, (v18, v19) the next 32 bytes,
  // each scaled by the complex alpha held in alphaR/alphaI.
  // Clobbers pCRow1, v0..v3; pCRow0 advances by 64 bytes.
  1017. .macro SAVE8x1
  1018. mov pCRow1, pCRow0 // pCRow1 is the moving pointer along the line
  1019. fmov alpha0_I, alphaI
  1020. fmov alpha0_R, alphaR
  1021. ld2 {v0.4s, v1.4s}, [pCRow1] // pairs 0..3
  1022. fmla v0.4s, v16.4s, alphaV0_R
  1023. fmla v1.4s, v16.4s, alphaV0_I
  1024. fmls v0.4s, v17.4s, alphaV0_I
  1025. fmla v1.4s, v17.4s, alphaV0_R
  1026. st2 {v0.4s, v1.4s}, [pCRow1]
  1027. add pCRow1, pCRow1, #32 // pairs 4..7
  1028. ld2 {v2.4s, v3.4s}, [pCRow1]
  1029. fmla v2.4s, v18.4s, alphaV0_R
  1030. fmla v3.4s, v18.4s, alphaV0_I
  1031. fmls v2.4s, v19.4s, alphaV0_I
  1032. fmla v3.4s, v19.4s, alphaV0_R
  1033. st2 {v2.4s, v3.4s}, [pCRow1]
  1034. add pCRow0, pCRow0, #64 // step past the 8 float pairs just written
  1035. .endm
  1036. /******************************************************************************/
  // INIT4x1: clear the two accumulators of the 4x1 tile (v16 and v17).
  // Each scalar fmov zeroes the whole destination vector.
  1037. .macro INIT4x1
  1038. fmov s16, wzr // v16 = 0
  1039. fmov s17, wzr // v17 = 0
  1040. .endm
  // KERNEL4x1_SUB: one K step of the 4x1 tile.
  // Reads 4 float pairs from pA (32 bytes, split into v0/v1) and one float pair
  // from pB (8 bytes, into lane 0 of v8/v9), advancing both pointers.
  // v16 takes the rr and ii updates, v17 the ri and ir updates, in that order.
  1041. .macro KERNEL4x1_SUB
  1042. ld2 {v0.4s, v1.4s}, [pA]
  1043. add pA, pA, #32
  1044. ld2 {v8.s, v9.s}[0], [pB] // only lane 0 of v8/v9 is written
  1045. add pB, pB, #8
  1046. OP_rr v16.4s, v0.4s, v8.s[0]
  1047. OP_ri v17.4s, v0.4s, v9.s[0]
  1048. OP_ii v16.4s, v1.4s, v9.s[0]
  1049. OP_ir v17.4s, v1.4s, v8.s[0]
  1050. .endm
  // SAVE4x1: C += alpha * acc for the 4x1 tile (4 float pairs at pCRow0).
  // first floats += v16*alphaR - v17*alphaI, second floats += v16*alphaI + v17*alphaR.
  // Clobbers pCRow1 (left equal to the old pCRow0), v0, v1; pCRow0 advances by 32.
  1051. .macro SAVE4x1
  1052. mov pCRow1, pCRow0
  1053. fmov alpha0_I, alphaI
  1054. fmov alpha0_R, alphaR
  1055. ld2 {v0.4s, v1.4s}, [pCRow1]
  1056. fmla v0.4s, v16.4s, alphaV0_R
  1057. fmla v1.4s, v16.4s, alphaV0_I
  1058. fmls v0.4s, v17.4s, alphaV0_I
  1059. fmla v1.4s, v17.4s, alphaV0_R
  1060. st2 {v0.4s, v1.4s}, [pCRow1]
  1061. add pCRow0, pCRow0, #32 // step past the 4 float pairs just written
  1062. .endm
  1063. /******************************************************************************/
  // INIT2x1: clear the two accumulators of the 2x1 tile (v16 and v17).
  1064. .macro INIT2x1
  1065. fmov s17, wzr // v17 = 0
  1066. fmov s16, wzr // v16 = 0
  1067. .endm
  // KERNEL2x1_SUB: one K step of the 2x1 tile.
  // Reads 2 float pairs from pA (16 bytes, split into v0/v1) and one float pair
  // from pB (8 bytes, into lane 0 of v8/v9), advancing both pointers.
  // v16 takes the rr and ii updates, v17 the ri and ir updates, in that order.
  1068. .macro KERNEL2x1_SUB
  1069. ld2 {v0.2s, v1.2s}, [pA]
  1070. add pA, pA, #16
  1071. ld2 {v8.s, v9.s}[0], [pB] // only lane 0 of v8/v9 is written
  1072. add pB, pB, #8
  1073. OP_rr v16.2s, v0.2s, v8.s[0]
  1074. OP_ri v17.2s, v0.2s, v9.s[0]
  1075. OP_ii v16.2s, v1.2s, v9.s[0]
  1076. OP_ir v17.2s, v1.2s, v8.s[0]
  1077. .endm
  // SAVE2x1: C += alpha * acc for the 2x1 tile (2 float pairs at pCRow0).
  // first floats += v16*alphaR - v17*alphaI, second floats += v16*alphaI + v17*alphaR.
  // Clobbers pCRow1 (left equal to the old pCRow0), v0, v1; pCRow0 advances by 16.
  1078. .macro SAVE2x1
  1079. mov pCRow1, pCRow0
  1080. fmov alpha0_I, alphaI
  1081. fmov alpha0_R, alphaR
  1082. ld2 {v0.2s, v1.2s}, [pCRow1]
  1083. fmla v0.2s, v16.2s, alphaV0_R
  1084. fmla v1.2s, v16.2s, alphaV0_I
  1085. fmls v0.2s, v17.2s, alphaV0_I
  1086. fmla v1.2s, v17.2s, alphaV0_R
  1087. st2 {v0.2s, v1.2s}, [pCRow1]
  1088. add pCRow0, pCRow0, #16 // step past the 2 float pairs just written
  1089. .endm
  1090. /******************************************************************************/
  // INIT1x1: clear the two accumulators of the 1x1 tile (s16 and s17).
  1091. .macro INIT1x1
  1092. fmov s17, wzr
  1093. fmov s16, wzr
  1094. .endm
  // KERNEL1x1_SUB: one K step of the 1x1 tile, done with scalar operations.
  // Reads one float pair from pA (lane 0 of v0/v1) and one from pB (lane 0 of
  // v8/v9), 8 bytes each, advancing both pointers.
  // s16 takes the rr and ii updates, s17 the ri and ir updates, in that order.
  1095. .macro KERNEL1x1_SUB
  1096. ld2 {v0.s, v1.s}[0], [pA]
  1097. add pA, pA, #8
  1098. ld2 {v8.s, v9.s}[0], [pB]
  1099. add pB, pB, #8
  1100. OP_rr s16, s0, v8.s[0]
  1101. OP_ri s17, s0, v9.s[0]
  1102. OP_ii s16, s1, v9.s[0]
  1103. OP_ir s17, s1, v8.s[0]
  1104. .endm
  // SAVE1x1: C += alpha * acc for the 1x1 tile (one float pair at pCRow0).
  // first float += s16*alphaR - s17*alphaI, second float += s16*alphaI + s17*alphaR.
  // Clobbers pCRow1 (left equal to the old pCRow0) and lane 0 of v0, v1;
  // pCRow0 advances by 8 bytes.
  1105. .macro SAVE1x1
  1106. mov pCRow1, pCRow0
  1107. fmov alpha0_I, alphaI
  1108. fmov alpha0_R, alphaR
  1109. ld2 {v0.s, v1.s}[0], [pCRow1]
  1110. fmla s0, s16, alphaV0_R
  1111. fmla s1, s16, alphaV0_I
  1112. fmls s0, s17, alphaV0_I
  1113. fmla s1, s17, alphaV0_R
  1114. st2 {v0.s, v1.s}[0], [pCRow1]
  1115. add pCRow0, pCRow0, #8 // step past the single float pair just written
  1116. .endm
  1117. /*******************************************************************************
  1118. * End of macro definitions
  1119. *******************************************************************************/
  // Function entry: build an 11 x 16 byte save area on the stack, capture alpha,
  // scale LDC, and branch past the 4-column loop when origN / 4 is zero.
  // The restore sequence at .Lcgemm_kernel_L999 must mirror this layout.
  1120. PROLOGUE
  1121. .align 5
  1122. add sp, sp, #-(11 * 16) // reserve the register save area
  1123. stp d8, d9, [sp, #(0 * 16)]
  1124. stp d10, d11, [sp, #(1 * 16)]
  1125. stp d12, d13, [sp, #(2 * 16)]
  1126. stp d14, d15, [sp, #(3 * 16)]
  1127. stp d16, d17, [sp, #(4 * 16)] // NOTE(review): v16/v17 are caller-saved under AAPCS64; this save looks redundant
  1128. stp x18, x19, [sp, #(5 * 16)]
  1129. stp x20, x21, [sp, #(6 * 16)]
  1130. stp x22, x23, [sp, #(7 * 16)]
  1131. stp x24, x25, [sp, #(8 * 16)]
  1132. stp x26, x27, [sp, #(9 * 16)]
  1133. str x28, [sp, #(10 * 16)]
  1134. prfm PLDL1KEEP, [origPB] // prefetch the start of B
  1135. prfm PLDL1KEEP, [origPA] // prefetch the start of A
  1136. fmov alphaR, s0 // alpha is taken from s0 / s1; the SAVE macros reload it from alphaR / alphaI
  1137. fmov alphaI, s1
  1138. lsl LDC, LDC, #3 // ldc = ldc * 8
  1139. mov pB, origPB
  1140. mov counterJ, origN
  1141. asr counterJ, counterJ, #2 // J = J / 4
  1142. cmp counterJ, #0
  1143. ble .Lcgemm_kernel_L2_BEGIN // no group of four columns: go to the N remainder
  1144. /******************************************************************************/
  // Loop over groups of four columns of C (counterJ = origN / 4 iterations).
  // Inside each group M is covered by 8-wide tiles (loop), then at most one
  // 4-wide, one 2-wide and one 1-wide tile selected by bits 2, 1 and 0 of origM.
  // The KERNEL8x4_* / KERNEL4x4_* / *x4 INIT and SAVE macros are defined outside
  // this section; each KERNEL invocation presumably consumes one K step.
  1145. .Lcgemm_kernel_L4_BEGIN:
  1146. mov pCRow0, pC // pCRow0..pCRow3 = four consecutive C lines, LDC apart
  1147. add pCRow1, pCRow0, LDC
  1148. add pCRow2, pCRow1, LDC
  1149. add pCRow3, pCRow2, LDC
  1150. add pC, pCRow3, LDC // pC now points past these four lines
  1151. mov pA, origPA // pA = start of A array
  1152. .Lcgemm_kernel_L4_M8_BEGIN:
  1153. mov counterI, origM
  1154. asr counterI, counterI, #3 // counterI = counterI / 8
  1155. cmp counterI, #0
  1156. ble .Lcgemm_kernel_L4_M4_BEGIN
  1157. .align 5
  1158. .Lcgemm_kernel_L4_M8_20:
  1159. mov pB, origPB // rewind B for every tile
  1160. asr counterL , origK, #3 // counterL = K / 8 (number of 8-invocation groups)
  1161. cmp counterL , #2
  1162. blt .Lcgemm_kernel_L4_M8_32 // fewer than two groups: handled at M8_32
  1163. KERNEL8x4_I
  1164. KERNEL8x4_M2
  1165. KERNEL8x4_M1
  1166. KERNEL8x4_M2
  1167. KERNEL8x4_M1
  1168. KERNEL8x4_M2
  1169. KERNEL8x4_M1
  1170. KERNEL8x4_M2
  1171. subs counterL, counterL, #2 // subtract 2: this first group and the closing group at M8_22a
  1172. ble .Lcgemm_kernel_L4_M8_22a
  1173. .align 5
  1174. .Lcgemm_kernel_L4_M8_22:
  1175. KERNEL8x4_M1
  1176. KERNEL8x4_M2
  1177. KERNEL8x4_M1
  1178. KERNEL8x4_M2
  1179. KERNEL8x4_M1
  1180. KERNEL8x4_M2
  1181. KERNEL8x4_M1
  1182. KERNEL8x4_M2
  1183. subs counterL, counterL, #1
  1184. bgt .Lcgemm_kernel_L4_M8_22
  1185. .align 5
  1186. .Lcgemm_kernel_L4_M8_22a:
  1187. KERNEL8x4_M1
  1188. KERNEL8x4_M2
  1189. KERNEL8x4_M1
  1190. KERNEL8x4_M2
  1191. KERNEL8x4_M1
  1192. KERNEL8x4_M2
  1193. KERNEL8x4_M1
  1194. KERNEL8x4_E // closing group ends with the _E variant
  1195. b .Lcgemm_kernel_L4_M8_44
  1196. .align 5
  1197. .Lcgemm_kernel_L4_M8_32:
  1198. tst counterL, #1 // counterL < 2 here; bit 0 clear means no full group
  1199. ble .Lcgemm_kernel_L4_M8_40
  1200. KERNEL8x4_I
  1201. KERNEL8x4_M2
  1202. KERNEL8x4_M1
  1203. KERNEL8x4_M2
  1204. KERNEL8x4_M1
  1205. KERNEL8x4_M2
  1206. KERNEL8x4_M1
  1207. KERNEL8x4_E
  1208. b .Lcgemm_kernel_L4_M8_44
  1209. .Lcgemm_kernel_L4_M8_40:
  1210. INIT8x4 // no group ran, so the accumulators are initialised here instead
  1211. .Lcgemm_kernel_L4_M8_44:
  1212. ands counterL , origK, #7 // counterL = K % 8
  1213. ble .Lcgemm_kernel_L4_M8_100
  1214. .align 5
  1215. .Lcgemm_kernel_L4_M8_46:
  1216. KERNEL8x4_SUB
  1217. subs counterL, counterL, #1
  1218. bne .Lcgemm_kernel_L4_M8_46
  1219. .Lcgemm_kernel_L4_M8_100:
  1220. prfm PLDL1KEEP, [pA]
  1221. prfm PLDL1KEEP, [pA, #64]
  1222. prfm PLDL1KEEP, [origPB]
  1223. SAVE8x4
  1224. .Lcgemm_kernel_L4_M8_END:
  1225. subs counterI, counterI, #1
  1226. bne .Lcgemm_kernel_L4_M8_20
  1227. .Lcgemm_kernel_L4_M4_BEGIN:
  1228. mov counterI, origM
  1229. tst counterI , #7 // M % 8 == 0: nothing left in M
  1230. ble .Lcgemm_kernel_L4_END
  1231. tst counterI, #4 // bit 2 of M clear: no 4-wide tile
  1232. ble .Lcgemm_kernel_L4_M2_BEGIN
  1233. .Lcgemm_kernel_L4_M4_20:
  1234. mov pB, origPB
  1235. asr counterL , origK, #1 // L = K / 2
  1236. cmp counterL , #2 // is there at least 4 to do?
  1237. blt .Lcgemm_kernel_L4_M4_32
  1238. KERNEL4x4_I // do one in the K
  1239. KERNEL4x4_M2 // do another in the K
  1240. subs counterL, counterL, #2 // this pair and the closing pair at M4_22a
  1241. ble .Lcgemm_kernel_L4_M4_22a
  1242. .align 5
  1243. .Lcgemm_kernel_L4_M4_22:
  1244. KERNEL4x4_M1
  1245. KERNEL4x4_M2
  1246. subs counterL, counterL, #1
  1247. bgt .Lcgemm_kernel_L4_M4_22
  1248. .Lcgemm_kernel_L4_M4_22a:
  1249. KERNEL4x4_M1
  1250. KERNEL4x4_E
  1251. b .Lcgemm_kernel_L4_M4_44
  1252. .Lcgemm_kernel_L4_M4_32:
  1253. tst counterL, #1 // counterL < 2 here; bit 0 clear means no full pair
  1254. ble .Lcgemm_kernel_L4_M4_40
  1255. KERNEL4x4_I
  1256. KERNEL4x4_E
  1257. b .Lcgemm_kernel_L4_M4_44
  1258. .Lcgemm_kernel_L4_M4_40:
  1259. INIT4x4
  1260. .Lcgemm_kernel_L4_M4_44:
  1261. ands counterL , origK, #1 // odd K: one extra step
  1262. ble .Lcgemm_kernel_L4_M4_100
  1263. .Lcgemm_kernel_L4_M4_46:
  1264. KERNEL4x4_SUB
  1265. .Lcgemm_kernel_L4_M4_100:
  1266. SAVE4x4
  1267. .Lcgemm_kernel_L4_M4_END:
  1268. .Lcgemm_kernel_L4_M2_BEGIN:
  1269. mov counterI, origM
  1270. tst counterI , #3 // M % 4 == 0: nothing left in M
  1271. ble .Lcgemm_kernel_L4_END
  1272. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  1273. ble .Lcgemm_kernel_L4_M1_BEGIN
  1274. .Lcgemm_kernel_L4_M2_20:
  1275. INIT2x4
  1276. mov pB, origPB
  1277. asr counterL , origK, #3 // counterL = K / 8
  1278. cmp counterL , #0
  1279. ble .Lcgemm_kernel_L4_M2_40
  1280. .Lcgemm_kernel_L4_M2_22:
  1281. KERNEL2x4_SUB
  1282. KERNEL2x4_SUB
  1283. KERNEL2x4_SUB
  1284. KERNEL2x4_SUB
  1285. KERNEL2x4_SUB
  1286. KERNEL2x4_SUB
  1287. KERNEL2x4_SUB
  1288. KERNEL2x4_SUB
  1289. subs counterL, counterL, #1
  1290. bgt .Lcgemm_kernel_L4_M2_22
  1291. .Lcgemm_kernel_L4_M2_40:
  1292. ands counterL , origK, #7 // counterL = K % 8
  1293. ble .Lcgemm_kernel_L4_M2_100
  1294. .Lcgemm_kernel_L4_M2_42:
  1295. KERNEL2x4_SUB
  1296. subs counterL, counterL, #1
  1297. bgt .Lcgemm_kernel_L4_M2_42
  1298. .Lcgemm_kernel_L4_M2_100:
  1299. SAVE2x4
  1300. .Lcgemm_kernel_L4_M2_END:
  1301. .Lcgemm_kernel_L4_M1_BEGIN:
  1302. tst counterI, #1 // bit 0 of M clear: no 1-wide tile (counterI still holds origM)
  1303. ble .Lcgemm_kernel_L4_END
  1304. .Lcgemm_kernel_L4_M1_20:
  1305. INIT1x4
  1306. mov pB, origPB
  1307. asr counterL , origK, #3 // counterL = K / 8
  1308. cmp counterL , #0
  1309. ble .Lcgemm_kernel_L4_M1_40
  1310. .Lcgemm_kernel_L4_M1_22:
  1311. KERNEL1x4_SUB
  1312. KERNEL1x4_SUB
  1313. KERNEL1x4_SUB
  1314. KERNEL1x4_SUB
  1315. KERNEL1x4_SUB
  1316. KERNEL1x4_SUB
  1317. KERNEL1x4_SUB
  1318. KERNEL1x4_SUB
  1319. subs counterL, counterL, #1
  1320. bgt .Lcgemm_kernel_L4_M1_22
  1321. .Lcgemm_kernel_L4_M1_40:
  1322. ands counterL , origK, #7 // counterL = K % 8
  1323. ble .Lcgemm_kernel_L4_M1_100
  1324. .Lcgemm_kernel_L4_M1_42:
  1325. KERNEL1x4_SUB
  1326. subs counterL, counterL, #1
  1327. bgt .Lcgemm_kernel_L4_M1_42
  1328. .Lcgemm_kernel_L4_M1_100:
  1329. SAVE1x4
  1330. .Lcgemm_kernel_L4_END:
  1331. lsl temp, origK, #5 // temp = K * 32
  1332. add origPB, origPB, temp // B = B + K * 4 * 8
  1333. subs counterJ, counterJ , #1 // j--
  1334. bgt .Lcgemm_kernel_L4_BEGIN
  1335. /******************************************************************************/
  // Remainder in N, part 1: one group of two columns when bit 1 of origN is set.
  // M is covered by 8-wide tiles (loop), then at most one 4-wide, one 2-wide and
  // one 1-wide tile selected by bits 2, 1 and 0 of origM. Every tile runs
  // K / 8 unrolled groups of eight KERNEL*x2_SUB steps plus K % 8 single steps.
  1336. .Lcgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
  1337. mov counterJ , origN
  1338. tst counterJ , #3
  1339. ble .Lcgemm_kernel_L999 // N % 4 == 0: nothing left, finish
  1340. tst counterJ , #2 // bit 1 of N clear: skip the two-column group
  1341. ble .Lcgemm_kernel_L1_BEGIN
  1342. mov pCRow0, pC // pCRow0 = pC
  1343. add pC,pC,LDC, lsl #1 // pC advances by two C lines
  1344. mov pA, origPA // pA = A
  1345. .Lcgemm_kernel_L2_M8_BEGIN:
  1346. mov counterI, origM
  1347. asr counterI, counterI, #3 // counterI = counterI / 8
  1348. cmp counterI, #0
  1349. ble .Lcgemm_kernel_L2_M4_BEGIN
  1350. .Lcgemm_kernel_L2_M8_20:
  1351. INIT8x2
  1352. mov pB, origPB
  1353. asr counterL , origK, #3 // counterL = K / 8
  1354. cmp counterL,#0
  1355. ble .Lcgemm_kernel_L2_M8_40
  1356. .align 5
  1357. .Lcgemm_kernel_L2_M8_22:
  1358. KERNEL8x2_SUB
  1359. KERNEL8x2_SUB
  1360. KERNEL8x2_SUB
  1361. KERNEL8x2_SUB
  1362. KERNEL8x2_SUB
  1363. KERNEL8x2_SUB
  1364. KERNEL8x2_SUB
  1365. KERNEL8x2_SUB
  1366. subs counterL, counterL, #1
  1367. bgt .Lcgemm_kernel_L2_M8_22
  1368. .Lcgemm_kernel_L2_M8_40:
  1369. ands counterL , origK, #7 // counterL = K % 8
  1370. ble .Lcgemm_kernel_L2_M8_100
  1371. .Lcgemm_kernel_L2_M8_42:
  1372. KERNEL8x2_SUB
  1373. subs counterL, counterL, #1
  1374. bgt .Lcgemm_kernel_L2_M8_42
  1375. .Lcgemm_kernel_L2_M8_100:
  1376. SAVE8x2
  1377. .Lcgemm_kernel_L2_M8_END:
  1378. subs counterI, counterI, #1
  1379. bgt .Lcgemm_kernel_L2_M8_20
  1380. .Lcgemm_kernel_L2_M4_BEGIN:
  1381. mov counterI, origM
  1382. tst counterI , #7 // M % 8 == 0: nothing left in M
  1383. ble .Lcgemm_kernel_L2_END
  1384. tst counterI, #4 // bit 2 of M clear: no 4-wide tile
  1385. ble .Lcgemm_kernel_L2_M2_BEGIN
  1386. .Lcgemm_kernel_L2_M4_20:
  1387. INIT4x2
  1388. mov pB, origPB
  1389. asr counterL , origK, #3 // counterL = K / 8
  1390. cmp counterL,#0
  1391. ble .Lcgemm_kernel_L2_M4_40
  1392. .align 5
  1393. .Lcgemm_kernel_L2_M4_22:
  1394. KERNEL4x2_SUB
  1395. KERNEL4x2_SUB
  1396. KERNEL4x2_SUB
  1397. KERNEL4x2_SUB
  1398. KERNEL4x2_SUB
  1399. KERNEL4x2_SUB
  1400. KERNEL4x2_SUB
  1401. KERNEL4x2_SUB
  1402. subs counterL, counterL, #1
  1403. bgt .Lcgemm_kernel_L2_M4_22
  1404. .Lcgemm_kernel_L2_M4_40:
  1405. ands counterL , origK, #7 // counterL = K % 8
  1406. ble .Lcgemm_kernel_L2_M4_100
  1407. .Lcgemm_kernel_L2_M4_42:
  1408. KERNEL4x2_SUB
  1409. subs counterL, counterL, #1
  1410. bgt .Lcgemm_kernel_L2_M4_42
  1411. .Lcgemm_kernel_L2_M4_100:
  1412. SAVE4x2
  1413. .Lcgemm_kernel_L2_M4_END:
  1414. .Lcgemm_kernel_L2_M2_BEGIN:
  1415. mov counterI, origM
  1416. tst counterI , #3 // M % 4 == 0: nothing left in M
  1417. ble .Lcgemm_kernel_L2_END
  1418. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  1419. ble .Lcgemm_kernel_L2_M1_BEGIN
  1420. .Lcgemm_kernel_L2_M2_20:
  1421. INIT2x2
  1422. mov pB, origPB
  1423. asr counterL , origK, #3 // counterL = K / 8
  1424. cmp counterL,#0
  1425. ble .Lcgemm_kernel_L2_M2_40
  1426. .Lcgemm_kernel_L2_M2_22:
  1427. KERNEL2x2_SUB
  1428. KERNEL2x2_SUB
  1429. KERNEL2x2_SUB
  1430. KERNEL2x2_SUB
  1431. KERNEL2x2_SUB
  1432. KERNEL2x2_SUB
  1433. KERNEL2x2_SUB
  1434. KERNEL2x2_SUB
  1435. subs counterL, counterL, #1
  1436. bgt .Lcgemm_kernel_L2_M2_22
  1437. .Lcgemm_kernel_L2_M2_40:
  1438. ands counterL , origK, #7 // counterL = K % 8
  1439. ble .Lcgemm_kernel_L2_M2_100
  1440. .Lcgemm_kernel_L2_M2_42:
  1441. KERNEL2x2_SUB
  1442. subs counterL, counterL, #1
  1443. bgt .Lcgemm_kernel_L2_M2_42
  1444. .Lcgemm_kernel_L2_M2_100:
  1445. SAVE2x2
  1446. .Lcgemm_kernel_L2_M2_END:
  1447. .Lcgemm_kernel_L2_M1_BEGIN:
  1448. tst counterI, #1 // bit 0 of M clear: no 1-wide tile (counterI still holds origM)
  1449. ble .Lcgemm_kernel_L2_END
  1450. .Lcgemm_kernel_L2_M1_20:
  1451. INIT1x2
  1452. mov pB, origPB
  1453. asr counterL , origK, #3 // counterL = K / 8
  1454. cmp counterL, #0
  1455. ble .Lcgemm_kernel_L2_M1_40
  1456. .Lcgemm_kernel_L2_M1_22:
  1457. KERNEL1x2_SUB
  1458. KERNEL1x2_SUB
  1459. KERNEL1x2_SUB
  1460. KERNEL1x2_SUB
  1461. KERNEL1x2_SUB
  1462. KERNEL1x2_SUB
  1463. KERNEL1x2_SUB
  1464. KERNEL1x2_SUB
  1465. subs counterL, counterL, #1
  1466. bgt .Lcgemm_kernel_L2_M1_22
  1467. .Lcgemm_kernel_L2_M1_40:
  1468. ands counterL , origK, #7 // counterL = K % 8
  1469. ble .Lcgemm_kernel_L2_M1_100
  1470. .Lcgemm_kernel_L2_M1_42:
  1471. KERNEL1x2_SUB
  1472. subs counterL, counterL, #1
  1473. bgt .Lcgemm_kernel_L2_M1_42
  1474. .Lcgemm_kernel_L2_M1_100:
  1475. SAVE1x2
  1476. .Lcgemm_kernel_L2_END:
  1477. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1478. /******************************************************************************/
  // Remainder in N, part 2: one final single column when bit 0 of origN is set.
  // M is covered by 8-wide tiles (loop), then at most one 4-wide, one 2-wide and
  // one 1-wide tile selected by bits 2, 1 and 0 of origM. Every tile runs
  // K / 8 unrolled groups of eight KERNEL*x1_SUB steps plus K % 8 single steps.
  1479. .Lcgemm_kernel_L1_BEGIN:
  1480. mov counterJ , origN
  1481. tst counterJ , #1 // bit 0 of N clear: no single column left
  1482. ble .Lcgemm_kernel_L999 // done
  1483. mov pCRow0, pC // pCRow0 = C
  1484. add pC , pC , LDC // Update pC to point to next
  1485. mov pA, origPA // pA = A
  1486. .Lcgemm_kernel_L1_M8_BEGIN:
  1487. mov counterI, origM
  1488. asr counterI, counterI, #3 // counterI = counterI / 8
  1489. cmp counterI, #0
  1490. ble .Lcgemm_kernel_L1_M4_BEGIN
  1491. .Lcgemm_kernel_L1_M8_20:
  1492. INIT8x1
  1493. mov pB, origPB
  1494. asr counterL , origK, #3 // counterL = K / 8
  1495. cmp counterL , #0
  1496. ble .Lcgemm_kernel_L1_M8_40
  1497. .align 5
  1498. .Lcgemm_kernel_L1_M8_22:
  1499. KERNEL8x1_SUB
  1500. KERNEL8x1_SUB
  1501. KERNEL8x1_SUB
  1502. KERNEL8x1_SUB
  1503. KERNEL8x1_SUB
  1504. KERNEL8x1_SUB
  1505. KERNEL8x1_SUB
  1506. KERNEL8x1_SUB
  1507. subs counterL, counterL, #1
  1508. bgt .Lcgemm_kernel_L1_M8_22
  1509. .Lcgemm_kernel_L1_M8_40:
  1510. ands counterL , origK, #7 // counterL = K % 8
  1511. ble .Lcgemm_kernel_L1_M8_100
  1512. .Lcgemm_kernel_L1_M8_42:
  1513. KERNEL8x1_SUB
  1514. subs counterL, counterL, #1
  1515. bgt .Lcgemm_kernel_L1_M8_42
  1516. .Lcgemm_kernel_L1_M8_100:
  1517. SAVE8x1
  1518. .Lcgemm_kernel_L1_M8_END:
  1519. subs counterI, counterI, #1
  1520. bgt .Lcgemm_kernel_L1_M8_20
  1521. .Lcgemm_kernel_L1_M4_BEGIN:
  1522. mov counterI, origM
  1523. tst counterI , #7 // M % 8 == 0: nothing left in M
  1524. ble .Lcgemm_kernel_L1_END
  1525. tst counterI, #4 // bit 2 of M clear: no 4-wide tile
  1526. ble .Lcgemm_kernel_L1_M2_BEGIN
  1527. .Lcgemm_kernel_L1_M4_20:
  1528. INIT4x1
  1529. mov pB, origPB
  1530. asr counterL , origK, #3 // counterL = K / 8
  1531. cmp counterL , #0
  1532. ble .Lcgemm_kernel_L1_M4_40
  1533. .align 5
  1534. .Lcgemm_kernel_L1_M4_22:
  1535. KERNEL4x1_SUB
  1536. KERNEL4x1_SUB
  1537. KERNEL4x1_SUB
  1538. KERNEL4x1_SUB
  1539. KERNEL4x1_SUB
  1540. KERNEL4x1_SUB
  1541. KERNEL4x1_SUB
  1542. KERNEL4x1_SUB
  1543. subs counterL, counterL, #1
  1544. bgt .Lcgemm_kernel_L1_M4_22
  1545. .Lcgemm_kernel_L1_M4_40:
  1546. ands counterL , origK, #7 // counterL = K % 8
  1547. ble .Lcgemm_kernel_L1_M4_100
  1548. .Lcgemm_kernel_L1_M4_42:
  1549. KERNEL4x1_SUB
  1550. subs counterL, counterL, #1
  1551. bgt .Lcgemm_kernel_L1_M4_42
  1552. .Lcgemm_kernel_L1_M4_100:
  1553. SAVE4x1
  1554. .Lcgemm_kernel_L1_M4_END:
  1555. .Lcgemm_kernel_L1_M2_BEGIN:
  1556. mov counterI, origM
  1557. tst counterI , #3 // M % 4 == 0: nothing left in M
  1558. ble .Lcgemm_kernel_L1_END
  1559. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  1560. ble .Lcgemm_kernel_L1_M1_BEGIN
  1561. .Lcgemm_kernel_L1_M2_20:
  1562. INIT2x1
  1563. mov pB, origPB
  1564. asr counterL , origK, #3 // counterL = K / 8
  1565. cmp counterL , #0
  1566. ble .Lcgemm_kernel_L1_M2_40
  1567. .Lcgemm_kernel_L1_M2_22:
  1568. KERNEL2x1_SUB
  1569. KERNEL2x1_SUB
  1570. KERNEL2x1_SUB
  1571. KERNEL2x1_SUB
  1572. KERNEL2x1_SUB
  1573. KERNEL2x1_SUB
  1574. KERNEL2x1_SUB
  1575. KERNEL2x1_SUB
  1576. subs counterL, counterL, #1
  1577. bgt .Lcgemm_kernel_L1_M2_22
  1578. .Lcgemm_kernel_L1_M2_40:
  1579. ands counterL , origK, #7 // counterL = K % 8
  1580. ble .Lcgemm_kernel_L1_M2_100
  1581. .Lcgemm_kernel_L1_M2_42:
  1582. KERNEL2x1_SUB
  1583. subs counterL, counterL, #1
  1584. bgt .Lcgemm_kernel_L1_M2_42
  1585. .Lcgemm_kernel_L1_M2_100:
  1586. SAVE2x1
  1587. .Lcgemm_kernel_L1_M2_END:
  1588. .Lcgemm_kernel_L1_M1_BEGIN:
  1589. tst counterI, #1 // bit 0 of M clear: no 1-wide tile (counterI still holds origM)
  1590. ble .Lcgemm_kernel_L1_END
  1591. .Lcgemm_kernel_L1_M1_20:
  1592. INIT1x1
  1593. mov pB, origPB
  1594. asr counterL , origK, #3 // counterL = K / 8
  1595. cmp counterL , #0
  1596. ble .Lcgemm_kernel_L1_M1_40
  1597. .Lcgemm_kernel_L1_M1_22:
  1598. KERNEL1x1_SUB
  1599. KERNEL1x1_SUB
  1600. KERNEL1x1_SUB
  1601. KERNEL1x1_SUB
  1602. KERNEL1x1_SUB
  1603. KERNEL1x1_SUB
  1604. KERNEL1x1_SUB
  1605. KERNEL1x1_SUB
  1606. subs counterL, counterL, #1
  1607. bgt .Lcgemm_kernel_L1_M1_22
  1608. .Lcgemm_kernel_L1_M1_40:
  1609. ands counterL , origK, #7 // counterL = K % 8
  1610. ble .Lcgemm_kernel_L1_M1_100
  1611. .Lcgemm_kernel_L1_M1_42:
  1612. KERNEL1x1_SUB
  1613. subs counterL, counterL, #1
  1614. bgt .Lcgemm_kernel_L1_M1_42
  1615. .Lcgemm_kernel_L1_M1_100:
  1616. SAVE1x1
  1617. .Lcgemm_kernel_L1_END:
  // Common exit: return 0 in x0, reload every register stored at function entry
  // from the same 11 x 16 byte stack slots, release the save area and return.
  1618. .Lcgemm_kernel_L999:
  1619. mov x0, #0 // set return value
  1620. ldp d8, d9, [sp, #(0 * 16)]
  1621. ldp d10, d11, [sp, #(1 * 16)]
  1622. ldp d12, d13, [sp, #(2 * 16)]
  1623. ldp d14, d15, [sp, #(3 * 16)]
  1624. ldp d16, d17, [sp, #(4 * 16)]
  1625. ldp x18, x19, [sp, #(5 * 16)]
  1626. ldp x20, x21, [sp, #(6 * 16)]
  1627. ldp x22, x23, [sp, #(7 * 16)]
  1628. ldp x24, x25, [sp, #(8 * 16)]
  1629. ldp x26, x27, [sp, #(9 * 16)]
  1630. ldr x28, [sp, #(10 * 16)]
  1631. add sp, sp, #(11*16) // release the save area
  1632. ret
  1633. EPILOGUE