You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_8x4.S 42 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  /* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc) */
  /* Integer register aliases. x0-x6 are the incoming arguments listed in
     the prototype comment above; x7-x15 are working registers (see the
     numbered register map further down). Comments are kept on their own
     lines so they can never become part of a macro expansion. */
  #define origM x0
  #define origN x1
  #define origK x2
  #define origPA x3
  #define origPB x4
  #define pC x5
  #define LDC x6
  #define temp x7
  #define counterL x8
  #define counterI x9
  #define counterJ x10
  #define pB x11
  #define pCRow0 x12
  #define pCRow1 x13
  #define pCRow2 x14
  #define pA x15
  /* Alpha aliases. Each value has a scalar name (sNN) and a lane-0
     by-element name (vNN.s[0]) that refer to the same SIMD register; the
     SAVE* macros use the vNN.s[0] forms as fmla/fmls multipliers.
     NOTE(review): the code that loads alpha into v10/v11/v14/v15 is not in
     this part of the file; alpha1_* presumably mirror alpha0_* - confirm. */
  #define alpha0_R s10
  #define alphaV0_R v10.s[0]
  #define alpha0_I s11
  #define alphaV0_I v11.s[0]
  #define alpha1_R s14
  #define alphaV1_R v14.s[0]
  #define alpha1_I s15
  #define alphaV1_I v15.s[0]
  /*
   * Accumulate instruction used for each of the four partial products of a
   * complex multiply-accumulate. In the KERNEL* macros the products are
   *   OP_rr: acc_re (op)= a_re * b_re      OP_ii: acc_re (op)= a_im * b_im
   *   OP_ri: acc_im (op)= a_re * b_im      OP_ir: acc_im (op)= a_im * b_re
   * where fmla adds the product and fmls subtracts it.
   *
   *   variant group     OP_rr  OP_ii  OP_ri  OP_ir
   *   NN NT TN TT       fmla   fmls   fmla   fmla
   *   NR NC TR TC       fmla   fmla   fmls   fmla
   *   RN RT CN CT       fmla   fmla   fmla   fmls
   *   RR RC CR CC       fmla   fmls   fmls   fmls
   *
   * NOTE(review): the variant macros are presumably supplied by the build
   * (transpose/conjugate flavours of the kernel) - confirm against the
   * build rules. If none is defined the OP_* names would reach the
   * assembler as unknown mnemonics, so fail early with a clear message.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define OP_rr fmla
  #define OP_ii fmls
  #define OP_ri fmla
  #define OP_ir fmla
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define OP_rr fmla
  #define OP_ii fmla
  #define OP_ri fmls
  #define OP_ir fmla
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define OP_rr fmla
  #define OP_ii fmla
  #define OP_ri fmla
  #define OP_ir fmls
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define OP_rr fmla
  #define OP_ii fmls
  #define OP_ri fmls
  #define OP_ir fmls
  #else
  #error "cgemm kernel: no variant macro (NN, NT, ..., CC) defined; cannot select OP_rr/OP_ii/OP_ri/OP_ir"
  #endif
  76. // 00 origM
  77. // 01 origN
  78. // 02 origK
  79. // 03 origPA
  80. // 04 origPB
  81. // 05 pC
  82. // 06 origLDC -> LDC
  83. // 07 offset -> temp
  84. // 08 counterL
  85. // 09 counterI
  86. // 10 counterJ
  87. // 11 pB
  88. // 12 pCRow0
  89. // 13 pCRow1
  90. // 14 pCRow2
  91. // 15 pA
  92. // 16
  93. // 17
  94. // 18 must save
  95. // 19 must save
  96. // 20 must save
  97. // 21 must save
  98. // 22 must save
  99. // 23 must save
  100. // 24 must save
  101. // 25 must save
  102. // 26 must save
  103. // 27 must save
  104. // 28 must save
  105. // 29 frame
  106. // 30 link
  107. // 31 sp
  108. //v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
  109. //v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
  110. //v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
  111. //v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
  112. //v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
  113. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
  114. //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
  115. //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
  116. //v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
  117. //v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
  118. //v10 must save ALPHA0_R
  119. //v11 must save ALPHA0_I
  120. //v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
  121. //v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
  122. //v14 must save ALPHA1_R
  123. //v15 must save ALPHA1_I
  124. //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
  125. //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
  126. //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
  127. //v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
  128. //v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
  129. //v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
  130. //v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
  131. //v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
  132. //v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
  133. //v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
  134. //v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
  135. //v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
  136. //v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
  137. //v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
  138. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
  139. //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
  140. /*******************************************************************************
  141. * Macro definitions
  142. *******************************************************************************/
  /*
   * INIT8x4: clear the sixteen accumulators v16-v31 of the 8x4 tile.
   * Every register is zeroed directly; the full 128 bits end up zero.
   */
  .macro INIT8x4
      movi v16.4s, #0
      movi v17.4s, #0
      movi v18.4s, #0
      movi v19.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
      movi v22.4s, #0
      movi v23.4s, #0
      movi v24.4s, #0
      movi v25.4s, #0
      movi v26.4s, #0
      movi v27.4s, #0
      movi v28.4s, #0
      movi v29.4s, #0
      movi v30.4s, #0
      movi v31.4s, #0
  .endm
  /*
   * KERNEL8x4_I: opening k-step of the 8x4 tile.
   *
   * Loads one step of B (4 complex values, de-interleaved by ld2 into
   * v8 = real parts, v9 = imaginary parts) and of A (8 complex values in
   * v0/v1 and v2/v3), then WRITES the accumulators v16-v31 with the first
   * products (fmul instead of an accumulate, so no prior zeroing is needed).
   * Finally loads the next step into the second register set
   * (v12/v13 for B, v4-v7 for A), which KERNEL8x4_M2 / KERNEL8x4_E consume.
   *
   * In the variants where OP_ri is fmls the first imaginary product must be
   * negated; that is done by clearing the accumulator (eor) and using fmls.
   *
   * pA advances by 128 bytes and pB by 64 bytes in total. Pointer bumps use
   * post-indexed loads instead of separate add instructions.
   */
  .macro KERNEL8x4_I
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.4s, v1.4s}, [pA], #32
      ld2 {v2.4s, v3.4s}, [pA], #32

      // B element 0, A elements 0-3
      fmul v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v17.16b, v17.16b, v17.16b
      fmls v17.4s, v0.4s, v9.s[0]
  #else
      fmul v17.4s, v0.4s, v9.s[0]
  #endif
      OP_ir v17.4s, v1.4s, v8.s[0]

      // B element 0, A elements 4-7
      fmul v18.4s, v2.4s, v8.s[0]
      OP_ii v18.4s, v3.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v19.16b, v19.16b, v19.16b
      fmls v19.4s, v2.4s, v9.s[0]
  #else
      fmul v19.4s, v2.4s, v9.s[0]
  #endif
      OP_ir v19.4s, v3.4s, v8.s[0]

      // B element 1, A elements 0-3
      fmul v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v21.16b, v21.16b, v21.16b
      fmls v21.4s, v0.4s, v9.s[1]
  #else
      fmul v21.4s, v0.4s, v9.s[1]
  #endif
      OP_ir v21.4s, v1.4s, v8.s[1]

      // B element 1, A elements 4-7
      fmul v22.4s, v2.4s, v8.s[1]
      OP_ii v22.4s, v3.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v23.16b, v23.16b, v23.16b
      fmls v23.4s, v2.4s, v9.s[1]
  #else
      fmul v23.4s, v2.4s, v9.s[1]
  #endif
      OP_ir v23.4s, v3.4s, v8.s[1]

      // B element 2, A elements 0-3
      fmul v24.4s, v0.4s, v8.s[2]
      OP_ii v24.4s, v1.4s, v9.s[2]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v25.16b, v25.16b, v25.16b
      fmls v25.4s, v0.4s, v9.s[2]
  #else
      fmul v25.4s, v0.4s, v9.s[2]
  #endif
      OP_ir v25.4s, v1.4s, v8.s[2]

      // B element 2, A elements 4-7
      fmul v26.4s, v2.4s, v8.s[2]
      OP_ii v26.4s, v3.4s, v9.s[2]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v27.16b, v27.16b, v27.16b
      fmls v27.4s, v2.4s, v9.s[2]
  #else
      fmul v27.4s, v2.4s, v9.s[2]
  #endif
      OP_ir v27.4s, v3.4s, v8.s[2]

      // B element 3, A elements 0-3
      fmul v28.4s, v0.4s, v8.s[3]
      OP_ii v28.4s, v1.4s, v9.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v29.16b, v29.16b, v29.16b
      fmls v29.4s, v0.4s, v9.s[3]
  #else
      fmul v29.4s, v0.4s, v9.s[3]
  #endif
      OP_ir v29.4s, v1.4s, v8.s[3]

      // B element 3, A elements 4-7
      fmul v30.4s, v2.4s, v8.s[3]
      OP_ii v30.4s, v3.4s, v9.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v31.16b, v31.16b, v31.16b
      fmls v31.4s, v2.4s, v9.s[3]
  #else
      fmul v31.4s, v2.4s, v9.s[3]
  #endif
      OP_ir v31.4s, v3.4s, v8.s[3]

      // Operands for the next step go into the second register set.
      ld2 {v12.4s, v13.4s}, [pB], #32
      ld2 {v4.4s, v5.4s}, [pA], #32
      ld2 {v6.4s, v7.4s}, [pA], #32
  .endm
  /*
   * KERNEL8x4_M1: one k-step of the 8x4 tile using the first register set
   * (A in v0-v3, B in v8/v9), accumulating into v16-v31. Afterwards it
   * loads the following step into the second set (A in v4-v7, B in
   * v12/v13). pA advances by 64 bytes and pB by 32 bytes; the bumps are
   * folded into post-indexed loads.
   */
  .macro KERNEL8x4_M1
      OP_rr v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
      OP_ri v17.4s, v0.4s, v9.s[0]
      OP_ir v17.4s, v1.4s, v8.s[0]

      OP_rr v18.4s, v2.4s, v8.s[0]
      OP_ii v18.4s, v3.4s, v9.s[0]
      OP_ri v19.4s, v2.4s, v9.s[0]
      OP_ir v19.4s, v3.4s, v8.s[0]

      OP_rr v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
      OP_ri v21.4s, v0.4s, v9.s[1]
      OP_ir v21.4s, v1.4s, v8.s[1]

      OP_rr v22.4s, v2.4s, v8.s[1]
      OP_ii v22.4s, v3.4s, v9.s[1]
      OP_ri v23.4s, v2.4s, v9.s[1]
      OP_ir v23.4s, v3.4s, v8.s[1]

      OP_rr v24.4s, v0.4s, v8.s[2]
      OP_ii v24.4s, v1.4s, v9.s[2]
      OP_ri v25.4s, v0.4s, v9.s[2]
      OP_ir v25.4s, v1.4s, v8.s[2]

      OP_rr v26.4s, v2.4s, v8.s[2]
      OP_ii v26.4s, v3.4s, v9.s[2]
      OP_ri v27.4s, v2.4s, v9.s[2]
      OP_ir v27.4s, v3.4s, v8.s[2]

      OP_rr v28.4s, v0.4s, v8.s[3]
      OP_ii v28.4s, v1.4s, v9.s[3]
      OP_ri v29.4s, v0.4s, v9.s[3]
      OP_ir v29.4s, v1.4s, v8.s[3]

      OP_rr v30.4s, v2.4s, v8.s[3]
      OP_ii v30.4s, v3.4s, v9.s[3]
      OP_ri v31.4s, v2.4s, v9.s[3]
      OP_ir v31.4s, v3.4s, v8.s[3]

      // For next round
      ld2 {v12.4s, v13.4s}, [pB], #32
      ld2 {v4.4s, v5.4s}, [pA], #32
      ld2 {v6.4s, v7.4s}, [pA], #32
  .endm
  /*
   * KERNEL8x4_M2: one k-step of the 8x4 tile using the second register set
   * (A in v4-v7, B in v12/v13), accumulating into v16-v31. Afterwards it
   * reloads the first set (A in v0-v3, B in v8/v9) for the following step.
   * pA advances by 64 bytes and pB by 32 bytes; the bumps are folded into
   * post-indexed loads.
   */
  .macro KERNEL8x4_M2
      OP_rr v16.4s, v4.4s, v12.s[0]
      OP_ii v16.4s, v5.4s, v13.s[0]
      OP_ri v17.4s, v4.4s, v13.s[0]
      OP_ir v17.4s, v5.4s, v12.s[0]

      OP_rr v18.4s, v6.4s, v12.s[0]
      OP_ii v18.4s, v7.4s, v13.s[0]
      OP_ri v19.4s, v6.4s, v13.s[0]
      OP_ir v19.4s, v7.4s, v12.s[0]

      OP_rr v20.4s, v4.4s, v12.s[1]
      OP_ii v20.4s, v5.4s, v13.s[1]
      OP_ri v21.4s, v4.4s, v13.s[1]
      OP_ir v21.4s, v5.4s, v12.s[1]

      OP_rr v22.4s, v6.4s, v12.s[1]
      OP_ii v22.4s, v7.4s, v13.s[1]
      OP_ri v23.4s, v6.4s, v13.s[1]
      OP_ir v23.4s, v7.4s, v12.s[1]

      OP_rr v24.4s, v4.4s, v12.s[2]
      OP_ii v24.4s, v5.4s, v13.s[2]
      OP_ri v25.4s, v4.4s, v13.s[2]
      OP_ir v25.4s, v5.4s, v12.s[2]

      OP_rr v26.4s, v6.4s, v12.s[2]
      OP_ii v26.4s, v7.4s, v13.s[2]
      OP_ri v27.4s, v6.4s, v13.s[2]
      OP_ir v27.4s, v7.4s, v12.s[2]

      OP_rr v28.4s, v4.4s, v12.s[3]
      OP_ii v28.4s, v5.4s, v13.s[3]
      OP_ri v29.4s, v4.4s, v13.s[3]
      OP_ir v29.4s, v5.4s, v12.s[3]

      OP_rr v30.4s, v6.4s, v12.s[3]
      OP_ii v30.4s, v7.4s, v13.s[3]
      OP_ri v31.4s, v6.4s, v13.s[3]
      OP_ir v31.4s, v7.4s, v12.s[3]

      // Refill the first register set for the next step.
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.4s, v1.4s}, [pA], #32
      ld2 {v2.4s, v3.4s}, [pA], #32
  .endm
  /*
   * KERNEL8x4_E: closing k-step of the 8x4 tile. Same arithmetic as
   * KERNEL8x4_M2 (A in v4-v7, B in v12/v13, accumulators v16-v31) but with
   * no loads, so pA and pB are left unchanged.
   */
  .macro KERNEL8x4_E
      // B element 0: A elements 0-3 into v16/v17, A elements 4-7 into v18/v19
      OP_rr v16.4s, v4.4s, v12.s[0]
      OP_ii v16.4s, v5.4s, v13.s[0]
      OP_ri v17.4s, v4.4s, v13.s[0]
      OP_ir v17.4s, v5.4s, v12.s[0]
      OP_rr v18.4s, v6.4s, v12.s[0]
      OP_ii v18.4s, v7.4s, v13.s[0]
      OP_ri v19.4s, v6.4s, v13.s[0]
      OP_ir v19.4s, v7.4s, v12.s[0]
      // B element 1
      OP_rr v20.4s, v4.4s, v12.s[1]
      OP_ii v20.4s, v5.4s, v13.s[1]
      OP_ri v21.4s, v4.4s, v13.s[1]
      OP_ir v21.4s, v5.4s, v12.s[1]
      OP_rr v22.4s, v6.4s, v12.s[1]
      OP_ii v22.4s, v7.4s, v13.s[1]
      OP_ri v23.4s, v6.4s, v13.s[1]
      OP_ir v23.4s, v7.4s, v12.s[1]
      // B element 2
      OP_rr v24.4s, v4.4s, v12.s[2]
      OP_ii v24.4s, v5.4s, v13.s[2]
      OP_ri v25.4s, v4.4s, v13.s[2]
      OP_ir v25.4s, v5.4s, v12.s[2]
      OP_rr v26.4s, v6.4s, v12.s[2]
      OP_ii v26.4s, v7.4s, v13.s[2]
      OP_ri v27.4s, v6.4s, v13.s[2]
      OP_ir v27.4s, v7.4s, v12.s[2]
      // B element 3
      OP_rr v28.4s, v4.4s, v12.s[3]
      OP_ii v28.4s, v5.4s, v13.s[3]
      OP_ri v29.4s, v4.4s, v13.s[3]
      OP_ir v29.4s, v5.4s, v12.s[3]
      OP_rr v30.4s, v6.4s, v12.s[3]
      OP_ii v30.4s, v7.4s, v13.s[3]
      OP_ri v31.4s, v6.4s, v13.s[3]
      OP_ir v31.4s, v7.4s, v12.s[3]
  .endm
  /*
   * KERNEL8x4_SUB: self-contained k-step of the 8x4 tile. Loads one step of
   * B (v8 = real parts, v9 = imaginary parts) and A (v0/v1, v2/v3), then
   * accumulates all 32 complex products into v16-v31. Nothing is preloaded
   * for a following step. pA advances by 64 bytes and pB by 32 bytes; the
   * bumps are folded into post-indexed loads.
   */
  .macro KERNEL8x4_SUB
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.4s, v1.4s}, [pA], #32
      ld2 {v2.4s, v3.4s}, [pA], #32

      OP_rr v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
      OP_ri v17.4s, v0.4s, v9.s[0]
      OP_ir v17.4s, v1.4s, v8.s[0]

      OP_rr v18.4s, v2.4s, v8.s[0]
      OP_ii v18.4s, v3.4s, v9.s[0]
      OP_ri v19.4s, v2.4s, v9.s[0]
      OP_ir v19.4s, v3.4s, v8.s[0]

      OP_rr v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
      OP_ri v21.4s, v0.4s, v9.s[1]
      OP_ir v21.4s, v1.4s, v8.s[1]

      OP_rr v22.4s, v2.4s, v8.s[1]
      OP_ii v22.4s, v3.4s, v9.s[1]
      OP_ri v23.4s, v2.4s, v9.s[1]
      OP_ir v23.4s, v3.4s, v8.s[1]

      OP_rr v24.4s, v0.4s, v8.s[2]
      OP_ii v24.4s, v1.4s, v9.s[2]
      OP_ri v25.4s, v0.4s, v9.s[2]
      OP_ir v25.4s, v1.4s, v8.s[2]

      OP_rr v26.4s, v2.4s, v8.s[2]
      OP_ii v26.4s, v3.4s, v9.s[2]
      OP_ri v27.4s, v2.4s, v9.s[2]
      OP_ir v27.4s, v3.4s, v8.s[2]

      OP_rr v28.4s, v0.4s, v8.s[3]
      OP_ii v28.4s, v1.4s, v9.s[3]
      OP_ri v29.4s, v0.4s, v9.s[3]
      OP_ir v29.4s, v1.4s, v8.s[3]

      OP_rr v30.4s, v2.4s, v8.s[3]
      OP_ii v30.4s, v3.4s, v9.s[3]
      OP_ri v31.4s, v2.4s, v9.s[3]
      OP_ir v31.4s, v3.4s, v8.s[3]
  .endm
  /*
   * SAVE8x4: fold the 8x4 accumulator tile (v16-v31) into C.
   *
   * For each of the four lines of C (starting at pCRow0, stepping by LDC)
   * and each half of 4 complex values (offsets 0 and 32 bytes):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * ld2/st2 de-interleave and re-interleave the real and imaginary parts.
   * LDC is added to the pointer as-is, so it must already be a byte stride
   * (the scaling is outside this view). On exit pCRow0 has advanced by
   * 64 bytes; pCRow1, pCRow2 and v0-v7 are clobbered.
   */
  .macro SAVE8x4
      mov pCRow1, pCRow0
      // line 0, elements 0-3
      ld2 {v0.4s, v1.4s}, [pCRow1]
      fmla v0.4s, v16.4s, alphaV0_R
      fmls v0.4s, v17.4s, alphaV0_I
      fmla v1.4s, v16.4s, alphaV1_I
      fmla v1.4s, v17.4s, alphaV1_R
      st2 {v0.4s, v1.4s}, [pCRow1]
      // line 0, elements 4-7
      add pCRow2, pCRow1, #32
      ld2 {v2.4s, v3.4s}, [pCRow2]
      fmla v2.4s, v18.4s, alphaV0_R
      fmls v2.4s, v19.4s, alphaV0_I
      fmla v3.4s, v18.4s, alphaV1_I
      fmla v3.4s, v19.4s, alphaV1_R
      st2 {v2.4s, v3.4s}, [pCRow2]
      // line 1, elements 0-3
      add pCRow1, pCRow1, LDC
      ld2 {v4.4s, v5.4s}, [pCRow1]
      fmla v4.4s, v20.4s, alphaV0_R
      fmls v4.4s, v21.4s, alphaV0_I
      fmla v5.4s, v20.4s, alphaV1_I
      fmla v5.4s, v21.4s, alphaV1_R
      st2 {v4.4s, v5.4s}, [pCRow1]
      // line 1, elements 4-7
      add pCRow2, pCRow1, #32
      ld2 {v6.4s, v7.4s}, [pCRow2]
      fmla v6.4s, v22.4s, alphaV0_R
      fmls v6.4s, v23.4s, alphaV0_I
      fmla v7.4s, v22.4s, alphaV1_I
      fmla v7.4s, v23.4s, alphaV1_R
      st2 {v6.4s, v7.4s}, [pCRow2]
      // line 2, elements 0-3
      add pCRow1, pCRow1, LDC
      ld2 {v0.4s, v1.4s}, [pCRow1]
      fmla v0.4s, v24.4s, alphaV0_R
      fmls v0.4s, v25.4s, alphaV0_I
      fmla v1.4s, v24.4s, alphaV1_I
      fmla v1.4s, v25.4s, alphaV1_R
      st2 {v0.4s, v1.4s}, [pCRow1]
      // line 2, elements 4-7
      add pCRow2, pCRow1, #32
      ld2 {v2.4s, v3.4s}, [pCRow2]
      fmla v2.4s, v26.4s, alphaV0_R
      fmls v2.4s, v27.4s, alphaV0_I
      fmla v3.4s, v26.4s, alphaV1_I
      fmla v3.4s, v27.4s, alphaV1_R
      st2 {v2.4s, v3.4s}, [pCRow2]
      // line 3, elements 0-3
      add pCRow1, pCRow1, LDC
      ld2 {v4.4s, v5.4s}, [pCRow1]
      fmla v4.4s, v28.4s, alphaV0_R
      fmls v4.4s, v29.4s, alphaV0_I
      fmla v5.4s, v28.4s, alphaV1_I
      fmla v5.4s, v29.4s, alphaV1_R
      st2 {v4.4s, v5.4s}, [pCRow1]
      // line 3, elements 4-7
      add pCRow2, pCRow1, #32
      ld2 {v6.4s, v7.4s}, [pCRow2]
      fmla v6.4s, v30.4s, alphaV0_R
      fmls v6.4s, v31.4s, alphaV0_I
      fmla v7.4s, v30.4s, alphaV1_I
      fmla v7.4s, v31.4s, alphaV1_R
      st2 {v6.4s, v7.4s}, [pCRow2]
      // step to the next 8 complex values (8 * 8 bytes)
      add pCRow0, pCRow0, #64
  .endm
  468. /******************************************************************************/
  /*
   * INIT4x4: clear the eight accumulators of the 4x4 tile
   * (v16/v17, v20/v21, v24/v25, v28/v29). Full 128-bit registers are zeroed.
   */
  .macro INIT4x4
      movi v16.4s, #0
      movi v17.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
      movi v24.4s, #0
      movi v25.4s, #0
      movi v28.4s, #0
      movi v29.4s, #0
  .endm
  /*
   * KERNEL4x4_I: opening k-step of the 4x4 tile.
   *
   * Loads B (v8 = real parts, v9 = imaginary parts) and A (v0/v1), WRITES
   * the accumulators v16/v17, v20/v21, v24/v25, v28/v29 with the first
   * products (fmul, so no prior zeroing is needed), then loads the next step
   * into the second register set (B in v12/v13, A in v4/v5).
   *
   * In the variants where OP_ri is fmls the first imaginary product must be
   * negated; that is done by clearing the accumulator (eor) and using fmls.
   *
   * pA and pB each advance by 64 bytes in total; the bumps are folded into
   * post-indexed loads.
   */
  .macro KERNEL4x4_I
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.4s, v1.4s}, [pA], #32

      fmul v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v17.16b, v17.16b, v17.16b
      fmls v17.4s, v0.4s, v9.s[0]
  #else
      fmul v17.4s, v0.4s, v9.s[0]
  #endif
      OP_ir v17.4s, v1.4s, v8.s[0]

      fmul v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v21.16b, v21.16b, v21.16b
      fmls v21.4s, v0.4s, v9.s[1]
  #else
      fmul v21.4s, v0.4s, v9.s[1]
  #endif
      OP_ir v21.4s, v1.4s, v8.s[1]

      fmul v24.4s, v0.4s, v8.s[2]
      OP_ii v24.4s, v1.4s, v9.s[2]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v25.16b, v25.16b, v25.16b
      fmls v25.4s, v0.4s, v9.s[2]
  #else
      fmul v25.4s, v0.4s, v9.s[2]
  #endif
      OP_ir v25.4s, v1.4s, v8.s[2]

      fmul v28.4s, v0.4s, v8.s[3]
      OP_ii v28.4s, v1.4s, v9.s[3]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor v29.16b, v29.16b, v29.16b
      fmls v29.4s, v0.4s, v9.s[3]
  #else
      fmul v29.4s, v0.4s, v9.s[3]
  #endif
      OP_ir v29.4s, v1.4s, v8.s[3]

      // Operands for the next step go into the second register set.
      ld2 {v12.4s, v13.4s}, [pB], #32
      ld2 {v4.4s, v5.4s}, [pA], #32
  .endm
  /*
   * KERNEL4x4_M1: one k-step of the 4x4 tile using the first register set
   * (A in v0/v1, B in v8/v9). The loads that fill the second set (B in
   * v12/v13, A in v4/v5) and a prefetch 512 bytes ahead of the updated pA
   * are interleaved with the arithmetic. pA and pB each advance by
   * 32 bytes; the bumps are folded into post-indexed loads.
   */
  .macro KERNEL4x4_M1
      OP_rr v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
      OP_ri v17.4s, v0.4s, v9.s[0]
      OP_ir v17.4s, v1.4s, v8.s[0]

      ld2 {v12.4s, v13.4s}, [pB], #32 // For next round

      OP_rr v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
      OP_ri v21.4s, v0.4s, v9.s[1]
      OP_ir v21.4s, v1.4s, v8.s[1]

      ld2 {v4.4s, v5.4s}, [pA], #32 // For next round

      OP_rr v24.4s, v0.4s, v8.s[2]
      OP_ii v24.4s, v1.4s, v9.s[2]
      OP_ri v25.4s, v0.4s, v9.s[2]
      OP_ir v25.4s, v1.4s, v8.s[2]

      prfm PLDL1KEEP, [pA, #512]

      OP_rr v28.4s, v0.4s, v8.s[3]
      OP_ii v28.4s, v1.4s, v9.s[3]
      OP_ri v29.4s, v0.4s, v9.s[3]
      OP_ir v29.4s, v1.4s, v8.s[3]
  .endm
  /*
   * KERNEL4x4_M2: one k-step of the 4x4 tile using the second register set
   * (A in v4/v5, B in v12/v13). The loads that refill the first set (B in
   * v8/v9, A in v0/v1) and a prefetch 512 bytes ahead of the updated pB are
   * interleaved with the arithmetic. pA and pB each advance by 32 bytes;
   * the bumps are folded into post-indexed loads.
   */
  .macro KERNEL4x4_M2
      OP_rr v16.4s, v4.4s, v12.s[0]
      OP_ii v16.4s, v5.4s, v13.s[0]
      OP_ri v17.4s, v4.4s, v13.s[0]
      OP_ir v17.4s, v5.4s, v12.s[0]

      ld2 {v8.4s, v9.4s}, [pB], #32 // For next round

      OP_rr v20.4s, v4.4s, v12.s[1]
      OP_ii v20.4s, v5.4s, v13.s[1]
      OP_ri v21.4s, v4.4s, v13.s[1]
      OP_ir v21.4s, v5.4s, v12.s[1]

      ld2 {v0.4s, v1.4s}, [pA], #32 // For next round

      OP_rr v24.4s, v4.4s, v12.s[2]
      OP_ii v24.4s, v5.4s, v13.s[2]
      OP_ri v25.4s, v4.4s, v13.s[2]
      OP_ir v25.4s, v5.4s, v12.s[2]

      prfm PLDL1KEEP, [pB, #512]

      OP_rr v28.4s, v4.4s, v12.s[3]
      OP_ii v28.4s, v5.4s, v13.s[3]
      OP_ri v29.4s, v4.4s, v13.s[3]
      OP_ir v29.4s, v5.4s, v12.s[3]
  .endm
  /*
   * KERNEL4x4_E: closing k-step of the 4x4 tile. Accumulates the products
   * of the second register set (A in v4/v5, B in v12/v13) into
   * v16/v17, v20/v21, v24/v25, v28/v29. No loads, so pA and pB are
   * unchanged.
   */
  .macro KERNEL4x4_E
      // B element 0
      OP_rr v16.4s, v4.4s, v12.s[0]
      OP_ii v16.4s, v5.4s, v13.s[0]
      OP_ri v17.4s, v4.4s, v13.s[0]
      OP_ir v17.4s, v5.4s, v12.s[0]
      // B element 1
      OP_rr v20.4s, v4.4s, v12.s[1]
      OP_ii v20.4s, v5.4s, v13.s[1]
      OP_ri v21.4s, v4.4s, v13.s[1]
      OP_ir v21.4s, v5.4s, v12.s[1]
      // B element 2
      OP_rr v24.4s, v4.4s, v12.s[2]
      OP_ii v24.4s, v5.4s, v13.s[2]
      OP_ri v25.4s, v4.4s, v13.s[2]
      OP_ir v25.4s, v5.4s, v12.s[2]
      // B element 3
      OP_rr v28.4s, v4.4s, v12.s[3]
      OP_ii v28.4s, v5.4s, v13.s[3]
      OP_ri v29.4s, v4.4s, v13.s[3]
      OP_ir v29.4s, v5.4s, v12.s[3]
  .endm
  /*
   * KERNEL4x4_SUB: self-contained k-step of the 4x4 tile. Loads B
   * (v8 = real parts, v9 = imaginary parts) and A (v0/v1) and accumulates
   * the 16 complex products into v16/v17, v20/v21, v24/v25, v28/v29.
   * pA and pB each advance by 32 bytes; the bumps are folded into
   * post-indexed loads.
   */
  .macro KERNEL4x4_SUB
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.4s, v1.4s}, [pA], #32

      OP_rr v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
      OP_ri v17.4s, v0.4s, v9.s[0]
      OP_ir v17.4s, v1.4s, v8.s[0]

      OP_rr v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
      OP_ri v21.4s, v0.4s, v9.s[1]
      OP_ir v21.4s, v1.4s, v8.s[1]

      OP_rr v24.4s, v0.4s, v8.s[2]
      OP_ii v24.4s, v1.4s, v9.s[2]
      OP_ri v25.4s, v0.4s, v9.s[2]
      OP_ir v25.4s, v1.4s, v8.s[2]

      OP_rr v28.4s, v0.4s, v8.s[3]
      OP_ii v28.4s, v1.4s, v9.s[3]
      OP_ri v29.4s, v0.4s, v9.s[3]
      OP_ir v29.4s, v1.4s, v8.s[3]
  .endm
  /*
   * SAVE4x4: fold the 4x4 accumulator tile into C.
   *
   * For each of the four lines of C (starting at pCRow0, stepping by LDC):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * LDC is added to the pointer as-is, so it must already be a byte stride
   * (the scaling is outside this view). On exit pCRow0 has advanced by
   * 32 bytes; pCRow1, v0, v1, v4 and v5 are clobbered.
   */
  .macro SAVE4x4
      mov pCRow1, pCRow0
      // line 0: accumulators v16 (re) / v17 (im)
      ld2 {v0.4s, v1.4s}, [pCRow1]
      fmla v0.4s, v16.4s, alphaV0_R
      fmls v0.4s, v17.4s, alphaV0_I
      fmla v1.4s, v16.4s, alphaV1_I
      fmla v1.4s, v17.4s, alphaV1_R
      st2 {v0.4s, v1.4s}, [pCRow1]
      // line 1: accumulators v20 / v21
      add pCRow1, pCRow1, LDC
      ld2 {v4.4s, v5.4s}, [pCRow1]
      fmla v4.4s, v20.4s, alphaV0_R
      fmls v4.4s, v21.4s, alphaV0_I
      fmla v5.4s, v20.4s, alphaV1_I
      fmla v5.4s, v21.4s, alphaV1_R
      st2 {v4.4s, v5.4s}, [pCRow1]
      // line 2: accumulators v24 / v25
      add pCRow1, pCRow1, LDC
      ld2 {v0.4s, v1.4s}, [pCRow1]
      fmla v0.4s, v24.4s, alphaV0_R
      fmls v0.4s, v25.4s, alphaV0_I
      fmla v1.4s, v24.4s, alphaV1_I
      fmla v1.4s, v25.4s, alphaV1_R
      st2 {v0.4s, v1.4s}, [pCRow1]
      // line 3: accumulators v28 / v29
      add pCRow1, pCRow1, LDC
      ld2 {v4.4s, v5.4s}, [pCRow1]
      fmla v4.4s, v28.4s, alphaV0_R
      fmls v4.4s, v29.4s, alphaV0_I
      fmla v5.4s, v28.4s, alphaV1_I
      fmla v5.4s, v29.4s, alphaV1_R
      st2 {v4.4s, v5.4s}, [pCRow1]
      // step to the next 4 complex values (4 * 8 bytes)
      add pCRow0, pCRow0, #32
  .endm
  646. /******************************************************************************/
  /*
   * INIT2x4: clear the eight accumulators of the 2x4 tile
   * (v16/v17, v20/v21, v24/v25, v28/v29). Full 128-bit registers are zeroed.
   */
  .macro INIT2x4
      movi v16.4s, #0
      movi v17.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
      movi v24.4s, #0
      movi v25.4s, #0
      movi v28.4s, #0
      movi v29.4s, #0
  .endm
  /*
   * KERNEL2x4_SUB: one k-step of the 2x4 tile. Loads 4 complex values of B
   * (v8 = real parts, v9 = imaginary parts) and 2 complex values of A
   * (v0/v1, two lanes each) and accumulates into the low two lanes of
   * v16/v17, v20/v21, v24/v25, v28/v29. pB advances by 32 bytes and pA by
   * 16 bytes; the bumps are folded into post-indexed loads.
   */
  .macro KERNEL2x4_SUB
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.2s, v1.2s}, [pA], #16

      OP_rr v16.2s, v0.2s, v8.s[0]
      OP_ii v16.2s, v1.2s, v9.s[0]
      OP_ri v17.2s, v0.2s, v9.s[0]
      OP_ir v17.2s, v1.2s, v8.s[0]

      OP_rr v20.2s, v0.2s, v8.s[1]
      OP_ii v20.2s, v1.2s, v9.s[1]
      OP_ri v21.2s, v0.2s, v9.s[1]
      OP_ir v21.2s, v1.2s, v8.s[1]

      OP_rr v24.2s, v0.2s, v8.s[2]
      OP_ii v24.2s, v1.2s, v9.s[2]
      OP_ri v25.2s, v0.2s, v9.s[2]
      OP_ir v25.2s, v1.2s, v8.s[2]

      OP_rr v28.2s, v0.2s, v8.s[3]
      OP_ii v28.2s, v1.2s, v9.s[3]
      OP_ri v29.2s, v0.2s, v9.s[3]
      OP_ir v29.2s, v1.2s, v8.s[3]
  .endm
  /*
   * SAVE2x4: fold the 2x4 accumulator tile into C (two complex values per
   * line, four lines starting at pCRow0 and stepping by LDC):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * LDC is added to the pointer as-is, so it must already be a byte stride
   * (the scaling is outside this view). On exit pCRow0 has advanced by
   * 16 bytes; pCRow1, v0, v1, v4 and v5 are clobbered.
   */
  .macro SAVE2x4
      mov pCRow1, pCRow0
      // line 0: accumulators v16 (re) / v17 (im)
      ld2 {v0.2s, v1.2s}, [pCRow1]
      fmla v0.2s, v16.2s, alphaV0_R
      fmls v0.2s, v17.2s, alphaV0_I
      fmla v1.2s, v16.2s, alphaV1_I
      fmla v1.2s, v17.2s, alphaV1_R
      st2 {v0.2s, v1.2s}, [pCRow1]
      // line 1: accumulators v20 / v21
      add pCRow1, pCRow1, LDC
      ld2 {v4.2s, v5.2s}, [pCRow1]
      fmla v4.2s, v20.2s, alphaV0_R
      fmls v4.2s, v21.2s, alphaV0_I
      fmla v5.2s, v20.2s, alphaV1_I
      fmla v5.2s, v21.2s, alphaV1_R
      st2 {v4.2s, v5.2s}, [pCRow1]
      // line 2: accumulators v24 / v25
      add pCRow1, pCRow1, LDC
      ld2 {v0.2s, v1.2s}, [pCRow1]
      fmla v0.2s, v24.2s, alphaV0_R
      fmls v0.2s, v25.2s, alphaV0_I
      fmla v1.2s, v24.2s, alphaV1_I
      fmla v1.2s, v25.2s, alphaV1_R
      st2 {v0.2s, v1.2s}, [pCRow1]
      // line 3: accumulators v28 / v29
      add pCRow1, pCRow1, LDC
      ld2 {v4.2s, v5.2s}, [pCRow1]
      fmla v4.2s, v28.2s, alphaV0_R
      fmls v4.2s, v29.2s, alphaV0_I
      fmla v5.2s, v28.2s, alphaV1_I
      fmla v5.2s, v29.2s, alphaV1_R
      st2 {v4.2s, v5.2s}, [pCRow1]
      // step to the next 2 complex values (2 * 8 bytes)
      add pCRow0, pCRow0, #16
  .endm
  710. /******************************************************************************/
  /*
   * INIT1x4: clear the eight accumulators of the 1x4 tile
   * (v16/v17, v20/v21, v24/v25, v28/v29). Full 128-bit registers are zeroed.
   */
  .macro INIT1x4
      movi v16.4s, #0
      movi v17.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
      movi v24.4s, #0
      movi v25.4s, #0
      movi v28.4s, #0
      movi v29.4s, #0
  .endm
  /*
   * KERNEL1x4_SUB: one k-step of the 1x4 tile. Loads 4 complex values of B
   * (v8 = real parts, v9 = imaginary parts) and a single complex value of A
   * into lane 0 of v0 (real) and v1 (imaginary), then accumulates with
   * scalar by-element FMAs into s16/s17, s20/s21, s24/s25, s28/s29.
   * pB advances by 32 bytes and pA by 8 bytes; the bumps are folded into
   * post-indexed loads.
   */
  .macro KERNEL1x4_SUB
      ld2 {v8.4s, v9.4s}, [pB], #32
      ld2 {v0.s, v1.s}[0], [pA], #8

      OP_rr s16, s0, v8.s[0]
      OP_ii s16, s1, v9.s[0]
      OP_ri s17, s0, v9.s[0]
      OP_ir s17, s1, v8.s[0]

      OP_rr s20, s0, v8.s[1]
      OP_ii s20, s1, v9.s[1]
      OP_ri s21, s0, v9.s[1]
      OP_ir s21, s1, v8.s[1]

      OP_rr s24, s0, v8.s[2]
      OP_ii s24, s1, v9.s[2]
      OP_ri s25, s0, v9.s[2]
      OP_ir s25, s1, v8.s[2]

      OP_rr s28, s0, v8.s[3]
      OP_ii s28, s1, v9.s[3]
      OP_ri s29, s0, v9.s[3]
      OP_ir s29, s1, v8.s[3]
  .endm
  /*
   * SAVE1x4: fold the 1x4 accumulator tile into C (one complex value per
   * line, four lines starting at pCRow0 and stepping by LDC):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * Each value is loaded into and stored from lane 0 only. LDC is added to
   * the pointer as-is, so it must already be a byte stride (the scaling is
   * outside this view). On exit pCRow0 has advanced by 8 bytes; pCRow1,
   * v0, v1, v4 and v5 are clobbered.
   */
  .macro SAVE1x4
      mov pCRow1, pCRow0
      // line 0: accumulators s16 (re) / s17 (im)
      ld2 {v0.s, v1.s}[0], [pCRow1]
      fmla s0, s16, alphaV0_R
      fmls s0, s17, alphaV0_I
      fmla s1, s16, alphaV1_I
      fmla s1, s17, alphaV1_R
      st2 {v0.s, v1.s}[0], [pCRow1]
      // line 1: accumulators s20 / s21
      add pCRow1, pCRow1, LDC
      ld2 {v4.s, v5.s}[0], [pCRow1]
      fmla s4, s20, alphaV0_R
      fmls s4, s21, alphaV0_I
      fmla s5, s20, alphaV1_I
      fmla s5, s21, alphaV1_R
      st2 {v4.s, v5.s}[0], [pCRow1]
      // line 2: accumulators s24 / s25
      add pCRow1, pCRow1, LDC
      ld2 {v0.s, v1.s}[0], [pCRow1]
      fmla s0, s24, alphaV0_R
      fmls s0, s25, alphaV0_I
      fmla s1, s24, alphaV1_I
      fmla s1, s25, alphaV1_R
      st2 {v0.s, v1.s}[0], [pCRow1]
      // line 3: accumulators s28 / s29
      add pCRow1, pCRow1, LDC
      ld2 {v4.s, v5.s}[0], [pCRow1]
      fmla s4, s28, alphaV0_R
      fmls s4, s29, alphaV0_I
      fmla s5, s28, alphaV1_I
      fmla s5, s29, alphaV1_R
      st2 {v4.s, v5.s}[0], [pCRow1]
      // step to the next complex value (8 bytes)
      add pCRow0, pCRow0, #8
  .endm
  774. /******************************************************************************/
  /*
   * INIT8x2: clear the eight accumulators v16-v23 of the 8x2 tile.
   * Full 128-bit registers are zeroed.
   */
  .macro INIT8x2
      movi v16.4s, #0
      movi v17.4s, #0
      movi v18.4s, #0
      movi v19.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
      movi v22.4s, #0
      movi v23.4s, #0
  .endm
  /*
   * KERNEL8x2_SUB: one k-step of the 8x2 tile. Loads 2 complex values of B
   * (v8 = real parts, v9 = imaginary parts, two lanes each) and 8 complex
   * values of A (v0/v1 and v2/v3), then accumulates into v16-v23.
   * pB advances by 16 bytes and pA by 64 bytes; the bumps are folded into
   * post-indexed loads.
   */
  .macro KERNEL8x2_SUB
      ld2 {v8.2s, v9.2s}, [pB], #16
      ld2 {v0.4s, v1.4s}, [pA], #32
      ld2 {v2.4s, v3.4s}, [pA], #32

      OP_rr v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
      OP_ri v17.4s, v0.4s, v9.s[0]
      OP_ir v17.4s, v1.4s, v8.s[0]

      OP_rr v18.4s, v2.4s, v8.s[0]
      OP_ii v18.4s, v3.4s, v9.s[0]
      OP_ri v19.4s, v2.4s, v9.s[0]
      OP_ir v19.4s, v3.4s, v8.s[0]

      OP_rr v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
      OP_ri v21.4s, v0.4s, v9.s[1]
      OP_ir v21.4s, v1.4s, v8.s[1]

      OP_rr v22.4s, v2.4s, v8.s[1]
      OP_ii v22.4s, v3.4s, v9.s[1]
      OP_ri v23.4s, v2.4s, v9.s[1]
      OP_ir v23.4s, v3.4s, v8.s[1]
  .endm
  /*
   * SAVE8x2: fold the 8x2 accumulator tile (v16-v23) into C. Two lines of
   * C (pCRow0 and pCRow0 + LDC), each handled as two halves of 4 complex
   * values (offsets 0 and 32 bytes):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * LDC is added to the pointer as-is, so it must already be a byte stride
   * (the scaling is outside this view). On exit pCRow0 has advanced by
   * 64 bytes; pCRow1, pCRow2 and v0-v7 are clobbered.
   */
  .macro SAVE8x2
      mov pCRow1, pCRow0
      // line 0, elements 0-3
      ld2 {v0.4s, v1.4s}, [pCRow1]
      fmla v0.4s, v16.4s, alphaV0_R
      fmls v0.4s, v17.4s, alphaV0_I
      fmla v1.4s, v16.4s, alphaV1_I
      fmla v1.4s, v17.4s, alphaV1_R
      st2 {v0.4s, v1.4s}, [pCRow1]
      // line 0, elements 4-7
      add pCRow2, pCRow1, #32
      ld2 {v2.4s, v3.4s}, [pCRow2]
      fmla v2.4s, v18.4s, alphaV0_R
      fmls v2.4s, v19.4s, alphaV0_I
      fmla v3.4s, v18.4s, alphaV1_I
      fmla v3.4s, v19.4s, alphaV1_R
      st2 {v2.4s, v3.4s}, [pCRow2]
      // line 1, elements 0-3
      add pCRow1, pCRow1, LDC
      ld2 {v4.4s, v5.4s}, [pCRow1]
      fmla v4.4s, v20.4s, alphaV0_R
      fmls v4.4s, v21.4s, alphaV0_I
      fmla v5.4s, v20.4s, alphaV1_I
      fmla v5.4s, v21.4s, alphaV1_R
      st2 {v4.4s, v5.4s}, [pCRow1]
      // line 1, elements 4-7
      add pCRow2, pCRow1, #32
      ld2 {v6.4s, v7.4s}, [pCRow2]
      fmla v6.4s, v22.4s, alphaV0_R
      fmls v6.4s, v23.4s, alphaV0_I
      fmla v7.4s, v22.4s, alphaV1_I
      fmla v7.4s, v23.4s, alphaV1_R
      st2 {v6.4s, v7.4s}, [pCRow2]
      // step to the next 8 complex values (8 * 8 bytes)
      add pCRow0, pCRow0, #64
  .endm
  840. /******************************************************************************/
  /*
   * INIT4x2: clear the four accumulators of the 4x2 tile (v16/v17, v20/v21).
   * Full 128-bit registers are zeroed.
   */
  .macro INIT4x2
      movi v16.4s, #0
      movi v17.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
  .endm
  /*
   * KERNEL4x2_SUB: one k-step of the 4x2 tile. Loads 2 complex values of B
   * (v8 = real parts, v9 = imaginary parts) and 4 complex values of A
   * (v0/v1), then accumulates into v16/v17 and v20/v21. pB advances by
   * 16 bytes and pA by 32 bytes; the bumps are folded into post-indexed
   * loads.
   */
  .macro KERNEL4x2_SUB
      ld2 {v8.2s, v9.2s}, [pB], #16
      ld2 {v0.4s, v1.4s}, [pA], #32

      OP_rr v16.4s, v0.4s, v8.s[0]
      OP_ii v16.4s, v1.4s, v9.s[0]
      OP_ri v17.4s, v0.4s, v9.s[0]
      OP_ir v17.4s, v1.4s, v8.s[0]

      OP_rr v20.4s, v0.4s, v8.s[1]
      OP_ii v20.4s, v1.4s, v9.s[1]
      OP_ri v21.4s, v0.4s, v9.s[1]
      OP_ir v21.4s, v1.4s, v8.s[1]
  .endm
  /*
   * SAVE4x2: fold the 4x2 accumulator tile into C (4 complex values on each
   * of two lines, pCRow0 and pCRow0 + LDC):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * LDC is added to the pointer as-is, so it must already be a byte stride
   * (the scaling is outside this view). On exit pCRow0 has advanced by
   * 32 bytes; pCRow1, v0, v1, v4 and v5 are clobbered.
   */
  .macro SAVE4x2
      mov pCRow1, pCRow0
      // line 0: accumulators v16 (re) / v17 (im)
      ld2 {v0.4s, v1.4s}, [pCRow1]
      fmla v0.4s, v16.4s, alphaV0_R
      fmls v0.4s, v17.4s, alphaV0_I
      fmla v1.4s, v16.4s, alphaV1_I
      fmla v1.4s, v17.4s, alphaV1_R
      st2 {v0.4s, v1.4s}, [pCRow1]
      // line 1: accumulators v20 / v21
      add pCRow1, pCRow1, LDC
      ld2 {v4.4s, v5.4s}, [pCRow1]
      fmla v4.4s, v20.4s, alphaV0_R
      fmls v4.4s, v21.4s, alphaV0_I
      fmla v5.4s, v20.4s, alphaV1_I
      fmla v5.4s, v21.4s, alphaV1_R
      st2 {v4.4s, v5.4s}, [pCRow1]
      // step to the next 4 complex values (4 * 8 bytes)
      add pCRow0, pCRow0, #32
  .endm
  878. /******************************************************************************/
  /*
   * INIT2x2: clear the four accumulators of the 2x2 tile (v16/v17, v20/v21).
   * Full 128-bit registers are zeroed.
   */
  .macro INIT2x2
      movi v16.4s, #0
      movi v17.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
  .endm
  /*
   * KERNEL2x2_SUB: one k-step of the 2x2 tile. Loads 2 complex values of B
   * (v8 = real parts, v9 = imaginary parts) and 2 complex values of A
   * (v0/v1), then accumulates into the low two lanes of v16/v17 and
   * v20/v21. pA and pB each advance by 16 bytes; the bumps are folded into
   * post-indexed loads.
   */
  .macro KERNEL2x2_SUB
      ld2 {v8.2s, v9.2s}, [pB], #16
      ld2 {v0.2s, v1.2s}, [pA], #16

      OP_rr v16.2s, v0.2s, v8.s[0]
      OP_ii v16.2s, v1.2s, v9.s[0]
      OP_ri v17.2s, v0.2s, v9.s[0]
      OP_ir v17.2s, v1.2s, v8.s[0]

      OP_rr v20.2s, v0.2s, v8.s[1]
      OP_ii v20.2s, v1.2s, v9.s[1]
      OP_ri v21.2s, v0.2s, v9.s[1]
      OP_ir v21.2s, v1.2s, v8.s[1]
  .endm
  /*
   * SAVE2x2: fold the 2x2 accumulator tile into C (2 complex values on each
   * of two lines, pCRow0 and pCRow0 + LDC):
   *   C_re += acc_re * alpha_re - acc_im * alpha_im
   *   C_im += acc_re * alpha_im + acc_im * alpha_re
   * LDC is added to the pointer as-is, so it must already be a byte stride
   * (the scaling is outside this view). On exit pCRow0 has advanced by
   * 16 bytes; pCRow1, v0, v1, v4 and v5 are clobbered.
   */
  .macro SAVE2x2
      mov pCRow1, pCRow0
      // line 0: accumulators v16 (re) / v17 (im)
      ld2 {v0.2s, v1.2s}, [pCRow1]
      fmla v0.2s, v16.2s, alphaV0_R
      fmls v0.2s, v17.2s, alphaV0_I
      fmla v1.2s, v16.2s, alphaV1_I
      fmla v1.2s, v17.2s, alphaV1_R
      st2 {v0.2s, v1.2s}, [pCRow1]
      // line 1: accumulators v20 / v21
      add pCRow1, pCRow1, LDC
      ld2 {v4.2s, v5.2s}, [pCRow1]
      fmla v4.2s, v20.2s, alphaV0_R
      fmls v4.2s, v21.2s, alphaV0_I
      fmla v5.2s, v20.2s, alphaV1_I
      fmla v5.2s, v21.2s, alphaV1_R
      st2 {v4.2s, v5.2s}, [pCRow1]
      // step to the next 2 complex values (2 * 8 bytes)
      add pCRow0, pCRow0, #16
  .endm
  916. /******************************************************************************/
  /*
   * INIT1x2: clear the four accumulators of the 1x2 tile (v16/v17, v20/v21).
   * Full 128-bit registers are zeroed.
   */
  .macro INIT1x2
      movi v16.4s, #0
      movi v17.4s, #0
      movi v20.4s, #0
      movi v21.4s, #0
  .endm
  /*
   * KERNEL1x2_SUB: one k-step of the 1x2 tile. Loads 2 complex values of B
   * (v8 = real parts, v9 = imaginary parts) and a single complex value of A
   * into lane 0 of v0 (real) and v1 (imaginary), then accumulates with
   * scalar by-element FMAs into s16/s17 and s20/s21. pB advances by
   * 16 bytes and pA by 8 bytes; the bumps are folded into post-indexed
   * loads.
   */
  .macro KERNEL1x2_SUB
      ld2 {v8.2s, v9.2s}, [pB], #16
      ld2 {v0.s, v1.s}[0], [pA], #8

      OP_rr s16, s0, v8.s[0]
      OP_ii s16, s1, v9.s[0]
      OP_ri s17, s0, v9.s[0]
      OP_ir s17, s1, v8.s[0]

      OP_rr s20, s0, v8.s[1]
      OP_ii s20, s1, v9.s[1]
      OP_ri s21, s0, v9.s[1]
      OP_ir s21, s1, v8.s[1]
  .endm
  /*
   * SAVE1x2: fold the 1x2 accumulators into C, scaled by alpha.
   * One pair of floats is updated at pCRow0 from (s16, s17) and one at
   * pCRow0 + LDC from (s20, s21):
   *     first  float += acc_a * alpha_R - acc_b * alpha_I
   *     second float += acc_a * alpha_I + acc_b * alpha_R
   * On exit pCRow1 = old pCRow0 + LDC and pCRow0 has advanced 8 bytes.
   */
  .macro SAVE1x2
      ld2     {v0.s, v1.s}[0], [pCRow0]
      fmla    s0, s16, alphaV0_R
      fmla    s1, s16, alphaV1_I
      fmls    s0, s17, alphaV0_I
      fmla    s1, s17, alphaV1_R
      st2     {v0.s, v1.s}[0], [pCRow0]

      add     pCRow1, pCRow0, LDC
      ld2     {v4.s, v5.s}[0], [pCRow1]
      fmla    s4, s20, alphaV0_R
      fmla    s5, s20, alphaV1_I
      fmls    s4, s21, alphaV0_I
      fmla    s5, s21, alphaV1_R
      st2     {v4.s, v5.s}[0], [pCRow1]

      add     pCRow0, pCRow0, #8
  .endm
  954. /******************************************************************************/
  /*
   * INIT8x1: clear the four accumulators of the 8x1 tile.
   * v16/v17 cover the first 32 bytes that SAVE8x1 updates, v18/v19 the
   * following 32 bytes.  A scalar write zeroes the whole vector register.
   */
  .macro INIT8x1
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s18, wzr
      fmov    s19, wzr
  .endm
  /*
   * KERNEL8x1_SUB: one k-step of the 8x1 tile.
   * Loads two floats from pB into v8 (lanes 0 and 1) and 64 bytes from pA,
   * de-interleaved into v0/v1 and v2/v3.  The post-indexed loads advance
   * pB by 8 and pA by 32 + 32.  v0/v1 update v16/v17, v2/v3 update v18/v19.
   * The OP_* operations are defined elsewhere in this file.
   */
  .macro KERNEL8x1_SUB
      ld1     {v8.2s}, [pB], #8
      ld2     {v0.4s, v1.4s}, [pA], #32
      ld2     {v2.4s, v3.4s}, [pA], #32

      OP_rr   v16.4s, v0.4s, v8.s[0]
      OP_ii   v16.4s, v1.4s, v8.s[1]
      OP_ri   v17.4s, v0.4s, v8.s[1]
      OP_ir   v17.4s, v1.4s, v8.s[0]

      OP_rr   v18.4s, v2.4s, v8.s[0]
      OP_ii   v18.4s, v3.4s, v8.s[1]
      OP_ri   v19.4s, v2.4s, v8.s[1]
      OP_ir   v19.4s, v3.4s, v8.s[0]
  .endm
  /*
   * SAVE8x1: fold the 8x1 accumulators into C, scaled by alpha.
   * Two consecutive 32-byte groups are updated, the first from (v16, v17)
   * and the second from (v18, v19):
   *     even floats += acc_a * alpha_R - acc_b * alpha_I
   *     odd  floats += acc_a * alpha_I + acc_b * alpha_R
   * On exit pCRow1 = old pCRow0 + 32 and pCRow0 has advanced 64 bytes.
   */
  .macro SAVE8x1
      ld2     {v0.4s, v1.4s}, [pCRow0]
      fmla    v0.4s, v16.4s, alphaV0_R
      fmla    v1.4s, v16.4s, alphaV1_I
      fmls    v0.4s, v17.4s, alphaV0_I
      fmla    v1.4s, v17.4s, alphaV1_R
      st2     {v0.4s, v1.4s}, [pCRow0]

      add     pCRow1, pCRow0, #32
      ld2     {v2.4s, v3.4s}, [pCRow1]
      fmla    v2.4s, v18.4s, alphaV0_R
      fmla    v3.4s, v18.4s, alphaV1_I
      fmls    v2.4s, v19.4s, alphaV0_I
      fmla    v3.4s, v19.4s, alphaV1_R
      st2     {v2.4s, v3.4s}, [pCRow1]

      add     pCRow0, pCRow0, #64
  .endm
  994. /******************************************************************************/
  /*
   * INIT4x1: clear the two accumulators (v16, v17) of the 4x1 tile.
   * A scalar write zeroes the whole vector register.
   */
  .macro INIT4x1
      fmov    s16, wzr
      fmov    s17, wzr
  .endm
  /*
   * KERNEL4x1_SUB: one k-step of the 4x1 tile.
   * Loads one float each into lane 0 of v8 and v9 from pB (8 bytes) and
   * 32 bytes from pA de-interleaved into v0/v1; the post-indexed loads
   * advance pB by 8 and pA by 32.  Results accumulate into v16/v17.
   * The OP_* operations are defined elsewhere in this file.
   */
  .macro KERNEL4x1_SUB
      ld2     {v8.s, v9.s}[0], [pB], #8
      ld2     {v0.4s, v1.4s}, [pA], #32

      OP_rr   v16.4s, v0.4s, v8.s[0]
      OP_ii   v16.4s, v1.4s, v9.s[0]
      OP_ri   v17.4s, v0.4s, v9.s[0]
      OP_ir   v17.4s, v1.4s, v8.s[0]
  .endm
  /*
   * SAVE4x1: fold the 4x1 accumulators into C, scaled by alpha.
   *     even floats += v16 * alpha_R - v17 * alpha_I
   *     odd  floats += v16 * alpha_I + v17 * alpha_R
   * On exit pCRow1 = old pCRow0 and pCRow0 has advanced 32 bytes.
   * The two destination chains are interleaved; the order of updates on
   * each individual register is unchanged.
   */
  .macro SAVE4x1
      mov     pCRow1, pCRow0

      ld2     {v0.4s, v1.4s}, [pCRow1]
      fmla    v0.4s, v16.4s, alphaV0_R
      fmla    v1.4s, v16.4s, alphaV1_I
      fmls    v0.4s, v17.4s, alphaV0_I
      fmla    v1.4s, v17.4s, alphaV1_R
      st2     {v0.4s, v1.4s}, [pCRow1]

      add     pCRow0, pCRow0, #32
  .endm
  1019. /******************************************************************************/
  /*
   * INIT2x1: clear the two accumulators (v16, v17) of the 2x1 tile.
   * Each scalar write from wzr leaves the full vector register zero.
   */
  .macro INIT2x1
      fmov    s17, wzr
      fmov    s16, wzr
  .endm
  /*
   * KERNEL2x1_SUB: one k-step of the 2x1 tile.
   * Loads one float each into lane 0 of v8 and v9 from pB (8 bytes) and
   * 16 bytes from pA de-interleaved into v0/v1; the post-indexed loads
   * advance pB by 8 and pA by 16.  Results accumulate into v16/v17.
   * The OP_* operations are defined elsewhere in this file.
   */
  .macro KERNEL2x1_SUB
      ld2     {v8.s, v9.s}[0], [pB], #8
      ld2     {v0.2s, v1.2s}, [pA], #16

      OP_rr   v16.2s, v0.2s, v8.s[0]
      OP_ii   v16.2s, v1.2s, v9.s[0]
      OP_ri   v17.2s, v0.2s, v9.s[0]
      OP_ir   v17.2s, v1.2s, v8.s[0]
  .endm
  /*
   * SAVE2x1: fold the 2x1 accumulators into C, scaled by alpha.
   *     even floats += v16 * alpha_R - v17 * alpha_I
   *     odd  floats += v16 * alpha_I + v17 * alpha_R
   * On exit pCRow1 = old pCRow0 and pCRow0 has advanced 16 bytes.
   */
  .macro SAVE2x1
      mov     pCRow1, pCRow0

      ld2     {v0.2s, v1.2s}, [pCRow1]
      fmla    v0.2s, v16.2s, alphaV0_R
      fmla    v1.2s, v16.2s, alphaV1_I
      fmls    v0.2s, v17.2s, alphaV0_I
      fmla    v1.2s, v17.2s, alphaV1_R
      st2     {v0.2s, v1.2s}, [pCRow1]

      add     pCRow0, pCRow0, #16
  .endm
  1044. /******************************************************************************/
  /*
   * INIT1x1: clear the two scalar accumulators (s16, s17) of the 1x1 tile.
   */
  .macro INIT1x1
      fmov    s17, wzr
      fmov    s16, wzr
  .endm
  /*
   * KERNEL1x1_SUB: one k-step of the 1x1 tile.
   * Loads one pair of floats from pB into lane 0 of v8/v9 and one pair
   * from pA into lane 0 of v0/v1; the post-indexed loads advance both
   * pointers by 8 bytes.  Results accumulate into s16/s17.
   * The OP_* operations are defined elsewhere in this file.
   */
  .macro KERNEL1x1_SUB
      ld2     {v8.s, v9.s}[0], [pB], #8
      ld2     {v0.s, v1.s}[0], [pA], #8

      OP_rr   s16, s0, v8.s[0]
      OP_ii   s16, s1, v9.s[0]
      OP_ri   s17, s0, v9.s[0]
      OP_ir   s17, s1, v8.s[0]
  .endm
  /*
   * SAVE1x1: fold the 1x1 accumulator into C, scaled by alpha.
   *     first  float += s16 * alpha_R - s17 * alpha_I
   *     second float += s16 * alpha_I + s17 * alpha_R
   * On exit pCRow1 = old pCRow0 and pCRow0 has advanced 8 bytes.
   */
  .macro SAVE1x1
      mov     pCRow1, pCRow0

      ld2     {v0.s, v1.s}[0], [pCRow1]
      fmla    s0, s16, alphaV0_R
      fmla    s1, s16, alphaV1_I
      fmls    s0, s17, alphaV0_I
      fmla    s1, s17, alphaV1_R
      st2     {v0.s, v1.s}[0], [pCRow1]

      add     pCRow0, pCRow0, #8
  .endm
  1069. /*******************************************************************************
  1070. * End of macro definitions
  1071. *******************************************************************************/
  // Function entry.
  // Reserves an 11 * 16 byte stack frame, spills d8-d17 and x18-x28 into it,
  // copies alpha from s0/s1 into the alpha registers, converts LDC into a
  // byte stride and prepares the outer loop over groups of 4 in N.
  1072. PROLOGUE
  1073. .align 5
  1074. add sp, sp, #-(11 * 16)
  1075. stp d8, d9, [sp, #(0 * 16)]
  1076. stp d10, d11, [sp, #(1 * 16)]
  1077. stp d12, d13, [sp, #(2 * 16)]
  1078. stp d14, d15, [sp, #(3 * 16)]
  // NOTE(review): d16/d17 are caller-saved under AAPCS64, so this spill looks
  // unnecessary; it is harmless and mirrored by the restore at exit.
  1079. stp d16, d17, [sp, #(4 * 16)]
  1080. stp x18, x19, [sp, #(5 * 16)]
  1081. stp x20, x21, [sp, #(6 * 16)]
  1082. stp x22, x23, [sp, #(7 * 16)]
  1083. stp x24, x25, [sp, #(8 * 16)]
  1084. stp x26, x27, [sp, #(9 * 16)]
  1085. str x28, [sp, #(10 * 16)]
  // Both alpha copies receive the same values: s0 -> *_R, s1 -> *_I.
  1086. fmov alpha0_R, s0
  1087. fmov alpha0_I, s1
  1088. fmov alpha1_R, s0
  1089. fmov alpha1_I, s1
  1090. lsl LDC, LDC, #3 // ldc = ldc * 8
  1091. mov pB, origPB
  1092. mov counterJ, origN
  1093. asr counterJ, counterJ, #2 // J = J / 4
  1094. cmp counterJ, #0
  1095. ble cgemm_kernel_L2_BEGIN // N < 4: skip the 4-wide section
  1096. /******************************************************************************/
  // Outer loop over groups of 4 in N (counterJ iterations).
  // For each group, M is covered by 8-wide tiles, then at most one 4-wide,
  // one 2-wide and one 1-wide tile.  pA is set once per group and keeps
  // advancing through the tiles; pB is reset to origPB before every tile.
  // Throughout this section "tst reg, #mask" is followed by "ble": the
  // masked result is never negative and tst clears V, so ble is taken
  // exactly when the masked bits are all zero.
  1097. cgemm_kernel_L4_BEGIN:
  1098. mov pCRow0, pC // pCRow0 = C
  1099. add pC, pC, LDC, lsl #2 // pC += 4 * LDC for the next group
  1100. mov pA, origPA // pA = start of A array
  // ---- 8-wide tiles in M ----
  1101. cgemm_kernel_L4_M8_BEGIN:
  1102. mov counterI, origM
  1103. asr counterI, counterI, #3 // counterI = counterI / 8
  1104. cmp counterI, #0
  1105. ble cgemm_kernel_L4_M4_BEGIN
  1106. cgemm_kernel_L4_M8_20:
  1107. mov pB, origPB
  1108. asr counterL , origK, #1 // L = K / 2
  1109. cmp counterL , #2 // is there at least 4 to do?
  1110. blt cgemm_kernel_L4_M8_32
  1111. KERNEL8x4_I // do one in the K
  1112. KERNEL8x4_M2 // do another in the K
  // Subtract 2: one pair was just issued (I + M2) and one pair is issued
  // at _22a (M1 + E); the loop at _22 handles the pairs in between.
  1113. subs counterL, counterL, #2 // subtract 2
  1114. ble cgemm_kernel_L4_M8_22a
  1115. .align 5
  1116. cgemm_kernel_L4_M8_22:
  1117. KERNEL8x4_M1
  1118. KERNEL8x4_M2
  1119. subs counterL, counterL, #1
  1120. bgt cgemm_kernel_L4_M8_22
  1121. cgemm_kernel_L4_M8_22a:
  1122. KERNEL8x4_M1
  1123. KERNEL8x4_E
  1124. b cgemm_kernel_L4_M8_44
  // K / 2 < 2: either a single I + E pair, or only INIT8x4.
  1125. cgemm_kernel_L4_M8_32:
  1126. tst counterL, #1
  1127. ble cgemm_kernel_L4_M8_40
  1128. KERNEL8x4_I
  1129. KERNEL8x4_E
  1130. b cgemm_kernel_L4_M8_44
  1131. cgemm_kernel_L4_M8_40:
  1132. INIT8x4
  1133. cgemm_kernel_L4_M8_44:
  1134. ands counterL , origK, #1 // odd K: one trailing k-step
  1135. ble cgemm_kernel_L4_M8_100
  1136. cgemm_kernel_L4_M8_46:
  1137. KERNEL8x4_SUB
  1138. cgemm_kernel_L4_M8_100:
  1139. SAVE8x4
  1140. cgemm_kernel_L4_M8_END:
  1141. subs counterI, counterI, #1
  1142. bne cgemm_kernel_L4_M8_20
  // ---- 4-wide tile in M (runs at most once) ----
  1143. cgemm_kernel_L4_M4_BEGIN:
  1144. mov counterI, origM
  1145. tst counterI , #7 // M % 8 == 0: nothing left in M
  1146. ble cgemm_kernel_L4_END
  1147. tst counterI, #4 // bit 2 of M clear: no 4-wide tile
  1148. ble cgemm_kernel_L4_M2_BEGIN
  1149. cgemm_kernel_L4_M4_20:
  1150. mov pB, origPB
  1151. asr counterL , origK, #1 // L = K / 2
  1152. cmp counterL , #2 // is there at least 4 to do?
  1153. blt cgemm_kernel_L4_M4_32
  1154. KERNEL4x4_I // do one in the K
  1155. KERNEL4x4_M2 // do another in the K
  1156. subs counterL, counterL, #2 // same accounting as the 8-wide loop above
  1157. ble cgemm_kernel_L4_M4_22a
  1158. .align 5
  1159. cgemm_kernel_L4_M4_22:
  1160. KERNEL4x4_M1
  1161. KERNEL4x4_M2
  1162. subs counterL, counterL, #1
  1163. bgt cgemm_kernel_L4_M4_22
  1164. cgemm_kernel_L4_M4_22a:
  1165. KERNEL4x4_M1
  1166. KERNEL4x4_E
  1167. b cgemm_kernel_L4_M4_44
  1168. cgemm_kernel_L4_M4_32:
  1169. tst counterL, #1
  1170. ble cgemm_kernel_L4_M4_40
  1171. KERNEL4x4_I
  1172. KERNEL4x4_E
  1173. b cgemm_kernel_L4_M4_44
  1174. cgemm_kernel_L4_M4_40:
  1175. INIT4x4
  1176. cgemm_kernel_L4_M4_44:
  1177. ands counterL , origK, #1 // odd K: one trailing k-step
  1178. ble cgemm_kernel_L4_M4_100
  1179. cgemm_kernel_L4_M4_46:
  1180. KERNEL4x4_SUB
  1181. cgemm_kernel_L4_M4_100:
  1182. SAVE4x4
  1183. cgemm_kernel_L4_M4_END:
  // ---- 2-wide tile in M (runs at most once) ----
  1184. cgemm_kernel_L4_M2_BEGIN:
  1185. mov counterI, origM
  1186. tst counterI , #3 // M % 4 == 0: nothing left in M
  1187. ble cgemm_kernel_L4_END
  1188. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  1189. ble cgemm_kernel_L4_M1_BEGIN
  1190. cgemm_kernel_L4_M2_20:
  1191. INIT2x4
  1192. mov pB, origPB
  1193. asr counterL , origK, #3 // counterL = K / 8
  1194. cmp counterL , #0
  1195. ble cgemm_kernel_L4_M2_40
  // Main K loop, unrolled 8 times.
  1196. cgemm_kernel_L4_M2_22:
  1197. KERNEL2x4_SUB
  1198. KERNEL2x4_SUB
  1199. KERNEL2x4_SUB
  1200. KERNEL2x4_SUB
  1201. KERNEL2x4_SUB
  1202. KERNEL2x4_SUB
  1203. KERNEL2x4_SUB
  1204. KERNEL2x4_SUB
  1205. subs counterL, counterL, #1
  1206. bgt cgemm_kernel_L4_M2_22
  1207. cgemm_kernel_L4_M2_40:
  1208. ands counterL , origK, #7 // counterL = K % 8
  1209. ble cgemm_kernel_L4_M2_100
  1210. cgemm_kernel_L4_M2_42:
  1211. KERNEL2x4_SUB
  1212. subs counterL, counterL, #1
  1213. bgt cgemm_kernel_L4_M2_42
  1214. cgemm_kernel_L4_M2_100:
  1215. SAVE2x4
  1216. cgemm_kernel_L4_M2_END:
  // ---- 1-wide tile in M ----
  // counterI still holds origM here (loaded at cgemm_kernel_L4_M2_BEGIN).
  1217. cgemm_kernel_L4_M1_BEGIN:
  1218. tst counterI, #1 // M even: no 1-wide tile
  1219. ble cgemm_kernel_L4_END
  1220. cgemm_kernel_L4_M1_20:
  1221. INIT1x4
  1222. mov pB, origPB
  1223. asr counterL , origK, #3 // counterL = K / 8
  1224. cmp counterL , #0
  1225. ble cgemm_kernel_L4_M1_40
  1226. cgemm_kernel_L4_M1_22:
  1227. KERNEL1x4_SUB
  1228. KERNEL1x4_SUB
  1229. KERNEL1x4_SUB
  1230. KERNEL1x4_SUB
  1231. KERNEL1x4_SUB
  1232. KERNEL1x4_SUB
  1233. KERNEL1x4_SUB
  1234. KERNEL1x4_SUB
  1235. subs counterL, counterL, #1
  1236. bgt cgemm_kernel_L4_M1_22
  1237. cgemm_kernel_L4_M1_40:
  1238. ands counterL , origK, #7 // counterL = K % 8
  1239. ble cgemm_kernel_L4_M1_100
  1240. cgemm_kernel_L4_M1_42:
  1241. KERNEL1x4_SUB
  1242. subs counterL, counterL, #1
  1243. bgt cgemm_kernel_L4_M1_42
  1244. cgemm_kernel_L4_M1_100:
  1245. SAVE1x4
  // End of one group of 4 in N: step origPB past the panel just consumed.
  1246. cgemm_kernel_L4_END:
  1247. lsl temp, origK, #5
  1248. add origPB, origPB, temp // B = B + K * 4 * 8
  1249. subs counterJ, counterJ , #1 // j--
  1250. bgt cgemm_kernel_L4_BEGIN
  1251. /******************************************************************************/
  // Remainder in N: handles a group of 2 when bit 1 of N is set.
  // M is covered as in the 4-wide section: 8-wide tiles, then at most one
  // 4-wide, one 2-wide and one 1-wide tile.  Every tile runs K / 8 unrolled
  // iterations followed by K % 8 single steps.
  // "tst reg, #mask" + "ble" branches exactly when the masked bits are zero
  // (the masked result is never negative and tst clears V).
  1252. cgemm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  1253. mov counterJ , origN
  1254. tst counterJ , #3
  1255. ble cgemm_kernel_L999 // N % 4 == 0: nothing left to do
  1256. tst counterJ , #2
  1257. ble cgemm_kernel_L1_BEGIN // bit 1 of N clear: only a single column can remain
  1258. mov pCRow0, pC // pCRow0 = pC
  1259. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  1260. mov pA, origPA // pA = A
  // ---- 8-wide tiles in M ----
  1261. cgemm_kernel_L2_M8_BEGIN:
  1262. mov counterI, origM
  1263. asr counterI, counterI, #3 // counterI = counterI / 8
  1264. cmp counterI, #0
  1265. ble cgemm_kernel_L2_M4_BEGIN
  1266. cgemm_kernel_L2_M8_20:
  1267. INIT8x2
  1268. mov pB, origPB
  1269. asr counterL , origK, #3 // counterL = K / 8
  1270. cmp counterL,#0
  1271. ble cgemm_kernel_L2_M8_40
  1272. .align 5
  1273. cgemm_kernel_L2_M8_22:
  1274. KERNEL8x2_SUB
  1275. KERNEL8x2_SUB
  1276. KERNEL8x2_SUB
  1277. KERNEL8x2_SUB
  1278. KERNEL8x2_SUB
  1279. KERNEL8x2_SUB
  1280. KERNEL8x2_SUB
  1281. KERNEL8x2_SUB
  1282. subs counterL, counterL, #1
  1283. bgt cgemm_kernel_L2_M8_22
  1284. cgemm_kernel_L2_M8_40:
  1285. ands counterL , origK, #7 // counterL = K % 8
  1286. ble cgemm_kernel_L2_M8_100
  1287. cgemm_kernel_L2_M8_42:
  1288. KERNEL8x2_SUB
  1289. subs counterL, counterL, #1
  1290. bgt cgemm_kernel_L2_M8_42
  1291. cgemm_kernel_L2_M8_100:
  1292. SAVE8x2
  1293. cgemm_kernel_L2_M8_END:
  1294. subs counterI, counterI, #1
  1295. bgt cgemm_kernel_L2_M8_20
  // ---- 4-wide tile in M (runs at most once) ----
  1296. cgemm_kernel_L2_M4_BEGIN:
  1297. mov counterI, origM
  1298. tst counterI , #7 // M % 8 == 0: nothing left in M
  1299. ble cgemm_kernel_L2_END
  1300. tst counterI, #4 // bit 2 of M clear: no 4-wide tile
  1301. ble cgemm_kernel_L2_M2_BEGIN
  1302. cgemm_kernel_L2_M4_20:
  1303. INIT4x2
  1304. mov pB, origPB
  1305. asr counterL , origK, #3 // counterL = K / 8
  1306. cmp counterL,#0
  1307. ble cgemm_kernel_L2_M4_40
  1308. .align 5
  1309. cgemm_kernel_L2_M4_22:
  1310. KERNEL4x2_SUB
  1311. KERNEL4x2_SUB
  1312. KERNEL4x2_SUB
  1313. KERNEL4x2_SUB
  1314. KERNEL4x2_SUB
  1315. KERNEL4x2_SUB
  1316. KERNEL4x2_SUB
  1317. KERNEL4x2_SUB
  1318. subs counterL, counterL, #1
  1319. bgt cgemm_kernel_L2_M4_22
  1320. cgemm_kernel_L2_M4_40:
  1321. ands counterL , origK, #7 // counterL = K % 8
  1322. ble cgemm_kernel_L2_M4_100
  1323. cgemm_kernel_L2_M4_42:
  1324. KERNEL4x2_SUB
  1325. subs counterL, counterL, #1
  1326. bgt cgemm_kernel_L2_M4_42
  1327. cgemm_kernel_L2_M4_100:
  1328. SAVE4x2
  1329. cgemm_kernel_L2_M4_END:
  // ---- 2-wide tile in M (runs at most once) ----
  1330. cgemm_kernel_L2_M2_BEGIN:
  1331. mov counterI, origM
  1332. tst counterI , #3 // M % 4 == 0: nothing left in M
  1333. ble cgemm_kernel_L2_END
  1334. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  1335. ble cgemm_kernel_L2_M1_BEGIN
  1336. cgemm_kernel_L2_M2_20:
  1337. INIT2x2
  1338. mov pB, origPB
  1339. asr counterL , origK, #3 // counterL = K / 8
  1340. cmp counterL,#0
  1341. ble cgemm_kernel_L2_M2_40
  1342. cgemm_kernel_L2_M2_22:
  1343. KERNEL2x2_SUB
  1344. KERNEL2x2_SUB
  1345. KERNEL2x2_SUB
  1346. KERNEL2x2_SUB
  1347. KERNEL2x2_SUB
  1348. KERNEL2x2_SUB
  1349. KERNEL2x2_SUB
  1350. KERNEL2x2_SUB
  1351. subs counterL, counterL, #1
  1352. bgt cgemm_kernel_L2_M2_22
  1353. cgemm_kernel_L2_M2_40:
  1354. ands counterL , origK, #7 // counterL = K % 8
  1355. ble cgemm_kernel_L2_M2_100
  1356. cgemm_kernel_L2_M2_42:
  1357. KERNEL2x2_SUB
  1358. subs counterL, counterL, #1
  1359. bgt cgemm_kernel_L2_M2_42
  1360. cgemm_kernel_L2_M2_100:
  1361. SAVE2x2
  1362. cgemm_kernel_L2_M2_END:
  // ---- 1-wide tile in M ----
  // counterI still holds origM here (loaded at cgemm_kernel_L2_M2_BEGIN).
  1363. cgemm_kernel_L2_M1_BEGIN:
  1364. tst counterI, #1 // M even: no 1-wide tile
  1365. ble cgemm_kernel_L2_END
  1366. cgemm_kernel_L2_M1_20:
  1367. INIT1x2
  1368. mov pB, origPB
  1369. asr counterL , origK, #3 // counterL = K / 8
  1370. cmp counterL, #0
  1371. ble cgemm_kernel_L2_M1_40
  1372. cgemm_kernel_L2_M1_22:
  1373. KERNEL1x2_SUB
  1374. KERNEL1x2_SUB
  1375. KERNEL1x2_SUB
  1376. KERNEL1x2_SUB
  1377. KERNEL1x2_SUB
  1378. KERNEL1x2_SUB
  1379. KERNEL1x2_SUB
  1380. KERNEL1x2_SUB
  1381. subs counterL, counterL, #1
  1382. bgt cgemm_kernel_L2_M1_22
  1383. cgemm_kernel_L2_M1_40:
  1384. ands counterL , origK, #7 // counterL = K % 8
  1385. ble cgemm_kernel_L2_M1_100
  1386. cgemm_kernel_L2_M1_42:
  1387. KERNEL1x2_SUB
  1388. subs counterL, counterL, #1
  1389. bgt cgemm_kernel_L2_M1_42
  1390. cgemm_kernel_L2_M1_100:
  1391. SAVE1x2
  // Step origPB past the 2-wide panel just consumed.
  1392. cgemm_kernel_L2_END:
  1393. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1394. /******************************************************************************/
  // Remainder in N: handles a final single column when N is odd.
  // M is covered by 8-wide tiles, then at most one 4-wide, one 2-wide and
  // one 1-wide tile.  Every tile runs K / 8 unrolled iterations followed by
  // K % 8 single steps.
  // "tst reg, #mask" + "ble" branches exactly when the masked bits are zero
  // (the masked result is never negative and tst clears V).
  1395. cgemm_kernel_L1_BEGIN:
  1396. mov counterJ , origN
  1397. tst counterJ , #1
  1398. ble cgemm_kernel_L999 // done
  1399. mov pCRow0, pC // pCRow0 = C
  1400. add pC , pC , LDC // Update pC to point to next
  1401. mov pA, origPA // pA = A
  // ---- 8-wide tiles in M ----
  1402. cgemm_kernel_L1_M8_BEGIN:
  1403. mov counterI, origM
  1404. asr counterI, counterI, #3 // counterI = counterI / 8
  1405. cmp counterI, #0
  1406. ble cgemm_kernel_L1_M4_BEGIN
  1407. cgemm_kernel_L1_M8_20:
  1408. INIT8x1
  1409. mov pB, origPB
  1410. asr counterL , origK, #3 // counterL = K / 8
  1411. cmp counterL , #0
  1412. ble cgemm_kernel_L1_M8_40
  1413. .align 5
  1414. cgemm_kernel_L1_M8_22:
  1415. KERNEL8x1_SUB
  1416. KERNEL8x1_SUB
  1417. KERNEL8x1_SUB
  1418. KERNEL8x1_SUB
  1419. KERNEL8x1_SUB
  1420. KERNEL8x1_SUB
  1421. KERNEL8x1_SUB
  1422. KERNEL8x1_SUB
  1423. subs counterL, counterL, #1
  1424. bgt cgemm_kernel_L1_M8_22
  1425. cgemm_kernel_L1_M8_40:
  1426. ands counterL , origK, #7 // counterL = K % 8
  1427. ble cgemm_kernel_L1_M8_100
  1428. cgemm_kernel_L1_M8_42:
  1429. KERNEL8x1_SUB
  1430. subs counterL, counterL, #1
  1431. bgt cgemm_kernel_L1_M8_42
  1432. cgemm_kernel_L1_M8_100:
  1433. SAVE8x1
  1434. cgemm_kernel_L1_M8_END:
  1435. subs counterI, counterI, #1
  1436. bgt cgemm_kernel_L1_M8_20
  // ---- 4-wide tile in M (runs at most once) ----
  1437. cgemm_kernel_L1_M4_BEGIN:
  1438. mov counterI, origM
  1439. tst counterI , #7 // M % 8 == 0: nothing left in M
  1440. ble cgemm_kernel_L1_END
  1441. tst counterI, #4 // bit 2 of M clear: no 4-wide tile
  1442. ble cgemm_kernel_L1_M2_BEGIN
  1443. cgemm_kernel_L1_M4_20:
  1444. INIT4x1
  1445. mov pB, origPB
  1446. asr counterL , origK, #3 // counterL = K / 8
  1447. cmp counterL , #0
  1448. ble cgemm_kernel_L1_M4_40
  1449. .align 5
  1450. cgemm_kernel_L1_M4_22:
  1451. KERNEL4x1_SUB
  1452. KERNEL4x1_SUB
  1453. KERNEL4x1_SUB
  1454. KERNEL4x1_SUB
  1455. KERNEL4x1_SUB
  1456. KERNEL4x1_SUB
  1457. KERNEL4x1_SUB
  1458. KERNEL4x1_SUB
  1459. subs counterL, counterL, #1
  1460. bgt cgemm_kernel_L1_M4_22
  1461. cgemm_kernel_L1_M4_40:
  1462. ands counterL , origK, #7 // counterL = K % 8
  1463. ble cgemm_kernel_L1_M4_100
  1464. cgemm_kernel_L1_M4_42:
  1465. KERNEL4x1_SUB
  1466. subs counterL, counterL, #1
  1467. bgt cgemm_kernel_L1_M4_42
  1468. cgemm_kernel_L1_M4_100:
  1469. SAVE4x1
  1470. cgemm_kernel_L1_M4_END:
  // ---- 2-wide tile in M (runs at most once) ----
  1471. cgemm_kernel_L1_M2_BEGIN:
  1472. mov counterI, origM
  1473. tst counterI , #3 // M % 4 == 0: nothing left in M
  1474. ble cgemm_kernel_L1_END
  1475. tst counterI, #2 // bit 1 of M clear: no 2-wide tile
  1476. ble cgemm_kernel_L1_M1_BEGIN
  1477. cgemm_kernel_L1_M2_20:
  1478. INIT2x1
  1479. mov pB, origPB
  1480. asr counterL , origK, #3 // counterL = K / 8
  1481. cmp counterL , #0
  1482. ble cgemm_kernel_L1_M2_40
  1483. cgemm_kernel_L1_M2_22:
  1484. KERNEL2x1_SUB
  1485. KERNEL2x1_SUB
  1486. KERNEL2x1_SUB
  1487. KERNEL2x1_SUB
  1488. KERNEL2x1_SUB
  1489. KERNEL2x1_SUB
  1490. KERNEL2x1_SUB
  1491. KERNEL2x1_SUB
  1492. subs counterL, counterL, #1
  1493. bgt cgemm_kernel_L1_M2_22
  1494. cgemm_kernel_L1_M2_40:
  1495. ands counterL , origK, #7 // counterL = K % 8
  1496. ble cgemm_kernel_L1_M2_100
  1497. cgemm_kernel_L1_M2_42:
  1498. KERNEL2x1_SUB
  1499. subs counterL, counterL, #1
  1500. bgt cgemm_kernel_L1_M2_42
  1501. cgemm_kernel_L1_M2_100:
  1502. SAVE2x1
  1503. cgemm_kernel_L1_M2_END:
  // ---- 1-wide tile in M ----
  // counterI still holds origM here (loaded at cgemm_kernel_L1_M2_BEGIN).
  1504. cgemm_kernel_L1_M1_BEGIN:
  1505. tst counterI, #1 // M even: no 1-wide tile
  1506. ble cgemm_kernel_L1_END
  1507. cgemm_kernel_L1_M1_20:
  1508. INIT1x1
  1509. mov pB, origPB
  1510. asr counterL , origK, #3 // counterL = K / 8
  1511. cmp counterL , #0
  1512. ble cgemm_kernel_L1_M1_40
  1513. cgemm_kernel_L1_M1_22:
  1514. KERNEL1x1_SUB
  1515. KERNEL1x1_SUB
  1516. KERNEL1x1_SUB
  1517. KERNEL1x1_SUB
  1518. KERNEL1x1_SUB
  1519. KERNEL1x1_SUB
  1520. KERNEL1x1_SUB
  1521. KERNEL1x1_SUB
  1522. subs counterL, counterL, #1
  1523. bgt cgemm_kernel_L1_M1_22
  1524. cgemm_kernel_L1_M1_40:
  1525. ands counterL , origK, #7 // counterL = K % 8
  1526. ble cgemm_kernel_L1_M1_100
  1527. cgemm_kernel_L1_M1_42:
  1528. KERNEL1x1_SUB
  1529. subs counterL, counterL, #1
  1530. bgt cgemm_kernel_L1_M1_42
  1531. cgemm_kernel_L1_M1_100:
  1532. SAVE1x1
  1533. cgemm_kernel_L1_END:
  // Common exit: return 0 in x0, reload every register spilled at entry from
  // the same frame slots, release the 11 * 16 byte frame and return.
  1534. cgemm_kernel_L999:
  1535. mov x0, #0 // set return value
  1536. ldp d8, d9, [sp, #(0 * 16)]
  1537. ldp d10, d11, [sp, #(1 * 16)]
  1538. ldp d12, d13, [sp, #(2 * 16)]
  1539. ldp d14, d15, [sp, #(3 * 16)]
  1540. ldp d16, d17, [sp, #(4 * 16)]
  1541. ldp x18, x19, [sp, #(5 * 16)]
  1542. ldp x20, x21, [sp, #(6 * 16)]
  1543. ldp x22, x23, [sp, #(7 * 16)]
  1544. ldp x24, x25, [sp, #(8 * 16)]
  1545. ldp x26, x27, [sp, #(9 * 16)]
  1546. ldr x28, [sp, #(10 * 16)]
  1547. add sp, sp, #(11*16) // matches the frame reserved at entry
  1548. ret
  1549. EPILOGUE