You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_4x4.S 34 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 d0 d1 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r,FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  /* ---- Incoming arguments (integer registers x0-x6) ---------------------- */
  #define origM           x0
  #define origN           x1
  #define origK           x2
  #define origPA          x3
  #define origPB          x4
  #define pC              x5
  #define LDC             x6

  /* ---- Scratch, loop counters and running pointers ----------------------- */
  #define temp            x7
  #define counterL        x8
  #define counterI        x9
  #define counterJ        x10
  #define pB              x11
  #define pCRow0          x12
  #define pCRow1          x13
  #define pCRow2          x14
  #define pA              x15

  /* ---- alpha: kept in integer registers, copied to FP lanes when saving -- */
  #define alpha_save_R    x16
  #define alpha_save_I    x17
  #define alpha0_R        d10
  #define alphaV0_R       v10.d[0]
  #define alpha0_I        d11
  #define alphaV0_I       v11.d[0]
  #define alpha1_R        d14
  #define alphaV1_R       v14.d[0]
  #define alpha1_I        d15
  #define alphaV1_I       v15.d[0]

  /*
   * Multiply-accumulate selection for the four partial products of a complex
   * multiply.  The suffix names the operand parts: first letter is the part
   * of A, second letter the part of B (r = real, i = imaginary).
   *   OP_rr / OP_ii accumulate into the real-part accumulator,
   *   OP_ri / OP_ir accumulate into the imaginary-part accumulator.
   * fmla adds the product, fmls subtracts it; the build-time variant macro
   * decides the sign of each term.
   */
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define OP_rr           fmla
  #define OP_ii           fmls
  #define OP_ri           fmla
  #define OP_ir           fmla
  #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  #define OP_rr           fmla
  #define OP_ii           fmla
  #define OP_ri           fmls
  #define OP_ir           fmla
  #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  #define OP_rr           fmla
  #define OP_ii           fmla
  #define OP_ri           fmla
  #define OP_ir           fmls
  #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  #define OP_rr           fmla
  #define OP_ii           fmls
  #define OP_ri           fmls
  #define OP_ir           fmls
  #endif
  78. // 00 origM
  79. // 01 origN
  80. // 02 origK
  81. // 03 origPA
  82. // 04 origPB
  83. // 05 pC
  84. // 06 origLDC -> LDC
  85. // 07 offset -> temp
  86. // 08 counterL
  87. // 09 counterI
  88. // 10 counterJ
  89. // 11 pB
  90. // 12 pCRow0
  91. // 13 pCRow1
  92. // 14 pCRow2
  93. // 15 pA
  94. // 16 alpha_save_R
  95. // 17 alpha_save_I
  96. // 18 must save
  97. // 19 must save
  98. // 20 must save
  99. // 21 must save
  100. // 22 must save
  101. // 23 must save
  102. // 24 must save
  103. // 25 must save
  104. // 26 must save
  105. // 27 must save
  106. // 28 must save
  107. // 29 frame
  108. // 30 link
  109. // 31 sp
  110. //v00 ALPHA_R -> pA00_R, pA01_R
  111. //v01 ALPHA_I -> pA00_I, pA01_I
  112. //v02 pA02_R, pA03_R
  113. //v03 pA02_I, pA03_I
  114. //v04 pA10_R, pA11_R
  115. //v05 pA10_I, pA11_I
  116. //v06 pA12_R, pA13_R
  117. //v07 pA12_I, pA13_I
  118. //v08 must save pB00_R, pB01_R
  119. //v09 must save pB00_I, pB01_I
  120. //v10 must save pB02_R, pB03_R OR ALPHA0_R
  121. //v11 must save pB02_I, pB03_I OR ALPHA0_I
  122. //v12 must save pB10_R, pB11_R
  123. //v13 must save pB10_I, pB11_I
  124. //v14 must save pB12_R, pB13_R OR ALPHA1_R
  125. //v15 must save pB12_I, pB13_I OR ALPHA1_I
  126. //v16 must save pC00_R, pC01_R
  127. //v17 must save pC00_I, pC01_I
  128. //v18 pC02_R, pC03_R
  129. //v19 pC02_I, pC03_I
  130. //v20 pC10_R, pC11_R
  131. //v21 pC10_I, pC11_I
  132. //v22 pC12_R, pC13_R
  133. //v23 pC12_I, pC13_I
  134. //v24 pC20_R, pC21_R
  135. //v25 pC20_I, pC21_I
  136. //v26 pC22_R, pC23_R
  137. //v27 pC22_I, pC23_I
  138. //v28 pC30_R, pC31_R
  139. //v29 pC30_I, pC31_I
  140. //v30 pC32_R, pC33_R
  141. //v31 pC32_I, pC33_I
  142. /*******************************************************************************
  143. * Macro definitions
  144. *******************************************************************************/
  /*
   * INIT4x4: clear the sixteen 4x4-tile accumulators v16-v31.
   * Each scalar fmov writes the low 64 bits and zeroes the rest of the
   * vector register, so both lanes end up as 0.0.
   */
  .macro INIT4x4
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d18, xzr
      fmov    d19, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d22, xzr
      fmov    d23, xzr
      fmov    d24, xzr
      fmov    d25, xzr
      fmov    d26, xzr
      fmov    d27, xzr
      fmov    d28, xzr
      fmov    d29, xzr
      fmov    d30, xzr
      fmov    d31, xzr
  .endm
  /*
   * KERNEL4x4_I: first step of the pipelined 4x4 loop.
   * Loads one k-slice of B into v8-v11 and of A into v0-v3 (ld2 splits real
   * and imaginary parts into separate registers), then INITIALISES the
   * accumulators v16-v31 with the first products instead of adding to them.
   * Finally preloads the next k-slice into v12-v15 (B) and v4-v7 (A).
   * Advances pA and pB by 128 bytes each.
   *
   * When OP_ri is fmls the first imaginary term has to be negated, which is
   * done by zeroing the accumulator and subtracting the product.
   */
  .macro KERNEL4x4_I
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v10.2d, v11.2d}, [pB], #32
      ld2     {v0.2d, v1.2d}, [pA], #32
      ld2     {v2.2d, v3.2d}, [pA], #32

      // B element 0, A elements 0-1
      fmul    v16.2d, v0.2d, v8.d[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v17.16b, v17.16b, v17.16b
      fmls    v17.2d, v0.2d, v9.d[0]
  #else
      fmul    v17.2d, v0.2d, v9.d[0]
  #endif
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      // B element 0, A elements 2-3
      fmul    v18.2d, v2.2d, v8.d[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v19.16b, v19.16b, v19.16b
      fmls    v19.2d, v2.2d, v9.d[0]
  #else
      fmul    v19.2d, v2.2d, v9.d[0]
  #endif
      OP_ii   v18.2d, v3.2d, v9.d[0]
      OP_ir   v19.2d, v3.2d, v8.d[0]

      // B element 1, A elements 0-1
      fmul    v20.2d, v0.2d, v8.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v21.16b, v21.16b, v21.16b
      fmls    v21.2d, v0.2d, v9.d[1]
  #else
      fmul    v21.2d, v0.2d, v9.d[1]
  #endif
      OP_ii   v20.2d, v1.2d, v9.d[1]
      OP_ir   v21.2d, v1.2d, v8.d[1]

      // B element 1, A elements 2-3
      fmul    v22.2d, v2.2d, v8.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v23.16b, v23.16b, v23.16b
      fmls    v23.2d, v2.2d, v9.d[1]
  #else
      fmul    v23.2d, v2.2d, v9.d[1]
  #endif
      OP_ii   v22.2d, v3.2d, v9.d[1]
      OP_ir   v23.2d, v3.2d, v8.d[1]

      // B element 2, A elements 0-1
      fmul    v24.2d, v0.2d, v10.d[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v25.16b, v25.16b, v25.16b
      fmls    v25.2d, v0.2d, v11.d[0]
  #else
      fmul    v25.2d, v0.2d, v11.d[0]
  #endif
      OP_ii   v24.2d, v1.2d, v11.d[0]
      OP_ir   v25.2d, v1.2d, v10.d[0]

      // B element 2, A elements 2-3
      fmul    v26.2d, v2.2d, v10.d[0]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v27.16b, v27.16b, v27.16b
      fmls    v27.2d, v2.2d, v11.d[0]
  #else
      fmul    v27.2d, v2.2d, v11.d[0]
  #endif
      OP_ii   v26.2d, v3.2d, v11.d[0]
      OP_ir   v27.2d, v3.2d, v10.d[0]

      // B element 3, A elements 0-1
      fmul    v28.2d, v0.2d, v10.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v29.16b, v29.16b, v29.16b
      fmls    v29.2d, v0.2d, v11.d[1]
  #else
      fmul    v29.2d, v0.2d, v11.d[1]
  #endif
      OP_ii   v28.2d, v1.2d, v11.d[1]
      OP_ir   v29.2d, v1.2d, v10.d[1]

      // B element 3, A elements 2-3
      fmul    v30.2d, v2.2d, v10.d[1]
  #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || defined(RR) || defined(RC) || defined(CR) || defined(CC)
      eor     v31.16b, v31.16b, v31.16b
      fmls    v31.2d, v2.2d, v11.d[1]
  #else
      fmul    v31.2d, v2.2d, v11.d[1]
  #endif
      OP_ii   v30.2d, v3.2d, v11.d[1]
      OP_ir   v31.2d, v3.2d, v10.d[1]

      // Preload the second k-slice for the following M2 / E step
      ld2     {v12.2d, v13.2d}, [pB], #32
      ld2     {v14.2d, v15.2d}, [pB], #32
      ld2     {v4.2d, v5.2d}, [pA], #32
      ld2     {v6.2d, v7.2d}, [pA], #32
  .endm
  /*
   * KERNEL4x4_M1: pipelined 4x4 step working on register set 0
   * (A in v0-v3, B in v8-v11) while loading set 1 (A in v4-v7,
   * B in v12-v15) for the step that follows.
   * Advances pA and pB by 64 bytes each and issues one prefetch per stream.
   */
  .macro KERNEL4x4_M1
      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      ld2     {v12.2d, v13.2d}, [pB], #32     // next B, elements 0-1

      OP_rr   v18.2d, v2.2d, v8.d[0]
      OP_ri   v19.2d, v2.2d, v9.d[0]
      OP_ii   v18.2d, v3.2d, v9.d[0]
      OP_ir   v19.2d, v3.2d, v8.d[0]

      ld2     {v14.2d, v15.2d}, [pB], #32     // next B, elements 2-3

      OP_rr   v20.2d, v0.2d, v8.d[1]
      OP_ri   v21.2d, v0.2d, v9.d[1]
      OP_ii   v20.2d, v1.2d, v9.d[1]
      OP_ir   v21.2d, v1.2d, v8.d[1]

      ld2     {v4.2d, v5.2d}, [pA], #32       // next A, elements 0-1

      OP_rr   v22.2d, v2.2d, v8.d[1]
      OP_ri   v23.2d, v2.2d, v9.d[1]
      OP_ii   v22.2d, v3.2d, v9.d[1]
      OP_ir   v23.2d, v3.2d, v8.d[1]

      ld2     {v6.2d, v7.2d}, [pA], #32       // next A, elements 2-3

      OP_rr   v24.2d, v0.2d, v10.d[0]
      OP_ri   v25.2d, v0.2d, v11.d[0]
      OP_ii   v24.2d, v1.2d, v11.d[0]
      OP_ir   v25.2d, v1.2d, v10.d[0]

      prfm    PLDL1KEEP, [pA, #512]

      OP_rr   v26.2d, v2.2d, v10.d[0]
      OP_ri   v27.2d, v2.2d, v11.d[0]
      OP_ii   v26.2d, v3.2d, v11.d[0]
      OP_ir   v27.2d, v3.2d, v10.d[0]

      prfm    PLDL1KEEP, [pB, #512]

      OP_rr   v28.2d, v0.2d, v10.d[1]
      OP_ri   v29.2d, v0.2d, v11.d[1]
      OP_ii   v28.2d, v1.2d, v11.d[1]
      OP_ir   v29.2d, v1.2d, v10.d[1]

      OP_rr   v30.2d, v2.2d, v10.d[1]
      OP_ri   v31.2d, v2.2d, v11.d[1]
      OP_ii   v30.2d, v3.2d, v11.d[1]
      OP_ir   v31.2d, v3.2d, v10.d[1]
  .endm
  /*
   * KERNEL4x4_M2: pipelined 4x4 step working on register set 1
   * (A in v4-v7, B in v12-v15) while reloading set 0 (A in v0-v3,
   * B in v8-v11) for the step that follows.
   * Advances pA and pB by 64 bytes each and issues one prefetch per stream.
   */
  .macro KERNEL4x4_M2
      OP_rr   v16.2d, v4.2d, v12.d[0]
      OP_ri   v17.2d, v4.2d, v13.d[0]
      OP_ii   v16.2d, v5.2d, v13.d[0]
      OP_ir   v17.2d, v5.2d, v12.d[0]

      ld2     {v8.2d, v9.2d}, [pB], #32       // next B, elements 0-1

      OP_rr   v18.2d, v6.2d, v12.d[0]
      OP_ri   v19.2d, v6.2d, v13.d[0]
      OP_ii   v18.2d, v7.2d, v13.d[0]
      OP_ir   v19.2d, v7.2d, v12.d[0]

      ld2     {v10.2d, v11.2d}, [pB], #32     // next B, elements 2-3

      OP_rr   v20.2d, v4.2d, v12.d[1]
      OP_ri   v21.2d, v4.2d, v13.d[1]
      OP_ii   v20.2d, v5.2d, v13.d[1]
      OP_ir   v21.2d, v5.2d, v12.d[1]

      ld2     {v0.2d, v1.2d}, [pA], #32       // next A, elements 0-1

      OP_rr   v22.2d, v6.2d, v12.d[1]
      OP_ri   v23.2d, v6.2d, v13.d[1]
      OP_ii   v22.2d, v7.2d, v13.d[1]
      OP_ir   v23.2d, v7.2d, v12.d[1]

      ld2     {v2.2d, v3.2d}, [pA], #32       // next A, elements 2-3

      OP_rr   v24.2d, v4.2d, v14.d[0]
      OP_ri   v25.2d, v4.2d, v15.d[0]
      OP_ii   v24.2d, v5.2d, v15.d[0]
      OP_ir   v25.2d, v5.2d, v14.d[0]

      prfm    PLDL1KEEP, [pA, #512]

      OP_rr   v26.2d, v6.2d, v14.d[0]
      OP_ri   v27.2d, v6.2d, v15.d[0]
      OP_ii   v26.2d, v7.2d, v15.d[0]
      OP_ir   v27.2d, v7.2d, v14.d[0]

      prfm    PLDL1KEEP, [pB, #512]

      OP_rr   v28.2d, v4.2d, v14.d[1]
      OP_ri   v29.2d, v4.2d, v15.d[1]
      OP_ii   v28.2d, v5.2d, v15.d[1]
      OP_ir   v29.2d, v5.2d, v14.d[1]

      OP_rr   v30.2d, v6.2d, v14.d[1]
      OP_ri   v31.2d, v6.2d, v15.d[1]
      OP_ii   v30.2d, v7.2d, v15.d[1]
      OP_ir   v31.2d, v7.2d, v14.d[1]
  .endm
  /*
   * KERNEL4x4_E: final step of the pipelined 4x4 loop.
   * Consumes register set 1 (A in v4-v7, B in v12-v15) and loads nothing,
   * so pA and pB are left untouched.
   */
  .macro KERNEL4x4_E
      // B element 0
      OP_rr   v16.2d, v4.2d, v12.d[0]
      OP_ri   v17.2d, v4.2d, v13.d[0]
      OP_ii   v16.2d, v5.2d, v13.d[0]
      OP_ir   v17.2d, v5.2d, v12.d[0]

      OP_rr   v18.2d, v6.2d, v12.d[0]
      OP_ri   v19.2d, v6.2d, v13.d[0]
      OP_ii   v18.2d, v7.2d, v13.d[0]
      OP_ir   v19.2d, v7.2d, v12.d[0]

      // B element 1
      OP_rr   v20.2d, v4.2d, v12.d[1]
      OP_ri   v21.2d, v4.2d, v13.d[1]
      OP_ii   v20.2d, v5.2d, v13.d[1]
      OP_ir   v21.2d, v5.2d, v12.d[1]

      OP_rr   v22.2d, v6.2d, v12.d[1]
      OP_ri   v23.2d, v6.2d, v13.d[1]
      OP_ii   v22.2d, v7.2d, v13.d[1]
      OP_ir   v23.2d, v7.2d, v12.d[1]

      // B element 2
      OP_rr   v24.2d, v4.2d, v14.d[0]
      OP_ri   v25.2d, v4.2d, v15.d[0]
      OP_ii   v24.2d, v5.2d, v15.d[0]
      OP_ir   v25.2d, v5.2d, v14.d[0]

      OP_rr   v26.2d, v6.2d, v14.d[0]
      OP_ri   v27.2d, v6.2d, v15.d[0]
      OP_ii   v26.2d, v7.2d, v15.d[0]
      OP_ir   v27.2d, v7.2d, v14.d[0]

      // B element 3
      OP_rr   v28.2d, v4.2d, v14.d[1]
      OP_ri   v29.2d, v4.2d, v15.d[1]
      OP_ii   v28.2d, v5.2d, v15.d[1]
      OP_ir   v29.2d, v5.2d, v14.d[1]

      OP_rr   v30.2d, v6.2d, v14.d[1]
      OP_ri   v31.2d, v6.2d, v15.d[1]
      OP_ii   v30.2d, v7.2d, v15.d[1]
      OP_ir   v31.2d, v7.2d, v14.d[1]
  .endm
  /*
   * KERNEL4x4_SUB: one self-contained (non-pipelined) k-step of the 4x4 tile.
   * Loads 4 complex values of B into v8-v11 and 4 of A into v0-v3, then
   * accumulates all sixteen products into v16-v31.
   * Advances pA and pB by 64 bytes each.
   */
  .macro KERNEL4x4_SUB
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v10.2d, v11.2d}, [pB], #32
      ld2     {v0.2d, v1.2d}, [pA], #32
      ld2     {v2.2d, v3.2d}, [pA], #32

      // B element 0
      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      OP_rr   v18.2d, v2.2d, v8.d[0]
      OP_ri   v19.2d, v2.2d, v9.d[0]
      OP_ii   v18.2d, v3.2d, v9.d[0]
      OP_ir   v19.2d, v3.2d, v8.d[0]

      // B element 1
      OP_rr   v20.2d, v0.2d, v8.d[1]
      OP_ri   v21.2d, v0.2d, v9.d[1]
      OP_ii   v20.2d, v1.2d, v9.d[1]
      OP_ir   v21.2d, v1.2d, v8.d[1]

      OP_rr   v22.2d, v2.2d, v8.d[1]
      OP_ri   v23.2d, v2.2d, v9.d[1]
      OP_ii   v22.2d, v3.2d, v9.d[1]
      OP_ir   v23.2d, v3.2d, v8.d[1]

      // B element 2
      OP_rr   v24.2d, v0.2d, v10.d[0]
      OP_ri   v25.2d, v0.2d, v11.d[0]
      OP_ii   v24.2d, v1.2d, v11.d[0]
      OP_ir   v25.2d, v1.2d, v10.d[0]

      OP_rr   v26.2d, v2.2d, v10.d[0]
      OP_ri   v27.2d, v2.2d, v11.d[0]
      OP_ii   v26.2d, v3.2d, v11.d[0]
      OP_ir   v27.2d, v3.2d, v10.d[0]

      // B element 3
      OP_rr   v28.2d, v0.2d, v10.d[1]
      OP_ri   v29.2d, v0.2d, v11.d[1]
      OP_ii   v28.2d, v1.2d, v11.d[1]
      OP_ir   v29.2d, v1.2d, v10.d[1]

      OP_rr   v30.2d, v2.2d, v10.d[1]
      OP_ri   v31.2d, v2.2d, v11.d[1]
      OP_ii   v30.2d, v3.2d, v11.d[1]
      OP_ir   v31.2d, v3.2d, v10.d[1]
  .endm
  /*
   * SAVE4x4: C += alpha * accumulators for the 4x4 tile.
   * For every pair of complex results (real in the even accumulator,
   * imaginary in the odd one):
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Walks four LDC-strided lines starting at pCRow0, writing 64 bytes per
   * line, then advances pCRow0 by 64 bytes.
   * Overwrites v0-v7, d10, d11, d14, d15, pCRow1 and pCRow2.
   */
  .macro SAVE4x4
      // alpha lives in integer registers, move it into FP lanes (two copies)
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // line 0
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v16.2d, alphaV0_R
      fmla    v1.2d, v16.2d, alphaV1_I
      fmls    v0.2d, v17.2d, alphaV0_I
      fmla    v1.2d, v17.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      add     pCRow2, pCRow1, #32
      ld2     {v2.2d, v3.2d}, [pCRow2]
      fmla    v2.2d, v18.2d, alphaV0_R
      fmla    v3.2d, v18.2d, alphaV1_I
      fmls    v2.2d, v19.2d, alphaV0_I
      fmla    v3.2d, v19.2d, alphaV1_R
      st2     {v2.2d, v3.2d}, [pCRow2]

      // line 1
      add     pCRow1, pCRow1, LDC
      ld2     {v4.2d, v5.2d}, [pCRow1]
      fmla    v4.2d, v20.2d, alphaV0_R
      fmla    v5.2d, v20.2d, alphaV1_I
      fmls    v4.2d, v21.2d, alphaV0_I
      fmla    v5.2d, v21.2d, alphaV1_R
      st2     {v4.2d, v5.2d}, [pCRow1]

      add     pCRow2, pCRow1, #32
      ld2     {v6.2d, v7.2d}, [pCRow2]
      fmla    v6.2d, v22.2d, alphaV0_R
      fmla    v7.2d, v22.2d, alphaV1_I
      fmls    v6.2d, v23.2d, alphaV0_I
      fmla    v7.2d, v23.2d, alphaV1_R
      st2     {v6.2d, v7.2d}, [pCRow2]

      // line 2
      add     pCRow1, pCRow1, LDC
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v24.2d, alphaV0_R
      fmla    v1.2d, v24.2d, alphaV1_I
      fmls    v0.2d, v25.2d, alphaV0_I
      fmla    v1.2d, v25.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      add     pCRow2, pCRow1, #32
      ld2     {v2.2d, v3.2d}, [pCRow2]
      fmla    v2.2d, v26.2d, alphaV0_R
      fmla    v3.2d, v26.2d, alphaV1_I
      fmls    v2.2d, v27.2d, alphaV0_I
      fmla    v3.2d, v27.2d, alphaV1_R
      st2     {v2.2d, v3.2d}, [pCRow2]

      // line 3
      add     pCRow1, pCRow1, LDC
      ld2     {v4.2d, v5.2d}, [pCRow1]
      fmla    v4.2d, v28.2d, alphaV0_R
      fmla    v5.2d, v28.2d, alphaV1_I
      fmls    v4.2d, v29.2d, alphaV0_I
      fmla    v5.2d, v29.2d, alphaV1_R
      st2     {v4.2d, v5.2d}, [pCRow1]

      add     pCRow2, pCRow1, #32
      ld2     {v6.2d, v7.2d}, [pCRow2]
      fmla    v6.2d, v30.2d, alphaV0_R
      fmla    v7.2d, v30.2d, alphaV1_I
      fmls    v6.2d, v31.2d, alphaV0_I
      fmla    v7.2d, v31.2d, alphaV1_R
      st2     {v6.2d, v7.2d}, [pCRow2]

      add     pCRow0, pCRow0, #64
  .endm
  488. /******************************************************************************/
  /*
   * INIT2x4: clear the eight accumulators used by the 2x4 tile
   * (v16/v17, v20/v21, v24/v25, v28/v29).
   */
  .macro INIT2x4
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d24, xzr
      fmov    d25, xzr
      fmov    d28, xzr
      fmov    d29, xzr
  .endm
  /*
   * KERNEL2x4_SUB: one k-step of the 2x4 tile.
   * Loads 4 complex values of B (v8-v11) and 2 of A (v0/v1) and accumulates
   * into v16/v17, v20/v21, v24/v25, v28/v29.
   * Advances pB by 64 bytes and pA by 32 bytes.
   */
  .macro KERNEL2x4_SUB
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v10.2d, v11.2d}, [pB], #32
      ld2     {v0.2d, v1.2d}, [pA], #32

      // B element 0
      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      // B element 1
      OP_rr   v20.2d, v0.2d, v8.d[1]
      OP_ri   v21.2d, v0.2d, v9.d[1]
      OP_ii   v20.2d, v1.2d, v9.d[1]
      OP_ir   v21.2d, v1.2d, v8.d[1]

      // B element 2
      OP_rr   v24.2d, v0.2d, v10.d[0]
      OP_ri   v25.2d, v0.2d, v11.d[0]
      OP_ii   v24.2d, v1.2d, v11.d[0]
      OP_ir   v25.2d, v1.2d, v10.d[0]

      // B element 3
      OP_rr   v28.2d, v0.2d, v10.d[1]
      OP_ri   v29.2d, v0.2d, v11.d[1]
      OP_ii   v28.2d, v1.2d, v11.d[1]
      OP_ir   v29.2d, v1.2d, v10.d[1]
  .endm
  /*
   * SAVE2x4: C += alpha * accumulators for the 2x4 tile.
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates 32 bytes on each of four LDC-strided lines starting at pCRow0,
   * then advances pCRow0 by 32 bytes.
   * Overwrites v0, v1, v4, v5, d10, d11, d14, d15 and pCRow1.
   */
  .macro SAVE2x4
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // line 0
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v16.2d, alphaV0_R
      fmla    v1.2d, v16.2d, alphaV1_I
      fmls    v0.2d, v17.2d, alphaV0_I
      fmla    v1.2d, v17.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      // line 1
      add     pCRow1, pCRow1, LDC
      ld2     {v4.2d, v5.2d}, [pCRow1]
      fmla    v4.2d, v20.2d, alphaV0_R
      fmla    v5.2d, v20.2d, alphaV1_I
      fmls    v4.2d, v21.2d, alphaV0_I
      fmla    v5.2d, v21.2d, alphaV1_R
      st2     {v4.2d, v5.2d}, [pCRow1]

      // line 2
      add     pCRow1, pCRow1, LDC
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v24.2d, alphaV0_R
      fmla    v1.2d, v24.2d, alphaV1_I
      fmls    v0.2d, v25.2d, alphaV0_I
      fmla    v1.2d, v25.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      // line 3
      add     pCRow1, pCRow1, LDC
      ld2     {v4.2d, v5.2d}, [pCRow1]
      fmla    v4.2d, v28.2d, alphaV0_R
      fmla    v5.2d, v28.2d, alphaV1_I
      fmls    v4.2d, v29.2d, alphaV0_I
      fmla    v5.2d, v29.2d, alphaV1_R
      st2     {v4.2d, v5.2d}, [pCRow1]

      add     pCRow0, pCRow0, #32
  .endm
  558. /******************************************************************************/
  /*
   * INIT1x4: clear the eight accumulators used by the 1x4 tile
   * (d16/d17, d20/d21, d24/d25, d28/d29).
   */
  .macro INIT1x4
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d24, xzr
      fmov    d25, xzr
      fmov    d28, xzr
      fmov    d29, xzr
  .endm
  /*
   * KERNEL1x4_SUB: one k-step of the 1x4 tile (scalar arithmetic).
   * Loads 4 complex values of B (v8-v11) and a single complex value of A
   * into lane 0 of v0/v1, then accumulates into d16/d17, d20/d21, d24/d25,
   * d28/d29.  Advances pB by 64 bytes and pA by 16 bytes.
   */
  .macro KERNEL1x4_SUB
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v10.2d, v11.2d}, [pB], #32
      ld2     {v0.d, v1.d}[0], [pA], #16

      // B element 0
      OP_rr   d16, d0, v8.d[0]
      OP_ri   d17, d0, v9.d[0]
      OP_ii   d16, d1, v9.d[0]
      OP_ir   d17, d1, v8.d[0]

      // B element 1
      OP_rr   d20, d0, v8.d[1]
      OP_ri   d21, d0, v9.d[1]
      OP_ii   d20, d1, v9.d[1]
      OP_ir   d21, d1, v8.d[1]

      // B element 2
      OP_rr   d24, d0, v10.d[0]
      OP_ri   d25, d0, v11.d[0]
      OP_ii   d24, d1, v11.d[0]
      OP_ir   d25, d1, v10.d[0]

      // B element 3
      OP_rr   d28, d0, v10.d[1]
      OP_ri   d29, d0, v11.d[1]
      OP_ii   d28, d1, v11.d[1]
      OP_ir   d29, d1, v10.d[1]
  .endm
  /*
   * SAVE1x4: C += alpha * accumulators for the 1x4 tile (scalar arithmetic).
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates one complex value (16 bytes) on each of four LDC-strided lines
   * starting at pCRow0, then advances pCRow0 by 16 bytes.
   * Overwrites lane 0 of v0, v1, v4, v5, plus d10, d11, d14, d15, pCRow1.
   */
  .macro SAVE1x4
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // line 0
      ld2     {v0.d, v1.d}[0], [pCRow1]
      fmla    d0, d16, alphaV0_R
      fmla    d1, d16, alphaV1_I
      fmls    d0, d17, alphaV0_I
      fmla    d1, d17, alphaV1_R
      st2     {v0.d, v1.d}[0], [pCRow1]

      // line 1
      add     pCRow1, pCRow1, LDC
      ld2     {v4.d, v5.d}[0], [pCRow1]
      fmla    d4, d20, alphaV0_R
      fmla    d5, d20, alphaV1_I
      fmls    d4, d21, alphaV0_I
      fmla    d5, d21, alphaV1_R
      st2     {v4.d, v5.d}[0], [pCRow1]

      // line 2
      add     pCRow1, pCRow1, LDC
      ld2     {v0.d, v1.d}[0], [pCRow1]
      fmla    d0, d24, alphaV0_R
      fmla    d1, d24, alphaV1_I
      fmls    d0, d25, alphaV0_I
      fmla    d1, d25, alphaV1_R
      st2     {v0.d, v1.d}[0], [pCRow1]

      // line 3
      add     pCRow1, pCRow1, LDC
      ld2     {v4.d, v5.d}[0], [pCRow1]
      fmla    d4, d28, alphaV0_R
      fmla    d5, d28, alphaV1_I
      fmls    d4, d29, alphaV0_I
      fmla    d5, d29, alphaV1_R
      st2     {v4.d, v5.d}[0], [pCRow1]

      add     pCRow0, pCRow0, #16
  .endm
  628. /******************************************************************************/
  /*
   * INIT4x2: clear the eight accumulators v16-v23 used by the 4x2 tile.
   */
  .macro INIT4x2
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d18, xzr
      fmov    d19, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d22, xzr
      fmov    d23, xzr
  .endm
  /*
   * KERNEL4x2_SUB: one k-step of the 4x2 tile.
   * Loads 2 complex values of B (v8/v9) and 4 of A (v0-v3) and accumulates
   * into v16-v23.  Advances pB by 32 bytes and pA by 64 bytes.
   */
  .macro KERNEL4x2_SUB
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v0.2d, v1.2d}, [pA], #32
      ld2     {v2.2d, v3.2d}, [pA], #32

      // B element 0
      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      OP_rr   v18.2d, v2.2d, v8.d[0]
      OP_ri   v19.2d, v2.2d, v9.d[0]
      OP_ii   v18.2d, v3.2d, v9.d[0]
      OP_ir   v19.2d, v3.2d, v8.d[0]

      // B element 1
      OP_rr   v20.2d, v0.2d, v8.d[1]
      OP_ri   v21.2d, v0.2d, v9.d[1]
      OP_ii   v20.2d, v1.2d, v9.d[1]
      OP_ir   v21.2d, v1.2d, v8.d[1]

      OP_rr   v22.2d, v2.2d, v8.d[1]
      OP_ri   v23.2d, v2.2d, v9.d[1]
      OP_ii   v22.2d, v3.2d, v9.d[1]
      OP_ir   v23.2d, v3.2d, v8.d[1]
  .endm
  /*
   * SAVE4x2: C += alpha * accumulators for the 4x2 tile.
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates 64 bytes on each of two LDC-strided lines starting at pCRow0,
   * then advances pCRow0 by 64 bytes.
   * Overwrites v0-v7, d10, d11, d14, d15, pCRow1 and pCRow2.
   */
  .macro SAVE4x2
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // line 0
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v16.2d, alphaV0_R
      fmla    v1.2d, v16.2d, alphaV1_I
      fmls    v0.2d, v17.2d, alphaV0_I
      fmla    v1.2d, v17.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      add     pCRow2, pCRow1, #32
      ld2     {v2.2d, v3.2d}, [pCRow2]
      fmla    v2.2d, v18.2d, alphaV0_R
      fmla    v3.2d, v18.2d, alphaV1_I
      fmls    v2.2d, v19.2d, alphaV0_I
      fmla    v3.2d, v19.2d, alphaV1_R
      st2     {v2.2d, v3.2d}, [pCRow2]

      // line 1
      add     pCRow1, pCRow1, LDC
      ld2     {v4.2d, v5.2d}, [pCRow1]
      fmla    v4.2d, v20.2d, alphaV0_R
      fmla    v5.2d, v20.2d, alphaV1_I
      fmls    v4.2d, v21.2d, alphaV0_I
      fmla    v5.2d, v21.2d, alphaV1_R
      st2     {v4.2d, v5.2d}, [pCRow1]

      add     pCRow2, pCRow1, #32
      ld2     {v6.2d, v7.2d}, [pCRow2]
      fmla    v6.2d, v22.2d, alphaV0_R
      fmla    v7.2d, v22.2d, alphaV1_I
      fmls    v6.2d, v23.2d, alphaV0_I
      fmla    v7.2d, v23.2d, alphaV1_R
      st2     {v6.2d, v7.2d}, [pCRow2]

      add     pCRow0, pCRow0, #64
  .endm
  698. /******************************************************************************/
  /*
   * INIT2x2: clear the four accumulators v16, v17, v20, v21 used by the
   * 2x2 tile.
   */
  .macro INIT2x2
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d20, xzr
      fmov    d21, xzr
  .endm
  /*
   * KERNEL2x2_SUB: one k-step of the 2x2 tile.
   * Loads 2 complex values of B (v8/v9) and 2 of A (v0/v1) and accumulates
   * into v16/v17 and v20/v21.  Advances pA and pB by 32 bytes each.
   */
  .macro KERNEL2x2_SUB
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v0.2d, v1.2d}, [pA], #32

      // B element 0
      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      // B element 1
      OP_rr   v20.2d, v0.2d, v8.d[1]
      OP_ri   v21.2d, v0.2d, v9.d[1]
      OP_ii   v20.2d, v1.2d, v9.d[1]
      OP_ir   v21.2d, v1.2d, v8.d[1]
  .endm
  /*
   * SAVE2x2: C += alpha * accumulators for the 2x2 tile.
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates 32 bytes on each of two LDC-strided lines starting at pCRow0,
   * then advances pCRow0 by 32 bytes.
   * Overwrites v0, v1, v4, v5, d10, d11, d14, d15 and pCRow1.
   */
  .macro SAVE2x2
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // line 0
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v16.2d, alphaV0_R
      fmla    v1.2d, v16.2d, alphaV1_I
      fmls    v0.2d, v17.2d, alphaV0_I
      fmla    v1.2d, v17.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      // line 1
      add     pCRow1, pCRow1, LDC
      ld2     {v4.2d, v5.2d}, [pCRow1]
      fmla    v4.2d, v20.2d, alphaV0_R
      fmla    v5.2d, v20.2d, alphaV1_I
      fmls    v4.2d, v21.2d, alphaV0_I
      fmla    v5.2d, v21.2d, alphaV1_R
      st2     {v4.2d, v5.2d}, [pCRow1]

      add     pCRow0, pCRow0, #32
  .endm
  740. /******************************************************************************/
  /*
   * INIT1x2: clear the four accumulators d16, d17, d20, d21 used by the
   * 1x2 tile.  d16 is zeroed from xzr and the others are copied from it.
   */
  .macro INIT1x2
      fmov    d16, xzr
      fmov    d17, d16
      fmov    d20, d16
      fmov    d21, d16
  .endm
  /*
   * KERNEL1x2_SUB: one k-step of the 1x2 tile (scalar arithmetic).
   * Loads 2 complex values of B (v8/v9) and a single complex value of A
   * into lane 0 of v0/v1, then accumulates into d16/d17 and d20/d21.
   * Advances pB by 32 bytes and pA by 16 bytes.
   */
  .macro KERNEL1x2_SUB
      ld2     {v8.2d, v9.2d}, [pB], #32
      ld2     {v0.d, v1.d}[0], [pA], #16

      // B element 0
      OP_rr   d16, d0, v8.d[0]
      OP_ri   d17, d0, v9.d[0]
      OP_ii   d16, d1, v9.d[0]
      OP_ir   d17, d1, v8.d[0]

      // B element 1
      OP_rr   d20, d0, v8.d[1]
      OP_ri   d21, d0, v9.d[1]
      OP_ii   d20, d1, v9.d[1]
      OP_ir   d21, d1, v8.d[1]
  .endm
  /*
   * SAVE1x2: C += alpha * accumulators for the 1x2 tile (scalar arithmetic).
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates one complex value (16 bytes) on each of two LDC-strided lines
   * starting at pCRow0, then advances pCRow0 by 16 bytes.
   * Overwrites lane 0 of v0, v1, v4, v5, plus d10, d11, d14, d15, pCRow1.
   */
  .macro SAVE1x2
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // line 0
      ld2     {v0.d, v1.d}[0], [pCRow1]
      fmla    d0, d16, alphaV0_R
      fmla    d1, d16, alphaV1_I
      fmls    d0, d17, alphaV0_I
      fmla    d1, d17, alphaV1_R
      st2     {v0.d, v1.d}[0], [pCRow1]

      // line 1
      add     pCRow1, pCRow1, LDC
      ld2     {v4.d, v5.d}[0], [pCRow1]
      fmla    d4, d20, alphaV0_R
      fmla    d5, d20, alphaV1_I
      fmls    d4, d21, alphaV0_I
      fmla    d5, d21, alphaV1_R
      st2     {v4.d, v5.d}[0], [pCRow1]

      add     pCRow0, pCRow0, #16
  .endm
  782. /******************************************************************************/
  /*
   * INIT4x1: clear the four accumulators v16-v19 used by the 4x1 tile.
   */
  .macro INIT4x1
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d18, xzr
      fmov    d19, xzr
  .endm
  /*
   * KERNEL4x1_SUB: one k-step of the 4x1 tile.
   * Loads a single complex value of B into lane 0 of v8/v9 and 4 complex
   * values of A into v0-v3, then accumulates into v16-v19.
   * Advances pB by 16 bytes and pA by 64 bytes.
   */
  .macro KERNEL4x1_SUB
      ld2     {v8.d, v9.d}[0], [pB], #16
      ld2     {v0.2d, v1.2d}, [pA], #32
      ld2     {v2.2d, v3.2d}, [pA], #32

      // A elements 0-1
      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]

      // A elements 2-3
      OP_rr   v18.2d, v2.2d, v8.d[0]
      OP_ri   v19.2d, v2.2d, v9.d[0]
      OP_ii   v18.2d, v3.2d, v9.d[0]
      OP_ir   v19.2d, v3.2d, v8.d[0]
  .endm
  /*
   * SAVE4x1: C += alpha * accumulators for the 4x1 tile.
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates 64 contiguous bytes starting at pCRow0, then advances pCRow0
   * by 64 bytes.
   * Overwrites v0-v3, d10, d11, d14, d15, pCRow1 and pCRow2.
   */
  .macro SAVE4x1
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      // elements 0-1
      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v16.2d, alphaV0_R
      fmla    v1.2d, v16.2d, alphaV1_I
      fmls    v0.2d, v17.2d, alphaV0_I
      fmla    v1.2d, v17.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      // elements 2-3
      add     pCRow2, pCRow1, #32
      ld2     {v2.2d, v3.2d}, [pCRow2]
      fmla    v2.2d, v18.2d, alphaV0_R
      fmla    v3.2d, v18.2d, alphaV1_I
      fmls    v2.2d, v19.2d, alphaV0_I
      fmla    v3.2d, v19.2d, alphaV1_R
      st2     {v2.2d, v3.2d}, [pCRow2]

      add     pCRow0, pCRow0, #64
  .endm
  826. /******************************************************************************/
  /*
   * INIT2x1: clear the two accumulators v16 (real) and v17 (imaginary)
   * used by the 2x1 tile.
   */
  .macro INIT2x1
      fmov    d16, xzr
      fmov    d17, d16
  .endm
  /*
   * KERNEL2x1_SUB: one k-step of the 2x1 tile.
   * Loads a single complex value of B into lane 0 of v8/v9 and 2 complex
   * values of A into v0/v1, then accumulates into v16/v17.
   * Advances pB by 16 bytes and pA by 32 bytes.
   */
  .macro KERNEL2x1_SUB
      ld2     {v8.d, v9.d}[0], [pB], #16
      ld2     {v0.2d, v1.2d}, [pA], #32

      OP_rr   v16.2d, v0.2d, v8.d[0]
      OP_ri   v17.2d, v0.2d, v9.d[0]
      OP_ii   v16.2d, v1.2d, v9.d[0]
      OP_ir   v17.2d, v1.2d, v8.d[0]
  .endm
  /*
   * SAVE2x1: C += alpha * accumulators for the 2x1 tile.
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates 32 contiguous bytes at pCRow0, then advances pCRow0 by 32 bytes.
   * Overwrites v0, v1, d10, d11, d14, d15 and pCRow1.
   */
  .macro SAVE2x1
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      ld2     {v0.2d, v1.2d}, [pCRow1]
      fmla    v0.2d, v16.2d, alphaV0_R
      fmla    v1.2d, v16.2d, alphaV1_I
      fmls    v0.2d, v17.2d, alphaV0_I
      fmla    v1.2d, v17.2d, alphaV1_R
      st2     {v0.2d, v1.2d}, [pCRow1]

      add     pCRow0, pCRow0, #32
  .endm
  855. /******************************************************************************/
  /*
   * INIT1x1: clear the two accumulators d16 (real) and d17 (imaginary)
   * used by the 1x1 tile.
   */
  .macro INIT1x1
      fmov    d17, xzr
      fmov    d16, xzr
  .endm
  /*
   * KERNEL1x1_SUB: one k-step of the 1x1 tile (scalar arithmetic).
   * Loads one complex value of B into lane 0 of v8/v9 and one of A into
   * lane 0 of v0/v1, then accumulates into d16/d17.
   * Advances pA and pB by 16 bytes each.
   */
  .macro KERNEL1x1_SUB
      ld2     {v8.d, v9.d}[0], [pB], #16
      ld2     {v0.d, v1.d}[0], [pA], #16

      OP_rr   d16, d0, v8.d[0]
      OP_ri   d17, d0, v9.d[0]
      OP_ii   d16, d1, v9.d[0]
      OP_ir   d17, d1, v8.d[0]
  .endm
  /*
   * SAVE1x1: C += alpha * accumulator for the 1x1 tile (scalar arithmetic).
   *     C_re += acc_re * alpha_re - acc_im * alpha_im
   *     C_im += acc_re * alpha_im + acc_im * alpha_re
   * Updates one complex value (16 bytes) at pCRow0, then advances pCRow0
   * by 16 bytes.
   * Overwrites lane 0 of v0 and v1, plus d10, d11, d14, d15 and pCRow1.
   */
  .macro SAVE1x1
      fmov    alpha0_R, alpha_save_R
      fmov    alpha1_R, alpha_save_R
      fmov    alpha0_I, alpha_save_I
      fmov    alpha1_I, alpha_save_I

      mov     pCRow1, pCRow0

      ld2     {v0.d, v1.d}[0], [pCRow1]
      fmla    d0, d16, alphaV0_R
      fmla    d1, d16, alphaV1_I
      fmls    d0, d17, alphaV0_I
      fmla    d1, d17, alphaV1_R
      st2     {v0.d, v1.d}[0], [pCRow1]

      add     pCRow0, pCRow0, #16
  .endm
  884. /*******************************************************************************
  885. * End of macro definitions
  886. *******************************************************************************/
  // Kernel body. Loop nest, as driven by the counters below:
  //   counterJ : N in column panels of 4, then a 2-wide and a 1-wide remainder
  //   counterI : M in row tiles of 4, then a 2-row and a 1-row remainder
  //   counterL : K loop for the current tile
  // Every tile resets pB to origPB, runs the matching INIT/KERNEL macros and
  // finishes with the matching SAVE macro (all defined earlier in the file).
  // Note on the "tst/ands ... ; ble" pairs: the masks used (#1, #2, #3, #7)
  // give a non-negative result and tst/ands clear V, so ble is taken exactly
  // when the masked value is zero.
  887. PROLOGUE
  888. .align 5
  // Reserve 11 * 16 bytes of stack and spill registers into it.
  // NOTE(review): d8-d15 and x19-x28 are the AAPCS64 callee-saved set; d16/d17
  // and x18 are spilled as well although that ABI does not require it.
  889. add sp, sp, #-(11 * 16)
  890. stp d8, d9, [sp, #(0 * 16)]
  891. stp d10, d11, [sp, #(1 * 16)]
  892. stp d12, d13, [sp, #(2 * 16)]
  893. stp d14, d15, [sp, #(3 * 16)]
  894. stp d16, d17, [sp, #(4 * 16)]
  895. stp x18, x19, [sp, #(5 * 16)]
  896. stp x20, x21, [sp, #(6 * 16)]
  897. stp x22, x23, [sp, #(7 * 16)]
  898. stp x24, x25, [sp, #(8 * 16)]
  899. stp x26, x27, [sp, #(9 * 16)]
  900. str x28, [sp, #(10 * 16)]
  // Keep the incoming d0 / d1 in alpha_save_R / alpha_save_I (going by the
  // names: real and imaginary part of alpha).
  901. fmov alpha_save_R, d0
  902. fmov alpha_save_I, d1
  903. lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 (i.e. ldc * 16)
  // NOTE(review): pB is reloaded from origPB at the start of every tile, so
  // this initial copy looks redundant.
  904. mov pB, origPB
  905. mov counterJ, origN
  906. asr counterJ, counterJ, #2 // J = N / 4 (number of 4-column panels)
  907. cmp counterJ, #0
  908. ble zgemm_kernel_L2_BEGIN
  // ---- Panels of 4 columns: one pass per counterJ ----
  909. zgemm_kernel_L4_BEGIN:
  910. mov pCRow0, pC // pCRow0 = C
  911. add pC, pC, LDC, lsl #2 // pC += 4 * LDC, start of the next panel
  912. mov pA, origPA // pA = start of A array
  913. zgemm_kernel_L4_M4_BEGIN:
  914. mov counterI, origM
  915. asr counterI, counterI, #2 // counterI = M / 4
  916. cmp counterI, #0
  917. ble zgemm_kernel_L4_M2_BEGIN
  // K loop for the 4x4 tile, counted in pairs of K steps (counterL = K / 2).
  // KERNEL4x4_I opens the sequence, _M1/_M2 alternate in the steady state and
  // _E closes it; an odd K is finished by one KERNEL4x4_SUB at _M4_44.
  918. zgemm_kernel_L4_M4_20:
  919. mov pB, origPB
  920. asr counterL , origK, #1 // L = K / 2
  921. cmp counterL , #2 // is there at least 4 to do?
  922. blt zgemm_kernel_L4_M4_32
  923. KERNEL4x4_I // do one in the K
  924. KERNEL4x4_M2 // do another in the K
  925. subs counterL, counterL, #2 // subtract 2
  926. ble zgemm_kernel_L4_M4_22a
  927. .align 5
  // Steady state: two K steps per pass, counterL - 2 passes.
  928. zgemm_kernel_L4_M4_22:
  929. KERNEL4x4_M1
  930. KERNEL4x4_M2
  931. subs counterL, counterL, #1
  932. bgt zgemm_kernel_L4_M4_22
  // Last pair of the pipelined sequence.
  933. zgemm_kernel_L4_M4_22a:
  934. KERNEL4x4_M1
  935. KERNEL4x4_E
  936. b zgemm_kernel_L4_M4_44
  // Short case, K / 2 < 2: counterL is 0 or 1 here (for non-negative K).
  937. zgemm_kernel_L4_M4_32:
  938. tst counterL, #1
  939. ble zgemm_kernel_L4_M4_40
  940. KERNEL4x4_I
  941. KERNEL4x4_E
  942. b zgemm_kernel_L4_M4_44
  // No pair was run at all: only INIT4x4 before the odd-K check.
  943. zgemm_kernel_L4_M4_40:
  944. INIT4x4
  945. zgemm_kernel_L4_M4_44:
  946. ands counterL , origK, #1 // one K step left over when K is odd
  947. ble zgemm_kernel_L4_M4_100
  948. zgemm_kernel_L4_M4_46:
  949. KERNEL4x4_SUB
  950. zgemm_kernel_L4_M4_100:
  951. SAVE4x4
  952. zgemm_kernel_L4_M4_END:
  953. subs counterI, counterI, #1
  // counterI was > 0 on entry, so this bne loops like the bgt used by the
  // other M4 loops in this file.
  954. bne zgemm_kernel_L4_M4_20
  // ---- M remainder inside the 4-column panel ----
  955. zgemm_kernel_L4_M2_BEGIN:
  956. mov counterI, origM
  957. tst counterI , #3 // any rows left after the tiles of 4?
  958. ble zgemm_kernel_L4_END
  959. tst counterI, #2 // is bit 1 of M set (a 2-row tile remains)?
  960. ble zgemm_kernel_L4_M1_BEGIN
  961. zgemm_kernel_L4_M2_20:
  962. INIT2x4
  963. mov pB, origPB
  964. asr counterL , origK, #3 // counterL = K / 8
  965. cmp counterL , #0
  966. ble zgemm_kernel_L4_M2_40
  // Unrolled by 8: eight KERNEL2x4_SUB invocations per pass.
  967. zgemm_kernel_L4_M2_22:
  968. KERNEL2x4_SUB
  969. KERNEL2x4_SUB
  970. KERNEL2x4_SUB
  971. KERNEL2x4_SUB
  972. KERNEL2x4_SUB
  973. KERNEL2x4_SUB
  974. KERNEL2x4_SUB
  975. KERNEL2x4_SUB
  976. subs counterL, counterL, #1
  977. bgt zgemm_kernel_L4_M2_22
  978. zgemm_kernel_L4_M2_40:
  979. ands counterL , origK, #7 // counterL = K % 8
  980. ble zgemm_kernel_L4_M2_100
  // Remaining K % 8 steps, one invocation at a time.
  981. zgemm_kernel_L4_M2_42:
  982. KERNEL2x4_SUB
  983. subs counterL, counterL, #1
  984. bgt zgemm_kernel_L4_M2_42
  985. zgemm_kernel_L4_M2_100:
  986. SAVE2x4
  987. zgemm_kernel_L4_M2_END:
  988. zgemm_kernel_L4_M1_BEGIN:
  989. tst counterI, #1 // is bit 0 of M set (a single row remains)?
  990. ble zgemm_kernel_L4_END
  991. zgemm_kernel_L4_M1_20:
  992. INIT1x4
  993. mov pB, origPB
  994. asr counterL , origK, #3 // counterL = K / 8
  995. cmp counterL , #0
  996. ble zgemm_kernel_L4_M1_40
  // Unrolled by 8: eight KERNEL1x4_SUB invocations per pass.
  997. zgemm_kernel_L4_M1_22:
  998. KERNEL1x4_SUB
  999. KERNEL1x4_SUB
  1000. KERNEL1x4_SUB
  1001. KERNEL1x4_SUB
  1002. KERNEL1x4_SUB
  1003. KERNEL1x4_SUB
  1004. KERNEL1x4_SUB
  1005. KERNEL1x4_SUB
  1006. subs counterL, counterL, #1
  1007. bgt zgemm_kernel_L4_M1_22
  1008. zgemm_kernel_L4_M1_40:
  1009. ands counterL , origK, #7 // counterL = K % 8
  1010. ble zgemm_kernel_L4_M1_100
  1011. zgemm_kernel_L4_M1_42:
  1012. KERNEL1x4_SUB
  1013. subs counterL, counterL, #1
  1014. bgt zgemm_kernel_L4_M1_42
  1015. zgemm_kernel_L4_M1_100:
  1016. SAVE1x4
  // End of one 4-column panel: step origPB past it and loop over counterJ.
  1017. zgemm_kernel_L4_END:
  1018. lsl temp, origK, #6 // temp = K * 64
  1019. add origPB, origPB, temp // B = B + K * 4 * 8 * 2
  1020. subs counterJ, counterJ , #1 // j--
  1021. bgt zgemm_kernel_L4_BEGIN
  1022. /******************************************************************************/
  1023. zgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
  1024. mov counterJ , origN
  1025. tst counterJ , #3 // N % 4 == 0: nothing left to do
  1026. ble zgemm_kernel_L999
  1027. tst counterJ , #2 // is bit 1 of N set (a 2-column panel remains)?
  1028. ble zgemm_kernel_L1_BEGIN
  // ---- Panel of 2 columns (executed at most once) ----
  1029. mov pCRow0, pC // pCRow0 = pC
  1030. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  1031. mov pA, origPA // pA = A
  1032. zgemm_kernel_L2_M4_BEGIN:
  1033. mov counterI, origM
  1034. asr counterI, counterI, #2 // counterI = M / 4
  1035. cmp counterI,#0
  1036. ble zgemm_kernel_L2_M2_BEGIN
  1037. zgemm_kernel_L2_M4_20:
  1038. INIT4x2
  1039. mov pB, origPB
  1040. asr counterL , origK, #3 // counterL = K / 8
  1041. cmp counterL,#0
  1042. ble zgemm_kernel_L2_M4_40
  1043. .align 5
  // Unrolled by 8: eight KERNEL4x2_SUB invocations per pass.
  1044. zgemm_kernel_L2_M4_22:
  1045. KERNEL4x2_SUB
  1046. KERNEL4x2_SUB
  1047. KERNEL4x2_SUB
  1048. KERNEL4x2_SUB
  1049. KERNEL4x2_SUB
  1050. KERNEL4x2_SUB
  1051. KERNEL4x2_SUB
  1052. KERNEL4x2_SUB
  1053. subs counterL, counterL, #1
  1054. bgt zgemm_kernel_L2_M4_22
  1055. zgemm_kernel_L2_M4_40:
  1056. ands counterL , origK, #7 // counterL = K % 8
  1057. ble zgemm_kernel_L2_M4_100
  1058. zgemm_kernel_L2_M4_42:
  1059. KERNEL4x2_SUB
  1060. subs counterL, counterL, #1
  1061. bgt zgemm_kernel_L2_M4_42
  1062. zgemm_kernel_L2_M4_100:
  1063. SAVE4x2
  1064. zgemm_kernel_L2_M4_END:
  1065. subs counterI, counterI, #1
  1066. bgt zgemm_kernel_L2_M4_20
  // ---- M remainder inside the 2-column panel ----
  1067. zgemm_kernel_L2_M2_BEGIN:
  1068. mov counterI, origM
  1069. tst counterI , #3 // any rows left after the tiles of 4?
  1070. ble zgemm_kernel_L2_END
  1071. tst counterI, #2 // is bit 1 of M set (a 2-row tile remains)?
  1072. ble zgemm_kernel_L2_M1_BEGIN
  1073. zgemm_kernel_L2_M2_20:
  1074. INIT2x2
  1075. mov pB, origPB
  1076. asr counterL , origK, #3 // counterL = K / 8
  1077. cmp counterL,#0
  1078. ble zgemm_kernel_L2_M2_40
  // Unrolled by 8: eight KERNEL2x2_SUB invocations per pass.
  1079. zgemm_kernel_L2_M2_22:
  1080. KERNEL2x2_SUB
  1081. KERNEL2x2_SUB
  1082. KERNEL2x2_SUB
  1083. KERNEL2x2_SUB
  1084. KERNEL2x2_SUB
  1085. KERNEL2x2_SUB
  1086. KERNEL2x2_SUB
  1087. KERNEL2x2_SUB
  1088. subs counterL, counterL, #1
  1089. bgt zgemm_kernel_L2_M2_22
  1090. zgemm_kernel_L2_M2_40:
  1091. ands counterL , origK, #7 // counterL = K % 8
  1092. ble zgemm_kernel_L2_M2_100
  1093. zgemm_kernel_L2_M2_42:
  1094. KERNEL2x2_SUB
  1095. subs counterL, counterL, #1
  1096. bgt zgemm_kernel_L2_M2_42
  1097. zgemm_kernel_L2_M2_100:
  1098. SAVE2x2
  1099. zgemm_kernel_L2_M2_END:
  1100. zgemm_kernel_L2_M1_BEGIN:
  1101. tst counterI, #1 // is bit 0 of M set (a single row remains)?
  1102. ble zgemm_kernel_L2_END
  1103. zgemm_kernel_L2_M1_20:
  1104. INIT1x2
  1105. mov pB, origPB
  1106. asr counterL , origK, #3 // counterL = K / 8
  1107. cmp counterL, #0
  1108. ble zgemm_kernel_L2_M1_40
  // Unrolled by 8: eight KERNEL1x2_SUB invocations per pass.
  1109. zgemm_kernel_L2_M1_22:
  1110. KERNEL1x2_SUB
  1111. KERNEL1x2_SUB
  1112. KERNEL1x2_SUB
  1113. KERNEL1x2_SUB
  1114. KERNEL1x2_SUB
  1115. KERNEL1x2_SUB
  1116. KERNEL1x2_SUB
  1117. KERNEL1x2_SUB
  1118. subs counterL, counterL, #1
  1119. bgt zgemm_kernel_L2_M1_22
  1120. zgemm_kernel_L2_M1_40:
  1121. ands counterL , origK, #7 // counterL = K % 8
  1122. ble zgemm_kernel_L2_M1_100
  1123. zgemm_kernel_L2_M1_42:
  1124. KERNEL1x2_SUB
  1125. subs counterL, counterL, #1
  1126. bgt zgemm_kernel_L2_M1_42
  1127. zgemm_kernel_L2_M1_100:
  1128. SAVE1x2
  // End of the 2-column panel: step origPB past it.
  1129. zgemm_kernel_L2_END:
  1130. lsl temp, origK, #5 // temp = K * 32
  1131. add origPB, origPB, temp // B = B + K * 2 * 8 * 2
  1132. /******************************************************************************/
  // ---- Last single column (executed at most once) ----
  1133. zgemm_kernel_L1_BEGIN:
  1134. mov counterJ , origN
  1135. tst counterJ , #1 // is bit 0 of N set (one column remains)?
  1136. ble zgemm_kernel_L999 // done
  1137. mov pCRow0, pC // pCRow0 = C
  1138. add pC , pC , LDC // Update pC to point to next
  1139. mov pA, origPA // pA = A
  1140. zgemm_kernel_L1_M4_BEGIN:
  1141. mov counterI, origM
  1142. asr counterI, counterI, #2 // counterI = M / 4
  1143. cmp counterI, #0
  1144. ble zgemm_kernel_L1_M2_BEGIN
  1145. zgemm_kernel_L1_M4_20:
  1146. INIT4x1
  1147. mov pB, origPB
  1148. asr counterL , origK, #3 // counterL = K / 8
  1149. cmp counterL , #0
  1150. ble zgemm_kernel_L1_M4_40
  1151. .align 5
  // Unrolled by 8: eight KERNEL4x1_SUB invocations per pass.
  1152. zgemm_kernel_L1_M4_22:
  1153. KERNEL4x1_SUB
  1154. KERNEL4x1_SUB
  1155. KERNEL4x1_SUB
  1156. KERNEL4x1_SUB
  1157. KERNEL4x1_SUB
  1158. KERNEL4x1_SUB
  1159. KERNEL4x1_SUB
  1160. KERNEL4x1_SUB
  1161. subs counterL, counterL, #1
  1162. bgt zgemm_kernel_L1_M4_22
  1163. zgemm_kernel_L1_M4_40:
  1164. ands counterL , origK, #7 // counterL = K % 8
  1165. ble zgemm_kernel_L1_M4_100
  1166. zgemm_kernel_L1_M4_42:
  1167. KERNEL4x1_SUB
  1168. subs counterL, counterL, #1
  1169. bgt zgemm_kernel_L1_M4_42
  1170. zgemm_kernel_L1_M4_100:
  1171. SAVE4x1
  1172. zgemm_kernel_L1_M4_END:
  1173. subs counterI, counterI, #1
  1174. bgt zgemm_kernel_L1_M4_20
  // ---- M remainder inside the single column ----
  1175. zgemm_kernel_L1_M2_BEGIN:
  1176. mov counterI, origM
  1177. tst counterI , #3 // any rows left after the tiles of 4?
  1178. ble zgemm_kernel_L1_END
  1179. tst counterI, #2 // is bit 1 of M set (a 2-row tile remains)?
  1180. ble zgemm_kernel_L1_M1_BEGIN
  1181. zgemm_kernel_L1_M2_20:
  1182. INIT2x1
  1183. mov pB, origPB
  1184. asr counterL , origK, #3 // counterL = K / 8
  1185. cmp counterL , #0
  1186. ble zgemm_kernel_L1_M2_40
  // Unrolled by 8: eight KERNEL2x1_SUB invocations per pass.
  1187. zgemm_kernel_L1_M2_22:
  1188. KERNEL2x1_SUB
  1189. KERNEL2x1_SUB
  1190. KERNEL2x1_SUB
  1191. KERNEL2x1_SUB
  1192. KERNEL2x1_SUB
  1193. KERNEL2x1_SUB
  1194. KERNEL2x1_SUB
  1195. KERNEL2x1_SUB
  1196. subs counterL, counterL, #1
  1197. bgt zgemm_kernel_L1_M2_22
  1198. zgemm_kernel_L1_M2_40:
  1199. ands counterL , origK, #7 // counterL = K % 8
  1200. ble zgemm_kernel_L1_M2_100
  1201. zgemm_kernel_L1_M2_42:
  1202. KERNEL2x1_SUB
  1203. subs counterL, counterL, #1
  1204. bgt zgemm_kernel_L1_M2_42
  1205. zgemm_kernel_L1_M2_100:
  1206. SAVE2x1
  1207. zgemm_kernel_L1_M2_END:
  1208. zgemm_kernel_L1_M1_BEGIN:
  1209. tst counterI, #1 // is bit 0 of M set (a single row remains)?
  1210. ble zgemm_kernel_L1_END
  1211. zgemm_kernel_L1_M1_20:
  1212. INIT1x1
  1213. mov pB, origPB
  1214. asr counterL , origK, #3 // counterL = K / 8
  1215. cmp counterL , #0
  1216. ble zgemm_kernel_L1_M1_40
  // Unrolled by 8: eight KERNEL1x1_SUB invocations per pass.
  1217. zgemm_kernel_L1_M1_22:
  1218. KERNEL1x1_SUB
  1219. KERNEL1x1_SUB
  1220. KERNEL1x1_SUB
  1221. KERNEL1x1_SUB
  1222. KERNEL1x1_SUB
  1223. KERNEL1x1_SUB
  1224. KERNEL1x1_SUB
  1225. KERNEL1x1_SUB
  1226. subs counterL, counterL, #1
  1227. bgt zgemm_kernel_L1_M1_22
  1228. zgemm_kernel_L1_M1_40:
  1229. ands counterL , origK, #7 // counterL = K % 8
  1230. ble zgemm_kernel_L1_M1_100
  1231. zgemm_kernel_L1_M1_42:
  1232. KERNEL1x1_SUB
  1233. subs counterL, counterL, #1
  1234. bgt zgemm_kernel_L1_M1_42
  1235. zgemm_kernel_L1_M1_100:
  1236. SAVE1x1
  1237. zgemm_kernel_L1_END:
  // Common exit: return 0 in x0, reload everything spilled on entry from the
  // same slots, and release the 11 * 16 byte frame.
  1238. zgemm_kernel_L999:
  1239. mov x0, #0 // set return value
  1240. ldp d8, d9, [sp, #(0 * 16)]
  1241. ldp d10, d11, [sp, #(1 * 16)]
  1242. ldp d12, d13, [sp, #(2 * 16)]
  1243. ldp d14, d15, [sp, #(3 * 16)]
  1244. ldp d16, d17, [sp, #(4 * 16)]
  1245. ldp x18, x19, [sp, #(5 * 16)]
  1246. ldp x20, x21, [sp, #(6 * 16)]
  1247. ldp x22, x23, [sp, #(7 * 16)]
  1248. ldp x24, x25, [sp, #(8 * 16)]
  1249. ldp x26, x27, [sp, #(9 * 16)]
  1250. ldr x28, [sp, #(10 * 16)]
  1251. add sp, sp, #(11*16)
  1252. ret
  1253. EPILOGUE