You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_16x4.S 37 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
  31. #define origM x0 /* argument bm (see the CNAME signature comment above) */
  32. #define origN x1 /* argument bn */
  33. #define origK x2 /* argument bk */
  34. #define origPA x3 /* argument ba: start of the A data */
  35. #define origPB x4 /* argument bb: start of the B data; advanced after each group of 4 in N */
  36. #define pC x5 /* argument C: advanced by the code as groups in N are consumed */
  37. #define LDC x6 /* argument ldc; the prologue shifts it left by 2 so it is a byte stride */
  38. #define temp x7 /* scratch alias */
  39. #define counterL x8 /* loop counter derived from origK */
  40. #define counterI x9 /* loop counter derived from origM */
  41. #define counterJ x10 /* loop counter derived from origN */
  42. #define pB x11 /* running read pointer into B (post-incremented by the KERNEL macros) */
  43. #define pCRow0 x12 /* running pointers into C, spaced LDC bytes apart */
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16 /* running read pointer into A (post-incremented by the KERNEL macros) */
  48. #define alpha w17 /* bit pattern of alpha, copied from s0 in the prologue */
  49. #define alpha0 s10 /* alpha as a scalar FP register; reloaded from w17 by every SAVE macro */
  50. #define alphaV0 v10.s[0] /* same register as alpha0, viewed as a vector lane for fmla by element */
  51. #define A_PRE_SIZE 2560 /* byte offset ahead of pA used by prfm */
  52. #define B_PRE_SIZE 224 /* byte offset ahead of pB used by prfm */
  53. #define C_PRE_SIZE 160 /* byte offset ahead of pCRowN used by prfm */
  54. // 00 origM
  55. // 01 origN
  56. // 02 origK
  57. // 03 origPA
  58. // 04 origPB
  59. // 05 pC
  60. // 06 origLDC -> LDC
  61. // 07 temp
  62. // 08 counterL
  63. // 09 counterI
  64. // 10 counterJ
  65. // 11 pB
  66. // 12 pCRow0
  67. // 13 pCRow1
  68. // 14 pCRow2
  69. // 15 pCRow3
  70. // 16 pA
  71. // 17 alpha (w17)
  72. // 18 must save
  73. // 19 must save
  74. // 20 must save
  75. // 21 must save
  76. // 22 must save
  77. // 23 must save
  78. // 24 must save
  79. // 25 must save
  80. // 26 must save
  81. // 27 must save
  82. // 28 must save
  83. // 29 frame
  84. // 30 link
  85. // 31 sp
  86. //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
  87. //v01 pA0_04, pA0_05, pA0_06, pA0_07
  88. //v02 pA0_08, pA0_09, pA0_10, pA0_11
  89. //v03 pA0_12, pA0_13, pA0_14, pA0_15
  90. //v04 pA1_00, pA1_01, pA1_02, pA1_03
  91. //v05 pA1_04, pA1_05, pA1_06, pA1_07
  92. //v06 pA1_08, pA1_09, pA1_10, pA1_11
  93. //v07 pA1_12, pA1_13, pA1_14, pA1_15
  94. //v08 must save pB00
  95. //v09 must save pB01
  96. //v10 must save pB02
  97. //v11 must save pB03
  98. //v12 must save pB10
  99. //v13 must save pB11
  100. //v14 must save pB12
  101. //v15 must save pB13
  102. //v16 must save C00, C01, C02, C03
  103. //v17 must save C04, C05, C06, C07
  104. //v18 C08, C09, C10, C11
  105. //v19 C12, C13, C14, C15
  106. //v20 C16, C17, C18, C19
  107. //v21 C20, C21, C22, C23
  108. //v22 C24, C25, C26, C27
  109. //v23 C28, C29, C30, C31
  110. //v24 C32, C33, C34, C35
  111. //v25 C36, C37, C38, C39
  112. //v26 C40, C41, C42, C43
  113. //v27 C44, C45, C46, C47
  114. //v28 C48, C49, C50, C51
  115. //v29 C52, C53, C54, C55
  116. //v30 C56, C57, C58, C59
  117. //v31 C60, C61, C62, C63
  118. /*******************************************************************************
  119. * Macro definitions
  120. *******************************************************************************/
  121. .macro INIT16x4 // Clear the sixteen accumulators v16-v31 used by the 16x4 tile.
  122. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  123. fmov s17, wzr
  124. fmov s18, s16 // copying an already-cleared register is an alternative to reading wzr
  125. fmov s19, s17
  126. fmov s20, wzr
  127. fmov s21, s16
  128. fmov s22, s17
  129. fmov s23, s18
  130. fmov s24, wzr
  131. fmov s25, s16
  132. fmov s26, s17
  133. fmov s27, s18
  134. fmov s28, wzr
  135. fmov s29, s16
  136. fmov s30, s17
  137. fmov s31, s18
  138. .endm
  139. .macro KERNEL16x4_I // First step of the pipelined 16x4 loop: fmul initialises v16-v31, so no INIT16x4 is needed before it.
  140. ldp q0, q1, [pA], #32 // A values 0-7 of the current step
  141. ldp s8, s9, [pB], #8 // B values 0-1 of the current step
  142. fmul v16.4s, v0.4s, v8.s[0]
  143. fmul v20.4s, v0.4s, v9.s[0]
  144. ldp s10, s11, [pB], #8 // B values 2-3 of the current step (s10 is also alpha0, hence the reload in SAVE)
  145. fmul v24.4s, v0.4s, v10.s[0]
  146. fmul v28.4s, v0.4s, v11.s[0]
  147. ldp q2, q3, [pA], #32 // A values 8-15 of the current step
  148. fmul v17.4s, v1.4s, v8.s[0]
  149. fmul v21.4s, v1.4s, v9.s[0]
  150. ldp q4, q5, [pA], #32 // preload A values 0-7 consumed by the following M2/E step
  151. fmul v25.4s, v1.4s, v10.s[0]
  152. fmul v29.4s, v1.4s, v11.s[0]
  153. ldp s12, s13, [pB], #8 // preload B values 0-1 consumed by the following M2/E step
  154. fmul v18.4s, v2.4s, v8.s[0]
  155. fmul v22.4s, v2.4s, v9.s[0]
  156. ldp s14, s15, [pB], #8 // preload B values 2-3 consumed by the following M2/E step
  157. fmul v19.4s, v3.4s, v8.s[0]
  158. fmul v23.4s, v3.4s, v9.s[0]
  159. ldp q6, q7, [pA], #32 // preload A values 8-15 consumed by the following M2/E step
  160. fmul v26.4s, v2.4s, v10.s[0]
  161. fmul v30.4s, v2.4s, v11.s[0]
  162. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  163. fmul v27.4s, v3.4s, v10.s[0]
  164. fmul v31.4s, v3.4s, v11.s[0]
  165. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  166. .endm
  167. .macro KERNEL16x4_M1 // Pipelined step: accumulate with A in v0-v3 and B in s8-s11 while loading v4-v7 and s12-s15 for the next M2/E.
  168. fmla v16.4s, v0.4s, v8.s[0]
  169. fmla v17.4s, v1.4s, v8.s[0]
  170. ldp q4, q5, [pA], #32 // next step A values 0-7
  171. fmla v18.4s, v2.4s, v8.s[0]
  172. fmla v19.4s, v3.4s, v8.s[0]
  173. fmla v20.4s, v0.4s, v9.s[0]
  174. fmla v21.4s, v1.4s, v9.s[0]
  175. ldp s12, s13, [pB], #8 // next step B values 0-1
  176. fmla v22.4s, v2.4s, v9.s[0]
  177. fmla v23.4s, v3.4s, v9.s[0]
  178. ldp s14, s15, [pB], #8 // next step B values 2-3
  179. fmla v24.4s, v0.4s, v10.s[0]
  180. fmla v25.4s, v1.4s, v10.s[0]
  181. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  182. fmla v26.4s, v2.4s, v10.s[0]
  183. fmla v27.4s, v3.4s, v10.s[0]
  184. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  185. fmla v28.4s, v0.4s, v11.s[0]
  186. fmla v29.4s, v1.4s, v11.s[0]
  187. ldp q6, q7, [pA], #32 // next step A values 8-15
  188. fmla v30.4s, v2.4s, v11.s[0]
  189. fmla v31.4s, v3.4s, v11.s[0]
  190. .endm
  191. .macro KERNEL16x4_M2 // Pipelined step: accumulate with A in v4-v7 and B in s12-s15 while loading v0-v3 and s8-s11 for the next M1.
  192. fmla v16.4s, v4.4s, v12.s[0]
  193. fmla v17.4s, v5.4s, v12.s[0]
  194. ldp q0, q1, [pA], #32 // next step A values 0-7
  195. fmla v18.4s, v6.4s, v12.s[0]
  196. fmla v19.4s, v7.4s, v12.s[0]
  197. fmla v20.4s, v4.4s, v13.s[0]
  198. fmla v21.4s, v5.4s, v13.s[0]
  199. ldp s8, s9, [pB], #8 // next step B values 0-1
  200. fmla v22.4s, v6.4s, v13.s[0]
  201. fmla v23.4s, v7.4s, v13.s[0]
  202. ldp s10, s11, [pB], #8 // next step B values 2-3
  203. fmla v24.4s, v4.4s, v14.s[0]
  204. fmla v25.4s, v5.4s, v14.s[0]
  205. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  206. fmla v26.4s, v6.4s, v14.s[0]
  207. fmla v27.4s, v7.4s, v14.s[0]
  208. ldp q2, q3, [pA], #32 // next step A values 8-15
  209. fmla v28.4s, v4.4s, v15.s[0]
  210. fmla v29.4s, v5.4s, v15.s[0]
  211. fmla v30.4s, v6.4s, v15.s[0]
  212. fmla v31.4s, v7.4s, v15.s[0]
  213. .endm
  214. .macro KERNEL16x4_E // Final pipelined step: consume the preloaded v4-v7 and s12-s15 without loading anything further from pA or pB.
  215. fmla v16.4s, v4.4s, v12.s[0]
  216. fmla v20.4s, v4.4s, v13.s[0]
  217. fmla v24.4s, v4.4s, v14.s[0]
  218. fmla v28.4s, v4.4s, v15.s[0]
  219. fmla v17.4s, v5.4s, v12.s[0]
  220. fmla v21.4s, v5.4s, v13.s[0]
  221. fmla v25.4s, v5.4s, v14.s[0]
  222. fmla v29.4s, v5.4s, v15.s[0]
  223. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // prefetch only; pB itself is not advanced here
  224. fmla v18.4s, v6.4s, v12.s[0]
  225. fmla v22.4s, v6.4s, v13.s[0]
  226. fmla v26.4s, v6.4s, v14.s[0]
  227. fmla v30.4s, v6.4s, v15.s[0]
  228. fmla v19.4s, v7.4s, v12.s[0]
  229. fmla v23.4s, v7.4s, v13.s[0]
  230. fmla v27.4s, v7.4s, v14.s[0]
  231. fmla v31.4s, v7.4s, v15.s[0]
  232. .endm
  233. .macro KERNEL16x4_SUB // Self-contained single step: load 16 A values and 4 B values, then accumulate into v16-v31 (no preloading).
  234. ldp q0, q1, [pA], #32 // A values 0-7
  235. ldp s8, s9, [pB], #8 // B values 0-1
  236. fmla v16.4s, v0.4s, v8.s[0]
  237. fmla v20.4s, v0.4s, v9.s[0]
  238. ldp s10, s11, [pB], #8 // B values 2-3
  239. fmla v24.4s, v0.4s, v10.s[0]
  240. fmla v28.4s, v0.4s, v11.s[0]
  241. ldp q2, q3, [pA], #32 // A values 8-15
  242. fmla v17.4s, v1.4s, v8.s[0]
  243. fmla v21.4s, v1.4s, v9.s[0]
  244. fmla v25.4s, v1.4s, v10.s[0]
  245. fmla v29.4s, v1.4s, v11.s[0]
  246. fmla v18.4s, v2.4s, v8.s[0]
  247. fmla v22.4s, v2.4s, v9.s[0]
  248. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  249. fmla v19.4s, v3.4s, v8.s[0]
  250. fmla v23.4s, v3.4s, v9.s[0]
  251. fmla v26.4s, v2.4s, v10.s[0]
  252. fmla v30.4s, v2.4s, v11.s[0]
  253. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  254. fmla v27.4s, v3.4s, v10.s[0]
  255. fmla v31.4s, v3.4s, v11.s[0]
  256. .endm
  257. .macro SAVE16x4 // C += alpha * accumulator for 16 floats at each of pCRow0-pCRow3; every pointer ends up advanced by 64 bytes.
  258. fmov alpha0, alpha // s10 was overwritten with B values by the 16x4 kernels, so reload alpha from w17
  259. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  260. ldp q0, q1, [pCRow0] // v0-v7 are reused as temporaries for the C values
  261. fmla v0.4s, v16.4s, alphaV0
  262. fmla v1.4s, v17.4s, alphaV0
  263. stp q0, q1, [pCRow0]
  264. add pCRow0, pCRow0, #32
  265. ldp q2, q3, [pCRow0]
  266. fmla v2.4s, v18.4s, alphaV0
  267. fmla v3.4s, v19.4s, alphaV0
  268. stp q2, q3, [pCRow0]
  269. add pCRow0, pCRow0, #32
  270. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  271. ldp q4, q5, [pCRow1]
  272. fmla v4.4s, v20.4s, alphaV0
  273. fmla v5.4s, v21.4s, alphaV0
  274. stp q4, q5, [pCRow1]
  275. add pCRow1, pCRow1, #32
  276. ldp q6, q7, [pCRow1]
  277. fmla v6.4s, v22.4s, alphaV0
  278. fmla v7.4s, v23.4s, alphaV0
  279. stp q6, q7, [pCRow1]
  280. add pCRow1, pCRow1, #32
  281. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  282. ldp q0, q1, [pCRow2]
  283. fmla v0.4s, v24.4s, alphaV0
  284. fmla v1.4s, v25.4s, alphaV0
  285. stp q0, q1, [pCRow2]
  286. add pCRow2, pCRow2, #32
  287. ldp q2, q3, [pCRow2]
  288. fmla v2.4s, v26.4s, alphaV0
  289. fmla v3.4s, v27.4s, alphaV0
  290. stp q2, q3, [pCRow2]
  291. add pCRow2, pCRow2, #32
  292. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  293. ldp q4, q5, [pCRow3]
  294. fmla v4.4s, v28.4s, alphaV0
  295. fmla v5.4s, v29.4s, alphaV0
  296. stp q4, q5, [pCRow3]
  297. add pCRow3, pCRow3, #32
  298. ldp q6, q7, [pCRow3]
  299. fmla v6.4s, v30.4s, alphaV0
  300. fmla v7.4s, v31.4s, alphaV0
  301. stp q6, q7, [pCRow3]
  302. add pCRow3, pCRow3, #32
  303. .endm
  304. /******************************************************************************/
  305. .macro INIT8x4 // Clear the eight accumulators of the 8x4 tile: v16, v17, v20, v21, v24, v25, v28, v29.
  306. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  307. fmov s17, wzr
  308. fmov s20, wzr
  309. fmov s21, s16 // s16 is already zero, so this copy clears s21 as well
  310. fmov s24, wzr
  311. fmov s25, s16
  312. fmov s28, wzr
  313. fmov s29, s16
  314. .endm
  315. .macro KERNEL8x4_I // First step of the pipelined 8x4 loop: fmul initialises the accumulators, then the next step operands are preloaded.
  316. ldp s8, s9, [pB], #8 // B values 0-1 of the current step
  317. ldp s10, s11, [pB], #8 // B values 2-3 of the current step
  318. ldr q0, [pA], #16 // A values 0-3 of the current step
  319. ldr q1, [pA], #16 // A values 4-7 of the current step
  320. fmul v16.4s, v0.4s, v8.s[0]
  321. fmul v17.4s, v1.4s, v8.s[0]
  322. fmul v20.4s, v0.4s, v9.s[0]
  323. fmul v21.4s, v1.4s, v9.s[0]
  324. fmul v24.4s, v0.4s, v10.s[0]
  325. fmul v25.4s, v1.4s, v10.s[0]
  326. fmul v28.4s, v0.4s, v11.s[0]
  327. fmul v29.4s, v1.4s, v11.s[0]
  328. ldp s12, s13, [pB], #8 // preload B for the following M2/E step
  329. ldp s14, s15, [pB], #8
  330. ldr q4, [pA], #16 // preload A for the following M2/E step
  331. ldr q5, [pA], #16
  332. .endm
  333. .macro KERNEL8x4_M1 // Pipelined step: accumulate with A in v0-v1 and B in s8-s11, then load v4-v5 and s12-s15 for the next M2/E.
  334. fmla v16.4s, v0.4s, v8.s[0]
  335. fmla v17.4s, v1.4s, v8.s[0]
  336. fmla v20.4s, v0.4s, v9.s[0]
  337. fmla v21.4s, v1.4s, v9.s[0]
  338. fmla v24.4s, v0.4s, v10.s[0]
  339. fmla v25.4s, v1.4s, v10.s[0]
  340. fmla v28.4s, v0.4s, v11.s[0]
  341. fmla v29.4s, v1.4s, v11.s[0]
  342. ldp s12, s13, [pB], #8 // next step B values
  343. ldp s14, s15, [pB], #8
  344. ldr q4, [pA], #16 // next step A values
  345. ldr q5, [pA], #16
  346. .endm
  347. .macro KERNEL8x4_M2 // Pipelined step: accumulate with A in v4-v5 and B in s12-s15, then load v0-v1 and s8-s11 for the next M1.
  348. fmla v16.4s, v4.4s, v12.s[0]
  349. fmla v17.4s, v5.4s, v12.s[0]
  350. fmla v20.4s, v4.4s, v13.s[0]
  351. fmla v21.4s, v5.4s, v13.s[0]
  352. fmla v24.4s, v4.4s, v14.s[0]
  353. fmla v25.4s, v5.4s, v14.s[0]
  354. fmla v28.4s, v4.4s, v15.s[0]
  355. fmla v29.4s, v5.4s, v15.s[0]
  356. ldp s8, s9, [pB], #8 // next step B values
  357. ldp s10, s11, [pB], #8
  358. ldr q0, [pA], #16 // next step A values
  359. ldr q1, [pA], #16
  360. .endm
  361. .macro KERNEL8x4_E // Final pipelined step: consume the preloaded v4-v5 and s12-s15; pA and pB are not touched.
  362. fmla v16.4s, v4.4s, v12.s[0]
  363. fmla v17.4s, v5.4s, v12.s[0]
  364. fmla v20.4s, v4.4s, v13.s[0]
  365. fmla v21.4s, v5.4s, v13.s[0]
  366. fmla v24.4s, v4.4s, v14.s[0]
  367. fmla v25.4s, v5.4s, v14.s[0]
  368. fmla v28.4s, v4.4s, v15.s[0]
  369. fmla v29.4s, v5.4s, v15.s[0]
  370. .endm
  371. .macro KERNEL8x4_SUB // Self-contained single step: load 4 B values and 8 A values, then accumulate (no preloading).
  372. ldp s8, s9, [pB], #8 // B values 0-1
  373. ldp s10, s11, [pB], #8 // B values 2-3
  374. ldr q0, [pA], #16 // A values 0-3
  375. ldr q1, [pA], #16 // A values 4-7
  376. fmla v16.4s, v0.4s, v8.s[0]
  377. fmla v17.4s, v1.4s, v8.s[0]
  378. fmla v20.4s, v0.4s, v9.s[0]
  379. fmla v21.4s, v1.4s, v9.s[0]
  380. fmla v24.4s, v0.4s, v10.s[0]
  381. fmla v25.4s, v1.4s, v10.s[0]
  382. fmla v28.4s, v0.4s, v11.s[0]
  383. fmla v29.4s, v1.4s, v11.s[0]
  384. .endm
  385. .macro SAVE8x4 // C += alpha * accumulator for 8 floats at each of pCRow0-pCRow3; every pointer ends up advanced by 32 bytes.
  386. fmov alpha0, alpha // s10 was overwritten with B values by the 8x4 kernels, so reload alpha from w17
  387. ldp q0, q1, [pCRow0] // v0-v7 are reused as temporaries for the C values
  388. fmla v0.4s, v16.4s, alphaV0
  389. fmla v1.4s, v17.4s, alphaV0
  390. stp q0, q1, [pCRow0]
  391. add pCRow0, pCRow0, #32
  392. ldp q2, q3, [pCRow1]
  393. fmla v2.4s, v20.4s, alphaV0
  394. fmla v3.4s, v21.4s, alphaV0
  395. stp q2, q3, [pCRow1]
  396. add pCRow1, pCRow1, #32
  397. ldp q4, q5, [pCRow2]
  398. fmla v4.4s, v24.4s, alphaV0
  399. fmla v5.4s, v25.4s, alphaV0
  400. stp q4, q5, [pCRow2]
  401. add pCRow2, pCRow2, #32
  402. ldp q6, q7, [pCRow3]
  403. fmla v6.4s, v28.4s, alphaV0
  404. fmla v7.4s, v29.4s, alphaV0
  405. stp q6, q7, [pCRow3]
  406. add pCRow3, pCRow3, #32
  407. .endm
  408. /******************************************************************************/
  409. .macro INIT4x4 // Clear the four accumulators of the 4x4 tile: v16, v20, v24, v28.
  410. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  411. fmov s20, wzr
  412. fmov s24, wzr
  413. fmov s28, wzr
  414. .endm
  415. .macro KERNEL4x4_I // First step of the pipelined 4x4 loop: fmul initialises v16/v20/v24/v28, then the next step operands are preloaded.
  416. ldp s8, s9, [pB], #8 // B values 0-1 of the current step
  417. ldp s10, s11, [pB], #8 // B values 2-3 of the current step
  418. ldr q0, [pA], #16 // A values 0-3 of the current step
  419. fmul v16.4s, v0.4s, v8.s[0]
  420. fmul v20.4s, v0.4s, v9.s[0]
  421. fmul v24.4s, v0.4s, v10.s[0]
  422. fmul v28.4s, v0.4s, v11.s[0]
  423. ldp s12, s13, [pB], #8 // preload B for the following M2/E step
  424. ldp s14, s15, [pB], #8
  425. ldr q1, [pA], #16 // preload A for the following M2/E step (v1, not v4 as in the wider tiles)
  426. .endm
  427. .macro KERNEL4x4_M1 // Pipelined step: accumulate with A in v0 and B in s8-s11, then load v1 and s12-s15 for the next M2/E.
  428. fmla v16.4s, v0.4s, v8.s[0]
  429. fmla v20.4s, v0.4s, v9.s[0]
  430. fmla v24.4s, v0.4s, v10.s[0]
  431. fmla v28.4s, v0.4s, v11.s[0]
  432. ldp s12, s13, [pB], #8 // next step B values
  433. ldp s14, s15, [pB], #8
  434. ldr q1, [pA], #16 // next step A values
  435. .endm
  436. .macro KERNEL4x4_M2 // Pipelined step: accumulate with A in v1 and B in s12-s15, then load v0 and s8-s11 for the next M1.
  437. fmla v16.4s, v1.4s, v12.s[0]
  438. fmla v20.4s, v1.4s, v13.s[0]
  439. fmla v24.4s, v1.4s, v14.s[0]
  440. fmla v28.4s, v1.4s, v15.s[0]
  441. ldp s8, s9, [pB], #8 // next step B values
  442. ldp s10, s11, [pB], #8
  443. ldr q0, [pA], #16 // next step A values
  444. .endm
  445. .macro KERNEL4x4_E // Final pipelined step: consume the preloaded v1 and s12-s15; pA and pB are not touched.
  446. fmla v16.4s, v1.4s, v12.s[0]
  447. fmla v20.4s, v1.4s, v13.s[0]
  448. fmla v24.4s, v1.4s, v14.s[0]
  449. fmla v28.4s, v1.4s, v15.s[0]
  450. .endm
  451. .macro KERNEL4x4_SUB // Self-contained single step: load 4 B values and 4 A values, then accumulate (no preloading).
  452. ldp s8, s9, [pB], #8 // B values 0-1
  453. ldp s10, s11, [pB], #8 // B values 2-3
  454. ldr q0, [pA], #16 // A values 0-3
  455. fmla v16.4s, v0.4s, v8.s[0]
  456. fmla v20.4s, v0.4s, v9.s[0]
  457. fmla v24.4s, v0.4s, v10.s[0]
  458. fmla v28.4s, v0.4s, v11.s[0]
  459. .endm
  460. .macro SAVE4x4 // C += alpha * accumulator for 4 floats at each of pCRow0-pCRow3; every pointer ends up advanced by 16 bytes.
  461. fmov alpha0, alpha // s10 was overwritten with B values by the 4x4 kernels, so reload alpha from w17
  462. ldr q0, [pCRow0] // v0-v3 are reused as temporaries for the C values
  463. fmla v0.4s, v16.4s, alphaV0
  464. str q0, [pCRow0]
  465. add pCRow0, pCRow0, #16
  466. ldr q1, [pCRow1]
  467. fmla v1.4s, v20.4s, alphaV0
  468. str q1, [pCRow1]
  469. add pCRow1, pCRow1, #16
  470. ldr q2, [pCRow2]
  471. fmla v2.4s, v24.4s, alphaV0
  472. str q2, [pCRow2]
  473. add pCRow2, pCRow2, #16
  474. ldr q3, [pCRow3]
  475. fmla v3.4s, v28.4s, alphaV0
  476. str q3, [pCRow3]
  477. add pCRow3, pCRow3, #16
  478. .endm
  479. /******************************************************************************/
  480. .macro INIT2x4 // Clear the four accumulators of the 2x4 tile: v16, v20, v24, v28.
  481. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  482. fmov s20, s16 // the remaining registers are cleared by copying an already-zero register
  483. fmov s24, s20
  484. fmov s28, s16
  485. .endm
  486. .macro KERNEL2x4_SUB // Single step of the 2x4 tile: 2 A values times 4 B values, accumulated in the low two lanes of v16/v20/v24/v28.
  487. ldp s8, s9, [pB], #8 // B values 0-1
  488. ldp s10, s11, [pB], #8 // B values 2-3
  489. ldr d0, [pA], #8 // A values 0-1
  490. fmla v16.2s, v0.2s, v8.s[0]
  491. fmla v20.2s, v0.2s, v9.s[0]
  492. fmla v24.2s, v0.2s, v10.s[0]
  493. fmla v28.2s, v0.2s, v11.s[0]
  494. .endm
  495. .macro SAVE2x4 // C += alpha * accumulator for 2 floats at each of pCRow0-pCRow3; every pointer ends up advanced by 8 bytes.
  496. fmov alpha0, alpha // s10 was overwritten with B values by KERNEL2x4_SUB, so reload alpha from w17
  497. ldr d0, [pCRow0] // v0 and v1 alternate as temporaries for the C values
  498. fmla v0.2s, v16.2s, alphaV0
  499. str d0, [pCRow0]
  500. add pCRow0, pCRow0, #8
  501. ldr d1, [pCRow1]
  502. fmla v1.2s, v20.2s, alphaV0
  503. str d1, [pCRow1]
  504. add pCRow1, pCRow1, #8
  505. ldr d0, [pCRow2]
  506. fmla v0.2s, v24.2s, alphaV0
  507. str d0, [pCRow2]
  508. add pCRow2, pCRow2, #8
  509. ldr d1, [pCRow3]
  510. fmla v1.2s, v28.2s, alphaV0
  511. str d1, [pCRow3]
  512. add pCRow3, pCRow3, #8
  513. .endm
  514. /******************************************************************************/
  515. .macro INIT1x4 // Clear the two accumulators of the 1x4 tile: v16 and v20.
  516. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  517. fmov s20, s16
  518. .endm
  519. .macro KERNEL1x4_SUB // Single step of the 1x4 tile: here the B values are the vector operand and the single A value is the scalar.
  520. ldr s0, [pA] // one A value
  521. add pA, pA, #4
  522. ld1 {v8.2s, v9.2s}, [pB] // four B values: 0-1 in v8, 2-3 in v9
  523. add pB, pB, #16
  524. fmla v16.2s, v8.2s, v0.s[0] // v16 lanes 0-1 accumulate the products for B values 0-1
  525. fmla v20.2s, v9.2s, v0.s[0] // v20 lanes 0-1 accumulate the products for B values 2-3
  526. .endm
  527. .macro SAVE1x4 // C += alpha * accumulator for one float at each of pCRow0-pCRow3; every pointer ends up advanced by 4 bytes.
  528. fmov alpha0, alpha // reload alpha from w17 into s10
  529. ld1 {v8.s}[0], [pCRow0] // gather one float from each of two C pointers into lanes 0 and 1
  530. ld1 {v8.s}[1], [pCRow1]
  531. fmla v8.2s, v16.2s, alphaV0
  532. st1 {v8.s}[0], [pCRow0] // scatter the two updated lanes back
  533. st1 {v8.s}[1], [pCRow1]
  534. add pCRow0, pCRow0, #4
  535. add pCRow1, pCRow1, #4
  536. ld1 {v12.s}[0], [pCRow2] // same gather/update/scatter for the other two C pointers
  537. ld1 {v12.s}[1], [pCRow3]
  538. fmla v12.2s, v20.2s, alphaV0
  539. st1 {v12.s}[0], [pCRow2]
  540. st1 {v12.s}[1], [pCRow3]
  541. add pCRow2, pCRow2, #4
  542. add pCRow3, pCRow3, #4
  543. .endm
  544. /******************************************************************************/
  545. .macro INIT16x2 // Clear the eight accumulators v16-v23 of the 16x2 tile.
  546. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  547. fmov s17, wzr
  548. fmov s18, wzr
  549. fmov s19, s16 // s16 is already zero, so this copy clears s19 as well
  550. fmov s20, wzr
  551. fmov s21, s16
  552. fmov s22, wzr
  553. fmov s23, s16
  554. .endm
  555. .macro KERNEL16x2_SUB // Single step of the 16x2 tile: 16 A values times 2 B values, accumulated in v16-v23.
  556. ld1 {v8.2s}, [pB] // B values 0-1 in lanes 0 and 1 of v8
  557. add pB, pB, #8
  558. ld1 {v0.4s}, [pA] // A values 0-15 are loaded into v0-v3, 16 bytes at a time
  559. add pA, pA, #16
  560. ld1 {v1.4s}, [pA]
  561. add pA, pA, #16
  562. ld1 {v2.4s}, [pA]
  563. add pA, pA, #16
  564. ld1 {v3.4s}, [pA]
  565. add pA, pA, #16
  566. fmla v16.4s, v0.4s, v8.s[0] // v16-v19 accumulate the products with B value 0
  567. fmla v17.4s, v1.4s, v8.s[0]
  568. fmla v18.4s, v2.4s, v8.s[0]
  569. fmla v19.4s, v3.4s, v8.s[0]
  570. fmla v20.4s, v0.4s, v8.s[1] // v20-v23 accumulate the products with B value 1
  571. fmla v21.4s, v1.4s, v8.s[1]
  572. fmla v22.4s, v2.4s, v8.s[1]
  573. fmla v23.4s, v3.4s, v8.s[1]
  574. .endm
  575. .macro SAVE16x2 // C += alpha * accumulator for 16 floats at pCRow0 and at pCRow0 + LDC; pCRow0 ends up advanced by 64 bytes.
  576. fmov alpha0, alpha // reload alpha from w17 into s10
  577. add pCRow1, pCRow0, LDC // pCRow1 is recomputed on every call, so only pCRow0 has to be advanced
  578. ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
  579. fmla v0.4s, v16.4s, alphaV0
  580. fmla v1.4s, v17.4s, alphaV0
  581. fmla v2.4s, v18.4s, alphaV0
  582. fmla v3.4s, v19.4s, alphaV0
  583. st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
  584. ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  585. fmla v4.4s, v20.4s, alphaV0
  586. fmla v5.4s, v21.4s, alphaV0
  587. fmla v6.4s, v22.4s, alphaV0
  588. fmla v7.4s, v23.4s, alphaV0
  589. st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  590. add pCRow0, pCRow0, #64
  591. .endm
  592. /******************************************************************************/
  593. .macro INIT8x2 // Clear the four accumulators of the 8x2 tile: v16, v17, v20, v21.
  594. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  595. fmov s17, s16 // the remaining registers are cleared by copying an already-zero register
  596. fmov s20, s17
  597. fmov s21, s16
  598. .endm
  599. .macro KERNEL8x2_SUB // Single step of the 8x2 tile: 8 A values times 2 B values, accumulated in v16, v17, v20, v21.
  600. ld1 {v8.2s}, [pB] // B values 0-1 in lanes 0 and 1 of v8
  601. add pB, pB, #8
  602. ld1 {v0.4s}, [pA] // A values 0-3
  603. add pA, pA, #16
  604. ld1 {v1.4s}, [pA] // A values 4-7
  605. add pA, pA, #16
  606. fmla v16.4s, v0.4s, v8.s[0] // v16-v17 accumulate the products with B value 0
  607. fmla v17.4s, v1.4s, v8.s[0]
  608. fmla v20.4s, v0.4s, v8.s[1] // v20-v21 accumulate the products with B value 1
  609. fmla v21.4s, v1.4s, v8.s[1]
  610. .endm
  611. .macro SAVE8x2 // C += alpha * accumulator for 8 floats at pCRow0 and at pCRow0 + LDC; pCRow0 ends up advanced by 32 bytes.
  612. fmov alpha0, alpha // reload alpha from w17 into s10
  613. add pCRow1, pCRow0, LDC // pCRow1 is recomputed on every call, so only pCRow0 has to be advanced
  614. ld1 {v0.4s, v1.4s}, [pCRow0]
  615. fmla v0.4s, v16.4s, alphaV0
  616. fmla v1.4s, v17.4s, alphaV0
  617. st1 {v0.4s, v1.4s}, [pCRow0]
  618. add pCRow2, pCRow1, LDC // NOTE(review): pCRow2 is written here but never read inside this macro - confirm whether any caller relies on it
  619. ld1 {v4.4s, v5.4s}, [pCRow1]
  620. fmla v4.4s, v20.4s, alphaV0
  621. fmla v5.4s, v21.4s, alphaV0
  622. st1 {v4.4s, v5.4s}, [pCRow1]
  623. add pCRow0, pCRow0, #32
  624. .endm
  625. /******************************************************************************/
  626. .macro INIT4x2 // Clear the four accumulators of the 4x2 tile: v16, v17, v20, v21.
  627. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  628. fmov s17, s16 // the remaining registers are cleared by copying an already-zero register
  629. fmov s20, s17
  630. fmov s21, s16
  631. .endm
  632. .macro KERNEL4x2_SUB // Single step of the 4x2 tile: the 4 A values are handled as two 2-lane halves (v0 and v1).
  633. ld1 {v8.2s}, [pB] // B values 0-1 in lanes 0 and 1 of v8
  634. add pB, pB, #8
  635. ld1 {v0.2s, v1.2s}, [pA] // A values 0-1 in v0, 2-3 in v1
  636. add pA, pA, #16
  637. fmla v16.2s, v0.2s, v8.s[0] // v16-v17 accumulate the products with B value 0
  638. fmla v17.2s, v1.2s, v8.s[0]
  639. fmla v20.2s, v0.2s, v8.s[1] // v20-v21 accumulate the products with B value 1
  640. fmla v21.2s, v1.2s, v8.s[1]
  641. .endm
  642. .macro SAVE4x2 // C += alpha * accumulator for 4 floats at pCRow0 and at pCRow0 + LDC; pCRow0 ends up advanced by 16 bytes.
  643. fmov alpha0, alpha // reload alpha from w17 into s10
  644. ld1 {v8.2s, v9.2s}, [pCRow0] // v8, v9, v12, v13 serve as temporaries for the C values
  645. fmla v8.2s, v16.2s, alphaV0
  646. fmla v9.2s, v17.2s, alphaV0
  647. st1 {v8.2s, v9.2s}, [pCRow0]
  648. add pCRow1, pCRow0, LDC // pCRow1 is recomputed on every call, so only pCRow0 has to be advanced
  649. ld1 {v12.2s, v13.2s}, [pCRow1]
  650. fmla v12.2s, v20.2s, alphaV0
  651. fmla v13.2s, v21.2s, alphaV0
  652. st1 {v12.2s, v13.2s}, [pCRow1]
  653. add pCRow0, pCRow0, #16
  654. .endm
  655. /******************************************************************************/
  656. .macro INIT2x2 // Clear the two accumulators of the 2x2 tile: v16 and v20.
  657. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  658. fmov s20, s16
  659. .endm
  660. .macro KERNEL2x2_SUB // Single step of the 2x2 tile: 2 A values times 2 B values, accumulated in v16 and v20.
  661. ld1 {v8.2s}, [pB] // B values 0-1 in lanes 0 and 1 of v8
  662. add pB, pB, #8
  663. ld1 {v0.2s}, [pA] // A values 0-1
  664. add pA, pA, #8
  665. fmla v16.2s, v0.2s, v8.s[0] // products with B value 0
  666. fmla v20.2s, v0.2s, v8.s[1] // products with B value 1
  667. .endm
  668. .macro SAVE2x2 // C += alpha * accumulator for 2 floats at pCRow0 and at pCRow0 + LDC; pCRow0 ends up advanced by 8 bytes.
  669. fmov alpha0, alpha // reload alpha from w17 into s10
  670. ld1 {v8.2s}, [pCRow0] // v8 and v12 serve as temporaries for the C values
  671. fmla v8.2s, v16.2s, alphaV0
  672. st1 {v8.2s}, [pCRow0]
  673. add pCRow1 , pCRow0, LDC // pCRow1 is recomputed on every call, so only pCRow0 has to be advanced
  674. ld1 {v12.2s}, [pCRow1]
  675. fmla v12.2s, v20.2s, alphaV0
  676. st1 {v12.2s}, [pCRow1]
  677. add pCRow0, pCRow0, #8
  678. .endm
  679. /******************************************************************************/
  680. .macro INIT1x2 // Clear v16, the single accumulator of the 1x2 tile.
  681. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  682. .endm
  683. .macro KERNEL1x2_SUB // Single step of the 1x2 tile: the two B values are the vector operand and the single A value is the scalar.
  684. ld1 {v8.2s} , [pB] // B values 0-1
  685. add pB , pB, #8
  686. ldr s0 , [pA] // one A value
  687. add pA, pA, #4
  688. fmla v16.2s, v8.2s, v0.s[0] // lane 0 pairs with B value 0, lane 1 with B value 1
  689. .endm
  690. .macro SAVE1x2 // C += alpha * accumulator for one float at pCRow0 and one at pCRow0 + LDC; pCRow0 ends up advanced by 4 bytes.
  691. fmov alpha0, alpha // reload alpha from w17 into s10
  692. add pCRow1 , pCRow0, LDC // pCRow1 is recomputed on every call, so only pCRow0 has to be advanced
  693. ld1 {v8.s}[0], [pCRow0] // gather one float from each C pointer into lanes 0 and 1
  694. ld1 {v8.s}[1], [pCRow1]
  695. fmla v8.2s, v16.2s, alphaV0
  696. st1 {v8.s}[0], [pCRow0] // scatter the two updated lanes back
  697. st1 {v8.s}[1], [pCRow1]
  698. add pCRow0, pCRow0, #4
  699. .endm
  700. /******************************************************************************/
  701. .macro INIT16x1 // Clear the four accumulators v16-v19 of the 16x1 tile.
  702. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  703. fmov s17, wzr
  704. fmov s18, wzr
  705. fmov s19, s16 // s16 is already zero, so this copy clears s19 as well
  706. .endm
  707. .macro KERNEL16x1_SUB // Single step of the 16x1 tile: 16 A values times one B value, accumulated in v16-v19.
  708. ldr s8, [pB] // the single B value
  709. add pB , pB, #4
  710. ld1 {v0.4s}, [pA] // A values 0-15 are loaded into v0-v3, 16 bytes at a time
  711. add pA, pA, #16
  712. ld1 {v1.4s}, [pA]
  713. add pA, pA, #16
  714. ld1 {v2.4s}, [pA]
  715. add pA, pA, #16
  716. ld1 {v3.4s}, [pA]
  717. add pA, pA, #16
  718. fmla v16.4s, v0.4s, v8.s[0]
  719. fmla v17.4s, v1.4s, v8.s[0]
  720. fmla v18.4s, v2.4s, v8.s[0]
  721. fmla v19.4s, v3.4s, v8.s[0]
  722. .endm
  723. .macro SAVE16x1 // C += alpha * accumulator for 16 floats at pCRow0; pCRow0 ends up advanced by 64 bytes.
  724. fmov alpha0, alpha // reload alpha from w17 into s10
  725. ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] // v0-v3 serve as temporaries for the C values
  726. fmla v0.4s, v16.4s, alphaV0
  727. fmla v1.4s, v17.4s, alphaV0
  728. fmla v2.4s, v18.4s, alphaV0
  729. fmla v3.4s, v19.4s, alphaV0
  730. st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
  731. add pCRow0, pCRow0, #64
  732. .endm
  733. /******************************************************************************/
  734. .macro INIT8x1 // Clear the two accumulators of the 8x1 tile: v16 and v17.
  735. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  736. fmov s17, wzr
  737. .endm
  738. .macro KERNEL8x1_SUB // Single step of the 8x1 tile: 8 A values times one B value, accumulated in v16 and v17.
  739. ldr s8, [pB] // the single B value
  740. add pB , pB, #4
  741. ld1 {v0.4s}, [pA] // A values 0-3
  742. add pA, pA, #16
  743. ld1 {v1.4s}, [pA] // A values 4-7
  744. add pA, pA, #16
  745. fmla v16.4s, v0.4s, v8.s[0]
  746. fmla v17.4s, v1.4s, v8.s[0]
  747. .endm
  748. .macro SAVE8x1 // C += alpha * accumulator for 8 floats at pCRow0; pCRow0 ends up advanced by 32 bytes.
  749. fmov alpha0, alpha // reload alpha from w17 into s10
  750. ld1 {v0.4s, v1.4s}, [pCRow0] // v0-v1 serve as temporaries for the C values
  751. fmla v0.4s, v16.4s, alphaV0
  752. fmla v1.4s, v17.4s, alphaV0
  753. st1 {v0.4s, v1.4s}, [pCRow0]
  754. add pCRow0, pCRow0, #32
  755. .endm
  756. /******************************************************************************/
  757. .macro INIT4x1 // Clear the two accumulators of the 4x1 tile: v16 and v17.
  758. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  759. fmov s17, s16
  760. .endm
  761. .macro KERNEL4x1_SUB // Single step of the 4x1 tile: the 4 A values are handled as two 2-lane halves (v0 and v1).
  762. ldr s8, [pB] // the single B value
  763. add pB , pB, #4
  764. ld1 {v0.2s, v1.2s}, [pA] // A values 0-1 in v0, 2-3 in v1
  765. add pA , pA, #16
  766. fmla v16.2s, v0.2s, v8.s[0]
  767. fmla v17.2s, v1.2s, v8.s[0]
  768. .endm
  769. .macro SAVE4x1 // C += alpha * accumulator for 4 floats at pCRow0; pCRow0 ends up advanced by 16 bytes.
  770. fmov alpha0, alpha // reload alpha from w17 into s10
  771. ld1 {v8.2s, v9.2s}, [pCRow0] // v8-v9 serve as temporaries for the C values
  772. fmla v8.2s, v16.2s, alphaV0
  773. fmla v9.2s, v17.2s, alphaV0
  774. st1 {v8.2s, v9.2s}, [pCRow0]
  775. add pCRow0, pCRow0, #16
  776. .endm
  777. /******************************************************************************/
  778. .macro INIT2x1 // Clear v16, the single accumulator of the 2x1 tile.
  779. fmov s16, wzr // a write to an S register also zeroes the upper lanes of the V register
  780. .endm
  781. .macro KERNEL2x1_SUB // Single step of the 2x1 tile: 2 A values times one B value, accumulated in the low two lanes of v16.
  782. ldr s8, [pB] // the single B value
  783. add pB , pB, #4
  784. ld1 {v0.2s}, [pA] // A values 0-1
  785. add pA , pA, #8
  786. fmla v16.2s, v0.2s, v8.s[0]
  787. .endm
  788. .macro SAVE2x1 // C += alpha * accumulator for 2 floats at pCRow0; pCRow0 ends up advanced by 8 bytes.
  789. fmov alpha0, alpha // reload alpha from w17 into s10
  790. ld1 {v8.2s}, [pCRow0] // v8 serves as the temporary for the C values
  791. fmla v8.2s, v16.2s, alphaV0
  792. st1 {v8.2s}, [pCRow0]
  793. add pCRow0, pCRow0, #8
  794. .endm
  795. /******************************************************************************/
  796. .macro INIT1x1 // Clear s16, the scalar accumulator of the 1x1 tile.
  797. fmov s16, wzr
  798. .endm
  799. .macro KERNEL1x1_SUB // Single step of the 1x1 tile: scalar multiply-add of one A value and one B value.
  800. ldr s8, [pB] // the B value
  801. add pB , pB, #4
  802. ldr s0, [pA] // the A value
  803. add pA , pA, #4
  804. fmadd s16, s0, s8, s16 // s16 = s0 * s8 + s16
  805. .endm
  806. .macro SAVE1x1 // C += alpha * accumulator for the single float at pCRow0; pCRow0 ends up advanced by 4 bytes.
  807. fmov alpha0, alpha // reload alpha from w17 into s10
  808. ldr s8, [pCRow0] // s8 serves as the temporary for the C value
  809. fmla s8, s16, alphaV0 // scalar by-element form: s8 += s16 * v10.s[0]
  810. str s8, [pCRow0]
  811. add pCRow0, pCRow0, #4
  812. .endm
  813. /*******************************************************************************
  814. * End of macro definitions
  815. *******************************************************************************/
  816. PROLOGUE
  817. .Lsgemm_kernel_begin:
  818. .align 5
  819. add sp, sp, #-(11 * 16)
  820. stp d8, d9, [sp, #(0 * 16)]
  821. stp d10, d11, [sp, #(1 * 16)]
  822. stp d12, d13, [sp, #(2 * 16)]
  823. stp d14, d15, [sp, #(3 * 16)]
  824. stp d16, d17, [sp, #(4 * 16)]
  825. stp x18, x19, [sp, #(5 * 16)]
  826. stp x20, x21, [sp, #(6 * 16)]
  827. stp x22, x23, [sp, #(7 * 16)]
  828. stp x24, x25, [sp, #(8 * 16)]
  829. stp x26, x27, [sp, #(9 * 16)]
  830. str x28, [sp, #(10 * 16)]
  831. prfm PLDL1KEEP, [origPB]
  832. prfm PLDL1KEEP, [origPA]
  833. fmov alpha, s0
  834. lsl LDC, LDC, #2 // ldc = ldc * 4
  835. mov pB, origPB
  836. mov counterJ, origN
  837. asr counterJ, counterJ, #2 // J = J / 4
  838. cmp counterJ, #0
  839. ble .Lsgemm_kernel_L2_BEGIN
  840. /******************************************************************************/
  841. .Lsgemm_kernel_L4_BEGIN:
  842. mov pCRow0, pC
  843. add pCRow1, pCRow0, LDC
  844. add pCRow2, pCRow1, LDC
  845. add pCRow3, pCRow2, LDC
  846. add pC, pCRow3, LDC
  847. mov pA, origPA // pA = start of A array
  848. .Lsgemm_kernel_L4_M16_BEGIN:
  849. mov counterI, origM
  850. asr counterI, counterI, #4 // counterI = counterI / 16
  851. cmp counterI, #0
  852. ble .Lsgemm_kernel_L4_M8_BEGIN
  853. .align 5
  854. .Lsgemm_kernel_L4_M16_20:
  855. mov pB, origPB
  856. asr counterL , origK, #3
  857. cmp counterL , #2
  858. blt .Lsgemm_kernel_L4_M16_32
  859. KERNEL16x4_I
  860. KERNEL16x4_M2
  861. KERNEL16x4_M1
  862. KERNEL16x4_M2
  863. KERNEL16x4_M1
  864. KERNEL16x4_M2
  865. KERNEL16x4_M1
  866. KERNEL16x4_M2
  867. subs counterL, counterL, #2
  868. ble .Lsgemm_kernel_L4_M16_22a
  869. .align 5
  870. .Lsgemm_kernel_L4_M16_22:
  871. KERNEL16x4_M1
  872. KERNEL16x4_M2
  873. KERNEL16x4_M1
  874. KERNEL16x4_M2
  875. KERNEL16x4_M1
  876. KERNEL16x4_M2
  877. KERNEL16x4_M1
  878. KERNEL16x4_M2
  879. subs counterL, counterL, #1
  880. bgt .Lsgemm_kernel_L4_M16_22
  881. .align 5
  882. .Lsgemm_kernel_L4_M16_22a:
  883. KERNEL16x4_M1
  884. KERNEL16x4_M2
  885. KERNEL16x4_M1
  886. KERNEL16x4_M2
  887. KERNEL16x4_M1
  888. KERNEL16x4_M2
  889. KERNEL16x4_M1
  890. KERNEL16x4_E
  891. b .Lsgemm_kernel_L4_M16_44
  892. .align 5
  893. .Lsgemm_kernel_L4_M16_32:
  894. tst counterL, #1
  895. ble .Lsgemm_kernel_L4_M16_40
  896. KERNEL16x4_I
  897. KERNEL16x4_M2
  898. KERNEL16x4_M1
  899. KERNEL16x4_M2
  900. KERNEL16x4_M1
  901. KERNEL16x4_M2
  902. KERNEL16x4_M1
  903. KERNEL16x4_E
  904. b .Lsgemm_kernel_L4_M16_44
  905. .Lsgemm_kernel_L4_M16_40:
  906. INIT16x4
  907. .Lsgemm_kernel_L4_M16_44:
  908. ands counterL , origK, #7
  909. ble .Lsgemm_kernel_L4_M16_100
  910. .align 5
  911. .Lsgemm_kernel_L4_M16_46:
  912. KERNEL16x4_SUB
  913. subs counterL, counterL, #1
  914. bne .Lsgemm_kernel_L4_M16_46
  915. .Lsgemm_kernel_L4_M16_100:
  916. prfm PLDL1KEEP, [pA]
  917. prfm PLDL1KEEP, [pA, #64]
  918. prfm PLDL1KEEP, [origPB]
  919. SAVE16x4
  920. .Lsgemm_kernel_L4_M16_END:
  921. subs counterI, counterI, #1
  922. bne .Lsgemm_kernel_L4_M16_20
  923. //------------------------------------------------------------------------------
  924. .Lsgemm_kernel_L4_M8_BEGIN:
  925. mov counterI, origM
  926. tst counterI , #15
  927. ble .Lsgemm_kernel_L4_END
  928. tst counterI, #8
  929. ble .Lsgemm_kernel_L4_M4_BEGIN
  930. .Lsgemm_kernel_L4_M8_20:
  931. mov pB, origPB
  932. asr counterL , origK, #1 // L = K / 2
  933. cmp counterL , #2 // is there at least 4 to do?
  934. blt .Lsgemm_kernel_L4_M8_32
  935. KERNEL8x4_I // do one in the K
  936. KERNEL8x4_M2 // do another in the K
  937. subs counterL, counterL, #2
  938. ble .Lsgemm_kernel_L4_M8_22a
  939. .align 5
  940. .Lsgemm_kernel_L4_M8_22:
  941. KERNEL8x4_M1
  942. KERNEL8x4_M2
  943. subs counterL, counterL, #1
  944. bgt .Lsgemm_kernel_L4_M8_22
  945. .Lsgemm_kernel_L4_M8_22a:
  946. KERNEL8x4_M1
  947. KERNEL8x4_E
  948. b .Lsgemm_kernel_L4_M8_44
  949. .Lsgemm_kernel_L4_M8_32:
  950. tst counterL, #1
  951. ble .Lsgemm_kernel_L4_M8_40
  952. KERNEL8x4_I
  953. KERNEL8x4_E
  954. b .Lsgemm_kernel_L4_M8_44
  955. .Lsgemm_kernel_L4_M8_40:
  956. INIT8x4
  957. .Lsgemm_kernel_L4_M8_44:
  958. ands counterL , origK, #1
  959. ble .Lsgemm_kernel_L4_M8_100
  960. .Lsgemm_kernel_L4_M8_46:
  961. KERNEL8x4_SUB
  962. .Lsgemm_kernel_L4_M8_100:
  963. SAVE8x4
  964. .Lsgemm_kernel_L4_M8_END:
  965. //------------------------------------------------------------------------------
  966. .Lsgemm_kernel_L4_M4_BEGIN:
  967. mov counterI, origM
  968. tst counterI , #7
  969. ble .Lsgemm_kernel_L4_END
  970. tst counterI, #4
  971. ble .Lsgemm_kernel_L4_M2_BEGIN
  972. .Lsgemm_kernel_L4_M4_20:
  973. mov pB, origPB
  974. asr counterL , origK, #1 // L = K / 2
  975. cmp counterL , #2 // is there at least 4 to do?
  976. blt .Lsgemm_kernel_L4_M4_32
  977. KERNEL4x4_I // do one in the K
  978. KERNEL4x4_M2 // do another in the K
  979. subs counterL, counterL, #2
  980. ble .Lsgemm_kernel_L4_M4_22a
  981. .align 5
  982. .Lsgemm_kernel_L4_M4_22:
  983. KERNEL4x4_M1
  984. KERNEL4x4_M2
  985. subs counterL, counterL, #1
  986. bgt .Lsgemm_kernel_L4_M4_22
  987. .Lsgemm_kernel_L4_M4_22a:
  988. KERNEL4x4_M1
  989. KERNEL4x4_E
  990. b .Lsgemm_kernel_L4_M4_44
  991. .Lsgemm_kernel_L4_M4_32:
  992. tst counterL, #1
  993. ble .Lsgemm_kernel_L4_M4_40
  994. KERNEL4x4_I
  995. KERNEL4x4_E
  996. b .Lsgemm_kernel_L4_M4_44
  997. .Lsgemm_kernel_L4_M4_40:
  998. INIT4x4
  999. .Lsgemm_kernel_L4_M4_44:
  1000. ands counterL , origK, #1
  1001. ble .Lsgemm_kernel_L4_M4_100
  1002. .Lsgemm_kernel_L4_M4_46:
  1003. KERNEL4x4_SUB
  1004. .Lsgemm_kernel_L4_M4_100:
  1005. SAVE4x4
  1006. .Lsgemm_kernel_L4_M4_END:
  1007. //------------------------------------------------------------------------------
  // 4-column panel, 2-row remainder of M: runs once when bit 1 of origM is set.
  // The K loop is unrolled 8x (counterL = origK / 8) with a one-step tail loop
  // for origK % 8. INIT2x4, KERNEL2x4_SUB and SAVE2x4 are macros defined
  // outside this chunk.
  1008. .Lsgemm_kernel_L4_M2_BEGIN:
  1009. mov counterI, origM
  1010. tst counterI , #3 // (M & 3) == 0: no rows left for M2/M1
  1011. ble .Lsgemm_kernel_L4_END
  1012. tst counterI, #2 // bit 1 of M clear: no 2-row block, try M1
  1013. ble .Lsgemm_kernel_L4_M1_BEGIN
  1014. .Lsgemm_kernel_L4_M2_20:
  1015. INIT2x4
  1016. mov pB, origPB
  1017. asr counterL , origK, #3 // counterL = origK / 8
  1018. cmp counterL , #0
  1019. ble .Lsgemm_kernel_L4_M2_40
  1020. .Lsgemm_kernel_L4_M2_22:
  1021. KERNEL2x4_SUB
  1022. KERNEL2x4_SUB
  1023. KERNEL2x4_SUB
  1024. KERNEL2x4_SUB
  1025. KERNEL2x4_SUB
  1026. KERNEL2x4_SUB
  1027. KERNEL2x4_SUB
  1028. KERNEL2x4_SUB
  1029. subs counterL, counterL, #1
  1030. bgt .Lsgemm_kernel_L4_M2_22
  1031. .Lsgemm_kernel_L4_M2_40:
  1032. ands counterL , origK, #7 // counterL = origK % 8
  1033. ble .Lsgemm_kernel_L4_M2_100
  1034. .Lsgemm_kernel_L4_M2_42:
  1035. KERNEL2x4_SUB
  1036. subs counterL, counterL, #1
  1037. bgt .Lsgemm_kernel_L4_M2_42
  1038. .Lsgemm_kernel_L4_M2_100:
  1039. SAVE2x4
  1040. .Lsgemm_kernel_L4_M2_END:
  // 4-column panel, last single row of M (bit 0 of origM), followed by the
  // bottom of the 4-column panel loop. counterI is not reloaded here; in the
  // visible code it was set from origM at .Lsgemm_kernel_L4_M2_BEGIN.
  // INIT1x4, KERNEL1x4_SUB and SAVE1x4 are macros defined outside this chunk.
  1041. .Lsgemm_kernel_L4_M1_BEGIN:
  1042. tst counterI, #1 // bit 0 of M clear: no single row left
  1043. ble .Lsgemm_kernel_L4_END
  1044. .Lsgemm_kernel_L4_M1_20:
  1045. INIT1x4
  1046. mov pB, origPB
  1047. asr counterL , origK, #3 // counterL = origK / 8
  1048. cmp counterL , #0
  1049. ble .Lsgemm_kernel_L4_M1_40
  1050. .Lsgemm_kernel_L4_M1_22:
  1051. KERNEL1x4_SUB
  1052. KERNEL1x4_SUB
  1053. KERNEL1x4_SUB
  1054. KERNEL1x4_SUB
  1055. KERNEL1x4_SUB
  1056. KERNEL1x4_SUB
  1057. KERNEL1x4_SUB
  1058. KERNEL1x4_SUB
  1059. subs counterL, counterL, #1
  1060. bgt .Lsgemm_kernel_L4_M1_22
  1061. .Lsgemm_kernel_L4_M1_40:
  1062. ands counterL , origK, #7 // counterL = origK % 8
  1063. ble .Lsgemm_kernel_L4_M1_100
  1064. .Lsgemm_kernel_L4_M1_42:
  1065. KERNEL1x4_SUB
  1066. subs counterL, counterL, #1
  1067. bgt .Lsgemm_kernel_L4_M1_42
  1068. .Lsgemm_kernel_L4_M1_100:
  1069. SAVE1x4
  1070. .Lsgemm_kernel_L4_END:
  1071. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1072. subs counterJ, counterJ , #1 // j--
  1073. bgt .Lsgemm_kernel_L4_BEGIN // next 4-column panel (label is above this chunk)
  1074. /******************************************************************************/
  // N remainder after the 4-column panels: exits if (origN & 3) == 0, skips to
  // .Lsgemm_kernel_L1_BEGIN if bit 1 of origN is clear, otherwise sets up the
  // C row pointer and A pointer for one 2-column panel.
  1075. .Lsgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction
  1076. mov counterJ , origN
  1077. tst counterJ , #3 // (N & 3) == 0: nothing left, finish
  1078. ble .Lsgemm_kernel_L999
  1079. tst counterJ , #2 // bit 1 of N clear: no 2-column panel
  1080. ble .Lsgemm_kernel_L1_BEGIN
  1081. mov pCRow0, pC // pCRow0 = pC
  1082. add pC,pC,LDC, lsl #1 // pC += 2 * LDC
  1083. mov pA, origPA // pA = A
  // 2-column panel, 16-row blocks of M: the outer loop runs origM / 16 times.
  // Each pass restarts pB at origPB and walks K with an 8x unrolled loop
  // (counterL = origK / 8) plus a one-step tail loop for origK % 8.
  // INIT16x2, KERNEL16x2_SUB and SAVE16x2 are macros defined outside this chunk.
  1084. .Lsgemm_kernel_L2_M16_BEGIN:
  1085. mov counterI, origM
  1086. asr counterI, counterI, #4 // counterI = counterI / 16
  1087. cmp counterI,#0
  1088. ble .Lsgemm_kernel_L2_M8_BEGIN
  1089. .Lsgemm_kernel_L2_M16_20:
  1090. INIT16x2
  1091. mov pB, origPB
  1092. asr counterL , origK, #3 // counterL = origK / 8
  1093. cmp counterL,#0
  1094. ble .Lsgemm_kernel_L2_M16_40
  1095. .align 5
  1096. .Lsgemm_kernel_L2_M16_22:
  1097. KERNEL16x2_SUB
  1098. KERNEL16x2_SUB
  1099. KERNEL16x2_SUB
  1100. KERNEL16x2_SUB
  1101. KERNEL16x2_SUB
  1102. KERNEL16x2_SUB
  1103. KERNEL16x2_SUB
  1104. KERNEL16x2_SUB
  1105. subs counterL, counterL, #1
  1106. bgt .Lsgemm_kernel_L2_M16_22
  1107. .Lsgemm_kernel_L2_M16_40:
  1108. ands counterL , origK, #7 // counterL = origK % 8
  1109. ble .Lsgemm_kernel_L2_M16_100
  1110. .Lsgemm_kernel_L2_M16_42:
  1111. KERNEL16x2_SUB
  1112. subs counterL, counterL, #1
  1113. bgt .Lsgemm_kernel_L2_M16_42
  1114. .Lsgemm_kernel_L2_M16_100:
  1115. SAVE16x2
  1116. .Lsgemm_kernel_L2_M16_END:
  1117. subs counterI, counterI, #1 // one 16-row block done
  1118. bgt .Lsgemm_kernel_L2_M16_20
  1119. //------------------------------------------------------------------------------
  // 2-column panel, 8-row remainder of M: runs once when bit 3 of origM is set.
  // K loop is unrolled 8x (counterL = origK / 8) with a tail loop for origK % 8.
  // INIT8x2, KERNEL8x2_SUB and SAVE8x2 are macros defined outside this chunk.
  1120. .Lsgemm_kernel_L2_M8_BEGIN:
  1121. mov counterI, origM
  1122. tst counterI , #15 // (M & 15) == 0: no remainder rows at all
  1123. ble .Lsgemm_kernel_L2_END
  1124. tst counterI, #8 // bit 3 of M clear: no 8-row block, try M4
  1125. ble .Lsgemm_kernel_L2_M4_BEGIN
  1126. .Lsgemm_kernel_L2_M8_20:
  1127. INIT8x2
  1128. mov pB, origPB
  1129. asr counterL , origK, #3 // counterL = origK / 8
  1130. cmp counterL,#0
  1131. ble .Lsgemm_kernel_L2_M8_40
  1132. .align 5
  1133. .Lsgemm_kernel_L2_M8_22:
  1134. KERNEL8x2_SUB
  1135. KERNEL8x2_SUB
  1136. KERNEL8x2_SUB
  1137. KERNEL8x2_SUB
  1138. KERNEL8x2_SUB
  1139. KERNEL8x2_SUB
  1140. KERNEL8x2_SUB
  1141. KERNEL8x2_SUB
  1142. subs counterL, counterL, #1
  1143. bgt .Lsgemm_kernel_L2_M8_22
  1144. .Lsgemm_kernel_L2_M8_40:
  1145. ands counterL , origK, #7 // counterL = origK % 8
  1146. ble .Lsgemm_kernel_L2_M8_100
  1147. .Lsgemm_kernel_L2_M8_42:
  1148. KERNEL8x2_SUB
  1149. subs counterL, counterL, #1
  1150. bgt .Lsgemm_kernel_L2_M8_42
  1151. .Lsgemm_kernel_L2_M8_100:
  1152. SAVE8x2
  1153. .Lsgemm_kernel_L2_M8_END:
  1154. //------------------------------------------------------------------------------
  // 2-column panel, 4-row remainder of M: runs once when bit 2 of origM is set.
  // K loop is unrolled 8x (counterL = origK / 8) with a tail loop for origK % 8.
  // INIT4x2, KERNEL4x2_SUB and SAVE4x2 are macros defined outside this chunk.
  1155. .Lsgemm_kernel_L2_M4_BEGIN:
  1156. mov counterI, origM
  1157. tst counterI , #7 // (M & 7) == 0: no rows left for M4/M2/M1
  1158. ble .Lsgemm_kernel_L2_END
  1159. tst counterI, #4 // bit 2 of M clear: no 4-row block, try M2
  1160. ble .Lsgemm_kernel_L2_M2_BEGIN
  1161. .Lsgemm_kernel_L2_M4_20:
  1162. INIT4x2
  1163. mov pB, origPB
  1164. asr counterL , origK, #3 // counterL = origK / 8
  1165. cmp counterL,#0
  1166. ble .Lsgemm_kernel_L2_M4_40
  1167. .align 5
  1168. .Lsgemm_kernel_L2_M4_22:
  1169. KERNEL4x2_SUB
  1170. KERNEL4x2_SUB
  1171. KERNEL4x2_SUB
  1172. KERNEL4x2_SUB
  1173. KERNEL4x2_SUB
  1174. KERNEL4x2_SUB
  1175. KERNEL4x2_SUB
  1176. KERNEL4x2_SUB
  1177. subs counterL, counterL, #1
  1178. bgt .Lsgemm_kernel_L2_M4_22
  1179. .Lsgemm_kernel_L2_M4_40:
  1180. ands counterL , origK, #7 // counterL = origK % 8
  1181. ble .Lsgemm_kernel_L2_M4_100
  1182. .Lsgemm_kernel_L2_M4_42:
  1183. KERNEL4x2_SUB
  1184. subs counterL, counterL, #1
  1185. bgt .Lsgemm_kernel_L2_M4_42
  1186. .Lsgemm_kernel_L2_M4_100:
  1187. SAVE4x2
  1188. .Lsgemm_kernel_L2_M4_END:
  1189. //------------------------------------------------------------------------------
  // 2-column panel, 2-row remainder of M: runs once when bit 1 of origM is set.
  // K loop is unrolled 8x (counterL = origK / 8) with a tail loop for origK % 8.
  // INIT2x2, KERNEL2x2_SUB and SAVE2x2 are macros defined outside this chunk.
  1190. .Lsgemm_kernel_L2_M2_BEGIN:
  1191. mov counterI, origM
  1192. tst counterI , #3 // (M & 3) == 0: no rows left for M2/M1
  1193. ble .Lsgemm_kernel_L2_END
  1194. tst counterI, #2 // bit 1 of M clear: no 2-row block, try M1
  1195. ble .Lsgemm_kernel_L2_M1_BEGIN
  1196. .Lsgemm_kernel_L2_M2_20:
  1197. INIT2x2
  1198. mov pB, origPB
  1199. asr counterL , origK, #3 // counterL = origK / 8
  1200. cmp counterL,#0
  1201. ble .Lsgemm_kernel_L2_M2_40
  1202. .Lsgemm_kernel_L2_M2_22:
  1203. KERNEL2x2_SUB
  1204. KERNEL2x2_SUB
  1205. KERNEL2x2_SUB
  1206. KERNEL2x2_SUB
  1207. KERNEL2x2_SUB
  1208. KERNEL2x2_SUB
  1209. KERNEL2x2_SUB
  1210. KERNEL2x2_SUB
  1211. subs counterL, counterL, #1
  1212. bgt .Lsgemm_kernel_L2_M2_22
  1213. .Lsgemm_kernel_L2_M2_40:
  1214. ands counterL , origK, #7 // counterL = origK % 8
  1215. ble .Lsgemm_kernel_L2_M2_100
  1216. .Lsgemm_kernel_L2_M2_42:
  1217. KERNEL2x2_SUB
  1218. subs counterL, counterL, #1
  1219. bgt .Lsgemm_kernel_L2_M2_42
  1220. .Lsgemm_kernel_L2_M2_100:
  1221. SAVE2x2
  1222. .Lsgemm_kernel_L2_M2_END:
  // 2-column panel, last single row of M (bit 0 of origM), followed by the
  // B-pointer advance past this panel. counterI is not reloaded here; in the
  // visible code it was set from origM at .Lsgemm_kernel_L2_M2_BEGIN.
  // INIT1x2, KERNEL1x2_SUB and SAVE1x2 are macros defined outside this chunk.
  1223. .Lsgemm_kernel_L2_M1_BEGIN:
  1224. tst counterI, #1 // bit 0 of M clear: no single row left
  1225. ble .Lsgemm_kernel_L2_END
  1226. .Lsgemm_kernel_L2_M1_20:
  1227. INIT1x2
  1228. mov pB, origPB
  1229. asr counterL , origK, #3 // counterL = origK / 8
  1230. cmp counterL, #0
  1231. ble .Lsgemm_kernel_L2_M1_40
  1232. .Lsgemm_kernel_L2_M1_22:
  1233. KERNEL1x2_SUB
  1234. KERNEL1x2_SUB
  1235. KERNEL1x2_SUB
  1236. KERNEL1x2_SUB
  1237. KERNEL1x2_SUB
  1238. KERNEL1x2_SUB
  1239. KERNEL1x2_SUB
  1240. KERNEL1x2_SUB
  1241. subs counterL, counterL, #1
  1242. bgt .Lsgemm_kernel_L2_M1_22
  1243. .Lsgemm_kernel_L2_M1_40:
  1244. ands counterL , origK, #7 // counterL = origK % 8
  1245. ble .Lsgemm_kernel_L2_M1_100
  1246. .Lsgemm_kernel_L2_M1_42:
  1247. KERNEL1x2_SUB
  1248. subs counterL, counterL, #1
  1249. bgt .Lsgemm_kernel_L2_M1_42
  1250. .Lsgemm_kernel_L2_M1_100:
  1251. SAVE1x2
  1252. .Lsgemm_kernel_L2_END:
  1253. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1254. /******************************************************************************/
  // Last single column of N: exits if bit 0 of origN is clear, otherwise sets
  // up the C row pointer and A pointer for a 1-column panel.
  1255. .Lsgemm_kernel_L1_BEGIN:
  1256. mov counterJ , origN
  1257. tst counterJ , #1 // bit 0 of N clear: no single column left
  1258. ble .Lsgemm_kernel_L999 // done
  1259. mov pCRow0, pC // pCRow0 = C
  1260. add pC , pC , LDC // pC += LDC (next column of C)
  1261. mov pA, origPA // pA = A
  // 1-column panel, 16-row blocks of M: the outer loop runs origM / 16 times.
  // Each pass restarts pB at origPB and walks K with an 8x unrolled loop
  // (counterL = origK / 8) plus a one-step tail loop for origK % 8.
  // INIT16x1, KERNEL16x1_SUB and SAVE16x1 are macros defined outside this chunk.
  1262. .Lsgemm_kernel_L1_M16_BEGIN:
  1263. mov counterI, origM
  1264. asr counterI, counterI, #4 // counterI = counterI / 16
  1265. cmp counterI, #0
  1266. ble .Lsgemm_kernel_L1_M8_BEGIN
  1267. .Lsgemm_kernel_L1_M16_20:
  1268. INIT16x1
  1269. mov pB, origPB
  1270. asr counterL , origK, #3 // counterL = origK / 8
  1271. cmp counterL , #0
  1272. ble .Lsgemm_kernel_L1_M16_40
  1273. .align 5
  1274. .Lsgemm_kernel_L1_M16_22:
  1275. KERNEL16x1_SUB
  1276. KERNEL16x1_SUB
  1277. KERNEL16x1_SUB
  1278. KERNEL16x1_SUB
  1279. KERNEL16x1_SUB
  1280. KERNEL16x1_SUB
  1281. KERNEL16x1_SUB
  1282. KERNEL16x1_SUB
  1283. subs counterL, counterL, #1
  1284. bgt .Lsgemm_kernel_L1_M16_22
  1285. .Lsgemm_kernel_L1_M16_40:
  1286. ands counterL , origK, #7 // counterL = origK % 8
  1287. ble .Lsgemm_kernel_L1_M16_100
  1288. .Lsgemm_kernel_L1_M16_42:
  1289. KERNEL16x1_SUB
  1290. subs counterL, counterL, #1
  1291. bgt .Lsgemm_kernel_L1_M16_42
  1292. .Lsgemm_kernel_L1_M16_100:
  1293. SAVE16x1
  1294. .Lsgemm_kernel_L1_M16_END:
  1295. subs counterI, counterI, #1 // one 16-row block done
  1296. bgt .Lsgemm_kernel_L1_M16_20
  1297. //------------------------------------------------------------------------------
  // 1-column panel, 8-row remainder of M: runs once when bit 3 of origM is set.
  // K loop is unrolled 8x (counterL = origK / 8) with a tail loop for origK % 8.
  // INIT8x1, KERNEL8x1_SUB and SAVE8x1 are macros defined outside this chunk.
  1298. .Lsgemm_kernel_L1_M8_BEGIN:
  1299. mov counterI, origM
  1300. tst counterI , #15 // (M & 15) == 0: no remainder rows at all
  1301. ble .Lsgemm_kernel_L1_END
  1302. tst counterI, #8 // bit 3 of M clear: no 8-row block, try M4
  1303. ble .Lsgemm_kernel_L1_M4_BEGIN
  1304. .Lsgemm_kernel_L1_M8_20:
  1305. INIT8x1
  1306. mov pB, origPB
  1307. asr counterL , origK, #3 // counterL = origK / 8
  1308. cmp counterL , #0
  1309. ble .Lsgemm_kernel_L1_M8_40
  1310. .align 5
  1311. .Lsgemm_kernel_L1_M8_22:
  1312. KERNEL8x1_SUB
  1313. KERNEL8x1_SUB
  1314. KERNEL8x1_SUB
  1315. KERNEL8x1_SUB
  1316. KERNEL8x1_SUB
  1317. KERNEL8x1_SUB
  1318. KERNEL8x1_SUB
  1319. KERNEL8x1_SUB
  1320. subs counterL, counterL, #1
  1321. bgt .Lsgemm_kernel_L1_M8_22
  1322. .Lsgemm_kernel_L1_M8_40:
  1323. ands counterL , origK, #7 // counterL = origK % 8
  1324. ble .Lsgemm_kernel_L1_M8_100
  1325. .Lsgemm_kernel_L1_M8_42:
  1326. KERNEL8x1_SUB
  1327. subs counterL, counterL, #1
  1328. bgt .Lsgemm_kernel_L1_M8_42
  1329. .Lsgemm_kernel_L1_M8_100:
  1330. SAVE8x1
  1331. .Lsgemm_kernel_L1_M8_END:
  1332. //------------------------------------------------------------------------------
  // 1-column panel, 4-row remainder of M: runs once when bit 2 of origM is set.
  // K loop is unrolled 8x (counterL = origK / 8) with a tail loop for origK % 8.
  // INIT4x1, KERNEL4x1_SUB and SAVE4x1 are macros defined outside this chunk.
  1333. .Lsgemm_kernel_L1_M4_BEGIN:
  1334. mov counterI, origM
  1335. tst counterI , #7 // (M & 7) == 0: no rows left for M4/M2/M1
  1336. ble .Lsgemm_kernel_L1_END
  1337. tst counterI, #4 // bit 2 of M clear: no 4-row block, try M2
  1338. ble .Lsgemm_kernel_L1_M2_BEGIN
  1339. .Lsgemm_kernel_L1_M4_20:
  1340. INIT4x1
  1341. mov pB, origPB
  1342. asr counterL , origK, #3 // counterL = origK / 8
  1343. cmp counterL , #0
  1344. ble .Lsgemm_kernel_L1_M4_40
  1345. .align 5
  1346. .Lsgemm_kernel_L1_M4_22:
  1347. KERNEL4x1_SUB
  1348. KERNEL4x1_SUB
  1349. KERNEL4x1_SUB
  1350. KERNEL4x1_SUB
  1351. KERNEL4x1_SUB
  1352. KERNEL4x1_SUB
  1353. KERNEL4x1_SUB
  1354. KERNEL4x1_SUB
  1355. subs counterL, counterL, #1
  1356. bgt .Lsgemm_kernel_L1_M4_22
  1357. .Lsgemm_kernel_L1_M4_40:
  1358. ands counterL , origK, #7 // counterL = origK % 8
  1359. ble .Lsgemm_kernel_L1_M4_100
  1360. .Lsgemm_kernel_L1_M4_42:
  1361. KERNEL4x1_SUB
  1362. subs counterL, counterL, #1
  1363. bgt .Lsgemm_kernel_L1_M4_42
  1364. .Lsgemm_kernel_L1_M4_100:
  1365. SAVE4x1
  1366. .Lsgemm_kernel_L1_M4_END:
  1367. //------------------------------------------------------------------------------
  // 1-column panel, 2-row remainder of M: runs once when bit 1 of origM is set.
  // K loop is unrolled 8x (counterL = origK / 8) with a tail loop for origK % 8.
  // INIT2x1, KERNEL2x1_SUB and SAVE2x1 are macros defined outside this chunk.
  1368. .Lsgemm_kernel_L1_M2_BEGIN:
  1369. mov counterI, origM
  1370. tst counterI , #3 // (M & 3) == 0: no rows left for M2/M1
  1371. ble .Lsgemm_kernel_L1_END
  1372. tst counterI, #2 // bit 1 of M clear: no 2-row block, try M1
  1373. ble .Lsgemm_kernel_L1_M1_BEGIN
  1374. .Lsgemm_kernel_L1_M2_20:
  1375. INIT2x1
  1376. mov pB, origPB
  1377. asr counterL , origK, #3 // counterL = origK / 8
  1378. cmp counterL , #0
  1379. ble .Lsgemm_kernel_L1_M2_40
  1380. .Lsgemm_kernel_L1_M2_22:
  1381. KERNEL2x1_SUB
  1382. KERNEL2x1_SUB
  1383. KERNEL2x1_SUB
  1384. KERNEL2x1_SUB
  1385. KERNEL2x1_SUB
  1386. KERNEL2x1_SUB
  1387. KERNEL2x1_SUB
  1388. KERNEL2x1_SUB
  1389. subs counterL, counterL, #1
  1390. bgt .Lsgemm_kernel_L1_M2_22
  1391. .Lsgemm_kernel_L1_M2_40:
  1392. ands counterL , origK, #7 // counterL = origK % 8
  1393. ble .Lsgemm_kernel_L1_M2_100
  1394. .Lsgemm_kernel_L1_M2_42:
  1395. KERNEL2x1_SUB
  1396. subs counterL, counterL, #1
  1397. bgt .Lsgemm_kernel_L1_M2_42
  1398. .Lsgemm_kernel_L1_M2_100:
  1399. SAVE2x1
  1400. .Lsgemm_kernel_L1_M2_END:
  // 1-column panel, last single row of M (bit 0 of origM). counterI is not
  // reloaded here; in the visible code it was set from origM at
  // .Lsgemm_kernel_L1_M2_BEGIN. INIT1x1, KERNEL1x1_SUB and SAVE1x1 are macros
  // defined outside this chunk.
  1401. .Lsgemm_kernel_L1_M1_BEGIN:
  1402. tst counterI, #1 // bit 0 of M clear: no single row left
  1403. ble .Lsgemm_kernel_L1_END
  1404. .Lsgemm_kernel_L1_M1_20:
  1405. INIT1x1
  1406. mov pB, origPB
  1407. asr counterL , origK, #3 // counterL = origK / 8
  1408. cmp counterL , #0
  1409. ble .Lsgemm_kernel_L1_M1_40
  1410. .Lsgemm_kernel_L1_M1_22:
  1411. KERNEL1x1_SUB
  1412. KERNEL1x1_SUB
  1413. KERNEL1x1_SUB
  1414. KERNEL1x1_SUB
  1415. KERNEL1x1_SUB
  1416. KERNEL1x1_SUB
  1417. KERNEL1x1_SUB
  1418. KERNEL1x1_SUB
  1419. subs counterL, counterL, #1
  1420. bgt .Lsgemm_kernel_L1_M1_22
  1421. .Lsgemm_kernel_L1_M1_40:
  1422. ands counterL , origK, #7 // counterL = origK % 8
  1423. ble .Lsgemm_kernel_L1_M1_100
  1424. .Lsgemm_kernel_L1_M1_42:
  1425. KERNEL1x1_SUB
  1426. subs counterL, counterL, #1
  1427. bgt .Lsgemm_kernel_L1_M1_42
  1428. .Lsgemm_kernel_L1_M1_100:
  1429. SAVE1x1
  1430. .Lsgemm_kernel_L1_END:
  // Common exit: sets x0 = 0, reloads registers from the 11*16-byte area at sp,
  // releases that area and returns. The matching stores are presumably in the
  // prologue, which is outside this chunk; EPILOGUE is a macro defined elsewhere.
  // NOTE(review): x18 is platform-reserved on some ABIs, and d16/d17 are not
  // callee-saved under AAPCS64 - confirm against the prologue and target ABI.
  1431. .Lsgemm_kernel_L999:
  1432. mov x0, #0 // set return value
  1433. ldp d8, d9, [sp, #(0 * 16)]
  1434. ldp d10, d11, [sp, #(1 * 16)]
  1435. ldp d12, d13, [sp, #(2 * 16)]
  1436. ldp d14, d15, [sp, #(3 * 16)]
  1437. ldp d16, d17, [sp, #(4 * 16)]
  1438. ldp x18, x19, [sp, #(5 * 16)]
  1439. ldp x20, x21, [sp, #(6 * 16)]
  1440. ldp x22, x23, [sp, #(7 * 16)]
  1441. ldp x24, x25, [sp, #(8 * 16)]
  1442. ldp x26, x27, [sp, #(9 * 16)]
  1443. ldr x28, [sp, #(10 * 16)]
  1444. add sp, sp, #(11*16) // release the 176-byte save area
  1445. ret
  1446. EPILOGUE