You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_kernel_8x4.S 29 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  // Incoming arguments in register order; the scalar alpha arrives in d0.
  29. /* X0 X1 X2 d0 X3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  // Argument registers.
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  // Scratch register, loop counters and working pointers.
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  // alpha is parked in x17 as raw bits; every SAVE macro moves it back into
  // d10 (alpha0) and multiplies by lane 0 of v10 (alphaV0).
  48. #define alpha x17
  49. #define alpha0 d10
  50. #define alphaV0 v10.d[0]
  // Prefetch look-ahead distances, used as prfm offsets from pA, pB and the
  // pCRow pointers respectively.
  51. #define A_PRE_SIZE 2560
  52. #define B_PRE_SIZE 448
  53. #define C_PRE_SIZE 128
  54. // 00 origM
  55. // 01 origN
  56. // 02 origK
  57. // 03 origPA
  58. // 04 origPB
  59. // 05 pC
  60. // 06 origLDC -> LDC
  61. // 07 temp
  62. // 08 counterL
  63. // 09 counterI
  64. // 10 counterJ
  65. // 11 pB
  66. // 12 pCRow0
  67. // 13 pCRow1
  68. // 14 pCRow2
  69. // 15 pCRow3
  70. // 16 pA
  71. // 17 alpha (raw bits of the scalar, moved in from d0)
  72. // 18 must save
  73. // 19 must save
  74. // 20 must save
  75. // 21 must save
  76. // 22 must save
  77. // 23 must save
  78. // 24 must save
  79. // 25 must save
  80. // 26 must save
  81. // 27 must save
  82. // 28 must save
  83. // 29 frame
  84. // 30 link
  85. // 31 sp
  86. //v00 ALPHA -> pA0_0, pA0_1
  87. //v01 pA0_2, pA0_3
  88. //v02 pA0_4, pA0_5
  89. //v03 pA0_6, pA0_7
  90. //v04 pA1_0, pA1_1
  91. //v05 pA1_2, pA1_3
  92. //v06 pA1_4, pA1_5
  93. //v07 pA1_6, pA1_7
  94. //v08 must save pB0_0
  95. //v09 must save pB0_1
  96. //v10 must save pB0_2 --> ALPHA0
  97. //v11 must save pB0_3
  98. //v12 must save pB1_0
  99. //v13 must save pB1_1
  100. //v14 must save pB1_2
  101. //v15 must save pB1_3
  102. //v16 must save C00, C01
  103. //v17 must save C02, C03
  104. //v18 C04, C05
  105. //v19 C06, C07
  106. //v20 C10, C11
  107. //v21 C12, C13
  108. //v22 C14, C15
  109. //v23 C16, C17
  110. //v24 C20, C21
  111. //v25 C22, C23
  112. //v26 C24, C25
  113. //v27 C26, C27
  114. //v28 C30, C31
  115. //v29 C32, C33
  116. //v30 C34, C35
  117. //v31 C36, C37
  118. /*******************************************************************************
  119. * Macro definitions
  120. *******************************************************************************/
  // INIT8x4: clear the sixteen accumulators v16-v31 of the 8x4 tile.
  // A scalar write to dN also zeroes the upper half of vN, so each fmov
  // leaves the whole vector register at zero. Some registers are copied
  // from an already-cleared accumulator instead of xzr.
  121. .macro INIT8x4
  122. fmov d16, xzr
  123. fmov d17, xzr
  124. fmov d18, d16
  125. fmov d19, xzr
  126. fmov d20, xzr
  127. fmov d21, d16
  128. fmov d22, d17
  129. fmov d23, d18
  130. fmov d24, xzr
  131. fmov d25, d16
  132. fmov d26, d17
  133. fmov d27, d18
  134. fmov d28, xzr
  135. fmov d29, d16
  136. fmov d30, d17
  137. fmov d31, d18
  138. .endm
  // KERNEL8x4_I: opening step of the pipelined 8x4 loop.
  // Loads 8 doubles of A into v0-v3 and 4 doubles of B into d8-d11, then
  // initialises v16-v31 with fmul (not fmla), so no INIT8x4 is required
  // before it. While computing, it also loads the operands of the next
  // k-step into v4-v7 and d12-d15, ready for KERNEL8x4_M2 or KERNEL8x4_E.
  // Post-increments pA by 128 bytes and pB by 64 bytes in total.
  139. .macro KERNEL8x4_I
  140. ldp q0, q1, [pA], #32
  141. ldp d8, d9, [pB], #16
  142. fmul v16.2d, v0.2d, v8.d[0]
  143. fmul v20.2d, v0.2d, v9.d[0]
  144. ldp d10, d11, [pB], #16
  145. fmul v17.2d, v1.2d, v8.d[0]
  146. fmul v21.2d, v1.2d, v9.d[0]
  147. ldp q2, q3, [pA], #32
  148. fmul v24.2d, v0.2d, v10.d[0]
  149. fmul v28.2d, v0.2d, v11.d[0]
  150. ldp q4, q5, [pA], #32
  151. fmul v25.2d, v1.2d, v10.d[0]
  152. fmul v29.2d, v1.2d, v11.d[0]
  153. ldp d12, d13, [pB], #16
  154. fmul v18.2d, v2.2d, v8.d[0]
  155. fmul v22.2d, v2.2d, v9.d[0]
  156. ldp d14, d15, [pB], #16
  157. fmul v26.2d, v2.2d, v10.d[0]
  158. fmul v30.2d, v2.2d, v11.d[0]
  159. ldp q6, q7, [pA], #32
  160. fmul v19.2d, v3.2d, v8.d[0]
  161. fmul v27.2d, v3.2d, v10.d[0]
  162. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  163. fmul v31.2d, v3.2d, v11.d[0]
  164. fmul v23.2d, v3.2d, v9.d[0]
  165. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  166. .endm
  // KERNEL8x4_M1: pipelined middle step working on the first register set.
  // Accumulates v0-v3 (A) times d8-d11 (B) into v16-v31 while loading the
  // next k-step into v4-v7 and d12-d15. Must be followed by KERNEL8x4_M2 or
  // KERNEL8x4_E, which consume those registers.
  // Post-increments pA by 64 bytes and pB by 32 bytes; prefetches ahead in A.
  167. .macro KERNEL8x4_M1
  168. fmla v16.2d, v0.2d, v8.d[0]
  169. fmla v20.2d, v0.2d, v9.d[0]
  170. ldp q4, q5, [pA], #32
  171. fmla v24.2d, v0.2d, v10.d[0]
  172. fmla v28.2d, v0.2d, v11.d[0]
  173. ldp d12, d13, [pB], #16
  174. fmla v17.2d, v1.2d, v8.d[0]
  175. fmla v25.2d, v1.2d, v10.d[0]
  176. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  177. fmla v21.2d, v1.2d, v9.d[0]
  178. fmla v29.2d, v1.2d, v11.d[0]
  179. ldp d14, d15, [pB], #16
  180. fmla v18.2d, v2.2d, v8.d[0]
  181. fmla v22.2d, v2.2d, v9.d[0]
  182. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  183. fmla v26.2d, v2.2d, v10.d[0]
  184. fmla v30.2d, v2.2d, v11.d[0]
  185. fmla v19.2d, v3.2d, v8.d[0]
  186. fmla v23.2d, v3.2d, v9.d[0]
  187. ldp q6, q7, [pA], #32
  188. fmla v27.2d, v3.2d, v10.d[0]
  189. fmla v31.2d, v3.2d, v11.d[0]
  190. .endm
  // KERNEL8x4_M2: pipelined middle step working on the second register set.
  // Accumulates v4-v7 (A) times d12-d15 (B) into v16-v31 while loading the
  // next k-step into v0-v3 and d8-d11 for a following KERNEL8x4_M1.
  // Post-increments pA by 64 bytes and pB by 32 bytes; prefetches ahead in B.
  191. .macro KERNEL8x4_M2
  192. fmla v16.2d, v4.2d, v12.d[0]
  193. fmla v20.2d, v4.2d, v13.d[0]
  194. fmla v24.2d, v4.2d, v14.d[0]
  195. fmla v28.2d, v4.2d, v15.d[0]
  196. ldp q0, q1, [pA], #32
  197. fmla v17.2d, v5.2d, v12.d[0]
  198. fmla v25.2d, v5.2d, v14.d[0]
  199. ldp d8, d9, [pB], #16
  200. fmla v21.2d, v5.2d, v13.d[0]
  201. fmla v29.2d, v5.2d, v15.d[0]
  202. ldp d10, d11, [pB], #16
  203. fmla v18.2d, v6.2d, v12.d[0]
  204. fmla v22.2d, v6.2d, v13.d[0]
  205. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  206. fmla v26.2d, v6.2d, v14.d[0]
  207. fmla v30.2d, v6.2d, v15.d[0]
  208. fmla v19.2d, v7.2d, v12.d[0]
  209. fmla v23.2d, v7.2d, v13.d[0]
  210. ldp q2, q3, [pA], #32
  211. fmla v27.2d, v7.2d, v14.d[0]
  212. fmla v31.2d, v7.2d, v15.d[0]
  213. .endm
  // KERNEL8x4_E: closing step of the pipelined 8x4 loop.
  // Same arithmetic as KERNEL8x4_M2 (v4-v7 times d12-d15 into v16-v31) but
  // without any loads, so pA and pB are left unchanged and nothing is read
  // past the operands already fetched by the preceding step.
  214. .macro KERNEL8x4_E
  215. fmla v16.2d, v4.2d, v12.d[0]
  216. fmla v20.2d, v4.2d, v13.d[0]
  217. fmla v24.2d, v4.2d, v14.d[0]
  218. fmla v28.2d, v4.2d, v15.d[0]
  219. fmla v17.2d, v5.2d, v12.d[0]
  220. fmla v25.2d, v5.2d, v14.d[0]
  221. fmla v21.2d, v5.2d, v13.d[0]
  222. fmla v29.2d, v5.2d, v15.d[0]
  223. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  224. fmla v18.2d, v6.2d, v12.d[0]
  225. fmla v22.2d, v6.2d, v13.d[0]
  226. fmla v26.2d, v6.2d, v14.d[0]
  227. fmla v30.2d, v6.2d, v15.d[0]
  228. fmla v19.2d, v7.2d, v12.d[0]
  229. fmla v23.2d, v7.2d, v13.d[0]
  230. fmla v27.2d, v7.2d, v14.d[0]
  231. fmla v31.2d, v7.2d, v15.d[0]
  232. .endm
  // KERNEL8x4_SUB: one self-contained k-step for the 8x4 tile, used for the
  // K % 8 remainder. Loads 8 doubles of A (v0-v3) and 4 doubles of B
  // (d8-d11) and accumulates into v16-v31, which must already hold valid
  // partial sums or zeros. Post-increments pA by 64 bytes and pB by 32 bytes.
  233. .macro KERNEL8x4_SUB
  234. ldp q0, q1, [pA], #32
  235. ldp d8, d9, [pB], #16
  236. fmla v16.2d, v0.2d, v8.d[0]
  237. fmla v20.2d, v0.2d, v9.d[0]
  238. ldp d10, d11, [pB], #16
  239. fmla v17.2d, v1.2d, v8.d[0]
  240. fmla v21.2d, v1.2d, v9.d[0]
  241. ldp q2, q3, [pA], #32
  242. fmla v24.2d, v0.2d, v10.d[0]
  243. fmla v28.2d, v0.2d, v11.d[0]
  244. fmla v25.2d, v1.2d, v10.d[0]
  245. fmla v29.2d, v1.2d, v11.d[0]
  246. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  247. fmla v18.2d, v2.2d, v8.d[0]
  248. fmla v22.2d, v2.2d, v9.d[0]
  249. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  250. fmla v26.2d, v2.2d, v10.d[0]
  251. fmla v30.2d, v2.2d, v11.d[0]
  252. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  253. fmla v19.2d, v3.2d, v8.d[0]
  254. fmla v27.2d, v3.2d, v10.d[0]
  255. fmla v31.2d, v3.2d, v11.d[0]
  256. fmla v23.2d, v3.2d, v9.d[0]
  257. .endm
  // SAVE8x4: write the 8x4 tile back as C += alpha * accumulator.
  // v16-v19 go to pCRow0, v20-v23 to pCRow1, v24-v27 to pCRow2 and v28-v31
  // to pCRow3, 32 bytes at a time. Each of the four pointers ends up 64
  // bytes further on, i.e. at the next 8-wide tile.
  // Clobbers v0-v7 (used as load/store scratch) and d10 (reloaded with alpha,
  // which overwrites whatever B value it last held).
  258. .macro SAVE8x4
  259. fmov alpha0, alpha
  260. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  261. ldp q0, q1, [pCRow0]
  262. fmla v0.2d, v16.2d, alphaV0
  263. fmla v1.2d, v17.2d, alphaV0
  264. stp q0, q1, [pCRow0]
  265. add pCRow0, pCRow0, #32
  266. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  267. ldp q2, q3, [pCRow0]
  268. fmla v2.2d, v18.2d, alphaV0
  269. fmla v3.2d, v19.2d, alphaV0
  270. stp q2, q3, [pCRow0]
  271. add pCRow0, pCRow0, #32
  272. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  273. ldp q4, q5, [pCRow1]
  274. fmla v4.2d, v20.2d, alphaV0
  275. fmla v5.2d, v21.2d, alphaV0
  276. stp q4, q5, [pCRow1]
  277. add pCRow1, pCRow1, #32
  278. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  279. ldp q6, q7, [pCRow1]
  280. fmla v6.2d, v22.2d, alphaV0
  281. fmla v7.2d, v23.2d, alphaV0
  282. stp q6, q7, [pCRow1]
  283. add pCRow1, pCRow1, #32
  284. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  285. ldp q0, q1, [pCRow2]
  286. fmla v0.2d, v24.2d, alphaV0
  287. fmla v1.2d, v25.2d, alphaV0
  288. stp q0, q1, [pCRow2]
  289. add pCRow2, pCRow2, #32
  290. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  291. ldp q2, q3, [pCRow2]
  292. fmla v2.2d, v26.2d, alphaV0
  293. fmla v3.2d, v27.2d, alphaV0
  294. stp q2, q3, [pCRow2]
  295. add pCRow2, pCRow2, #32
  296. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  297. ldp q4, q5, [pCRow3]
  298. fmla v4.2d, v28.2d, alphaV0
  299. fmla v5.2d, v29.2d, alphaV0
  300. stp q4, q5, [pCRow3]
  301. add pCRow3, pCRow3, #32
  302. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  303. ldp q6, q7, [pCRow3]
  304. fmla v6.2d, v30.2d, alphaV0
  305. fmla v7.2d, v31.2d, alphaV0
  306. stp q6, q7, [pCRow3]
  307. add pCRow3, pCRow3, #32
  308. .endm
  309. /******************************************************************************/
  // INIT4x4: clear the eight accumulators of the 4x4 tile
  // (v16, v17, v20, v21, v24, v25, v28, v29).
  310. .macro INIT4x4
  311. fmov d16, xzr
  312. fmov d17, d16
  313. fmov d20, d17
  314. fmov d21, d16
  315. fmov d24, d17
  316. fmov d25, d16
  317. fmov d28, d17
  318. fmov d29, d16
  319. .endm
  // KERNEL4x4_SUB: one k-step for the 4x4 tile.
  // Loads 4 doubles of B into v8/v9 and 4 doubles of A into v0/v1, then adds
  // A times each B lane into the accumulator pair for that lane:
  // v16/v17 (v8.d[0]), v20/v21 (v8.d[1]), v24/v25 (v9.d[0]), v28/v29 (v9.d[1]).
  // Advances pA and pB by 32 bytes each.
  320. .macro KERNEL4x4_SUB
  321. ld1 {v8.2d, v9.2d}, [pB]
  322. add pB, pB, #32
  323. ld1 {v0.2d, v1.2d}, [pA]
  324. add pA, pA, #32
  325. fmla v16.2d, v0.2d, v8.d[0]
  326. fmla v29.2d, v1.2d, v9.d[1]
  327. fmla v20.2d, v0.2d, v8.d[1]
  328. fmla v25.2d, v1.2d, v9.d[0]
  329. fmla v24.2d, v0.2d, v9.d[0]
  330. fmla v21.2d, v1.2d, v8.d[1]
  331. fmla v28.2d, v0.2d, v9.d[1]
  332. fmla v17.2d, v1.2d, v8.d[0]
  333. .endm
  // SAVE4x4: write the 4x4 tile back as C += alpha * accumulator.
  // The four destination lines are pCRow0 and pCRow0 + 1, 2, 3 times LDC;
  // pCRow1 and pCRow2 are recomputed here and used as temporaries (pCRow1
  // also holds the fourth address), pCRow3 is not touched.
  // pCRow0 advances by 32 bytes. Clobbers v8, v9, v12, v13 and d10.
  334. .macro SAVE4x4
  335. fmov alpha0, alpha
  336. ld1 {v8.2d, v9.2d}, [pCRow0]
  337. fmla v8.2d, v16.2d, alphaV0
  338. fmla v9.2d, v17.2d, alphaV0
  339. st1 {v8.2d, v9.2d}, [pCRow0]
  340. add pCRow1, pCRow0, LDC
  341. ld1 {v12.2d, v13.2d}, [pCRow1]
  342. fmla v12.2d, v20.2d, alphaV0
  343. fmla v13.2d, v21.2d, alphaV0
  344. st1 {v12.2d, v13.2d}, [pCRow1]
  345. add pCRow2, pCRow1, LDC
  346. ld1 {v8.2d, v9.2d}, [pCRow2]
  347. fmla v8.2d, v24.2d, alphaV0
  348. fmla v9.2d, v25.2d, alphaV0
  349. st1 {v8.2d, v9.2d}, [pCRow2]
  350. add pCRow1, pCRow2, LDC
  351. ld1 {v12.2d, v13.2d}, [pCRow1]
  352. fmla v12.2d, v28.2d, alphaV0
  353. fmla v13.2d, v29.2d, alphaV0
  354. st1 {v12.2d, v13.2d}, [pCRow1]
  355. add pCRow0, pCRow0, #32
  356. .endm
  357. /******************************************************************************/
  // INIT2x4: clear the four accumulators of the 2x4 tile (v16, v20, v24, v28).
  358. .macro INIT2x4
  359. fmov d16, xzr
  360. fmov d20, d16
  361. fmov d24, d20
  362. fmov d28, d16
  363. .endm
  // KERNEL2x4_SUB: one k-step for the 2x4 tile.
  // Loads 4 doubles of B (v8, v9) and 2 doubles of A (v0); each B lane
  // scales v0 into its own accumulator: v16, v20, v24, v28.
  // Advances pB by 32 bytes and pA by 16 bytes.
  364. .macro KERNEL2x4_SUB
  365. ld1 {v8.2d, v9.2d}, [pB]
  366. add pB, pB, #32
  367. ld1 {v0.2d}, [pA]
  368. add pA, pA, #16
  369. fmla v16.2d, v0.2d, v8.d[0]
  370. fmla v20.2d, v0.2d, v8.d[1]
  371. fmla v24.2d, v0.2d, v9.d[0]
  372. fmla v28.2d, v0.2d, v9.d[1]
  373. .endm
  // SAVE2x4: write the 2x4 tile back as C += alpha * accumulator.
  // Destinations are pCRow0 and pCRow0 + 1, 2, 3 times LDC, 16 bytes each;
  // pCRow1 and pCRow2 are overwritten as temporaries.
  // pCRow0 advances by 16 bytes. Clobbers v8, v12 and d10.
  374. .macro SAVE2x4
  375. fmov alpha0, alpha
  376. ld1 {v8.2d}, [pCRow0]
  377. fmla v8.2d, v16.2d, alphaV0
  378. st1 {v8.2d}, [pCRow0]
  379. add pCRow1, pCRow0, LDC
  380. ld1 {v12.2d}, [pCRow1]
  381. fmla v12.2d, v20.2d, alphaV0
  382. st1 {v12.2d}, [pCRow1]
  383. add pCRow2, pCRow1, LDC
  384. ld1 {v8.2d}, [pCRow2]
  385. fmla v8.2d, v24.2d, alphaV0
  386. st1 {v8.2d}, [pCRow2]
  387. add pCRow1, pCRow2, LDC
  388. ld1 {v12.2d}, [pCRow1]
  389. fmla v12.2d, v28.2d, alphaV0
  390. st1 {v12.2d}, [pCRow1]
  391. add pCRow0, pCRow0, #16
  392. .endm
  393. /******************************************************************************/
  // INIT1x4: clear the two accumulators of the 1x4 tile (v16, v20).
  394. .macro INIT1x4
  395. fmov d16, xzr
  396. fmov d20, d16
  397. .endm
  // KERNEL1x4_SUB: one k-step for the 1x4 tile.
  // Here the roles are swapped: the single A value (d0) is the scalar and
  // the four B values (v8, v9) are the vectors, so v16 holds the results
  // for B elements 0-1 and v20 for B elements 2-3.
  // Advances pA by 8 bytes and pB by 32 bytes.
  398. .macro KERNEL1x4_SUB
  399. ldr d0, [pA]
  400. add pA, pA, #8
  401. ld1 {v8.2d, v9.2d}, [pB]
  402. add pB, pB, #32
  403. fmla v16.2d, v8.2d, v0.d[0]
  404. fmla v20.2d, v9.2d, v0.d[0]
  405. .endm
  // SAVE1x4: write the 1x4 tile back as C += alpha * accumulator.
  // The two lanes of each accumulator belong to different C lines, so they
  // are gathered and scattered lane by lane: v16 lanes go to pCRow0 and
  // pCRow0 + LDC, v20 lanes to pCRow0 + 2*LDC and pCRow0 + 3*LDC.
  // pCRow1 and pCRow2 are overwritten as temporaries; pCRow0 advances by
  // 8 bytes. Clobbers v8, v12 and d10.
  406. .macro SAVE1x4
  407. fmov alpha0, alpha
  408. add pCRow1, pCRow0, LDC
  409. ld1 {v8.d}[0], [pCRow0]
  410. ld1 {v8.d}[1], [pCRow1]
  411. fmla v8.2d, v16.2d, alphaV0
  412. st1 {v8.d}[0], [pCRow0]
  413. st1 {v8.d}[1], [pCRow1]
  414. add pCRow2, pCRow1, LDC
  415. add pCRow1, pCRow2, LDC
  416. ld1 {v12.d}[0], [pCRow2]
  417. ld1 {v12.d}[1], [pCRow1]
  418. fmla v12.2d, v20.2d, alphaV0
  419. st1 {v12.d}[0], [pCRow2]
  420. st1 {v12.d}[1], [pCRow1]
  421. add pCRow0, pCRow0, #8
  422. .endm
  423. /******************************************************************************/
  // INIT8x2: clear the eight accumulators of the 8x2 tile (v16-v23).
  424. .macro INIT8x2
  425. fmov d16, xzr
  426. fmov d17, xzr
  427. fmov d18, d16
  428. fmov d19, d17
  429. fmov d20, xzr
  430. fmov d21, d16
  431. fmov d22, d17
  432. fmov d23, d18
  433. .endm
  // KERNEL8x2_SUB: one k-step for the 8x2 tile.
  // Loads 8 doubles of A (v0-v3) and 2 doubles of B (v8); lane 0 of v8
  // feeds v16-v19 and lane 1 feeds v20-v23.
  // Advances pA by 64 bytes and pB by 16 bytes.
  434. .macro KERNEL8x2_SUB
  435. ld1 {v0.2d, v1.2d}, [pA]
  436. add pA, pA, #32
  437. ld1 {v8.2d}, [pB]
  438. add pB, pB, #16
  439. ld1 {v2.2d, v3.2d}, [pA]
  440. add pA, pA, #32
  441. fmla v16.2d, v0.2d, v8.d[0]
  442. fmla v17.2d, v1.2d, v8.d[0]
  443. fmla v18.2d, v2.2d, v8.d[0]
  444. fmla v19.2d, v3.2d, v8.d[0]
  445. fmla v20.2d, v0.2d, v8.d[1]
  446. fmla v21.2d, v1.2d, v8.d[1]
  447. fmla v22.2d, v2.2d, v8.d[1]
  448. fmla v23.2d, v3.2d, v8.d[1]
  449. .endm
  // SAVE8x2: write the 8x2 tile back as C += alpha * accumulator.
  // v16-v19 go to pCRow0 and v20-v23 to pCRow0 + LDC (held in pCRow1),
  // 64 bytes per line. pCRow0 advances by 64 bytes.
  // Clobbers v0-v7 and d10.
  450. .macro SAVE8x2
  451. fmov alpha0, alpha
  452. add pCRow1, pCRow0, LDC
  453. ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  454. fmla v0.2d, v16.2d, alphaV0
  455. fmla v1.2d, v17.2d, alphaV0
  456. fmla v2.2d, v18.2d, alphaV0
  457. fmla v3.2d, v19.2d, alphaV0
  458. st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  459. ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  460. fmla v4.2d, v20.2d, alphaV0
  461. fmla v5.2d, v21.2d, alphaV0
  462. fmla v6.2d, v22.2d, alphaV0
  463. fmla v7.2d, v23.2d, alphaV0
  464. st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  465. add pCRow0, pCRow0, #64
  466. .endm
  467. /******************************************************************************/
  // INIT4x2: clear the four accumulators of the 4x2 tile (v16, v17, v20, v21).
  468. .macro INIT4x2
  469. fmov d16, xzr
  470. fmov d17, d16
  471. fmov d20, d17
  472. fmov d21, d16
  473. .endm
  // KERNEL4x2_SUB: one k-step for the 4x2 tile.
  // Loads 2 doubles of B (v8) and 4 doubles of A (v0, v1); lane 0 of v8
  // feeds v16/v17 and lane 1 feeds v20/v21.
  // Advances pB by 16 bytes and pA by 32 bytes.
  474. .macro KERNEL4x2_SUB
  475. ld1 {v8.2d}, [pB]
  476. add pB, pB, #16
  477. ld1 {v0.2d, v1.2d}, [pA]
  478. add pA, pA, #32
  479. fmla v16.2d, v0.2d, v8.d[0]
  480. fmla v17.2d, v1.2d, v8.d[0]
  481. fmla v20.2d, v0.2d, v8.d[1]
  482. fmla v21.2d, v1.2d, v8.d[1]
  483. .endm
  // SAVE4x2: write the 4x2 tile back as C += alpha * accumulator.
  // v16/v17 go to pCRow0 and v20/v21 to pCRow0 + LDC (held in pCRow1).
  // pCRow0 advances by 32 bytes. Clobbers v8, v9, v12, v13 and d10.
  484. .macro SAVE4x2
  485. fmov alpha0, alpha
  486. ld1 {v8.2d, v9.2d}, [pCRow0]
  487. fmla v8.2d, v16.2d, alphaV0
  488. fmla v9.2d, v17.2d, alphaV0
  489. st1 {v8.2d, v9.2d}, [pCRow0]
  490. add pCRow1, pCRow0, LDC
  491. ld1 {v12.2d, v13.2d}, [pCRow1]
  492. fmla v12.2d, v20.2d, alphaV0
  493. fmla v13.2d, v21.2d, alphaV0
  494. st1 {v12.2d, v13.2d}, [pCRow1]
  495. add pCRow0, pCRow0, #32
  496. .endm
  497. /******************************************************************************/
  // INIT2x2: clear the two accumulators of the 2x2 tile (v16, v20).
  498. .macro INIT2x2
  499. fmov d16, xzr
  500. fmov d20, d16
  501. .endm
  // KERNEL2x2_SUB: one k-step for the 2x2 tile.
  // Loads 2 doubles of B (v8) and 2 doubles of A (v0); lane 0 of v8 feeds
  // v16 and lane 1 feeds v20. Advances pA and pB by 16 bytes each.
  502. .macro KERNEL2x2_SUB
  503. ld1 {v8.2d}, [pB]
  504. add pB, pB, #16
  505. ld1 {v0.2d}, [pA]
  506. add pA, pA, #16
  507. fmla v16.2d, v0.2d, v8.d[0]
  508. fmla v20.2d, v0.2d, v8.d[1]
  509. .endm
  // SAVE2x2: write the 2x2 tile back as C += alpha * accumulator.
  // v16 goes to pCRow0 and v20 to pCRow0 + LDC (held in pCRow1).
  // pCRow0 advances by 16 bytes. Clobbers v8, v12 and d10.
  510. .macro SAVE2x2
  511. fmov alpha0, alpha
  512. ld1 {v8.2d}, [pCRow0]
  513. fmla v8.2d, v16.2d, alphaV0
  514. st1 {v8.2d}, [pCRow0]
  515. add pCRow1 , pCRow0, LDC
  516. ld1 {v12.2d}, [pCRow1]
  517. fmla v12.2d, v20.2d, alphaV0
  518. st1 {v12.2d}, [pCRow1]
  519. add pCRow0, pCRow0, #16
  520. .endm
  521. /******************************************************************************/
  // INIT1x2: clear the single accumulator of the 1x2 tile (v16).
  522. .macro INIT1x2
  523. fmov d16, xzr
  524. .endm
  // KERNEL1x2_SUB: one k-step for the 1x2 tile.
  // The single A value (d0) scales the two B values in v8, so lane 0 of
  // v16 belongs to B element 0 and lane 1 to B element 1.
  // Advances pB by 16 bytes and pA by 8 bytes.
  525. .macro KERNEL1x2_SUB
  526. ld1 {v8.2d} , [pB]
  527. add pB , pB, #16
  528. ldr d0 , [pA]
  529. add pA, pA, #8
  530. fmla v16.2d, v8.2d, v0.d[0]
  531. .endm
  // SAVE1x2: write the 1x2 tile back as C += alpha * accumulator.
  // The two lanes of v16 belong to different C lines, so lane 0 is loaded
  // from and stored to pCRow0 and lane 1 to pCRow0 + LDC (held in pCRow1).
  // pCRow0 advances by 8 bytes. Clobbers v8 and d10.
  532. .macro SAVE1x2
  533. fmov alpha0, alpha
  534. add pCRow1 , pCRow0, LDC
  535. ld1 {v8.d}[0], [pCRow0]
  536. ld1 {v8.d}[1], [pCRow1]
  537. fmla v8.2d, v16.2d, alphaV0
  538. st1 {v8.d}[0], [pCRow0]
  539. st1 {v8.d}[1], [pCRow1]
  540. add pCRow0, pCRow0, #8
  541. .endm
  542. /******************************************************************************/
  // INIT8x1: clear the four accumulators of the 8x1 tile (v16-v19).
  543. .macro INIT8x1
  544. fmov d16, xzr
  545. fmov d17, xzr
  546. fmov d18, d16
  547. fmov d19, d17
  548. .endm
  // KERNEL8x1_SUB: one k-step for the 8x1 tile.
  // Loads 8 doubles of A (v0-v3) and one double of B (d8) and accumulates
  // A times that scalar into v16-v19.
  // Advances pA by 64 bytes and pB by 8 bytes.
  549. .macro KERNEL8x1_SUB
  550. ld1 {v0.2d, v1.2d}, [pA]
  551. add pA , pA, #32
  552. ldr d8, [pB]
  553. add pB , pB, #8
  554. ld1 {v2.2d, v3.2d}, [pA]
  555. add pA, pA, #32
  556. fmla v16.2d, v0.2d, v8.d[0]
  557. fmla v17.2d, v1.2d, v8.d[0]
  558. fmla v18.2d, v2.2d, v8.d[0]
  559. fmla v19.2d, v3.2d, v8.d[0]
  560. .endm
  // SAVE8x1: write the 8x1 tile back as C += alpha * accumulator.
  // v16-v19 are added into the 64 bytes at pCRow0, which then advances by
  // 64 bytes. Clobbers v0-v3 and d10.
  561. .macro SAVE8x1
  562. fmov alpha0, alpha
  563. ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  564. fmla v0.2d, v16.2d, alphaV0
  565. fmla v1.2d, v17.2d, alphaV0
  566. fmla v2.2d, v18.2d, alphaV0
  567. fmla v3.2d, v19.2d, alphaV0
  568. st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
  569. add pCRow0, pCRow0, #64
  570. .endm
  571. /******************************************************************************/
  // INIT4x1: clear the two accumulators of the 4x1 tile (v16, v17).
  572. .macro INIT4x1
  573. fmov d16, xzr
  574. fmov d17, d16
  575. .endm
  // KERNEL4x1_SUB: one k-step for the 4x1 tile.
  // Loads one double of B (d8) and 4 doubles of A (v0, v1) and accumulates
  // A times that scalar into v16/v17.
  // Advances pB by 8 bytes and pA by 32 bytes.
  576. .macro KERNEL4x1_SUB
  577. ldr d8, [pB]
  578. add pB , pB, #8
  579. ld1 {v0.2d, v1.2d}, [pA]
  580. add pA , pA, #32
  581. fmla v16.2d, v0.2d, v8.d[0]
  582. fmla v17.2d, v1.2d, v8.d[0]
  583. .endm
  // SAVE4x1: write the 4x1 tile back as C += alpha * accumulator.
  // v16/v17 are added into the 32 bytes at pCRow0, which then advances by
  // 32 bytes. Clobbers v8, v9 and d10.
  584. .macro SAVE4x1
  585. fmov alpha0, alpha
  586. ld1 {v8.2d, v9.2d}, [pCRow0]
  587. fmla v8.2d, v16.2d, alphaV0
  588. fmla v9.2d, v17.2d, alphaV0
  589. st1 {v8.2d, v9.2d}, [pCRow0]
  590. add pCRow0, pCRow0, #32
  591. .endm
  592. /******************************************************************************/
  // INIT2x1: clear the single accumulator of the 2x1 tile (v16).
  593. .macro INIT2x1
  594. fmov d16, xzr
  595. .endm
  // KERNEL2x1_SUB: one k-step for the 2x1 tile.
  // Loads one double of B (d8) and 2 doubles of A (v0) and accumulates
  // A times that scalar into v16.
  // Advances pB by 8 bytes and pA by 16 bytes.
  596. .macro KERNEL2x1_SUB
  597. ldr d8, [pB]
  598. add pB , pB, #8
  599. ld1 {v0.2d}, [pA]
  600. add pA , pA, #16
  601. fmla v16.2d, v0.2d, v8.d[0]
  602. .endm
  // SAVE2x1: write the 2x1 tile back as C += alpha * accumulator.
  // v16 is added into the 16 bytes at pCRow0, which then advances by
  // 16 bytes. Clobbers v8 and d10.
  603. .macro SAVE2x1
  604. fmov alpha0, alpha
  605. ld1 {v8.2d}, [pCRow0]
  606. fmla v8.2d, v16.2d, alphaV0
  607. st1 {v8.2d}, [pCRow0]
  608. add pCRow0, pCRow0, #16
  609. .endm
  610. /******************************************************************************/
  // INIT1x1: clear the scalar accumulator of the 1x1 tile (d16).
  611. .macro INIT1x1
  612. fmov d16, xzr
  613. .endm
  // KERNEL1x1_SUB: one k-step for the 1x1 tile, done with a scalar fused
  // multiply-add: d16 = d16 + d0 * d8, with d0 from pA and d8 from pB.
  // Advances pA and pB by 8 bytes each.
  614. .macro KERNEL1x1_SUB
  615. ldr d8, [pB]
  616. add pB , pB, #8
  617. ldr d0, [pA]
  618. add pA , pA, #8
  619. fmadd d16, d0, d8, d16
  620. .endm
  // SAVE1x1: write the 1x1 tile back as C += alpha * accumulator using a
  // scalar fmadd on the double at pCRow0, which then advances by 8 bytes.
  // Clobbers d8 and d10.
  621. .macro SAVE1x1
  622. fmov alpha0, alpha
  623. ldr d8, [pCRow0]
  624. fmadd d8, d16, alpha0, d8
  625. str d8, [pCRow0]
  626. add pCRow0, pCRow0, #8
  627. .endm
  628. /*******************************************************************************
  629. * End of macro definitions
  630. *******************************************************************************/
  // Function entry. PROLOGUE comes from common.h and is not visible here.
  631. PROLOGUE
  632. .align 5
  // Reserve an 11 * 16 = 176 byte save area (keeps sp 16-byte aligned).
  633. add sp, sp, #-(11 * 16)
  // d8-d15 are overwritten by the kernels (B operands, alpha, SAVE scratch).
  634. stp d8, d9, [sp, #(0 * 16)]
  635. stp d10, d11, [sp, #(1 * 16)]
  636. stp d12, d13, [sp, #(2 * 16)]
  637. stp d14, d15, [sp, #(3 * 16)]
  // NOTE(review): AAPCS64 lists only v8-v15 as callee-saved vector registers,
  // and nothing visible in this file writes x18-x28; these saves look
  // defensive. x18 is a platform-reserved register on some targets - confirm
  // that storing and later reloading it is acceptable there.
  638. stp d16, d17, [sp, #(4 * 16)]
  639. stp x18, x19, [sp, #(5 * 16)]
  640. stp x20, x21, [sp, #(6 * 16)]
  641. stp x22, x23, [sp, #(7 * 16)]
  642. stp x24, x25, [sp, #(8 * 16)]
  643. stp x26, x27, [sp, #(9 * 16)]
  644. str x28, [sp, #(10 * 16)]
  // Warm the first cache lines of both input panels.
  645. prfm PLDL1KEEP, [origPB]
  646. prfm PLDL1KEEP, [origPA]
  // Park alpha in a general register: d0 and d10 are both reused as data
  // registers by the kernels, and the SAVE macros reload d10 from here.
  647. fmov alpha, d0
  648. lsl LDC, LDC, #3 // ldc = ldc * 8 (elements -> bytes)
  649. mov pB, origPB
  // counterJ = number of 4-wide groups in the N direction.
  650. mov counterJ, origN
  651. asr counterJ, counterJ, #2 // J = J / 4
  652. cmp counterJ, #0
  653. ble dgemm_kernel_L2_BEGIN
  654. /******************************************************************************/
  // Start of one 4-wide group in the N direction: set up the four C line
  // pointers LDC bytes apart, step pC past the group, and rewind pA.
  655. dgemm_kernel_L4_BEGIN:
  656. mov pCRow0, pC
  657. add pCRow1, pCRow0, LDC
  658. add pCRow2, pCRow1, LDC
  659. add pCRow3, pCRow2, LDC
  660. add pC, pCRow3, LDC
  661. mov pA, origPA // pA = start of A array
  // 8x4 tiles: counterI = M / 8 passes, each producing one 8x4 tile of C.
  662. dgemm_kernel_L4_M8_BEGIN:
  663. mov counterI, origM
  664. asr counterI, counterI, #3 // counterI = counterI / 8
  665. cmp counterI, #0
  666. ble dgemm_kernel_L4_M4_BEGIN
  667. .align 5
  // Per-tile entry: pB restarts at the current B group, pA keeps advancing.
  668. dgemm_kernel_L4_M8_20:
  669. mov pB, origPB
  670. asr counterL , origK, #3 // L = K / 8
  671. cmp counterL , #2 // at least two groups of 8 k-steps (K >= 16)?
  672. blt dgemm_kernel_L4_M8_32
  // Opening group of 8 k-steps: _I initialises the accumulators with fmul
  // and preloads the next step; M2/M1 alternate between the register sets.
  673. KERNEL8x4_I
  674. KERNEL8x4_M2
  675. KERNEL8x4_M1
  676. KERNEL8x4_M2
  677. KERNEL8x4_M1
  678. KERNEL8x4_M2
  679. KERNEL8x4_M1
  680. KERNEL8x4_M2
  // Account for the opening and the closing group; anything left over is
  // handled by the steady-state loop below.
  681. subs counterL, counterL, #2 // subtract 2
  682. ble dgemm_kernel_L4_M8_22a
  683. .align 5
  // Steady state: 8 k-steps per pass, always loading one step ahead.
  684. dgemm_kernel_L4_M8_22:
  685. KERNEL8x4_M1
  686. KERNEL8x4_M2
  687. KERNEL8x4_M1
  688. KERNEL8x4_M2
  689. KERNEL8x4_M1
  690. KERNEL8x4_M2
  691. KERNEL8x4_M1
  692. KERNEL8x4_M2
  693. subs counterL, counterL, #1
  694. bgt dgemm_kernel_L4_M8_22
  695. .align 5
  // Closing group: ends with _E, which computes without loading ahead.
  696. dgemm_kernel_L4_M8_22a:
  697. KERNEL8x4_M1
  698. KERNEL8x4_M2
  699. KERNEL8x4_M1
  700. KERNEL8x4_M2
  701. KERNEL8x4_M1
  702. KERNEL8x4_M2
  703. KERNEL8x4_M1
  704. KERNEL8x4_E
  705. b dgemm_kernel_L4_M8_44
  706. .align 5
  // Fewer than two groups: counterL is 0 or 1 here. tst clears V and the
  // masked result is never negative, so the ble acts as branch-if-zero.
  707. dgemm_kernel_L4_M8_32:
  708. tst counterL, #1
  709. ble dgemm_kernel_L4_M8_40
  // Exactly one group of 8 k-steps: open with _I and close with _E.
  710. KERNEL8x4_I
  711. KERNEL8x4_M2
  712. KERNEL8x4_M1
  713. KERNEL8x4_M2
  714. KERNEL8x4_M1
  715. KERNEL8x4_M2
  716. KERNEL8x4_M1
  717. KERNEL8x4_E
  718. b dgemm_kernel_L4_M8_44
  // K < 8: no _I step ran, so the accumulators must be zeroed explicitly.
  719. dgemm_kernel_L4_M8_40:
  720. INIT8x4
  // Remainder: K % 8 single k-steps.
  721. dgemm_kernel_L4_M8_44:
  722. ands counterL , origK, #7
  723. ble dgemm_kernel_L4_M8_100
  724. .align 5
  725. dgemm_kernel_L4_M8_46:
  726. KERNEL8x4_SUB
  727. subs counterL, counterL, #1
  728. bne dgemm_kernel_L4_M8_46
  // Prefetch the start of the next A slice and of B, then write the tile.
  729. dgemm_kernel_L4_M8_100:
  730. prfm PLDL1KEEP, [pA]
  731. prfm PLDL1KEEP, [pA, #64]
  732. prfm PLDL1KEEP, [origPB]
  733. SAVE8x4
  734. dgemm_kernel_L4_M8_END:
  735. subs counterI, counterI, #1
  736. bne dgemm_kernel_L4_M8_20
  // M remainder for the 4-wide N group. The tst/ble pairs act as
  // branch-if-zero on the masked bits (tst clears V, result is non-negative).
  737. dgemm_kernel_L4_M4_BEGIN:
  738. mov counterI, origM
  739. tst counterI , #7 // M % 8 == 0: nothing left in M
  740. ble dgemm_kernel_L4_END
  741. tst counterI, #4 // bit 2 of M clear: no 4-wide slice
  742. ble dgemm_kernel_L4_M2_BEGIN
  // Single 4x4 tile (runs at most once per N group).
  743. dgemm_kernel_L4_M4_20:
  744. INIT4x4
  745. mov pB, origPB
  746. asr counterL , origK, #3 // counterL = K / 8
  747. cmp counterL , #0
  748. ble dgemm_kernel_L4_M4_40
  // Main K loop, unrolled 8 times.
  749. dgemm_kernel_L4_M4_22:
  750. KERNEL4x4_SUB
  751. KERNEL4x4_SUB
  752. KERNEL4x4_SUB
  753. KERNEL4x4_SUB
  754. KERNEL4x4_SUB
  755. KERNEL4x4_SUB
  756. KERNEL4x4_SUB
  757. KERNEL4x4_SUB
  758. subs counterL, counterL, #1
  759. bgt dgemm_kernel_L4_M4_22
  // Remainder K loop.
  760. dgemm_kernel_L4_M4_40:
  761. ands counterL , origK, #7 // counterL = K % 8
  762. ble dgemm_kernel_L4_M4_100
  763. dgemm_kernel_L4_M4_42:
  764. KERNEL4x4_SUB
  765. subs counterL, counterL, #1
  766. bgt dgemm_kernel_L4_M4_42
  767. dgemm_kernel_L4_M4_100:
  768. SAVE4x4
  769. dgemm_kernel_L4_M4_END:
  // 2-wide M remainder for the 4-wide N group (runs at most once).
  770. dgemm_kernel_L4_M2_BEGIN:
  771. mov counterI, origM
  772. tst counterI , #3 // M % 4 == 0: nothing left in M
  773. ble dgemm_kernel_L4_END
  774. tst counterI, #2 // bit 1 of M clear: no 2-wide slice
  775. ble dgemm_kernel_L4_M1_BEGIN
  776. dgemm_kernel_L4_M2_20:
  777. INIT2x4
  778. mov pB, origPB
  779. asr counterL , origK, #3 // counterL = K / 8
  780. cmp counterL , #0
  781. ble dgemm_kernel_L4_M2_40
  // Main K loop, unrolled 8 times.
  782. dgemm_kernel_L4_M2_22:
  783. KERNEL2x4_SUB
  784. KERNEL2x4_SUB
  785. KERNEL2x4_SUB
  786. KERNEL2x4_SUB
  787. KERNEL2x4_SUB
  788. KERNEL2x4_SUB
  789. KERNEL2x4_SUB
  790. KERNEL2x4_SUB
  791. subs counterL, counterL, #1
  792. bgt dgemm_kernel_L4_M2_22
  // Remainder K loop.
  793. dgemm_kernel_L4_M2_40:
  794. ands counterL , origK, #7 // counterL = K % 8
  795. ble dgemm_kernel_L4_M2_100
  796. dgemm_kernel_L4_M2_42:
  797. KERNEL2x4_SUB
  798. subs counterL, counterL, #1
  799. bgt dgemm_kernel_L4_M2_42
  800. dgemm_kernel_L4_M2_100:
  801. SAVE2x4
  802. dgemm_kernel_L4_M2_END:
  // 1-wide M remainder for the 4-wide N group. counterI still holds M,
  // loaded at dgemm_kernel_L4_M2_BEGIN.
  803. dgemm_kernel_L4_M1_BEGIN:
  804. tst counterI, #1 // M even: no single slice left
  805. ble dgemm_kernel_L4_END
  806. dgemm_kernel_L4_M1_20:
  807. INIT1x4
  808. mov pB, origPB
  809. asr counterL , origK, #3 // counterL = K / 8
  810. cmp counterL , #0
  811. ble dgemm_kernel_L4_M1_40
  // Main K loop, unrolled 8 times.
  812. dgemm_kernel_L4_M1_22:
  813. KERNEL1x4_SUB
  814. KERNEL1x4_SUB
  815. KERNEL1x4_SUB
  816. KERNEL1x4_SUB
  817. KERNEL1x4_SUB
  818. KERNEL1x4_SUB
  819. KERNEL1x4_SUB
  820. KERNEL1x4_SUB
  821. subs counterL, counterL, #1
  822. bgt dgemm_kernel_L4_M1_22
  // Remainder K loop.
  823. dgemm_kernel_L4_M1_40:
  824. ands counterL , origK, #7 // counterL = K % 8
  825. ble dgemm_kernel_L4_M1_100
  826. dgemm_kernel_L4_M1_42:
  827. KERNEL1x4_SUB
  828. subs counterL, counterL, #1
  829. bgt dgemm_kernel_L4_M1_42
  830. dgemm_kernel_L4_M1_100:
  831. SAVE1x4
  // End of one 4-wide N group: step origPB past the K * 4 doubles just
  // consumed (K << 5 bytes) and loop while groups remain.
  832. dgemm_kernel_L4_END:
  833. lsl temp, origK, #5
  834. add origPB, origPB, temp // B = B + K * 4 * 8
  835. subs counterJ, counterJ , #1 // j--
  836. bgt dgemm_kernel_L4_BEGIN
  837. /******************************************************************************/
  // N remainder. The tst/ble pairs act as branch-if-zero on the masked bits.
  838. dgemm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  839. mov counterJ , origN
  840. tst counterJ , #3
  841. ble dgemm_kernel_L999 // N % 4 == 0: nothing left, done
  842. tst counterJ , #2
  843. ble dgemm_kernel_L1_BEGIN
  // 2-wide N group: the SAVE macros derive the second C line from
  // pCRow0 + LDC; pC moves on by two lines.
  844. mov pCRow0, pC // pCRow0 = pC
  845. add pC,pC,LDC, lsl #1
  846. mov pA, origPA // pA = A
  // 8x2 tiles: counterI = M / 8 passes.
  847. dgemm_kernel_L2_M8_BEGIN:
  848. mov counterI, origM
  849. asr counterI, counterI, #3 // counterI = counterI / 8
  850. cmp counterI, #0
  851. ble dgemm_kernel_L2_M4_BEGIN
  // Per-tile entry: clear accumulators and restart pB at the current B group.
  852. dgemm_kernel_L2_M8_20:
  853. INIT8x2
  854. mov pB, origPB
  855. asr counterL , origK, #3 // counterL = K / 8
  856. cmp counterL,#0
  857. ble dgemm_kernel_L2_M8_40
  858. .align 5
  // Main K loop, unrolled 8 times.
  859. dgemm_kernel_L2_M8_22:
  860. KERNEL8x2_SUB
  861. KERNEL8x2_SUB
  862. KERNEL8x2_SUB
  863. KERNEL8x2_SUB
  864. KERNEL8x2_SUB
  865. KERNEL8x2_SUB
  866. KERNEL8x2_SUB
  867. KERNEL8x2_SUB
  868. subs counterL, counterL, #1
  869. bgt dgemm_kernel_L2_M8_22
  // Remainder K loop.
  870. dgemm_kernel_L2_M8_40:
  871. ands counterL , origK, #7 // counterL = K % 8
  872. ble dgemm_kernel_L2_M8_100
  873. dgemm_kernel_L2_M8_42:
  874. KERNEL8x2_SUB
  875. subs counterL, counterL, #1
  876. bgt dgemm_kernel_L2_M8_42
  877. dgemm_kernel_L2_M8_100:
  878. SAVE8x2
  879. dgemm_kernel_L2_M8_END:
  880. subs counterI, counterI, #1
  881. bgt dgemm_kernel_L2_M8_20
  // M remainder for the 2-wide N group: optional single 4x2 tile.
  882. dgemm_kernel_L2_M4_BEGIN:
  883. mov counterI, origM
  884. tst counterI , #7 // M % 8 == 0: nothing left in M
  885. ble dgemm_kernel_L2_END
  886. tst counterI, #4 // bit 2 of M clear: no 4-wide slice
  887. ble dgemm_kernel_L2_M2_BEGIN
  888. dgemm_kernel_L2_M4_20:
  889. INIT4x2
  890. mov pB, origPB
  891. asr counterL , origK, #3 // counterL = K / 8
  892. cmp counterL,#0
  893. ble dgemm_kernel_L2_M4_40
  894. .align 5
  // Main K loop, unrolled 8 times.
  895. dgemm_kernel_L2_M4_22:
  896. KERNEL4x2_SUB
  897. KERNEL4x2_SUB
  898. KERNEL4x2_SUB
  899. KERNEL4x2_SUB
  900. KERNEL4x2_SUB
  901. KERNEL4x2_SUB
  902. KERNEL4x2_SUB
  903. KERNEL4x2_SUB
  904. subs counterL, counterL, #1
  905. bgt dgemm_kernel_L2_M4_22
  // Remainder K loop.
  906. dgemm_kernel_L2_M4_40:
  907. ands counterL , origK, #7 // counterL = K % 8
  908. ble dgemm_kernel_L2_M4_100
  909. dgemm_kernel_L2_M4_42:
  910. KERNEL4x2_SUB
  911. subs counterL, counterL, #1
  912. bgt dgemm_kernel_L2_M4_42
  913. dgemm_kernel_L2_M4_100:
  914. SAVE4x2
  915. dgemm_kernel_L2_M4_END:
  // 2-wide M remainder for the 2-wide N group (runs at most once).
  916. dgemm_kernel_L2_M2_BEGIN:
  917. mov counterI, origM
  918. tst counterI , #3 // M % 4 == 0: nothing left in M
  919. ble dgemm_kernel_L2_END
  920. tst counterI, #2 // bit 1 of M clear: no 2-wide slice
  921. ble dgemm_kernel_L2_M1_BEGIN
  922. dgemm_kernel_L2_M2_20:
  923. INIT2x2
  924. mov pB, origPB
  925. asr counterL , origK, #3 // counterL = K / 8
  926. cmp counterL,#0
  927. ble dgemm_kernel_L2_M2_40
  // Main K loop, unrolled 8 times.
  928. dgemm_kernel_L2_M2_22:
  929. KERNEL2x2_SUB
  930. KERNEL2x2_SUB
  931. KERNEL2x2_SUB
  932. KERNEL2x2_SUB
  933. KERNEL2x2_SUB
  934. KERNEL2x2_SUB
  935. KERNEL2x2_SUB
  936. KERNEL2x2_SUB
  937. subs counterL, counterL, #1
  938. bgt dgemm_kernel_L2_M2_22
  // Remainder K loop.
  939. dgemm_kernel_L2_M2_40:
  940. ands counterL , origK, #7 // counterL = K % 8
  941. ble dgemm_kernel_L2_M2_100
  942. dgemm_kernel_L2_M2_42:
  943. KERNEL2x2_SUB
  944. subs counterL, counterL, #1
  945. bgt dgemm_kernel_L2_M2_42
  946. dgemm_kernel_L2_M2_100:
  947. SAVE2x2
  948. dgemm_kernel_L2_M2_END:
  // 1-wide M remainder for the 2-wide N group. counterI still holds M,
  // loaded at dgemm_kernel_L2_M2_BEGIN.
  949. dgemm_kernel_L2_M1_BEGIN:
  950. tst counterI, #1 // M even: no single slice left
  951. ble dgemm_kernel_L2_END
  952. dgemm_kernel_L2_M1_20:
  953. INIT1x2
  954. mov pB, origPB
  955. asr counterL , origK, #3 // counterL = K / 8
  956. cmp counterL, #0
  957. ble dgemm_kernel_L2_M1_40
  // Main K loop, unrolled 8 times.
  958. dgemm_kernel_L2_M1_22:
  959. KERNEL1x2_SUB
  960. KERNEL1x2_SUB
  961. KERNEL1x2_SUB
  962. KERNEL1x2_SUB
  963. KERNEL1x2_SUB
  964. KERNEL1x2_SUB
  965. KERNEL1x2_SUB
  966. KERNEL1x2_SUB
  967. subs counterL, counterL, #1
  968. bgt dgemm_kernel_L2_M1_22
  // Remainder K loop.
  969. dgemm_kernel_L2_M1_40:
  970. ands counterL , origK, #7 // counterL = K % 8
  971. ble dgemm_kernel_L2_M1_100
  972. dgemm_kernel_L2_M1_42:
  973. KERNEL1x2_SUB
  974. subs counterL, counterL, #1
  975. bgt dgemm_kernel_L2_M1_42
  976. dgemm_kernel_L2_M1_100:
  977. SAVE1x2
  // End of the 2-wide N group: step origPB past the K * 2 doubles consumed.
  978. dgemm_kernel_L2_END:
  979. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  980. /******************************************************************************/
  // Last single line in the N direction, present only when N is odd.
  981. dgemm_kernel_L1_BEGIN:
  982. mov counterJ , origN
  983. tst counterJ , #1
  984. ble dgemm_kernel_L999 // done
  985. mov pCRow0, pC // pCRow0 = C
  986. add pC , pC , LDC // Update pC to point to next
  987. mov pA, origPA // pA = A
  // 8x1 tiles: counterI = M / 8 passes.
  988. dgemm_kernel_L1_M8_BEGIN:
  989. mov counterI, origM
  990. asr counterI, counterI, #3 // counterI = counterI / 8
  991. cmp counterI, #0
  992. ble dgemm_kernel_L1_M4_BEGIN
  // Per-tile entry: clear accumulators and restart pB at the current B group.
  993. dgemm_kernel_L1_M8_20:
  994. INIT8x1
  995. mov pB, origPB
  996. asr counterL , origK, #3 // counterL = K / 8
  997. cmp counterL , #0
  998. ble dgemm_kernel_L1_M8_40
  999. .align 5
  // Main K loop, unrolled 8 times.
  1000. dgemm_kernel_L1_M8_22:
  1001. KERNEL8x1_SUB
  1002. KERNEL8x1_SUB
  1003. KERNEL8x1_SUB
  1004. KERNEL8x1_SUB
  1005. KERNEL8x1_SUB
  1006. KERNEL8x1_SUB
  1007. KERNEL8x1_SUB
  1008. KERNEL8x1_SUB
  1009. subs counterL, counterL, #1
  1010. bgt dgemm_kernel_L1_M8_22
  // Remainder K loop.
  1011. dgemm_kernel_L1_M8_40:
  1012. ands counterL , origK, #7 // counterL = K % 8
  1013. ble dgemm_kernel_L1_M8_100
  1014. dgemm_kernel_L1_M8_42:
  1015. KERNEL8x1_SUB
  1016. subs counterL, counterL, #1
  1017. bgt dgemm_kernel_L1_M8_42
  1018. dgemm_kernel_L1_M8_100:
  1019. SAVE8x1
  1020. dgemm_kernel_L1_M8_END:
  1021. subs counterI, counterI, #1
  1022. bgt dgemm_kernel_L1_M8_20
  // M remainder for the last N line: optional single 4x1 tile.
  1023. dgemm_kernel_L1_M4_BEGIN:
  1024. mov counterI, origM
  1025. tst counterI , #7 // M % 8 == 0: nothing left in M
  1026. ble dgemm_kernel_L1_END
  1027. tst counterI, #4 // bit 2 of M clear: no 4-wide slice
  1028. ble dgemm_kernel_L1_M2_BEGIN
  1029. dgemm_kernel_L1_M4_20:
  1030. INIT4x1
  1031. mov pB, origPB
  1032. asr counterL , origK, #3 // counterL = K / 8
  1033. cmp counterL , #0
  1034. ble dgemm_kernel_L1_M4_40
  1035. .align 5
  // Main K loop, unrolled 8 times.
  1036. dgemm_kernel_L1_M4_22:
  1037. KERNEL4x1_SUB
  1038. KERNEL4x1_SUB
  1039. KERNEL4x1_SUB
  1040. KERNEL4x1_SUB
  1041. KERNEL4x1_SUB
  1042. KERNEL4x1_SUB
  1043. KERNEL4x1_SUB
  1044. KERNEL4x1_SUB
  1045. subs counterL, counterL, #1
  1046. bgt dgemm_kernel_L1_M4_22
  // Remainder K loop.
  1047. dgemm_kernel_L1_M4_40:
  1048. ands counterL , origK, #7 // counterL = K % 8
  1049. ble dgemm_kernel_L1_M4_100
  1050. dgemm_kernel_L1_M4_42:
  1051. KERNEL4x1_SUB
  1052. subs counterL, counterL, #1
  1053. bgt dgemm_kernel_L1_M4_42
  1054. dgemm_kernel_L1_M4_100:
  1055. SAVE4x1
  1056. dgemm_kernel_L1_M4_END:
  // 2-wide M remainder for the last N line (runs at most once).
  1057. dgemm_kernel_L1_M2_BEGIN:
  1058. mov counterI, origM
  1059. tst counterI , #3 // M % 4 == 0: nothing left in M
  1060. ble dgemm_kernel_L1_END
  1061. tst counterI, #2 // bit 1 of M clear: no 2-wide slice
  1062. ble dgemm_kernel_L1_M1_BEGIN
  1063. dgemm_kernel_L1_M2_20:
  1064. INIT2x1
  1065. mov pB, origPB
  1066. asr counterL , origK, #3 // counterL = K / 8
  1067. cmp counterL , #0
  1068. ble dgemm_kernel_L1_M2_40
  // Main K loop, unrolled 8 times.
  1069. dgemm_kernel_L1_M2_22:
  1070. KERNEL2x1_SUB
  1071. KERNEL2x1_SUB
  1072. KERNEL2x1_SUB
  1073. KERNEL2x1_SUB
  1074. KERNEL2x1_SUB
  1075. KERNEL2x1_SUB
  1076. KERNEL2x1_SUB
  1077. KERNEL2x1_SUB
  1078. subs counterL, counterL, #1
  1079. bgt dgemm_kernel_L1_M2_22
  // Remainder K loop.
  1080. dgemm_kernel_L1_M2_40:
  1081. ands counterL , origK, #7 // counterL = K % 8
  1082. ble dgemm_kernel_L1_M2_100
  1083. dgemm_kernel_L1_M2_42:
  1084. KERNEL2x1_SUB
  1085. subs counterL, counterL, #1
  1086. bgt dgemm_kernel_L1_M2_42
  1087. dgemm_kernel_L1_M2_100:
  1088. SAVE2x1
  1089. dgemm_kernel_L1_M2_END:
  // Final 1x1 element, present only when M is odd. counterI still holds M,
  // loaded at dgemm_kernel_L1_M2_BEGIN.
  1090. dgemm_kernel_L1_M1_BEGIN:
  1091. tst counterI, #1 // M even: no single element left
  1092. ble dgemm_kernel_L1_END
  1093. dgemm_kernel_L1_M1_20:
  1094. INIT1x1
  1095. mov pB, origPB
  1096. asr counterL , origK, #3 // counterL = K / 8
  1097. cmp counterL , #0
  1098. ble dgemm_kernel_L1_M1_40
  // Main K loop, unrolled 8 times.
  1099. dgemm_kernel_L1_M1_22:
  1100. KERNEL1x1_SUB
  1101. KERNEL1x1_SUB
  1102. KERNEL1x1_SUB
  1103. KERNEL1x1_SUB
  1104. KERNEL1x1_SUB
  1105. KERNEL1x1_SUB
  1106. KERNEL1x1_SUB
  1107. KERNEL1x1_SUB
  1108. subs counterL, counterL, #1
  1109. bgt dgemm_kernel_L1_M1_22
  // Remainder K loop.
  1110. dgemm_kernel_L1_M1_40:
  1111. ands counterL , origK, #7 // counterL = K % 8
  1112. ble dgemm_kernel_L1_M1_100
  1113. dgemm_kernel_L1_M1_42:
  1114. KERNEL1x1_SUB
  1115. subs counterL, counterL, #1
  1116. bgt dgemm_kernel_L1_M1_42
  1117. dgemm_kernel_L1_M1_100:
  1118. SAVE1x1
  // Common exit: return 0, reload every register stored by the entry code
  // from the same stack slots, release the 176-byte save area and return.
  1119. dgemm_kernel_L1_END:
  1120. dgemm_kernel_L999:
  1121. mov x0, #0 // set return value
  1122. ldp d8, d9, [sp, #(0 * 16)]
  1123. ldp d10, d11, [sp, #(1 * 16)]
  1124. ldp d12, d13, [sp, #(2 * 16)]
  1125. ldp d14, d15, [sp, #(3 * 16)]
  1126. ldp d16, d17, [sp, #(4 * 16)]
  // NOTE(review): this writes x18, which some platforms reserve - confirm
  // that reloading it here is acceptable on every supported target.
  1127. ldp x18, x19, [sp, #(5 * 16)]
  1128. ldp x20, x21, [sp, #(6 * 16)]
  1129. ldp x22, x23, [sp, #(7 * 16)]
  1130. ldp x24, x25, [sp, #(8 * 16)]
  1131. ldp x26, x27, [sp, #(9 * 16)]
  1132. ldr x28, [sp, #(10 * 16)]
  1133. add sp, sp, #(11*16)
  1134. ret
  1135. EPILOGUE