You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dtrmm_kernel_sve_v1x8.S 23 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* x0 x1 x2 d0 x3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc,BLASLONG offset )*/
  // NOTE(review): alpha is read from d0 and x7 is consumed as `offset` by the code below;
  // the C type of `offset` is presumably BLASLONG like the other counts - confirm against the caller.
  // Incoming arguments (x6 is rescaled to bytes in the prologue, x4/x5 are advanced per column panel).
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  38. #define offset x7
  // Loop counters and working pointers.
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define lanes x15
  47. #define pA x16
  48. #define alpha x17
  49. //#define temp x18
  // TRMM bookkeeping registers; x19-x21 are spilled in the prologue and restored before ret.
  50. #define tempOffset x19
  51. #define tempK x20
  52. #define temp x21
  // alpha0 is not referenced anywhere else in the visible code; alphaZ is alpha in every 64-bit lane of z2.
  53. #define alpha0 d10
  54. #define alphaZ z2.d
  // Prefetch distances in bytes, used as immediate offsets by the prfm instructions in the macros.
  55. #define A_PRE_SIZE 1536
  56. #define B_PRE_SIZE 512
  57. #define C_PRE_SIZE 128
  // General-purpose register map (xN).
  58. // 00 origM
  59. // 01 origN
  60. // 02 origK
  61. // 03 origPA
  62. // 04 origPB
  63. // 05 pC
  64. // 06 origLDC -> LDC
  65. // 07 offset
  66. // 08 counterL
  67. // 09 counterI
  68. // 10 counterJ
  69. // 11 pB
  70. // 12 pCRow0
  71. // 13 pCRow1
  72. // 14 pCRow2
  73. // 15 lanes
  74. // 16 pA
  75. // 17 alpha
  76. // 18 saved/restored by this kernel, otherwise unused in the visible code
  77. // 19 must save tempOffset
  78. // 20 must save tempK
  79. // 21 must save temp
  80. // 22 must save
  81. // 23 must save
  82. // 24 must save
  83. // 25 must save
  84. // 26 must save
  85. // 27 must save
  86. // 28 must save
  87. // 29 frame
  88. // 30 link
  89. // 31 sp
  // Vector register map (vN / zN). Only d8-d17 are spilled by the prologue.
  90. //v00 ALPHA -> pA0_0
  91. //v01 pA0_1
  92. //v02 alphaZ (alpha broadcast to every lane)
  93. //v03
  94. //v04
  95. //v05
  96. //v06
  97. //v07
  98. //v08 must save pB0_0
  99. //v09 must save pB0_1
  100. //v10 must save pB0_2
  101. //v11 must save pB0_3
  102. //v12 must save pB0_4
  103. //v13 must save pB0_5
  104. //v14 must save pB0_6
  105. //v15 must save pB0_7
  106. //v16 C0 (d16 is spilled in the prologue)
  107. //v17 C1 (d17 is spilled in the prologue)
  108. //v18 C2
  109. //v19 C3
  110. //v20 C4
  111. //v21 C5
  112. //v22 C6
  113. //v23 C7
  114. /*******************************************************************************
  115. * Macro definitions
  116. *******************************************************************************/
  // INITv1x8: clear the eight accumulator vectors z16-z23 (one per C column of a v1x8 tile).
  // `mov zN.d, #0` is the architectural alias of `dup zN.d, #0`; the encoding is the same.
  117. .macro INITv1x8
  118. mov z16.d, #0
  119. mov z17.d, #0
  120. mov z18.d, #0
  121. mov z19.d, #0
  122. mov z20.d, #0
  123. mov z21.d, #0
  124. mov z22.d, #0
  125. mov z23.d, #0
  126. .endm
  // KERNELv1x8_I: opening step of the software-pipelined 8-column loop.
  // Loads two A vectors (z0, z1) and two consecutive groups of eight broadcast B values,
  // accumulates z0 * (first B group) into z16-z23, and leaves z1 plus the second B group
  // (z8-z15) loaded for the KERNELv1x8_M2 that follows.
  // Net effect on pointers: pA += 2 * lanes * 8 bytes, pB += 128 bytes.
  127. .macro KERNELv1x8_I
  128. ld1d z0.d, p1/z, [pA] // A vector for this k-step; lanes outside p1 are zeroed
  129. ld1d z1.d, p1/z, [pA, lanes, lsl #3] // A vector of the next k-step
  130. add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
  131. ld1rd z8.d, p0/z, [pB] // each ld1rd replicates one double of B into all lanes
  132. ld1rd z9.d, p0/z, [pB, 8]
  133. ld1rd z10.d, p0/z, [pB, 16]
  134. ld1rd z11.d, p0/z, [pB, 24]
  135. ld1rd z12.d, p0/z, [pB, 32]
  136. ld1rd z13.d, p0/z, [pB, 40]
  137. ld1rd z14.d, p0/z, [pB, 48]
  138. ld1rd z15.d, p0/z, [pB, 56]
  139. add pB, pB, 64 // 8 doubles of B consumed
  140. fmla z16.d, p1/m, z0.d, z8.d // C0 += A * B0 in the active lanes
  141. ld1rd z8.d, p0/z, [pB] // z8 is free again: reload it with the next k-step's B0
  142. fmla z17.d, p1/m, z0.d, z9.d
  143. ld1rd z9.d, p0/z, [pB, 8]
  144. fmla z18.d, p1/m, z0.d, z10.d
  145. ld1rd z10.d, p0/z, [pB, 16]
  146. fmla z19.d, p1/m, z0.d, z11.d
  147. ld1rd z11.d, p0/z, [pB, 24]
  148. fmla z20.d, p1/m, z0.d, z12.d
  149. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch A ahead of the current position
  150. ld1rd z12.d, p0/z, [pB, 32]
  151. fmla z21.d, p1/m, z0.d, z13.d
  152. ld1rd z13.d, p0/z, [pB, 40]
  153. fmla z22.d, p1/m, z0.d, z14.d
  154. ld1rd z14.d, p0/z, [pB, 48]
  155. fmla z23.d, p1/m, z0.d, z15.d
  156. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  157. ld1rd z15.d, p0/z, [pB, 56]
  158. add pB, pB, 64 // second group of 8 B doubles is now in z8-z15
  159. .endm
  // KERNELv1x8_M1: pipelined middle step that multiplies with z0.
  // Expects z0 and z8-z15 to be loaded already; accumulates z0 * z8..z15 into z16-z23
  // while loading the next A vector into z1 and the next eight B values into z8-z15.
  // Pointers: pA += lanes * 8 bytes, pB += 64 bytes. Must be followed by _M2 or _E.
  160. .macro KERNELv1x8_M1
  161. ld1d z1.d, p1/z, [pA] // A vector for the following step
  162. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
  163. fmla z16.d, p1/m, z0.d, z8.d
  164. ld1rd z8.d, p0/z, [pB] // reload each B register right after its last use
  165. fmla z17.d, p1/m, z0.d, z9.d
  166. ld1rd z9.d, p0/z, [pB, 8]
  167. fmla z18.d, p1/m, z0.d, z10.d
  168. ld1rd z10.d, p0/z, [pB, 16]
  169. fmla z19.d, p1/m, z0.d, z11.d
  170. ld1rd z11.d, p0/z, [pB, 24]
  171. fmla z20.d, p1/m, z0.d, z12.d
  172. prfm PLDL1KEEP, [pA, #A_PRE_SIZE] // prefetch A ahead of the current position
  173. ld1rd z12.d, p0/z, [pB, 32]
  174. fmla z21.d, p1/m, z0.d, z13.d
  175. ld1rd z13.d, p0/z, [pB, 40]
  176. fmla z22.d, p1/m, z0.d, z14.d
  177. ld1rd z14.d, p0/z, [pB, 48]
  178. fmla z23.d, p1/m, z0.d, z15.d
  179. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  180. ld1rd z15.d, p0/z, [pB, 56]
  181. add pB, pB, 64
  182. .endm
  // KERNELv1x8_M2: pipelined middle step that multiplies with z1 (counterpart of _M1).
  // Expects z1 and z8-z15 to be loaded already; accumulates z1 * z8..z15 into z16-z23
  // while loading the next A vector into z0 and the next eight B values into z8-z15.
  // Pointers: pA += lanes * 8 bytes, pB += 64 bytes. Must be followed by _M1.
  183. .macro KERNELv1x8_M2
  184. ld1d z0.d, p1/z, [pA] // A vector for the following step
  185. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
  186. fmla z16.d, p1/m, z1.d, z8.d
  187. ld1rd z8.d, p0/z, [pB] // reload each B register right after its last use
  188. fmla z17.d, p1/m, z1.d, z9.d
  189. ld1rd z9.d, p0/z, [pB, 8]
  190. fmla z18.d, p1/m, z1.d, z10.d
  191. ld1rd z10.d, p0/z, [pB, 16]
  192. fmla z19.d, p1/m, z1.d, z11.d
  193. ld1rd z11.d, p0/z, [pB, 24]
  194. fmla z20.d, p1/m, z1.d, z12.d
  195. ld1rd z12.d, p0/z, [pB, 32]
  196. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // this variant prefetches B rather than A
  197. fmla z21.d, p1/m, z1.d, z13.d
  198. ld1rd z13.d, p0/z, [pB, 40]
  199. fmla z22.d, p1/m, z1.d, z14.d
  200. ld1rd z14.d, p0/z, [pB, 48]
  201. fmla z23.d, p1/m, z1.d, z15.d
  202. ld1rd z15.d, p0/z, [pB, 56]
  203. add pB, pB, 64
  204. .endm
  // KERNELv1x8_E: closing step of the pipelined loop.
  // Accumulates z1 * z8..z15 (loaded by the preceding _I or _M1) into z16-z23 and
  // loads nothing new, so pA and pB are left unchanged.
  205. .macro KERNELv1x8_E
  206. fmla z16.d, p1/m, z1.d, z8.d
  207. fmla z17.d, p1/m, z1.d, z9.d
  208. fmla z18.d, p1/m, z1.d, z10.d
  209. fmla z19.d, p1/m, z1.d, z11.d
  210. fmla z20.d, p1/m, z1.d, z12.d
  211. prfm PLDL1KEEP, [pB, #B_PRE_SIZE] // prefetch B ahead of the current position
  212. fmla z21.d, p1/m, z1.d, z13.d
  213. fmla z22.d, p1/m, z1.d, z14.d
  214. fmla z23.d, p1/m, z1.d, z15.d
  215. .endm
  // KERNELv1x8_SUB: one self-contained k-step for a v1x8 tile (used for the K % 8 tail).
  // Loads one A vector and eight broadcast B values, then accumulates into z16-z23.
  // Pointers: pA += lanes * 8 bytes, pB += 64 bytes. No state is carried between calls.
  216. .macro KERNELv1x8_SUB
  217. ld1d z0.d, p1/z, [pA] // A vector; lanes outside p1 are zeroed
  218. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
  219. ld1rd z8.d, p0/z, [pB] // replicate each of the 8 B doubles across a vector
  220. ld1rd z9.d, p0/z, [pB, 8]
  221. ld1rd z10.d, p0/z, [pB, 16]
  222. ld1rd z11.d, p0/z, [pB, 24]
  223. ld1rd z12.d, p0/z, [pB, 32]
  224. ld1rd z13.d, p0/z, [pB, 40]
  225. ld1rd z14.d, p0/z, [pB, 48]
  226. ld1rd z15.d, p0/z, [pB, 56]
  227. add pB, pB, 64
  228. fmla z16.d, p1/m, z0.d, z8.d
  229. fmla z17.d, p1/m, z0.d, z9.d
  230. fmla z18.d, p1/m, z0.d, z10.d
  231. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  232. fmla z19.d, p1/m, z0.d, z11.d
  233. fmla z20.d, p1/m, z0.d, z12.d
  234. fmla z21.d, p1/m, z0.d, z13.d
  235. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  236. fmla z22.d, p1/m, z0.d, z14.d
  237. fmla z23.d, p1/m, z0.d, z15.d
  238. .endm
  // SAVEv1x8: scale the eight accumulators by alpha and store them to eight C columns.
  // Column j is written at pCRow0 + j * LDC (LDC is in bytes here); pCRow1/pCRow2 are
  // used alternately as the running column pointer. C is only written, never read,
  // so the previous contents of the tile are overwritten. Stores are masked by p1.
  // On exit pCRow0 has advanced by lanes * 8 bytes to the next M tile.
  239. .macro SAVEv1x8
  240. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  241. add pCRow1, pCRow0, LDC // address of column 1
  242. fmul z16.d, p1/m, z16.d, alphaZ // C0 = alpha * accumulator
  243. st1d z16.d, p1, [pCRow0]
  244. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  245. add pCRow2, pCRow1, LDC // address of column 2
  246. fmul z17.d, p1/m, z17.d, alphaZ
  247. st1d z17.d, p1, [pCRow1]
  248. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  249. add pCRow1, pCRow2, LDC // address of column 3
  250. fmul z18.d, p1/m, z18.d, alphaZ
  251. st1d z18.d, p1, [pCRow2]
  252. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  253. add pCRow2, pCRow1, LDC // address of column 4
  254. fmul z19.d, p1/m, z19.d, alphaZ
  255. st1d z19.d, p1, [pCRow1]
  256. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  257. add pCRow1, pCRow2, LDC // address of column 5
  258. fmul z20.d, p1/m, z20.d, alphaZ
  259. st1d z20.d, p1, [pCRow2]
  260. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  261. add pCRow2, pCRow1, LDC // address of column 6
  262. fmul z21.d, p1/m, z21.d, alphaZ
  263. st1d z21.d, p1, [pCRow1]
  264. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  265. add pCRow1, pCRow2, LDC // address of column 7
  266. fmul z22.d, p1/m, z22.d, alphaZ
  267. st1d z22.d, p1, [pCRow2]
  268. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  269. fmul z23.d, p1/m, z23.d, alphaZ
  270. st1d z23.d, p1, [pCRow1]
  271. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  272. .endm
  273. /******************************************************************************/
  // INITv1x4: clear the four accumulator vectors z16-z19 of a v1x4 tile.
  // `mov zN.d, #0` is the architectural alias of `dup zN.d, #0`; the encoding is the same.
  274. .macro INITv1x4
  275. mov z16.d, #0
  276. mov z17.d, #0
  277. mov z18.d, #0
  278. mov z19.d, #0
  279. .endm
  // KERNELv1x4_SUB: one k-step for a v1x4 tile.
  // Loads one A vector and four broadcast B values, then accumulates into z16-z19.
  // Pointers: pA += lanes * 8 bytes, pB += 32 bytes.
  280. .macro KERNELv1x4_SUB
  281. ld1d z0.d, p1/z, [pA] // A vector; lanes outside p1 are zeroed
  282. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
  283. ld1rd z8.d, p0/z, [pB] // replicate each of the 4 B doubles across a vector
  284. ld1rd z9.d, p0/z, [pB, 8]
  285. ld1rd z10.d, p0/z, [pB, 16]
  286. ld1rd z11.d, p0/z, [pB, 24]
  287. add pB, pB, 32
  288. fmla z16.d, p1/m, z0.d, z8.d
  289. fmla z17.d, p1/m, z0.d, z9.d
  290. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  291. fmla z18.d, p1/m, z0.d, z10.d
  292. fmla z19.d, p1/m, z0.d, z11.d
  293. .endm
  // SAVEv1x4: scale z16-z19 by alpha and store them to four C columns spaced LDC bytes
  // apart, starting at pCRow0. C is only written (masked by p1), never read.
  // On exit pCRow0 has advanced by lanes * 8 bytes to the next M tile.
  294. .macro SAVEv1x4
  295. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  296. add pCRow1, pCRow0, LDC // address of column 1
  297. fmul z16.d, p1/m, z16.d, alphaZ // C0 = alpha * accumulator
  298. st1d z16.d, p1, [pCRow0]
  299. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  300. add pCRow2, pCRow1, LDC // address of column 2
  301. fmul z17.d, p1/m, z17.d, alphaZ
  302. st1d z17.d, p1, [pCRow1]
  303. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  304. add pCRow1, pCRow2, LDC // address of column 3
  305. fmul z18.d, p1/m, z18.d, alphaZ
  306. st1d z18.d, p1, [pCRow2]
  307. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  308. fmul z19.d, p1/m, z19.d, alphaZ
  309. st1d z19.d, p1, [pCRow1]
  310. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  311. .endm
  312. /******************************************************************************/
  // INITv1x2: clear the two accumulator vectors z16-z17 of a v1x2 tile.
  // `mov zN.d, #0` is the architectural alias of `dup zN.d, #0`; the encoding is the same.
  313. .macro INITv1x2
  314. mov z16.d, #0
  315. mov z17.d, #0
  316. .endm
  // KERNELv1x2_SUB: one k-step for a v1x2 tile.
  // Loads one A vector and two broadcast B values, then accumulates into z16-z17.
  // Pointers: pA += lanes * 8 bytes, pB += 16 bytes.
  317. .macro KERNELv1x2_SUB
  318. ld1d z0.d, p1/z, [pA] // A vector; lanes outside p1 are zeroed
  319. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
  320. ld1rd z8.d, p0/z, [pB] // replicate each of the 2 B doubles across a vector
  321. ld1rd z9.d, p0/z, [pB, 8]
  322. add pB, pB, 16
  323. fmla z16.d, p1/m, z0.d, z8.d
  324. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  325. fmla z17.d, p1/m, z0.d, z9.d
  326. .endm
  // SAVEv1x2: scale z16-z17 by alpha and store them to two C columns (pCRow0 and
  // pCRow0 + LDC bytes). C is only written (masked by p1), never read.
  // On exit pCRow0 has advanced by lanes * 8 bytes to the next M tile.
  327. .macro SAVEv1x2
  328. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  329. add pCRow1, pCRow0, LDC // address of column 1
  330. fmul z16.d, p1/m, z16.d, alphaZ // C0 = alpha * accumulator
  331. st1d z16.d, p1, [pCRow0]
  332. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  333. fmul z17.d, p1/m, z17.d, alphaZ
  334. st1d z17.d, p1, [pCRow1]
  335. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  336. .endm
  337. /******************************************************************************/
  // INITv1x1: clear the single accumulator vector z16 of a v1x1 tile.
  // `mov zN.d, #0` is the architectural alias of `dup zN.d, #0`; the encoding is the same.
  338. .macro INITv1x1
  339. mov z16.d, #0
  340. .endm
  // KERNELv1x1_SUB: one k-step for a v1x1 tile.
  // Loads one A vector and one broadcast B value, then accumulates into z16.
  // Pointers: pA += lanes * 8 bytes, pB += 8 bytes.
  341. .macro KERNELv1x1_SUB
  342. ld1d z0.d, p1/z, [pA] // A vector; lanes outside p1 are zeroed
  343. add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8
  344. ld1rd z8.d, p0/z, [pB] // replicate the single B double across a vector
  345. add pB, pB, 8
  346. fmla z16.d, p1/m, z0.d, z8.d
  347. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  348. .endm
  // SAVEv1x1: scale z16 by alpha and store it to the single C column at pCRow0.
  // C is only written (masked by p1), never read.
  // On exit pCRow0 has advanced by lanes * 8 bytes to the next M tile.
  349. .macro SAVEv1x1
  350. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  351. fmul z16.d, p1/m, z16.d, alphaZ // C0 = alpha * accumulator
  352. st1d z16.d, p1, [pCRow0]
  353. add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8
  354. .endm
  355. /*******************************************************************************
  356. * End of macro definitions
  357. *******************************************************************************/
  // Function entry: spill registers, broadcast alpha, convert ldc to bytes and start
  // the loop over N in panels of 8 columns.
  // PROLOGUE is not defined in the visible source; presumably it comes from common.h and emits the entry label.
  358. PROLOGUE
  359. .align 5
  360. add sp, sp, #-(11 * 16) // reserve 11 x 16 = 176 bytes for the spills below
  361. stp d8, d9, [sp, #(0 * 16)]
  362. stp d10, d11, [sp, #(1 * 16)]
  363. stp d12, d13, [sp, #(2 * 16)]
  364. stp d14, d15, [sp, #(3 * 16)]
  365. stp d16, d17, [sp, #(4 * 16)]
  366. stp x18, x19, [sp, #(5 * 16)]
  367. stp x20, x21, [sp, #(6 * 16)]
  368. stp x22, x23, [sp, #(7 * 16)]
  369. stp x24, x25, [sp, #(8 * 16)]
  370. stp x26, x27, [sp, #(9 * 16)]
  371. str x28, [sp, #(10 * 16)]
  372. prfm PLDL1KEEP, [origPB]
  373. prfm PLDL1KEEP, [origPA]
  374. fmov alpha, d0 // copy the bits of the alpha argument (d0) into x17
  375. dup alphaZ, alpha // broadcast alpha into every 64-bit lane of z2
  376. lsl LDC, LDC, #3 // ldc = ldc * 8
  377. ptrue p0.d // create true predicate
  // When LEFT is not defined the running offset starts at -offset and is advanced
  // once per column panel (see the *_END labels below).
  378. #if !defined(LEFT)
  379. neg tempOffset, offset
  380. #endif
  381. mov pB, origPB
  382. // Loop over N
  383. mov counterJ, origN
  384. asr counterJ, counterJ, #3 // J = J / 8
  385. cmp counterJ, #0
  386. ble .Ldtrmm_kernel_L4_BEGIN // fewer than 8 columns: go straight to the remainders
  387. /******************************************************************************/
  388. /* Repeat this as long as there are 8 left in N */
  // One pass of this section handles one panel of 8 C columns for every M tile.
  389. .align 5
  390. .Ldtrmm_kernel_L8_BEGIN:
  391. mov pCRow0, pC // C pointer for the first M tile of this panel
  392. add pC, pC, LDC, lsl #3 // add 8 x LDC
  // With LEFT the running offset restarts from `offset` for every column panel.
  393. #if defined(LEFT)
  394. mov tempOffset, offset
  395. #endif
  396. mov pA, origPA // pA = start of A array
  397. .Ldtrmm_kernel_L8_Mv1_BEGIN:
  398. /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
  399. mov counterI, #0
  400. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < origM
  401. cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
  402. .align 5
  403. .Ldtrmm_kernel_L8_Mv1_20:
  // Position pA/pB at the first k-step this tile multiplies. In the first case only
  // pB is reset; otherwise tempOffset k-steps are skipped in both A and B.
  404. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  405. mov pB, origPB
  406. #else
  407. mov pB, origPB
  408. mul temp, tempOffset, lanes
  409. add pA, pA, temp, lsl #3 // add tempOffset*lanes*8
  410. lsl temp, tempOffset, #6 // tempOffset * 8 columns * 8 bytes
  411. add pB, pB, temp
  412. #endif
  // tempK = number of k-steps multiplied for this tile.
  413. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  414. sub tempK, origK, tempOffset
  415. #elif defined(LEFT)
  416. add tempK, tempOffset, lanes
  417. #else
  418. add tempK, tempOffset, #8
  419. #endif
  420. INITv1x8 // fill with zeros
  421. asr counterL , tempK, #3 // L = K / 8
  422. cmp counterL , #2 // at least two blocks of 8 k-steps (16) to do?
  423. blt .Ldtrmm_kernel_L8_Mv1_32
  // First block of 8 k-steps; the pipeline stays open (ends on _M2, not _E).
  424. KERNELv1x8_I
  425. KERNELv1x8_M2
  426. KERNELv1x8_M1
  427. KERNELv1x8_M2
  428. KERNELv1x8_M1
  429. KERNELv1x8_M2
  430. KERNELv1x8_M1
  431. KERNELv1x8_M2
  432. subs counterL, counterL, #2 // subtract 2
  433. ble .Ldtrmm_kernel_L8_Mv1_22a // exactly two blocks: only the closing block remains
  434. .align 5
  // Steady-state loop: 8 k-steps per iteration.
  435. .Ldtrmm_kernel_L8_Mv1_22:
  436. KERNELv1x8_M1
  437. KERNELv1x8_M2
  438. KERNELv1x8_M1
  439. KERNELv1x8_M2
  440. KERNELv1x8_M1
  441. KERNELv1x8_M2
  442. KERNELv1x8_M1
  443. KERNELv1x8_M2
  444. subs counterL, counterL, #1
  445. bgt .Ldtrmm_kernel_L8_Mv1_22
  446. .align 5
  // Closing block of 8 k-steps; _E drains the pipeline without loading further data.
  447. .Ldtrmm_kernel_L8_Mv1_22a:
  448. KERNELv1x8_M1
  449. KERNELv1x8_M2
  450. KERNELv1x8_M1
  451. KERNELv1x8_M2
  452. KERNELv1x8_M1
  453. KERNELv1x8_M2
  454. KERNELv1x8_M1
  455. KERNELv1x8_E
  456. b .Ldtrmm_kernel_L8_Mv1_44
  457. .align 5
  // Fewer than two blocks. tst leaves Z set when bit 0 of counterL is clear, and with
  // this mask N and V are both clear, so ble is taken exactly when counterL is even.
  458. .Ldtrmm_kernel_L8_Mv1_32:
  459. tst counterL, #1
  460. ble .Ldtrmm_kernel_L8_Mv1_40
  // Exactly one block of 8 k-steps, opened with _I and closed with _E.
  461. KERNELv1x8_I
  462. KERNELv1x8_M2
  463. KERNELv1x8_M1
  464. KERNELv1x8_M2
  465. KERNELv1x8_M1
  466. KERNELv1x8_M2
  467. KERNELv1x8_M1
  468. KERNELv1x8_E
  469. b .Ldtrmm_kernel_L8_Mv1_44
  // NOTE(review): z16-z23 were already cleared by the INITv1x8 above and nothing on the
  // path to this label writes them, so this second INITv1x8 looks redundant.
  470. .Ldtrmm_kernel_L8_Mv1_40:
  471. INITv1x8
  472. .Ldtrmm_kernel_L8_Mv1_44:
  473. ands counterL , tempK, #7 // remaining k-steps (tempK % 8)
  474. ble .Ldtrmm_kernel_L8_Mv1_100
  475. .align 5
  476. .Ldtrmm_kernel_L8_Mv1_46:
  477. KERNELv1x8_SUB
  478. subs counterL, counterL, #1
  479. bne .Ldtrmm_kernel_L8_Mv1_46
  480. .Ldtrmm_kernel_L8_Mv1_100:
  481. prfm PLDL1KEEP, [pA]
  482. prfm PLDL1KEEP, [pA, #64]
  483. prfm PLDL1KEEP, [origPB]
  484. SAVEv1x8
  // Skip the k-steps of this tile that were not multiplied so pA ends at the start of
  // the next M tile. NOTE(review): pB is reloaded from origPB at the top of every tile,
  // so the pB adjustment here appears to have no lasting effect.
  485. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  486. sub tempK, origK, tempOffset
  487. #if defined(LEFT)
  488. sub tempK, tempK, lanes
  489. #else
  490. sub tempK, tempK, #8
  491. #endif
  492. mul temp, tempK, lanes
  493. add pA, pA, temp, lsl #3 // add tempK*lanes*8
  494. lsl temp, tempK, #6 // tempK * 8 columns * 8 bytes
  495. add pB, pB, temp
  496. #endif
  // With LEFT the offset grows by the number of rows just processed.
  497. #if defined(LEFT)
  498. add tempOffset, tempOffset, lanes
  499. #endif
  500. .Ldtrmm_kernel_L8_Mv1_END:
  501. incd counterI // counterI += number of 64-bit elements in one SVE vector
  502. whilelt p1.d, counterI, origM // predicate for the next M tile; sets the flags used by b.any
  503. cntp lanes, p0, p1.d
  504. b.any .Ldtrmm_kernel_L8_Mv1_20 // loop while at least one lane is still active
  505. .Ldtrmm_kernel_L8_END:
  506. lsl temp, origK, #6
  507. add origPB, origPB, temp // B = B + K * 8 * 8
  // Without LEFT the offset grows by the 8 columns just processed.
  508. #if !defined(LEFT)
  509. add tempOffset, tempOffset, #8
  510. #endif
  511. subs counterJ, counterJ , #1 // j--
  512. bgt .Ldtrmm_kernel_L8_BEGIN
  513. /******************************************************************************/
  514. /* Repeat the same thing if 4 left in N */
  // Executed at most once: handles a panel of 4 C columns when bit 2 of origN is set.
  515. .align 5
  516. .Ldtrmm_kernel_L4_BEGIN:
  517. mov counterJ , origN
  518. tst counterJ , #4 // Z is set when bit 2 of N is clear
  519. ble .Ldtrmm_kernel_L2_BEGIN // taken only when Z is set (N and V are clear after this tst)
  // With LEFT the running offset restarts from `offset` for this panel.
  520. #if defined(LEFT)
  521. mov tempOffset, offset
  522. #endif
  523. mov pCRow0, pC // C pointer for the first M tile of this panel
  524. add pC, pC, LDC, lsl #2 // add 4 x LDC
  525. mov pA, origPA // pA = start of A array
  526. .Ldtrmm_kernel_L4_Mv1_BEGIN:
  527. mov counterI, #0
  528. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < origM
  529. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  530. .align 5
  531. .Ldtrmm_kernel_L4_Mv1_20:
  // Position pA/pB at the first k-step this tile multiplies. In the first case only
  // pB is reset; otherwise tempOffset k-steps are skipped in both A and B.
  532. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  533. mov pB, origPB
  534. #else
  535. mov pB, origPB
  536. mul temp, tempOffset, lanes
  537. add pA, pA, temp, lsl #3 // add tempOffset*lanes*8
  538. lsl temp, tempOffset, #5 // tempOffset * 4 columns * 8 bytes
  539. add pB, pB, temp
  540. #endif
  // tempK = number of k-steps multiplied for this tile.
  541. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  542. sub tempK, origK, tempOffset
  543. #elif defined(LEFT)
  544. add tempK, tempOffset, lanes
  545. #else
  546. add tempK, tempOffset, #4
  547. #endif
  548. INITv1x4 // fill with zeros
  549. asr counterL , tempK, #3 // L = K / 8
  550. cmp counterL , #0 // is there at least one block of 8 k-steps to do?
  551. ble .Ldtrmm_kernel_L4_Mv1_44
  552. .align 5
  // Main loop: 8 k-steps per iteration (no software pipelining in this variant).
  553. .Ldtrmm_kernel_L4_Mv1_22:
  554. KERNELv1x4_SUB
  555. KERNELv1x4_SUB
  556. KERNELv1x4_SUB
  557. KERNELv1x4_SUB
  558. KERNELv1x4_SUB
  559. KERNELv1x4_SUB
  560. KERNELv1x4_SUB
  561. KERNELv1x4_SUB
  562. subs counterL, counterL, #1
  563. bgt .Ldtrmm_kernel_L4_Mv1_22
  564. .Ldtrmm_kernel_L4_Mv1_44:
  565. ands counterL , tempK, #7 // remaining k-steps (tempK % 8)
  566. ble .Ldtrmm_kernel_L4_Mv1_100
  567. .align 5
  568. .Ldtrmm_kernel_L4_Mv1_46:
  569. KERNELv1x4_SUB
  570. subs counterL, counterL, #1
  571. bne .Ldtrmm_kernel_L4_Mv1_46
  572. .Ldtrmm_kernel_L4_Mv1_100:
  573. SAVEv1x4
  // Skip the k-steps of this tile that were not multiplied so pA ends at the start of
  // the next M tile.
  574. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  575. sub tempK, origK, tempOffset
  576. #if defined(LEFT)
  577. sub tempK, tempK, lanes
  578. #else
  579. sub tempK, tempK, #4
  580. #endif
  581. mul temp, tempK, lanes
  582. add pA, pA, temp, lsl #3 // add tempK*lanes*8
  583. lsl temp, tempK, #5 // tempK * 4 columns * 8 bytes
  584. add pB, pB, temp
  585. #endif
  // With LEFT the offset grows by the number of rows just processed.
  586. #if defined(LEFT)
  587. add tempOffset, tempOffset, lanes
  588. #endif
  589. .Ldtrmm_kernel_L4_Mv1_END:
  590. incd counterI // counterI += number of 64-bit elements in one SVE vector
  591. whilelt p1.d, counterI, origM // predicate for the next M tile; sets the flags used by b.any
  592. cntp lanes, p0, p1.d
  593. b.any .Ldtrmm_kernel_L4_Mv1_20 // loop while at least one lane is still active
  594. .Ldtrmm_kernel_L4_END:
  595. lsl temp, origK, #5
  596. add origPB, origPB, temp // B = B + K * 4 * 8
  // Without LEFT the offset grows by the 4 columns just processed.
  597. #if !defined(LEFT)
  598. add tempOffset, tempOffset, #4
  599. #endif
  600. /******************************************************************************/
  601. /* Repeat the same thing if 2 left in N */
  // Executed at most once: handles a panel of 2 C columns when bit 1 of origN is set.
  602. .align 5
  603. .Ldtrmm_kernel_L2_BEGIN:
  604. mov counterJ , origN
  605. tst counterJ , #2 // Z is set when bit 1 of N is clear
  606. ble .Ldtrmm_kernel_L1_BEGIN // taken only when Z is set (N and V are clear after this tst)
  607. mov pCRow0, pC // C pointer for the first M tile of this panel
  608. add pC, pC, LDC, lsl #1 // add 2 x LDC
  // With LEFT the running offset restarts from `offset` for this panel.
  609. #if defined(LEFT)
  610. mov tempOffset, offset
  611. #endif
  612. mov pA, origPA // pA = start of A array
  613. .Ldtrmm_kernel_L2_Mv1_BEGIN:
  614. mov counterI, #0
  615. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < origM
  616. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  617. .align 5
  618. .Ldtrmm_kernel_L2_Mv1_20:
  // Position pA/pB at the first k-step this tile multiplies. In the first case only
  // pB is reset; otherwise tempOffset k-steps are skipped in both A and B.
  619. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  620. mov pB, origPB
  621. #else
  622. mov pB, origPB
  623. mul temp, tempOffset, lanes
  624. add pA, pA, temp, lsl #3 // add tempOffset*lanes*8
  625. lsl temp, tempOffset, #4 // tempOffset * 2 columns * 8 bytes
  626. add pB, pB, temp
  627. #endif
  // tempK = number of k-steps multiplied for this tile.
  628. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  629. sub tempK, origK, tempOffset
  630. #elif defined(LEFT)
  631. add tempK, tempOffset, lanes
  632. #else
  633. add tempK, tempOffset, #2
  634. #endif
  635. INITv1x2 // fill with zeros
  636. asr counterL , tempK, #3 // L = K / 8
  637. cmp counterL , #0 // is there at least one block of 8 k-steps to do?
  638. ble .Ldtrmm_kernel_L2_Mv1_44
  639. .align 5
  // Main loop: 8 k-steps per iteration.
  640. .Ldtrmm_kernel_L2_Mv1_22:
  641. KERNELv1x2_SUB
  642. KERNELv1x2_SUB
  643. KERNELv1x2_SUB
  644. KERNELv1x2_SUB
  645. KERNELv1x2_SUB
  646. KERNELv1x2_SUB
  647. KERNELv1x2_SUB
  648. KERNELv1x2_SUB
  649. subs counterL, counterL, #1
  650. bgt .Ldtrmm_kernel_L2_Mv1_22
  651. .Ldtrmm_kernel_L2_Mv1_44:
  652. ands counterL , tempK, #7 // remaining k-steps (tempK % 8)
  653. ble .Ldtrmm_kernel_L2_Mv1_100
  654. .align 5
  655. .Ldtrmm_kernel_L2_Mv1_46:
  656. KERNELv1x2_SUB
  657. subs counterL, counterL, #1
  658. bne .Ldtrmm_kernel_L2_Mv1_46
  659. .Ldtrmm_kernel_L2_Mv1_100:
  660. SAVEv1x2
  // Skip the k-steps of this tile that were not multiplied so pA ends at the start of
  // the next M tile.
  661. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  662. sub tempK, origK, tempOffset
  663. #if defined(LEFT)
  664. sub tempK, tempK, lanes
  665. #else
  666. sub tempK, tempK, #2
  667. #endif
  668. mul temp, tempK, lanes
  669. add pA, pA, temp, lsl #3 // add tempK*lanes*8
  670. lsl temp, tempK, #4 // tempK * 2 columns * 8 bytes
  671. add pB, pB, temp
  672. #endif
  // With LEFT the offset grows by the number of rows just processed.
  673. #if defined(LEFT)
  674. add tempOffset, tempOffset, lanes
  675. #endif
  676. .Ldtrmm_kernel_L2_Mv1_END:
  677. incd counterI // counterI += number of 64-bit elements in one SVE vector
  678. whilelt p1.d, counterI, origM // predicate for the next M tile; sets the flags used by b.any
  679. cntp lanes, p0, p1.d
  680. b.any .Ldtrmm_kernel_L2_Mv1_20 // loop while at least one lane is still active
  681. .Ldtrmm_kernel_L2_END:
  682. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  // Without LEFT the offset grows by the 2 columns just processed.
  683. #if !defined(LEFT)
  684. add tempOffset, tempOffset, #2
  685. #endif
  686. /******************************************************************************/
  687. /* Repeat the same thing if 1 left in N */
  // Executed at most once: handles the last single C column when bit 0 of origN is set.
  688. .align 5
  689. .Ldtrmm_kernel_L1_BEGIN:
  690. mov counterJ , origN
  691. tst counterJ , #1 // Z is set when bit 0 of N is clear
  692. ble .Ldtrmm_kernel_L999 // done
  693. mov pCRow0, pC // C pointer for the first M tile of this column
  694. add pC, pC, LDC // add 1 x LDC
  // With LEFT the running offset restarts from `offset` for this column.
  695. #if defined(LEFT)
  696. mov tempOffset, offset
  697. #endif
  698. mov pA, origPA // pA = start of A array
  699. .Ldtrmm_kernel_L1_Mv1_BEGIN:
  700. mov counterI, #0
  701. whilelt p1.d, counterI, origM // p1 = lanes i for which counterI + i < origM
  702. cntp lanes, p0, p1.d // lanes = number of active lanes in p1
  703. .align 5
  704. .Ldtrmm_kernel_L1_Mv1_20:
  // Position pA/pB at the first k-step this tile multiplies. In the first case only
  // pB is reset; otherwise tempOffset k-steps are skipped in both A and B.
  705. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  706. mov pB, origPB
  707. #else
  708. mov pB, origPB
  709. mul temp, tempOffset, lanes
  710. add pA, pA, temp, lsl #3 // add tempOffset*lanes*8
  711. lsl temp, tempOffset, #3 // tempOffset * 1 column * 8 bytes
  712. add pB, pB, temp
  713. #endif
  // tempK = number of k-steps multiplied for this tile.
  714. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  715. sub tempK, origK, tempOffset
  716. #elif defined(LEFT)
  717. add tempK, tempOffset, lanes
  718. #else
  719. add tempK, tempOffset, #1
  720. #endif
  721. INITv1x1 // fill with zeros
  722. asr counterL , tempK, #3 // L = K / 8
  723. cmp counterL , #0 // is there at least 8 to do?
  724. ble .Ldtrmm_kernel_L1_Mv1_44
  725. .align 5
  // Main loop: 8 k-steps per iteration.
  726. .Ldtrmm_kernel_L1_Mv1_22:
  727. KERNELv1x1_SUB
  728. KERNELv1x1_SUB
  729. KERNELv1x1_SUB
  730. KERNELv1x1_SUB
  731. KERNELv1x1_SUB
  732. KERNELv1x1_SUB
  733. KERNELv1x1_SUB
  734. KERNELv1x1_SUB
  735. subs counterL, counterL, #1
  736. bgt .Ldtrmm_kernel_L1_Mv1_22
  737. .Ldtrmm_kernel_L1_Mv1_44:
  738. ands counterL , tempK, #7 // remaining k-steps (tempK % 8)
  739. ble .Ldtrmm_kernel_L1_Mv1_100
  740. .align 5
  741. .Ldtrmm_kernel_L1_Mv1_46:
  742. KERNELv1x1_SUB
  743. subs counterL, counterL, #1
  744. bgt .Ldtrmm_kernel_L1_Mv1_46 // the other tail loops use bne here; counterL starts in 1..7
  745. .Ldtrmm_kernel_L1_Mv1_100:
  746. SAVEv1x1
  // Skip the k-steps of this tile that were not multiplied so pA ends at the start of
  // the next M tile.
  747. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  748. sub tempK, origK, tempOffset
  749. #if defined(LEFT)
  750. sub tempK, tempK, lanes
  751. #else
  752. sub tempK, tempK, #1
  753. #endif
  754. mul temp, tempK, lanes
  755. add pA, pA, temp, lsl #3 // add tempK*lanes*8
  756. lsl temp, tempK, #3 // tempK * 1 column * 8 bytes
  757. add pB, pB, temp
  758. #endif
  // With LEFT the offset grows by the number of rows just processed.
  759. #if defined(LEFT)
  760. add tempOffset, tempOffset, lanes
  761. #endif
  762. .Ldtrmm_kernel_L1_Mv1_END:
  763. incd counterI // counterI += number of 64-bit elements in one SVE vector
  764. whilelt p1.d, counterI, origM // predicate for the next M tile; sets the flags used by b.any
  765. cntp lanes, p0, p1.d
  766. b.any .Ldtrmm_kernel_L1_Mv1_20 // loop while at least one lane is still active
  767. .Ldtrmm_kernel_L1_END:
  768. /******************************************************************************/
  // Common exit: return 0 in x0, reload every register spilled on entry from the same
  // stack slots, release the 176-byte frame and return.
  769. .Ldtrmm_kernel_L999:
  770. mov x0, #0 // set return value
  771. ldp d8, d9, [sp, #(0 * 16)]
  772. ldp d10, d11, [sp, #(1 * 16)]
  773. ldp d12, d13, [sp, #(2 * 16)]
  774. ldp d14, d15, [sp, #(3 * 16)]
  775. ldp d16, d17, [sp, #(4 * 16)]
  776. ldp x18, x19, [sp, #(5 * 16)]
  777. ldp x20, x21, [sp, #(6 * 16)]
  778. ldp x22, x23, [sp, #(7 * 16)]
  779. ldp x24, x25, [sp, #(8 * 16)]
  780. ldp x26, x27, [sp, #(9 * 16)]
  781. ldr x28, [sp, #(10 * 16)]
  782. add sp, sp, #(11*16) // release the frame reserved on entry
  783. ret
  // EPILOGUE is not defined in the visible source; presumably it comes from common.h.
  784. EPILOGUE