You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_tcopy_16.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814
  1. /***************************************************************************
  2. Copyright (c) 2019, The OpenBLAS Project
  3. All rights reserved.
  4. *****************************************************************************/
  5. #define ASSEMBLER
  6. #include "common.h"
  /* Incoming arguments, named after the way the code below consumes them. */
  #define M           x0      /* number of LDA-strided source vectors        */
  #define N           x1      /* contiguous elements taken from each vector  */
  #define A           x2      /* source pointer, advanced per vector group   */
  #define LDA         x3      /* vector stride; scaled to bytes on entry     */
  #define B           x4      /* destination pointer for 16-wide blocks      */

  /* Byte step between consecutive 16-wide blocks in B (M * 16 * 4). */
  #define M8          x5

  /* Read cursors, one per source vector of the current group. */
  #define A01         x6
  #define A02         x7
  #define A03         x8
  #define A04         x9
  #define A05         x10
  #define A06         x11
  #define A07         x12
  #define A08         x13

  /* Write cursors for the tail regions of B. */
  #define B01         x14     /* 8-wide tails */
  #define B02         x15     /* 4-wide tails */
  #define B03         x16     /* 2-wide tails */
  #define B04         x17     /* 1-wide tails */

  /* Write cursor for the 16-wide blocks of the current vector group. */
  #define B00         x22

  /* Loop counters and scratch. */
  #define I           x21     /* inner counter: N / 16        */
  #define J           x19     /* outer counter: M / 8         */
  #define TEMP1       x20     /* scratch / running store cursor */

  /* Prefetch distance, in bytes, ahead of each read cursor. */
  #define A_PREFETCH  256
  30. /**************************************************************************************
  31. * Macro definitions
  32. **************************************************************************************/
  /*
   * SAVE_REGS / RESTORE_REGS
   *
   * Open and close a 176-byte (11 * 16) stack frame and spill / reload
   * registers around the kernel body. The two macros must be kept in step:
   * every slot written by SAVE_REGS is read back from the same offset by
   * RESTORE_REGS.
   *
   * Slot layout (byte offset from sp):
   *     0.. 79   d8-d17
   *    80.. 87   unused (formerly x18)
   *    88.. 95   x19
   *    96..159   x20-x27
   *   160..167   x28
   *
   * x18 is deliberately neither stored nor reloaded. AAPCS64 designates it
   * as the platform register, and writing it back on return is not allowed
   * on targets that reserve it. Its old slot is left empty so the frame
   * size and all other offsets are unchanged.
   *
   * NOTE(review): d16/d17 are caller-saved under AAPCS64 and x23-x28 are
   * not written by the code visible in this file; they are still spilled
   * here to keep the frame identical to the previous layout.
   */
  .macro SAVE_REGS
      sub     sp, sp, #(11 * 16)
      stp     d8, d9, [sp, #(0 * 16)]
      stp     d10, d11, [sp, #(1 * 16)]
      stp     d12, d13, [sp, #(2 * 16)]
      stp     d14, d15, [sp, #(3 * 16)]
      stp     d16, d17, [sp, #(4 * 16)]
      str     x19, [sp, #(5 * 16 + 8)]        // x18 slot at 5 * 16 stays empty
      stp     x20, x21, [sp, #(6 * 16)]
      stp     x22, x23, [sp, #(7 * 16)]
      stp     x24, x25, [sp, #(8 * 16)]
      stp     x26, x27, [sp, #(9 * 16)]
      str     x28, [sp, #(10 * 16)]
  .endm

  .macro RESTORE_REGS
      ldp     d8, d9, [sp, #(0 * 16)]
      ldp     d10, d11, [sp, #(1 * 16)]
      ldp     d12, d13, [sp, #(2 * 16)]
      ldp     d14, d15, [sp, #(3 * 16)]
      ldp     d16, d17, [sp, #(4 * 16)]
      ldr     x19, [sp, #(5 * 16 + 8)]        // x18 is never written back
      ldp     x20, x21, [sp, #(6 * 16)]
      ldp     x22, x23, [sp, #(7 * 16)]
      ldp     x24, x25, [sp, #(8 * 16)]
      ldp     x26, x27, [sp, #(9 * 16)]
      ldr     x28, [sp, #(10 * 16)]
      add     sp, sp, #(11 * 16)
  .endm
  61. /*************************************************************************************************************************/
  /*
   * COPY16x8
   * Take 16 floats (64 bytes) from each of the eight read cursors A01-A08
   * and lay them out back to back as one 512-byte tile starting at B00.
   * Each read cursor advances by 64 bytes; B00 advances by M8.
   * TEMP1 is used as the running store cursor and ends 512 bytes past the
   * tile start. Clobbers v0-v31.
   */
  .macro COPY16x8
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      prfm    PLDL1KEEP, [A02, #A_PREFETCH]
      prfm    PLDL1KEEP, [A03, #A_PREFETCH]
      prfm    PLDL1KEEP, [A04, #A_PREFETCH]
      prfm    PLDL1KEEP, [A05, #A_PREFETCH]
      prfm    PLDL1KEEP, [A06, #A_PREFETCH]
      prfm    PLDL1KEEP, [A07, #A_PREFETCH]
      prfm    PLDL1KEEP, [A08, #A_PREFETCH]

      mov     TEMP1, B00                      // store cursor walks the tile

      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64

      ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

      ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64
      st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

      ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64
      st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1], #64

      ld1     {v16.4s, v17.4s, v18.4s, v19.4s}, [A05], #64
      st1     {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1], #64

      ld1     {v20.4s, v21.4s, v22.4s, v23.4s}, [A06], #64
      st1     {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1], #64

      ld1     {v24.4s, v25.4s, v26.4s, v27.4s}, [A07], #64
      st1     {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1], #64

      ld1     {v28.4s, v29.4s, v30.4s, v31.4s}, [A08], #64
      st1     {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1], #64

      add     B00, B00, M8                    // next 16-wide block
  .endm
  /*
   * COPY8x8
   * Take 8 floats (32 bytes) from each of A01-A08 and append them, in
   * cursor order, to the 8-wide tail region at B01 (256 bytes in total).
   * Each read cursor advances by 32 bytes, B01 by 256. Clobbers v0-v15.
   */
  .macro COPY8x8
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      prfm    PLDL1KEEP, [A02, #A_PREFETCH]
      prfm    PLDL1KEEP, [A03, #A_PREFETCH]
      prfm    PLDL1KEEP, [A04, #A_PREFETCH]
      prfm    PLDL1KEEP, [A05, #A_PREFETCH]
      prfm    PLDL1KEEP, [A06, #A_PREFETCH]
      prfm    PLDL1KEEP, [A07, #A_PREFETCH]
      prfm    PLDL1KEEP, [A08, #A_PREFETCH]

      ldp     q0, q1, [A01], #32
      ldp     q2, q3, [A02], #32
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

      ldp     q4, q5, [A03], #32
      ldp     q6, q7, [A04], #32
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64

      ldp     q8, q9, [A05], #32
      ldp     q10, q11, [A06], #32
      st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [B01], #64

      ldp     q12, q13, [A07], #32
      ldp     q14, q15, [A08], #32
      st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [B01], #64
  .endm
  /*
   * COPY4x8
   * Take 4 floats (16 bytes) from each of A01-A08 and append them, in
   * cursor order, to the 4-wide tail region at B02 (128 bytes in total).
   * Each read cursor advances by 16 bytes, B02 by 128. No prefetch is
   * issued in this variant. Clobbers v0-v7.
   */
  .macro COPY4x8
      ldr     q0, [A01], #16
      ldr     q1, [A02], #16
      ldr     q2, [A03], #16
      ldr     q3, [A04], #16
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64

      ldr     q4, [A05], #16
      ldr     q5, [A06], #16
      ldr     q6, [A07], #16
      ldr     q7, [A08], #16
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [B02], #64
  .endm
  /*
   * COPY2x8
   * Take 2 floats (8 bytes) from each of A01-A08 and append them, in
   * cursor order, to the 2-wide tail region at B03 (64 bytes in total).
   * Each read cursor advances by 8 bytes, B03 by 64. No prefetch is
   * issued in this variant. Clobbers v0-v7.
   */
  .macro COPY2x8
      ldr     d0, [A01], #8
      ldr     d1, [A02], #8
      ldr     d2, [A03], #8
      ldr     d3, [A04], #8
      stp     d0, d1, [B03], #16
      stp     d2, d3, [B03], #16

      ldr     d4, [A05], #8
      ldr     d5, [A06], #8
      ldr     d6, [A07], #8
      ldr     d7, [A08], #8
      stp     d4, d5, [B03], #16
      stp     d6, d7, [B03], #16
  .endm
  /*
   * COPY1x8
   * Take one float (4 bytes) from each of A01-A08 and append them, in
   * cursor order, to the 1-wide tail region at B04 (32 bytes in total).
   * B04 advances by 32.
   *
   * Each read cursor is advanced by 4 bytes through post-indexed loads, so
   * this macro follows the same "consume and advance" contract as
   * COPY1x4, COPY1x2 and COPY1x1. Earlier revisions left A01-A08
   * untouched here, which was only safe because the caller re-derives the
   * cursors before their next use. No prefetch is issued in this variant.
   * Clobbers v0-v7.
   */
  .macro COPY1x8
      ldr     s0, [A01], #4
      ldr     s1, [A02], #4
      ldr     s2, [A03], #4
      ldr     s3, [A04], #4
      stp     s0, s1, [B04], #8
      stp     s2, s3, [B04], #8

      ldr     s4, [A05], #4
      ldr     s5, [A06], #4
      ldr     s6, [A07], #4
      ldr     s7, [A08], #4
      stp     s4, s5, [B04], #8
      stp     s6, s7, [B04], #8
  .endm
  230. /*************************************************************************************************************************/
  /*
   * COPY16x4
   * Take 16 floats (64 bytes) from each of A01-A04 and lay them out back
   * to back as one 256-byte tile starting at B00.
   * Each read cursor advances by 64 bytes; B00 advances by M8.
   * TEMP1 is the running store cursor and is left pointing at the last
   * 64-byte chunk of the tile. Clobbers v0-v15.
   */
  .macro COPY16x4
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      prfm    PLDL1KEEP, [A02, #A_PREFETCH]
      prfm    PLDL1KEEP, [A03, #A_PREFETCH]
      prfm    PLDL1KEEP, [A04, #A_PREFETCH]

      mov     TEMP1, B00                      // store cursor walks the tile

      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64

      ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

      ld1     {v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64
      st1     {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

      ld1     {v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64
      st1     {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]   // last chunk, no writeback

      add     B00, B00, M8                    // next 16-wide block
  .endm
  /*
   * COPY8x4
   * Take 8 floats (32 bytes) from each of A01-A04 and append them, in
   * cursor order, to the 8-wide tail region at B01 (128 bytes in total).
   * Each read cursor advances by 32 bytes, B01 by 128. Clobbers v0-v7.
   */
  .macro COPY8x4
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      prfm    PLDL1KEEP, [A02, #A_PREFETCH]
      prfm    PLDL1KEEP, [A03, #A_PREFETCH]
      prfm    PLDL1KEEP, [A04, #A_PREFETCH]

      ldp     q0, q1, [A01], #32
      ldp     q2, q3, [A02], #32
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

      ldp     q4, q5, [A03], #32
      ldp     q6, q7, [A04], #32
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64
  .endm
  /*
   * COPY4x4
   * Take 4 floats (16 bytes) from each of A01-A04 and append them, in
   * cursor order, to the 4-wide tail region at B02 (64 bytes in total).
   * Each read cursor advances by 16 bytes, B02 by 64. No prefetch is
   * issued in this variant. Clobbers v0-v3.
   */
  .macro COPY4x4
      ldr     q0, [A01], #16
      ldr     q1, [A02], #16
      ldr     q2, [A03], #16
      ldr     q3, [A04], #16
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64
  .endm
  /*
   * COPY2x4
   * Take 2 floats (8 bytes) from each of A01-A04 and append them, in
   * cursor order, to the 2-wide tail region at B03 (32 bytes in total).
   * Each read cursor advances by 8 bytes, B03 by 32. No prefetch is
   * issued in this variant. Clobbers v0-v3.
   */
  .macro COPY2x4
      ldr     d0, [A01], #8
      ldr     d1, [A02], #8
      ldr     d2, [A03], #8
      ldr     d3, [A04], #8
      stp     d0, d1, [B03], #16
      stp     d2, d3, [B03], #16
  .endm
  /*
   * COPY1x4
   * Take one float (4 bytes) from each of A01-A04 and append them, in
   * cursor order, to the 1-wide tail region at B04 (16 bytes in total).
   * Each read cursor advances by 4 bytes, B04 by 16. No prefetch is
   * issued in this variant. Clobbers v0-v3.
   */
  .macro COPY1x4
      ldr     s0, [A01], #4
      ldr     s1, [A02], #4
      ldr     s2, [A03], #4
      ldr     s3, [A04], #4
      stp     s0, s1, [B04], #8
      stp     s2, s3, [B04], #8
  .endm
  323. /*************************************************************************************************************************/
  /*
   * COPY16x2
   * Take 16 floats (64 bytes) from each of A01 and A02 and lay them out
   * back to back as one 128-byte tile starting at B00. Both loads are
   * issued before either store.
   * Each read cursor advances by 64 bytes; B00 advances by M8.
   * TEMP1 is left pointing at the second 64-byte chunk of the tile.
   * Clobbers v0-v7.
   */
  .macro COPY16x2
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      prfm    PLDL1KEEP, [A02, #A_PREFETCH]

      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
      ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64

      mov     TEMP1, B00
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [TEMP1], #64
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]       // last chunk, no writeback

      add     B00, B00, M8                    // next 16-wide block
  .endm
  /*
   * COPY8x2
   * Take 8 floats (32 bytes) from each of A01 and A02 and append them, in
   * cursor order, to the 8-wide tail region at B01 (64 bytes in total).
   * Each read cursor advances by 32 bytes, B01 by 64. Clobbers v0-v3.
   */
  .macro COPY8x2
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      prfm    PLDL1KEEP, [A02, #A_PREFETCH]

      ld1     {v0.4s, v1.4s}, [A01], #32
      ld1     {v2.4s, v3.4s}, [A02], #32
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64
  .endm
  /*
   * COPY4x2
   * Take 4 floats (16 bytes) from each of A01 and A02 and append them, in
   * cursor order, to the 4-wide tail region at B02 (32 bytes in total).
   * Each read cursor advances by 16 bytes, B02 by 32. No prefetch is
   * issued in this variant. Clobbers v0-v1.
   */
  .macro COPY4x2
      ldr     q0, [A01], #16
      ldr     q1, [A02], #16
      stp     q0, q1, [B02], #32
  .endm
  /*
   * COPY2x2
   * Take 2 floats (8 bytes) from each of A01 and A02 and append them, in
   * cursor order, to the 2-wide tail region at B03 (16 bytes in total).
   * Each read cursor advances by 8 bytes, B03 by 16. No prefetch is
   * issued in this variant. Clobbers v0-v1.
   */
  .macro COPY2x2
      ldr     d0, [A01], #8
      ldr     d1, [A02], #8
      stp     d0, d1, [B03], #16
  .endm
  /*
   * COPY1x2
   * Take one float (4 bytes) from each of A01 and A02 and append them, in
   * cursor order, to the 1-wide tail region at B04 (8 bytes in total).
   * Each read cursor advances by 4 bytes, B04 by 8. No prefetch is
   * issued in this variant. Clobbers v0-v1.
   */
  .macro COPY1x2
      ldr     s0, [A01], #4
      ldr     s1, [A02], #4
      stp     s0, s1, [B04], #8
  .endm
  376. /*************************************************************************************************************************/
  /*
   * COPY16x1
   * Copy 16 floats (64 bytes) from A01 to B00.
   * A01 advances by 64 bytes; B00 advances by M8 (register post-index on
   * the store). Clobbers v0-v3.
   */
  .macro COPY16x1
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [B00], M8
  .endm
  /*
   * COPY8x1
   * Copy 8 floats (32 bytes) from A01 to the 8-wide tail region at B01.
   * Both cursors advance by 32 bytes. Clobbers v0-v1.
   */
  .macro COPY8x1
      prfm    PLDL1KEEP, [A01, #A_PREFETCH]
      ldp     q0, q1, [A01], #32
      stp     q0, q1, [B01], #32
  .endm
  /*
   * COPY4x1
   * Copy 4 floats (16 bytes) from A01 to the 4-wide tail region at B02.
   * Both cursors advance by 16 bytes. No prefetch is issued in this
   * variant. Clobbers v0.
   */
  .macro COPY4x1
      ldr     q0, [A01], #16
      str     q0, [B02], #16
  .endm
  /*
   * COPY2x1
   * Copy 2 floats (8 bytes) from A01 to the 2-wide tail region at B03.
   * Both cursors advance by 8 bytes. No prefetch is issued in this
   * variant. Clobbers v0.
   */
  .macro COPY2x1
      ldr     d0, [A01], #8
      str     d0, [B03], #8
  .endm
  /*
   * COPY1x1
   * Copy one float (4 bytes) from A01 to the 1-wide tail region at B04.
   * Both cursors advance by 4 bytes. No prefetch is issued in this
   * variant. Clobbers v0.
   */
  .macro COPY1x1
      ldr     s0, [A01], #4
      str     s0, [B04], #4
  .endm
  412. /**************************************************************************************
  413. * End of macro definitions
  414. **************************************************************************************/
  /*
   * Kernel body.
   *
   * Register arguments as consumed below:
   *   M   (x0)  number of source vectors, consecutive vectors LDA apart
   *   N   (x1)  contiguous 4-byte elements copied from each vector
   *   A   (x2)  source base
   *   LDA (x3)  vector stride in elements (scaled to bytes here)
   *   B   (x4)  destination base
   * Returns 0 in x0.
   *
   * Source vectors are processed in groups of 8, then one optional group
   * of 4, of 2 and of 1 (selected by bits 2, 1 and 0 of M). Within a
   * group, N is split into 16-wide blocks plus optional 8-, 4-, 2- and
   * 1-wide tails (selected by bits 3..0 of N).
   *
   * Destination layout:
   *   16-wide blocks : B + group offset + block index * (M * 16 * 4)
   *   8-wide tails   : region starting at B + (N & -16) * M * 4
   *   4-wide tails   : region starting at B + (N & -8)  * M * 4
   *   2-wide tails   : region starting at B + (N & -4)  * M * 4
   *   1-wide tails   : region starting at B + (N & -2)  * M * 4
   * Each tail region is filled sequentially, group after group.
   *
   * Bit tests use tbz (single bit) or tst + b.eq (mask) rather than a
   * signed "less or equal" branch after tst, so the branch no longer
   * depends on the sign bit of the masked result.
   */
  PROLOGUE

  .align 5
      SAVE_REGS

      lsl     LDA, LDA, #2                    // LDA = LDA * SIZE (bytes)
      lsl     TEMP1, M, #2                    // TEMP1 = M * SIZE

      // Tail-region start addresses (see layout above).
      and     B01, N, #-16
      and     B02, N, #-8
      and     B03, N, #-4
      and     B04, N, #-2

      mul     B01, B01, TEMP1
      mul     B02, B02, TEMP1
      mul     B03, B03, TEMP1
      mul     B04, B04, TEMP1

      add     B01, B01, B
      add     B02, B02, B
      add     B03, B03, B
      add     B04, B04, B

      lsl     M8, M, #6                       // M8 = M * 16 * SIZE

  /*********************************************************************************************/
  /* Groups of 8 source vectors                                                                */
  /*********************************************************************************************/
  .Lsgemm_tcopy_L8_BEGIN:
      asr     J, M, #3                        // J = M / 8
      cmp     J, #0
      b.le    .Lsgemm_tcopy_L4_BEGIN

  .align 5
  .Lsgemm_tcopy_L8_M16_BEGIN:
      mov     A01, A                          // eight read cursors, LDA apart
      add     A02, A01, LDA
      add     A03, A02, LDA
      add     A04, A03, LDA
      add     A05, A04, LDA
      add     A06, A05, LDA
      add     A07, A06, LDA
      add     A08, A07, LDA
      add     A, A08, LDA                     // A -> first vector of next group

      mov     B00, B
      add     B, B00, #512                    // B = B + 8 * 16 * SIZE

      asr     I, N, #4                        // I = N / 16
      cmp     I, #0
      b.le    .Lsgemm_tcopy_L8_M16_40

  .align 5
  .Lsgemm_tcopy_L8_M16_20:
      COPY16x8
      subs    I, I, #1
      b.ne    .Lsgemm_tcopy_L8_M16_20

  .Lsgemm_tcopy_L8_M16_40:
      tbz     N, #3, .Lsgemm_tcopy_L8_M16_60  // skip unless N has an 8-wide tail
      COPY8x8

  .Lsgemm_tcopy_L8_M16_60:
      tbz     N, #2, .Lsgemm_tcopy_L8_M16_80  // skip unless N has a 4-wide tail
      COPY4x8

  .Lsgemm_tcopy_L8_M16_80:
      tbz     N, #1, .Lsgemm_tcopy_L8_M16_100 // skip unless N has a 2-wide tail
      COPY2x8

  .Lsgemm_tcopy_L8_M16_100:
      tbz     N, #0, .Lsgemm_tcopy_L8_M16_END // skip unless N is odd
      COPY1x8

  .Lsgemm_tcopy_L8_M16_END:
      subs    J, J, #1                        // j--
      b.ne    .Lsgemm_tcopy_L8_M16_BEGIN

  /*********************************************************************************************/
  /* Optional group of 4 source vectors                                                        */
  /*********************************************************************************************/
  .Lsgemm_tcopy_L4_BEGIN:
      tst     M, #7
      b.eq    .Lsgemm_tcopy_L999              // M is a multiple of 8: done
      tbz     M, #2, .Lsgemm_tcopy_L2_BEGIN

  .Lsgemm_tcopy_L4_M16_BEGIN:
      mov     A01, A
      add     A02, A01, LDA
      add     A03, A02, LDA
      add     A04, A03, LDA
      add     A, A04, LDA

      mov     B00, B
      add     B, B00, #256                    // B = B + 4 * 16 * SIZE

      asr     I, N, #4                        // I = N / 16
      cmp     I, #0
      b.le    .Lsgemm_tcopy_L4_M16_40

  .align 5
  .Lsgemm_tcopy_L4_M16_20:
      COPY16x4
      subs    I, I, #1
      b.ne    .Lsgemm_tcopy_L4_M16_20

  .Lsgemm_tcopy_L4_M16_40:
      tbz     N, #3, .Lsgemm_tcopy_L4_M16_60
      COPY8x4

  .Lsgemm_tcopy_L4_M16_60:
      tbz     N, #2, .Lsgemm_tcopy_L4_M16_80
      COPY4x4

  .Lsgemm_tcopy_L4_M16_80:
      tbz     N, #1, .Lsgemm_tcopy_L4_M16_100
      COPY2x4

  .Lsgemm_tcopy_L4_M16_100:
      tbz     N, #0, .Lsgemm_tcopy_L4_M16_END
      COPY1x4

  .Lsgemm_tcopy_L4_M16_END:

  /*********************************************************************************************/
  /* Optional group of 2 source vectors                                                        */
  /*********************************************************************************************/
  .Lsgemm_tcopy_L2_BEGIN:
      tst     M, #3
      b.eq    .Lsgemm_tcopy_L999              // nothing left below the 4-group
      tbz     M, #1, .Lsgemm_tcopy_L1_BEGIN

  .Lsgemm_tcopy_L2_M16_BEGIN:
      mov     A01, A
      add     A02, A01, LDA
      add     A, A02, LDA

      mov     B00, B
      add     B, B00, #128                    // B = B + 2 * 16 * SIZE

      asr     I, N, #4                        // I = N / 16
      cmp     I, #0
      b.le    .Lsgemm_tcopy_L2_M16_40

  .align 5
  .Lsgemm_tcopy_L2_M16_20:
      COPY16x2
      subs    I, I, #1
      b.ne    .Lsgemm_tcopy_L2_M16_20

  .Lsgemm_tcopy_L2_M16_40:
      tbz     N, #3, .Lsgemm_tcopy_L2_M16_60
      COPY8x2

  .Lsgemm_tcopy_L2_M16_60:
      tbz     N, #2, .Lsgemm_tcopy_L2_M16_80
      COPY4x2

  .Lsgemm_tcopy_L2_M16_80:
      tbz     N, #1, .Lsgemm_tcopy_L2_M16_100
      COPY2x2

  .Lsgemm_tcopy_L2_M16_100:
      tbz     N, #0, .Lsgemm_tcopy_L2_M16_END
      COPY1x2

  .Lsgemm_tcopy_L2_M16_END:

  /*********************************************************************************************/
  /* Optional single source vector                                                             */
  /*********************************************************************************************/
  .Lsgemm_tcopy_L1_BEGIN:
      tbz     M, #0, .Lsgemm_tcopy_L999       // M even: no single vector left

  .Lsgemm_tcopy_L1_M16_BEGIN:
      mov     A01, A                          // A01 = A
      mov     B00, B

      asr     I, N, #4                        // I = N / 16
      cmp     I, #0
      b.le    .Lsgemm_tcopy_L1_M16_40

  .align 5
  .Lsgemm_tcopy_L1_M16_20:
      COPY16x1
      subs    I, I, #1
      b.ne    .Lsgemm_tcopy_L1_M16_20

  .Lsgemm_tcopy_L1_M16_40:
      tbz     N, #3, .Lsgemm_tcopy_L1_M16_60
      COPY8x1

  .Lsgemm_tcopy_L1_M16_60:
      tbz     N, #2, .Lsgemm_tcopy_L1_M16_80
      COPY4x1

  .Lsgemm_tcopy_L1_M16_80:
      tbz     N, #1, .Lsgemm_tcopy_L1_M16_100
      COPY2x1

  .Lsgemm_tcopy_L1_M16_100:
      tbz     N, #0, .Lsgemm_tcopy_L1_M16_END
      COPY1x1

  .Lsgemm_tcopy_L1_M16_END:

  .Lsgemm_tcopy_L999:
      mov     x0, #0                          // set return value
      RESTORE_REGS
      ret

  EPILOGUE