You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dgemm_tcopy_8.S 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682
  1. /***************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used throughout this file.
   *
   *   M, N, A, LDA, B   x0-x4, read as the incoming arguments of the routine.
   *   M8                M * 8 * SIZE, the byte step applied to B01 between
   *                     8-wide chunks.
   *   A01-A08           source pointers, each LDA bytes after the previous one.
   *   B01               destination cursor for 8-wide chunks.
   *   B02, B03, B04     destination cursors for the 4-, 2- and 1-wide tails.
   *   I, J              inner and outer loop counters.
   *   TEMP1             scratch register; TEMP2 is not referenced in this file.
   *
   * I is bound to x22 rather than x18. AAPCS64 designates x18 as the platform
   * register and some platforms reserve it, so it must not serve as a loop
   * counter. x22 is callee-saved and is preserved by SAVE_REGS / RESTORE_REGS.
   *
   * Comments are kept off the define lines so that no comment text can end
   * up inside a macro expansion.
   */
  #define M       x0
  #define N       x1
  #define A       x2
  #define LDA     x3
  #define B       x4
  #define M8      x5
  #define A01     x6
  #define A02     x7
  #define A03     x8
  #define A04     x9
  #define A05     x10
  #define A06     x11
  #define A07     x12
  #define A08     x13
  #define B01     x14
  #define B02     x15
  #define B03     x16
  #define B04     x17
  #define I       x22
  #define J       x19
  #define TEMP1   x20
  #define TEMP2   x21

  /*
   * Prefetch distances in bytes. A_PREFETCH appears only in the disabled
   * prfm hints inside the COPY macros; B_PREFETCH is not referenced in this
   * file.
   */
  #define A_PREFETCH      2560
  #define B_PREFETCH      256
  53. /**************************************************************************************
  54. * Macro definitions
  55. **************************************************************************************/
  /*
   * SAVE_REGS / RESTORE_REGS
   *
   * Spill and reload the registers a callee must preserve under AAPCS64:
   * the low 64 bits of v8-v15 (d8-d15) and x19-x28. The two macros share
   * one frame layout of 9 slots of 16 bytes, so sp stays 16-byte aligned.
   *
   * x18 is deliberately neither stored nor reloaded: it is the platform
   * register, and writing it back in the epilogue is not allowed on
   * platforms that reserve it. d16/d17 are not spilled either, because
   * v16-v31 are caller-saved.
   */
  .macro SAVE_REGS
          sub     sp, sp, #(9 * 16)
          stp     d8,  d9,  [sp, #(0 * 16)]
          stp     d10, d11, [sp, #(1 * 16)]
          stp     d12, d13, [sp, #(2 * 16)]
          stp     d14, d15, [sp, #(3 * 16)]
          stp     x19, x20, [sp, #(4 * 16)]
          stp     x21, x22, [sp, #(5 * 16)]
          stp     x23, x24, [sp, #(6 * 16)]
          stp     x25, x26, [sp, #(7 * 16)]
          stp     x27, x28, [sp, #(8 * 16)]
  .endm

  .macro RESTORE_REGS
          ldp     d8,  d9,  [sp, #(0 * 16)]
          ldp     d10, d11, [sp, #(1 * 16)]
          ldp     d12, d13, [sp, #(2 * 16)]
          ldp     d14, d15, [sp, #(3 * 16)]
          ldp     x19, x20, [sp, #(4 * 16)]
          ldp     x21, x22, [sp, #(5 * 16)]
          ldp     x23, x24, [sp, #(6 * 16)]
          ldp     x25, x26, [sp, #(7 * 16)]
          ldp     x27, x28, [sp, #(8 * 16)]
          add     sp, sp, #(9 * 16)
  .endm
  84. /*************************************************************************************************************************/
  /*
   * COPY8x8: take 64 bytes (8 doubles) from each of A01..A08 and store them
   * back to back as one 512-byte tile starting at B01.
   *   - every source pointer advances by 64 bytes
   *   - B01 advances by M8
   *   - TEMP1 is used as the store cursor and ends at old B01 + 512
   *   - v0-v31 are overwritten
   * The prfm PLDL1KEEP hints at A_PREFETCH on the source pointers are left
   * disabled.
   */
  .macro COPY8x8
          mov     TEMP1, B01

          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A01], #32
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [TEMP1], #64

          ldp     q4, q5, [A02], #32
          ldp     q6, q7, [A02], #32
          st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1], #64

          ldp     q8, q9, [A03], #32
          ldp     q10, q11, [A03], #32
          st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1], #64

          ldp     q12, q13, [A04], #32
          ldp     q14, q15, [A04], #32
          st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1], #64

          ldp     q16, q17, [A05], #32
          ldp     q18, q19, [A05], #32
          st1     {v16.2d, v17.2d, v18.2d, v19.2d}, [TEMP1], #64

          ldp     q20, q21, [A06], #32
          ldp     q22, q23, [A06], #32
          st1     {v20.2d, v21.2d, v22.2d, v23.2d}, [TEMP1], #64

          ldp     q24, q25, [A07], #32
          ldp     q26, q27, [A07], #32
          st1     {v24.2d, v25.2d, v26.2d, v27.2d}, [TEMP1], #64

          ldp     q28, q29, [A08], #32
          ldp     q30, q31, [A08], #32
          st1     {v28.2d, v29.2d, v30.2d, v31.2d}, [TEMP1], #64

          add     B01, B01, M8                    // next 8-wide chunk
  .endm
  /*
   * COPY4x8: take 32 bytes (4 doubles) from each of A01..A08 and append the
   * resulting 256 bytes at B02.
   *   - every source pointer advances by 32 bytes
   *   - B02 advances by 256 bytes
   *   - v0-v15 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY4x8
          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A02], #32
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [B02], #64

          ldp     q4, q5, [A03], #32
          ldp     q6, q7, [A04], #32
          st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [B02], #64

          ldp     q8, q9, [A05], #32
          ldp     q10, q11, [A06], #32
          st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [B02], #64

          ldp     q12, q13, [A07], #32
          ldp     q14, q15, [A08], #32
          st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [B02], #64
  .endm
  /*
   * COPY2x8: take 16 bytes (2 doubles) from each of A01..A08 and append the
   * resulting 128 bytes at B03.
   *   - every source pointer advances by 16 bytes
   *   - B03 advances by 128 bytes
   *   - v0-v7 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY2x8
          ldr     q0, [A01], #16
          ldr     q1, [A02], #16
          ldr     q2, [A03], #16
          ldr     q3, [A04], #16
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [B03], #64

          ldr     q4, [A05], #16
          ldr     q5, [A06], #16
          ldr     q6, [A07], #16
          ldr     q7, [A08], #16
          st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [B03], #64
  .endm
  /*
   * COPY1x8: take one double from each of A01..A08 and append the resulting
   * 64 bytes at B04.
   *   - every source pointer advances by 8 bytes
   *   - B04 advances by 64 bytes
   *   - v0-v7 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY1x8
          ldr     d0, [A01], #8
          ldr     d1, [A02], #8
          ldr     d2, [A03], #8
          ldr     d3, [A04], #8
          st1     {v0.1d, v1.1d, v2.1d, v3.1d}, [B04], #32

          ldr     d4, [A05], #8
          ldr     d5, [A06], #8
          ldr     d6, [A07], #8
          ldr     d7, [A08], #8
          st1     {v4.1d, v5.1d, v6.1d, v7.1d}, [B04], #32
  .endm
  198. /*************************************************************************************************************************/
  /*
   * COPY8x4: take 64 bytes (8 doubles) from each of A01..A04 and store them
   * back to back as one 256-byte tile starting at B01.
   *   - every source pointer advances by 64 bytes
   *   - B01 advances by M8
   *   - TEMP1 is used as the store cursor and ends at old B01 + 256
   *   - v0-v15 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY8x4
          mov     TEMP1, B01

          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A01], #32
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [TEMP1], #64

          ldp     q4, q5, [A02], #32
          ldp     q6, q7, [A02], #32
          st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1], #64

          ldp     q8, q9, [A03], #32
          ldp     q10, q11, [A03], #32
          st1     {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1], #64

          ldp     q12, q13, [A04], #32
          ldp     q14, q15, [A04], #32
          st1     {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1], #64

          add     B01, B01, M8                    // next 8-wide chunk
  .endm
  /*
   * COPY4x4: take 32 bytes (4 doubles) from each of A01..A04 and append the
   * resulting 128 bytes at B02.
   *   - every source pointer advances by 32 bytes
   *   - B02 advances by 128 bytes
   *   - v0-v7 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY4x4
          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A02], #32
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [B02], #64

          ldp     q4, q5, [A03], #32
          ldp     q6, q7, [A04], #32
          st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [B02], #64
  .endm
  /*
   * COPY2x4: take 16 bytes (2 doubles) from each of A01..A04 and append the
   * resulting 64 bytes at B03.
   *   - every source pointer advances by 16 bytes
   *   - B03 advances by 64 bytes
   *   - v0-v3 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY2x4
          ldr     q0, [A01], #16
          ldr     q1, [A02], #16
          ldr     q2, [A03], #16
          ldr     q3, [A04], #16
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [B03], #64
  .endm
  /*
   * COPY1x4: take one double from each of A01..A04 and append the resulting
   * 32 bytes at B04.
   *   - every source pointer advances by 8 bytes
   *   - B04 advances by 32 bytes
   *   - v0-v3 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY1x4
          ldr     d0, [A01], #8
          ldr     d1, [A02], #8
          ldr     d2, [A03], #8
          ldr     d3, [A04], #8
          st1     {v0.1d, v1.1d, v2.1d, v3.1d}, [B04], #32
  .endm
  260. /*************************************************************************************************************************/
  /*
   * COPY8x2: take 64 bytes (8 doubles) from A01 and from A02 and store them
   * back to back as one 128-byte tile starting at B01.
   *   - both source pointers advance by 64 bytes
   *   - B01 advances by M8
   *   - TEMP1 is used as the store cursor and ends at old B01 + 64
   *   - v0-v7 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY8x2
          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A01], #32
          ldp     q4, q5, [A02], #32
          ldp     q6, q7, [A02], #32

          mov     TEMP1, B01
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [TEMP1], #64
          st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1]

          add     B01, B01, M8                    // next 8-wide chunk
  .endm
  /*
   * COPY4x2: take 32 bytes (4 doubles) from A01 and from A02 and append the
   * resulting 64 bytes at B02.
   *   - both source pointers advance by 32 bytes
   *   - B02 advances by 64 bytes
   *   - v0-v3 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY4x2
          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A02], #32
          st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [B02], #64
  .endm
  /*
   * COPY2x2: take 16 bytes (2 doubles) from A01 and from A02 and append the
   * resulting 32 bytes at B03.
   *   - both source pointers advance by 16 bytes
   *   - B03 advances by 32 bytes
   *   - v0 and v1 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY2x2
          ldr     q0, [A01], #16
          ldr     q1, [A02], #16
          stp     q0, q1, [B03], #32
  .endm
  /*
   * COPY1x2: take one double from A01 and one from A02 and append the pair
   * (16 bytes) at B04.
   *   - both source pointers advance by 8 bytes
   *   - B04 advances by 16 bytes
   *   - v0 and v1 are overwritten
   * The prfm PLDL1KEEP hints on the source pointers are left disabled.
   */
  .macro COPY1x2
          ldr     d0, [A01], #8
          ldr     d1, [A02], #8
          stp     d0, d1, [B04], #16
  .endm
  297. /*************************************************************************************************************************/
  /*
   * COPY8x1: copy 64 bytes (8 doubles) from A01 to B01.
   *   - A01 advances by 64 bytes
   *   - B01 advances by M8
   *   - TEMP1 is used as the store cursor and ends at old B01 + 32
   *   - v0-v3 are overwritten
   * The prfm PLDL1KEEP hint on A01 is left disabled.
   */
  .macro COPY8x1
          ldp     q0, q1, [A01], #32
          ldp     q2, q3, [A01], #32

          mov     TEMP1, B01
          stp     q0, q1, [TEMP1], #32
          stp     q2, q3, [TEMP1]

          add     B01, B01, M8                    // next 8-wide chunk
  .endm
  /*
   * COPY4x1: copy 32 bytes (4 doubles) from A01 to B02.
   *   - A01 and B02 both advance by 32 bytes
   *   - v0 and v1 are overwritten
   * The prfm PLDL1KEEP hint on A01 is left disabled.
   */
  .macro COPY4x1
          ldp     q0, q1, [A01], #32
          stp     q0, q1, [B02], #32
  .endm
  /*
   * COPY2x1: copy 16 bytes (2 doubles) from A01 to B03.
   *   - A01 and B03 both advance by 16 bytes
   *   - v0 is overwritten
   * The prfm PLDL1KEEP hint on A01 is left disabled.
   */
  .macro COPY2x1
          ldr     q0, [A01], #16
          str     q0, [B03], #16
  .endm
  /*
   * COPY1x1: copy one double (8 bytes) from A01 to B04.
   *   - A01 and B04 both advance by 8 bytes
   *   - v0 is overwritten
   * The prfm PLDL1KEEP hint on A01 is left disabled.
   */
  .macro COPY1x1
          ldr     d0, [A01], #8
          str     d0, [B04], #8
  .endm
  325. /**************************************************************************************
  326. * End of macro definitions
  327. **************************************************************************************/
  /*
   * Packing routine entry point. PROLOGUE and EPILOGUE come from common.h.
   *
   * Inputs (register aliases are defined at the top of this file):
   *   M   (x0)  number of source rows
   *   N   (x1)  number of elements taken from each row
   *   A   (x2)  source pointer; consecutive rows are LDA elements apart
   *   LDA (x3)  row stride in elements, converted to bytes below
   *   B   (x4)  destination pointer
   * Elements are 8 bytes wide, hence the shifts by 3.
   * Output: x0 = 0.
   *
   * Rows are handled in panels: M >> 3 panels of 8 rows, then one panel of
   * 4, 2 and 1 rows when bits 2, 1 and 0 of M are set. Within a panel,
   * N >> 3 chunks of 8 columns are written through B01 (which steps by M8
   * between chunks). Bits 2, 1 and 0 of N then select a 4-, 2- and
   * 1-column tail, written through B02, B03 and B04.
   *
   * Single-bit tests use tbz. Multi-bit tests use tst followed by beq,
   * since the masked result is only ever zero or positive.
   */
  PROLOGUE

  .align 5
          SAVE_REGS

          lsl     LDA, LDA, #3                    // LDA = LDA * SIZE
          lsl     TEMP1, M, #3                    // TEMP1 = M * SIZE

          // Start of the 4-, 2- and 1-column tail areas inside B.
          and     B02, N, #-8
          and     B03, N, #-4
          and     B04, N, #-2

          mul     B02, B02, TEMP1
          mul     B03, B03, TEMP1
          mul     B04, B04, TEMP1

          add     B02, B02, B                     // B02 = B + (N & -8) * M * SIZE
          add     B03, B03, B                     // B03 = B + (N & -4) * M * SIZE
          add     B04, B04, B                     // B04 = B + (N & -2) * M * SIZE

          lsl     M8, M, #6                       // M8 = M * 8 * SIZE

  /*********************************************************************************************/
  /* Panels of 8 rows                                                                          */
  /*********************************************************************************************/
  .Ldgemm_tcopy_L8_BEGIN:
          asr     J, M, #3                        // J = M / 8
          cmp     J, #0
          ble     .Ldgemm_tcopy_L4_BEGIN

  .align 5
  .Ldgemm_tcopy_L8_M8_BEGIN:
          mov     A01, A
          add     A02, A01, LDA
          add     A03, A02, LDA
          add     A04, A03, LDA
          add     A05, A04, LDA
          add     A06, A05, LDA
          add     A07, A06, LDA
          add     A08, A07, LDA
          add     A, A08, LDA                     // A = first row of the next panel

          mov     B01, B
          add     B, B01, #512                    // B = B + 64 * SIZE

          asr     I, N, #3                        // I = N / 8
          cmp     I, #0
          ble     .Ldgemm_tcopy_L8_M8_40

  .align 5
  .Ldgemm_tcopy_L8_M8_20:
          COPY8x8
          subs    I, I, #1
          bne     .Ldgemm_tcopy_L8_M8_20

  .Ldgemm_tcopy_L8_M8_40:
          tbz     N, #2, .Ldgemm_tcopy_L8_M8_60   // skip unless N & 4
          COPY4x8

  .Ldgemm_tcopy_L8_M8_60:
          tbz     N, #1, .Ldgemm_tcopy_L8_M8_80   // skip unless N & 2
          COPY2x8

  .Ldgemm_tcopy_L8_M8_80:
          tbz     N, #0, .Ldgemm_tcopy_L8_M8_END  // skip unless N & 1
          COPY1x8

  .Ldgemm_tcopy_L8_M8_END:
          subs    J, J, #1                        // j--
          bne     .Ldgemm_tcopy_L8_M8_BEGIN

  /*********************************************************************************************/
  /* Panel of 4 rows                                                                           */
  /*********************************************************************************************/
  .Ldgemm_tcopy_L4_BEGIN:
          tst     M, #7
          beq     .Ldgemm_tcopy_L999              // no rows left over

          tbz     M, #2, .Ldgemm_tcopy_L2_BEGIN   // skip unless M & 4

  .Ldgemm_tcopy_L4_M8_BEGIN:
          mov     A01, A
          add     A02, A01, LDA
          add     A03, A02, LDA
          add     A04, A03, LDA
          add     A, A04, LDA

          mov     B01, B
          add     B, B01, #256                    // B = B + 32 * SIZE

          asr     I, N, #3                        // I = N / 8
          cmp     I, #0
          ble     .Ldgemm_tcopy_L4_M8_40

  .align 5
  .Ldgemm_tcopy_L4_M8_20:
          COPY8x4
          subs    I, I, #1
          bne     .Ldgemm_tcopy_L4_M8_20

  .Ldgemm_tcopy_L4_M8_40:
          tbz     N, #2, .Ldgemm_tcopy_L4_M8_60   // skip unless N & 4
          COPY4x4

  .Ldgemm_tcopy_L4_M8_60:
          tbz     N, #1, .Ldgemm_tcopy_L4_M8_80   // skip unless N & 2
          COPY2x4

  .Ldgemm_tcopy_L4_M8_80:
          tbz     N, #0, .Ldgemm_tcopy_L4_M8_END  // skip unless N & 1
          COPY1x4

  .Ldgemm_tcopy_L4_M8_END:

  /*********************************************************************************************/
  /* Panel of 2 rows                                                                           */
  /*********************************************************************************************/
  .Ldgemm_tcopy_L2_BEGIN:
          tst     M, #3
          beq     .Ldgemm_tcopy_L999              // no rows left over

          tbz     M, #1, .Ldgemm_tcopy_L1_BEGIN   // skip unless M & 2

  .Ldgemm_tcopy_L2_M8_BEGIN:
          mov     A01, A
          add     A02, A01, LDA
          add     A, A02, LDA

          mov     B01, B
          add     B, B01, #128                    // B = B + 16 * SIZE

          asr     I, N, #3                        // I = N / 8
          cmp     I, #0
          ble     .Ldgemm_tcopy_L2_M8_40

  .align 5
  .Ldgemm_tcopy_L2_M8_20:
          COPY8x2
          subs    I, I, #1
          bne     .Ldgemm_tcopy_L2_M8_20

  .Ldgemm_tcopy_L2_M8_40:
          tbz     N, #2, .Ldgemm_tcopy_L2_M8_60   // skip unless N & 4
          COPY4x2

  .Ldgemm_tcopy_L2_M8_60:
          tbz     N, #1, .Ldgemm_tcopy_L2_M8_80   // skip unless N & 2
          COPY2x2

  .Ldgemm_tcopy_L2_M8_80:
          tbz     N, #0, .Ldgemm_tcopy_L2_M8_END  // skip unless N & 1
          COPY1x2

  .Ldgemm_tcopy_L2_M8_END:

  /*********************************************************************************************/
  /* Final single row                                                                          */
  /*********************************************************************************************/
  .Ldgemm_tcopy_L1_BEGIN:
          tbz     M, #0, .Ldgemm_tcopy_L999       // skip unless M & 1

  .Ldgemm_tcopy_L1_M8_BEGIN:
          mov     A01, A                          // A01 = A
          mov     B01, B

          asr     I, N, #3                        // I = N / 8
          cmp     I, #0
          ble     .Ldgemm_tcopy_L1_M8_40

  .align 5
  .Ldgemm_tcopy_L1_M8_20:
          COPY8x1
          subs    I, I, #1
          bne     .Ldgemm_tcopy_L1_M8_20

  .Ldgemm_tcopy_L1_M8_40:
          tbz     N, #2, .Ldgemm_tcopy_L1_M8_60   // skip unless N & 4
          COPY4x1

  .Ldgemm_tcopy_L1_M8_60:
          tbz     N, #1, .Ldgemm_tcopy_L1_M8_80   // skip unless N & 2
          COPY2x1

  .Ldgemm_tcopy_L1_M8_80:
          tbz     N, #0, .Ldgemm_tcopy_L1_M8_END  // skip unless N & 1
          COPY1x1

  .Ldgemm_tcopy_L1_M8_END:

  .Ldgemm_tcopy_L999:
          mov     x0, #0                          // set return value
          RESTORE_REGS
          ret

  EPILOGUE