You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

strmm_kernel_16x4.S 47 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0 /* argument bm */
  32. #define origN x1 /* argument bn */
  33. #define origK x2 /* argument bk */
  34. #define origPA x3 /* argument ba */
  35. #define origPB x4 /* argument bb */
  36. #define pC x5 /* argument C */
  37. #define LDC x6 /* argument ldc; the prologue scales it by 4 (bytes per float) */
  38. #define offset x7 /* argument offset */
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  48. #define alpha w17 /* raw bits of alpha, copied from s0 in the prologue; SAVE macros fmov it into alpha0 */
  49. #define temp x21 /* formerly x18, the AAPCS64 platform register (reserved on some OSes); x21 is spilled by the prologue together with x20 */
  50. #define tempOffset x19
  51. #define tempK x20
  52. #define alpha0 s10
  53. #define alphaV0 v10.s[0]
  54. #define A_PRE_SIZE 2560 /* byte offset used for prfm relative to pA */
  55. #define B_PRE_SIZE 224 /* byte offset used for prfm relative to pB */
  56. #define C_PRE_SIZE 160 /* byte offset used for prfm relative to the pCRow pointers */
  57. // 00 origM
  58. // 01 origN
  59. // 02 origK
  60. // 03 origPA
  61. // 04 origPB
  62. // 05 pC
  63. // 06 origLDC -> LDC
  64. // 07 offset
  65. // 08 counterL
  66. // 09 counterI
  67. // 10 counterJ
  68. // 11 pB
  69. // 12 pCRow0
  70. // 13 pCRow1
  71. // 14 pCRow2
  72. // 15 pCRow3
  73. // 16 pA
  74. // 17 alpha
  75. // 18 platform register, no alias
  76. // 19 must save tempOffset
  77. // 20 must save tempK
  78. // 21 must save temp
  79. // 22 must save
  80. // 23 must save
  81. // 24 must save
  82. // 25 must save
  83. // 26 must save
  84. // 27 must save
  85. // 28 must save
  86. // 29 frame
  87. // 30 link
  88. // 31 sp
  89. //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
  90. //v01 pA0_04, pA0_05, pA0_06, pA0_07
  91. //v02 pA0_08, pA0_09, pA0_10, pA0_11
  92. //v03 pA0_12, pA0_13, pA0_14, pA0_15
  93. //v04 pA1_00, pA1_01, pA1_02, pA1_03
  94. //v05 pA1_04, pA1_05, pA1_06, pA1_07
  95. //v06 pA1_08, pA1_09, pA1_10, pA1_11
  96. //v07 pA1_12, pA1_13, pA1_14, pA1_15
  97. //v08 must save pB00
  98. //v09 must save pB01
  99. //v10 must save pB02, alpha0 (the SAVE macros move alpha into s10)
  100. //v11 must save pB03
  101. //v12 must save pB10
  102. //v13 must save pB11
  103. //v14 must save pB12
  104. //v15 must save pB13
  105. //v16 must save C00, C01, C02, C03
  106. //v17 must save C04, C05, C06, C07
  107. //v18 C08, C09, C10, C11
  108. //v19 C12, C13, C14, C15
  109. //v20 C16, C17, C18, C19
  110. //v21 C20, C21, C22, C23
  111. //v22 C24, C25, C26, C27
  112. //v23 C28, C29, C30, C31
  113. //v24 C32, C33, C34, C35
  114. //v25 C36, C37, C38, C39
  115. //v26 C40, C41, C42, C43
  116. //v27 C44, C45, C46, C47
  117. //v28 C48, C49, C50, C51
  118. //v29 C52, C53, C54, C55
  119. //v30 C56, C57, C58, C59
  120. //v31 C60, C61, C62, C63
  121. /*******************************************************************************
  122. * Macro definitions
  123. *******************************************************************************/
  124. .macro INIT16x4 /* clear the sixteen accumulators v16-v31 of the 16x4 tile */
  125. movi v16.4s, #0
  126. movi v17.4s, #0
  127. movi v18.4s, #0
  128. movi v19.4s, #0
  129. movi v20.4s, #0
  130. movi v21.4s, #0
  131. movi v22.4s, #0
  132. movi v23.4s, #0
  133. movi v24.4s, #0
  134. movi v25.4s, #0
  135. movi v26.4s, #0
  136. movi v27.4s, #0
  137. movi v28.4s, #0
  138. movi v29.4s, #0
  139. movi v30.4s, #0
  140. movi v31.4s, #0
  141. .endm
  142. .macro KERNEL16x4_I /* first step of a pipelined run: fmul (not fmla) initialises v16-v31, then operands for the next step are preloaded */
  143. ldp q0, q1, [pA], #32 // A values 0-7 of this step
  144. ldp s8, s9, [pB], #8 // B values 0-1 of this step
  145. fmul v16.4s, v0.4s, v8.s[0]
  146. fmul v20.4s, v0.4s, v9.s[0]
  147. ldp s10, s11, [pB], #8 // B values 2-3 of this step
  148. fmul v24.4s, v0.4s, v10.s[0]
  149. fmul v28.4s, v0.4s, v11.s[0]
  150. ldp q2, q3, [pA], #32 // A values 8-15 of this step
  151. fmul v17.4s, v1.4s, v8.s[0]
  152. fmul v21.4s, v1.4s, v9.s[0]
  153. ldp q4, q5, [pA], #32 // preload A values 0-7 for the next step (read by _M2 / _E)
  154. fmul v25.4s, v1.4s, v10.s[0]
  155. fmul v29.4s, v1.4s, v11.s[0]
  156. ldp s12, s13, [pB], #8 // preload B values 0-1 for the next step
  157. fmul v18.4s, v2.4s, v8.s[0]
  158. fmul v22.4s, v2.4s, v9.s[0]
  159. ldp s14, s15, [pB], #8 // preload B values 2-3 for the next step
  160. fmul v19.4s, v3.4s, v8.s[0]
  161. fmul v23.4s, v3.4s, v9.s[0]
  162. ldp q6, q7, [pA], #32 // preload A values 8-15 for the next step
  163. fmul v26.4s, v2.4s, v10.s[0]
  164. fmul v30.4s, v2.4s, v11.s[0]
  165. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  166. fmul v27.4s, v3.4s, v10.s[0]
  167. fmul v31.4s, v3.4s, v11.s[0]
  168. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  169. .endm
  170. .macro KERNEL16x4_M1 /* pipelined step: accumulate with v0-v3 and v8-v11 while loading v4-v7 and v12-v15 for the following _M2 / _E */
  171. fmla v16.4s, v0.4s, v8.s[0]
  172. fmla v17.4s, v1.4s, v8.s[0]
  173. ldp q4, q5, [pA], #32 // next A values 0-7
  174. fmla v18.4s, v2.4s, v8.s[0]
  175. fmla v19.4s, v3.4s, v8.s[0]
  176. fmla v20.4s, v0.4s, v9.s[0]
  177. fmla v21.4s, v1.4s, v9.s[0]
  178. ldp s12, s13, [pB], #8 // next B values 0-1
  179. fmla v22.4s, v2.4s, v9.s[0]
  180. fmla v23.4s, v3.4s, v9.s[0]
  181. ldp s14, s15, [pB], #8 // next B values 2-3
  182. fmla v24.4s, v0.4s, v10.s[0]
  183. fmla v25.4s, v1.4s, v10.s[0]
  184. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  185. fmla v26.4s, v2.4s, v10.s[0]
  186. fmla v27.4s, v3.4s, v10.s[0]
  187. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  188. fmla v28.4s, v0.4s, v11.s[0]
  189. fmla v29.4s, v1.4s, v11.s[0]
  190. ldp q6, q7, [pA], #32 // next A values 8-15
  191. fmla v30.4s, v2.4s, v11.s[0]
  192. fmla v31.4s, v3.4s, v11.s[0]
  193. .endm
  194. .macro KERNEL16x4_M2 /* pipelined step: accumulate with v4-v7 and v12-v15 while loading v0-v3 and v8-v11 for the following _M1 */
  195. fmla v16.4s, v4.4s, v12.s[0]
  196. fmla v17.4s, v5.4s, v12.s[0]
  197. ldp q0, q1, [pA], #32 // next A values 0-7
  198. fmla v18.4s, v6.4s, v12.s[0]
  199. fmla v19.4s, v7.4s, v12.s[0]
  200. fmla v20.4s, v4.4s, v13.s[0]
  201. fmla v21.4s, v5.4s, v13.s[0]
  202. ldp s8, s9, [pB], #8 // next B values 0-1
  203. fmla v22.4s, v6.4s, v13.s[0]
  204. fmla v23.4s, v7.4s, v13.s[0]
  205. ldp s10, s11, [pB], #8 // next B values 2-3
  206. fmla v24.4s, v4.4s, v14.s[0]
  207. fmla v25.4s, v5.4s, v14.s[0]
  208. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  209. fmla v26.4s, v6.4s, v14.s[0]
  210. fmla v27.4s, v7.4s, v14.s[0]
  211. ldp q2, q3, [pA], #32 // next A values 8-15
  212. fmla v28.4s, v4.4s, v15.s[0]
  213. fmla v29.4s, v5.4s, v15.s[0]
  214. fmla v30.4s, v6.4s, v15.s[0]
  215. fmla v31.4s, v7.4s, v15.s[0]
  216. .endm
  217. .macro KERNEL16x4_E /* last step of a pipelined run: consume the preloaded v4-v7 and v12-v15; no A or B loads are issued */
  218. fmla v16.4s, v4.4s, v12.s[0]
  219. fmla v20.4s, v4.4s, v13.s[0]
  220. fmla v24.4s, v4.4s, v14.s[0]
  221. fmla v28.4s, v4.4s, v15.s[0]
  222. fmla v17.4s, v5.4s, v12.s[0]
  223. fmla v21.4s, v5.4s, v13.s[0]
  224. fmla v25.4s, v5.4s, v14.s[0]
  225. fmla v29.4s, v5.4s, v15.s[0]
  226. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  227. fmla v18.4s, v6.4s, v12.s[0]
  228. fmla v22.4s, v6.4s, v13.s[0]
  229. fmla v26.4s, v6.4s, v14.s[0]
  230. fmla v30.4s, v6.4s, v15.s[0]
  231. fmla v19.4s, v7.4s, v12.s[0]
  232. fmla v23.4s, v7.4s, v13.s[0]
  233. fmla v27.4s, v7.4s, v14.s[0]
  234. fmla v31.4s, v7.4s, v15.s[0]
  235. .endm
  236. .macro KERNEL16x4_SUB /* self-contained single step: load 16 A and 4 B floats and accumulate into v16-v31 (accumulators must already be valid) */
  237. ldp q0, q1, [pA], #32 // A values 0-7
  238. ldp s8, s9, [pB], #8 // B values 0-1
  239. fmla v16.4s, v0.4s, v8.s[0]
  240. fmla v20.4s, v0.4s, v9.s[0]
  241. ldp s10, s11, [pB], #8 // B values 2-3
  242. fmla v24.4s, v0.4s, v10.s[0]
  243. fmla v28.4s, v0.4s, v11.s[0]
  244. ldp q2, q3, [pA], #32 // A values 8-15
  245. fmla v17.4s, v1.4s, v8.s[0]
  246. fmla v21.4s, v1.4s, v9.s[0]
  247. fmla v25.4s, v1.4s, v10.s[0]
  248. fmla v29.4s, v1.4s, v11.s[0]
  249. fmla v18.4s, v2.4s, v8.s[0]
  250. fmla v22.4s, v2.4s, v9.s[0]
  251. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  252. fmla v19.4s, v3.4s, v8.s[0]
  253. fmla v23.4s, v3.4s, v9.s[0]
  254. fmla v26.4s, v2.4s, v10.s[0]
  255. fmla v30.4s, v2.4s, v11.s[0]
  256. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  257. fmla v27.4s, v3.4s, v10.s[0]
  258. fmla v31.4s, v3.4s, v11.s[0]
  259. .endm
  260. .macro SAVE16x4 /* scale v16-v31 by alpha and store 16 floats at each of pCRow0-pCRow3; C is written without being read; each pointer advances 64 bytes */
  261. fmov alpha0, alpha // alpha bits from w17 into s10 (overwrites the B value held in v10)
  262. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  263. fmul v0.4s, v16.4s, alphaV0 // v0-v7 are reused as scratch for the scaled results
  264. fmul v1.4s, v17.4s, alphaV0
  265. stp q0, q1, [pCRow0]
  266. add pCRow0, pCRow0, #32
  267. fmul v2.4s, v18.4s, alphaV0
  268. fmul v3.4s, v19.4s, alphaV0
  269. stp q2, q3, [pCRow0]
  270. add pCRow0, pCRow0, #32
  271. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  272. fmul v4.4s, v20.4s, alphaV0
  273. fmul v5.4s, v21.4s, alphaV0
  274. stp q4, q5, [pCRow1]
  275. add pCRow1, pCRow1, #32
  276. fmul v6.4s, v22.4s, alphaV0
  277. fmul v7.4s, v23.4s, alphaV0
  278. stp q6, q7, [pCRow1]
  279. add pCRow1, pCRow1, #32
  280. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  281. fmul v0.4s, v24.4s, alphaV0
  282. fmul v1.4s, v25.4s, alphaV0
  283. stp q0, q1, [pCRow2]
  284. add pCRow2, pCRow2, #32
  285. fmul v2.4s, v26.4s, alphaV0
  286. fmul v3.4s, v27.4s, alphaV0
  287. stp q2, q3, [pCRow2]
  288. add pCRow2, pCRow2, #32
  289. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  290. fmul v4.4s, v28.4s, alphaV0
  291. fmul v5.4s, v29.4s, alphaV0
  292. stp q4, q5, [pCRow3]
  293. add pCRow3, pCRow3, #32
  294. fmul v6.4s, v30.4s, alphaV0
  295. fmul v7.4s, v31.4s, alphaV0
  296. stp q6, q7, [pCRow3]
  297. add pCRow3, pCRow3, #32
  298. .endm
  299. /******************************************************************************/
  300. .macro INIT8x4 /* clear the eight accumulators of the 8x4 tile: v16/v17, v20/v21, v24/v25, v28/v29 */
  301. movi v16.4s, #0
  302. movi v17.4s, #0
  303. movi v20.4s, #0
  304. movi v21.4s, #0
  305. movi v24.4s, #0
  306. movi v25.4s, #0
  307. movi v28.4s, #0
  308. movi v29.4s, #0
  309. .endm
  310. .macro KERNEL8x4_I /* first step of a pipelined run: fmul initialises the eight accumulators, then v4/v5 and v12-v15 are preloaded for the next step */
  311. ldp s8, s9, [pB], #8 // B values 0-1 of this step
  312. ldp s10, s11, [pB], #8 // B values 2-3 of this step
  313. ldr q0, [pA], #16 // A values 0-3 of this step
  314. ldr q1, [pA], #16 // A values 4-7 of this step
  315. fmul v16.4s, v0.4s, v8.s[0]
  316. fmul v17.4s, v1.4s, v8.s[0]
  317. fmul v20.4s, v0.4s, v9.s[0]
  318. fmul v21.4s, v1.4s, v9.s[0]
  319. fmul v24.4s, v0.4s, v10.s[0]
  320. fmul v25.4s, v1.4s, v10.s[0]
  321. fmul v28.4s, v0.4s, v11.s[0]
  322. fmul v29.4s, v1.4s, v11.s[0]
  323. ldp s12, s13, [pB], #8 // preload B for the next step
  324. ldp s14, s15, [pB], #8
  325. ldr q4, [pA], #16 // preload A for the next step
  326. ldr q5, [pA], #16
  327. .endm
  328. .macro KERNEL8x4_M1 /* pipelined step: accumulate with v0/v1 and v8-v11, then load v4/v5 and v12-v15 for the following _M2 / _E */
  329. fmla v16.4s, v0.4s, v8.s[0]
  330. fmla v17.4s, v1.4s, v8.s[0]
  331. fmla v20.4s, v0.4s, v9.s[0]
  332. fmla v21.4s, v1.4s, v9.s[0]
  333. fmla v24.4s, v0.4s, v10.s[0]
  334. fmla v25.4s, v1.4s, v10.s[0]
  335. fmla v28.4s, v0.4s, v11.s[0]
  336. fmla v29.4s, v1.4s, v11.s[0]
  337. ldp s12, s13, [pB], #8 // next B values
  338. ldp s14, s15, [pB], #8
  339. ldr q4, [pA], #16 // next A values
  340. ldr q5, [pA], #16
  341. .endm
  342. .macro KERNEL8x4_M2 /* pipelined step: accumulate with v4/v5 and v12-v15, then load v0/v1 and v8-v11 for the following _M1 */
  343. fmla v16.4s, v4.4s, v12.s[0]
  344. fmla v17.4s, v5.4s, v12.s[0]
  345. fmla v20.4s, v4.4s, v13.s[0]
  346. fmla v21.4s, v5.4s, v13.s[0]
  347. fmla v24.4s, v4.4s, v14.s[0]
  348. fmla v25.4s, v5.4s, v14.s[0]
  349. fmla v28.4s, v4.4s, v15.s[0]
  350. fmla v29.4s, v5.4s, v15.s[0]
  351. ldp s8, s9, [pB], #8 // next B values
  352. ldp s10, s11, [pB], #8
  353. ldr q0, [pA], #16 // next A values
  354. ldr q1, [pA], #16
  355. .endm
  356. .macro KERNEL8x4_E /* last step of a pipelined run: consume the preloaded v4/v5 and v12-v15; no loads are issued */
  357. fmla v16.4s, v4.4s, v12.s[0]
  358. fmla v17.4s, v5.4s, v12.s[0]
  359. fmla v20.4s, v4.4s, v13.s[0]
  360. fmla v21.4s, v5.4s, v13.s[0]
  361. fmla v24.4s, v4.4s, v14.s[0]
  362. fmla v25.4s, v5.4s, v14.s[0]
  363. fmla v28.4s, v4.4s, v15.s[0]
  364. fmla v29.4s, v5.4s, v15.s[0]
  365. .endm
  366. .macro KERNEL8x4_SUB /* self-contained single step: load 8 A and 4 B floats and accumulate (accumulators must already be valid) */
  367. ldp s8, s9, [pB], #8 // B values 0-1
  368. ldp s10, s11, [pB], #8 // B values 2-3
  369. ldr q0, [pA], #16 // A values 0-3
  370. ldr q1, [pA], #16 // A values 4-7
  371. fmla v16.4s, v0.4s, v8.s[0]
  372. fmla v17.4s, v1.4s, v8.s[0]
  373. fmla v20.4s, v0.4s, v9.s[0]
  374. fmla v21.4s, v1.4s, v9.s[0]
  375. fmla v24.4s, v0.4s, v10.s[0]
  376. fmla v25.4s, v1.4s, v10.s[0]
  377. fmla v28.4s, v0.4s, v11.s[0]
  378. fmla v29.4s, v1.4s, v11.s[0]
  379. .endm
  380. .macro SAVE8x4 /* scale the 8x4 accumulators by alpha and store 8 floats at each of pCRow0-pCRow3; each pointer advances 32 bytes */
  381. fmov alpha0, alpha // alpha bits from w17 into s10
  382. fmul v0.4s, v16.4s, alphaV0
  383. fmul v1.4s, v17.4s, alphaV0
  384. stp q0, q1, [pCRow0]
  385. add pCRow0, pCRow0, #32
  386. fmul v2.4s, v20.4s, alphaV0
  387. fmul v3.4s, v21.4s, alphaV0
  388. stp q2, q3, [pCRow1]
  389. add pCRow1, pCRow1, #32
  390. fmul v4.4s, v24.4s, alphaV0
  391. fmul v5.4s, v25.4s, alphaV0
  392. stp q4, q5, [pCRow2]
  393. add pCRow2, pCRow2, #32
  394. fmul v6.4s, v28.4s, alphaV0
  395. fmul v7.4s, v29.4s, alphaV0
  396. stp q6, q7, [pCRow3]
  397. add pCRow3, pCRow3, #32
  398. .endm
  399. /******************************************************************************/
  400. .macro INIT4x4 /* clear the four accumulators of the 4x4 tile: v16, v20, v24, v28 */
  401. movi v16.4s, #0
  402. movi v20.4s, #0
  403. movi v24.4s, #0
  404. movi v28.4s, #0
  405. .endm
  406. .macro KERNEL4x4_I /* first step of a pipelined run: fmul initialises v16/v20/v24/v28, then v1 and v12-v15 are preloaded for the next step */
  407. ldp s8, s9, [pB], #8 // B values 0-1 of this step
  408. ldp s10, s11, [pB], #8 // B values 2-3 of this step
  409. ldr q0, [pA], #16 // A values 0-3 of this step
  410. fmul v16.4s, v0.4s, v8.s[0]
  411. fmul v20.4s, v0.4s, v9.s[0]
  412. fmul v24.4s, v0.4s, v10.s[0]
  413. fmul v28.4s, v0.4s, v11.s[0]
  414. ldp s12, s13, [pB], #8 // preload B for the next step
  415. ldp s14, s15, [pB], #8
  416. ldr q1, [pA], #16 // preload A for the next step
  417. .endm
  418. .macro KERNEL4x4_M1 /* pipelined step: accumulate with v0 and v8-v11, then load v1 and v12-v15 for the following _M2 / _E */
  419. fmla v16.4s, v0.4s, v8.s[0]
  420. fmla v20.4s, v0.4s, v9.s[0]
  421. fmla v24.4s, v0.4s, v10.s[0]
  422. fmla v28.4s, v0.4s, v11.s[0]
  423. ldp s12, s13, [pB], #8 // next B values
  424. ldp s14, s15, [pB], #8
  425. ldr q1, [pA], #16 // next A values
  426. .endm
  427. .macro KERNEL4x4_M2 /* pipelined step: accumulate with v1 and v12-v15, then load v0 and v8-v11 for the following _M1 */
  428. fmla v16.4s, v1.4s, v12.s[0]
  429. fmla v20.4s, v1.4s, v13.s[0]
  430. fmla v24.4s, v1.4s, v14.s[0]
  431. fmla v28.4s, v1.4s, v15.s[0]
  432. ldp s8, s9, [pB], #8 // next B values
  433. ldp s10, s11, [pB], #8
  434. ldr q0, [pA], #16 // next A values
  435. .endm
  436. .macro KERNEL4x4_E /* last step of a pipelined run: consume the preloaded v1 and v12-v15; no loads are issued */
  437. fmla v16.4s, v1.4s, v12.s[0]
  438. fmla v20.4s, v1.4s, v13.s[0]
  439. fmla v24.4s, v1.4s, v14.s[0]
  440. fmla v28.4s, v1.4s, v15.s[0]
  441. .endm
  442. .macro KERNEL4x4_SUB /* self-contained single step: load 4 A and 4 B floats and accumulate (accumulators must already be valid) */
  443. ldp s8, s9, [pB], #8 // B values 0-1
  444. ldp s10, s11, [pB], #8 // B values 2-3
  445. ldr q0, [pA], #16 // A values 0-3
  446. fmla v16.4s, v0.4s, v8.s[0]
  447. fmla v20.4s, v0.4s, v9.s[0]
  448. fmla v24.4s, v0.4s, v10.s[0]
  449. fmla v28.4s, v0.4s, v11.s[0]
  450. .endm
  451. .macro SAVE4x4 /* scale v16/v20/v24/v28 by alpha and store 4 floats at each of pCRow0-pCRow3; each pointer advances 16 bytes */
  452. fmov alpha0, alpha // alpha bits from w17 into s10
  453. fmul v0.4s, v16.4s, alphaV0
  454. str q0, [pCRow0]
  455. add pCRow0, pCRow0, #16
  456. fmul v1.4s, v20.4s, alphaV0
  457. str q1, [pCRow1]
  458. add pCRow1, pCRow1, #16
  459. fmul v2.4s, v24.4s, alphaV0
  460. str q2, [pCRow2]
  461. add pCRow2, pCRow2, #16
  462. fmul v3.4s, v28.4s, alphaV0
  463. str q3, [pCRow3]
  464. add pCRow3, pCRow3, #16
  465. .endm
  466. /******************************************************************************/
  467. .macro INIT2x4 /* clear the four accumulators of the 2x4 tile: v16, v20, v24, v28 */
  468. movi v16.4s, #0
  469. movi v20.4s, #0
  470. movi v24.4s, #0
  471. movi v28.4s, #0
  472. .endm
  473. .macro KERNEL2x4_SUB /* single step of the 2x4 tile: load 2 A and 4 B floats and accumulate into the low two lanes of v16/v20/v24/v28 */
  474. ldp s8, s9, [pB], #8 // B values 0-1
  475. ldp s10, s11, [pB], #8 // B values 2-3
  476. ldr d0, [pA], #8 // A values 0-1
  477. fmla v16.2s, v0.2s, v8.s[0]
  478. fmla v20.2s, v0.2s, v9.s[0]
  479. fmla v24.2s, v0.2s, v10.s[0]
  480. fmla v28.2s, v0.2s, v11.s[0]
  481. .endm
  482. .macro SAVE2x4 /* scale the 2x4 accumulators by alpha and store 2 floats at each of pCRow0-pCRow3; each pointer advances 8 bytes */
  483. fmov alpha0, alpha // alpha bits from w17 into s10
  484. fmul v0.2s, v16.2s, alphaV0
  485. str d0, [pCRow0]
  486. add pCRow0, pCRow0, #8
  487. fmul v1.2s, v20.2s, alphaV0
  488. str d1, [pCRow1]
  489. add pCRow1, pCRow1, #8
  490. fmul v0.2s, v24.2s, alphaV0 // v0/v1 reused as scratch for the remaining two results
  491. str d0, [pCRow2]
  492. add pCRow2, pCRow2, #8
  493. fmul v1.2s, v28.2s, alphaV0
  494. str d1, [pCRow3]
  495. add pCRow3, pCRow3, #8
  496. .endm
  497. /******************************************************************************/
  498. .macro INIT1x4 /* clear the two accumulators of the 1x4 tile: v16 and v20 */
  499. movi v16.4s, #0
  500. movi v20.4s, #0
  501. .endm
  502. .macro KERNEL1x4_SUB /* single step of the 1x4 tile: one A float times four B floats; here the B pairs are the vector operand and A is the scalar lane */
  503. ldr s0, [pA] // single A value
  504. add pA, pA, #4
  505. ld1 {v8.2s, v9.2s}, [pB] // v8 = B values 0-1, v9 = B values 2-3
  506. add pB, pB, #16
  507. fmla v16.2s, v8.2s, v0.s[0] // v16 lanes accumulate the products for B values 0-1
  508. fmla v20.2s, v9.2s, v0.s[0] // v20 lanes accumulate the products for B values 2-3
  509. .endm
  510. .macro SAVE1x4 /* scale v16/v20 by alpha and scatter the four results, one float to each of pCRow0-pCRow3; each pointer advances 4 bytes */
  511. fmov alpha0, alpha // alpha bits from w17 into s10
  512. fmul v8.2s, v16.2s, alphaV0
  513. st1 {v8.s}[0], [pCRow0] // lane 0 goes to pCRow0, lane 1 to pCRow1
  514. st1 {v8.s}[1], [pCRow1]
  515. add pCRow0, pCRow0, #4
  516. add pCRow1, pCRow1, #4
  517. fmul v12.2s, v20.2s, alphaV0
  518. st1 {v12.s}[0], [pCRow2] // lane 0 goes to pCRow2, lane 1 to pCRow3
  519. st1 {v12.s}[1], [pCRow3]
  520. add pCRow2, pCRow2, #4
  521. add pCRow3, pCRow3, #4
  522. .endm
  523. /******************************************************************************/
  524. .macro INIT16x2 /* clear the eight accumulators v16-v23 of the 16x2 tile */
  525. movi v16.4s, #0
  526. movi v17.4s, #0
  527. movi v18.4s, #0
  528. movi v19.4s, #0
  529. movi v20.4s, #0
  530. movi v21.4s, #0
  531. movi v22.4s, #0
  532. movi v23.4s, #0
  533. .endm
  534. .macro KERNEL16x2_SUB /* single step of the 16x2 tile: load 16 A and 2 B floats; v16-v19 use B lane 0, v20-v23 use B lane 1 */
  535. ld1 {v8.2s}, [pB] // both B values in one register
  536. add pB, pB, #8
  537. ld1 {v0.4s}, [pA] // A values 0-3
  538. add pA, pA, #16
  539. ld1 {v1.4s}, [pA] // A values 4-7
  540. add pA, pA, #16
  541. ld1 {v2.4s}, [pA] // A values 8-11
  542. add pA, pA, #16
  543. ld1 {v3.4s}, [pA] // A values 12-15
  544. add pA, pA, #16
  545. fmla v16.4s, v0.4s, v8.s[0]
  546. fmla v17.4s, v1.4s, v8.s[0]
  547. fmla v18.4s, v2.4s, v8.s[0]
  548. fmla v19.4s, v3.4s, v8.s[0]
  549. fmla v20.4s, v0.4s, v8.s[1]
  550. fmla v21.4s, v1.4s, v8.s[1]
  551. fmla v22.4s, v2.4s, v8.s[1]
  552. fmla v23.4s, v3.4s, v8.s[1]
  553. .endm
  554. .macro SAVE16x2 /* scale v16-v23 by alpha and store 16 floats at pCRow0 and 16 at pCRow0+LDC; only pCRow0 is advanced (64 bytes) */
  555. fmov alpha0, alpha // alpha bits from w17 into s10
  556. add pCRow1, pCRow0, LDC // second destination is derived on every invocation
  557. fmul v0.4s, v16.4s, alphaV0
  558. fmul v1.4s, v17.4s, alphaV0
  559. fmul v2.4s, v18.4s, alphaV0
  560. fmul v3.4s, v19.4s, alphaV0
  561. st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
  562. fmul v4.4s, v20.4s, alphaV0
  563. fmul v5.4s, v21.4s, alphaV0
  564. fmul v6.4s, v22.4s, alphaV0
  565. fmul v7.4s, v23.4s, alphaV0
  566. st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  567. add pCRow0, pCRow0, #64
  568. .endm
  569. /******************************************************************************/
  570. .macro INIT8x2 /* clear the four accumulators of the 8x2 tile: v16/v17 and v20/v21 */
  571. movi v16.4s, #0
  572. movi v17.4s, #0
  573. movi v20.4s, #0
  574. movi v21.4s, #0
  575. .endm
  576. .macro KERNEL8x2_SUB /* single step of the 8x2 tile: load 8 A and 2 B floats; v16/v17 use B lane 0, v20/v21 use B lane 1 */
  577. ld1 {v8.2s}, [pB] // both B values in one register
  578. add pB, pB, #8
  579. ld1 {v0.4s}, [pA] // A values 0-3
  580. add pA, pA, #16
  581. ld1 {v1.4s}, [pA] // A values 4-7
  582. add pA, pA, #16
  583. fmla v16.4s, v0.4s, v8.s[0]
  584. fmla v17.4s, v1.4s, v8.s[0]
  585. fmla v20.4s, v0.4s, v8.s[1]
  586. fmla v21.4s, v1.4s, v8.s[1]
  587. .endm
  588. .macro SAVE8x2 /* scale v16/v17 and v20/v21 by alpha and store 8 floats at pCRow0 and 8 at pCRow0+LDC; only pCRow0 is advanced (32 bytes) */
  589. fmov alpha0, alpha // alpha bits from w17 into s10
  590. add pCRow1, pCRow0, LDC // second destination is derived on every invocation
  591. fmul v0.4s, v16.4s, alphaV0
  592. fmul v1.4s, v17.4s, alphaV0
  593. st1 {v0.4s, v1.4s}, [pCRow0]
  595. fmul v4.4s, v20.4s, alphaV0 // the former "add pCRow2, pCRow1, LDC" was dropped: a two-destination tile never reads pCRow2
  596. fmul v5.4s, v21.4s, alphaV0
  597. st1 {v4.4s, v5.4s}, [pCRow1]
  598. add pCRow0, pCRow0, #32
  599. .endm
  600. /******************************************************************************/
  601. .macro INIT4x2 /* clear the four accumulators of the 4x2 tile: v16/v17 and v20/v21 */
  602. movi v16.4s, #0
  603. movi v17.4s, #0
  604. movi v20.4s, #0
  605. movi v21.4s, #0
  606. .endm
  607. .macro KERNEL4x2_SUB /* single step of the 4x2 tile: 4 A floats held as two 2-lane halves (v0, v1) times 2 B floats */
  608. ld1 {v8.2s}, [pB] // both B values in one register
  609. add pB, pB, #8
  610. ld1 {v0.2s, v1.2s}, [pA] // v0 = A values 0-1, v1 = A values 2-3
  611. add pA, pA, #16
  612. fmla v16.2s, v0.2s, v8.s[0]
  613. fmla v17.2s, v1.2s, v8.s[0]
  614. fmla v20.2s, v0.2s, v8.s[1]
  615. fmla v21.2s, v1.2s, v8.s[1]
  616. .endm
  617. .macro SAVE4x2 /* scale the 4x2 accumulators by alpha and store 4 floats at pCRow0 and 4 at pCRow0+LDC; only pCRow0 is advanced (16 bytes) */
  618. fmov alpha0, alpha // alpha bits from w17 into s10
  619. fmul v8.2s, v16.2s, alphaV0
  620. fmul v9.2s, v17.2s, alphaV0
  621. st1 {v8.2s, v9.2s}, [pCRow0]
  622. add pCRow1, pCRow0, LDC // second destination is derived on every invocation
  623. fmul v12.2s, v20.2s, alphaV0
  624. fmul v13.2s, v21.2s, alphaV0
  625. st1 {v12.2s, v13.2s}, [pCRow1]
  626. add pCRow0, pCRow0, #16
  627. .endm
  628. /******************************************************************************/
  629. .macro INIT2x2 /* clear the two accumulators of the 2x2 tile: v16 and v20 */
  630. movi v16.4s, #0
  631. movi v20.4s, #0
  632. .endm
  633. .macro KERNEL2x2_SUB /* single step of the 2x2 tile: 2 A floats times 2 B floats; v16 uses B lane 0, v20 uses B lane 1 */
  634. ld1 {v8.2s}, [pB] // both B values
  635. add pB, pB, #8
  636. ld1 {v0.2s}, [pA] // both A values
  637. add pA, pA, #8
  638. fmla v16.2s, v0.2s, v8.s[0]
  639. fmla v20.2s, v0.2s, v8.s[1]
  640. .endm
  641. .macro SAVE2x2 /* scale v16/v20 by alpha and store 2 floats at pCRow0 and 2 at pCRow0+LDC; only pCRow0 is advanced (8 bytes) */
  642. fmov alpha0, alpha // alpha bits from w17 into s10
  643. fmul v8.2s, v16.2s, alphaV0
  644. st1 {v8.2s}, [pCRow0]
  645. add pCRow1 , pCRow0, LDC // second destination is derived on every invocation
  646. fmul v12.2s, v20.2s, alphaV0
  647. st1 {v12.2s}, [pCRow1]
  648. add pCRow0, pCRow0, #8
  649. .endm
  650. /******************************************************************************/
  651. .macro INIT1x2 /* clear v16, the only accumulator of the 1x2 tile */
  652. movi v16.4s, #0
  653. .endm
  654. .macro KERNEL1x2_SUB /* single step of the 1x2 tile: one A float times two B floats; B is the vector operand and A the scalar lane */
  655. ld1 {v8.2s} , [pB] // both B values
  656. add pB , pB, #8
  657. ldr s0 , [pA] // single A value
  658. add pA, pA, #4
  659. fmla v16.2s, v8.2s, v0.s[0]
  660. .endm
  661. .macro SAVE1x2 /* scale v16 by alpha and store lane 0 at pCRow0 and lane 1 at pCRow0+LDC; only pCRow0 is advanced (4 bytes) */
  662. fmov alpha0, alpha // alpha bits from w17 into s10
  663. add pCRow1 , pCRow0, LDC // second destination is derived on every invocation
  664. fmul v8.2s, v16.2s, alphaV0
  665. st1 {v8.s}[0], [pCRow0]
  666. st1 {v8.s}[1], [pCRow1]
  667. add pCRow0, pCRow0, #4
  668. .endm
  669. /******************************************************************************/
  670. .macro INIT16x1 /* clear the four accumulators v16-v19 of the 16x1 tile */
  671. movi v16.4s, #0
  672. movi v17.4s, #0
  673. movi v18.4s, #0
  674. movi v19.4s, #0
  675. .endm
  676. .macro KERNEL16x1_SUB /* single step of the 16x1 tile: 16 A floats times one B float, accumulated into v16-v19 */
  677. ldr s8, [pB] // single B value
  678. add pB , pB, #4
  679. ld1 {v0.4s}, [pA] // A values 0-3
  680. add pA, pA, #16
  681. ld1 {v1.4s}, [pA] // A values 4-7
  682. add pA, pA, #16
  683. ld1 {v2.4s}, [pA] // A values 8-11
  684. add pA, pA, #16
  685. ld1 {v3.4s}, [pA] // A values 12-15
  686. add pA, pA, #16
  687. fmla v16.4s, v0.4s, v8.s[0]
  688. fmla v17.4s, v1.4s, v8.s[0]
  689. fmla v18.4s, v2.4s, v8.s[0]
  690. fmla v19.4s, v3.4s, v8.s[0]
  691. .endm
  692. .macro SAVE16x1 /* scale v16-v19 by alpha and store 16 floats at pCRow0, then advance pCRow0 by 64 bytes */
  693. fmov alpha0, alpha // alpha bits from w17 into s10
  694. fmul v0.4s, v16.4s, alphaV0
  695. fmul v1.4s, v17.4s, alphaV0
  696. fmul v2.4s, v18.4s, alphaV0
  697. fmul v3.4s, v19.4s, alphaV0
  698. st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
  699. add pCRow0, pCRow0, #64
  700. .endm
  701. /******************************************************************************/
  702. .macro INIT8x1 /* clear the two accumulators v16/v17 of the 8x1 tile */
  703. movi v16.4s, #0
  704. movi v17.4s, #0
  705. .endm
  706. .macro KERNEL8x1_SUB /* single step of the 8x1 tile: 8 A floats times one B float, accumulated into v16/v17 */
  707. ldr s8, [pB] // single B value
  708. add pB , pB, #4
  709. ld1 {v0.4s}, [pA] // A values 0-3
  710. add pA, pA, #16
  711. ld1 {v1.4s}, [pA] // A values 4-7
  712. add pA, pA, #16
  713. fmla v16.4s, v0.4s, v8.s[0]
  714. fmla v17.4s, v1.4s, v8.s[0]
  715. .endm
  716. .macro SAVE8x1 /* scale v16/v17 by alpha and store 8 floats at pCRow0, then advance pCRow0 by 32 bytes */
  717. fmov alpha0, alpha // alpha bits from w17 into s10
  718. fmul v0.4s, v16.4s, alphaV0
  719. fmul v1.4s, v17.4s, alphaV0
  720. st1 {v0.4s, v1.4s}, [pCRow0]
  721. add pCRow0, pCRow0, #32
  722. .endm
  723. /******************************************************************************/
  724. .macro INIT4x1 /* clear the two accumulators v16/v17 of the 4x1 tile */
  725. movi v16.4s, #0
  726. movi v17.4s, #0
  727. .endm
  728. .macro KERNEL4x1_SUB /* single step of the 4x1 tile: 4 A floats held as two 2-lane halves (v0, v1) times one B float */
  729. ldr s8, [pB] // single B value
  730. add pB , pB, #4
  731. ld1 {v0.2s, v1.2s}, [pA] // v0 = A values 0-1, v1 = A values 2-3
  732. add pA , pA, #16
  733. fmla v16.2s, v0.2s, v8.s[0]
  734. fmla v17.2s, v1.2s, v8.s[0]
  735. .endm
  736. .macro SAVE4x1 /* scale v16/v17 by alpha and store 4 floats at pCRow0, then advance pCRow0 by 16 bytes */
  737. fmov alpha0, alpha // alpha bits from w17 into s10
  738. fmul v8.2s, v16.2s, alphaV0
  739. fmul v9.2s, v17.2s, alphaV0
  740. st1 {v8.2s, v9.2s}, [pCRow0]
  741. add pCRow0, pCRow0, #16
  742. .endm
  743. /******************************************************************************/
  744. .macro INIT2x1 /* clear v16, the only accumulator of the 2x1 tile */
  745. movi v16.4s, #0
  746. .endm
  747. .macro KERNEL2x1_SUB /* single step of the 2x1 tile: 2 A floats times one B float, accumulated into v16 */
  748. ldr s8, [pB] // single B value
  749. add pB , pB, #4
  750. ld1 {v0.2s}, [pA] // both A values
  751. add pA , pA, #8
  752. fmla v16.2s, v0.2s, v8.s[0]
  753. .endm
  754. .macro SAVE2x1 /* scale v16 by alpha and store 2 floats at pCRow0, then advance pCRow0 by 8 bytes */
  755. fmov alpha0, alpha // alpha bits from w17 into s10
  756. fmul v8.2s, v16.2s, alphaV0
  757. st1 {v8.2s}, [pCRow0]
  758. add pCRow0, pCRow0, #8
  759. .endm
  760. /******************************************************************************/
  761. .macro INIT1x1 /* clear v16, the only accumulator of the 1x1 tile */
  762. movi v16.4s, #0
  763. .endm
  764. .macro KERNEL1x1_SUB /* single step of the 1x1 tile: scalar multiply-add of one A float and one B float into s16 */
  765. ldr s8, [pB] // single B value
  766. add pB , pB, #4
  767. ldr s0, [pA] // single A value
  768. add pA , pA, #4
  769. fmadd s16, s0, s8, s16 // s16 = s0 * s8 + s16
  770. .endm
  771. .macro SAVE1x1 /* scale s16 by alpha and store one float at pCRow0, then advance pCRow0 by 4 bytes */
  772. fmov alpha0, alpha // alpha bits from w17 into s10
  773. fmul s8, s16, alpha0
  774. str s8, [pCRow0]
  775. add pCRow0, pCRow0, #4
  776. .endm
  777. /*******************************************************************************
  778. * End of macro definitions
  779. *******************************************************************************/
  780. PROLOGUE
  781. .Lstrmm_kernel_begin:
  782. .align 5
  783. add sp, sp, #-(11 * 16)
  784. stp d8, d9, [sp, #(0 * 16)]
  785. stp d10, d11, [sp, #(1 * 16)]
  786. stp d12, d13, [sp, #(2 * 16)]
  787. stp d14, d15, [sp, #(3 * 16)]
  788. stp d16, d17, [sp, #(4 * 16)]
  789. stp x18, x19, [sp, #(5 * 16)]
  790. stp x20, x21, [sp, #(6 * 16)]
  791. stp x22, x23, [sp, #(7 * 16)]
  792. stp x24, x25, [sp, #(8 * 16)]
  793. stp x26, x27, [sp, #(9 * 16)]
  794. str x28, [sp, #(10 * 16)]
  795. prfm PLDL1KEEP, [origPB]
  796. prfm PLDL1KEEP, [origPA]
  797. fmov alpha, s0
  798. lsl LDC, LDC, #2 // ldc = ldc * 4
  799. #if !defined(LEFT)
  800. neg tempOffset, offset
  801. #endif
  802. mov pB, origPB
  803. mov counterJ, origN
  804. asr counterJ, counterJ, #2 // J = J / 4
  805. cmp counterJ, #0
  806. ble .Lstrmm_kernel_L2_BEGIN
  807. /******************************************************************************/
  808. .Lstrmm_kernel_L4_BEGIN:
  809. mov pCRow0, pC
  810. add pCRow1, pCRow0, LDC
  811. add pCRow2, pCRow1, LDC
  812. add pCRow3, pCRow2, LDC
  813. add pC, pCRow3, LDC
  814. #if defined(LEFT)
  815. mov tempOffset, offset
  816. #endif
  817. mov pA, origPA // pA = start of A array
  818. .Lstrmm_kernel_L4_M16_BEGIN:
  819. mov counterI, origM
  820. asr counterI, counterI, #4 // counterI = counterI / 16
  821. cmp counterI, #0
  822. ble .Lstrmm_kernel_L4_M8_BEGIN
  823. .align 5
  824. .Lstrmm_kernel_L4_M16_20:
  825. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  826. mov pB, origPB
  827. #else
  828. mov pB, origPB
  829. lsl temp, tempOffset, #6
  830. add pA, pA, temp
  831. lsl temp, tempOffset, #4
  832. add pB, pB, temp
  833. #endif
  834. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  835. sub tempK, origK, tempOffset
  836. #elif defined(LEFT)
  837. add tempK, tempOffset, #16
  838. #else
  839. add tempK, tempOffset, #4
  840. #endif
  841. asr counterL , tempK, #3
  842. cmp counterL , #2
  843. blt .Lstrmm_kernel_L4_M16_32
  844. KERNEL16x4_I
  845. KERNEL16x4_M2
  846. KERNEL16x4_M1
  847. KERNEL16x4_M2
  848. KERNEL16x4_M1
  849. KERNEL16x4_M2
  850. KERNEL16x4_M1
  851. KERNEL16x4_M2
  852. subs counterL, counterL, #2
  853. ble .Lstrmm_kernel_L4_M16_22a
  854. .align 5
  855. .Lstrmm_kernel_L4_M16_22:
  856. KERNEL16x4_M1
  857. KERNEL16x4_M2
  858. KERNEL16x4_M1
  859. KERNEL16x4_M2
  860. KERNEL16x4_M1
  861. KERNEL16x4_M2
  862. KERNEL16x4_M1
  863. KERNEL16x4_M2
  864. subs counterL, counterL, #1
  865. bgt .Lstrmm_kernel_L4_M16_22
  866. .align 5
  867. .Lstrmm_kernel_L4_M16_22a:
  868. KERNEL16x4_M1
  869. KERNEL16x4_M2
  870. KERNEL16x4_M1
  871. KERNEL16x4_M2
  872. KERNEL16x4_M1
  873. KERNEL16x4_M2
  874. KERNEL16x4_M1
  875. KERNEL16x4_E
  876. b .Lstrmm_kernel_L4_M16_44
  877. .align 5
  878. .Lstrmm_kernel_L4_M16_32:
  879. tst counterL, #1
  880. ble .Lstrmm_kernel_L4_M16_40
  881. KERNEL16x4_I
  882. KERNEL16x4_M2
  883. KERNEL16x4_M1
  884. KERNEL16x4_M2
  885. KERNEL16x4_M1
  886. KERNEL16x4_M2
  887. KERNEL16x4_M1
  888. KERNEL16x4_E
  889. b .Lstrmm_kernel_L4_M16_44
  890. .Lstrmm_kernel_L4_M16_40:
  891. INIT16x4
  892. .Lstrmm_kernel_L4_M16_44:
  893. ands counterL , tempK, #7
  894. ble .Lstrmm_kernel_L4_M16_100
  895. .align 5
  896. .Lstrmm_kernel_L4_M16_46:
  897. KERNEL16x4_SUB
  898. subs counterL, counterL, #1
  899. bne .Lstrmm_kernel_L4_M16_46
  900. .Lstrmm_kernel_L4_M16_100:
  901. SAVE16x4
  902. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  903. sub tempK, origK, tempOffset
  904. #if defined(LEFT)
  905. sub tempK, tempK, #16
  906. #else
  907. sub tempK, tempK, #4
  908. #endif
  909. lsl temp, tempK, #6
  910. add pA, pA, temp
  911. lsl temp, tempK, #4
  912. add pB, pB, temp
  913. #endif
  914. #if defined(LEFT)
  915. add tempOffset, tempOffset, #16
  916. #endif
  917. prfm PLDL1KEEP, [pA]
  918. prfm PLDL1KEEP, [pA, #64]
  919. prfm PLDL1KEEP, [origPB]
  920. .Lstrmm_kernel_L4_M16_END:
  921. subs counterI, counterI, #1
  922. bne .Lstrmm_kernel_L4_M16_20
  923. //------------------------------------------------------------------------------
  924. .Lstrmm_kernel_L4_M8_BEGIN:
  925. mov counterI, origM
  926. tst counterI , #15
  927. ble .Lstrmm_kernel_L4_END
  928. tst counterI, #8
  929. ble .Lstrmm_kernel_L4_M4_BEGIN
  930. .Lstrmm_kernel_L4_M8_20:
  931. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  932. mov pB, origPB
  933. #else
  934. mov pB, origPB
  935. lsl temp, tempOffset, #5
  936. add pA, pA, temp
  937. lsl temp, tempOffset, #4
  938. add pB, pB, temp
  939. #endif
  940. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  941. sub tempK, origK, tempOffset
  942. #elif defined(LEFT)
  943. add tempK, tempOffset, #8
  944. #else
  945. add tempK, tempOffset, #4
  946. #endif
  947. asr counterL , tempK, #1 // L = K / 2
  948. cmp counterL , #2 // is there at least 4 to do?
  949. blt .Lstrmm_kernel_L4_M8_32
  950. KERNEL8x4_I // do one in the K
  951. KERNEL8x4_M2 // do another in the K
  952. subs counterL, counterL, #2
  953. ble .Lstrmm_kernel_L4_M8_22a
  954. .align 5
  955. .Lstrmm_kernel_L4_M8_22:
  956. KERNEL8x4_M1
  957. KERNEL8x4_M2
  958. subs counterL, counterL, #1
  959. bgt .Lstrmm_kernel_L4_M8_22
  960. .Lstrmm_kernel_L4_M8_22a:
  961. KERNEL8x4_M1
  962. KERNEL8x4_E
  963. b .Lstrmm_kernel_L4_M8_44
  964. .Lstrmm_kernel_L4_M8_32:
  965. tst counterL, #1
  966. ble .Lstrmm_kernel_L4_M8_40
  967. KERNEL8x4_I
  968. KERNEL8x4_E
  969. b .Lstrmm_kernel_L4_M8_44
  970. .Lstrmm_kernel_L4_M8_40:
  971. INIT8x4
  972. .Lstrmm_kernel_L4_M8_44:
  973. ands counterL , tempK, #1
  974. ble .Lstrmm_kernel_L4_M8_100
  975. .Lstrmm_kernel_L4_M8_46:
  976. KERNEL8x4_SUB
  977. .Lstrmm_kernel_L4_M8_100:
  978. SAVE8x4
  979. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  980. sub tempK, origK, tempOffset
  981. #if defined(LEFT)
  982. sub tempK, tempK, #8
  983. #else
  984. sub tempK, tempK, #4
  985. #endif
  986. lsl temp, tempK, #5
  987. add pA, pA, temp
  988. lsl temp, tempK, #4
  989. add pB, pB, temp
  990. #endif
  991. #if defined(LEFT)
  992. add tempOffset, tempOffset, #8
  993. #endif
  994. .Lstrmm_kernel_L4_M8_END:
  995. //------------------------------------------------------------------------------
  // 4-column panel: tile selected by bit 2 of origM (labels and macros are named 4x4).
  // KERNEL4x4_*, INIT4x4 and SAVE4x4 are macros defined outside this chunk.
  996. .Lstrmm_kernel_L4_M4_BEGIN:
  997. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  998. tst counterI , #7
  999. ble .Lstrmm_kernel_L4_END
  1000. tst counterI, #4
  1001. ble .Lstrmm_kernel_L4_M2_BEGIN
  1002. .Lstrmm_kernel_L4_M4_20:
  // Starting pointers: pB restarts at origPB. In the else-variants both pA and
  // pB are additionally advanced by tempOffset * 16 bytes.
  1003. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1004. mov pB, origPB
  1005. #else
  1006. mov pB, origPB
  1007. lsl temp, tempOffset, #4
  1008. add pB, pB, temp
  1009. add pA, pA, temp
  1010. #endif
  // tempK = number of k-steps this tile runs (the code below issues tempK
  // kernel macro invocations in total when tempK is not negative).
  1011. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1012. sub tempK, origK, tempOffset
  1013. #elif defined(LEFT)
  1014. add tempK, tempOffset, #4
  1015. #else
  1016. add tempK, tempOffset, #4
  1017. #endif
  1018. asr counterL , tempK, #1 // L = K / 2
  1019. cmp counterL , #2 // is there at least 4 to do?
  1020. blt .Lstrmm_kernel_L4_M4_32
  // Pipelined path (counterL is 2 or more): I + M2 open the sequence, the loop
  // issues M1 + M2 pairs, and M1 + E close it.
  1021. KERNEL4x4_I // do one in the K
  1022. KERNEL4x4_M2 // do another in the K
  1023. subs counterL, counterL, #2
  1024. ble .Lstrmm_kernel_L4_M4_22a
  1025. .align 5
  1026. .Lstrmm_kernel_L4_M4_22:
  1027. KERNEL4x4_M1
  1028. KERNEL4x4_M2
  1029. subs counterL, counterL, #1
  1030. bgt .Lstrmm_kernel_L4_M4_22
  1031. .Lstrmm_kernel_L4_M4_22a:
  1032. KERNEL4x4_M1
  1033. KERNEL4x4_E
  1034. b .Lstrmm_kernel_L4_M4_44
  // Short path (counterL below 2): a single I + E pair when bit 0 of counterL
  // is set, otherwise only INIT4x4.
  1035. .Lstrmm_kernel_L4_M4_32:
  1036. tst counterL, #1
  1037. ble .Lstrmm_kernel_L4_M4_40
  1038. KERNEL4x4_I
  1039. KERNEL4x4_E
  1040. b .Lstrmm_kernel_L4_M4_44
  1041. .Lstrmm_kernel_L4_M4_40:
  1042. INIT4x4
  // Odd tempK: one extra KERNEL4x4_SUB before the save.
  1043. .Lstrmm_kernel_L4_M4_44:
  1044. ands counterL , tempK, #1
  1045. ble .Lstrmm_kernel_L4_M4_100
  1046. .Lstrmm_kernel_L4_M4_46:
  1047. KERNEL4x4_SUB
  1048. .Lstrmm_kernel_L4_M4_100:
  1049. SAVE4x4
  // In these variants pA and pB are both moved forward by
  // (origK - tempOffset - 4) * 16 bytes after the save.
  1050. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1051. sub tempK, origK, tempOffset
  1052. #if defined(LEFT)
  1053. sub tempK, tempK, #4
  1054. #else
  1055. sub tempK, tempK, #4
  1056. #endif
  1057. lsl temp, tempK, #4
  1058. add pA, pA, temp
  1059. add pB, pB, temp
  1060. #endif
  // LEFT builds add 4 to tempOffset once this tile is done.
  1061. #if defined(LEFT)
  1062. add tempOffset, tempOffset, #4
  1063. #endif
  1064. .Lstrmm_kernel_L4_M4_END:
  1065. //------------------------------------------------------------------------------
  // 4-column panel: tile selected by bit 1 of origM (labels and macros are named 2x4).
  // INIT2x4, KERNEL2x4_SUB and SAVE2x4 are macros defined outside this chunk.
  1066. .Lstrmm_kernel_L4_M2_BEGIN:
  1067. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1068. tst counterI , #3
  1069. ble .Lstrmm_kernel_L4_END
  1070. tst counterI, #2 // flags only: is bit 1 of origM set?
  1071. ble .Lstrmm_kernel_L4_M1_BEGIN
  1072. .Lstrmm_kernel_L4_M2_20:
  1073. INIT2x4
  // Starting pointers: pB restarts at origPB. In the else-variants pA is
  // advanced by tempOffset * 8 bytes and pB by tempOffset * 16 bytes.
  1074. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1075. mov pB, origPB
  1076. #else
  1077. mov pB, origPB
  1078. lsl temp, tempOffset, #3
  1079. add pA, pA, temp
  1080. lsl temp, tempOffset, #4
  1081. add pB, pB, temp
  1082. #endif
  // tempK = number of KERNEL2x4_SUB invocations issued for this tile.
  1083. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1084. sub tempK, origK, tempOffset
  1085. #elif defined(LEFT)
  1086. add tempK, tempOffset, #2
  1087. #else
  1088. add tempK, tempOffset, #4
  1089. #endif
  1090. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1091. cmp counterL , #0
  1092. ble .Lstrmm_kernel_L4_M2_40
  // Main loop: 8 KERNEL2x4_SUB invocations per iteration.
  1093. .Lstrmm_kernel_L4_M2_22:
  1094. KERNEL2x4_SUB
  1095. KERNEL2x4_SUB
  1096. KERNEL2x4_SUB
  1097. KERNEL2x4_SUB
  1098. KERNEL2x4_SUB
  1099. KERNEL2x4_SUB
  1100. KERNEL2x4_SUB
  1101. KERNEL2x4_SUB
  1102. subs counterL, counterL, #1
  1103. bgt .Lstrmm_kernel_L4_M2_22
  1104. .Lstrmm_kernel_L4_M2_40:
  1105. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1106. ble .Lstrmm_kernel_L4_M2_100
  // Tail loop: one KERNEL2x4_SUB per leftover k-step.
  1107. .Lstrmm_kernel_L4_M2_42:
  1108. KERNEL2x4_SUB
  1109. subs counterL, counterL, #1
  1110. bgt .Lstrmm_kernel_L4_M2_42
  1111. .Lstrmm_kernel_L4_M2_100:
  1112. SAVE2x4
  // In these variants tempK becomes origK - tempOffset - 2 (LEFT) or - 4, then
  // pA moves forward by tempK * 8 bytes and pB by tempK * 16 bytes.
  1113. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1114. sub tempK, origK, tempOffset
  1115. #if defined(LEFT)
  1116. sub tempK, tempK, #2
  1117. #else
  1118. sub tempK, tempK, #4
  1119. #endif
  1120. lsl temp, tempK, #3
  1121. add pA, pA, temp
  1122. lsl temp, tempK, #4
  1123. add pB, pB, temp
  1124. #endif
  // LEFT builds add 2 to tempOffset once this tile is done.
  1125. #if defined(LEFT)
  1126. add tempOffset, tempOffset, #2
  1127. #endif
  1128. .Lstrmm_kernel_L4_M2_END:
  // 4-column panel: tile selected by bit 0 of origM (labels and macros are named 1x4).
  // counterI is not reloaded here. It was set from origM at L4_M2_BEGIN.
  // INIT1x4, KERNEL1x4_SUB and SAVE1x4 are macros defined outside this chunk.
  1129. .Lstrmm_kernel_L4_M1_BEGIN:
  1130. tst counterI, #1 // flags only: is bit 0 of counterI set?
  1131. ble .Lstrmm_kernel_L4_END
  1132. .Lstrmm_kernel_L4_M1_20:
  1133. INIT1x4
  // Starting pointers: pB restarts at origPB. In the else-variants pB is
  // advanced by tempOffset * 16 bytes and pA by tempOffset * 4 bytes.
  1134. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1135. mov pB, origPB
  1136. #else
  1137. mov pB, origPB
  1138. lsl temp, tempOffset, #4
  1139. add pB, pB, temp
  1140. lsl temp, tempOffset, #2
  1141. add pA, pA, temp
  1142. #endif
  // tempK = number of KERNEL1x4_SUB invocations issued for this tile.
  1143. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1144. sub tempK, origK, tempOffset
  1145. #elif defined(LEFT)
  1146. add tempK, tempOffset, #1
  1147. #else
  1148. add tempK, tempOffset, #4
  1149. #endif
  1150. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1151. cmp counterL , #0
  1152. ble .Lstrmm_kernel_L4_M1_40
  // Main loop: 8 KERNEL1x4_SUB invocations per iteration.
  1153. .Lstrmm_kernel_L4_M1_22:
  1154. KERNEL1x4_SUB
  1155. KERNEL1x4_SUB
  1156. KERNEL1x4_SUB
  1157. KERNEL1x4_SUB
  1158. KERNEL1x4_SUB
  1159. KERNEL1x4_SUB
  1160. KERNEL1x4_SUB
  1161. KERNEL1x4_SUB
  1162. subs counterL, counterL, #1
  1163. bgt .Lstrmm_kernel_L4_M1_22
  1164. .Lstrmm_kernel_L4_M1_40:
  1165. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1166. ble .Lstrmm_kernel_L4_M1_100
  // Tail loop: one KERNEL1x4_SUB per leftover k-step.
  1167. .Lstrmm_kernel_L4_M1_42:
  1168. KERNEL1x4_SUB
  1169. subs counterL, counterL, #1
  1170. bgt .Lstrmm_kernel_L4_M1_42
  1171. .Lstrmm_kernel_L4_M1_100:
  1172. SAVE1x4
  // In these variants tempK becomes origK - tempOffset - 1 (LEFT) or - 4, then
  // pA moves forward by tempK * 4 bytes and pB by tempK * 16 bytes.
  1173. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1174. sub tempK, origK, tempOffset
  1175. #if defined(LEFT)
  1176. sub tempK, tempK, #1
  1177. #else
  1178. sub tempK, tempK, #4
  1179. #endif
  1180. lsl temp, tempK, #2
  1181. add pA, pA, temp
  1182. lsl temp, tempK, #4
  1183. add pB, pB, temp
  1184. #endif
  // LEFT builds add 1 to tempOffset once this tile is done.
  1185. #if defined(LEFT)
  1186. add tempOffset, tempOffset, #1
  1187. #endif
  // End of one 4-column panel: step origPB past the panel, bump tempOffset by 4
  // in non-LEFT builds, and loop while counterJ is still positive.
  // The loop head .Lstrmm_kernel_L4_BEGIN lies outside this chunk.
  1188. .Lstrmm_kernel_L4_END:
  1189. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1190. #if !defined(LEFT)
  1191. add tempOffset, tempOffset, #4
  1192. #endif
  1193. subs counterJ, counterJ , #1 // j--
  1194. bgt .Lstrmm_kernel_L4_BEGIN
  1195. /******************************************************************************/
  1196. .Lstrmm_kernel_L2_BEGIN: // N remainder handling: the (origN & 3) leftover columns
  1197. mov counterJ , origN
  // tst only sets flags. With these small masks each ble below is taken exactly
  // when the masked bits of origN are all zero.
  1198. tst counterJ , #3
  1199. ble .Lstrmm_kernel_L999
  1200. tst counterJ , #2
  1201. ble .Lstrmm_kernel_L1_BEGIN
  1202. mov pCRow0, pC // pCRow0 = pC
  // pC = pC + 2 * LDC. NOTE(review): LDC is presumably already a byte stride,
  // scaled in the prologue outside this chunk. Confirm there.
  1203. add pC,pC,LDC, lsl #1
  // LEFT builds restart tempOffset from the offset argument for this panel.
  1204. #if defined(LEFT)
  1205. mov tempOffset, offset
  1206. #endif
  1207. mov pA, origPA // pA = A
  // 2-column panel: loop over origM / 16 full tiles (labels and macros are named 16x2).
  // INIT16x2, KERNEL16x2_SUB and SAVE16x2 are macros defined outside this chunk.
  1208. .Lstrmm_kernel_L2_M16_BEGIN:
  1209. mov counterI, origM
  1210. asr counterI, counterI, #4 // counterI = counterI / 16
  1211. cmp counterI,#0
  1212. ble .Lstrmm_kernel_L2_M8_BEGIN
  1213. .Lstrmm_kernel_L2_M16_20:
  1214. INIT16x2
  // Starting pointers: pB restarts at origPB. In the else-variants pA is
  // advanced by tempOffset * 64 bytes and pB by tempOffset * 8 bytes.
  1215. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1216. mov pB, origPB
  1217. #else
  1218. mov pB, origPB
  1219. lsl temp, tempOffset, #6
  1220. add pA, pA, temp
  1221. lsl temp, tempOffset, #3
  1222. add pB, pB, temp
  1223. #endif
  // tempK = number of KERNEL16x2_SUB invocations issued for this tile.
  1224. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1225. sub tempK, origK, tempOffset
  1226. #elif defined(LEFT)
  1227. add tempK, tempOffset, #16
  1228. #else
  1229. add tempK, tempOffset, #2
  1230. #endif
  1231. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1232. cmp counterL,#0
  1233. ble .Lstrmm_kernel_L2_M16_40
  // Main loop: 8 KERNEL16x2_SUB invocations per iteration.
  1234. .align 5
  1235. .Lstrmm_kernel_L2_M16_22:
  1236. KERNEL16x2_SUB
  1237. KERNEL16x2_SUB
  1238. KERNEL16x2_SUB
  1239. KERNEL16x2_SUB
  1240. KERNEL16x2_SUB
  1241. KERNEL16x2_SUB
  1242. KERNEL16x2_SUB
  1243. KERNEL16x2_SUB
  1244. subs counterL, counterL, #1
  1245. bgt .Lstrmm_kernel_L2_M16_22
  1246. .Lstrmm_kernel_L2_M16_40:
  1247. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1248. ble .Lstrmm_kernel_L2_M16_100
  // Tail loop: one KERNEL16x2_SUB per leftover k-step.
  1249. .Lstrmm_kernel_L2_M16_42:
  1250. KERNEL16x2_SUB
  1251. subs counterL, counterL, #1
  1252. bgt .Lstrmm_kernel_L2_M16_42
  1253. .Lstrmm_kernel_L2_M16_100:
  1254. SAVE16x2
  // In these variants tempK becomes origK - tempOffset - 16 (LEFT) or - 2, then
  // pA moves forward by tempK * 64 bytes and pB by tempK * 8 bytes.
  1255. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1256. sub tempK, origK, tempOffset
  1257. #if defined(LEFT)
  1258. sub tempK, tempK, #16
  1259. #else
  1260. sub tempK, tempK, #2
  1261. #endif
  1262. lsl temp, tempK, #6
  1263. add pA, pA, temp
  1264. lsl temp, tempK, #3
  1265. add pB, pB, temp
  1266. #endif
  // LEFT builds add 16 to tempOffset once this tile is done.
  1267. #if defined(LEFT)
  1268. add tempOffset, tempOffset, #16
  1269. #endif
  // Next 16-row tile while counterI is still positive.
  1270. .Lstrmm_kernel_L2_M16_END:
  1271. subs counterI, counterI, #1
  1272. bgt .Lstrmm_kernel_L2_M16_20
  1273. //------------------------------------------------------------------------------
  // 2-column panel: tile selected by bit 3 of origM (labels and macros are named 8x2).
  // INIT8x2, KERNEL8x2_SUB and SAVE8x2 are macros defined outside this chunk.
  1274. .Lstrmm_kernel_L2_M8_BEGIN:
  1275. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1276. tst counterI , #15
  1277. ble .Lstrmm_kernel_L2_END
  1278. tst counterI, #8
  1279. ble .Lstrmm_kernel_L2_M4_BEGIN
  1280. .Lstrmm_kernel_L2_M8_20:
  1281. INIT8x2
  // Starting pointers: pB restarts at origPB. In the else-variants pA is
  // advanced by tempOffset * 32 bytes and pB by tempOffset * 8 bytes.
  1282. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1283. mov pB, origPB
  1284. #else
  1285. mov pB, origPB
  1286. lsl temp, tempOffset, #5
  1287. add pA, pA, temp
  1288. lsl temp, tempOffset, #3
  1289. add pB, pB, temp
  1290. #endif
  // tempK = number of KERNEL8x2_SUB invocations issued for this tile.
  1291. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1292. sub tempK, origK, tempOffset
  1293. #elif defined(LEFT)
  1294. add tempK, tempOffset, #8
  1295. #else
  1296. add tempK, tempOffset, #2
  1297. #endif
  1298. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1299. cmp counterL,#0
  1300. ble .Lstrmm_kernel_L2_M8_40
  // Main loop: 8 KERNEL8x2_SUB invocations per iteration.
  1301. .align 5
  1302. .Lstrmm_kernel_L2_M8_22:
  1303. KERNEL8x2_SUB
  1304. KERNEL8x2_SUB
  1305. KERNEL8x2_SUB
  1306. KERNEL8x2_SUB
  1307. KERNEL8x2_SUB
  1308. KERNEL8x2_SUB
  1309. KERNEL8x2_SUB
  1310. KERNEL8x2_SUB
  1311. subs counterL, counterL, #1
  1312. bgt .Lstrmm_kernel_L2_M8_22
  1313. .Lstrmm_kernel_L2_M8_40:
  1314. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1315. ble .Lstrmm_kernel_L2_M8_100
  // Tail loop: one KERNEL8x2_SUB per leftover k-step.
  1316. .Lstrmm_kernel_L2_M8_42:
  1317. KERNEL8x2_SUB
  1318. subs counterL, counterL, #1
  1319. bgt .Lstrmm_kernel_L2_M8_42
  1320. .Lstrmm_kernel_L2_M8_100:
  1321. SAVE8x2
  // In these variants tempK becomes origK - tempOffset - 8 (LEFT) or - 2, then
  // pA moves forward by tempK * 32 bytes and pB by tempK * 8 bytes.
  1322. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1323. sub tempK, origK, tempOffset
  1324. #if defined(LEFT)
  1325. sub tempK, tempK, #8
  1326. #else
  1327. sub tempK, tempK, #2
  1328. #endif
  1329. lsl temp, tempK, #5
  1330. add pA, pA, temp
  1331. lsl temp, tempK, #3
  1332. add pB, pB, temp
  1333. #endif
  // LEFT builds add 8 to tempOffset once this tile is done.
  1334. #if defined(LEFT)
  1335. add tempOffset, tempOffset, #8
  1336. #endif
  1337. .Lstrmm_kernel_L2_M8_END:
  1338. //------------------------------------------------------------------------------
  // 2-column panel: tile selected by bit 2 of origM (labels and macros are named 4x2).
  // INIT4x2, KERNEL4x2_SUB and SAVE4x2 are macros defined outside this chunk.
  1339. .Lstrmm_kernel_L2_M4_BEGIN:
  1340. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1341. tst counterI , #7
  1342. ble .Lstrmm_kernel_L2_END
  1343. tst counterI, #4
  1344. ble .Lstrmm_kernel_L2_M2_BEGIN
  1345. .Lstrmm_kernel_L2_M4_20:
  1346. INIT4x2
  // Starting pointers: pB restarts at origPB. In the else-variants pB is
  // advanced by tempOffset * 8 bytes and pA by tempOffset * 16 bytes.
  1347. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1348. mov pB, origPB
  1349. #else
  1350. mov pB, origPB
  1351. lsl temp, tempOffset, #3
  1352. add pB, pB, temp
  1353. lsl temp, tempOffset, #4
  1354. add pA, pA, temp
  1355. #endif
  // tempK = number of KERNEL4x2_SUB invocations issued for this tile.
  1356. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1357. sub tempK, origK, tempOffset
  1358. #elif defined(LEFT)
  1359. add tempK, tempOffset, #4
  1360. #else
  1361. add tempK, tempOffset, #2
  1362. #endif
  1363. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1364. cmp counterL,#0
  1365. ble .Lstrmm_kernel_L2_M4_40
  // Main loop: 8 KERNEL4x2_SUB invocations per iteration.
  1366. .align 5
  1367. .Lstrmm_kernel_L2_M4_22:
  1368. KERNEL4x2_SUB
  1369. KERNEL4x2_SUB
  1370. KERNEL4x2_SUB
  1371. KERNEL4x2_SUB
  1372. KERNEL4x2_SUB
  1373. KERNEL4x2_SUB
  1374. KERNEL4x2_SUB
  1375. KERNEL4x2_SUB
  1376. subs counterL, counterL, #1
  1377. bgt .Lstrmm_kernel_L2_M4_22
  1378. .Lstrmm_kernel_L2_M4_40:
  1379. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1380. ble .Lstrmm_kernel_L2_M4_100
  // Tail loop: one KERNEL4x2_SUB per leftover k-step.
  1381. .Lstrmm_kernel_L2_M4_42:
  1382. KERNEL4x2_SUB
  1383. subs counterL, counterL, #1
  1384. bgt .Lstrmm_kernel_L2_M4_42
  1385. .Lstrmm_kernel_L2_M4_100:
  1386. SAVE4x2
  // In these variants tempK becomes origK - tempOffset - 4 (LEFT) or - 2, then
  // pA moves forward by tempK * 16 bytes and pB by tempK * 8 bytes.
  1387. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1388. sub tempK, origK, tempOffset
  1389. #if defined(LEFT)
  1390. sub tempK, tempK, #4
  1391. #else
  1392. sub tempK, tempK, #2
  1393. #endif
  1394. lsl temp, tempK, #4
  1395. add pA, pA, temp
  1396. lsl temp, tempK, #3
  1397. add pB, pB, temp
  1398. #endif
  // LEFT builds add 4 to tempOffset once this tile is done.
  1399. #if defined(LEFT)
  1400. add tempOffset, tempOffset, #4
  1401. #endif
  1402. .Lstrmm_kernel_L2_M4_END:
  1403. //------------------------------------------------------------------------------
  // 2-column panel: tile selected by bit 1 of origM (labels and macros are named 2x2).
  // INIT2x2, KERNEL2x2_SUB and SAVE2x2 are macros defined outside this chunk.
  1404. .Lstrmm_kernel_L2_M2_BEGIN:
  1405. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1406. tst counterI , #3
  1407. ble .Lstrmm_kernel_L2_END
  1408. tst counterI, #2 // flags only: is bit 1 of origM set?
  1409. ble .Lstrmm_kernel_L2_M1_BEGIN
  1410. .Lstrmm_kernel_L2_M2_20:
  1411. INIT2x2
  // Starting pointers: pB restarts at origPB. In the else-variants pB and pA
  // are each advanced by tempOffset * 8 bytes.
  1412. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1413. mov pB, origPB
  1414. #else
  1415. mov pB, origPB
  1416. lsl temp, tempOffset, #3
  1417. add pB, pB, temp
  1418. lsl temp, tempOffset, #3
  1419. add pA, pA, temp
  1420. #endif
  // tempK = number of KERNEL2x2_SUB invocations issued for this tile.
  1421. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1422. sub tempK, origK, tempOffset
  1423. #elif defined(LEFT)
  1424. add tempK, tempOffset, #2
  1425. #else
  1426. add tempK, tempOffset, #2
  1427. #endif
  1428. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1429. cmp counterL,#0
  1430. ble .Lstrmm_kernel_L2_M2_40
  // Main loop: 8 KERNEL2x2_SUB invocations per iteration.
  1431. .Lstrmm_kernel_L2_M2_22:
  1432. KERNEL2x2_SUB
  1433. KERNEL2x2_SUB
  1434. KERNEL2x2_SUB
  1435. KERNEL2x2_SUB
  1436. KERNEL2x2_SUB
  1437. KERNEL2x2_SUB
  1438. KERNEL2x2_SUB
  1439. KERNEL2x2_SUB
  1440. subs counterL, counterL, #1
  1441. bgt .Lstrmm_kernel_L2_M2_22
  1442. .Lstrmm_kernel_L2_M2_40:
  1443. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1444. ble .Lstrmm_kernel_L2_M2_100
  // Tail loop: one KERNEL2x2_SUB per leftover k-step.
  1445. .Lstrmm_kernel_L2_M2_42:
  1446. KERNEL2x2_SUB
  1447. subs counterL, counterL, #1
  1448. bgt .Lstrmm_kernel_L2_M2_42
  1449. .Lstrmm_kernel_L2_M2_100:
  1450. SAVE2x2
  // In these variants tempK becomes origK - tempOffset - 2, then pA and pB
  // each move forward by tempK * 8 bytes.
  1451. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1452. sub tempK, origK, tempOffset
  1453. #if defined(LEFT)
  1454. sub tempK, tempK, #2
  1455. #else
  1456. sub tempK, tempK, #2
  1457. #endif
  1458. lsl temp, tempK, #3
  1459. add pA, pA, temp
  1460. lsl temp, tempK, #3
  1461. add pB, pB, temp
  1462. #endif
  // LEFT builds add 2 to tempOffset once this tile is done.
  1463. #if defined(LEFT)
  1464. add tempOffset, tempOffset, #2
  1465. #endif
  1466. .Lstrmm_kernel_L2_M2_END:
  // 2-column panel: tile selected by bit 0 of origM (labels and macros are named 1x2).
  // counterI is not reloaded here. It was set from origM at L2_M2_BEGIN.
  // INIT1x2, KERNEL1x2_SUB and SAVE1x2 are macros defined outside this chunk.
  1467. .Lstrmm_kernel_L2_M1_BEGIN:
  1468. tst counterI, #1 // flags only: is bit 0 of counterI set?
  1469. ble .Lstrmm_kernel_L2_END
  1470. .Lstrmm_kernel_L2_M1_20:
  1471. INIT1x2
  // Starting pointers: pB restarts at origPB. In the else-variants pB is
  // advanced by tempOffset * 8 bytes and pA by tempOffset * 4 bytes.
  1472. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1473. mov pB, origPB
  1474. #else
  1475. mov pB, origPB
  1476. lsl temp, tempOffset, #3
  1477. add pB, pB, temp
  1478. lsl temp, tempOffset, #2
  1479. add pA, pA, temp
  1480. #endif
  // tempK = number of KERNEL1x2_SUB invocations issued for this tile.
  1481. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1482. sub tempK, origK, tempOffset
  1483. #elif defined(LEFT)
  1484. add tempK, tempOffset, #1
  1485. #else
  1486. add tempK, tempOffset, #2
  1487. #endif
  1488. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1489. cmp counterL, #0
  1490. ble .Lstrmm_kernel_L2_M1_40
  // Main loop: 8 KERNEL1x2_SUB invocations per iteration.
  1491. .Lstrmm_kernel_L2_M1_22:
  1492. KERNEL1x2_SUB
  1493. KERNEL1x2_SUB
  1494. KERNEL1x2_SUB
  1495. KERNEL1x2_SUB
  1496. KERNEL1x2_SUB
  1497. KERNEL1x2_SUB
  1498. KERNEL1x2_SUB
  1499. KERNEL1x2_SUB
  1500. subs counterL, counterL, #1
  1501. bgt .Lstrmm_kernel_L2_M1_22
  1502. .Lstrmm_kernel_L2_M1_40:
  1503. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1504. ble .Lstrmm_kernel_L2_M1_100
  // Tail loop: one KERNEL1x2_SUB per leftover k-step.
  1505. .Lstrmm_kernel_L2_M1_42:
  1506. KERNEL1x2_SUB
  1507. subs counterL, counterL, #1
  1508. bgt .Lstrmm_kernel_L2_M1_42
  1509. .Lstrmm_kernel_L2_M1_100:
  1510. SAVE1x2
  // In these variants tempK becomes origK - tempOffset - 1 (LEFT) or - 2, then
  // pA moves forward by tempK * 4 bytes and pB by tempK * 8 bytes.
  1511. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1512. sub tempK, origK, tempOffset
  1513. #if defined(LEFT)
  1514. sub tempK, tempK, #1
  1515. #else
  1516. sub tempK, tempK, #2
  1517. #endif
  1518. lsl temp, tempK, #2
  1519. add pA, pA, temp
  1520. lsl temp, tempK, #3
  1521. add pB, pB, temp
  1522. #endif
  // LEFT builds add 1 to tempOffset once this tile is done.
  1523. #if defined(LEFT)
  1524. add tempOffset, tempOffset, #1
  1525. #endif
  // End of the 2-column panel: bump tempOffset by 2 in non-LEFT builds and step
  // origPB past the panel. There is no loop back, so this panel runs at most once.
  1526. .Lstrmm_kernel_L2_END:
  1527. #if !defined(LEFT)
  1528. add tempOffset, tempOffset, #2
  1529. #endif
  1530. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1531. /******************************************************************************/
  // Last-column handling: runs only when bit 0 of origN is set.
  1532. .Lstrmm_kernel_L1_BEGIN:
  1533. mov counterJ , origN
  // tst only sets flags. The ble is taken exactly when bit 0 of origN is clear.
  1534. tst counterJ , #1
  1535. ble .Lstrmm_kernel_L999 // done
  1536. mov pCRow0, pC // pCRow0 = C
  1537. add pC , pC , LDC // Update pC to point to next
  // LEFT builds restart tempOffset from the offset argument for this panel.
  1538. #if defined(LEFT)
  1539. mov tempOffset, offset
  1540. #endif
  1541. mov pA, origPA // pA = A
  // 1-column panel: loop over origM / 16 full tiles (labels and macros are named 16x1).
  // INIT16x1, KERNEL16x1_SUB and SAVE16x1 are macros defined outside this chunk.
  1542. .Lstrmm_kernel_L1_M16_BEGIN:
  1543. mov counterI, origM
  1544. asr counterI, counterI, #4 // counterI = counterI / 16
  1545. cmp counterI, #0
  1546. ble .Lstrmm_kernel_L1_M8_BEGIN
  1547. .Lstrmm_kernel_L1_M16_20:
  1548. INIT16x1
  // Starting pointers: pB restarts at origPB. In the else-variants pA is
  // advanced by tempOffset * 64 bytes and pB by tempOffset * 4 bytes.
  1549. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1550. mov pB, origPB
  1551. #else
  1552. mov pB, origPB
  1553. lsl temp, tempOffset, #6
  1554. add pA, pA, temp
  1555. lsl temp, tempOffset, #2
  1556. add pB, pB, temp
  1557. #endif
  // tempK = number of KERNEL16x1_SUB invocations issued for this tile.
  1558. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1559. sub tempK, origK, tempOffset
  1560. #elif defined(LEFT)
  1561. add tempK, tempOffset, #16
  1562. #else
  1563. add tempK, tempOffset, #1
  1564. #endif
  1565. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1566. cmp counterL , #0
  1567. ble .Lstrmm_kernel_L1_M16_40
  // Main loop: 8 KERNEL16x1_SUB invocations per iteration.
  1568. .align 5
  1569. .Lstrmm_kernel_L1_M16_22:
  1570. KERNEL16x1_SUB
  1571. KERNEL16x1_SUB
  1572. KERNEL16x1_SUB
  1573. KERNEL16x1_SUB
  1574. KERNEL16x1_SUB
  1575. KERNEL16x1_SUB
  1576. KERNEL16x1_SUB
  1577. KERNEL16x1_SUB
  1578. subs counterL, counterL, #1
  1579. bgt .Lstrmm_kernel_L1_M16_22
  1580. .Lstrmm_kernel_L1_M16_40:
  1581. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1582. ble .Lstrmm_kernel_L1_M16_100
  // Tail loop: one KERNEL16x1_SUB per leftover k-step.
  1583. .Lstrmm_kernel_L1_M16_42:
  1584. KERNEL16x1_SUB
  1585. subs counterL, counterL, #1
  1586. bgt .Lstrmm_kernel_L1_M16_42
  1587. .Lstrmm_kernel_L1_M16_100:
  1588. SAVE16x1
  // In these variants tempK becomes origK - tempOffset - 16 (LEFT) or - 1, then
  // pA moves forward by tempK * 64 bytes and pB by tempK * 4 bytes.
  1589. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1590. sub tempK, origK, tempOffset
  1591. #if defined(LEFT)
  1592. sub tempK, tempK, #16
  1593. #else
  1594. sub tempK, tempK, #1
  1595. #endif
  1596. lsl temp, tempK, #6
  1597. add pA, pA, temp
  1598. lsl temp, tempK, #2
  1599. add pB, pB, temp
  1600. #endif
  // LEFT builds add 16 to tempOffset once this tile is done.
  1601. #if defined(LEFT)
  1602. add tempOffset, tempOffset, #16
  1603. #endif
  // Next 16-row tile while counterI is still positive.
  1604. .Lstrmm_kernel_L1_M16_END:
  1605. subs counterI, counterI, #1
  1606. bgt .Lstrmm_kernel_L1_M16_20
  1607. //------------------------------------------------------------------------------
  // 1-column panel: tile selected by bit 3 of origM (labels and macros are named 8x1).
  // INIT8x1, KERNEL8x1_SUB and SAVE8x1 are macros defined outside this chunk.
  1608. .Lstrmm_kernel_L1_M8_BEGIN:
  1609. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1610. tst counterI , #15
  1611. ble .Lstrmm_kernel_L1_END
  1612. tst counterI, #8
  1613. ble .Lstrmm_kernel_L1_M4_BEGIN
  1614. .Lstrmm_kernel_L1_M8_20:
  1615. INIT8x1
  // Starting pointers: pB restarts at origPB. In the else-variants pA is
  // advanced by tempOffset * 32 bytes and pB by tempOffset * 4 bytes.
  1616. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1617. mov pB, origPB
  1618. #else
  1619. mov pB, origPB
  1620. lsl temp, tempOffset, #5
  1621. add pA, pA, temp
  1622. lsl temp, tempOffset, #2
  1623. add pB, pB, temp
  1624. #endif
  // tempK = number of KERNEL8x1_SUB invocations issued for this tile.
  1625. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1626. sub tempK, origK, tempOffset
  1627. #elif defined(LEFT)
  1628. add tempK, tempOffset, #8
  1629. #else
  1630. add tempK, tempOffset, #1
  1631. #endif
  1632. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1633. cmp counterL , #0
  1634. ble .Lstrmm_kernel_L1_M8_40
  // Main loop: 8 KERNEL8x1_SUB invocations per iteration.
  1635. .align 5
  1636. .Lstrmm_kernel_L1_M8_22:
  1637. KERNEL8x1_SUB
  1638. KERNEL8x1_SUB
  1639. KERNEL8x1_SUB
  1640. KERNEL8x1_SUB
  1641. KERNEL8x1_SUB
  1642. KERNEL8x1_SUB
  1643. KERNEL8x1_SUB
  1644. KERNEL8x1_SUB
  1645. subs counterL, counterL, #1
  1646. bgt .Lstrmm_kernel_L1_M8_22
  1647. .Lstrmm_kernel_L1_M8_40:
  1648. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1649. ble .Lstrmm_kernel_L1_M8_100
  // Tail loop: one KERNEL8x1_SUB per leftover k-step.
  1650. .Lstrmm_kernel_L1_M8_42:
  1651. KERNEL8x1_SUB
  1652. subs counterL, counterL, #1
  1653. bgt .Lstrmm_kernel_L1_M8_42
  1654. .Lstrmm_kernel_L1_M8_100:
  1655. SAVE8x1
  // In these variants tempK becomes origK - tempOffset - 8 (LEFT) or - 1, then
  // pA moves forward by tempK * 32 bytes and pB by tempK * 4 bytes.
  1656. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1657. sub tempK, origK, tempOffset
  1658. #if defined(LEFT)
  1659. sub tempK, tempK, #8
  1660. #else
  1661. sub tempK, tempK, #1
  1662. #endif
  1663. lsl temp, tempK, #5
  1664. add pA, pA, temp
  1665. lsl temp, tempK, #2
  1666. add pB, pB, temp
  1667. #endif
  // LEFT builds add 8 to tempOffset once this tile is done.
  1668. #if defined(LEFT)
  1669. add tempOffset, tempOffset, #8
  1670. #endif
  1671. .Lstrmm_kernel_L1_M8_END:
  1672. //------------------------------------------------------------------------------
  // 1-column panel: tile selected by bit 2 of origM (labels and macros are named 4x1).
  // INIT4x1, KERNEL4x1_SUB and SAVE4x1 are macros defined outside this chunk.
  1673. .Lstrmm_kernel_L1_M4_BEGIN:
  1674. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1675. tst counterI , #7
  1676. ble .Lstrmm_kernel_L1_END
  1677. tst counterI, #4
  1678. ble .Lstrmm_kernel_L1_M2_BEGIN
  1679. .Lstrmm_kernel_L1_M4_20:
  1680. INIT4x1
  // Starting pointers: pB restarts at origPB. In the else-variants pB is
  // advanced by tempOffset * 4 bytes and pA by tempOffset * 16 bytes.
  1681. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1682. mov pB, origPB
  1683. #else
  1684. mov pB, origPB
  1685. lsl temp, tempOffset, #2
  1686. add pB, pB, temp
  1687. lsl temp, tempOffset, #4
  1688. add pA, pA, temp
  1689. #endif
  // tempK = number of KERNEL4x1_SUB invocations issued for this tile.
  1690. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1691. sub tempK, origK, tempOffset
  1692. #elif defined(LEFT)
  1693. add tempK, tempOffset, #4
  1694. #else
  1695. add tempK, tempOffset, #1
  1696. #endif
  1697. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1698. cmp counterL , #0
  1699. ble .Lstrmm_kernel_L1_M4_40
  // Main loop: 8 KERNEL4x1_SUB invocations per iteration.
  1700. .align 5
  1701. .Lstrmm_kernel_L1_M4_22:
  1702. KERNEL4x1_SUB
  1703. KERNEL4x1_SUB
  1704. KERNEL4x1_SUB
  1705. KERNEL4x1_SUB
  1706. KERNEL4x1_SUB
  1707. KERNEL4x1_SUB
  1708. KERNEL4x1_SUB
  1709. KERNEL4x1_SUB
  1710. subs counterL, counterL, #1
  1711. bgt .Lstrmm_kernel_L1_M4_22
  1712. .Lstrmm_kernel_L1_M4_40:
  1713. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1714. ble .Lstrmm_kernel_L1_M4_100
  // Tail loop: one KERNEL4x1_SUB per leftover k-step.
  1715. .Lstrmm_kernel_L1_M4_42:
  1716. KERNEL4x1_SUB
  1717. subs counterL, counterL, #1
  1718. bgt .Lstrmm_kernel_L1_M4_42
  1719. .Lstrmm_kernel_L1_M4_100:
  1720. SAVE4x1
  // In these variants tempK becomes origK - tempOffset - 4 (LEFT) or - 1, then
  // pA moves forward by tempK * 16 bytes and pB by tempK * 4 bytes.
  1721. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1722. sub tempK, origK, tempOffset
  1723. #if defined(LEFT)
  1724. sub tempK, tempK, #4
  1725. #else
  1726. sub tempK, tempK, #1
  1727. #endif
  1728. lsl temp, tempK, #4
  1729. add pA, pA, temp
  1730. lsl temp, tempK, #2
  1731. add pB, pB, temp
  1732. #endif
  // LEFT builds add 4 to tempOffset once this tile is done.
  1733. #if defined(LEFT)
  1734. add tempOffset, tempOffset, #4
  1735. #endif
  1736. .Lstrmm_kernel_L1_M4_END:
  1737. //------------------------------------------------------------------------------
  // 1-column panel: tile selected by bit 1 of origM (labels and macros are named 2x1).
  // INIT2x1, KERNEL2x1_SUB and SAVE2x1 are macros defined outside this chunk.
  1738. .Lstrmm_kernel_L1_M2_BEGIN:
  1739. mov counterI, origM
  // tst only sets flags. With these small masks N and V end up clear, so each
  // ble below is taken exactly when the masked bits of origM are all zero.
  1740. tst counterI , #3
  1741. ble .Lstrmm_kernel_L1_END
  1742. tst counterI, #2 // flags only: is bit 1 of origM set?
  1743. ble .Lstrmm_kernel_L1_M1_BEGIN
  1744. .Lstrmm_kernel_L1_M2_20:
  1745. INIT2x1
  // Starting pointers: pB restarts at origPB. In the else-variants pB is
  // advanced by tempOffset * 4 bytes and pA by tempOffset * 8 bytes.
  1746. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1747. mov pB, origPB
  1748. #else
  1749. mov pB, origPB
  1750. lsl temp, tempOffset, #2
  1751. add pB, pB, temp
  1752. lsl temp, tempOffset, #3
  1753. add pA, pA, temp
  1754. #endif
  // tempK = number of KERNEL2x1_SUB invocations issued for this tile.
  1755. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1756. sub tempK, origK, tempOffset
  1757. #elif defined(LEFT)
  1758. add tempK, tempOffset, #2
  1759. #else
  1760. add tempK, tempOffset, #1
  1761. #endif
  1762. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1763. cmp counterL , #0
  1764. ble .Lstrmm_kernel_L1_M2_40
  // Main loop: 8 KERNEL2x1_SUB invocations per iteration.
  1765. .Lstrmm_kernel_L1_M2_22:
  1766. KERNEL2x1_SUB
  1767. KERNEL2x1_SUB
  1768. KERNEL2x1_SUB
  1769. KERNEL2x1_SUB
  1770. KERNEL2x1_SUB
  1771. KERNEL2x1_SUB
  1772. KERNEL2x1_SUB
  1773. KERNEL2x1_SUB
  1774. subs counterL, counterL, #1
  1775. bgt .Lstrmm_kernel_L1_M2_22
  1776. .Lstrmm_kernel_L1_M2_40:
  1777. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1778. ble .Lstrmm_kernel_L1_M2_100
  // Tail loop: one KERNEL2x1_SUB per leftover k-step.
  1779. .Lstrmm_kernel_L1_M2_42:
  1780. KERNEL2x1_SUB
  1781. subs counterL, counterL, #1
  1782. bgt .Lstrmm_kernel_L1_M2_42
  1783. .Lstrmm_kernel_L1_M2_100:
  1784. SAVE2x1
  // In these variants tempK becomes origK - tempOffset - 2 (LEFT) or - 1, then
  // pA moves forward by tempK * 8 bytes and pB by tempK * 4 bytes.
  1785. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1786. sub tempK, origK, tempOffset
  1787. #if defined(LEFT)
  1788. sub tempK, tempK, #2
  1789. #else
  1790. sub tempK, tempK, #1
  1791. #endif
  1792. lsl temp, tempK, #3
  1793. add pA, pA, temp
  1794. lsl temp, tempK, #2
  1795. add pB, pB, temp
  1796. #endif
  // LEFT builds add 2 to tempOffset once this tile is done.
  1797. #if defined(LEFT)
  1798. add tempOffset, tempOffset, #2
  1799. #endif
  1800. .Lstrmm_kernel_L1_M2_END:
  // 1-column panel: tile selected by bit 0 of origM (labels and macros are named 1x1).
  // counterI is not reloaded here. It was set from origM at L1_M2_BEGIN.
  // INIT1x1, KERNEL1x1_SUB and SAVE1x1 are macros defined outside this chunk.
  // Unlike the other tiles, no pointer or tempOffset update follows the save.
  1801. .Lstrmm_kernel_L1_M1_BEGIN:
  1802. tst counterI, #1 // flags only: is bit 0 of counterI set?
  1803. ble .Lstrmm_kernel_L1_END
  1804. .Lstrmm_kernel_L1_M1_20:
  1805. INIT1x1
  // Starting pointers: pB restarts at origPB. In the else-variants pB and pA
  // are each advanced by tempOffset * 4 bytes.
  1806. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1807. mov pB, origPB
  1808. #else
  1809. mov pB, origPB
  1810. lsl temp, tempOffset, #2
  1811. add pB, pB, temp
  1812. lsl temp, tempOffset, #2
  1813. add pA, pA, temp
  1814. #endif
  // tempK = number of KERNEL1x1_SUB invocations issued for this tile.
  1815. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1816. sub tempK, origK, tempOffset
  1817. #elif defined(LEFT)
  1818. add tempK, tempOffset, #1
  1819. #else
  1820. add tempK, tempOffset, #1
  1821. #endif
  1822. asr counterL , tempK, #3 // counterL = tempK / 8 (arithmetic shift)
  1823. cmp counterL , #0
  1824. ble .Lstrmm_kernel_L1_M1_40
  // Main loop: 8 KERNEL1x1_SUB invocations per iteration.
  1825. .Lstrmm_kernel_L1_M1_22:
  1826. KERNEL1x1_SUB
  1827. KERNEL1x1_SUB
  1828. KERNEL1x1_SUB
  1829. KERNEL1x1_SUB
  1830. KERNEL1x1_SUB
  1831. KERNEL1x1_SUB
  1832. KERNEL1x1_SUB
  1833. KERNEL1x1_SUB
  1834. subs counterL, counterL, #1
  1835. bgt .Lstrmm_kernel_L1_M1_22
  1836. .Lstrmm_kernel_L1_M1_40:
  1837. ands counterL , tempK, #7 // counterL = tempK & 7 (leftover k-steps)
  1838. ble .Lstrmm_kernel_L1_M1_100
  // Tail loop: one KERNEL1x1_SUB per leftover k-step.
  1839. .Lstrmm_kernel_L1_M1_42:
  1840. KERNEL1x1_SUB
  1841. subs counterL, counterL, #1
  1842. bgt .Lstrmm_kernel_L1_M1_42
  1843. .Lstrmm_kernel_L1_M1_100:
  1844. SAVE1x1
  1845. .Lstrmm_kernel_L1_END:
  // Common exit: put 0 in x0, reload registers from the 11 * 16 byte area at sp,
  // release that area and return. The matching stores are presumably in the
  // prologue, which lies outside this chunk.
  // NOTE(review): the reload list includes d16/d17 and x18, which AAPCS64 does
  // not list as callee-saved, and x18 is platform-reserved on some targets.
  // Confirm against the prologue and the supported ABIs before changing.
  1846. .Lstrmm_kernel_L999:
  1847. mov x0, #0 // set return value
  1848. ldp d8, d9, [sp, #(0 * 16)]
  1849. ldp d10, d11, [sp, #(1 * 16)]
  1850. ldp d12, d13, [sp, #(2 * 16)]
  1851. ldp d14, d15, [sp, #(3 * 16)]
  1852. ldp d16, d17, [sp, #(4 * 16)]
  1853. ldp x18, x19, [sp, #(5 * 16)]
  1854. ldp x20, x21, [sp, #(6 * 16)]
  1855. ldp x22, x23, [sp, #(7 * 16)]
  1856. ldp x24, x25, [sp, #(8 * 16)]
  1857. ldp x26, x27, [sp, #(9 * 16)]
  1858. ldr x28, [sp, #(10 * 16)]
  // Free the 176-byte save area.
  1859. add sp, sp, #(11*16)
  1860. ret
  // EPILOGUE is a macro defined outside this chunk.
  1861. EPILOGUE