You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strmm_kernel_16x4.S 47 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  /* Incoming arguments, in the register order listed with the CNAME
     prototype above (alpha itself arrives in s0). */
  #define origM       x0
  #define origN       x1
  #define origK       x2
  #define origPA      x3
  #define origPB      x4
  #define pC          x5
  #define LDC         x6
  #define offset      x7

  /* Loop counters. */
  #define counterL    x8
  #define counterI    x9
  #define counterJ    x10

  /* Working pointers: packed B, the four C output pointers, packed A. */
  #define pB          x11
  #define pCRow0      x12
  #define pCRow1      x13
  #define pCRow2      x14
  #define pCRow3      x15
  #define pA          x16

  /* alpha is parked in a general register; each SAVE macro copies it into
     alpha0 (s10, addressed by element as alphaV0) right before scaling. */
  #define alpha       w17
  #define alpha0      s10
  #define alphaV0     v10.s[0]

  /* Scratch values held in x19-x21.  A mapping of temp onto x18 is left
     disabled in favour of x21. */
  //#define temp x18
  #define tempOffset  x19
  #define tempK       x20
  #define temp        x21

  /* Look-ahead offsets used as prfm immediates for A, B and C. */
  #define A_PRE_SIZE  2560
  #define B_PRE_SIZE  224
  #define C_PRE_SIZE  160
  58. // 00 origM
  59. // 01 origN
  60. // 02 origK
  61. // 03 origPA
  62. // 04 origPB
  63. // 05 pC
  64. // 06 origLDC -> LDC
  65. // 07 offset
  66. // 08 counterL
  67. // 09 counterI
  68. // 10 counterJ
  69. // 11 pB
  70. // 12 pCRow0
  71. // 13 pCRow1
  72. // 14 pCRow2
  73. // 15 pCRow3
  74. // 16 pA
  75. // 17 alpha
  76. // 18 must save
  77. // 19 must save tempOffset
  78. // 20 must save tempK
  79. // 21 must save temp
  80. // 22 must save
  81. // 23 must save
  82. // 24 must save
  83. // 25 must save
  84. // 26 must save
  85. // 27 must save
  86. // 28 must save
  87. // 29 frame
  88. // 30 link
  89. // 31 sp
  90. //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
  91. //v01 pA0_04, pA0_05, pA0_06, pA0_07
  92. //v02 pA0_08, pA0_09, pA0_10, pA0_11
  93. //v03 pA0_12, pA0_13, pA0_14, pA0_15
  94. //v04 pA1_00, pA1_01, pA1_02, pA1_03
  95. //v05 pA1_04, pA1_05, pA1_06, pA1_07
  96. //v06 pA1_08, pA1_09, pA1_10, pA1_11
  97. //v07 pA1_12, pA1_13, pA1_14, pA1_15
  98. //v08 must save pB00
  99. //v09 must save pB01
  100. //v10 must save pB02
  101. //v11 must save pB03
  102. //v12 must save pB10
  103. //v13 must save pB11
  104. //v14 must save pB12
  105. //v15 must save pB13
  106. //v16 must save C00, C01, C02, C03
  107. //v17 must save C04, C05, C06, C07
  108. //v18 C08, C09, C10, C11
  109. //v19 C12, C13, C14, C15
  110. //v20 C16, C17, C18, C19
  111. //v21 C20, C21, C22, C23
  112. //v22 C24, C25, C26, C27
  113. //v23 C28, C29, C30, C31
  114. //v24 C32, C33, C34, C35
  115. //v25 C36, C37, C38, C39
  116. //v26 C40, C41, C42, C43
  117. //v27 C44, C45, C46, C47
  118. //v28 C48, C49, C50, C51
  119. //v29 C52, C53, C54, C55
  120. //v30 C56, C57, C58, C59
  121. //v31 C60, C61, C62, C63
  122. /*******************************************************************************
  123. * Macro definitions
  124. *******************************************************************************/
  // INIT16x4 - clear the sixteen accumulators v16-v31 of the 16x4 tile.
  // Each register is zeroed directly from wzr; a scalar fmov write also
  // clears the upper lanes of the vector register.
  .macro INIT16x4
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s18, wzr
      fmov    s19, wzr
      fmov    s20, wzr
      fmov    s21, wzr
      fmov    s22, wzr
      fmov    s23, wzr
      fmov    s24, wzr
      fmov    s25, wzr
      fmov    s26, wzr
      fmov    s27, wzr
      fmov    s28, wzr
      fmov    s29, wzr
      fmov    s30, wzr
      fmov    s31, wzr
  .endm
  // KERNEL16x4_I - opening step of the pipelined 16x4 inner loop.
  // Loads A into v0-v3 and B into s8-s11, seeds every accumulator v16-v31
  // with fmul (overwriting, so no INIT16x4 is required first) and preloads
  // the operands of the next step into v4-v7 / s12-s15.
  // pA advances 128 bytes and pB 32 bytes.
  // The instruction order is hand-interleaved and is kept unchanged.
  .macro KERNEL16x4_I
      ldp     q0, q1, [pA], #32
      ldp     s8, s9, [pB], #8

      fmul    v16.4s, v0.4s, v8.s[0]
      fmul    v20.4s, v0.4s, v9.s[0]
      ldp     s10, s11, [pB], #8

      fmul    v24.4s, v0.4s, v10.s[0]
      fmul    v28.4s, v0.4s, v11.s[0]
      ldp     q2, q3, [pA], #32

      fmul    v17.4s, v1.4s, v8.s[0]
      fmul    v21.4s, v1.4s, v9.s[0]
      ldp     q4, q5, [pA], #32           // A for the next step

      fmul    v25.4s, v1.4s, v10.s[0]
      fmul    v29.4s, v1.4s, v11.s[0]
      ldp     s12, s13, [pB], #8          // B for the next step

      fmul    v18.4s, v2.4s, v8.s[0]
      fmul    v22.4s, v2.4s, v9.s[0]
      ldp     s14, s15, [pB], #8

      fmul    v19.4s, v3.4s, v8.s[0]
      fmul    v23.4s, v3.4s, v9.s[0]
      ldp     q6, q7, [pA], #32

      fmul    v26.4s, v2.4s, v10.s[0]
      fmul    v30.4s, v2.4s, v11.s[0]
      prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

      fmul    v27.4s, v3.4s, v10.s[0]
      fmul    v31.4s, v3.4s, v11.s[0]
      prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  .endm
  // KERNEL16x4_M1 - pipelined middle step, first register set.
  // Accumulates A in v0-v3 times B in v8-v11 (lane 0) into v16-v31 while
  // loading the operands of the following step into v4-v7 / s12-s15.
  // pA advances 64 bytes and pB 16 bytes; two A prefetches are issued.
  // The instruction order is hand-interleaved and is kept unchanged.
  .macro KERNEL16x4_M1
      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      ldp     q4, q5, [pA], #32

      fmla    v18.4s, v2.4s, v8.s[0]
      fmla    v19.4s, v3.4s, v8.s[0]

      fmla    v20.4s, v0.4s, v9.s[0]
      fmla    v21.4s, v1.4s, v9.s[0]
      ldp     s12, s13, [pB], #8

      fmla    v22.4s, v2.4s, v9.s[0]
      fmla    v23.4s, v3.4s, v9.s[0]
      ldp     s14, s15, [pB], #8

      fmla    v24.4s, v0.4s, v10.s[0]
      fmla    v25.4s, v1.4s, v10.s[0]
      prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]

      fmla    v26.4s, v2.4s, v10.s[0]
      fmla    v27.4s, v3.4s, v10.s[0]
      prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

      fmla    v28.4s, v0.4s, v11.s[0]
      fmla    v29.4s, v1.4s, v11.s[0]
      ldp     q6, q7, [pA], #32

      fmla    v30.4s, v2.4s, v11.s[0]
      fmla    v31.4s, v3.4s, v11.s[0]
  .endm
  // KERNEL16x4_M2 - pipelined middle step, second register set.
  // Accumulates A in v4-v7 times B in v12-v15 (lane 0) into v16-v31 while
  // reloading v0-v3 / s8-s11 for the following step.
  // pA advances 64 bytes and pB 16 bytes; one B prefetch is issued.
  // The instruction order is hand-interleaved and is kept unchanged.
  .macro KERNEL16x4_M2
      fmla    v16.4s, v4.4s, v12.s[0]
      fmla    v17.4s, v5.4s, v12.s[0]
      ldp     q0, q1, [pA], #32

      fmla    v18.4s, v6.4s, v12.s[0]
      fmla    v19.4s, v7.4s, v12.s[0]

      fmla    v20.4s, v4.4s, v13.s[0]
      fmla    v21.4s, v5.4s, v13.s[0]
      ldp     s8, s9, [pB], #8

      fmla    v22.4s, v6.4s, v13.s[0]
      fmla    v23.4s, v7.4s, v13.s[0]
      ldp     s10, s11, [pB], #8

      fmla    v24.4s, v4.4s, v14.s[0]
      fmla    v25.4s, v5.4s, v14.s[0]
      prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

      fmla    v26.4s, v6.4s, v14.s[0]
      fmla    v27.4s, v7.4s, v14.s[0]
      ldp     q2, q3, [pA], #32

      fmla    v28.4s, v4.4s, v15.s[0]
      fmla    v29.4s, v5.4s, v15.s[0]
      fmla    v30.4s, v6.4s, v15.s[0]
      fmla    v31.4s, v7.4s, v15.s[0]
  .endm
  // KERNEL16x4_E - closing step of the pipelined 16x4 sequence.
  // Consumes the operands already sitting in v4-v7 / v12-v15 and loads
  // nothing further, so pA and pB are left untouched. One B prefetch is
  // issued. The instruction order is kept unchanged.
  .macro KERNEL16x4_E
      fmla    v16.4s, v4.4s, v12.s[0]
      fmla    v20.4s, v4.4s, v13.s[0]
      fmla    v24.4s, v4.4s, v14.s[0]
      fmla    v28.4s, v4.4s, v15.s[0]

      fmla    v17.4s, v5.4s, v12.s[0]
      fmla    v21.4s, v5.4s, v13.s[0]
      fmla    v25.4s, v5.4s, v14.s[0]
      fmla    v29.4s, v5.4s, v15.s[0]
      prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

      fmla    v18.4s, v6.4s, v12.s[0]
      fmla    v22.4s, v6.4s, v13.s[0]
      fmla    v26.4s, v6.4s, v14.s[0]
      fmla    v30.4s, v6.4s, v15.s[0]

      fmla    v19.4s, v7.4s, v12.s[0]
      fmla    v23.4s, v7.4s, v13.s[0]
      fmla    v27.4s, v7.4s, v14.s[0]
      fmla    v31.4s, v7.4s, v15.s[0]
  .endm
  // KERNEL16x4_SUB - one self-contained k step of the 16x4 tile.
  // Loads 16 A values (v0-v3) and 4 B values (s8-s11) and accumulates
  // their products into v16-v31; the accumulators must already hold valid
  // values because fmla adds to them.
  // pA advances 64 bytes and pB 16 bytes.
  // The instruction order is kept unchanged.
  .macro KERNEL16x4_SUB
      ldp     q0, q1, [pA], #32
      ldp     s8, s9, [pB], #8

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v20.4s, v0.4s, v9.s[0]
      ldp     s10, s11, [pB], #8

      fmla    v24.4s, v0.4s, v10.s[0]
      fmla    v28.4s, v0.4s, v11.s[0]
      ldp     q2, q3, [pA], #32

      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v21.4s, v1.4s, v9.s[0]
      fmla    v25.4s, v1.4s, v10.s[0]
      fmla    v29.4s, v1.4s, v11.s[0]

      fmla    v18.4s, v2.4s, v8.s[0]
      fmla    v22.4s, v2.4s, v9.s[0]
      prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]

      fmla    v19.4s, v3.4s, v8.s[0]
      fmla    v23.4s, v3.4s, v9.s[0]

      fmla    v26.4s, v2.4s, v10.s[0]
      fmla    v30.4s, v2.4s, v11.s[0]
      prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]

      fmla    v27.4s, v3.4s, v10.s[0]
      fmla    v31.4s, v3.4s, v11.s[0]
  .endm
  // SAVE16x4 - scale the 16x4 tile by alpha and write it to C.
  // alpha is first copied from its general register into alpha0 (v10 was
  // used for B operands by the kernels). C is only written, never read.
  // Each of pCRow0-pCRow3 ends up advanced by 64 bytes; the pointer bump is
  // folded into post-indexed stores. Clobbers v0-v7 and v10.
  .macro SAVE16x4
      fmov    alpha0, alpha

      prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV0
      stp     q0, q1, [pCRow0], #32
      fmul    v2.4s, v18.4s, alphaV0
      fmul    v3.4s, v19.4s, alphaV0
      stp     q2, q3, [pCRow0], #32

      prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV0
      stp     q4, q5, [pCRow1], #32
      fmul    v6.4s, v22.4s, alphaV0
      fmul    v7.4s, v23.4s, alphaV0
      stp     q6, q7, [pCRow1], #32

      prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
      fmul    v0.4s, v24.4s, alphaV0
      fmul    v1.4s, v25.4s, alphaV0
      stp     q0, q1, [pCRow2], #32
      fmul    v2.4s, v26.4s, alphaV0
      fmul    v3.4s, v27.4s, alphaV0
      stp     q2, q3, [pCRow2], #32

      prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
      fmul    v4.4s, v28.4s, alphaV0
      fmul    v5.4s, v29.4s, alphaV0
      stp     q4, q5, [pCRow3], #32
      fmul    v6.4s, v30.4s, alphaV0
      fmul    v7.4s, v31.4s, alphaV0
      stp     q6, q7, [pCRow3], #32
  .endm
  300. /******************************************************************************/
  // INIT8x4 - clear the eight accumulators of the 8x4 tile
  // (v16/v17, v20/v21, v24/v25, v28/v29), each zeroed directly from wzr.
  .macro INIT8x4
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s20, wzr
      fmov    s21, wzr
      fmov    s24, wzr
      fmov    s25, wzr
      fmov    s28, wzr
      fmov    s29, wzr
  .endm
  // KERNEL8x4_I - opening step of the pipelined 8x4 inner loop.
  // Loads B into s8-s11 and A into v0/v1, seeds the eight accumulators with
  // fmul (overwriting), then preloads s12-s15 and v4/v5 for the next step.
  // pA advances 64 bytes and pB 32 bytes. The two A quadwords of each step
  // are fetched with a single ldp.
  .macro KERNEL8x4_I
      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8
      ldp     q0, q1, [pA], #32

      fmul    v16.4s, v0.4s, v8.s[0]
      fmul    v17.4s, v1.4s, v8.s[0]
      fmul    v20.4s, v0.4s, v9.s[0]
      fmul    v21.4s, v1.4s, v9.s[0]
      fmul    v24.4s, v0.4s, v10.s[0]
      fmul    v25.4s, v1.4s, v10.s[0]
      fmul    v28.4s, v0.4s, v11.s[0]
      fmul    v29.4s, v1.4s, v11.s[0]

      ldp     s12, s13, [pB], #8
      ldp     s14, s15, [pB], #8
      ldp     q4, q5, [pA], #32
  .endm
  // KERNEL8x4_M1 - pipelined 8x4 middle step, first register set.
  // Accumulates v0/v1 times s8-s11 and then loads s12-s15 and v4/v5 for the
  // following step. pA advances 32 bytes and pB 16 bytes.
  .macro KERNEL8x4_M1
      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v20.4s, v0.4s, v9.s[0]
      fmla    v21.4s, v1.4s, v9.s[0]
      fmla    v24.4s, v0.4s, v10.s[0]
      fmla    v25.4s, v1.4s, v10.s[0]
      fmla    v28.4s, v0.4s, v11.s[0]
      fmla    v29.4s, v1.4s, v11.s[0]

      ldp     s12, s13, [pB], #8
      ldp     s14, s15, [pB], #8
      ldp     q4, q5, [pA], #32
  .endm
  // KERNEL8x4_M2 - pipelined 8x4 middle step, second register set.
  // Accumulates v4/v5 times s12-s15 and then reloads s8-s11 and v0/v1 for
  // the following step. pA advances 32 bytes and pB 16 bytes.
  .macro KERNEL8x4_M2
      fmla    v16.4s, v4.4s, v12.s[0]
      fmla    v17.4s, v5.4s, v12.s[0]
      fmla    v20.4s, v4.4s, v13.s[0]
      fmla    v21.4s, v5.4s, v13.s[0]
      fmla    v24.4s, v4.4s, v14.s[0]
      fmla    v25.4s, v5.4s, v14.s[0]
      fmla    v28.4s, v4.4s, v15.s[0]
      fmla    v29.4s, v5.4s, v15.s[0]

      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8
      ldp     q0, q1, [pA], #32
  .endm
  // KERNEL8x4_E - closing step of the pipelined 8x4 sequence.
  // Consumes the operands already held in v4/v5 and v12-v15; performs no
  // loads, so pA and pB are unchanged.
  .macro KERNEL8x4_E
      fmla    v16.4s, v4.4s, v12.s[0]
      fmla    v17.4s, v5.4s, v12.s[0]
      fmla    v20.4s, v4.4s, v13.s[0]
      fmla    v21.4s, v5.4s, v13.s[0]
      fmla    v24.4s, v4.4s, v14.s[0]
      fmla    v25.4s, v5.4s, v14.s[0]
      fmla    v28.4s, v4.4s, v15.s[0]
      fmla    v29.4s, v5.4s, v15.s[0]
  .endm
  // KERNEL8x4_SUB - one self-contained k step of the 8x4 tile.
  // Loads 4 B values (s8-s11) and 8 A values (v0/v1) and accumulates their
  // products into the eight 8x4 accumulators.
  // pA advances 32 bytes and pB 16 bytes.
  .macro KERNEL8x4_SUB
      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8
      ldp     q0, q1, [pA], #32

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v20.4s, v0.4s, v9.s[0]
      fmla    v21.4s, v1.4s, v9.s[0]
      fmla    v24.4s, v0.4s, v10.s[0]
      fmla    v25.4s, v1.4s, v10.s[0]
      fmla    v28.4s, v0.4s, v11.s[0]
      fmla    v29.4s, v1.4s, v11.s[0]
  .endm
  // SAVE8x4 - scale the 8x4 tile by alpha and write it to C (write only).
  // Each of pCRow0-pCRow3 is advanced by 32 bytes through the post-indexed
  // store. Clobbers v0-v7 and v10 (alpha0).
  .macro SAVE8x4
      fmov    alpha0, alpha

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV0
      stp     q0, q1, [pCRow0], #32

      fmul    v2.4s, v20.4s, alphaV0
      fmul    v3.4s, v21.4s, alphaV0
      stp     q2, q3, [pCRow1], #32

      fmul    v4.4s, v24.4s, alphaV0
      fmul    v5.4s, v25.4s, alphaV0
      stp     q4, q5, [pCRow2], #32

      fmul    v6.4s, v28.4s, alphaV0
      fmul    v7.4s, v29.4s, alphaV0
      stp     q6, q7, [pCRow3], #32
  .endm
  400. /******************************************************************************/
  // INIT4x4 - clear the four accumulators of the 4x4 tile (v16, v20, v24,
  // v28); the scalar write from wzr zeroes the whole vector register.
  .macro INIT4x4
      fmov    s16, wzr
      fmov    s20, wzr
      fmov    s24, wzr
      fmov    s28, wzr
  .endm
  // KERNEL4x4_I - opening step of the pipelined 4x4 inner loop.
  // Loads B into s8-s11 and A into v0, seeds v16/v20/v24/v28 with fmul
  // (overwriting), then preloads s12-s15 and v1 for the next step.
  // pA advances 32 bytes and pB 32 bytes.
  .macro KERNEL4x4_I
      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8
      ldr     q0, [pA], #16

      fmul    v16.4s, v0.4s, v8.s[0]
      fmul    v20.4s, v0.4s, v9.s[0]
      fmul    v24.4s, v0.4s, v10.s[0]
      fmul    v28.4s, v0.4s, v11.s[0]

      ldp     s12, s13, [pB], #8
      ldp     s14, s15, [pB], #8
      ldr     q1, [pA], #16
  .endm
  // KERNEL4x4_M1 - pipelined 4x4 middle step, first register set.
  // Accumulates v0 times s8-s11, then loads s12-s15 and v1 for the
  // following step. pA advances 16 bytes and pB 16 bytes.
  .macro KERNEL4x4_M1
      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v20.4s, v0.4s, v9.s[0]
      fmla    v24.4s, v0.4s, v10.s[0]
      fmla    v28.4s, v0.4s, v11.s[0]

      ldp     s12, s13, [pB], #8
      ldp     s14, s15, [pB], #8
      ldr     q1, [pA], #16
  .endm
  // KERNEL4x4_M2 - pipelined 4x4 middle step, second register set.
  // Accumulates v1 times s12-s15, then reloads s8-s11 and v0 for the
  // following step. pA advances 16 bytes and pB 16 bytes.
  .macro KERNEL4x4_M2
      fmla    v16.4s, v1.4s, v12.s[0]
      fmla    v20.4s, v1.4s, v13.s[0]
      fmla    v24.4s, v1.4s, v14.s[0]
      fmla    v28.4s, v1.4s, v15.s[0]

      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8
      ldr     q0, [pA], #16
  .endm
  // KERNEL4x4_E - closing step of the pipelined 4x4 sequence.
  // Consumes v1 and s12-s15 that were loaded by the preceding step; no
  // loads are issued, so pA and pB are unchanged.
  .macro KERNEL4x4_E
      fmla    v16.4s, v1.4s, v12.s[0]
      fmla    v20.4s, v1.4s, v13.s[0]
      fmla    v24.4s, v1.4s, v14.s[0]
      fmla    v28.4s, v1.4s, v15.s[0]
  .endm
  // KERNEL4x4_SUB - one self-contained k step of the 4x4 tile.
  // Loads 4 A values (v0) and 4 B values (s8-s11) and accumulates into
  // v16/v20/v24/v28. pA advances 16 bytes and pB 16 bytes.
  // The A and B loads use different pointers and are independent.
  .macro KERNEL4x4_SUB
      ldr     q0, [pA], #16
      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v20.4s, v0.4s, v9.s[0]
      fmla    v24.4s, v0.4s, v10.s[0]
      fmla    v28.4s, v0.4s, v11.s[0]
  .endm
  // SAVE4x4 - scale the 4x4 tile by alpha and write it to C (write only).
  // Each of pCRow0-pCRow3 is advanced by 16 bytes through the post-indexed
  // store. Clobbers v0-v3 and v10 (alpha0).
  .macro SAVE4x4
      fmov    alpha0, alpha

      fmul    v0.4s, v16.4s, alphaV0
      str     q0, [pCRow0], #16

      fmul    v1.4s, v20.4s, alphaV0
      str     q1, [pCRow1], #16

      fmul    v2.4s, v24.4s, alphaV0
      str     q2, [pCRow2], #16

      fmul    v3.4s, v28.4s, alphaV0
      str     q3, [pCRow3], #16
  .endm
  467. /******************************************************************************/
  // INIT2x4 - clear the four accumulators of the 2x4 tile (v16, v20, v24,
  // v28), each zeroed directly from wzr.
  .macro INIT2x4
      fmov    s16, wzr
      fmov    s20, wzr
      fmov    s24, wzr
      fmov    s28, wzr
  .endm
  // KERNEL2x4_SUB - one k step of the 2x4 tile.
  // Loads 2 A values (d0) and 4 B values (s8-s11) and accumulates the
  // two-lane products into v16/v20/v24/v28.
  // pA advances 8 bytes and pB 16 bytes.
  .macro KERNEL2x4_SUB
      ldr     d0, [pA], #8
      ldp     s8, s9, [pB], #8
      ldp     s10, s11, [pB], #8

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v20.2s, v0.2s, v9.s[0]
      fmla    v24.2s, v0.2s, v10.s[0]
      fmla    v28.2s, v0.2s, v11.s[0]
  .endm
  // SAVE2x4 - scale the 2x4 tile by alpha and write it to C (write only).
  // Two floats are stored through each of pCRow0-pCRow3, and each pointer
  // is advanced by 8 bytes by the post-indexed store.
  // Clobbers v0, v1 and v10 (alpha0).
  .macro SAVE2x4
      fmov    alpha0, alpha

      fmul    v0.2s, v16.2s, alphaV0
      str     d0, [pCRow0], #8

      fmul    v1.2s, v20.2s, alphaV0
      str     d1, [pCRow1], #8

      fmul    v0.2s, v24.2s, alphaV0
      str     d0, [pCRow2], #8

      fmul    v1.2s, v28.2s, alphaV0
      str     d1, [pCRow3], #8
  .endm
  498. /******************************************************************************/
  // INIT1x4 - clear the two accumulators of the 1x4 tile (v16 and v20).
  .macro INIT1x4
      fmov    s16, wzr
      fmov    s20, wzr
  .endm
  // KERNEL1x4_SUB - one k step of the 1x4 tile.
  // Loads a single A value (s0) and 4 B values (two in v8, two in v9) and
  // accumulates B * A into v16 (first B pair) and v20 (second B pair).
  // pA advances 4 bytes and pB 16 bytes via post-indexed loads.
  .macro KERNEL1x4_SUB
      ldr     s0, [pA], #4
      ld1     {v8.2s, v9.2s}, [pB], #16

      fmla    v16.2s, v8.2s, v0.s[0]
      fmla    v20.2s, v9.2s, v0.s[0]
  .endm
  // SAVE1x4 - scale the 1x4 tile by alpha and write it to C (write only).
  // Lane 0 / lane 1 of the scaled v16 go to pCRow0 / pCRow1 and the lanes of
  // the scaled v20 go to pCRow2 / pCRow3. Each pointer is advanced by
  // 4 bytes by the post-indexed lane store. Clobbers v8, v12 and v10.
  .macro SAVE1x4
      fmov    alpha0, alpha

      fmul    v8.2s, v16.2s, alphaV0
      st1     {v8.s}[0], [pCRow0], #4
      st1     {v8.s}[1], [pCRow1], #4

      fmul    v12.2s, v20.2s, alphaV0
      st1     {v12.s}[0], [pCRow2], #4
      st1     {v12.s}[1], [pCRow3], #4
  .endm
  524. /******************************************************************************/
  // INIT16x2 - clear the eight accumulators v16-v23 of the 16x2 tile,
  // each zeroed directly from wzr.
  .macro INIT16x2
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s18, wzr
      fmov    s19, wzr
      fmov    s20, wzr
      fmov    s21, wzr
      fmov    s22, wzr
      fmov    s23, wzr
  .endm
  // KERNEL16x2_SUB - one k step of the 16x2 tile.
  // Loads 2 B values into v8 and 16 A values into v0-v3, then accumulates
  // A * v8[0] into v16-v19 and A * v8[1] into v20-v23.
  // pB advances 8 bytes and pA 64 bytes; the four consecutive A vectors are
  // fetched with one multi-register, post-indexed ld1.
  .macro KERNEL16x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v18.4s, v2.4s, v8.s[0]
      fmla    v19.4s, v3.4s, v8.s[0]

      fmla    v20.4s, v0.4s, v8.s[1]
      fmla    v21.4s, v1.4s, v8.s[1]
      fmla    v22.4s, v2.4s, v8.s[1]
      fmla    v23.4s, v3.4s, v8.s[1]
  .endm
  // SAVE16x2 - scale the 16x2 tile by alpha and write it to C (write only).
  // pCRow1 is derived as pCRow0 + LDC before pCRow0 moves. pCRow0 ends up
  // advanced by 64 bytes; pCRow1 is left at the start of its 16 values.
  // Clobbers v0-v7 and v10 (alpha0).
  .macro SAVE16x2
      fmov    alpha0, alpha
      add     pCRow1, pCRow0, LDC

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV0
      fmul    v2.4s, v18.4s, alphaV0
      fmul    v3.4s, v19.4s, alphaV0
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64

      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV0
      fmul    v6.4s, v22.4s, alphaV0
      fmul    v7.4s, v23.4s, alphaV0
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  .endm
  570. /******************************************************************************/
  // INIT8x2 - clear the four accumulators of the 8x2 tile
  // (v16/v17 and v20/v21), each zeroed directly from wzr.
  .macro INIT8x2
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s20, wzr
      fmov    s21, wzr
  .endm
  // KERNEL8x2_SUB - one k step of the 8x2 tile.
  // Loads 2 B values into v8 and 8 A values into v0/v1, then accumulates
  // A * v8[0] into v16/v17 and A * v8[1] into v20/v21.
  // pB advances 8 bytes and pA 32 bytes via post-indexed loads.
  .macro KERNEL8x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.4s, v1.4s}, [pA], #32

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]

      fmla    v20.4s, v0.4s, v8.s[1]
      fmla    v21.4s, v1.4s, v8.s[1]
  .endm
  // SAVE8x2 - scale the 8x2 tile by alpha and write it to C (write only).
  // pCRow1 is derived as pCRow0 + LDC before pCRow0 moves; pCRow0 ends up
  // advanced by 32 bytes. pCRow2 is also set to pCRow1 + LDC although this
  // macro stores nothing through it; that write is retained as is.
  // Clobbers v0, v1, v4, v5 and v10 (alpha0).
  .macro SAVE8x2
      fmov    alpha0, alpha
      add     pCRow1, pCRow0, LDC

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV0
      st1     {v0.4s, v1.4s}, [pCRow0], #32

      add     pCRow2, pCRow1, LDC

      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV0
      st1     {v4.4s, v5.4s}, [pCRow1]
  .endm
  601. /******************************************************************************/
  // INIT4x2 - clear the four accumulators of the 4x2 tile
  // (v16/v17 and v20/v21), each zeroed directly from wzr.
  .macro INIT4x2
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s20, wzr
      fmov    s21, wzr
  .endm
  // KERNEL4x2_SUB - one k step of the 4x2 tile.
  // Loads 2 B values into v8 and 4 A values as two 2-lane halves (v0, v1),
  // then accumulates A * v8[0] into v16/v17 and A * v8[1] into v20/v21.
  // pB advances 8 bytes and pA 16 bytes via post-indexed loads.
  .macro KERNEL4x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.2s, v1.2s}, [pA], #16

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v17.2s, v1.2s, v8.s[0]

      fmla    v20.2s, v0.2s, v8.s[1]
      fmla    v21.2s, v1.2s, v8.s[1]
  .endm
  // SAVE4x2 - scale the 4x2 tile by alpha and write it to C (write only).
  // pCRow1 = pCRow0 + LDC is computed up front, before the post-indexed
  // store moves pCRow0 forward by 16 bytes. pCRow1 itself is not advanced.
  // Clobbers v8, v9, v12, v13 and v10 (alpha0).
  .macro SAVE4x2
      fmov    alpha0, alpha
      add     pCRow1, pCRow0, LDC

      fmul    v8.2s, v16.2s, alphaV0
      fmul    v9.2s, v17.2s, alphaV0
      st1     {v8.2s, v9.2s}, [pCRow0], #16

      fmul    v12.2s, v20.2s, alphaV0
      fmul    v13.2s, v21.2s, alphaV0
      st1     {v12.2s, v13.2s}, [pCRow1]
  .endm
  629. /******************************************************************************/
  // INIT2x2 - clear the two accumulators of the 2x2 tile (v16 and v20).
  .macro INIT2x2
      fmov    s16, wzr
      fmov    s20, wzr
  .endm
  // KERNEL2x2_SUB - one k step of the 2x2 tile.
  // Loads 2 B values into v8 and 2 A values into v0, then accumulates
  // A * v8[0] into v16 and A * v8[1] into v20.
  // pB and pA each advance 8 bytes via post-indexed loads.
  .macro KERNEL2x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.2s}, [pA], #8

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v20.2s, v0.2s, v8.s[1]
  .endm
  // SAVE2x2 - scale the 2x2 tile by alpha and write it to C (write only).
  // pCRow1 = pCRow0 + LDC is computed up front, before the post-indexed
  // store moves pCRow0 forward by 8 bytes. pCRow1 itself is not advanced.
  // Clobbers v8, v12 and v10 (alpha0).
  .macro SAVE2x2
      fmov    alpha0, alpha
      add     pCRow1, pCRow0, LDC

      fmul    v8.2s, v16.2s, alphaV0
      st1     {v8.2s}, [pCRow0], #8

      fmul    v12.2s, v20.2s, alphaV0
      st1     {v12.2s}, [pCRow1]
  .endm
  651. /******************************************************************************/
  // INIT1x2 - clear v16, the only accumulator of the 1x2 tile.
  .macro INIT1x2
      fmov    s16, wzr
  .endm
  // KERNEL1x2_SUB - one k step of the 1x2 tile.
  // Loads 2 B values into v8 and a single A value into s0, then
  // accumulates B * A into the two lanes of v16.
  // pB advances 8 bytes and pA 4 bytes via post-indexed loads.
  .macro KERNEL1x2_SUB
      ld1     {v8.2s}, [pB], #8
      ldr     s0, [pA], #4

      fmla    v16.2s, v8.2s, v0.s[0]
  .endm
  // SAVE1x2 - scale the 1x2 tile by alpha and write it to C (write only).
  // Lane 0 goes to pCRow0 and lane 1 to pCRow1 (= pCRow0 + LDC, computed
  // before pCRow0 moves). Only pCRow0 is advanced, by 4 bytes.
  // Clobbers v8 and v10 (alpha0).
  .macro SAVE1x2
      fmov    alpha0, alpha
      add     pCRow1, pCRow0, LDC

      fmul    v8.2s, v16.2s, alphaV0
      st1     {v8.s}[0], [pCRow0], #4
      st1     {v8.s}[1], [pCRow1]
  .endm
  670. /******************************************************************************/
  // INIT16x1 - clear the four accumulators v16-v19 of the 16x1 tile.
  .macro INIT16x1
      fmov    s16, wzr
      fmov    s17, wzr
      fmov    s18, wzr
      fmov    s19, wzr
  .endm
  // KERNEL16x1_SUB - one k step of the 16x1 tile.
  // Loads a single B value into s8 and 16 A values into v0-v3, then
  // accumulates A * B into v16-v19.
  // pB advances 4 bytes and pA 64 bytes; the four consecutive A vectors are
  // fetched with one multi-register, post-indexed ld1.
  .macro KERNEL16x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v18.4s, v2.4s, v8.s[0]
      fmla    v19.4s, v3.4s, v8.s[0]
  .endm
  // SAVE16x1 - scale the 16x1 tile by alpha and write 16 floats through
  // pCRow0 (write only), advancing pCRow0 by 64 bytes with the post-indexed
  // store. Clobbers v0-v3 and v10 (alpha0).
  .macro SAVE16x1
      fmov    alpha0, alpha

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV0
      fmul    v2.4s, v18.4s, alphaV0
      fmul    v3.4s, v19.4s, alphaV0
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64
  .endm
  702. /******************************************************************************/
  // INIT8x1 - clear the two accumulators v16 and v17 of the 8x1 tile.
  .macro INIT8x1
      fmov    s16, wzr
      fmov    s17, wzr
  .endm
  // KERNEL8x1_SUB - one k step of the 8x1 tile.
  // Loads a single B value into s8 and 8 A values into v0/v1, then
  // accumulates A * B into v16/v17.
  // pB advances 4 bytes and pA 32 bytes via post-indexed loads.
  .macro KERNEL8x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.4s, v1.4s}, [pA], #32

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
  .endm
  // SAVE8x1 - scale the 8x1 tile by alpha and write 8 floats through pCRow0
  // (write only), advancing pCRow0 by 32 bytes with the post-indexed store.
  // Clobbers v0, v1 and v10 (alpha0).
  .macro SAVE8x1
      fmov    alpha0, alpha

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV0
      st1     {v0.4s, v1.4s}, [pCRow0], #32
  .endm
  724. /******************************************************************************/
  // INIT4x1 - clear the two accumulators v16 and v17 of the 4x1 tile.
  .macro INIT4x1
      fmov    s16, wzr
      fmov    s17, wzr
  .endm
  // KERNEL4x1_SUB - one k step of the 4x1 tile.
  // Loads a single B value into s8 and 4 A values as two 2-lane halves
  // (v0, v1), then accumulates A * B into v16/v17.
  // pB advances 4 bytes and pA 16 bytes via post-indexed loads.
  .macro KERNEL4x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.2s, v1.2s}, [pA], #16

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v17.2s, v1.2s, v8.s[0]
  .endm
  // SAVE4x1 - scale the 4x1 tile by alpha and write 4 floats through pCRow0
  // (write only), advancing pCRow0 by 16 bytes with the post-indexed store.
  // Clobbers v8, v9 and v10 (alpha0).
  .macro SAVE4x1
      fmov    alpha0, alpha

      fmul    v8.2s, v16.2s, alphaV0
      fmul    v9.2s, v17.2s, alphaV0
      st1     {v8.2s, v9.2s}, [pCRow0], #16
  .endm
  744. /******************************************************************************/
  // INIT2x1 - clear v16, the only accumulator of the 2x1 tile.
  .macro INIT2x1
      fmov    s16, wzr
  .endm
  // KERNEL2x1_SUB - one k step of the 2x1 tile.
  // Loads a single B value into s8 and 2 A values into v0, then
  // accumulates A * B into the two lanes of v16.
  // pB advances 4 bytes and pA 8 bytes via post-indexed loads.
  .macro KERNEL2x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.2s}, [pA], #8

      fmla    v16.2s, v0.2s, v8.s[0]
  .endm
  // SAVE2x1 - scale the 2x1 tile by alpha and write 2 floats through pCRow0
  // (write only), advancing pCRow0 by 8 bytes with the post-indexed store.
  // Clobbers v8 and v10 (alpha0).
  .macro SAVE2x1
      fmov    alpha0, alpha

      fmul    v8.2s, v16.2s, alphaV0
      st1     {v8.2s}, [pCRow0], #8
  .endm
  761. /******************************************************************************/
  // INIT1x1 - clear s16, the scalar accumulator of the 1x1 tile.
  .macro INIT1x1
      fmov    s16, wzr
  .endm
  // KERNEL1x1_SUB - one k step of the 1x1 tile.
  // Loads one B value (s8) and one A value (s0) and performs the fused
  // scalar update s16 = s0 * s8 + s16.
  // pB and pA each advance 4 bytes via post-indexed loads.
  .macro KERNEL1x1_SUB
      ldr     s8, [pB], #4
      ldr     s0, [pA], #4

      fmadd   s16, s0, s8, s16
  .endm
  // SAVE1x1 - scale the scalar accumulator by alpha and write one float
  // through pCRow0 (write only), advancing pCRow0 by 4 bytes with the
  // post-indexed store. Clobbers s8 and s10 (alpha0).
  .macro SAVE1x1
      fmov    alpha0, alpha

      fmul    s8, s16, alpha0
      str     s8, [pCRow0], #4
  .endm
  778. /*******************************************************************************
  779. * End of macro definitions
  780. *******************************************************************************/
  781. PROLOGUE
  782. .Lstrmm_kernel_begin:
  783. .align 5
  784. add sp, sp, #-(11 * 16)
  785. stp d8, d9, [sp, #(0 * 16)]
  786. stp d10, d11, [sp, #(1 * 16)]
  787. stp d12, d13, [sp, #(2 * 16)]
  788. stp d14, d15, [sp, #(3 * 16)]
  789. stp d16, d17, [sp, #(4 * 16)]
  790. stp x18, x19, [sp, #(5 * 16)]
  791. stp x20, x21, [sp, #(6 * 16)]
  792. stp x22, x23, [sp, #(7 * 16)]
  793. stp x24, x25, [sp, #(8 * 16)]
  794. stp x26, x27, [sp, #(9 * 16)]
  795. str x28, [sp, #(10 * 16)]
  796. prfm PLDL1KEEP, [origPB]
  797. prfm PLDL1KEEP, [origPA]
  798. fmov alpha, s0
  799. lsl LDC, LDC, #2 // ldc = ldc * 4
  800. #if !defined(LEFT)
  801. neg tempOffset, offset
  802. #endif
  803. mov pB, origPB
  804. mov counterJ, origN
  805. asr counterJ, counterJ, #2 // J = J / 4
  806. cmp counterJ, #0
  807. ble .Lstrmm_kernel_L2_BEGIN
  808. /******************************************************************************/
  809. .Lstrmm_kernel_L4_BEGIN:
  810. mov pCRow0, pC
  811. add pCRow1, pCRow0, LDC
  812. add pCRow2, pCRow1, LDC
  813. add pCRow3, pCRow2, LDC
  814. add pC, pCRow3, LDC
  815. #if defined(LEFT)
  816. mov tempOffset, offset
  817. #endif
  818. mov pA, origPA // pA = start of A array
  819. .Lstrmm_kernel_L4_M16_BEGIN:
  820. mov counterI, origM
  821. asr counterI, counterI, #4 // counterI = counterI / 16
  822. cmp counterI, #0
  823. ble .Lstrmm_kernel_L4_M8_BEGIN
  824. .align 5
  825. .Lstrmm_kernel_L4_M16_20:
  826. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  827. mov pB, origPB
  828. #else
  829. mov pB, origPB
  830. lsl temp, tempOffset, #6
  831. add pA, pA, temp
  832. lsl temp, tempOffset, #4
  833. add pB, pB, temp
  834. #endif
  835. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  836. sub tempK, origK, tempOffset
  837. #elif defined(LEFT)
  838. add tempK, tempOffset, #16
  839. #else
  840. add tempK, tempOffset, #4
  841. #endif
  842. asr counterL , tempK, #3
  843. cmp counterL , #2
  844. blt .Lstrmm_kernel_L4_M16_32
  845. KERNEL16x4_I
  846. KERNEL16x4_M2
  847. KERNEL16x4_M1
  848. KERNEL16x4_M2
  849. KERNEL16x4_M1
  850. KERNEL16x4_M2
  851. KERNEL16x4_M1
  852. KERNEL16x4_M2
  853. subs counterL, counterL, #2
  854. ble .Lstrmm_kernel_L4_M16_22a
  855. .align 5
  856. .Lstrmm_kernel_L4_M16_22:
  857. KERNEL16x4_M1
  858. KERNEL16x4_M2
  859. KERNEL16x4_M1
  860. KERNEL16x4_M2
  861. KERNEL16x4_M1
  862. KERNEL16x4_M2
  863. KERNEL16x4_M1
  864. KERNEL16x4_M2
  865. subs counterL, counterL, #1
  866. bgt .Lstrmm_kernel_L4_M16_22
  867. .align 5
  868. .Lstrmm_kernel_L4_M16_22a:
  869. KERNEL16x4_M1
  870. KERNEL16x4_M2
  871. KERNEL16x4_M1
  872. KERNEL16x4_M2
  873. KERNEL16x4_M1
  874. KERNEL16x4_M2
  875. KERNEL16x4_M1
  876. KERNEL16x4_E
  877. b .Lstrmm_kernel_L4_M16_44
  878. .align 5
  879. .Lstrmm_kernel_L4_M16_32:
  880. tst counterL, #1
  881. ble .Lstrmm_kernel_L4_M16_40
  882. KERNEL16x4_I
  883. KERNEL16x4_M2
  884. KERNEL16x4_M1
  885. KERNEL16x4_M2
  886. KERNEL16x4_M1
  887. KERNEL16x4_M2
  888. KERNEL16x4_M1
  889. KERNEL16x4_E
  890. b .Lstrmm_kernel_L4_M16_44
  891. .Lstrmm_kernel_L4_M16_40:
  892. INIT16x4
  893. .Lstrmm_kernel_L4_M16_44:
  894. ands counterL , tempK, #7
  895. ble .Lstrmm_kernel_L4_M16_100
  896. .align 5
  897. .Lstrmm_kernel_L4_M16_46:
  898. KERNEL16x4_SUB
  899. subs counterL, counterL, #1
  900. bne .Lstrmm_kernel_L4_M16_46
  901. .Lstrmm_kernel_L4_M16_100:
  902. SAVE16x4
  903. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  904. sub tempK, origK, tempOffset
  905. #if defined(LEFT)
  906. sub tempK, tempK, #16
  907. #else
  908. sub tempK, tempK, #4
  909. #endif
  910. lsl temp, tempK, #6
  911. add pA, pA, temp
  912. lsl temp, tempK, #4
  913. add pB, pB, temp
  914. #endif
  915. #if defined(LEFT)
  916. add tempOffset, tempOffset, #16
  917. #endif
  918. prfm PLDL1KEEP, [pA]
  919. prfm PLDL1KEEP, [pA, #64]
  920. prfm PLDL1KEEP, [origPB]
  921. .Lstrmm_kernel_L4_M16_END:
  922. subs counterI, counterI, #1
  923. bne .Lstrmm_kernel_L4_M16_20
  924. //------------------------------------------------------------------------------
  925. .Lstrmm_kernel_L4_M8_BEGIN:
  926. mov counterI, origM
  927. tst counterI , #15
  928. ble .Lstrmm_kernel_L4_END
  929. tst counterI, #8
  930. ble .Lstrmm_kernel_L4_M4_BEGIN
  931. .Lstrmm_kernel_L4_M8_20:
  932. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  933. mov pB, origPB
  934. #else
  935. mov pB, origPB
  936. lsl temp, tempOffset, #5
  937. add pA, pA, temp
  938. lsl temp, tempOffset, #4
  939. add pB, pB, temp
  940. #endif
  941. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  942. sub tempK, origK, tempOffset
  943. #elif defined(LEFT)
  944. add tempK, tempOffset, #8
  945. #else
  946. add tempK, tempOffset, #4
  947. #endif
  948. asr counterL , tempK, #1 // L = K / 2
  949. cmp counterL , #2 // is there at least 4 to do?
  950. blt .Lstrmm_kernel_L4_M8_32
  951. KERNEL8x4_I // do one in the K
  952. KERNEL8x4_M2 // do another in the K
  953. subs counterL, counterL, #2
  954. ble .Lstrmm_kernel_L4_M8_22a
  955. .align 5
  956. .Lstrmm_kernel_L4_M8_22:
  957. KERNEL8x4_M1
  958. KERNEL8x4_M2
  959. subs counterL, counterL, #1
  960. bgt .Lstrmm_kernel_L4_M8_22
  961. .Lstrmm_kernel_L4_M8_22a:
  962. KERNEL8x4_M1
  963. KERNEL8x4_E
  964. b .Lstrmm_kernel_L4_M8_44
  965. .Lstrmm_kernel_L4_M8_32:
  966. tst counterL, #1
  967. ble .Lstrmm_kernel_L4_M8_40
  968. KERNEL8x4_I
  969. KERNEL8x4_E
  970. b .Lstrmm_kernel_L4_M8_44
  971. .Lstrmm_kernel_L4_M8_40:
  972. INIT8x4
  973. .Lstrmm_kernel_L4_M8_44:
  974. ands counterL , tempK, #1
  975. ble .Lstrmm_kernel_L4_M8_100
  976. .Lstrmm_kernel_L4_M8_46:
  977. KERNEL8x4_SUB
  978. .Lstrmm_kernel_L4_M8_100:
  979. SAVE8x4
  980. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  981. sub tempK, origK, tempOffset
  982. #if defined(LEFT)
  983. sub tempK, tempK, #8
  984. #else
  985. sub tempK, tempK, #4
  986. #endif
  987. lsl temp, tempK, #5
  988. add pA, pA, temp
  989. lsl temp, tempK, #4
  990. add pB, pB, temp
  991. #endif
  992. #if defined(LEFT)
  993. add tempOffset, tempOffset, #8
  994. #endif
  995. .Lstrmm_kernel_L4_M8_END:
  996. //------------------------------------------------------------------------------
  997. .Lstrmm_kernel_L4_M4_BEGIN: // runs the 4x4 macros once when bit 2 of origM is set
  998. mov counterI, origM
  999. tst counterI , #7 // any of the low three bits of M set?
  1000. ble .Lstrmm_kernel_L4_END // no: after tst, ble is taken only on a zero result
  1001. tst counterI, #4
  1002. ble .Lstrmm_kernel_L4_M2_BEGIN // bit 2 clear: skip this pass
  1003. .Lstrmm_kernel_L4_M4_20:
  1004. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1005. mov pB, origPB
  1006. #else
  1007. mov pB, origPB
  1008. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1009. add pB, pB, temp // the same byte offset is applied to pB ...
  1010. add pA, pA, temp // ... and to pA
  1011. #endif
  1012. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1013. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1014. #elif defined(LEFT)
  1015. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1016. #else
  1017. add tempK, tempOffset, #4 // same value as the LEFT branch
  1018. #endif
  1019. asr counterL , tempK, #1 // counterL = tempK / 2
  1020. cmp counterL , #2 // is there at least 4 to do?
  1021. blt .Lstrmm_kernel_L4_M4_32
  1022. KERNEL4x4_I // do one in the K
  1023. KERNEL4x4_M2 // do another in the K
  1024. subs counterL, counterL, #2 // presumably accounts for the _I/_M2 pair above and the _M1/_E pair at _22a
  1025. ble .Lstrmm_kernel_L4_M4_22a
  1026. .align 5
  1027. .Lstrmm_kernel_L4_M4_22:
  1028. KERNEL4x4_M1
  1029. KERNEL4x4_M2
  1030. subs counterL, counterL, #1
  1031. bgt .Lstrmm_kernel_L4_M4_22
  1032. .Lstrmm_kernel_L4_M4_22a:
  1033. KERNEL4x4_M1
  1034. KERNEL4x4_E
  1035. b .Lstrmm_kernel_L4_M4_44
  1036. .Lstrmm_kernel_L4_M4_32:
  1037. tst counterL, #1 // counterL < 2 here; test its low bit
  1038. ble .Lstrmm_kernel_L4_M4_40 // low bit clear: only INIT4x4 runs
  1039. KERNEL4x4_I
  1040. KERNEL4x4_E
  1041. b .Lstrmm_kernel_L4_M4_44
  1042. .Lstrmm_kernel_L4_M4_40:
  1043. INIT4x4
  1044. .Lstrmm_kernel_L4_M4_44:
  1045. ands counterL , tempK, #1 // counterL = tempK & 1
  1046. ble .Lstrmm_kernel_L4_M4_100 // even tempK: skip the single KERNEL4x4_SUB
  1047. .Lstrmm_kernel_L4_M4_46:
  1048. KERNEL4x4_SUB
  1049. .Lstrmm_kernel_L4_M4_100:
  1050. SAVE4x4
  1051. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1052. sub tempK, origK, tempOffset
  1053. #if defined(LEFT)
  1054. sub tempK, tempK, #4
  1055. #else
  1056. sub tempK, tempK, #4
  1057. #endif
  1058. lsl temp, tempK, #4 // temp = (origK - tempOffset - 4) * 16
  1059. add pA, pA, temp
  1060. add pB, pB, temp
  1061. #endif
  1062. #if defined(LEFT)
  1063. add tempOffset, tempOffset, #4
  1064. #endif
  1065. .Lstrmm_kernel_L4_M4_END:
  1066. //------------------------------------------------------------------------------
  1067. .Lstrmm_kernel_L4_M2_BEGIN: // runs the 2x4 macros once when bit 1 of origM is set
  1068. mov counterI, origM
  1069. tst counterI , #3 // either of the low two bits of M set?
  1070. ble .Lstrmm_kernel_L4_END // no: after tst, ble is taken only on a zero result
  1071. tst counterI, #2 // test bit 1 of counterI (counterI is not modified)
  1072. ble .Lstrmm_kernel_L4_M1_BEGIN // bit 1 clear: skip this pass
  1073. .Lstrmm_kernel_L4_M2_20:
  1074. INIT2x4
  1075. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1076. mov pB, origPB
  1077. #else
  1078. mov pB, origPB
  1079. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1080. add pA, pA, temp
  1081. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1082. add pB, pB, temp
  1083. #endif
  1084. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1085. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1086. #elif defined(LEFT)
  1087. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1088. #else
  1089. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1090. #endif
  1091. asr counterL , tempK, #3 // counterL = tempK / 8
  1092. cmp counterL , #0
  1093. ble .Lstrmm_kernel_L4_M2_40
  1094. .Lstrmm_kernel_L4_M2_22: // eight KERNEL2x4_SUB invocations per iteration
  1095. KERNEL2x4_SUB
  1096. KERNEL2x4_SUB
  1097. KERNEL2x4_SUB
  1098. KERNEL2x4_SUB
  1099. KERNEL2x4_SUB
  1100. KERNEL2x4_SUB
  1101. KERNEL2x4_SUB
  1102. KERNEL2x4_SUB
  1103. subs counterL, counterL, #1
  1104. bgt .Lstrmm_kernel_L4_M2_22
  1105. .Lstrmm_kernel_L4_M2_40:
  1106. ands counterL , tempK, #7 // counterL = tempK % 8
  1107. ble .Lstrmm_kernel_L4_M2_100
  1108. .Lstrmm_kernel_L4_M2_42: // one KERNEL2x4_SUB per leftover count
  1109. KERNEL2x4_SUB
  1110. subs counterL, counterL, #1
  1111. bgt .Lstrmm_kernel_L4_M2_42
  1112. .Lstrmm_kernel_L4_M2_100:
  1113. SAVE2x4
  1114. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1115. sub tempK, origK, tempOffset
  1116. #if defined(LEFT)
  1117. sub tempK, tempK, #2
  1118. #else
  1119. sub tempK, tempK, #4
  1120. #endif
  1121. lsl temp, tempK, #3 // temp = tempK * 8
  1122. add pA, pA, temp
  1123. lsl temp, tempK, #4 // temp = tempK * 16
  1124. add pB, pB, temp
  1125. #endif
  1126. #if defined(LEFT)
  1127. add tempOffset, tempOffset, #2
  1128. #endif
  1129. .Lstrmm_kernel_L4_M2_END:
  1130. .Lstrmm_kernel_L4_M1_BEGIN: // runs the 1x4 macros once when bit 0 of counterI is set
  1131. tst counterI, #1 // test bit 0 of counterI (loaded from origM at .Lstrmm_kernel_L4_M2_BEGIN)
  1132. ble .Lstrmm_kernel_L4_END // bit 0 clear: after tst, ble is taken only on a zero result
  1133. .Lstrmm_kernel_L4_M1_20:
  1134. INIT1x4
  1135. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1136. mov pB, origPB
  1137. #else
  1138. mov pB, origPB
  1139. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1140. add pB, pB, temp
  1141. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1142. add pA, pA, temp
  1143. #endif
  1144. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1145. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1146. #elif defined(LEFT)
  1147. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1148. #else
  1149. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1150. #endif
  1151. asr counterL , tempK, #3 // counterL = tempK / 8
  1152. cmp counterL , #0
  1153. ble .Lstrmm_kernel_L4_M1_40
  1154. .Lstrmm_kernel_L4_M1_22: // eight KERNEL1x4_SUB invocations per iteration
  1155. KERNEL1x4_SUB
  1156. KERNEL1x4_SUB
  1157. KERNEL1x4_SUB
  1158. KERNEL1x4_SUB
  1159. KERNEL1x4_SUB
  1160. KERNEL1x4_SUB
  1161. KERNEL1x4_SUB
  1162. KERNEL1x4_SUB
  1163. subs counterL, counterL, #1
  1164. bgt .Lstrmm_kernel_L4_M1_22
  1165. .Lstrmm_kernel_L4_M1_40:
  1166. ands counterL , tempK, #7 // counterL = tempK % 8
  1167. ble .Lstrmm_kernel_L4_M1_100
  1168. .Lstrmm_kernel_L4_M1_42: // one KERNEL1x4_SUB per leftover count
  1169. KERNEL1x4_SUB
  1170. subs counterL, counterL, #1
  1171. bgt .Lstrmm_kernel_L4_M1_42
  1172. .Lstrmm_kernel_L4_M1_100:
  1173. SAVE1x4
  1174. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1175. sub tempK, origK, tempOffset
  1176. #if defined(LEFT)
  1177. sub tempK, tempK, #1
  1178. #else
  1179. sub tempK, tempK, #4
  1180. #endif
  1181. lsl temp, tempK, #2 // temp = tempK * 4
  1182. add pA, pA, temp
  1183. lsl temp, tempK, #4 // temp = tempK * 16
  1184. add pB, pB, temp
  1185. #endif
  1186. #if defined(LEFT)
  1187. add tempOffset, tempOffset, #1
  1188. #endif
  1189. .Lstrmm_kernel_L4_END: // bottom of the counterJ loop that restarts at .Lstrmm_kernel_L4_BEGIN
  1190. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1191. #if !defined(LEFT)
  1192. add tempOffset, tempOffset, #4
  1193. #endif
  1194. subs counterJ, counterJ , #1 // j--
  1195. bgt .Lstrmm_kernel_L4_BEGIN // repeat while counterJ > 0
  1196. /******************************************************************************/
  1197. .Lstrmm_kernel_L2_BEGIN: // handles the N & 2 part of the N & 3 remainder
  1198. mov counterJ , origN
  1199. tst counterJ , #3 // either of the low two bits of N set?
  1200. ble .Lstrmm_kernel_L999 // no: after tst, ble is taken only on a zero result
  1201. tst counterJ , #2
  1202. ble .Lstrmm_kernel_L1_BEGIN // bit 1 clear: skip to the L1 section
  1203. mov pCRow0, pC // pCRow0 = pC
  1204. add pC,pC,LDC, lsl #1 // pC = pC + LDC * 2
  1205. #if defined(LEFT)
  1206. mov tempOffset, offset
  1207. #endif
  1208. mov pA, origPA // pA = A
  1209. .Lstrmm_kernel_L2_M16_BEGIN: // loops the 16x2 macros origM / 16 times
  1210. mov counterI, origM
  1211. asr counterI, counterI, #4 // counterI = counterI / 16
  1212. cmp counterI,#0
  1213. ble .Lstrmm_kernel_L2_M8_BEGIN // counterI <= 0: skip this loop
  1214. .Lstrmm_kernel_L2_M16_20:
  1215. INIT16x2
  1216. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1217. mov pB, origPB
  1218. #else
  1219. mov pB, origPB
  1220. lsl temp, tempOffset, #6 // temp = tempOffset * 64
  1221. add pA, pA, temp
  1222. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1223. add pB, pB, temp
  1224. #endif
  1225. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1226. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1227. #elif defined(LEFT)
  1228. add tempK, tempOffset, #16 // tempK = tempOffset + 16
  1229. #else
  1230. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1231. #endif
  1232. asr counterL , tempK, #3 // counterL = tempK / 8
  1233. cmp counterL,#0
  1234. ble .Lstrmm_kernel_L2_M16_40
  1235. .align 5
  1236. .Lstrmm_kernel_L2_M16_22: // eight KERNEL16x2_SUB invocations per iteration
  1237. KERNEL16x2_SUB
  1238. KERNEL16x2_SUB
  1239. KERNEL16x2_SUB
  1240. KERNEL16x2_SUB
  1241. KERNEL16x2_SUB
  1242. KERNEL16x2_SUB
  1243. KERNEL16x2_SUB
  1244. KERNEL16x2_SUB
  1245. subs counterL, counterL, #1
  1246. bgt .Lstrmm_kernel_L2_M16_22
  1247. .Lstrmm_kernel_L2_M16_40:
  1248. ands counterL , tempK, #7 // counterL = tempK % 8
  1249. ble .Lstrmm_kernel_L2_M16_100
  1250. .Lstrmm_kernel_L2_M16_42: // one KERNEL16x2_SUB per leftover count
  1251. KERNEL16x2_SUB
  1252. subs counterL, counterL, #1
  1253. bgt .Lstrmm_kernel_L2_M16_42
  1254. .Lstrmm_kernel_L2_M16_100:
  1255. SAVE16x2
  1256. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1257. sub tempK, origK, tempOffset
  1258. #if defined(LEFT)
  1259. sub tempK, tempK, #16
  1260. #else
  1261. sub tempK, tempK, #2
  1262. #endif
  1263. lsl temp, tempK, #6 // temp = tempK * 64
  1264. add pA, pA, temp
  1265. lsl temp, tempK, #3 // temp = tempK * 8
  1266. add pB, pB, temp
  1267. #endif
  1268. #if defined(LEFT)
  1269. add tempOffset, tempOffset, #16
  1270. #endif
  1271. .Lstrmm_kernel_L2_M16_END:
  1272. subs counterI, counterI, #1
  1273. bgt .Lstrmm_kernel_L2_M16_20 // repeat while counterI > 0
  1274. //------------------------------------------------------------------------------
  1275. .Lstrmm_kernel_L2_M8_BEGIN: // runs the 8x2 macros once when bit 3 of origM is set
  1276. mov counterI, origM
  1277. tst counterI , #15 // any of the low four bits of M set?
  1278. ble .Lstrmm_kernel_L2_END // no: after tst, ble is taken only on a zero result
  1279. tst counterI, #8
  1280. ble .Lstrmm_kernel_L2_M4_BEGIN // bit 3 clear: skip this pass
  1281. .Lstrmm_kernel_L2_M8_20:
  1282. INIT8x2
  1283. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1284. mov pB, origPB
  1285. #else
  1286. mov pB, origPB
  1287. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1288. add pA, pA, temp
  1289. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1290. add pB, pB, temp
  1291. #endif
  1292. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1293. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1294. #elif defined(LEFT)
  1295. add tempK, tempOffset, #8 // tempK = tempOffset + 8
  1296. #else
  1297. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1298. #endif
  1299. asr counterL , tempK, #3 // counterL = tempK / 8
  1300. cmp counterL,#0
  1301. ble .Lstrmm_kernel_L2_M8_40
  1302. .align 5
  1303. .Lstrmm_kernel_L2_M8_22: // eight KERNEL8x2_SUB invocations per iteration
  1304. KERNEL8x2_SUB
  1305. KERNEL8x2_SUB
  1306. KERNEL8x2_SUB
  1307. KERNEL8x2_SUB
  1308. KERNEL8x2_SUB
  1309. KERNEL8x2_SUB
  1310. KERNEL8x2_SUB
  1311. KERNEL8x2_SUB
  1312. subs counterL, counterL, #1
  1313. bgt .Lstrmm_kernel_L2_M8_22
  1314. .Lstrmm_kernel_L2_M8_40:
  1315. ands counterL , tempK, #7 // counterL = tempK % 8
  1316. ble .Lstrmm_kernel_L2_M8_100
  1317. .Lstrmm_kernel_L2_M8_42: // one KERNEL8x2_SUB per leftover count
  1318. KERNEL8x2_SUB
  1319. subs counterL, counterL, #1
  1320. bgt .Lstrmm_kernel_L2_M8_42
  1321. .Lstrmm_kernel_L2_M8_100:
  1322. SAVE8x2
  1323. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1324. sub tempK, origK, tempOffset
  1325. #if defined(LEFT)
  1326. sub tempK, tempK, #8
  1327. #else
  1328. sub tempK, tempK, #2
  1329. #endif
  1330. lsl temp, tempK, #5 // temp = tempK * 32
  1331. add pA, pA, temp
  1332. lsl temp, tempK, #3 // temp = tempK * 8
  1333. add pB, pB, temp
  1334. #endif
  1335. #if defined(LEFT)
  1336. add tempOffset, tempOffset, #8
  1337. #endif
  1338. .Lstrmm_kernel_L2_M8_END:
  1339. //------------------------------------------------------------------------------
  1340. .Lstrmm_kernel_L2_M4_BEGIN: // runs the 4x2 macros once when bit 2 of origM is set
  1341. mov counterI, origM
  1342. tst counterI , #7 // any of the low three bits of M set?
  1343. ble .Lstrmm_kernel_L2_END // no: after tst, ble is taken only on a zero result
  1344. tst counterI, #4
  1345. ble .Lstrmm_kernel_L2_M2_BEGIN // bit 2 clear: skip this pass
  1346. .Lstrmm_kernel_L2_M4_20:
  1347. INIT4x2
  1348. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1349. mov pB, origPB
  1350. #else
  1351. mov pB, origPB
  1352. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1353. add pB, pB, temp
  1354. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1355. add pA, pA, temp
  1356. #endif
  1357. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1358. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1359. #elif defined(LEFT)
  1360. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1361. #else
  1362. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1363. #endif
  1364. asr counterL , tempK, #3 // counterL = tempK / 8
  1365. cmp counterL,#0
  1366. ble .Lstrmm_kernel_L2_M4_40
  1367. .align 5
  1368. .Lstrmm_kernel_L2_M4_22: // eight KERNEL4x2_SUB invocations per iteration
  1369. KERNEL4x2_SUB
  1370. KERNEL4x2_SUB
  1371. KERNEL4x2_SUB
  1372. KERNEL4x2_SUB
  1373. KERNEL4x2_SUB
  1374. KERNEL4x2_SUB
  1375. KERNEL4x2_SUB
  1376. KERNEL4x2_SUB
  1377. subs counterL, counterL, #1
  1378. bgt .Lstrmm_kernel_L2_M4_22
  1379. .Lstrmm_kernel_L2_M4_40:
  1380. ands counterL , tempK, #7 // counterL = tempK % 8
  1381. ble .Lstrmm_kernel_L2_M4_100
  1382. .Lstrmm_kernel_L2_M4_42: // one KERNEL4x2_SUB per leftover count
  1383. KERNEL4x2_SUB
  1384. subs counterL, counterL, #1
  1385. bgt .Lstrmm_kernel_L2_M4_42
  1386. .Lstrmm_kernel_L2_M4_100:
  1387. SAVE4x2
  1388. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1389. sub tempK, origK, tempOffset
  1390. #if defined(LEFT)
  1391. sub tempK, tempK, #4
  1392. #else
  1393. sub tempK, tempK, #2
  1394. #endif
  1395. lsl temp, tempK, #4 // temp = tempK * 16
  1396. add pA, pA, temp
  1397. lsl temp, tempK, #3 // temp = tempK * 8
  1398. add pB, pB, temp
  1399. #endif
  1400. #if defined(LEFT)
  1401. add tempOffset, tempOffset, #4
  1402. #endif
  1403. .Lstrmm_kernel_L2_M4_END:
  1404. //------------------------------------------------------------------------------
  1405. .Lstrmm_kernel_L2_M2_BEGIN: // runs the 2x2 macros once when bit 1 of origM is set
  1406. mov counterI, origM
  1407. tst counterI , #3 // either of the low two bits of M set?
  1408. ble .Lstrmm_kernel_L2_END // no: after tst, ble is taken only on a zero result
  1409. tst counterI, #2 // test bit 1 of counterI (counterI is not modified)
  1410. ble .Lstrmm_kernel_L2_M1_BEGIN // bit 1 clear: skip this pass
  1411. .Lstrmm_kernel_L2_M2_20:
  1412. INIT2x2
  1413. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1414. mov pB, origPB
  1415. #else
  1416. mov pB, origPB
  1417. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1418. add pB, pB, temp
  1419. lsl temp, tempOffset, #3 // same scale for pA
  1420. add pA, pA, temp
  1421. #endif
  1422. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1423. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1424. #elif defined(LEFT)
  1425. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1426. #else
  1427. add tempK, tempOffset, #2 // same value as the LEFT branch
  1428. #endif
  1429. asr counterL , tempK, #3 // counterL = tempK / 8
  1430. cmp counterL,#0
  1431. ble .Lstrmm_kernel_L2_M2_40
  1432. .Lstrmm_kernel_L2_M2_22: // eight KERNEL2x2_SUB invocations per iteration
  1433. KERNEL2x2_SUB
  1434. KERNEL2x2_SUB
  1435. KERNEL2x2_SUB
  1436. KERNEL2x2_SUB
  1437. KERNEL2x2_SUB
  1438. KERNEL2x2_SUB
  1439. KERNEL2x2_SUB
  1440. KERNEL2x2_SUB
  1441. subs counterL, counterL, #1
  1442. bgt .Lstrmm_kernel_L2_M2_22
  1443. .Lstrmm_kernel_L2_M2_40:
  1444. ands counterL , tempK, #7 // counterL = tempK % 8
  1445. ble .Lstrmm_kernel_L2_M2_100
  1446. .Lstrmm_kernel_L2_M2_42: // one KERNEL2x2_SUB per leftover count
  1447. KERNEL2x2_SUB
  1448. subs counterL, counterL, #1
  1449. bgt .Lstrmm_kernel_L2_M2_42
  1450. .Lstrmm_kernel_L2_M2_100:
  1451. SAVE2x2
  1452. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1453. sub tempK, origK, tempOffset
  1454. #if defined(LEFT)
  1455. sub tempK, tempK, #2
  1456. #else
  1457. sub tempK, tempK, #2
  1458. #endif
  1459. lsl temp, tempK, #3 // temp = tempK * 8
  1460. add pA, pA, temp
  1461. lsl temp, tempK, #3 // same scale for pB
  1462. add pB, pB, temp
  1463. #endif
  1464. #if defined(LEFT)
  1465. add tempOffset, tempOffset, #2
  1466. #endif
  1467. .Lstrmm_kernel_L2_M2_END:
  1468. .Lstrmm_kernel_L2_M1_BEGIN: // runs the 1x2 macros once when bit 0 of counterI is set
  1469. tst counterI, #1 // test bit 0 of counterI (loaded from origM at .Lstrmm_kernel_L2_M2_BEGIN)
  1470. ble .Lstrmm_kernel_L2_END // bit 0 clear: after tst, ble is taken only on a zero result
  1471. .Lstrmm_kernel_L2_M1_20:
  1472. INIT1x2
  1473. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1474. mov pB, origPB
  1475. #else
  1476. mov pB, origPB
  1477. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1478. add pB, pB, temp
  1479. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1480. add pA, pA, temp
  1481. #endif
  1482. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1483. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1484. #elif defined(LEFT)
  1485. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1486. #else
  1487. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1488. #endif
  1489. asr counterL , tempK, #3 // counterL = tempK / 8
  1490. cmp counterL, #0
  1491. ble .Lstrmm_kernel_L2_M1_40
  1492. .Lstrmm_kernel_L2_M1_22: // eight KERNEL1x2_SUB invocations per iteration
  1493. KERNEL1x2_SUB
  1494. KERNEL1x2_SUB
  1495. KERNEL1x2_SUB
  1496. KERNEL1x2_SUB
  1497. KERNEL1x2_SUB
  1498. KERNEL1x2_SUB
  1499. KERNEL1x2_SUB
  1500. KERNEL1x2_SUB
  1501. subs counterL, counterL, #1
  1502. bgt .Lstrmm_kernel_L2_M1_22
  1503. .Lstrmm_kernel_L2_M1_40:
  1504. ands counterL , tempK, #7 // counterL = tempK % 8
  1505. ble .Lstrmm_kernel_L2_M1_100
  1506. .Lstrmm_kernel_L2_M1_42: // one KERNEL1x2_SUB per leftover count
  1507. KERNEL1x2_SUB
  1508. subs counterL, counterL, #1
  1509. bgt .Lstrmm_kernel_L2_M1_42
  1510. .Lstrmm_kernel_L2_M1_100:
  1511. SAVE1x2
  1512. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1513. sub tempK, origK, tempOffset
  1514. #if defined(LEFT)
  1515. sub tempK, tempK, #1
  1516. #else
  1517. sub tempK, tempK, #2
  1518. #endif
  1519. lsl temp, tempK, #2 // temp = tempK * 4
  1520. add pA, pA, temp
  1521. lsl temp, tempK, #3 // temp = tempK * 8
  1522. add pB, pB, temp
  1523. #endif
  1524. #if defined(LEFT)
  1525. add tempOffset, tempOffset, #1
  1526. #endif
  1527. .Lstrmm_kernel_L2_END: // end of the L2 section; execution falls through to the L1 section
  1528. #if !defined(LEFT)
  1529. add tempOffset, tempOffset, #2
  1530. #endif
  1531. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1532. /******************************************************************************/
  1533. .Lstrmm_kernel_L1_BEGIN: // L1 section: runs only when bit 0 of origN is set
  1534. mov counterJ , origN
  1535. tst counterJ , #1 // test bit 0 of N
  1536. ble .Lstrmm_kernel_L999 // done
  1537. mov pCRow0, pC // pCRow0 = C
  1538. add pC , pC , LDC // Update pC to point to next
  1539. #if defined(LEFT)
  1540. mov tempOffset, offset
  1541. #endif
  1542. mov pA, origPA // pA = A
  1543. .Lstrmm_kernel_L1_M16_BEGIN: // loops the 16x1 macros origM / 16 times
  1544. mov counterI, origM
  1545. asr counterI, counterI, #4 // counterI = counterI / 16
  1546. cmp counterI, #0
  1547. ble .Lstrmm_kernel_L1_M8_BEGIN // counterI <= 0: skip this loop
  1548. .Lstrmm_kernel_L1_M16_20:
  1549. INIT16x1
  1550. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1551. mov pB, origPB
  1552. #else
  1553. mov pB, origPB
  1554. lsl temp, tempOffset, #6 // temp = tempOffset * 64
  1555. add pA, pA, temp
  1556. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1557. add pB, pB, temp
  1558. #endif
  1559. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1560. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1561. #elif defined(LEFT)
  1562. add tempK, tempOffset, #16 // tempK = tempOffset + 16
  1563. #else
  1564. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1565. #endif
  1566. asr counterL , tempK, #3 // counterL = tempK / 8
  1567. cmp counterL , #0
  1568. ble .Lstrmm_kernel_L1_M16_40
  1569. .align 5
  1570. .Lstrmm_kernel_L1_M16_22: // eight KERNEL16x1_SUB invocations per iteration
  1571. KERNEL16x1_SUB
  1572. KERNEL16x1_SUB
  1573. KERNEL16x1_SUB
  1574. KERNEL16x1_SUB
  1575. KERNEL16x1_SUB
  1576. KERNEL16x1_SUB
  1577. KERNEL16x1_SUB
  1578. KERNEL16x1_SUB
  1579. subs counterL, counterL, #1
  1580. bgt .Lstrmm_kernel_L1_M16_22
  1581. .Lstrmm_kernel_L1_M16_40:
  1582. ands counterL , tempK, #7 // counterL = tempK % 8
  1583. ble .Lstrmm_kernel_L1_M16_100
  1584. .Lstrmm_kernel_L1_M16_42: // one KERNEL16x1_SUB per leftover count
  1585. KERNEL16x1_SUB
  1586. subs counterL, counterL, #1
  1587. bgt .Lstrmm_kernel_L1_M16_42
  1588. .Lstrmm_kernel_L1_M16_100:
  1589. SAVE16x1
  1590. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1591. sub tempK, origK, tempOffset
  1592. #if defined(LEFT)
  1593. sub tempK, tempK, #16
  1594. #else
  1595. sub tempK, tempK, #1
  1596. #endif
  1597. lsl temp, tempK, #6 // temp = tempK * 64
  1598. add pA, pA, temp
  1599. lsl temp, tempK, #2 // temp = tempK * 4
  1600. add pB, pB, temp
  1601. #endif
  1602. #if defined(LEFT)
  1603. add tempOffset, tempOffset, #16
  1604. #endif
  1605. .Lstrmm_kernel_L1_M16_END:
  1606. subs counterI, counterI, #1
  1607. bgt .Lstrmm_kernel_L1_M16_20 // repeat while counterI > 0
  1608. //------------------------------------------------------------------------------
  1609. .Lstrmm_kernel_L1_M8_BEGIN: // runs the 8x1 macros once when bit 3 of origM is set
  1610. mov counterI, origM
  1611. tst counterI , #15 // any of the low four bits of M set?
  1612. ble .Lstrmm_kernel_L1_END // no: after tst, ble is taken only on a zero result
  1613. tst counterI, #8
  1614. ble .Lstrmm_kernel_L1_M4_BEGIN // bit 3 clear: skip this pass
  1615. .Lstrmm_kernel_L1_M8_20:
  1616. INIT8x1
  1617. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1618. mov pB, origPB
  1619. #else
  1620. mov pB, origPB
  1621. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1622. add pA, pA, temp
  1623. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1624. add pB, pB, temp
  1625. #endif
  1626. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1627. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1628. #elif defined(LEFT)
  1629. add tempK, tempOffset, #8 // tempK = tempOffset + 8
  1630. #else
  1631. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1632. #endif
  1633. asr counterL , tempK, #3 // counterL = tempK / 8
  1634. cmp counterL , #0
  1635. ble .Lstrmm_kernel_L1_M8_40
  1636. .align 5
  1637. .Lstrmm_kernel_L1_M8_22: // eight KERNEL8x1_SUB invocations per iteration
  1638. KERNEL8x1_SUB
  1639. KERNEL8x1_SUB
  1640. KERNEL8x1_SUB
  1641. KERNEL8x1_SUB
  1642. KERNEL8x1_SUB
  1643. KERNEL8x1_SUB
  1644. KERNEL8x1_SUB
  1645. KERNEL8x1_SUB
  1646. subs counterL, counterL, #1
  1647. bgt .Lstrmm_kernel_L1_M8_22
  1648. .Lstrmm_kernel_L1_M8_40:
  1649. ands counterL , tempK, #7 // counterL = tempK % 8
  1650. ble .Lstrmm_kernel_L1_M8_100
  1651. .Lstrmm_kernel_L1_M8_42: // one KERNEL8x1_SUB per leftover count
  1652. KERNEL8x1_SUB
  1653. subs counterL, counterL, #1
  1654. bgt .Lstrmm_kernel_L1_M8_42
  1655. .Lstrmm_kernel_L1_M8_100:
  1656. SAVE8x1
  1657. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1658. sub tempK, origK, tempOffset
  1659. #if defined(LEFT)
  1660. sub tempK, tempK, #8
  1661. #else
  1662. sub tempK, tempK, #1
  1663. #endif
  1664. lsl temp, tempK, #5 // temp = tempK * 32
  1665. add pA, pA, temp
  1666. lsl temp, tempK, #2 // temp = tempK * 4
  1667. add pB, pB, temp
  1668. #endif
  1669. #if defined(LEFT)
  1670. add tempOffset, tempOffset, #8
  1671. #endif
  1672. .Lstrmm_kernel_L1_M8_END:
  1673. //------------------------------------------------------------------------------
  1674. .Lstrmm_kernel_L1_M4_BEGIN: // runs the 4x1 macros once when bit 2 of origM is set
  1675. mov counterI, origM
  1676. tst counterI , #7 // any of the low three bits of M set?
  1677. ble .Lstrmm_kernel_L1_END // no: after tst, ble is taken only on a zero result
  1678. tst counterI, #4
  1679. ble .Lstrmm_kernel_L1_M2_BEGIN // bit 2 clear: skip this pass
  1680. .Lstrmm_kernel_L1_M4_20:
  1681. INIT4x1
  1682. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1683. mov pB, origPB
  1684. #else
  1685. mov pB, origPB
  1686. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1687. add pB, pB, temp
  1688. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1689. add pA, pA, temp
  1690. #endif
  1691. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1692. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1693. #elif defined(LEFT)
  1694. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1695. #else
  1696. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1697. #endif
  1698. asr counterL , tempK, #3 // counterL = tempK / 8
  1699. cmp counterL , #0
  1700. ble .Lstrmm_kernel_L1_M4_40
  1701. .align 5
  1702. .Lstrmm_kernel_L1_M4_22: // eight KERNEL4x1_SUB invocations per iteration
  1703. KERNEL4x1_SUB
  1704. KERNEL4x1_SUB
  1705. KERNEL4x1_SUB
  1706. KERNEL4x1_SUB
  1707. KERNEL4x1_SUB
  1708. KERNEL4x1_SUB
  1709. KERNEL4x1_SUB
  1710. KERNEL4x1_SUB
  1711. subs counterL, counterL, #1
  1712. bgt .Lstrmm_kernel_L1_M4_22
  1713. .Lstrmm_kernel_L1_M4_40:
  1714. ands counterL , tempK, #7 // counterL = tempK % 8
  1715. ble .Lstrmm_kernel_L1_M4_100
  1716. .Lstrmm_kernel_L1_M4_42: // one KERNEL4x1_SUB per leftover count
  1717. KERNEL4x1_SUB
  1718. subs counterL, counterL, #1
  1719. bgt .Lstrmm_kernel_L1_M4_42
  1720. .Lstrmm_kernel_L1_M4_100:
  1721. SAVE4x1
  1722. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1723. sub tempK, origK, tempOffset
  1724. #if defined(LEFT)
  1725. sub tempK, tempK, #4
  1726. #else
  1727. sub tempK, tempK, #1
  1728. #endif
  1729. lsl temp, tempK, #4 // temp = tempK * 16
  1730. add pA, pA, temp
  1731. lsl temp, tempK, #2 // temp = tempK * 4
  1732. add pB, pB, temp
  1733. #endif
  1734. #if defined(LEFT)
  1735. add tempOffset, tempOffset, #4
  1736. #endif
  1737. .Lstrmm_kernel_L1_M4_END:
  1738. //------------------------------------------------------------------------------
  1739. .Lstrmm_kernel_L1_M2_BEGIN: // runs the 2x1 macros once when bit 1 of origM is set
  1740. mov counterI, origM
  1741. tst counterI , #3 // either of the low two bits of M set?
  1742. ble .Lstrmm_kernel_L1_END // no: after tst, ble is taken only on a zero result
  1743. tst counterI, #2 // test bit 1 of counterI (counterI is not modified)
  1744. ble .Lstrmm_kernel_L1_M1_BEGIN // bit 1 clear: skip this pass
  1745. .Lstrmm_kernel_L1_M2_20:
  1746. INIT2x1
  1747. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1748. mov pB, origPB
  1749. #else
  1750. mov pB, origPB
  1751. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1752. add pB, pB, temp
  1753. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1754. add pA, pA, temp
  1755. #endif
  1756. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1757. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1758. #elif defined(LEFT)
  1759. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1760. #else
  1761. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1762. #endif
  1763. asr counterL , tempK, #3 // counterL = tempK / 8
  1764. cmp counterL , #0
  1765. ble .Lstrmm_kernel_L1_M2_40
  1766. .Lstrmm_kernel_L1_M2_22: // eight KERNEL2x1_SUB invocations per iteration
  1767. KERNEL2x1_SUB
  1768. KERNEL2x1_SUB
  1769. KERNEL2x1_SUB
  1770. KERNEL2x1_SUB
  1771. KERNEL2x1_SUB
  1772. KERNEL2x1_SUB
  1773. KERNEL2x1_SUB
  1774. KERNEL2x1_SUB
  1775. subs counterL, counterL, #1
  1776. bgt .Lstrmm_kernel_L1_M2_22
  1777. .Lstrmm_kernel_L1_M2_40:
  1778. ands counterL , tempK, #7 // counterL = tempK % 8
  1779. ble .Lstrmm_kernel_L1_M2_100
  1780. .Lstrmm_kernel_L1_M2_42: // one KERNEL2x1_SUB per leftover count
  1781. KERNEL2x1_SUB
  1782. subs counterL, counterL, #1
  1783. bgt .Lstrmm_kernel_L1_M2_42
  1784. .Lstrmm_kernel_L1_M2_100:
  1785. SAVE2x1
  1786. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1787. sub tempK, origK, tempOffset
  1788. #if defined(LEFT)
  1789. sub tempK, tempK, #2
  1790. #else
  1791. sub tempK, tempK, #1
  1792. #endif
  1793. lsl temp, tempK, #3 // temp = tempK * 8
  1794. add pA, pA, temp
  1795. lsl temp, tempK, #2 // temp = tempK * 4
  1796. add pB, pB, temp
  1797. #endif
  1798. #if defined(LEFT)
  1799. add tempOffset, tempOffset, #2
  1800. #endif
  1801. .Lstrmm_kernel_L1_M2_END:
  1802. .Lstrmm_kernel_L1_M1_BEGIN: // runs the 1x1 macros once when bit 0 of counterI is set
  1803. tst counterI, #1 // test bit 0 of counterI (loaded from origM at .Lstrmm_kernel_L1_M2_BEGIN)
  1804. ble .Lstrmm_kernel_L1_END // bit 0 clear: after tst, ble is taken only on a zero result
  1805. .Lstrmm_kernel_L1_M1_20:
  1806. INIT1x1
  1807. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1808. mov pB, origPB
  1809. #else
  1810. mov pB, origPB
  1811. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1812. add pB, pB, temp
  1813. lsl temp, tempOffset, #2 // same scale for pA
  1814. add pA, pA, temp
  1815. #endif
  1816. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1817. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1818. #elif defined(LEFT)
  1819. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1820. #else
  1821. add tempK, tempOffset, #1 // same value as the LEFT branch
  1822. #endif
  1823. asr counterL , tempK, #3 // counterL = tempK / 8
  1824. cmp counterL , #0
  1825. ble .Lstrmm_kernel_L1_M1_40
  1826. .Lstrmm_kernel_L1_M1_22: // eight KERNEL1x1_SUB invocations per iteration
  1827. KERNEL1x1_SUB
  1828. KERNEL1x1_SUB
  1829. KERNEL1x1_SUB
  1830. KERNEL1x1_SUB
  1831. KERNEL1x1_SUB
  1832. KERNEL1x1_SUB
  1833. KERNEL1x1_SUB
  1834. KERNEL1x1_SUB
  1835. subs counterL, counterL, #1
  1836. bgt .Lstrmm_kernel_L1_M1_22
  1837. .Lstrmm_kernel_L1_M1_40:
  1838. ands counterL , tempK, #7 // counterL = tempK % 8
  1839. ble .Lstrmm_kernel_L1_M1_100
  1840. .Lstrmm_kernel_L1_M1_42: // one KERNEL1x1_SUB per leftover count
  1841. KERNEL1x1_SUB
  1842. subs counterL, counterL, #1
  1843. bgt .Lstrmm_kernel_L1_M1_42
  1844. .Lstrmm_kernel_L1_M1_100:
  1845. SAVE1x1 // no pA/pB/tempOffset update follows: the exit path comes next
  1846. .Lstrmm_kernel_L1_END:
  1847. .Lstrmm_kernel_L999: // common exit: reload registers from the stack frame and return 0
  1848. mov x0, #0 // set return value
  1849. ldp d8, d9, [sp, #(0 * 16)] // each pair is reloaded from its own 16-byte slot above sp
  1850. ldp d10, d11, [sp, #(1 * 16)]
  1851. ldp d12, d13, [sp, #(2 * 16)]
  1852. ldp d14, d15, [sp, #(3 * 16)]
  1853. ldp d16, d17, [sp, #(4 * 16)] // NOTE(review): presumably mirrors a store in the prologue (not visible here) - confirm
  1854. ldp x18, x19, [sp, #(5 * 16)] // NOTE(review): x18 is platform-reserved on some AArch64 OSes - confirm targets
  1855. ldp x20, x21, [sp, #(6 * 16)]
  1856. ldp x22, x23, [sp, #(7 * 16)]
  1857. ldp x24, x25, [sp, #(8 * 16)]
  1858. ldp x26, x27, [sp, #(9 * 16)]
  1859. ldr x28, [sp, #(10 * 16)] // last slot holds a single register
  1860. add sp, sp, #(11*16) // release the 176-byte frame
  1861. ret
  1862. EPILOGUE