You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strmm_kernel_16x4.S 47 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases for
   *
   *   int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
   *             FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc, BLASLONG offset)
   *
   * Incoming argument registers, in parameter order:
   *   x0 (bm), x1 (bn), x2 (bk), s0 (alpha), x3 (ba), x4 (bb),
   *   x5 (C), x6 (ldc), x7 (offset)
   */

  /* Incoming arguments. */
  #define origM       x0
  #define origN       x1
  #define origK       x2
  #define origPA      x3
  #define origPB      x4
  #define pC          x5
  #define LDC         x6
  #define offset      x7

  /* Loop counters and working pointers. */
  #define counterL    x8
  #define counterI    x9
  #define counterJ    x10
  #define pB          x11
  #define pCRow0      x12
  #define pCRow1      x13
  #define pCRow2      x14
  #define pA          x15
  #define temp        x16
  #define tempOffset  x17

  /*
   * tempK lives in x19, not x18: x18 is the AAPCS64 platform register and is
   * reserved on several operating systems, so it must not be used as scratch.
   * x19 is callee-saved; the prologue of this file spills it together with x18.
   * NOTE(review): confirm the epilogue reloads the x18/x19 pair.
   */
  #define tempK       x19

  /* Four copies of alpha, as scalar registers and as lane 0 of the vectors. */
  #define alpha0      s10
  #define alphaV0     v10.s[0]
  #define alpha1      s11
  #define alphaV1     v11.s[0]
  #define alpha2      s14
  #define alphaV2     v14.s[0]
  #define alpha3      s15
  #define alphaV3     v15.s[0]

  // General-purpose register map
  // 00 origM
  // 01 origN
  // 02 origK
  // 03 origPA
  // 04 origPB
  // 05 pC
  // 06 origLDC -> LDC
  // 07 offset
  // 08 counterL
  // 09 counterI
  // 10 counterJ
  // 11 pB
  // 12 pCRow0
  // 13 pCRow1
  // 14 pCRow2
  // 15 pA
  // 16 temp
  // 17 tempOffset
  // 18 platform register, not used as scratch (still spilled by the prologue)
  // 19 must save tempK
  // 20 must save
  // 21 must save
  // 22 must save
  // 23 must save
  // 24 must save
  // 25 must save
  // 26 must save
  // 27 must save
  // 28 must save
  // 29 frame
  // 30 link
  // 31 sp

  // SIMD register map
  //v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
  //v01 pA0_04, pA0_05, pA0_06, pA0_07
  //v02 pA0_08, pA0_09, pA0_10, pA0_11
  //v03 pA0_12, pA0_13, pA0_14, pA0_15
  //v04 pA1_00, pA1_01, pA1_02, pA1_03
  //v05 pA1_04, pA1_05, pA1_06, pA1_07
  //v06 pA1_08, pA1_09, pA1_10, pA1_11
  //v07 pA1_12, pA1_13, pA1_14, pA1_15
  //v08 must save pB00, pB01
  //v09 must save pB02, pB03
  //v10 must save ALPHA0
  //v11 must save ALPHA1
  //v12 must save pB10, pB11
  //v13 must save pB12, pB13
  //v14 must save ALPHA2
  //v15 must save ALPHA3
  //v16 C00, C01, C02, C03 (caller-saved under AAPCS64; the prologue spills d16 anyway)
  //v17 C04, C05, C06, C07 (caller-saved under AAPCS64; the prologue spills d17 anyway)
  //v18 C08, C09, C10, C11
  //v19 C12, C13, C14, C15
  //v20 C16, C17, C18, C19
  //v21 C20, C21, C22, C23
  //v22 C24, C25, C26, C27
  //v23 C28, C29, C30, C31
  //v24 C32, C33, C34, C35
  //v25 C36, C37, C38, C39
  //v26 C40, C41, C42, C43
  //v27 C44, C45, C46, C47
  //v28 C48, C49, C50, C51
  //v29 C52, C53, C54, C55
  //v30 C56, C57, C58, C59
  //v31 C60, C61, C62, C63
  122. /*******************************************************************************
  123. * Macro definitions
  124. *******************************************************************************/
  // ---------------------------------------------------------------------------
  // 16x4 micro-tile
  //
  // Accumulators: v16-v19, v20-v23, v24-v27 and v28-v31, one group of four
  // registers (16 floats) per B value.
  // Operand set 0: A in v0-v3,  B in v8/v9.
  // Operand set 1: A in v4-v7,  B in v12/v13.
  // ---------------------------------------------------------------------------

  // Clear every accumulator of the tile (all 128 bits of each register).
  .macro INIT16x4
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v18.4s, #0
      movi    v19.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
      movi    v22.4s, #0
      movi    v23.4s, #0
      movi    v24.4s, #0
      movi    v25.4s, #0
      movi    v26.4s, #0
      movi    v27.4s, #0
      movi    v28.4s, #0
      movi    v29.4s, #0
      movi    v30.4s, #0
      movi    v31.4s, #0
  .endm

  // Helper: fetch operand set 0 (4 floats of B, 16 floats of A) and advance
  // pB by 16 bytes and pA by 64 bytes.
  .macro LOAD16x4_SET0
      ld1     {v8.2s, v9.2s}, [pB], #16
      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64
  .endm

  // Helper: fetch operand set 1 (4 floats of B, 16 floats of A) and advance
  // pB by 16 bytes and pA by 64 bytes.
  .macro LOAD16x4_SET1
      ld1     {v12.2s, v13.2s}, [pB], #16
      ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pA], #64
  .endm

  // Helper: apply \op (fmul or fmla) to all sixteen accumulators, using
  // operand set 0.
  .macro CALC16x4_SET0 op
      \op     v16.4s, v0.4s, v8.s[0]
      \op     v17.4s, v1.4s, v8.s[0]
      \op     v18.4s, v2.4s, v8.s[0]
      \op     v19.4s, v3.4s, v8.s[0]
      \op     v20.4s, v0.4s, v8.s[1]
      \op     v21.4s, v1.4s, v8.s[1]
      \op     v22.4s, v2.4s, v8.s[1]
      \op     v23.4s, v3.4s, v8.s[1]
      \op     v24.4s, v0.4s, v9.s[0]
      \op     v25.4s, v1.4s, v9.s[0]
      \op     v26.4s, v2.4s, v9.s[0]
      \op     v27.4s, v3.4s, v9.s[0]
      \op     v28.4s, v0.4s, v9.s[1]
      \op     v29.4s, v1.4s, v9.s[1]
      \op     v30.4s, v2.4s, v9.s[1]
      \op     v31.4s, v3.4s, v9.s[1]
  .endm

  // Helper: apply \op (fmul or fmla) to all sixteen accumulators, using
  // operand set 1.
  .macro CALC16x4_SET1 op
      \op     v16.4s, v4.4s, v12.s[0]
      \op     v17.4s, v5.4s, v12.s[0]
      \op     v18.4s, v6.4s, v12.s[0]
      \op     v19.4s, v7.4s, v12.s[0]
      \op     v20.4s, v4.4s, v12.s[1]
      \op     v21.4s, v5.4s, v12.s[1]
      \op     v22.4s, v6.4s, v12.s[1]
      \op     v23.4s, v7.4s, v12.s[1]
      \op     v24.4s, v4.4s, v13.s[0]
      \op     v25.4s, v5.4s, v13.s[0]
      \op     v26.4s, v6.4s, v13.s[0]
      \op     v27.4s, v7.4s, v13.s[0]
      \op     v28.4s, v4.4s, v13.s[1]
      \op     v29.4s, v5.4s, v13.s[1]
      \op     v30.4s, v6.4s, v13.s[1]
      \op     v31.4s, v7.4s, v13.s[1]
  .endm

  // First step of the pipelined loop: load set 0, overwrite the accumulators
  // with its products (no prior INIT needed), then preload set 1.
  .macro KERNEL16x4_I
      LOAD16x4_SET0
      CALC16x4_SET0 fmul
      LOAD16x4_SET1
  .endm

  // Steady state: accumulate set 0 (already loaded) and preload set 1.
  .macro KERNEL16x4_M1
      CALC16x4_SET0 fmla
      LOAD16x4_SET1
  .endm

  // Steady state: accumulate set 1 (already loaded) and preload set 0.
  .macro KERNEL16x4_M2
      CALC16x4_SET1 fmla
      LOAD16x4_SET0
  .endm

  // Last step of the pipelined loop: accumulate set 1, load nothing.
  .macro KERNEL16x4_E
      CALC16x4_SET1 fmla
  .endm

  // Stand-alone step: load set 0 and accumulate it immediately.
  .macro KERNEL16x4_SUB
      LOAD16x4_SET0
      CALC16x4_SET0 fmla
  .endm

  // Scale the tile by alpha and store it as four lines of 16 floats, LDC bytes
  // apart, starting at pCRow0. pCRow0 ends up 64 bytes further on.
  // Clobbers v0-v7, pCRow1 and pCRow2.
  .macro SAVE16x4
      add     pCRow1, pCRow0, LDC             // line 1
      add     pCRow2, pCRow1, LDC             // line 2

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV1
      fmul    v2.4s, v18.4s, alphaV2
      fmul    v3.4s, v19.4s, alphaV3
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64

      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV1
      fmul    v6.4s, v22.4s, alphaV2
      fmul    v7.4s, v23.4s, alphaV3
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]

      add     pCRow1, pCRow2, LDC             // pCRow1 is reused for line 3

      fmul    v0.4s, v24.4s, alphaV0
      fmul    v1.4s, v25.4s, alphaV1
      fmul    v2.4s, v26.4s, alphaV2
      fmul    v3.4s, v27.4s, alphaV3
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]

      fmul    v4.4s, v28.4s, alphaV0
      fmul    v5.4s, v29.4s, alphaV1
      fmul    v6.4s, v30.4s, alphaV2
      fmul    v7.4s, v31.4s, alphaV3
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  .endm
  309. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 8x4 micro-tile
  //
  // Accumulators: v16/v17, v20/v21, v24/v25, v28/v29 (8 floats per B value).
  // Operand set 0: A in v0/v1,  B in v8/v9.
  // Operand set 1: A in v4/v5,  B in v12/v13.
  // ---------------------------------------------------------------------------

  // Clear the eight accumulators of the tile.
  .macro INIT8x4
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
      movi    v24.4s, #0
      movi    v25.4s, #0
      movi    v28.4s, #0
      movi    v29.4s, #0
  .endm

  // Helper: fetch operand set 0; pB advances 16 bytes, pA advances 32 bytes.
  .macro LOAD8x4_SET0
      ld1     {v8.2s, v9.2s}, [pB], #16
      ld1     {v0.4s, v1.4s}, [pA], #32
  .endm

  // Helper: fetch operand set 1; pB advances 16 bytes, pA advances 32 bytes.
  .macro LOAD8x4_SET1
      ld1     {v12.2s, v13.2s}, [pB], #16
      ld1     {v4.4s, v5.4s}, [pA], #32
  .endm

  // Helper: apply \op (fmul or fmla) to all accumulators with operand set 0.
  .macro CALC8x4_SET0 op
      \op     v16.4s, v0.4s, v8.s[0]
      \op     v17.4s, v1.4s, v8.s[0]
      \op     v20.4s, v0.4s, v8.s[1]
      \op     v21.4s, v1.4s, v8.s[1]
      \op     v24.4s, v0.4s, v9.s[0]
      \op     v25.4s, v1.4s, v9.s[0]
      \op     v28.4s, v0.4s, v9.s[1]
      \op     v29.4s, v1.4s, v9.s[1]
  .endm

  // Helper: apply \op (fmul or fmla) to all accumulators with operand set 1.
  .macro CALC8x4_SET1 op
      \op     v16.4s, v4.4s, v12.s[0]
      \op     v17.4s, v5.4s, v12.s[0]
      \op     v20.4s, v4.4s, v12.s[1]
      \op     v21.4s, v5.4s, v12.s[1]
      \op     v24.4s, v4.4s, v13.s[0]
      \op     v25.4s, v5.4s, v13.s[0]
      \op     v28.4s, v4.4s, v13.s[1]
      \op     v29.4s, v5.4s, v13.s[1]
  .endm

  // First pipelined step: load set 0, overwrite the accumulators with its
  // products, then preload set 1.
  .macro KERNEL8x4_I
      LOAD8x4_SET0
      CALC8x4_SET0 fmul
      LOAD8x4_SET1
  .endm

  // Steady state: accumulate set 0 and preload set 1.
  .macro KERNEL8x4_M1
      CALC8x4_SET0 fmla
      LOAD8x4_SET1
  .endm

  // Steady state: accumulate set 1 and preload set 0.
  .macro KERNEL8x4_M2
      CALC8x4_SET1 fmla
      LOAD8x4_SET0
  .endm

  // Last pipelined step: accumulate set 1, load nothing.
  .macro KERNEL8x4_E
      CALC8x4_SET1 fmla
  .endm

  // Stand-alone step: load set 0 and accumulate it immediately.
  .macro KERNEL8x4_SUB
      LOAD8x4_SET0
      CALC8x4_SET0 fmla
  .endm

  // Scale the tile by alpha and store four lines of 8 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 32 bytes further on.
  // Clobbers v0, v1, v4, v5, pCRow1 and pCRow2.
  .macro SAVE8x4
      add     pCRow1, pCRow0, LDC             // line 1
      add     pCRow2, pCRow1, LDC             // line 2

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV1
      st1     {v0.4s, v1.4s}, [pCRow0], #32

      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV1
      st1     {v4.4s, v5.4s}, [pCRow1]

      add     pCRow1, pCRow2, LDC             // pCRow1 is reused for line 3

      fmul    v0.4s, v24.4s, alphaV0
      fmul    v1.4s, v25.4s, alphaV1
      st1     {v0.4s, v1.4s}, [pCRow2]

      fmul    v4.4s, v28.4s, alphaV0
      fmul    v5.4s, v29.4s, alphaV1
      st1     {v4.4s, v5.4s}, [pCRow1]
  .endm
  418. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 4x4 micro-tile (64-bit vector halves, two floats per register)
  //
  // Accumulators: v16/v17, v20/v21, v24/v25, v28/v29 (4 floats per B value).
  // Operand set 0: A in v0/v1,  B in v8/v9.
  // Operand set 1: A in v4/v5,  B in v12/v13.
  // ---------------------------------------------------------------------------

  // Clear the eight accumulators of the tile.
  .macro INIT4x4
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
      movi    v24.4s, #0
      movi    v25.4s, #0
      movi    v28.4s, #0
      movi    v29.4s, #0
  .endm

  // Helper: fetch operand set 0; pB and pA each advance 16 bytes.
  .macro LOAD4x4_SET0
      ld1     {v8.2s, v9.2s}, [pB], #16
      ld1     {v0.2s, v1.2s}, [pA], #16
  .endm

  // Helper: fetch operand set 1; pB and pA each advance 16 bytes.
  .macro LOAD4x4_SET1
      ld1     {v12.2s, v13.2s}, [pB], #16
      ld1     {v4.2s, v5.2s}, [pA], #16
  .endm

  // Helper: apply \op (fmul or fmla) to all accumulators with operand set 0.
  .macro CALC4x4_SET0 op
      \op     v16.2s, v0.2s, v8.s[0]
      \op     v17.2s, v1.2s, v8.s[0]
      \op     v20.2s, v0.2s, v8.s[1]
      \op     v21.2s, v1.2s, v8.s[1]
      \op     v24.2s, v0.2s, v9.s[0]
      \op     v25.2s, v1.2s, v9.s[0]
      \op     v28.2s, v0.2s, v9.s[1]
      \op     v29.2s, v1.2s, v9.s[1]
  .endm

  // Helper: apply \op (fmul or fmla) to all accumulators with operand set 1.
  .macro CALC4x4_SET1 op
      \op     v16.2s, v4.2s, v12.s[0]
      \op     v17.2s, v5.2s, v12.s[0]
      \op     v20.2s, v4.2s, v12.s[1]
      \op     v21.2s, v5.2s, v12.s[1]
      \op     v24.2s, v4.2s, v13.s[0]
      \op     v25.2s, v5.2s, v13.s[0]
      \op     v28.2s, v4.2s, v13.s[1]
      \op     v29.2s, v5.2s, v13.s[1]
  .endm

  // First pipelined step: load set 0, overwrite the accumulators with its
  // products, then preload set 1.
  .macro KERNEL4x4_I
      LOAD4x4_SET0
      CALC4x4_SET0 fmul
      LOAD4x4_SET1
  .endm

  // Steady state: accumulate set 0, preload set 1 and prefetch B 512 bytes
  // past the advanced pB.
  .macro KERNEL4x4_M1
      CALC4x4_SET0 fmla
      LOAD4x4_SET1
      prfm    PLDL1KEEP, [pB, #512]
  .endm

  // Steady state: accumulate set 1, preload set 0 and prefetch A 512 bytes
  // past the advanced pA.
  .macro KERNEL4x4_M2
      CALC4x4_SET1 fmla
      LOAD4x4_SET0
      prfm    PLDL1KEEP, [pA, #512]
  .endm

  // Last pipelined step: accumulate set 1, load nothing.
  .macro KERNEL4x4_E
      CALC4x4_SET1 fmla
  .endm

  // Stand-alone step: load set 0 and accumulate it immediately.
  .macro KERNEL4x4_SUB
      LOAD4x4_SET0
      CALC4x4_SET0 fmla
  .endm

  // Scale the tile by alpha and store four lines of 4 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 16 bytes further on.
  // Clobbers v8, v9, v12, v13, pCRow1 and pCRow2.
  .macro SAVE4x4
      add     pCRow1, pCRow0, LDC             // line 1
      add     pCRow2, pCRow1, LDC             // line 2

      fmul    v8.2s, v16.2s, alphaV0
      fmul    v9.2s, v17.2s, alphaV1
      st1     {v8.2s, v9.2s}, [pCRow0], #16

      fmul    v12.2s, v20.2s, alphaV2
      fmul    v13.2s, v21.2s, alphaV3
      st1     {v12.2s, v13.2s}, [pCRow1]

      add     pCRow1, pCRow2, LDC             // pCRow1 is reused for line 3

      fmul    v8.2s, v24.2s, alphaV0
      fmul    v9.2s, v25.2s, alphaV1
      st1     {v8.2s, v9.2s}, [pCRow2]

      fmul    v12.2s, v28.2s, alphaV2
      fmul    v13.2s, v29.2s, alphaV3
      st1     {v12.2s, v13.2s}, [pCRow1]
  .endm
  519. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 2x4 micro-tile: accumulators v16, v20, v24, v28 (two floats each).
  // ---------------------------------------------------------------------------

  // Clear the four accumulators of the tile.
  .macro INIT2x4
      movi    v16.4s, #0
      movi    v20.4s, #0
      movi    v24.4s, #0
      movi    v28.4s, #0
  .endm

  // One step: load 4 floats of B (v8/v9) and 2 floats of A (v0), advance pB by
  // 16 and pA by 8 bytes, and accumulate A times each B lane.
  .macro KERNEL2x4_SUB
      ld1     {v8.2s, v9.2s}, [pB], #16
      ld1     {v0.2s}, [pA], #8

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v20.2s, v0.2s, v8.s[1]
      fmla    v24.2s, v0.2s, v9.s[0]
      fmla    v28.2s, v0.2s, v9.s[1]
  .endm

  // Scale the tile by alpha and store four lines of 2 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 8 bytes further on.
  // Clobbers v8, v12, pCRow1 and pCRow2.
  .macro SAVE2x4
      add     pCRow1, pCRow0, LDC             // line 1
      add     pCRow2, pCRow1, LDC             // line 2

      fmul    v8.2s, v16.2s, alphaV0
      fmul    v12.2s, v20.2s, alphaV1
      st1     {v8.2s}, [pCRow0], #8
      st1     {v12.2s}, [pCRow1]

      add     pCRow1, pCRow2, LDC             // pCRow1 is reused for line 3

      fmul    v8.2s, v24.2s, alphaV2
      fmul    v12.2s, v28.2s, alphaV3
      st1     {v8.2s}, [pCRow2]
      st1     {v12.2s}, [pCRow1]
  .endm
  550. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 1x4 micro-tile: v16 holds the results for B lanes 0/1, v20 for lanes 2/3.
  // ---------------------------------------------------------------------------

  // Clear both accumulators of the tile.
  .macro INIT1x4
      movi    v16.4s, #0
      movi    v20.4s, #0
  .endm

  // One step: load 1 float of A (s0) and 4 floats of B (v8/v9), advance pA by
  // 4 and pB by 16 bytes, and accumulate B scaled by the A value.
  .macro KERNEL1x4_SUB
      ldr     s0, [pA], #4
      ld1     {v8.2s, v9.2s}, [pB], #16

      fmla    v16.2s, v8.2s, v0.s[0]
      fmla    v20.2s, v9.2s, v0.s[0]
  .endm

  // Scale by alpha and scatter the four results, one float per line, to four
  // lines LDC bytes apart starting at pCRow0. pCRow0 ends up 4 bytes further on.
  // Clobbers v8, v12, pCRow1 and pCRow2.
  .macro SAVE1x4
      fmul    v8.2s, v16.2s, alphaV0
      fmul    v12.2s, v20.2s, alphaV1

      add     pCRow1, pCRow0, LDC             // line 1
      st1     {v8.s}[0], [pCRow0], #4
      st1     {v8.s}[1], [pCRow1]

      add     pCRow2, pCRow1, LDC             // line 2
      add     pCRow1, pCRow2, LDC             // pCRow1 is reused for line 3
      st1     {v12.s}[0], [pCRow2]
      st1     {v12.s}[1], [pCRow1]
  .endm
  575. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 16x2 micro-tile: accumulators v16-v19 (B lane 0) and v20-v23 (B lane 1).
  // ---------------------------------------------------------------------------

  // Clear the eight accumulators of the tile.
  .macro INIT16x2
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v18.4s, #0
      movi    v19.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
      movi    v22.4s, #0
      movi    v23.4s, #0
  .endm

  // One step: load 2 floats of B (v8) and 16 floats of A (v0-v3), advance pB
  // by 8 and pA by 64 bytes, and accumulate A times each B lane.
  .macro KERNEL16x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v18.4s, v2.4s, v8.s[0]
      fmla    v19.4s, v3.4s, v8.s[0]

      fmla    v20.4s, v0.4s, v8.s[1]
      fmla    v21.4s, v1.4s, v8.s[1]
      fmla    v22.4s, v2.4s, v8.s[1]
      fmla    v23.4s, v3.4s, v8.s[1]
  .endm

  // Scale the tile by alpha and store two lines of 16 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 64 bytes further on.
  // Clobbers v0-v7 and pCRow1.
  .macro SAVE16x2
      add     pCRow1, pCRow0, LDC             // line 1

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV1
      fmul    v2.4s, v18.4s, alphaV2
      fmul    v3.4s, v19.4s, alphaV3
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64

      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV1
      fmul    v6.4s, v22.4s, alphaV2
      fmul    v7.4s, v23.4s, alphaV3
      st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
  .endm
  620. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 8x2 micro-tile: accumulators v16/v17 (B lane 0) and v20/v21 (B lane 1).
  // ---------------------------------------------------------------------------

  // Clear the four accumulators of the tile.
  .macro INIT8x2
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
  .endm

  // One step: load 2 floats of B (v8) and 8 floats of A (v0/v1), advance pB by
  // 8 and pA by 32 bytes, and accumulate A times each B lane.
  .macro KERNEL8x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.4s, v1.4s}, [pA], #32

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v20.4s, v0.4s, v8.s[1]
      fmla    v21.4s, v1.4s, v8.s[1]
  .endm

  // Scale the tile by alpha and store two lines of 8 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 32 bytes further on.
  // Clobbers v0, v1, v4, v5 and pCRow1.
  // Only two lines are written, so no third line pointer is computed; this
  // matches the other two-line SAVE macros, which leave pCRow2 untouched.
  .macro SAVE8x2
      add     pCRow1, pCRow0, LDC             // line 1

      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV1
      st1     {v0.4s, v1.4s}, [pCRow0], #32

      fmul    v4.4s, v20.4s, alphaV0
      fmul    v5.4s, v21.4s, alphaV1
      st1     {v4.4s, v5.4s}, [pCRow1]
  .endm
  650. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 4x2 micro-tile: accumulators v16/v17 (B lane 0) and v20/v21 (B lane 1),
  // two floats per register.
  // ---------------------------------------------------------------------------

  // Clear the four accumulators of the tile.
  .macro INIT4x2
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v20.4s, #0
      movi    v21.4s, #0
  .endm

  // One step: load 2 floats of B (v8) and 4 floats of A (v0/v1), advance pB by
  // 8 and pA by 16 bytes, and accumulate A times each B lane.
  .macro KERNEL4x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.2s, v1.2s}, [pA], #16

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v17.2s, v1.2s, v8.s[0]
      fmla    v20.2s, v0.2s, v8.s[1]
      fmla    v21.2s, v1.2s, v8.s[1]
  .endm

  // Scale the tile by alpha and store two lines of 4 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 16 bytes further on.
  // Clobbers v8, v9, v12, v13 and pCRow1.
  .macro SAVE4x2
      add     pCRow1, pCRow0, LDC             // line 1

      fmul    v8.2s, v16.2s, alphaV0
      fmul    v9.2s, v17.2s, alphaV1
      st1     {v8.2s, v9.2s}, [pCRow0], #16

      fmul    v12.2s, v20.2s, alphaV2
      fmul    v13.2s, v21.2s, alphaV3
      st1     {v12.2s, v13.2s}, [pCRow1]
  .endm
  677. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 2x2 micro-tile: accumulators v16 (B lane 0) and v20 (B lane 1).
  // ---------------------------------------------------------------------------

  // Clear both accumulators of the tile.
  .macro INIT2x2
      movi    v16.4s, #0
      movi    v20.4s, #0
  .endm

  // One step: load 2 floats of B (v8) and 2 floats of A (v0), advance pB and
  // pA by 8 bytes each, and accumulate A times each B lane.
  .macro KERNEL2x2_SUB
      ld1     {v8.2s}, [pB], #8
      ld1     {v0.2s}, [pA], #8

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v20.2s, v0.2s, v8.s[1]
  .endm

  // Scale the tile by alpha and store two lines of 2 floats, LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 8 bytes further on.
  // Clobbers v8, v12 and pCRow1.
  .macro SAVE2x2
      add     pCRow1, pCRow0, LDC             // line 1

      fmul    v8.2s, v16.2s, alphaV0
      fmul    v12.2s, v20.2s, alphaV1
      st1     {v8.2s}, [pCRow0], #8
      st1     {v12.2s}, [pCRow1]
  .endm
  698. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 1x2 micro-tile: v16 holds one result per B lane.
  // ---------------------------------------------------------------------------

  // Clear the accumulator of the tile.
  .macro INIT1x2
      movi    v16.4s, #0
  .endm

  // One step: load 2 floats of B (v8) and 1 float of A (s0), advance pB by 8
  // and pA by 4 bytes, and accumulate B scaled by the A value.
  .macro KERNEL1x2_SUB
      ld1     {v8.2s}, [pB], #8
      ldr     s0, [pA], #4

      fmla    v16.2s, v8.2s, v0.s[0]
  .endm

  // Scale by alpha and store the two results to two lines LDC bytes apart,
  // starting at pCRow0. pCRow0 ends up 4 bytes further on.
  // Clobbers v8 and pCRow1.
  .macro SAVE1x2
      fmul    v8.2s, v16.2s, alphaV0
      add     pCRow1, pCRow0, LDC             // line 1
      st1     {v8.s}[0], [pCRow0], #4
      st1     {v8.s}[1], [pCRow1]
  .endm
  716. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 16x1 micro-tile: accumulators v16-v19 (16 floats).
  // ---------------------------------------------------------------------------

  // Clear the four accumulators of the tile.
  .macro INIT16x1
      movi    v16.4s, #0
      movi    v17.4s, #0
      movi    v18.4s, #0
      movi    v19.4s, #0
  .endm

  // One step: load 1 float of B (s8) and 16 floats of A (v0-v3), advance pB by
  // 4 and pA by 64 bytes, and accumulate A scaled by the B value.
  .macro KERNEL16x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pA], #64

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
      fmla    v18.4s, v2.4s, v8.s[0]
      fmla    v19.4s, v3.4s, v8.s[0]
  .endm

  // Scale the tile by alpha and store 16 floats at pCRow0, which ends up
  // 64 bytes further on. Clobbers v0-v3.
  .macro SAVE16x1
      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV1
      fmul    v2.4s, v18.4s, alphaV2
      fmul    v3.4s, v19.4s, alphaV3
      st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0], #64
  .endm
  747. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 8x1 micro-tile: accumulators v16/v17 (8 floats).
  // ---------------------------------------------------------------------------

  // Clear both accumulators of the tile.
  .macro INIT8x1
      movi    v16.4s, #0
      movi    v17.4s, #0
  .endm

  // One step: load 1 float of B (s8) and 8 floats of A (v0/v1), advance pB by
  // 4 and pA by 32 bytes, and accumulate A scaled by the B value.
  .macro KERNEL8x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.4s, v1.4s}, [pA], #32

      fmla    v16.4s, v0.4s, v8.s[0]
      fmla    v17.4s, v1.4s, v8.s[0]
  .endm

  // Scale the tile by alpha and store 8 floats at pCRow0, which ends up
  // 32 bytes further on. Clobbers v0 and v1.
  .macro SAVE8x1
      fmul    v0.4s, v16.4s, alphaV0
      fmul    v1.4s, v17.4s, alphaV1
      st1     {v0.4s, v1.4s}, [pCRow0], #32
  .endm
  768. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 4x1 micro-tile: accumulators v16/v17, two floats per register.
  // ---------------------------------------------------------------------------

  // Clear both accumulators of the tile.
  .macro INIT4x1
      movi    v16.4s, #0
      movi    v17.4s, #0
  .endm

  // One step: load 1 float of B (s8) and 4 floats of A (v0/v1), advance pB by
  // 4 and pA by 16 bytes, and accumulate A scaled by the B value.
  .macro KERNEL4x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.2s, v1.2s}, [pA], #16

      fmla    v16.2s, v0.2s, v8.s[0]
      fmla    v17.2s, v1.2s, v8.s[0]
  .endm

  // Scale the tile by alpha and store 4 floats at pCRow0, which ends up
  // 16 bytes further on. Clobbers v8 and v9.
  .macro SAVE4x1
      fmul    v8.2s, v16.2s, alphaV0
      fmul    v9.2s, v17.2s, alphaV1
      st1     {v8.2s, v9.2s}, [pCRow0], #16
  .endm
  787. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 2x1 micro-tile: accumulator v16 (two floats).
  // ---------------------------------------------------------------------------

  // Clear the accumulator of the tile.
  .macro INIT2x1
      movi    v16.4s, #0
  .endm

  // One step: load 1 float of B (s8) and 2 floats of A (v0), advance pB by 4
  // and pA by 8 bytes, and accumulate A scaled by the B value.
  .macro KERNEL2x1_SUB
      ldr     s8, [pB], #4
      ld1     {v0.2s}, [pA], #8

      fmla    v16.2s, v0.2s, v8.s[0]
  .endm

  // Scale the tile by alpha and store 2 floats at pCRow0, which ends up
  // 8 bytes further on. Clobbers v8.
  .macro SAVE2x1
      fmul    v8.2s, v16.2s, alphaV0
      st1     {v8.2s}, [pCRow0], #8
  .endm
  803. /******************************************************************************/
  // ---------------------------------------------------------------------------
  // 1x1 micro-tile: scalar accumulator s16.
  // ---------------------------------------------------------------------------

  // Clear the accumulator (the whole of v16 becomes zero).
  .macro INIT1x1
      movi    v16.4s, #0
  .endm

  // One step: load one float of B (s8) and one of A (s0), advance both
  // pointers by 4 bytes, and do a fused multiply-add into s16.
  .macro KERNEL1x1_SUB
      ldr     s8, [pB], #4
      ldr     s0, [pA], #4

      fmadd   s16, s0, s8, s16
  .endm

  // Scale the result by alpha and store it at pCRow0, which ends up 4 bytes
  // further on. Clobbers s8.
  .macro SAVE1x1
      fmul    s8, s16, alpha0
      str     s8, [pCRow0], #4
  .endm
  819. /*******************************************************************************
  820. * End of macro definitions
  821. *******************************************************************************/
  822. PROLOGUE
  823. strmm_kernel_begin:
  824. .align 5
  825. add sp, sp, #-(11 * 16)
  826. stp d8, d9, [sp, #(0 * 16)]
  827. stp d10, d11, [sp, #(1 * 16)]
  828. stp d12, d13, [sp, #(2 * 16)]
  829. stp d14, d15, [sp, #(3 * 16)]
  830. stp d16, d17, [sp, #(4 * 16)]
  831. stp x18, x19, [sp, #(5 * 16)]
  832. stp x20, x21, [sp, #(6 * 16)]
  833. stp x22, x23, [sp, #(7 * 16)]
  834. stp x24, x25, [sp, #(8 * 16)]
  835. stp x26, x27, [sp, #(9 * 16)]
  836. str x28, [sp, #(10 * 16)]
  837. fmov alpha0, s0
  838. fmov alpha1, s0
  839. fmov alpha2, s0
  840. fmov alpha3, s0
  841. lsl LDC, LDC, #2 // ldc = ldc * 4
  842. #if !defined(LEFT)
  843. neg tempOffset, offset
  844. #endif
  845. mov pB, origPB
  846. mov counterJ, origN
  847. asr counterJ, counterJ, #2 // J = J / 4
  848. cmp counterJ, #0
  849. ble strmm_kernel_L2_BEGIN
  850. /******************************************************************************/
  851. strmm_kernel_L4_BEGIN:
  852. mov pCRow0, pC // pCRow0 = C
  853. add pC, pC, LDC, lsl #2
  854. #if defined(LEFT)
  855. mov tempOffset, offset
  856. #endif
  857. mov pA, origPA // pA = start of A array
  858. strmm_kernel_L4_M16_BEGIN:
  859. mov counterI, origM
  860. asr counterI, counterI, #4 // counterI = counterI / 16
  861. cmp counterI, #0
  862. ble strmm_kernel_L4_M8_BEGIN
  863. strmm_kernel_L4_M16_20:
  864. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  865. mov pB, origPB
  866. #else
  867. mov pB, origPB
  868. lsl temp, tempOffset, #6
  869. add pA, pA, temp
  870. lsl temp, tempOffset, #4
  871. add pB, pB, temp
  872. #endif
  873. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  874. sub tempK, origK, tempOffset
  875. #elif defined(LEFT)
  876. add tempK, tempOffset, #16
  877. #else
  878. add tempK, tempOffset, #4
  879. #endif
  880. asr counterL , tempK, #1 // L = K / 2
  881. cmp counterL , #2 // is there at least 4 to do?
  882. blt strmm_kernel_L4_M16_32
  883. KERNEL16x4_I // do one in the K
  884. KERNEL16x4_M2 // do another in the K
  885. subs counterL, counterL, #2
  886. ble strmm_kernel_L4_M16_22a
  887. .align 5
  888. strmm_kernel_L4_M16_22:
  889. KERNEL16x4_M1
  890. KERNEL16x4_M2
  891. subs counterL, counterL, #1
  892. bgt strmm_kernel_L4_M16_22
  893. strmm_kernel_L4_M16_22a:
  894. KERNEL16x4_M1
  895. KERNEL16x4_E
  896. b strmm_kernel_L4_M16_44
  897. strmm_kernel_L4_M16_32:
  898. tst counterL, #1
  899. ble strmm_kernel_L4_M16_40
  900. KERNEL16x4_I
  901. KERNEL16x4_E
  902. b strmm_kernel_L4_M16_44
  903. strmm_kernel_L4_M16_40:
  904. INIT16x4
  905. strmm_kernel_L4_M16_44:
  906. ands counterL , tempK, #1
  907. ble strmm_kernel_L4_M16_100
  908. strmm_kernel_L4_M16_46:
  909. KERNEL16x4_SUB
  910. strmm_kernel_L4_M16_100:
  911. SAVE16x4
  912. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  913. sub tempK, origK, tempOffset
  914. #if defined(LEFT)
  915. sub tempK, tempK, #16
  916. #else
  917. sub tempK, tempK, #4
  918. #endif
  919. lsl temp, tempK, #6
  920. add pA, pA, temp
  921. lsl temp, tempK, #4
  922. add pB, pB, temp
  923. #endif
  924. #if defined(LEFT)
  925. add tempOffset, tempOffset, #16
  926. #endif
  927. strmm_kernel_L4_M16_END:
  928. subs counterI, counterI, #1
  929. bne strmm_kernel_L4_M16_20
  930. //------------------------------------------------------------------------------
  931. strmm_kernel_L4_M8_BEGIN:
  932. mov counterI, origM
  933. tst counterI , #15
  934. ble strmm_kernel_L4_END
  935. tst counterI, #8
  936. ble strmm_kernel_L4_M4_BEGIN
  937. strmm_kernel_L4_M8_20:
  938. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  939. mov pB, origPB
  940. #else
  941. mov pB, origPB
  942. lsl temp, tempOffset, #5
  943. add pA, pA, temp
  944. lsl temp, tempOffset, #4
  945. add pB, pB, temp
  946. #endif
  947. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  948. sub tempK, origK, tempOffset
  949. #elif defined(LEFT)
  950. add tempK, tempOffset, #8
  951. #else
  952. add tempK, tempOffset, #4
  953. #endif
  954. asr counterL , tempK, #1 // L = K / 2
  955. cmp counterL , #2 // is there at least 4 to do?
  956. blt strmm_kernel_L4_M8_32
  957. KERNEL8x4_I // do one in the K
  958. KERNEL8x4_M2 // do another in the K
  959. subs counterL, counterL, #2
  960. ble strmm_kernel_L4_M8_22a
  961. .align 5
  962. strmm_kernel_L4_M8_22:
  963. KERNEL8x4_M1
  964. KERNEL8x4_M2
  965. subs counterL, counterL, #1
  966. bgt strmm_kernel_L4_M8_22
  967. strmm_kernel_L4_M8_22a:
  968. KERNEL8x4_M1
  969. KERNEL8x4_E
  970. b strmm_kernel_L4_M8_44
  971. strmm_kernel_L4_M8_32:
  972. tst counterL, #1
  973. ble strmm_kernel_L4_M8_40
  974. KERNEL8x4_I
  975. KERNEL8x4_E
  976. b strmm_kernel_L4_M8_44
  977. strmm_kernel_L4_M8_40:
  978. INIT8x4
  979. strmm_kernel_L4_M8_44:
  980. ands counterL , tempK, #1
  981. ble strmm_kernel_L4_M8_100
  982. strmm_kernel_L4_M8_46:
  983. KERNEL8x4_SUB
  984. strmm_kernel_L4_M8_100:
  985. SAVE8x4
  986. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  987. sub tempK, origK, tempOffset
  988. #if defined(LEFT)
  989. sub tempK, tempK, #8
  990. #else
  991. sub tempK, tempK, #4
  992. #endif
  993. lsl temp, tempK, #5
  994. add pA, pA, temp
  995. lsl temp, tempK, #4
  996. add pB, pB, temp
  997. #endif
  998. #if defined(LEFT)
  999. add tempOffset, tempOffset, #8
  1000. #endif
  1001. strmm_kernel_L4_M8_END:
  1002. //------------------------------------------------------------------------------
  // 4-row tile of the current 4-column panel. Runs at most once per panel,
  // only when bit 2 of origM is set. KERNEL4x4_*, INIT4x4 and SAVE4x4 are
  // macros defined earlier in the file (not shown here).
  1003. strmm_kernel_L4_M4_BEGIN:
  1004. mov counterI, origM
  1005. tst counterI , #7 // any rows left over beyond a multiple of 8?
  1006. ble strmm_kernel_L4_END // (origM & 7) == 0: this panel is finished
  1007. tst counterI, #4 // is bit 2 of origM set (a group of 4 rows)?
  1008. ble strmm_kernel_L4_M2_BEGIN // no: go on to the 2-row tile
  1009. strmm_kernel_L4_M4_20:
  // pB always restarts at origPB. In the #else configuration both pointers
  // are additionally advanced by tempOffset * 16 (presumably tempOffset
  // k-steps of 4 four-byte elements each -- TODO confirm against packing).
  1010. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1011. mov pB, origPB
  1012. #else
  1013. mov pB, origPB
  1014. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1015. add pB, pB, temp
  1016. add pA, pA, temp
  1017. #endif
  // tempK = number of k-steps this tile executes. The #elif and #else arms
  // are identical here because the tile is 4 wide in both M and N.
  1018. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1019. sub tempK, origK, tempOffset
  1020. #elif defined(LEFT)
  1021. add tempK, tempOffset, #4
  1022. #else
  1023. add tempK, tempOffset, #4
  1024. #endif
  1025. asr counterL , tempK, #1 // counterL = tempK / 2 (pairs of k-steps)
  1026. cmp counterL , #2 // is there at least 4 to do?
  1027. blt strmm_kernel_L4_M4_32 // fewer than 2 pairs: short path
  1028. KERNEL4x4_I // do one in the K
  1029. KERNEL4x4_M2 // do another in the K
  1030. subs counterL, counterL, #2 // account for this pair and the closing pair at _22a
  1031. ble strmm_kernel_L4_M4_22a // exactly 2 pairs: skip the steady-state loop
  1032. .align 5
  // Steady-state loop: one M1/M2 pair per iteration.
  1033. strmm_kernel_L4_M4_22:
  1034. KERNEL4x4_M1
  1035. KERNEL4x4_M2
  1036. subs counterL, counterL, #1
  1037. bgt strmm_kernel_L4_M4_22
  // Closing pair of the long path.
  1038. strmm_kernel_L4_M4_22a:
  1039. KERNEL4x4_M1
  1040. KERNEL4x4_E
  1041. b strmm_kernel_L4_M4_44
  // Short path: counterL < 2 here, so at most one pair remains.
  1042. strmm_kernel_L4_M4_32:
  1043. tst counterL, #1 // is there exactly one pair?
  1044. ble strmm_kernel_L4_M4_40 // no pair: only initialise via INIT4x4
  1045. KERNEL4x4_I
  1046. KERNEL4x4_E
  1047. b strmm_kernel_L4_M4_44
  1048. strmm_kernel_L4_M4_40:
  1049. INIT4x4
  1050. strmm_kernel_L4_M4_44:
  1051. ands counterL , tempK, #1 // counterL = tempK % 2 (odd k-step left?)
  1052. ble strmm_kernel_L4_M4_100 // tempK even: nothing more to compute
  1053. strmm_kernel_L4_M4_46:
  1054. KERNEL4x4_SUB
  1055. strmm_kernel_L4_M4_100:
  1056. SAVE4x4
  // In these configurations advance pA and pB by (origK - tempOffset - 4) * 16
  // after the tile (presumably skipping the part of the packed panels this
  // tile did not read -- TODO confirm).
  1057. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1058. sub tempK, origK, tempOffset
  1059. #if defined(LEFT)
  1060. sub tempK, tempK, #4
  1061. #else
  1062. sub tempK, tempK, #4
  1063. #endif
  1064. lsl temp, tempK, #4 // temp = tempK * 16, used for both pA and pB
  1065. add pA, pA, temp
  1066. add pB, pB, temp
  1067. #endif
  1068. #if defined(LEFT)
  1069. add tempOffset, tempOffset, #4 // 4 more rows have been processed
  1070. #endif
  1071. strmm_kernel_L4_M4_END:
  1072. //------------------------------------------------------------------------------
  // 2-row tile of the current 4-column panel. Runs at most once per panel,
  // only when bit 1 of origM is set. INIT2x4, KERNEL2x4_SUB and SAVE2x4 are
  // macros defined earlier in the file (not shown here).
  1073. strmm_kernel_L4_M2_BEGIN:
  1074. mov counterI, origM
  1075. tst counterI , #3 // any rows left over beyond a multiple of 4?
  1076. ble strmm_kernel_L4_END // (origM & 3) == 0: this panel is finished
  1077. tst counterI, #2 // is bit 1 of origM set (a group of 2 rows)?
  1078. ble strmm_kernel_L4_M1_BEGIN // no: go on to the 1-row tile
  1079. strmm_kernel_L4_M2_20:
  1080. INIT2x4
  // pB always restarts at origPB. In the #else configuration pA is advanced
  // by tempOffset * 8 and pB by tempOffset * 16.
  1081. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1082. mov pB, origPB
  1083. #else
  1084. mov pB, origPB
  1085. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1086. add pA, pA, temp
  1087. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1088. add pB, pB, temp
  1089. #endif
  // tempK = number of KERNEL2x4_SUB invocations for this tile.
  1090. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1091. sub tempK, origK, tempOffset
  1092. #elif defined(LEFT)
  1093. add tempK, tempOffset, #2
  1094. #else
  1095. add tempK, tempOffset, #4
  1096. #endif
  1097. asr counterL , tempK, #3 // counterL = tempK / 8
  1098. cmp counterL , #0
  1099. ble strmm_kernel_L4_M2_40 // fewer than 8: skip the unrolled loop
  // Unrolled loop: 8 KERNEL2x4_SUB invocations per iteration.
  1100. strmm_kernel_L4_M2_22:
  1101. KERNEL2x4_SUB
  1102. KERNEL2x4_SUB
  1103. KERNEL2x4_SUB
  1104. KERNEL2x4_SUB
  1105. KERNEL2x4_SUB
  1106. KERNEL2x4_SUB
  1107. KERNEL2x4_SUB
  1108. KERNEL2x4_SUB
  1109. subs counterL, counterL, #1
  1110. bgt strmm_kernel_L4_M2_22
  1111. strmm_kernel_L4_M2_40:
  1112. ands counterL , tempK, #7 // counterL = tempK % 8
  1113. ble strmm_kernel_L4_M2_100 // no remainder
  // Remainder loop: one KERNEL2x4_SUB per iteration.
  1114. strmm_kernel_L4_M2_42:
  1115. KERNEL2x4_SUB
  1116. subs counterL, counterL, #1
  1117. bgt strmm_kernel_L4_M2_42
  1118. strmm_kernel_L4_M2_100:
  1119. SAVE2x4
  // In these configurations advance pA by tempK * 8 and pB by tempK * 16,
  // where tempK = origK - tempOffset - (2 if LEFT, else 4).
  1120. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1121. sub tempK, origK, tempOffset
  1122. #if defined(LEFT)
  1123. sub tempK, tempK, #2
  1124. #else
  1125. sub tempK, tempK, #4
  1126. #endif
  1127. lsl temp, tempK, #3 // temp = tempK * 8
  1128. add pA, pA, temp
  1129. lsl temp, tempK, #4 // temp = tempK * 16
  1130. add pB, pB, temp
  1131. #endif
  1132. #if defined(LEFT)
  1133. add tempOffset, tempOffset, #2 // 2 more rows have been processed
  1134. #endif
  1135. strmm_kernel_L4_M2_END:
  // 1-row tile of the current 4-column panel. counterI still holds origM
  // (loaded at strmm_kernel_L4_M2_BEGIN); the tile runs only when origM is
  // odd. INIT1x4, KERNEL1x4_SUB and SAVE1x4 are macros defined earlier in
  // the file (not shown here).
  1136. strmm_kernel_L4_M1_BEGIN:
  1137. tst counterI, #1 // is bit 0 of origM set (one odd row)?
  1138. ble strmm_kernel_L4_END // no: this panel is finished
  1139. strmm_kernel_L4_M1_20:
  1140. INIT1x4
  // pB always restarts at origPB. In the #else configuration pB is advanced
  // by tempOffset * 16 and pA by tempOffset * 4.
  1141. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1142. mov pB, origPB
  1143. #else
  1144. mov pB, origPB
  1145. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1146. add pB, pB, temp
  1147. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1148. add pA, pA, temp
  1149. #endif
  // tempK = number of KERNEL1x4_SUB invocations for this tile.
  1150. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1151. sub tempK, origK, tempOffset
  1152. #elif defined(LEFT)
  1153. add tempK, tempOffset, #1
  1154. #else
  1155. add tempK, tempOffset, #4
  1156. #endif
  1157. asr counterL , tempK, #3 // counterL = tempK / 8
  1158. cmp counterL , #0
  1159. ble strmm_kernel_L4_M1_40 // fewer than 8: skip the unrolled loop
  // Unrolled loop: 8 KERNEL1x4_SUB invocations per iteration.
  1160. strmm_kernel_L4_M1_22:
  1161. KERNEL1x4_SUB
  1162. KERNEL1x4_SUB
  1163. KERNEL1x4_SUB
  1164. KERNEL1x4_SUB
  1165. KERNEL1x4_SUB
  1166. KERNEL1x4_SUB
  1167. KERNEL1x4_SUB
  1168. KERNEL1x4_SUB
  1169. subs counterL, counterL, #1
  1170. bgt strmm_kernel_L4_M1_22
  1171. strmm_kernel_L4_M1_40:
  1172. ands counterL , tempK, #7 // counterL = tempK % 8
  1173. ble strmm_kernel_L4_M1_100 // no remainder
  // Remainder loop: one KERNEL1x4_SUB per iteration.
  1174. strmm_kernel_L4_M1_42:
  1175. KERNEL1x4_SUB
  1176. subs counterL, counterL, #1
  1177. bgt strmm_kernel_L4_M1_42
  1178. strmm_kernel_L4_M1_100:
  1179. SAVE1x4
  // In these configurations advance pA by tempK * 4 and pB by tempK * 16,
  // where tempK = origK - tempOffset - (1 if LEFT, else 4).
  1180. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1181. sub tempK, origK, tempOffset
  1182. #if defined(LEFT)
  1183. sub tempK, tempK, #1
  1184. #else
  1185. sub tempK, tempK, #4
  1186. #endif
  1187. lsl temp, tempK, #2 // temp = tempK * 4
  1188. add pA, pA, temp
  1189. lsl temp, tempK, #4 // temp = tempK * 16
  1190. add pB, pB, temp
  1191. #endif
  1192. #if defined(LEFT)
  1193. add tempOffset, tempOffset, #1 // 1 more row has been processed
  1194. #endif
  // End of one 4-column panel: step origPB past the panel, bump tempOffset
  // when LEFT is not defined, and loop while counterJ is still positive.
  // strmm_kernel_L4_BEGIN is defined above the visible part of the file.
  1195. strmm_kernel_L4_END:
  1196. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  1197. #if !defined(LEFT)
  1198. add tempOffset, tempOffset, #4 // 4 more columns have been processed
  1199. #endif
  1200. subs counterJ, counterJ , #1 // j--
  1201. bgt strmm_kernel_L4_BEGIN // more 4-column panels to do
  1202. /******************************************************************************/
  // N remainder, 2-column panel: taken only when bit 1 of origN is set.
  // If (origN & 3) == 0 there is no remainder at all and the routine exits.
  1203. strmm_kernel_L2_BEGIN: // N remainder: (origN & 3) columns are left
  1204. mov counterJ , origN
  1205. tst counterJ , #3 // any columns left over beyond a multiple of 4?
  1206. ble strmm_kernel_L999 // none: done
  1207. tst counterJ , #2 // is bit 1 of origN set (a group of 2 columns)?
  1208. ble strmm_kernel_L1_BEGIN // no: go on to the 1-column panel
  1209. mov pCRow0, pC // pCRow0 = pC
  1210. add pC,pC,LDC, lsl #1 // pC += 2 * LDC (next panel of C)
  1211. #if defined(LEFT)
  1212. mov tempOffset, offset // restart the offset for this panel
  1213. #endif
  1214. mov pA, origPA // pA = A
  // 16-row tiles of the 2-column panel: loops origM / 16 times.
  // INIT16x2, KERNEL16x2_SUB and SAVE16x2 are macros defined earlier in the
  // file (not shown here).
  1215. strmm_kernel_L2_M16_BEGIN:
  1216. mov counterI, origM
  1217. asr counterI, counterI, #4 // counterI = counterI / 16
  1218. cmp counterI,#0
  1219. ble strmm_kernel_L2_M8_BEGIN // fewer than 16 rows: try the 8-row tile
  1220. strmm_kernel_L2_M16_20:
  1221. INIT16x2
  // pB restarts at origPB for every tile. In the #else configuration pA is
  // advanced by tempOffset * 64 and pB by tempOffset * 8.
  1222. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1223. mov pB, origPB
  1224. #else
  1225. mov pB, origPB
  1226. lsl temp, tempOffset, #6 // temp = tempOffset * 64
  1227. add pA, pA, temp
  1228. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1229. add pB, pB, temp
  1230. #endif
  // tempK = number of KERNEL16x2_SUB invocations for this tile.
  1231. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1232. sub tempK, origK, tempOffset
  1233. #elif defined(LEFT)
  1234. add tempK, tempOffset, #16
  1235. #else
  1236. add tempK, tempOffset, #2
  1237. #endif
  1238. asr counterL , tempK, #3 // counterL = tempK / 8
  1239. cmp counterL,#0
  1240. ble strmm_kernel_L2_M16_40 // fewer than 8: skip the unrolled loop
  1241. .align 5
  // Unrolled loop: 8 KERNEL16x2_SUB invocations per iteration.
  1242. strmm_kernel_L2_M16_22:
  1243. KERNEL16x2_SUB
  1244. KERNEL16x2_SUB
  1245. KERNEL16x2_SUB
  1246. KERNEL16x2_SUB
  1247. KERNEL16x2_SUB
  1248. KERNEL16x2_SUB
  1249. KERNEL16x2_SUB
  1250. KERNEL16x2_SUB
  1251. subs counterL, counterL, #1
  1252. bgt strmm_kernel_L2_M16_22
  1253. strmm_kernel_L2_M16_40:
  1254. ands counterL , tempK, #7 // counterL = tempK % 8
  1255. ble strmm_kernel_L2_M16_100 // no remainder
  // Remainder loop: one KERNEL16x2_SUB per iteration.
  1256. strmm_kernel_L2_M16_42:
  1257. KERNEL16x2_SUB
  1258. subs counterL, counterL, #1
  1259. bgt strmm_kernel_L2_M16_42
  1260. strmm_kernel_L2_M16_100:
  1261. SAVE16x2
  // In these configurations advance pA by tempK * 64 and pB by tempK * 8,
  // where tempK = origK - tempOffset - (16 if LEFT, else 2).
  1262. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1263. sub tempK, origK, tempOffset
  1264. #if defined(LEFT)
  1265. sub tempK, tempK, #16
  1266. #else
  1267. sub tempK, tempK, #2
  1268. #endif
  1269. lsl temp, tempK, #6 // temp = tempK * 64
  1270. add pA, pA, temp
  1271. lsl temp, tempK, #3 // temp = tempK * 8
  1272. add pB, pB, temp
  1273. #endif
  1274. #if defined(LEFT)
  1275. add tempOffset, tempOffset, #16 // 16 more rows have been processed
  1276. #endif
  1277. strmm_kernel_L2_M16_END:
  1278. subs counterI, counterI, #1 // one 16-row tile done
  1279. bgt strmm_kernel_L2_M16_20 // more 16-row tiles to do
  1280. //------------------------------------------------------------------------------
  // 8-row tile of the 2-column panel. Runs at most once, only when bit 3 of
  // origM is set. INIT8x2, KERNEL8x2_SUB and SAVE8x2 are macros defined
  // earlier in the file (not shown here).
  1281. strmm_kernel_L2_M8_BEGIN:
  1282. mov counterI, origM
  1283. tst counterI , #15 // any rows left over beyond a multiple of 16?
  1284. ble strmm_kernel_L2_END // (origM & 15) == 0: this panel is finished
  1285. tst counterI, #8 // is bit 3 of origM set (a group of 8 rows)?
  1286. ble strmm_kernel_L2_M4_BEGIN // no: go on to the 4-row tile
  1287. strmm_kernel_L2_M8_20:
  1288. INIT8x2
  // pB always restarts at origPB. In the #else configuration pA is advanced
  // by tempOffset * 32 and pB by tempOffset * 8.
  1289. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1290. mov pB, origPB
  1291. #else
  1292. mov pB, origPB
  1293. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1294. add pA, pA, temp
  1295. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1296. add pB, pB, temp
  1297. #endif
  // tempK = number of KERNEL8x2_SUB invocations for this tile.
  1298. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1299. sub tempK, origK, tempOffset
  1300. #elif defined(LEFT)
  1301. add tempK, tempOffset, #8
  1302. #else
  1303. add tempK, tempOffset, #2
  1304. #endif
  1305. asr counterL , tempK, #3 // counterL = tempK / 8
  1306. cmp counterL,#0
  1307. ble strmm_kernel_L2_M8_40 // fewer than 8: skip the unrolled loop
  1308. .align 5
  // Unrolled loop: 8 KERNEL8x2_SUB invocations per iteration.
  1309. strmm_kernel_L2_M8_22:
  1310. KERNEL8x2_SUB
  1311. KERNEL8x2_SUB
  1312. KERNEL8x2_SUB
  1313. KERNEL8x2_SUB
  1314. KERNEL8x2_SUB
  1315. KERNEL8x2_SUB
  1316. KERNEL8x2_SUB
  1317. KERNEL8x2_SUB
  1318. subs counterL, counterL, #1
  1319. bgt strmm_kernel_L2_M8_22
  1320. strmm_kernel_L2_M8_40:
  1321. ands counterL , tempK, #7 // counterL = tempK % 8
  1322. ble strmm_kernel_L2_M8_100 // no remainder
  // Remainder loop: one KERNEL8x2_SUB per iteration.
  1323. strmm_kernel_L2_M8_42:
  1324. KERNEL8x2_SUB
  1325. subs counterL, counterL, #1
  1326. bgt strmm_kernel_L2_M8_42
  1327. strmm_kernel_L2_M8_100:
  1328. SAVE8x2
  // In these configurations advance pA by tempK * 32 and pB by tempK * 8,
  // where tempK = origK - tempOffset - (8 if LEFT, else 2).
  1329. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1330. sub tempK, origK, tempOffset
  1331. #if defined(LEFT)
  1332. sub tempK, tempK, #8
  1333. #else
  1334. sub tempK, tempK, #2
  1335. #endif
  1336. lsl temp, tempK, #5 // temp = tempK * 32
  1337. add pA, pA, temp
  1338. lsl temp, tempK, #3 // temp = tempK * 8
  1339. add pB, pB, temp
  1340. #endif
  1341. #if defined(LEFT)
  1342. add tempOffset, tempOffset, #8 // 8 more rows have been processed
  1343. #endif
  1344. strmm_kernel_L2_M8_END:
  1345. //------------------------------------------------------------------------------
  // 4-row tile of the 2-column panel. Runs at most once, only when bit 2 of
  // origM is set. INIT4x2, KERNEL4x2_SUB and SAVE4x2 are macros defined
  // earlier in the file (not shown here).
  1346. strmm_kernel_L2_M4_BEGIN:
  1347. mov counterI, origM
  1348. tst counterI , #7 // any rows left over beyond a multiple of 8?
  1349. ble strmm_kernel_L2_END // (origM & 7) == 0: this panel is finished
  1350. tst counterI, #4 // is bit 2 of origM set (a group of 4 rows)?
  1351. ble strmm_kernel_L2_M2_BEGIN // no: go on to the 2-row tile
  1352. strmm_kernel_L2_M4_20:
  1353. INIT4x2
  // pB always restarts at origPB. In the #else configuration pB is advanced
  // by tempOffset * 8 and pA by tempOffset * 16.
  1354. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1355. mov pB, origPB
  1356. #else
  1357. mov pB, origPB
  1358. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1359. add pB, pB, temp
  1360. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1361. add pA, pA, temp
  1362. #endif
  // tempK = number of KERNEL4x2_SUB invocations for this tile.
  1363. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1364. sub tempK, origK, tempOffset
  1365. #elif defined(LEFT)
  1366. add tempK, tempOffset, #4
  1367. #else
  1368. add tempK, tempOffset, #2
  1369. #endif
  1370. asr counterL , tempK, #3 // counterL = tempK / 8
  1371. cmp counterL,#0
  1372. ble strmm_kernel_L2_M4_40 // fewer than 8: skip the unrolled loop
  1373. .align 5
  // Unrolled loop: 8 KERNEL4x2_SUB invocations per iteration.
  1374. strmm_kernel_L2_M4_22:
  1375. KERNEL4x2_SUB
  1376. KERNEL4x2_SUB
  1377. KERNEL4x2_SUB
  1378. KERNEL4x2_SUB
  1379. KERNEL4x2_SUB
  1380. KERNEL4x2_SUB
  1381. KERNEL4x2_SUB
  1382. KERNEL4x2_SUB
  1383. subs counterL, counterL, #1
  1384. bgt strmm_kernel_L2_M4_22
  1385. strmm_kernel_L2_M4_40:
  1386. ands counterL , tempK, #7 // counterL = tempK % 8
  1387. ble strmm_kernel_L2_M4_100 // no remainder
  // Remainder loop: one KERNEL4x2_SUB per iteration.
  1388. strmm_kernel_L2_M4_42:
  1389. KERNEL4x2_SUB
  1390. subs counterL, counterL, #1
  1391. bgt strmm_kernel_L2_M4_42
  1392. strmm_kernel_L2_M4_100:
  1393. SAVE4x2
  // In these configurations advance pA by tempK * 16 and pB by tempK * 8,
  // where tempK = origK - tempOffset - (4 if LEFT, else 2).
  1394. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1395. sub tempK, origK, tempOffset
  1396. #if defined(LEFT)
  1397. sub tempK, tempK, #4
  1398. #else
  1399. sub tempK, tempK, #2
  1400. #endif
  1401. lsl temp, tempK, #4 // temp = tempK * 16
  1402. add pA, pA, temp
  1403. lsl temp, tempK, #3 // temp = tempK * 8
  1404. add pB, pB, temp
  1405. #endif
  1406. #if defined(LEFT)
  1407. add tempOffset, tempOffset, #4 // 4 more rows have been processed
  1408. #endif
  1409. strmm_kernel_L2_M4_END:
  1410. //------------------------------------------------------------------------------
  // 2-row tile of the 2-column panel. Runs at most once, only when bit 1 of
  // origM is set. INIT2x2, KERNEL2x2_SUB and SAVE2x2 are macros defined
  // earlier in the file (not shown here).
  1411. strmm_kernel_L2_M2_BEGIN:
  1412. mov counterI, origM
  1413. tst counterI , #3 // any rows left over beyond a multiple of 4?
  1414. ble strmm_kernel_L2_END // (origM & 3) == 0: this panel is finished
  1415. tst counterI, #2 // is bit 1 of origM set (a group of 2 rows)?
  1416. ble strmm_kernel_L2_M1_BEGIN // no: go on to the 1-row tile
  1417. strmm_kernel_L2_M2_20:
  1418. INIT2x2
  // pB always restarts at origPB. In the #else configuration both pB and pA
  // are advanced by tempOffset * 8.
  1419. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1420. mov pB, origPB
  1421. #else
  1422. mov pB, origPB
  1423. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1424. add pB, pB, temp
  1425. lsl temp, tempOffset, #3 // same value, recomputed for pA
  1426. add pA, pA, temp
  1427. #endif
  // tempK = number of KERNEL2x2_SUB invocations for this tile. The #elif and
  // #else arms are identical because the tile is 2 wide in both M and N.
  1428. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1429. sub tempK, origK, tempOffset
  1430. #elif defined(LEFT)
  1431. add tempK, tempOffset, #2
  1432. #else
  1433. add tempK, tempOffset, #2
  1434. #endif
  1435. asr counterL , tempK, #3 // counterL = tempK / 8
  1436. cmp counterL,#0
  1437. ble strmm_kernel_L2_M2_40 // fewer than 8: skip the unrolled loop
  // Unrolled loop: 8 KERNEL2x2_SUB invocations per iteration.
  1438. strmm_kernel_L2_M2_22:
  1439. KERNEL2x2_SUB
  1440. KERNEL2x2_SUB
  1441. KERNEL2x2_SUB
  1442. KERNEL2x2_SUB
  1443. KERNEL2x2_SUB
  1444. KERNEL2x2_SUB
  1445. KERNEL2x2_SUB
  1446. KERNEL2x2_SUB
  1447. subs counterL, counterL, #1
  1448. bgt strmm_kernel_L2_M2_22
  1449. strmm_kernel_L2_M2_40:
  1450. ands counterL , tempK, #7 // counterL = tempK % 8
  1451. ble strmm_kernel_L2_M2_100 // no remainder
  // Remainder loop: one KERNEL2x2_SUB per iteration.
  1452. strmm_kernel_L2_M2_42:
  1453. KERNEL2x2_SUB
  1454. subs counterL, counterL, #1
  1455. bgt strmm_kernel_L2_M2_42
  1456. strmm_kernel_L2_M2_100:
  1457. SAVE2x2
  // In these configurations advance pA and pB by tempK * 8 each,
  // where tempK = origK - tempOffset - 2.
  1458. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1459. sub tempK, origK, tempOffset
  1460. #if defined(LEFT)
  1461. sub tempK, tempK, #2
  1462. #else
  1463. sub tempK, tempK, #2
  1464. #endif
  1465. lsl temp, tempK, #3 // temp = tempK * 8
  1466. add pA, pA, temp
  1467. lsl temp, tempK, #3 // same value, recomputed for pB
  1468. add pB, pB, temp
  1469. #endif
  1470. #if defined(LEFT)
  1471. add tempOffset, tempOffset, #2 // 2 more rows have been processed
  1472. #endif
  1473. strmm_kernel_L2_M2_END:
  // 1-row tile of the 2-column panel. counterI still holds origM (loaded at
  // strmm_kernel_L2_M2_BEGIN); the tile runs only when origM is odd.
  // INIT1x2, KERNEL1x2_SUB and SAVE1x2 are macros defined earlier in the
  // file (not shown here).
  1474. strmm_kernel_L2_M1_BEGIN:
  1475. tst counterI, #1 // is bit 0 of origM set (one odd row)?
  1476. ble strmm_kernel_L2_END // no: this panel is finished
  1477. strmm_kernel_L2_M1_20:
  1478. INIT1x2
  // pB always restarts at origPB. In the #else configuration pB is advanced
  // by tempOffset * 8 and pA by tempOffset * 4.
  1479. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1480. mov pB, origPB
  1481. #else
  1482. mov pB, origPB
  1483. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1484. add pB, pB, temp
  1485. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1486. add pA, pA, temp
  1487. #endif
  // tempK = number of KERNEL1x2_SUB invocations for this tile.
  1488. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1489. sub tempK, origK, tempOffset
  1490. #elif defined(LEFT)
  1491. add tempK, tempOffset, #1
  1492. #else
  1493. add tempK, tempOffset, #2
  1494. #endif
  1495. asr counterL , tempK, #3 // counterL = tempK / 8
  1496. cmp counterL, #0
  1497. ble strmm_kernel_L2_M1_40 // fewer than 8: skip the unrolled loop
  // Unrolled loop: 8 KERNEL1x2_SUB invocations per iteration.
  1498. strmm_kernel_L2_M1_22:
  1499. KERNEL1x2_SUB
  1500. KERNEL1x2_SUB
  1501. KERNEL1x2_SUB
  1502. KERNEL1x2_SUB
  1503. KERNEL1x2_SUB
  1504. KERNEL1x2_SUB
  1505. KERNEL1x2_SUB
  1506. KERNEL1x2_SUB
  1507. subs counterL, counterL, #1
  1508. bgt strmm_kernel_L2_M1_22
  1509. strmm_kernel_L2_M1_40:
  1510. ands counterL , tempK, #7 // counterL = tempK % 8
  1511. ble strmm_kernel_L2_M1_100 // no remainder
  // Remainder loop: one KERNEL1x2_SUB per iteration.
  1512. strmm_kernel_L2_M1_42:
  1513. KERNEL1x2_SUB
  1514. subs counterL, counterL, #1
  1515. bgt strmm_kernel_L2_M1_42
  1516. strmm_kernel_L2_M1_100:
  1517. SAVE1x2
  // In these configurations advance pA by tempK * 4 and pB by tempK * 8,
  // where tempK = origK - tempOffset - (1 if LEFT, else 2).
  1518. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1519. sub tempK, origK, tempOffset
  1520. #if defined(LEFT)
  1521. sub tempK, tempK, #1
  1522. #else
  1523. sub tempK, tempK, #2
  1524. #endif
  1525. lsl temp, tempK, #2 // temp = tempK * 4
  1526. add pA, pA, temp
  1527. lsl temp, tempK, #3 // temp = tempK * 8
  1528. add pB, pB, temp
  1529. #endif
  1530. #if defined(LEFT)
  1531. add tempOffset, tempOffset, #1 // 1 more row has been processed
  1532. #endif
  // End of the 2-column panel: bump tempOffset when LEFT is not defined and
  // step origPB past the panel. There is no loop back; execution falls
  // through to the 1-column panel.
  1533. strmm_kernel_L2_END:
  1534. #if !defined(LEFT)
  1535. add tempOffset, tempOffset, #2 // 2 more columns have been processed
  1536. #endif
  1537. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  1538. /******************************************************************************/
  // N remainder, 1-column panel: taken only when origN is odd.
  1539. strmm_kernel_L1_BEGIN:
  1540. mov counterJ , origN
  1541. tst counterJ , #1 // is bit 0 of origN set (one odd column)?
  1542. ble strmm_kernel_L999 // done
  1543. mov pCRow0, pC // pCRow0 = C
  1544. add pC , pC , LDC // Update pC to point to next
  1545. #if defined(LEFT)
  1546. mov tempOffset, offset // restart the offset for this panel
  1547. #endif
  1548. mov pA, origPA // pA = A
  // 16-row tiles of the 1-column panel: loops origM / 16 times.
  // INIT16x1, KERNEL16x1_SUB and SAVE16x1 are macros defined earlier in the
  // file (not shown here).
  1549. strmm_kernel_L1_M16_BEGIN:
  1550. mov counterI, origM
  1551. asr counterI, counterI, #4 // counterI = counterI / 16
  1552. cmp counterI, #0
  1553. ble strmm_kernel_L1_M8_BEGIN // fewer than 16 rows: try the 8-row tile
  1554. strmm_kernel_L1_M16_20:
  1555. INIT16x1
  // pB restarts at origPB for every tile. In the #else configuration pA is
  // advanced by tempOffset * 64 and pB by tempOffset * 4.
  1556. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1557. mov pB, origPB
  1558. #else
  1559. mov pB, origPB
  1560. lsl temp, tempOffset, #6 // temp = tempOffset * 64
  1561. add pA, pA, temp
  1562. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1563. add pB, pB, temp
  1564. #endif
  // tempK = number of KERNEL16x1_SUB invocations for this tile.
  1565. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1566. sub tempK, origK, tempOffset
  1567. #elif defined(LEFT)
  1568. add tempK, tempOffset, #16
  1569. #else
  1570. add tempK, tempOffset, #1
  1571. #endif
  1572. asr counterL , tempK, #3 // counterL = tempK / 8
  1573. cmp counterL , #0
  1574. ble strmm_kernel_L1_M16_40 // fewer than 8: skip the unrolled loop
  1575. .align 5
  // Unrolled loop: 8 KERNEL16x1_SUB invocations per iteration.
  1576. strmm_kernel_L1_M16_22:
  1577. KERNEL16x1_SUB
  1578. KERNEL16x1_SUB
  1579. KERNEL16x1_SUB
  1580. KERNEL16x1_SUB
  1581. KERNEL16x1_SUB
  1582. KERNEL16x1_SUB
  1583. KERNEL16x1_SUB
  1584. KERNEL16x1_SUB
  1585. subs counterL, counterL, #1
  1586. bgt strmm_kernel_L1_M16_22
  1587. strmm_kernel_L1_M16_40:
  1588. ands counterL , tempK, #7 // counterL = tempK % 8
  1589. ble strmm_kernel_L1_M16_100 // no remainder
  // Remainder loop: one KERNEL16x1_SUB per iteration.
  1590. strmm_kernel_L1_M16_42:
  1591. KERNEL16x1_SUB
  1592. subs counterL, counterL, #1
  1593. bgt strmm_kernel_L1_M16_42
  1594. strmm_kernel_L1_M16_100:
  1595. SAVE16x1
  // In these configurations advance pA by tempK * 64 and pB by tempK * 4,
  // where tempK = origK - tempOffset - (16 if LEFT, else 1).
  1596. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1597. sub tempK, origK, tempOffset
  1598. #if defined(LEFT)
  1599. sub tempK, tempK, #16
  1600. #else
  1601. sub tempK, tempK, #1
  1602. #endif
  1603. lsl temp, tempK, #6 // temp = tempK * 64
  1604. add pA, pA, temp
  1605. lsl temp, tempK, #2 // temp = tempK * 4
  1606. add pB, pB, temp
  1607. #endif
  1608. #if defined(LEFT)
  1609. add tempOffset, tempOffset, #16 // 16 more rows have been processed
  1610. #endif
  1611. strmm_kernel_L1_M16_END:
  1612. subs counterI, counterI, #1 // one 16-row tile done
  1613. bgt strmm_kernel_L1_M16_20 // more 16-row tiles to do
  1614. //------------------------------------------------------------------------------
  // 8-row tile of the 1-column panel. Runs at most once, only when bit 3 of
  // origM is set. INIT8x1, KERNEL8x1_SUB and SAVE8x1 are macros defined
  // earlier in the file (not shown here).
  1615. strmm_kernel_L1_M8_BEGIN:
  1616. mov counterI, origM
  1617. tst counterI , #15 // any rows left over beyond a multiple of 16?
  1618. ble strmm_kernel_L1_END // (origM & 15) == 0: this panel is finished
  1619. tst counterI, #8 // is bit 3 of origM set (a group of 8 rows)?
  1620. ble strmm_kernel_L1_M4_BEGIN // no: go on to the 4-row tile
  1621. strmm_kernel_L1_M8_20:
  1622. INIT8x1
  // pB always restarts at origPB. In the #else configuration pA is advanced
  // by tempOffset * 32 and pB by tempOffset * 4.
  1623. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1624. mov pB, origPB
  1625. #else
  1626. mov pB, origPB
  1627. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1628. add pA, pA, temp
  1629. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1630. add pB, pB, temp
  1631. #endif
  // tempK = number of KERNEL8x1_SUB invocations for this tile.
  1632. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1633. sub tempK, origK, tempOffset
  1634. #elif defined(LEFT)
  1635. add tempK, tempOffset, #8
  1636. #else
  1637. add tempK, tempOffset, #1
  1638. #endif
  1639. asr counterL , tempK, #3 // counterL = tempK / 8
  1640. cmp counterL , #0
  1641. ble strmm_kernel_L1_M8_40 // fewer than 8: skip the unrolled loop
  1642. .align 5
  // Unrolled loop: 8 KERNEL8x1_SUB invocations per iteration.
  1643. strmm_kernel_L1_M8_22:
  1644. KERNEL8x1_SUB
  1645. KERNEL8x1_SUB
  1646. KERNEL8x1_SUB
  1647. KERNEL8x1_SUB
  1648. KERNEL8x1_SUB
  1649. KERNEL8x1_SUB
  1650. KERNEL8x1_SUB
  1651. KERNEL8x1_SUB
  1652. subs counterL, counterL, #1
  1653. bgt strmm_kernel_L1_M8_22
  1654. strmm_kernel_L1_M8_40:
  1655. ands counterL , tempK, #7 // counterL = tempK % 8
  1656. ble strmm_kernel_L1_M8_100 // no remainder
  // Remainder loop: one KERNEL8x1_SUB per iteration.
  1657. strmm_kernel_L1_M8_42:
  1658. KERNEL8x1_SUB
  1659. subs counterL, counterL, #1
  1660. bgt strmm_kernel_L1_M8_42
  1661. strmm_kernel_L1_M8_100:
  1662. SAVE8x1
  // In these configurations advance pA by tempK * 32 and pB by tempK * 4,
  // where tempK = origK - tempOffset - (8 if LEFT, else 1).
  1663. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1664. sub tempK, origK, tempOffset
  1665. #if defined(LEFT)
  1666. sub tempK, tempK, #8
  1667. #else
  1668. sub tempK, tempK, #1
  1669. #endif
  1670. lsl temp, tempK, #5 // temp = tempK * 32
  1671. add pA, pA, temp
  1672. lsl temp, tempK, #2 // temp = tempK * 4
  1673. add pB, pB, temp
  1674. #endif
  1675. #if defined(LEFT)
  1676. add tempOffset, tempOffset, #8 // 8 more rows have been processed
  1677. #endif
  1678. strmm_kernel_L1_M8_END:
  1679. //------------------------------------------------------------------------------
  // 4-row tile of the 1-column panel. Runs at most once, only when bit 2 of
  // origM is set. INIT4x1, KERNEL4x1_SUB and SAVE4x1 are macros defined
  // earlier in the file (not shown here).
  1680. strmm_kernel_L1_M4_BEGIN:
  1681. mov counterI, origM
  1682. tst counterI , #7 // any rows left over beyond a multiple of 8?
  1683. ble strmm_kernel_L1_END // (origM & 7) == 0: this panel is finished
  1684. tst counterI, #4 // is bit 2 of origM set (a group of 4 rows)?
  1685. ble strmm_kernel_L1_M2_BEGIN // no: go on to the 2-row tile
  1686. strmm_kernel_L1_M4_20:
  1687. INIT4x1
  // pB always restarts at origPB. In the #else configuration pB is advanced
  // by tempOffset * 4 and pA by tempOffset * 16.
  1688. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1689. mov pB, origPB
  1690. #else
  1691. mov pB, origPB
  1692. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1693. add pB, pB, temp
  1694. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1695. add pA, pA, temp
  1696. #endif
  // tempK = number of KERNEL4x1_SUB invocations for this tile.
  1697. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1698. sub tempK, origK, tempOffset
  1699. #elif defined(LEFT)
  1700. add tempK, tempOffset, #4
  1701. #else
  1702. add tempK, tempOffset, #1
  1703. #endif
  1704. asr counterL , tempK, #3 // counterL = tempK / 8
  1705. cmp counterL , #0
  1706. ble strmm_kernel_L1_M4_40 // fewer than 8: skip the unrolled loop
  1707. .align 5
  // Unrolled loop: 8 KERNEL4x1_SUB invocations per iteration.
  1708. strmm_kernel_L1_M4_22:
  1709. KERNEL4x1_SUB
  1710. KERNEL4x1_SUB
  1711. KERNEL4x1_SUB
  1712. KERNEL4x1_SUB
  1713. KERNEL4x1_SUB
  1714. KERNEL4x1_SUB
  1715. KERNEL4x1_SUB
  1716. KERNEL4x1_SUB
  1717. subs counterL, counterL, #1
  1718. bgt strmm_kernel_L1_M4_22
  1719. strmm_kernel_L1_M4_40:
  1720. ands counterL , tempK, #7 // counterL = tempK % 8
  1721. ble strmm_kernel_L1_M4_100 // no remainder
  // Remainder loop: one KERNEL4x1_SUB per iteration.
  1722. strmm_kernel_L1_M4_42:
  1723. KERNEL4x1_SUB
  1724. subs counterL, counterL, #1
  1725. bgt strmm_kernel_L1_M4_42
  1726. strmm_kernel_L1_M4_100:
  1727. SAVE4x1
  // In these configurations advance pA by tempK * 16 and pB by tempK * 4,
  // where tempK = origK - tempOffset - (4 if LEFT, else 1).
  1728. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1729. sub tempK, origK, tempOffset
  1730. #if defined(LEFT)
  1731. sub tempK, tempK, #4
  1732. #else
  1733. sub tempK, tempK, #1
  1734. #endif
  1735. lsl temp, tempK, #4 // temp = tempK * 16
  1736. add pA, pA, temp
  1737. lsl temp, tempK, #2 // temp = tempK * 4
  1738. add pB, pB, temp
  1739. #endif
  1740. #if defined(LEFT)
  1741. add tempOffset, tempOffset, #4 // 4 more rows have been processed
  1742. #endif
  1743. strmm_kernel_L1_M4_END:
  1744. //------------------------------------------------------------------------------
  // 2-row tile of the 1-column panel. Runs at most once, only when bit 1 of
  // origM is set. INIT2x1, KERNEL2x1_SUB and SAVE2x1 are macros defined
  // earlier in the file (not shown here).
  1745. strmm_kernel_L1_M2_BEGIN:
  1746. mov counterI, origM
  1747. tst counterI , #3 // any rows left over beyond a multiple of 4?
  1748. ble strmm_kernel_L1_END // (origM & 3) == 0: this panel is finished
  1749. tst counterI, #2 // is bit 1 of origM set (a group of 2 rows)?
  1750. ble strmm_kernel_L1_M1_BEGIN // no: go on to the 1-row tile
  1751. strmm_kernel_L1_M2_20:
  1752. INIT2x1
  // pB always restarts at origPB. In the #else configuration pB is advanced
  // by tempOffset * 4 and pA by tempOffset * 8.
  1753. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1754. mov pB, origPB
  1755. #else
  1756. mov pB, origPB
  1757. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1758. add pB, pB, temp
  1759. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1760. add pA, pA, temp
  1761. #endif
  // tempK = number of KERNEL2x1_SUB invocations for this tile.
  1762. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1763. sub tempK, origK, tempOffset
  1764. #elif defined(LEFT)
  1765. add tempK, tempOffset, #2
  1766. #else
  1767. add tempK, tempOffset, #1
  1768. #endif
  1769. asr counterL , tempK, #3 // counterL = tempK / 8
  1770. cmp counterL , #0
  1771. ble strmm_kernel_L1_M2_40 // fewer than 8: skip the unrolled loop
  // Unrolled loop: 8 KERNEL2x1_SUB invocations per iteration.
  1772. strmm_kernel_L1_M2_22:
  1773. KERNEL2x1_SUB
  1774. KERNEL2x1_SUB
  1775. KERNEL2x1_SUB
  1776. KERNEL2x1_SUB
  1777. KERNEL2x1_SUB
  1778. KERNEL2x1_SUB
  1779. KERNEL2x1_SUB
  1780. KERNEL2x1_SUB
  1781. subs counterL, counterL, #1
  1782. bgt strmm_kernel_L1_M2_22
  1783. strmm_kernel_L1_M2_40:
  1784. ands counterL , tempK, #7 // counterL = tempK % 8
  1785. ble strmm_kernel_L1_M2_100 // no remainder
  // Remainder loop: one KERNEL2x1_SUB per iteration.
  1786. strmm_kernel_L1_M2_42:
  1787. KERNEL2x1_SUB
  1788. subs counterL, counterL, #1
  1789. bgt strmm_kernel_L1_M2_42
  1790. strmm_kernel_L1_M2_100:
  1791. SAVE2x1
  // In these configurations advance pA by tempK * 8 and pB by tempK * 4,
  // where tempK = origK - tempOffset - (2 if LEFT, else 1).
  1792. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1793. sub tempK, origK, tempOffset
  1794. #if defined(LEFT)
  1795. sub tempK, tempK, #2
  1796. #else
  1797. sub tempK, tempK, #1
  1798. #endif
  1799. lsl temp, tempK, #3 // temp = tempK * 8
  1800. add pA, pA, temp
  1801. lsl temp, tempK, #2 // temp = tempK * 4
  1802. add pB, pB, temp
  1803. #endif
  1804. #if defined(LEFT)
  1805. add tempOffset, tempOffset, #2 // 2 more rows have been processed
  1806. #endif
  1807. strmm_kernel_L1_M2_END:
  // 1-row tile of the 1-column panel: the last tile of the routine.
  // counterI still holds origM (loaded at strmm_kernel_L1_M2_BEGIN); the
  // tile runs only when origM is odd. INIT1x1, KERNEL1x1_SUB and SAVE1x1
  // are macros defined earlier in the file (not shown here).
  1808. strmm_kernel_L1_M1_BEGIN:
  1809. tst counterI, #1 // is bit 0 of origM set (one odd row)?
  1810. ble strmm_kernel_L1_END // no: this panel is finished
  1811. strmm_kernel_L1_M1_20:
  1812. INIT1x1
  // pB always restarts at origPB. In the #else configuration both pB and pA
  // are advanced by tempOffset * 4.
  1813. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1814. mov pB, origPB
  1815. #else
  1816. mov pB, origPB
  1817. lsl temp, tempOffset, #2 // temp = tempOffset * 4
  1818. add pB, pB, temp
  1819. lsl temp, tempOffset, #2 // same value, recomputed for pA
  1820. add pA, pA, temp
  1821. #endif
  // tempK = number of KERNEL1x1_SUB invocations for this tile. The #elif and
  // #else arms are identical because the tile is 1 wide in both M and N.
  1822. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1823. sub tempK, origK, tempOffset
  1824. #elif defined(LEFT)
  1825. add tempK, tempOffset, #1
  1826. #else
  1827. add tempK, tempOffset, #1
  1828. #endif
  1829. asr counterL , tempK, #3 // counterL = tempK / 8
  1830. cmp counterL , #0
  1831. ble strmm_kernel_L1_M1_40 // fewer than 8: skip the unrolled loop
  // Unrolled loop: 8 KERNEL1x1_SUB invocations per iteration.
  1832. strmm_kernel_L1_M1_22:
  1833. KERNEL1x1_SUB
  1834. KERNEL1x1_SUB
  1835. KERNEL1x1_SUB
  1836. KERNEL1x1_SUB
  1837. KERNEL1x1_SUB
  1838. KERNEL1x1_SUB
  1839. KERNEL1x1_SUB
  1840. KERNEL1x1_SUB
  1841. subs counterL, counterL, #1
  1842. bgt strmm_kernel_L1_M1_22
  1843. strmm_kernel_L1_M1_40:
  1844. ands counterL , tempK, #7 // counterL = tempK % 8
  1845. ble strmm_kernel_L1_M1_100 // no remainder
  // Remainder loop: one KERNEL1x1_SUB per iteration.
  1846. strmm_kernel_L1_M1_42:
  1847. KERNEL1x1_SUB
  1848. subs counterL, counterL, #1
  1849. bgt strmm_kernel_L1_M1_42
  1850. strmm_kernel_L1_M1_100:
  // Unlike the other tiles, no pA/pB/tempOffset update follows the save:
  // execution falls straight through to the routine's exit.
  1851. SAVE1x1
  // Common exit: return 0 in x0, reload registers from the 11 * 16-byte
  // stack frame, release the frame and return. The matching stores are
  // presumably in the prologue, which is above the visible part of the file.
  1852. strmm_kernel_L1_END:
  1853. strmm_kernel_L999:
  1854. mov x0, #0 // set return value
  1855. ldp d8, d9, [sp, #(0 * 16)]
  1856. ldp d10, d11, [sp, #(1 * 16)]
  1857. ldp d12, d13, [sp, #(2 * 16)]
  1858. ldp d14, d15, [sp, #(3 * 16)]
  // NOTE(review): d16/d17 are not callee-saved under AAPCS64, so this reload
  // looks unnecessary; kept to mirror the (unseen) prologue -- confirm.
  1859. ldp d16, d17, [sp, #(4 * 16)]
  // NOTE(review): x18 is the platform register on some OSes (e.g. Apple,
  // Windows); verify that saving/restoring it here is intended.
  1860. ldp x18, x19, [sp, #(5 * 16)]
  1861. ldp x20, x21, [sp, #(6 * 16)]
  1862. ldp x22, x23, [sp, #(7 * 16)]
  1863. ldp x24, x25, [sp, #(8 * 16)]
  1864. ldp x26, x27, [sp, #(9 * 16)]
  1865. ldr x28, [sp, #(10 * 16)]
  1866. add sp, sp, #(11*16) // release the 176-byte frame
  1867. ret
  1868. EPILOGUE