You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dtrmm_kernel_8x4.S 35 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  #define ASSEMBLER
  #include "common.h"

  /*
   * Argument registers on entry (alpha0 is read from d0 by the prologue):
   *   x0 = bm, x1 = bn, x2 = bk, d0 = alpha0,
   *   x3 = ba, x4 = bb, x5 = C, x6 = ldc, x7 = offset
   */
  /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */

  #define origM       x0
  #define origN       x1
  #define origK       x2
  #define origPA      x3
  #define origPB      x4
  #define pC          x5
  #define LDC         x6
  #define offset      x7
  #define counterL    x8
  #define counterI    x9
  #define counterJ    x10
  #define pB          x11
  #define pCRow0      x12
  #define pCRow1      x13
  #define pCRow2      x14
  #define pA          x15
  #define temp        x16
  #define tempOffset  x17
  /*
   * tempK lives in x19 rather than x18: AAPCS64 reserves x18 as the platform
   * register (Apple and Windows forbid using it). The prologue already stores
   * x19 with "stp x18, x19"; the matching restore is expected in the epilogue
   * (not shown here, confirm before merging).
   */
  #define tempK       x19

  #define alpha0      d10
  #define alphaV0     v10.d[0]
  #define alpha1      d11
  #define alphaV1     v11.d[0]
  #define alpha2      d14
  #define alphaV2     v14.d[0]
  #define alpha3      d15
  #define alphaV3     v15.d[0]

  // General-purpose register map
  // 00 origM
  // 01 origN
  // 02 origK
  // 03 origPA
  // 04 origPB
  // 05 pC
  // 06 origLDC -> LDC
  // 07 offset
  // 08 counterL
  // 09 counterI
  // 10 counterJ
  // 11 pB
  // 12 pCRow0
  // 13 pCRow1
  // 14 pCRow2
  // 15 pA
  // 16 temp
  // 17 tempOffset
  // 18 platform register (AAPCS64), not used as a working register
  // 19 must save tempK
  // 20 must save
  // 21 must save
  // 22 must save
  // 23 must save
  // 24 must save
  // 25 must save
  // 26 must save
  // 27 must save
  // 28 must save
  // 29 frame
  // 30 link
  // 31 sp

  // SIMD/FP register map (only v8-v15 are callee-saved under AAPCS64;
  // the prologue additionally stores d16/d17)
  //v00 ALPHA -> pA0_0, pA0_1
  //v01 pA0_2, pA0_3
  //v02 pA0_4, pA0_5
  //v03 pA0_6, pA0_7
  //v04 pA1_0, pA1_1
  //v05 pA1_2, pA1_3
  //v06 pA1_4, pA1_5
  //v07 pA1_6, pA1_7
  //v08 must save pB0_0, pB0_1
  //v09 must save pB0_2, pB0_3
  //v10 must save ALPHA0
  //v11 must save ALPHA1
  //v12 must save pB1_0, pB1_1
  //v13 must save pB1_2, pB1_3
  //v14 must save ALPHA2
  //v15 must save ALPHA3
  //v16 C00, C01
  //v17 C02, C03
  //v18 C04, C05
  //v19 C06, C07
  //v20 C10, C11
  //v21 C12, C13
  //v22 C14, C15
  //v23 C16, C17
  //v24 C20, C21
  //v25 C22, C23
  //v26 C24, C25
  //v27 C26, C27
  //v28 C30, C31
  //v29 C32, C33
  //v30 C34, C35
  //v31 C36, C37
  122. /*******************************************************************************
  123. * Macro definitions
  124. *******************************************************************************/
  /* INIT8x4: zero d16-d31, the sixteen accumulators of the 8x4 tile. */
  .macro INIT8x4
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d18, xzr
      fmov    d19, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d22, xzr
      fmov    d23, xzr
      fmov    d24, xzr
      fmov    d25, xzr
      fmov    d26, xzr
      fmov    d27, xzr
      fmov    d28, xzr
      fmov    d29, xzr
      fmov    d30, xzr
      fmov    d31, xzr
  .endm
  /*
   * KERNEL8x4_I: opening step of the pipelined 8x4 loop.
   * Loads one k-step (A -> v0-v3, B -> v8-v9) and starts v16-v31 with fmul,
   * overwriting whatever the accumulators held. It then preloads the next
   * k-step into v4-v7 / v12-v13. Advances pA by 128 bytes and pB by 64.
   */
  .macro KERNEL8x4_I
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v2.2d, v3.2d}, [pA], #32

      // v16-v19 = A * B[0]
      fmul    v16.2d, v0.2d, v8.d[0]
      fmul    v17.2d, v1.2d, v8.d[0]
      fmul    v18.2d, v2.2d, v8.d[0]
      fmul    v19.2d, v3.2d, v8.d[0]

      // v20-v23 = A * B[1]
      fmul    v20.2d, v0.2d, v8.d[1]
      fmul    v21.2d, v1.2d, v8.d[1]
      fmul    v22.2d, v2.2d, v8.d[1]
      fmul    v23.2d, v3.2d, v8.d[1]

      // v24-v27 = A * B[2]
      fmul    v24.2d, v0.2d, v9.d[0]
      fmul    v25.2d, v1.2d, v9.d[0]
      fmul    v26.2d, v2.2d, v9.d[0]
      fmul    v27.2d, v3.2d, v9.d[0]

      // v28-v31 = A * B[3]
      fmul    v28.2d, v0.2d, v9.d[1]
      fmul    v29.2d, v1.2d, v9.d[1]
      fmul    v30.2d, v2.2d, v9.d[1]
      fmul    v31.2d, v3.2d, v9.d[1]

      // Preload the second operand set for the following step
      ld1     {v4.2d, v5.2d}, [pA], #32
      ld1     {v12.2d, v13.2d}, [pB], #32
      ld1     {v6.2d, v7.2d}, [pA], #32
  .endm
  /*
   * KERNEL8x4_M1: pipelined step on operand set 0.
   * Accumulates v0-v3 (A) times v8-v9 (B) into v16-v31, then loads the next
   * k-step into v4-v7 / v12-v13 and prefetches 512 bytes beyond the updated pA.
   */
  .macro KERNEL8x4_M1
      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v18.2d, v2.2d, v8.d[0]
      fmla    v19.2d, v3.2d, v8.d[0]

      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v21.2d, v1.2d, v8.d[1]
      fmla    v22.2d, v2.2d, v8.d[1]
      fmla    v23.2d, v3.2d, v8.d[1]

      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v25.2d, v1.2d, v9.d[0]
      fmla    v26.2d, v2.2d, v9.d[0]
      fmla    v27.2d, v3.2d, v9.d[0]

      fmla    v28.2d, v0.2d, v9.d[1]
      fmla    v29.2d, v1.2d, v9.d[1]
      fmla    v30.2d, v2.2d, v9.d[1]
      fmla    v31.2d, v3.2d, v9.d[1]

      // Fetch operand set 1 for KERNEL8x4_M2 / KERNEL8x4_E
      ld1     {v4.2d, v5.2d}, [pA], #32
      ld1     {v12.2d, v13.2d}, [pB], #32
      ld1     {v6.2d, v7.2d}, [pA], #32

      prfm    PLDL1KEEP, [pA, #512]
  .endm
  /*
   * KERNEL8x4_M2: pipelined step on operand set 1.
   * Accumulates v4-v7 (A) times v12-v13 (B) into v16-v31, then loads the next
   * k-step into v0-v3 / v8-v9 and prefetches 512 bytes beyond the updated pB.
   */
  .macro KERNEL8x4_M2
      fmla    v16.2d, v4.2d, v12.d[0]
      fmla    v17.2d, v5.2d, v12.d[0]
      fmla    v18.2d, v6.2d, v12.d[0]
      fmla    v19.2d, v7.2d, v12.d[0]

      fmla    v20.2d, v4.2d, v12.d[1]
      fmla    v21.2d, v5.2d, v12.d[1]
      fmla    v22.2d, v6.2d, v12.d[1]
      fmla    v23.2d, v7.2d, v12.d[1]

      fmla    v24.2d, v4.2d, v13.d[0]
      fmla    v25.2d, v5.2d, v13.d[0]
      fmla    v26.2d, v6.2d, v13.d[0]
      fmla    v27.2d, v7.2d, v13.d[0]

      fmla    v28.2d, v4.2d, v13.d[1]
      fmla    v29.2d, v5.2d, v13.d[1]
      fmla    v30.2d, v6.2d, v13.d[1]
      fmla    v31.2d, v7.2d, v13.d[1]

      // Fetch operand set 0 for the next KERNEL8x4_M1
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v2.2d, v3.2d}, [pA], #32

      prfm    PLDL1KEEP, [pB, #512]
  .endm
  /*
   * KERNEL8x4_E: closing step of the pipelined 8x4 loop.
   * Accumulates the already-loaded operand set 1 (v4-v7, v12-v13) into
   * v16-v31 and performs no further loads, so pA and pB are unchanged.
   */
  .macro KERNEL8x4_E
      fmla    v16.2d, v4.2d, v12.d[0]
      fmla    v17.2d, v5.2d, v12.d[0]
      fmla    v18.2d, v6.2d, v12.d[0]
      fmla    v19.2d, v7.2d, v12.d[0]

      fmla    v20.2d, v4.2d, v12.d[1]
      fmla    v21.2d, v5.2d, v12.d[1]
      fmla    v22.2d, v6.2d, v12.d[1]
      fmla    v23.2d, v7.2d, v12.d[1]

      fmla    v24.2d, v4.2d, v13.d[0]
      fmla    v25.2d, v5.2d, v13.d[0]
      fmla    v26.2d, v6.2d, v13.d[0]
      fmla    v27.2d, v7.2d, v13.d[0]

      fmla    v28.2d, v4.2d, v13.d[1]
      fmla    v29.2d, v5.2d, v13.d[1]
      fmla    v30.2d, v6.2d, v13.d[1]
      fmla    v31.2d, v7.2d, v13.d[1]
  .endm
  /*
   * KERNEL8x4_SUB: one self-contained k-step of the 8x4 tile.
   * Loads 8 doubles of A (v0-v3) and 4 doubles of B (v8-v9), then accumulates
   * into v16-v31. Advances pA by 64 bytes and pB by 32 bytes.
   */
  .macro KERNEL8x4_SUB
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v2.2d, v3.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v18.2d, v2.2d, v8.d[0]
      fmla    v19.2d, v3.2d, v8.d[0]

      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v21.2d, v1.2d, v8.d[1]
      fmla    v22.2d, v2.2d, v8.d[1]
      fmla    v23.2d, v3.2d, v8.d[1]

      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v25.2d, v1.2d, v9.d[0]
      fmla    v26.2d, v2.2d, v9.d[0]
      fmla    v27.2d, v3.2d, v9.d[0]

      fmla    v28.2d, v0.2d, v9.d[1]
      fmla    v29.2d, v1.2d, v9.d[1]
      fmla    v30.2d, v2.2d, v9.d[1]
      fmla    v31.2d, v3.2d, v9.d[1]
  .endm
  /*
   * SAVE8x4: scale the 8x4 accumulators by alpha and store them to C.
   * C is overwritten, not accumulated (nothing is loaded from C). The four
   * 64-byte groups go to pCRow0 and the three addresses LDC, 2*LDC and 3*LDC
   * beyond it. On exit pCRow0 has advanced by 64 bytes; pCRow1/pCRow2 are
   * scratch and v0-v7 are clobbered.
   */
  .macro SAVE8x4
      add     pCRow1, pCRow0, LDC

      fmul    v0.2d, v16.2d, alphaV0
      fmul    v1.2d, v17.2d, alphaV1
      fmul    v2.2d, v18.2d, alphaV2
      fmul    v3.2d, v19.2d, alphaV3
      st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0], #64

      add     pCRow2, pCRow1, LDC

      fmul    v4.2d, v20.2d, alphaV0
      fmul    v5.2d, v21.2d, alphaV1
      fmul    v6.2d, v22.2d, alphaV2
      fmul    v7.2d, v23.2d, alphaV3
      st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]

      // pCRow1 is reused for the fourth destination
      add     pCRow1, pCRow2, LDC

      fmul    v0.2d, v24.2d, alphaV0
      fmul    v1.2d, v25.2d, alphaV1
      fmul    v2.2d, v26.2d, alphaV2
      fmul    v3.2d, v27.2d, alphaV3
      st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]

      fmul    v4.2d, v28.2d, alphaV0
      fmul    v5.2d, v29.2d, alphaV1
      fmul    v6.2d, v30.2d, alphaV2
      fmul    v7.2d, v31.2d, alphaV3
      st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  .endm
  291. /******************************************************************************/
  /* INIT4x4: zero the eight 4x4 accumulators d16/d17, d20/d21, d24/d25, d28/d29. */
  .macro INIT4x4
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d24, xzr
      fmov    d25, xzr
      fmov    d28, xzr
      fmov    d29, xzr
  .endm
  /*
   * KERNEL4x4_SUB: one k-step of the 4x4 tile.
   * Loads 4 doubles of B (v8-v9) and 4 doubles of A (v0-v1), advancing both
   * pointers by 32 bytes, and accumulates into v16/v17, v20/v21, v24/v25,
   * v28/v29. The fmla sequence keeps the original interleaved order.
   */
  .macro KERNEL4x4_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v29.2d, v1.2d, v9.d[1]

      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v25.2d, v1.2d, v9.d[0]

      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v21.2d, v1.2d, v8.d[1]

      fmla    v28.2d, v0.2d, v9.d[1]
      fmla    v17.2d, v1.2d, v8.d[0]
  .endm
  /*
   * SAVE4x4: scale the 4x4 accumulators by alpha and store them to C
   * (overwrite, no read of C). Destinations are pCRow0 and the three addresses
   * LDC apart after it. pCRow0 ends 32 bytes further on; pCRow1/pCRow2 are
   * scratch; v8, v9, v12, v13 are clobbered.
   */
  .macro SAVE4x4
      // Derive the second destination before pCRow0 is post-incremented
      add     pCRow1, pCRow0, LDC

      fmul    v8.2d, v16.2d, alphaV0
      fmul    v9.2d, v17.2d, alphaV1
      st1     {v8.2d, v9.2d}, [pCRow0], #32

      fmul    v12.2d, v20.2d, alphaV2
      fmul    v13.2d, v21.2d, alphaV3
      st1     {v12.2d, v13.2d}, [pCRow1]

      add     pCRow2, pCRow1, LDC

      fmul    v8.2d, v24.2d, alphaV0
      fmul    v9.2d, v25.2d, alphaV1
      st1     {v8.2d, v9.2d}, [pCRow2]

      add     pCRow1, pCRow2, LDC

      fmul    v12.2d, v28.2d, alphaV2
      fmul    v13.2d, v29.2d, alphaV3
      st1     {v12.2d, v13.2d}, [pCRow1]
  .endm
  334. /******************************************************************************/
  /* INIT2x4: zero the four 2x4 accumulators d16, d20, d24, d28. */
  .macro INIT2x4
      fmov    d16, xzr
      fmov    d20, xzr
      fmov    d24, xzr
      fmov    d28, xzr
  .endm
  /*
   * KERNEL2x4_SUB: one k-step of the 2x4 tile.
   * Loads 4 doubles of B (v8-v9, pB += 32) and 2 doubles of A (v0, pA += 16),
   * then accumulates A times each B element into v16, v20, v24, v28.
   */
  .macro KERNEL2x4_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d}, [pA], #16

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v28.2d, v0.2d, v9.d[1]
  .endm
  /*
   * SAVE2x4: scale the 2x4 accumulators by alpha and store them to C
   * (overwrite). One 16-byte vector goes to each of pCRow0 and the three
   * addresses LDC apart after it. pCRow0 ends 16 bytes further on;
   * pCRow1/pCRow2 are scratch; v8 and v12 are clobbered.
   */
  .macro SAVE2x4
      // Derive the second destination before pCRow0 is post-incremented
      add     pCRow1, pCRow0, LDC

      fmul    v8.2d, v16.2d, alphaV0
      st1     {v8.2d}, [pCRow0], #16

      fmul    v12.2d, v20.2d, alphaV1
      st1     {v12.2d}, [pCRow1]

      add     pCRow2, pCRow1, LDC

      fmul    v8.2d, v24.2d, alphaV2
      st1     {v8.2d}, [pCRow2]

      add     pCRow1, pCRow2, LDC

      fmul    v12.2d, v28.2d, alphaV3
      st1     {v12.2d}, [pCRow1]
  .endm
  365. /******************************************************************************/
  /* INIT1x4: zero the two 1x4 accumulators d16 and d20. */
  .macro INIT1x4
      fmov    d16, xzr
      fmov    d20, xzr
  .endm
  /*
   * KERNEL1x4_SUB: one k-step of the 1x4 tile.
   * Loads a single A value (d0, pA += 8) and 4 doubles of B (v8-v9, pB += 32);
   * v16 accumulates A * B[0..1] and v20 accumulates A * B[2..3].
   */
  .macro KERNEL1x4_SUB
      ldr     d0, [pA], #8
      ld1     {v8.2d, v9.2d}, [pB], #32

      fmla    v16.2d, v8.2d, v0.d[0]
      fmla    v20.2d, v9.2d, v0.d[0]
  .endm
  /*
   * SAVE1x4: scale the 1x4 accumulators by alpha and store one double to each
   * of four destinations: pCRow0 and the three addresses LDC apart after it.
   * C is overwritten. pCRow0 ends 8 bytes further on; pCRow1/pCRow2 are
   * scratch; v8 and v12 are clobbered.
   */
  .macro SAVE1x4
      add     pCRow1, pCRow0, LDC

      fmul    v8.2d, v16.2d, alphaV0
      st1     {v8.d}[0], [pCRow0], #8
      st1     {v8.d}[1], [pCRow1]

      add     pCRow2, pCRow1, LDC
      add     pCRow1, pCRow2, LDC

      fmul    v12.2d, v20.2d, alphaV1
      st1     {v12.d}[0], [pCRow2]
      st1     {v12.d}[1], [pCRow1]
  .endm
  390. /******************************************************************************/
  /* INIT8x2: zero d16-d23, the eight accumulators of the 8x2 tile. */
  .macro INIT8x2
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d18, xzr
      fmov    d19, xzr
      fmov    d20, xzr
      fmov    d21, xzr
      fmov    d22, xzr
      fmov    d23, xzr
  .endm
  /*
   * KERNEL8x2_SUB: one k-step of the 8x2 tile.
   * Loads 8 doubles of A (v0-v3, pA += 64) and 2 doubles of B (v8, pB += 16);
   * v16-v19 accumulate A * B[0] and v20-v23 accumulate A * B[1].
   */
  .macro KERNEL8x2_SUB
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v8.2d}, [pB], #16
      ld1     {v2.2d, v3.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v18.2d, v2.2d, v8.d[0]
      fmla    v19.2d, v3.2d, v8.d[0]

      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v21.2d, v1.2d, v8.d[1]
      fmla    v22.2d, v2.2d, v8.d[1]
      fmla    v23.2d, v3.2d, v8.d[1]
  .endm
  /*
   * SAVE8x2: scale the 8x2 accumulators by alpha and store them to C
   * (overwrite). v16-v19 go to pCRow0 and v20-v23 to pCRow0 + LDC.
   * pCRow0 ends 64 bytes further on; pCRow1 is scratch; v0-v7 are clobbered.
   */
  .macro SAVE8x2
      add     pCRow1, pCRow0, LDC

      fmul    v0.2d, v16.2d, alphaV0
      fmul    v1.2d, v17.2d, alphaV1
      fmul    v2.2d, v18.2d, alphaV2
      fmul    v3.2d, v19.2d, alphaV3
      st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0], #64

      fmul    v4.2d, v20.2d, alphaV0
      fmul    v5.2d, v21.2d, alphaV1
      fmul    v6.2d, v22.2d, alphaV2
      fmul    v7.2d, v23.2d, alphaV3
      st1     {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
  .endm
  431. /******************************************************************************/
  /* INIT4x2: zero the four 4x2 accumulators d16, d17, d20, d21. */
  .macro INIT4x2
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d20, xzr
      fmov    d21, xzr
  .endm
  /*
   * KERNEL4x2_SUB: one k-step of the 4x2 tile.
   * Loads 2 doubles of B (v8, pB += 16) and 4 doubles of A (v0-v1, pA += 32);
   * v16/v17 accumulate A * B[0] and v20/v21 accumulate A * B[1].
   */
  .macro KERNEL4x2_SUB
      ld1     {v8.2d}, [pB], #16
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]

      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v21.2d, v1.2d, v8.d[1]
  .endm
  /*
   * SAVE4x2: scale the 4x2 accumulators by alpha and store them to C
   * (overwrite). v16/v17 go to pCRow0 and v20/v21 to pCRow0 + LDC.
   * pCRow0 ends 32 bytes further on; pCRow1 is scratch; v8, v9, v12, v13 are
   * clobbered.
   */
  .macro SAVE4x2
      // Derive the second destination before pCRow0 is post-incremented
      add     pCRow1, pCRow0, LDC

      fmul    v8.2d, v16.2d, alphaV0
      fmul    v9.2d, v17.2d, alphaV1
      st1     {v8.2d, v9.2d}, [pCRow0], #32

      fmul    v12.2d, v20.2d, alphaV2
      fmul    v13.2d, v21.2d, alphaV3
      st1     {v12.2d, v13.2d}, [pCRow1]
  .endm
  458. /******************************************************************************/
  /* INIT2x2: zero the two 2x2 accumulators d16 and d20. */
  .macro INIT2x2
      fmov    d16, xzr
      fmov    d20, xzr
  .endm
  /*
   * KERNEL2x2_SUB: one k-step of the 2x2 tile.
   * Loads 2 doubles of B (v8) and 2 doubles of A (v0), advancing both pointers
   * by 16 bytes; v16 accumulates A * B[0] and v20 accumulates A * B[1].
   */
  .macro KERNEL2x2_SUB
      ld1     {v8.2d}, [pB], #16
      ld1     {v0.2d}, [pA], #16

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v20.2d, v0.2d, v8.d[1]
  .endm
  /*
   * SAVE2x2: scale the 2x2 accumulators by alpha and store them to C
   * (overwrite). v16 goes to pCRow0 and v20 to pCRow0 + LDC.
   * pCRow0 ends 16 bytes further on; pCRow1 is scratch; v8, v12 are clobbered.
   */
  .macro SAVE2x2
      // Derive the second destination before pCRow0 is post-incremented
      add     pCRow1, pCRow0, LDC

      fmul    v8.2d, v16.2d, alphaV0
      st1     {v8.2d}, [pCRow0], #16

      fmul    v12.2d, v20.2d, alphaV1
      st1     {v12.2d}, [pCRow1]
  .endm
  479. /******************************************************************************/
  /* INIT1x2: zero d16, the single accumulator of the 1x2 tile. */
  .macro INIT1x2
      fmov    d16, xzr
  .endm
  /*
   * KERNEL1x2_SUB: one k-step of the 1x2 tile.
   * Loads 2 doubles of B (v8, pB += 16) and one A value (d0, pA += 8);
   * v16 accumulates A * B[0..1].
   */
  .macro KERNEL1x2_SUB
      ld1     {v8.2d}, [pB], #16
      ldr     d0, [pA], #8

      fmla    v16.2d, v8.2d, v0.d[0]
  .endm
  /*
   * SAVE1x2: scale v16 by alpha and store its two lanes to C (overwrite):
   * lane 0 to pCRow0 and lane 1 to pCRow0 + LDC. pCRow0 ends 8 bytes further
   * on; pCRow1 is scratch; v8 is clobbered.
   */
  .macro SAVE1x2
      add     pCRow1, pCRow0, LDC

      fmul    v8.2d, v16.2d, alphaV0
      st1     {v8.d}[0], [pCRow0], #8
      st1     {v8.d}[1], [pCRow1]
  .endm
  497. /******************************************************************************/
  /* INIT8x1: zero d16-d19, the four accumulators of the 8x1 tile. */
  .macro INIT8x1
      fmov    d16, xzr
      fmov    d17, xzr
      fmov    d18, xzr
      fmov    d19, xzr
  .endm
  /*
   * KERNEL8x1_SUB: one k-step of the 8x1 tile.
   * Loads 8 doubles of A (v0-v3, pA += 64) and one B value (d8, pB += 8);
   * v16-v19 accumulate A * B.
   */
  .macro KERNEL8x1_SUB
      ld1     {v0.2d, v1.2d}, [pA], #32
      ldr     d8, [pB], #8
      ld1     {v2.2d, v3.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v18.2d, v2.2d, v8.d[0]
      fmla    v19.2d, v3.2d, v8.d[0]
  .endm
  /*
   * SAVE8x1: scale v16-v19 by alpha and store the 8 doubles at pCRow0
   * (overwrite). pCRow0 ends 64 bytes further on; v0-v3 are clobbered.
   */
  .macro SAVE8x1
      fmul    v0.2d, v16.2d, alphaV0
      fmul    v1.2d, v17.2d, alphaV1
      fmul    v2.2d, v18.2d, alphaV2
      fmul    v3.2d, v19.2d, alphaV3
      st1     {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0], #64
  .endm
  524. /******************************************************************************/
  /* INIT4x1: zero the two 4x1 accumulators d16 and d17. */
  .macro INIT4x1
      fmov    d16, xzr
      fmov    d17, xzr
  .endm
  /*
   * KERNEL4x1_SUB: one k-step of the 4x1 tile.
   * Loads one B value (d8, pB += 8) and 4 doubles of A (v0-v1, pA += 32);
   * v16/v17 accumulate A * B.
   */
  .macro KERNEL4x1_SUB
      ldr     d8, [pB], #8
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
  .endm
  /*
   * SAVE4x1: scale v16/v17 by alpha and store the 4 doubles at pCRow0
   * (overwrite). pCRow0 ends 32 bytes further on; v8 and v9 are clobbered.
   */
  .macro SAVE4x1
      fmul    v8.2d, v16.2d, alphaV0
      fmul    v9.2d, v17.2d, alphaV1
      st1     {v8.2d, v9.2d}, [pCRow0], #32
  .endm
  543. /******************************************************************************/
  /* INIT2x1: zero d16, the single accumulator of the 2x1 tile. */
  .macro INIT2x1
      fmov    d16, xzr
  .endm
  /*
   * KERNEL2x1_SUB: one k-step of the 2x1 tile.
   * Loads one B value (d8, pB += 8) and 2 doubles of A (v0, pA += 16);
   * v16 accumulates A * B.
   */
  .macro KERNEL2x1_SUB
      ldr     d8, [pB], #8
      ld1     {v0.2d}, [pA], #16

      fmla    v16.2d, v0.2d, v8.d[0]
  .endm
  /*
   * SAVE2x1: scale v16 by alpha and store the 2 doubles at pCRow0 (overwrite).
   * pCRow0 ends 16 bytes further on; v8 is clobbered.
   */
  .macro SAVE2x1
      fmul    v8.2d, v16.2d, alphaV0
      st1     {v8.2d}, [pCRow0], #16
  .endm
  559. /******************************************************************************/
  /* INIT1x1: zero d16, the scalar accumulator of the 1x1 tile. */
  .macro INIT1x1
      fmov    d16, xzr
  .endm
  /*
   * KERNEL1x1_SUB: one scalar k-step.
   * Loads one B value (d8) and one A value (d0), advancing both pointers by
   * 8 bytes, and computes d16 = d0 * d8 + d16 with a fused multiply-add.
   */
  .macro KERNEL1x1_SUB
      ldr     d8, [pB], #8
      ldr     d0, [pA], #8

      fmadd   d16, d0, d8, d16
  .endm
  /*
   * SAVE1x1: scale d16 by alpha0 and store the result at pCRow0 (overwrite).
   * pCRow0 ends 8 bytes further on; d8 is clobbered.
   */
  .macro SAVE1x1
      fmul    d8, d16, alpha0
      str     d8, [pCRow0], #8
  .endm
  575. /*******************************************************************************
  576. * End of macro definitions
  577. *******************************************************************************/
  578. PROLOGUE
  579. .align 5
  580. add sp, sp, #-(11 * 16)
  581. stp d8, d9, [sp, #(0 * 16)]
  582. stp d10, d11, [sp, #(1 * 16)]
  583. stp d12, d13, [sp, #(2 * 16)]
  584. stp d14, d15, [sp, #(3 * 16)]
  585. stp d16, d17, [sp, #(4 * 16)]
  586. stp x18, x19, [sp, #(5 * 16)]
  587. stp x20, x21, [sp, #(6 * 16)]
  588. stp x22, x23, [sp, #(7 * 16)]
  589. stp x24, x25, [sp, #(8 * 16)]
  590. stp x26, x27, [sp, #(9 * 16)]
  591. str x28, [sp, #(10 * 16)]
  592. fmov alpha0, d0
  593. fmov alpha1, d0
  594. fmov alpha2, d0
  595. fmov alpha3, d0
  596. lsl LDC, LDC, #3 // ldc = ldc * 8
  597. #if !defined(LEFT)
  598. neg tempOffset, offset
  599. #endif
  600. mov pB, origPB
  601. mov counterJ, origN
  602. asr counterJ, counterJ, #2 // J = J / 4
  603. cmp counterJ, #0
  604. ble dtrmm_kernel_L2_BEGIN
  605. /******************************************************************************/
  606. dtrmm_kernel_L4_BEGIN:
  607. mov pCRow0, pC // pCRow0 = C
  608. add pC, pC, LDC, lsl #2
  609. #if defined(LEFT)
  610. mov tempOffset, offset
  611. #endif
  612. mov pA, origPA // pA = start of A array
  613. dtrmm_kernel_L4_M8_BEGIN:
  614. mov counterI, origM
  615. asr counterI, counterI, #3 // counterI = counterI / 8
  616. cmp counterI, #0
  617. ble dtrmm_kernel_L4_M4_BEGIN
  618. dtrmm_kernel_L4_M8_20:
  619. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  620. mov pB, origPB
  621. #else
  622. mov pB, origPB
  623. lsl temp, tempOffset, #6
  624. add pA, pA, temp
  625. lsl temp, tempOffset, #5
  626. add pB, pB, temp
  627. #endif
  628. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  629. sub tempK, origK, tempOffset
  630. #elif defined(LEFT)
  631. add tempK, tempOffset, #8
  632. #else
  633. add tempK, tempOffset, #4
  634. #endif
  635. asr counterL , tempK, #1 // L = K / 2
  636. cmp counterL , #2 // is there at least 4 to do?
  637. blt dtrmm_kernel_L4_M8_32
  638. KERNEL8x4_I // do one in the K
  639. KERNEL8x4_M2 // do another in the K
  640. subs counterL, counterL, #2 // subtract 2
  641. ble dtrmm_kernel_L4_M8_22a
  642. .align 5
  643. dtrmm_kernel_L4_M8_22:
  644. KERNEL8x4_M1
  645. KERNEL8x4_M2
  646. subs counterL, counterL, #1
  647. bgt dtrmm_kernel_L4_M8_22
  648. dtrmm_kernel_L4_M8_22a:
  649. KERNEL8x4_M1
  650. KERNEL8x4_E
  651. b dtrmm_kernel_L4_M8_44
  652. dtrmm_kernel_L4_M8_32:
  653. tst counterL, #1
  654. ble dtrmm_kernel_L4_M8_40
  655. KERNEL8x4_I
  656. KERNEL8x4_E
  657. b dtrmm_kernel_L4_M8_44
  658. dtrmm_kernel_L4_M8_40:
  659. INIT8x4
  660. dtrmm_kernel_L4_M8_44:
  661. ands counterL , tempK, #1
  662. ble dtrmm_kernel_L4_M8_100
  663. dtrmm_kernel_L4_M8_46:
  664. KERNEL8x4_SUB
  665. dtrmm_kernel_L4_M8_100:
  666. SAVE8x4
  667. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  668. sub tempK, origK, tempOffset
  669. #if defined(LEFT)
  670. sub tempK, tempK, #8
  671. #else
  672. sub tempK, tempK, #4
  673. #endif
  674. lsl temp, tempK, #6
  675. add pA, pA, temp
  676. lsl temp, tempK, #5
  677. add pB, pB, temp
  678. #endif
  679. #if defined(LEFT)
  680. add tempOffset, tempOffset, #8
  681. #endif
  682. dtrmm_kernel_L4_M8_END:
  683. subs counterI, counterI, #1
  684. bne dtrmm_kernel_L4_M8_20
  685. dtrmm_kernel_L4_M4_BEGIN:
  686. mov counterI, origM
  687. tst counterI , #7
  688. ble dtrmm_kernel_L4_END
  689. tst counterI, #4
  690. ble dtrmm_kernel_L4_M2_BEGIN
  691. dtrmm_kernel_L4_M4_20:
  692. INIT4x4
  693. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  694. mov pB, origPB
  695. #else
  696. mov pB, origPB
  697. lsl temp, tempOffset, #5
  698. add pB, pB, temp
  699. add pA, pA, temp
  700. #endif
  701. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  702. sub tempK, origK, tempOffset
  703. #elif defined(LEFT)
  704. add tempK, tempOffset, #4
  705. #else
  706. add tempK, tempOffset, #4
  707. #endif
  708. asr counterL , tempK, #3 // counterL = counterL / 8
  709. cmp counterL , #0
  710. ble dtrmm_kernel_L4_M4_40
  711. dtrmm_kernel_L4_M4_22:
  712. KERNEL4x4_SUB
  713. KERNEL4x4_SUB
  714. KERNEL4x4_SUB
  715. KERNEL4x4_SUB
  716. KERNEL4x4_SUB
  717. KERNEL4x4_SUB
  718. KERNEL4x4_SUB
  719. KERNEL4x4_SUB
  720. subs counterL, counterL, #1
  721. bgt dtrmm_kernel_L4_M4_22
  722. dtrmm_kernel_L4_M4_40:
  723. ands counterL , tempK, #7 // counterL = counterL % 8
  724. ble dtrmm_kernel_L4_M4_100
  725. dtrmm_kernel_L4_M4_42:
  726. KERNEL4x4_SUB
  727. subs counterL, counterL, #1
  728. bgt dtrmm_kernel_L4_M4_42
  729. dtrmm_kernel_L4_M4_100:
  730. SAVE4x4
  731. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  732. sub tempK, origK, tempOffset
  733. #if defined(LEFT)
  734. sub tempK, tempK, #4
  735. #else
  736. sub tempK, tempK, #4
  737. #endif
  738. lsl temp, tempK, #5
  739. add pA, pA, temp
  740. add pB, pB, temp
  741. #endif
  742. #if defined(LEFT)
  743. add tempOffset, tempOffset, #4
  744. #endif
  745. dtrmm_kernel_L4_M4_END:
  746. dtrmm_kernel_L4_M2_BEGIN:
  747. mov counterI, origM
  748. tst counterI , #3
  749. ble dtrmm_kernel_L4_END
  750. tst counterI, #2 // is bit 1 of M set (a 2-wide block of M remains)?
  751. ble dtrmm_kernel_L4_M1_BEGIN
  752. dtrmm_kernel_L4_M2_20:
  753. INIT2x4
  754. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  755. mov pB, origPB
  756. #else
  757. mov pB, origPB
  758. lsl temp, tempOffset, #4
  759. add pA, pA, temp
  760. lsl temp, tempOffset, #5
  761. add pB, pB, temp
  762. #endif
  763. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  764. sub tempK, origK, tempOffset
  765. #elif defined(LEFT)
  766. add tempK, tempOffset, #2
  767. #else
  768. add tempK, tempOffset, #4
  769. #endif
  770. asr counterL , tempK, #3 // counterL = counterL / 8
  771. cmp counterL , #0
  772. ble dtrmm_kernel_L4_M2_40
  773. dtrmm_kernel_L4_M2_22:
  774. KERNEL2x4_SUB
  775. KERNEL2x4_SUB
  776. KERNEL2x4_SUB
  777. KERNEL2x4_SUB
  778. KERNEL2x4_SUB
  779. KERNEL2x4_SUB
  780. KERNEL2x4_SUB
  781. KERNEL2x4_SUB
  782. subs counterL, counterL, #1
  783. bgt dtrmm_kernel_L4_M2_22
  784. dtrmm_kernel_L4_M2_40:
  785. ands counterL , tempK, #7 // counterL = counterL % 8
  786. ble dtrmm_kernel_L4_M2_100
  787. dtrmm_kernel_L4_M2_42:
  788. KERNEL2x4_SUB
  789. subs counterL, counterL, #1
  790. bgt dtrmm_kernel_L4_M2_42
  791. dtrmm_kernel_L4_M2_100:
  792. SAVE2x4
  793. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  794. sub tempK, origK, tempOffset
  795. #if defined(LEFT)
  796. sub tempK, tempK, #2
  797. #else
  798. sub tempK, tempK, #4
  799. #endif
  800. lsl temp, tempK, #4
  801. add pA, pA, temp
  802. lsl temp, tempK, #5
  803. add pB, pB, temp
  804. #endif
  805. #if defined(LEFT)
  806. add tempOffset, tempOffset, #2
  807. #endif
  808. dtrmm_kernel_L4_M2_END:
  809. dtrmm_kernel_L4_M1_BEGIN:
  810. tst counterI, #1 // counterI = counterI % 2
  811. ble dtrmm_kernel_L4_END
  812. dtrmm_kernel_L4_M1_20:
  813. INIT1x4
  814. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  815. mov pB, origPB
  816. #else
  817. mov pB, origPB
  818. lsl temp, tempOffset, #5
  819. add pB, pB, temp
  820. lsl temp, tempOffset, #3
  821. add pA, pA, temp
  822. #endif
  823. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  824. sub tempK, origK, tempOffset
  825. #elif defined(LEFT)
  826. add tempK, tempOffset, #1
  827. #else
  828. add tempK, tempOffset, #4
  829. #endif
  830. asr counterL , tempK, #3 // counterL = counterL / 8
  831. cmp counterL , #0
  832. ble dtrmm_kernel_L4_M1_40
  833. dtrmm_kernel_L4_M1_22:
  834. KERNEL1x4_SUB
  835. KERNEL1x4_SUB
  836. KERNEL1x4_SUB
  837. KERNEL1x4_SUB
  838. KERNEL1x4_SUB
  839. KERNEL1x4_SUB
  840. KERNEL1x4_SUB
  841. KERNEL1x4_SUB
  842. subs counterL, counterL, #1
  843. bgt dtrmm_kernel_L4_M1_22
  844. dtrmm_kernel_L4_M1_40:
  845. ands counterL , tempK, #7 // counterL = counterL % 8
  846. ble dtrmm_kernel_L4_M1_100
  847. dtrmm_kernel_L4_M1_42:
  848. KERNEL1x4_SUB
  849. subs counterL, counterL, #1
  850. bgt dtrmm_kernel_L4_M1_42
  851. dtrmm_kernel_L4_M1_100:
  852. SAVE1x4
  853. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  854. sub tempK, origK, tempOffset
  855. #if defined(LEFT)
  856. sub tempK, tempK, #1
  857. #else
  858. sub tempK, tempK, #4
  859. #endif
  860. lsl temp, tempK, #3
  861. add pA, pA, temp
  862. lsl temp, tempK, #5
  863. add pB, pB, temp
  864. #endif
  865. #if defined(LEFT)
  866. add tempOffset, tempOffset, #1
  867. #endif
  868. dtrmm_kernel_L4_END:
  869. lsl temp, origK, #5
  870. add origPB, origPB, temp // B = B + K * 4 * 8
  871. #if !defined(LEFT)
  872. add tempOffset, tempOffset, #4
  873. #endif
  874. subs counterJ, counterJ , #1 // j--
  875. bgt dtrmm_kernel_L4_BEGIN
  876. /******************************************************************************/
  877. dtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  878. mov counterJ , origN
  879. tst counterJ , #3
  880. ble dtrmm_kernel_L999 // N % 4 == 0: nothing left in N direction
  881. tst counterJ , #2
  882. ble dtrmm_kernel_L1_BEGIN
  883. mov pCRow0, pC // pCRow0 = pC
  884. add pC,pC,LDC, lsl #1
  885. #if defined(LEFT)
  886. mov tempOffset, offset
  887. #endif
  888. mov pA, origPA // pA = A
  889. dtrmm_kernel_L2_M8_BEGIN:
  890. mov counterI, origM
  891. asr counterI, counterI, #3 // counterI = counterI / 8
  892. cmp counterI, #0
  893. ble dtrmm_kernel_L2_M4_BEGIN
  894. dtrmm_kernel_L2_M8_20:
  895. INIT8x2
  896. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  897. mov pB, origPB
  898. #else
  899. mov pB, origPB
  900. lsl temp, tempOffset, #6
  901. add pA, pA, temp
  902. lsl temp, tempOffset, #4
  903. add pB, pB, temp
  904. #endif
  905. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  906. sub tempK, origK, tempOffset
  907. #elif defined(LEFT)
  908. add tempK, tempOffset, #8
  909. #else
  910. add tempK, tempOffset, #2
  911. #endif
  912. asr counterL , tempK, #3 // counterL = counterL / 8
  913. cmp counterL,#0
  914. ble dtrmm_kernel_L2_M8_40
  915. .align 5
  916. dtrmm_kernel_L2_M8_22:
  917. KERNEL8x2_SUB
  918. KERNEL8x2_SUB
  919. KERNEL8x2_SUB
  920. KERNEL8x2_SUB
  921. KERNEL8x2_SUB
  922. KERNEL8x2_SUB
  923. KERNEL8x2_SUB
  924. KERNEL8x2_SUB
  925. subs counterL, counterL, #1
  926. bgt dtrmm_kernel_L2_M8_22
  927. dtrmm_kernel_L2_M8_40:
  928. ands counterL , tempK, #7 // counterL = counterL % 8
  929. ble dtrmm_kernel_L2_M8_100
  930. dtrmm_kernel_L2_M8_42:
  931. KERNEL8x2_SUB
  932. subs counterL, counterL, #1
  933. bgt dtrmm_kernel_L2_M8_42
  934. dtrmm_kernel_L2_M8_100:
  935. SAVE8x2
  936. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  937. sub tempK, origK, tempOffset
  938. #if defined(LEFT)
  939. sub tempK, tempK, #8
  940. #else
  941. sub tempK, tempK, #2
  942. #endif
  943. lsl temp, tempK, #6
  944. add pA, pA, temp
  945. lsl temp, tempK, #4
  946. add pB, pB, temp
  947. #endif
  948. #if defined(LEFT)
  949. add tempOffset, tempOffset, #8
  950. #endif
  951. dtrmm_kernel_L2_M8_END:
  952. subs counterI, counterI, #1
  953. bgt dtrmm_kernel_L2_M8_20
  954. dtrmm_kernel_L2_M4_BEGIN:
  955. mov counterI, origM
  956. tst counterI , #7
  957. ble dtrmm_kernel_L2_END
  958. tst counterI, #4 // is bit 2 of M set (a 4-wide block of M remains)?
  959. ble dtrmm_kernel_L2_M2_BEGIN
  960. dtrmm_kernel_L2_M4_20:
  961. INIT4x2
  962. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  963. mov pB, origPB
  964. #else
  965. mov pB, origPB
  966. lsl temp, tempOffset, #4
  967. add pB, pB, temp
  968. lsl temp, tempOffset, #5
  969. add pA, pA, temp
  970. #endif
  971. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  972. sub tempK, origK, tempOffset
  973. #elif defined(LEFT)
  974. add tempK, tempOffset, #4
  975. #else
  976. add tempK, tempOffset, #2
  977. #endif
  978. asr counterL , tempK, #3 // counterL = counterL / 8
  979. cmp counterL,#0
  980. ble dtrmm_kernel_L2_M4_40
  981. .align 5
  982. dtrmm_kernel_L2_M4_22:
  983. KERNEL4x2_SUB
  984. KERNEL4x2_SUB
  985. KERNEL4x2_SUB
  986. KERNEL4x2_SUB
  987. KERNEL4x2_SUB
  988. KERNEL4x2_SUB
  989. KERNEL4x2_SUB
  990. KERNEL4x2_SUB
  991. subs counterL, counterL, #1
  992. bgt dtrmm_kernel_L2_M4_22
  993. dtrmm_kernel_L2_M4_40:
  994. ands counterL , tempK, #7 // counterL = counterL % 8
  995. ble dtrmm_kernel_L2_M4_100
  996. dtrmm_kernel_L2_M4_42:
  997. KERNEL4x2_SUB
  998. subs counterL, counterL, #1
  999. bgt dtrmm_kernel_L2_M4_42
  1000. dtrmm_kernel_L2_M4_100:
  1001. SAVE4x2
  1002. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1003. sub tempK, origK, tempOffset
  1004. #if defined(LEFT)
  1005. sub tempK, tempK, #4
  1006. #else
  1007. sub tempK, tempK, #2
  1008. #endif
  1009. lsl temp, tempK, #5
  1010. add pA, pA, temp
  1011. lsl temp, tempK, #4
  1012. add pB, pB, temp
  1013. #endif
  1014. #if defined(LEFT)
  1015. add tempOffset, tempOffset, #4
  1016. #endif
  1017. dtrmm_kernel_L2_M4_END:
  1018. dtrmm_kernel_L2_M2_BEGIN:
  1019. mov counterI, origM
  1020. tst counterI , #3
  1021. ble dtrmm_kernel_L2_END
  1022. tst counterI, #2 // is bit 1 of M set (a 2-wide block of M remains)?
  1023. ble dtrmm_kernel_L2_M1_BEGIN
  1024. dtrmm_kernel_L2_M2_20:
  1025. INIT2x2
  1026. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1027. mov pB, origPB
  1028. #else
  1029. mov pB, origPB
  1030. lsl temp, tempOffset, #4
  1031. add pB, pB, temp
  1032. lsl temp, tempOffset, #4
  1033. add pA, pA, temp
  1034. #endif
  1035. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1036. sub tempK, origK, tempOffset
  1037. #elif defined(LEFT)
  1038. add tempK, tempOffset, #2
  1039. #else
  1040. add tempK, tempOffset, #2
  1041. #endif
  1042. asr counterL , tempK, #3 // counterL = counterL / 8
  1043. cmp counterL,#0
  1044. ble dtrmm_kernel_L2_M2_40
  1045. dtrmm_kernel_L2_M2_22:
  1046. KERNEL2x2_SUB
  1047. KERNEL2x2_SUB
  1048. KERNEL2x2_SUB
  1049. KERNEL2x2_SUB
  1050. KERNEL2x2_SUB
  1051. KERNEL2x2_SUB
  1052. KERNEL2x2_SUB
  1053. KERNEL2x2_SUB
  1054. subs counterL, counterL, #1
  1055. bgt dtrmm_kernel_L2_M2_22
  1056. dtrmm_kernel_L2_M2_40:
  1057. ands counterL , tempK, #7 // counterL = counterL % 8
  1058. ble dtrmm_kernel_L2_M2_100
  1059. dtrmm_kernel_L2_M2_42:
  1060. KERNEL2x2_SUB
  1061. subs counterL, counterL, #1
  1062. bgt dtrmm_kernel_L2_M2_42
  1063. dtrmm_kernel_L2_M2_100:
  1064. SAVE2x2
  1065. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1066. sub tempK, origK, tempOffset
  1067. #if defined(LEFT)
  1068. sub tempK, tempK, #2
  1069. #else
  1070. sub tempK, tempK, #2
  1071. #endif
  1072. lsl temp, tempK, #4
  1073. add pA, pA, temp
  1074. lsl temp, tempK, #4
  1075. add pB, pB, temp
  1076. #endif
  1077. #if defined(LEFT)
  1078. add tempOffset, tempOffset, #2
  1079. #endif
  1080. dtrmm_kernel_L2_M2_END:
  1081. dtrmm_kernel_L2_M1_BEGIN: // section for the (origM & 1) remainder; counterI was loaded from origM at dtrmm_kernel_L2_M2_BEGIN
  1082. tst counterI, #1 // test bit 0 of counterI; counterI itself is unchanged
  1083. ble dtrmm_kernel_L2_END // bit 0 clear: nothing left to do in this section
  1084. dtrmm_kernel_L2_M1_20:
  1085. INIT1x2 // macro defined outside this view; presumably resets the 1x2 accumulators - TODO confirm
  1086. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1087. mov pB, origPB // pB = origPB, no offset applied in this configuration
  1088. #else
  1089. mov pB, origPB
  1090. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1091. add pB, pB, temp // pB = origPB + tempOffset * 16
  1092. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1093. add pA, pA, temp // pA += tempOffset * 8
  1094. #endif
  1095. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1096. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1097. #elif defined(LEFT)
  1098. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1099. #else
  1100. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1101. #endif
  1102. asr counterL , tempK, #3 // counterL = tempK / 8 (passes through the unrolled loop)
  1103. cmp counterL, #0
  1104. ble dtrmm_kernel_L2_M1_40 // fewer than 8 steps: go straight to the remainder loop
  1105. dtrmm_kernel_L2_M1_22:
  1106. KERNEL1x2_SUB // eight back-to-back invocations per pass, matching counterL = tempK / 8
  1107. KERNEL1x2_SUB
  1108. KERNEL1x2_SUB
  1109. KERNEL1x2_SUB
  1110. KERNEL1x2_SUB
  1111. KERNEL1x2_SUB
  1112. KERNEL1x2_SUB
  1113. KERNEL1x2_SUB
  1114. subs counterL, counterL, #1 // counterL--, setting flags for the branch
  1115. bgt dtrmm_kernel_L2_M1_22 // repeat while counterL > 0
  1116. dtrmm_kernel_L2_M1_40:
  1117. ands counterL , tempK, #7 // counterL = tempK % 8 (leftover steps), flags set
  1118. ble dtrmm_kernel_L2_M1_100 // no leftover steps
  1119. dtrmm_kernel_L2_M1_42:
  1120. KERNEL1x2_SUB // one invocation per leftover step
  1121. subs counterL, counterL, #1
  1122. bgt dtrmm_kernel_L2_M1_42
  1123. dtrmm_kernel_L2_M1_100:
  1124. SAVE1x2 // macro defined outside this view; presumably stores the 1x2 result - TODO confirm
  1125. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1126. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1127. #if defined(LEFT)
  1128. sub tempK, tempK, #1 // tempK = origK - tempOffset - 1
  1129. #else
  1130. sub tempK, tempK, #2 // tempK = origK - tempOffset - 2
  1131. #endif
  1132. lsl temp, tempK, #3 // temp = tempK * 8
  1133. add pA, pA, temp // pA += tempK * 8
  1134. lsl temp, tempK, #4 // temp = tempK * 16
  1135. add pB, pB, temp // pB += tempK * 16
  1136. #endif
  1137. #if defined(LEFT)
  1138. add tempOffset, tempOffset, #1 // tempOffset += 1 when LEFT is defined
  1139. #endif
  1140. dtrmm_kernel_L2_END: // common tail of the L2 sections: update tempOffset and origPB
  1141. #if !defined(LEFT)
  1142. add tempOffset, tempOffset, #2 // tempOffset += 2 only when LEFT is not defined
  1143. #endif
  1144. add origPB, origPB, origK, lsl #4 // origPB += origK * 16, i.e. B = B + K * 2 * 8
  1145. /******************************************************************************/
  1146. dtrmm_kernel_L1_BEGIN: // L1 sections run only when bit 0 of origN is set
  1147. mov counterJ , origN // counterJ = origN
  1148. tst counterJ , #1 // test bit 0 of counterJ; counterJ itself is unchanged
  1149. ble dtrmm_kernel_L999 // bit 0 clear: done, go to the exit path
  1150. mov pCRow0, pC // pCRow0 = C
  1151. add pC , pC , LDC // pC += LDC: advance pC to the next position
  1152. #if defined(LEFT)
  1153. mov tempOffset, offset // restart tempOffset from offset when LEFT is defined
  1154. #endif
  1155. mov pA, origPA // pA = A (restart from origPA)
  1156. dtrmm_kernel_L1_M8_BEGIN: // section repeated origM / 8 times
  1157. mov counterI, origM // counterI = origM
  1158. asr counterI, counterI, #3 // counterI = origM / 8 (iterations of the M8 loop)
  1159. cmp counterI, #0
  1160. ble dtrmm_kernel_L1_M4_BEGIN // origM / 8 <= 0: skip the M8 section
  1161. dtrmm_kernel_L1_M8_20: // top of the per-iteration M8 loop (see bgt after dtrmm_kernel_L1_M8_END)
  1162. INIT8x1 // macro defined outside this view; presumably resets the 8x1 accumulators - TODO confirm
  1163. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1164. mov pB, origPB // pB = origPB, no offset applied in this configuration
  1165. #else
  1166. mov pB, origPB
  1167. lsl temp, tempOffset, #6 // temp = tempOffset * 64
  1168. add pA, pA, temp // pA += tempOffset * 64
  1169. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1170. add pB, pB, temp // pB = origPB + tempOffset * 8
  1171. #endif
  1172. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1173. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1174. #elif defined(LEFT)
  1175. add tempK, tempOffset, #8 // tempK = tempOffset + 8
  1176. #else
  1177. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1178. #endif
  1179. asr counterL , tempK, #3 // counterL = tempK / 8 (passes through the unrolled loop)
  1180. cmp counterL , #0
  1181. ble dtrmm_kernel_L1_M8_40 // fewer than 8 steps: go straight to the remainder loop
  1182. .align 5 // align the loop entry below to a 2^5 = 32-byte boundary
  1183. dtrmm_kernel_L1_M8_22:
  1184. KERNEL8x1_SUB // eight back-to-back invocations per pass, matching counterL = tempK / 8
  1185. KERNEL8x1_SUB
  1186. KERNEL8x1_SUB
  1187. KERNEL8x1_SUB
  1188. KERNEL8x1_SUB
  1189. KERNEL8x1_SUB
  1190. KERNEL8x1_SUB
  1191. KERNEL8x1_SUB
  1192. subs counterL, counterL, #1 // counterL--, setting flags for the branch
  1193. bgt dtrmm_kernel_L1_M8_22 // repeat while counterL > 0
  1194. dtrmm_kernel_L1_M8_40:
  1195. ands counterL , tempK, #7 // counterL = tempK % 8 (leftover steps), flags set
  1196. ble dtrmm_kernel_L1_M8_100 // no leftover steps
  1197. dtrmm_kernel_L1_M8_42:
  1198. KERNEL8x1_SUB // one invocation per leftover step
  1199. subs counterL, counterL, #1
  1200. bgt dtrmm_kernel_L1_M8_42
  1201. dtrmm_kernel_L1_M8_100:
  1202. SAVE8x1 // macro defined outside this view; presumably stores the 8x1 result - TODO confirm
  1203. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1204. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1205. #if defined(LEFT)
  1206. sub tempK, tempK, #8 // tempK = origK - tempOffset - 8
  1207. #else
  1208. sub tempK, tempK, #1 // tempK = origK - tempOffset - 1
  1209. #endif
  1210. lsl temp, tempK, #6 // temp = tempK * 64
  1211. add pA, pA, temp // pA += tempK * 64
  1212. lsl temp, tempK, #3 // temp = tempK * 8
  1213. add pB, pB, temp // pB += tempK * 8
  1214. #endif
  1215. #if defined(LEFT)
  1216. add tempOffset, tempOffset, #8 // tempOffset += 8 when LEFT is defined
  1217. #endif
  1218. dtrmm_kernel_L1_M8_END:
  1219. subs counterI, counterI, #1 // counterI--, setting flags for the branch
  1220. bgt dtrmm_kernel_L1_M8_20 // another M8 iteration while counterI > 0
  1221. dtrmm_kernel_L1_M4_BEGIN: // section for the (origM & 4) remainder; pA is not reloaded here
  1222. mov counterI, origM // counterI = origM (only its low bits are tested below)
  1223. tst counterI , #7 // flags from counterI & 7; counterI itself is unchanged
  1224. ble dtrmm_kernel_L1_END // taken when (counterI & 7) == 0: no M4/M2/M1 remainder
  1225. tst counterI, #4 // test bit 2 of counterI (this is a bit test, not a division)
  1226. ble dtrmm_kernel_L1_M2_BEGIN // bit 2 clear: skip the M4 section
  1227. dtrmm_kernel_L1_M4_20:
  1228. INIT4x1 // macro defined outside this view; presumably resets the 4x1 accumulators - TODO confirm
  1229. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1230. mov pB, origPB // pB = origPB, no offset applied in this configuration
  1231. #else
  1232. mov pB, origPB
  1233. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1234. add pB, pB, temp // pB = origPB + tempOffset * 8
  1235. lsl temp, tempOffset, #5 // temp = tempOffset * 32
  1236. add pA, pA, temp // pA += tempOffset * 32
  1237. #endif
  1238. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1239. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1240. #elif defined(LEFT)
  1241. add tempK, tempOffset, #4 // tempK = tempOffset + 4
  1242. #else
  1243. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1244. #endif
  1245. asr counterL , tempK, #3 // counterL = tempK / 8 (passes through the unrolled loop)
  1246. cmp counterL , #0
  1247. ble dtrmm_kernel_L1_M4_40 // fewer than 8 steps: go straight to the remainder loop
  1248. .align 5 // align the loop entry below to a 2^5 = 32-byte boundary
  1249. dtrmm_kernel_L1_M4_22:
  1250. KERNEL4x1_SUB // eight back-to-back invocations per pass, matching counterL = tempK / 8
  1251. KERNEL4x1_SUB
  1252. KERNEL4x1_SUB
  1253. KERNEL4x1_SUB
  1254. KERNEL4x1_SUB
  1255. KERNEL4x1_SUB
  1256. KERNEL4x1_SUB
  1257. KERNEL4x1_SUB
  1258. subs counterL, counterL, #1 // counterL--, setting flags for the branch
  1259. bgt dtrmm_kernel_L1_M4_22 // repeat while counterL > 0
  1260. dtrmm_kernel_L1_M4_40:
  1261. ands counterL , tempK, #7 // counterL = tempK % 8 (leftover steps), flags set
  1262. ble dtrmm_kernel_L1_M4_100 // no leftover steps
  1263. dtrmm_kernel_L1_M4_42:
  1264. KERNEL4x1_SUB // one invocation per leftover step
  1265. subs counterL, counterL, #1
  1266. bgt dtrmm_kernel_L1_M4_42
  1267. dtrmm_kernel_L1_M4_100:
  1268. SAVE4x1 // macro defined outside this view; presumably stores the 4x1 result - TODO confirm
  1269. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1270. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1271. #if defined(LEFT)
  1272. sub tempK, tempK, #4 // tempK = origK - tempOffset - 4
  1273. #else
  1274. sub tempK, tempK, #1 // tempK = origK - tempOffset - 1
  1275. #endif
  1276. lsl temp, tempK, #5 // temp = tempK * 32
  1277. add pA, pA, temp // pA += tempK * 32
  1278. lsl temp, tempK, #3 // temp = tempK * 8
  1279. add pB, pB, temp // pB += tempK * 8
  1280. #endif
  1281. #if defined(LEFT)
  1282. add tempOffset, tempOffset, #4 // tempOffset += 4 when LEFT is defined
  1283. #endif
  1284. dtrmm_kernel_L1_M4_END:
  1285. dtrmm_kernel_L1_M2_BEGIN: // section for the (origM & 2) remainder; pA is not reloaded here
  1286. mov counterI, origM // counterI = origM (only its low bits are tested below)
  1287. tst counterI , #3 // flags from counterI & 3; counterI itself is unchanged
  1288. ble dtrmm_kernel_L1_END // taken when (counterI & 3) == 0: no M2/M1 remainder
  1289. tst counterI, #2 // test bit 1 of counterI (this is a bit test, not a division)
  1290. ble dtrmm_kernel_L1_M1_BEGIN // bit 1 clear: skip the M2 section
  1291. dtrmm_kernel_L1_M2_20:
  1292. INIT2x1 // macro defined outside this view; presumably resets the 2x1 accumulators - TODO confirm
  1293. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1294. mov pB, origPB // pB = origPB, no offset applied in this configuration
  1295. #else
  1296. mov pB, origPB
  1297. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1298. add pB, pB, temp // pB = origPB + tempOffset * 8
  1299. lsl temp, tempOffset, #4 // temp = tempOffset * 16
  1300. add pA, pA, temp // pA += tempOffset * 16
  1301. #endif
  1302. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1303. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1304. #elif defined(LEFT)
  1305. add tempK, tempOffset, #2 // tempK = tempOffset + 2
  1306. #else
  1307. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1308. #endif
  1309. asr counterL , tempK, #3 // counterL = tempK / 8 (passes through the unrolled loop)
  1310. cmp counterL , #0
  1311. ble dtrmm_kernel_L1_M2_40 // fewer than 8 steps: go straight to the remainder loop
  1312. dtrmm_kernel_L1_M2_22:
  1313. KERNEL2x1_SUB // eight back-to-back invocations per pass, matching counterL = tempK / 8
  1314. KERNEL2x1_SUB
  1315. KERNEL2x1_SUB
  1316. KERNEL2x1_SUB
  1317. KERNEL2x1_SUB
  1318. KERNEL2x1_SUB
  1319. KERNEL2x1_SUB
  1320. KERNEL2x1_SUB
  1321. subs counterL, counterL, #1 // counterL--, setting flags for the branch
  1322. bgt dtrmm_kernel_L1_M2_22 // repeat while counterL > 0
  1323. dtrmm_kernel_L1_M2_40:
  1324. ands counterL , tempK, #7 // counterL = tempK % 8 (leftover steps), flags set
  1325. ble dtrmm_kernel_L1_M2_100 // no leftover steps
  1326. dtrmm_kernel_L1_M2_42:
  1327. KERNEL2x1_SUB // one invocation per leftover step
  1328. subs counterL, counterL, #1
  1329. bgt dtrmm_kernel_L1_M2_42
  1330. dtrmm_kernel_L1_M2_100:
  1331. SAVE2x1 // macro defined outside this view; presumably stores the 2x1 result - TODO confirm
  1332. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1333. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1334. #if defined(LEFT)
  1335. sub tempK, tempK, #2 // tempK = origK - tempOffset - 2
  1336. #else
  1337. sub tempK, tempK, #1 // tempK = origK - tempOffset - 1
  1338. #endif
  1339. lsl temp, tempK, #4 // temp = tempK * 16
  1340. add pA, pA, temp // pA += tempK * 16
  1341. lsl temp, tempK, #3 // temp = tempK * 8
  1342. add pB, pB, temp // pB += tempK * 8
  1343. #endif
  1344. #if defined(LEFT)
  1345. add tempOffset, tempOffset, #2 // tempOffset += 2 when LEFT is defined
  1346. #endif
  1347. dtrmm_kernel_L1_M2_END:
  1348. dtrmm_kernel_L1_M1_BEGIN: // section for the (origM & 1) remainder; counterI was loaded from origM at dtrmm_kernel_L1_M2_BEGIN
  1349. tst counterI, #1 // test bit 0 of counterI; counterI itself is unchanged
  1350. ble dtrmm_kernel_L1_END // bit 0 clear: nothing left to do
  1351. dtrmm_kernel_L1_M1_20:
  1352. INIT1x1 // macro defined outside this view; presumably resets the 1x1 accumulator - TODO confirm
  1353. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1354. mov pB, origPB // pB = origPB, no offset applied in this configuration
  1355. #else
  1356. mov pB, origPB
  1357. lsl temp, tempOffset, #3 // temp = tempOffset * 8
  1358. add pB, pB, temp // pB = origPB + tempOffset * 8
  1359. lsl temp, tempOffset, #3 // temp = tempOffset * 8 (same scale for pA in this section)
  1360. add pA, pA, temp // pA += tempOffset * 8
  1361. #endif
  1362. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1363. sub tempK, origK, tempOffset // tempK = origK - tempOffset
  1364. #elif defined(LEFT)
  1365. add tempK, tempOffset, #1 // tempK = tempOffset + 1
  1366. #else
  1367. add tempK, tempOffset, #1 // tempK = tempOffset + 1 (identical to the LEFT branch in this section)
  1368. #endif
  1369. asr counterL , tempK, #3 // counterL = tempK / 8 (passes through the unrolled loop)
  1370. cmp counterL , #0
  1371. ble dtrmm_kernel_L1_M1_40 // fewer than 8 steps: go straight to the remainder loop
  1372. dtrmm_kernel_L1_M1_22:
  1373. KERNEL1x1_SUB // eight back-to-back invocations per pass, matching counterL = tempK / 8
  1374. KERNEL1x1_SUB
  1375. KERNEL1x1_SUB
  1376. KERNEL1x1_SUB
  1377. KERNEL1x1_SUB
  1378. KERNEL1x1_SUB
  1379. KERNEL1x1_SUB
  1380. KERNEL1x1_SUB
  1381. subs counterL, counterL, #1 // counterL--, setting flags for the branch
  1382. bgt dtrmm_kernel_L1_M1_22 // repeat while counterL > 0
  1383. dtrmm_kernel_L1_M1_40:
  1384. ands counterL , tempK, #7 // counterL = tempK % 8 (leftover steps), flags set
  1385. ble dtrmm_kernel_L1_M1_100 // no leftover steps
  1386. dtrmm_kernel_L1_M1_42:
  1387. KERNEL1x1_SUB // one invocation per leftover step
  1388. subs counterL, counterL, #1
  1389. bgt dtrmm_kernel_L1_M1_42
  1390. dtrmm_kernel_L1_M1_100:
  1391. SAVE1x1 // macro defined outside this view; presumably stores the 1x1 result - TODO confirm. No pA/pB/tempOffset update follows in this section
  1392. dtrmm_kernel_L1_END: // falls through to the exit path
  1393. dtrmm_kernel_L999: // common exit: set the return value, reload registers from the stack frame, return
  1394. mov x0, #0 // set return value
  1395. ldp d8, d9, [sp, #(0 * 16)] // reload d8/d9 from frame offset 0; presumably mirrors stores in the prologue (outside this view)
  1396. ldp d10, d11, [sp, #(1 * 16)]
  1397. ldp d12, d13, [sp, #(2 * 16)]
  1398. ldp d14, d15, [sp, #(3 * 16)]
  1399. ldp d16, d17, [sp, #(4 * 16)] // NOTE(review): d16/d17 are outside the AAPCS64 callee-saved range v8-v15; reloading them looks harmless - confirm against the prologue
  1400. ldp x18, x19, [sp, #(5 * 16)] // NOTE(review): x18 is the platform register on some ABIs; confirm the prologue saves it and that its use is intended
  1401. ldp x20, x21, [sp, #(6 * 16)]
  1402. ldp x22, x23, [sp, #(7 * 16)]
  1403. ldp x24, x25, [sp, #(8 * 16)]
  1404. ldp x26, x27, [sp, #(9 * 16)]
  1405. ldr x28, [sp, #(10 * 16)] // last slot holds a single register
  1406. add sp, sp, #(11*16) // release the 11 * 16 = 176-byte frame
  1407. ret
  1408. EPILOGUE