You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemv_t.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register roles.
   *
   * M, N, A, LDA, X, INC_X and Y are read by the kernel without being
   * initialised in this file, i.e. they arrive as arguments. INC_Y (x2) is
   * loaded from [sp] at kernel entry before it is first used.
   *
   * Shape of the computation, as driven by the loops below: the outer loop
   * runs N times; each pass walks M consecutive complex elements starting at
   * A together with M elements of X, updates one element of Y, and then
   * advances A by LDA.
   */
  #define M           x0      /* inner trip count: elements of X consumed per pass */
  #define N           x1      /* outer trip count: elements of Y updated */
  #define INC_Y       x2      /* Y stride (scaled to bytes at entry) */
  #define A           x3      /* start of the run of A used by the current pass */
  #define LDA         x4      /* distance between consecutive runs of A */
  #define X           x5      /* X vector address */
  #define INC_X       x6      /* X stride; 1 selects the contiguous path */
  #define Y           x7      /* Y vector address, advanced after every store */

  #define A_PTR       x9      /* running pointer into A for the inner loop */
  #define X_PTR       x10     /* running pointer into X for the inner loop */
  #define J           x11     /* outer loop counter */
  #define I           x12     /* inner loop counter */

  /* Prefetch look-ahead in bytes, added to A_PTR / X_PTR by KERNEL_F4. */
  #define A_PRE_SIZE  768
  #define X_PRE_SIZE  768
  43. /*******************************************************************************
  44. * Macro definitions
  45. *******************************************************************************/
  /*
   * Precision-dependent names.
   *
   * With DOUBLE defined the scalars are 64-bit (d registers) and one complex
   * element occupies 16 bytes; otherwise they are 32-bit (s registers) and
   * one complex element occupies 8 bytes. SHZ is the matching shift used to
   * turn an element count into a byte offset.
   *
   * ALPHA_R_COPY / ALPHA_I_COPY are not referenced by the code visible in
   * this file.
   */
  #if defined(DOUBLE)
  #define ALPHA_R         d0
  #define ALPHA_I         d1
  #define ALPHA_R_COPY    d7
  #define ALPHA_I_COPY    d8
  #define SHZ             4
  #else
  #define ALPHA_R         s0
  #define ALPHA_I         s1
  #define ALPHA_R_COPY    s7
  #define ALPHA_I_COPY    s8
  #define SHZ             3
  #endif
  59. /******************************************************************************/
  /*
   * SAVE_REGS / RESTORE_REGS
   *
   * Open and close an 11 * 16 byte stack frame that preserves d8-d17 and
   * x19-x28 around the kernel body. sp stays 16-byte aligned.
   *
   * x18 is deliberately neither stored nor reloaded: it is the
   * platform-reserved register, and writing a stale copy back into it on
   * exit can clobber state owned by the operating system. The frame layout
   * is otherwise unchanged; x19 keeps the upper half of slot 5 and the
   * lower half of that slot is simply unused.
   *
   * Both macros must be used as a pair; RESTORE_REGS assumes sp has the
   * value SAVE_REGS left it with.
   */
  .macro SAVE_REGS
          sub     sp, sp, #(11 * 16)
          stp     d8, d9, [sp, #(0 * 16)]
          stp     d10, d11, [sp, #(1 * 16)]
          stp     d12, d13, [sp, #(2 * 16)]
          stp     d14, d15, [sp, #(3 * 16)]
          stp     d16, d17, [sp, #(4 * 16)]
          str     x19, [sp, #(5 * 16 + 8)]        // x18 slot at #(5 * 16) left unused
          stp     x20, x21, [sp, #(6 * 16)]
          stp     x22, x23, [sp, #(7 * 16)]
          stp     x24, x25, [sp, #(8 * 16)]
          stp     x26, x27, [sp, #(9 * 16)]
          str     x28, [sp, #(10 * 16)]
  .endm

  .macro RESTORE_REGS
          ldp     d8, d9, [sp, #(0 * 16)]
          ldp     d10, d11, [sp, #(1 * 16)]
          ldp     d12, d13, [sp, #(2 * 16)]
          ldp     d14, d15, [sp, #(3 * 16)]
          ldp     d16, d17, [sp, #(4 * 16)]
          ldr     x19, [sp, #(5 * 16 + 8)]        // never write x18
          ldp     x20, x21, [sp, #(6 * 16)]
          ldp     x22, x23, [sp, #(7 * 16)]
          ldp     x24, x25, [sp, #(8 * 16)]
          ldp     x26, x27, [sp, #(9 * 16)]
          ldr     x28, [sp, #(10 * 16)]
          add     sp, sp, #(11 * 16)
  .endm
  /*
   * INIT: expand the scalar alpha (ALPHA_R, ALPHA_I) into the two-lane
   * multiplier vectors v0 and v1 used when Y is updated.
   *
   * Lane contents on exit (lane 0 first):
   *   without XCONJ:  v0 = [ ALPHA_R,  ALPHA_R ]   v1 = [ -ALPHA_I, ALPHA_I ]
   *   with XCONJ:     v0 = [ ALPHA_R, -ALPHA_R ]   v1 = [  ALPHA_I, ALPHA_I ]
   *
   * The negated value is produced as (0 - alpha) in v2, so v2 is clobbered.
   */
  .macro INIT
  #if defined(DOUBLE)
  #if defined(XCONJ)
          eor     v2.16b, v2.16b, v2.16b
          fsub    d2, d2, ALPHA_R                 // d2 = 0 - ALPHA_R
          mov     v1.d[1], v1.d[0]                // v1 = [ALPHA_I, ALPHA_I]
          mov     v0.d[1], v2.d[0]                // v0 = [ALPHA_R, -ALPHA_R]
  #else
          eor     v2.16b, v2.16b, v2.16b
          fsub    d2, d2, ALPHA_I                 // d2 = 0 - ALPHA_I
          mov     v0.d[1], v0.d[0]                // v0 = [ALPHA_R, ALPHA_R]
          mov     v1.d[1], v2.d[0]                // v1 = [ALPHA_I, -ALPHA_I]
          ext     v1.16b, v1.16b, v1.16b, #8      // swap lanes: [-ALPHA_I, ALPHA_I]
  #endif
  #else
  #if defined(XCONJ)
          eor     v2.16b, v2.16b, v2.16b
          fsub    s2, s2, ALPHA_R                 // s2 = 0 - ALPHA_R
          mov     v1.s[1], v1.s[0]                // v1 = [ALPHA_I, ALPHA_I]
          mov     v0.s[1], v2.s[0]                // v0 = [ALPHA_R, -ALPHA_R]
  #else
          eor     v2.16b, v2.16b, v2.16b
          fsub    s2, s2, ALPHA_I                 // s2 = 0 - ALPHA_I
          mov     v0.s[1], v0.s[0]                // v0 = [ALPHA_R, ALPHA_R]
          mov     v1.s[1], v2.s[0]                // v1 = [ALPHA_I, -ALPHA_I]
          ext     v1.8b, v1.8b, v1.8b, #4         // swap lanes: [-ALPHA_I, ALPHA_I]
  #endif
  #endif
  .endm
  /*
   * INIT_LOOP: clear the accumulators before one pass of the inner loop.
   *
   *   v2        complex result accumulator (TEMP)
   *   v9, v10   real / imaginary partial sums used by KERNEL_F4
   *   v15, v16  second pair of partial sums, DOUBLE builds only
   *
   * Each fmov from xzr writes the 64-bit view of the register, which also
   * zeroes its upper half.
   */
  .macro INIT_LOOP
          fmov    d2, xzr
          fmov    d9, xzr
          fmov    d10, xzr
  #if defined(DOUBLE)
          fmov    d15, xzr
          fmov    d16, xzr
  #endif
  .endm
  /*
   * Mnemonics for the two conjugation-dependent products in KERNEL_F4.
   *
   *   CONJ and XCONJ both set or both clear:
   *       real += R(X)*A_R - I(X)*A_I      imag += R(X)*A_I + I(X)*A_R
   *   exactly one of them set:
   *       real += R(X)*A_R + I(X)*A_I      imag += I(X)*A_R - R(X)*A_I
   */
  #if defined(CONJ) == defined(XCONJ)
  #define ZGEMV_T_F4_OP_II    fmls
  #define ZGEMV_T_F4_OP_RI    fmla
  #else
  #define ZGEMV_T_F4_OP_II    fmla
  #define ZGEMV_T_F4_OP_RI    fmls
  #endif

  /*
   * KERNEL_F4: consume four contiguous complex elements of X and of A.
   *
   * ld2 de-interleaves the data, so one register of each pair holds the real
   * parts and the other the imaginary parts. Real partial sums go to v9
   * (and v15 in DOUBLE builds), imaginary ones to v10 (and v16). X_PTR and
   * A_PTR advance past the consumed data and the streams are prefetched
   * X_PRE_SIZE / A_PRE_SIZE bytes ahead.
   *
   * Single precision handles all four elements with one 4-lane step; double
   * precision uses two 2-lane steps with separate accumulators.
   */
  .macro KERNEL_F4
  #if defined(DOUBLE)
          /* elements 0-1 -> v9 / v10 */
          ld2     {v11.2d, v12.2d}, [X_PTR], #32
          ld2     {v13.2d, v14.2d}, [A_PTR], #32
          prfm    PLDL1STRM, [X_PTR, #X_PRE_SIZE]
          fmla                v9.2d, v11.2d, v13.2d       // real += R(X) * A_R
          ZGEMV_T_F4_OP_II    v9.2d, v12.2d, v14.2d       // real -/+ I(X) * A_I
          ZGEMV_T_F4_OP_RI    v10.2d, v11.2d, v14.2d      // imag +/- R(X) * A_I
          fmla                v10.2d, v12.2d, v13.2d      // imag += I(X) * A_R

          /* elements 2-3 -> v15 / v16 */
          ld2     {v17.2d, v18.2d}, [X_PTR], #32
          ld2     {v19.2d, v20.2d}, [A_PTR], #32
          prfm    PLDL1STRM, [A_PTR, #A_PRE_SIZE]
          fmla                v15.2d, v17.2d, v19.2d      // real += R(X) * A_R
          ZGEMV_T_F4_OP_II    v15.2d, v18.2d, v20.2d      // real -/+ I(X) * A_I
          ZGEMV_T_F4_OP_RI    v16.2d, v17.2d, v20.2d      // imag +/- R(X) * A_I
          fmla                v16.2d, v18.2d, v19.2d      // imag += I(X) * A_R
  #else
          /* elements 0-3 -> v9 / v10 */
          ld2     {v11.4s, v12.4s}, [X_PTR], #32
          ld2     {v13.4s, v14.4s}, [A_PTR], #32
          prfm    PLDL1STRM, [X_PTR, #X_PRE_SIZE]
          prfm    PLDL1STRM, [A_PTR, #A_PRE_SIZE]
          fmla                v9.4s, v11.4s, v13.4s       // real += R(X) * A_R
          ZGEMV_T_F4_OP_II    v9.4s, v12.4s, v14.4s       // real -/+ I(X) * A_I
          ZGEMV_T_F4_OP_RI    v10.4s, v11.4s, v14.4s      // imag +/- R(X) * A_I
          fmla                v10.4s, v12.4s, v13.4s      // imag += I(X) * A_R
  #endif
  .endm
  /*
   * KERNEL_F4_FINALIZE: collapse the per-lane partial sums left by KERNEL_F4
   * into one complex value and place it in v2 (lane 0 = real part,
   * lane 1 = imaginary part). Both lanes of v2 are overwritten, not
   * accumulated into.
   *
   * Single precision: the upper half of the 4-lane accumulator is added to
   * the lower half (v21 is the scratch register), then the remaining pair
   * is summed. Double precision: the two 2-lane accumulators are added and
   * the resulting pair is summed.
   */
  .macro KERNEL_F4_FINALIZE
  #if defined(DOUBLE)
          /* real part */
          fadd    v9.2d, v9.2d, v15.2d
          faddp   d9, v9.2d
          mov     v2.d[0], v9.d[0]

          /* imaginary part */
          fadd    v10.2d, v10.2d, v16.2d
          faddp   d10, v10.2d
          mov     v2.d[1], v10.d[0]
  #else
          /* real part */
          ext     v21.16b, v9.16b, v9.16b, #8     // bring lanes 2-3 down to 0-1
          fadd    v9.2s, v9.2s, v21.2s
          faddp   s9, v9.2s
          mov     v2.s[0], v9.s[0]

          /* imaginary part */
          ext     v21.16b, v10.16b, v10.16b, #8   // bring lanes 2-3 down to 0-1
          fadd    v10.2s, v10.2s, v21.2s
          faddp   s10, v10.2s
          mov     v2.s[1], v10.s[0]
  #endif
  .endm
  /*
   * KERNEL_F1: accumulate one complex product into v2 for contiguous X.
   *
   * Loads one complex element from A_PTR and one from X_PTR (both pointers
   * advance by one element) and performs
   *
   *     v2 += [A_r, A_r] * [X_r, X_i]
   *     v2 += S         * [X_i, X_r]
   *
   * where S = [-A_i, A_i] when CONJ and XCONJ are both set or both clear,
   * and S = [A_i, -A_i] otherwise (lane 0 listed first).
   *
   * Scratch registers: v4, v5, v6, v7, v16. In DOUBLE builds v16 is also an
   * accumulator of KERNEL_F4, so this macro must only run after
   * KERNEL_F4_FINALIZE has consumed it.
   */
  .macro KERNEL_F1
  #if defined(DOUBLE)
          ld1r    {v4.2d}, [A_PTR], #8            // v4 = [A_r, A_r]
          ld1     {v5.d}[0], [A_PTR], #8          // v5 lane 0 = A_i
          ld1     {v6.2d}, [X_PTR], #16           // v6 = [X_r, X_i]
          fmla    v2.2d, v4.2d, v6.2d
          ext     v7.16b, v6.16b, v6.16b, #8      // v7 = [X_i, X_r]
          eor     v16.16b, v16.16b, v16.16b
          fsub    d16, d16, d5                    // d16 = 0 - A_i
          mov     v5.d[1], v16.d[0]               // v5 = [A_i, -A_i]
  #if defined(CONJ) == defined(XCONJ)
          ext     v5.16b, v5.16b, v5.16b, #8      // v5 = [-A_i, A_i]
  #endif
          fmla    v2.2d, v5.2d, v7.2d
  #else
          ld1r    {v4.2s}, [A_PTR], #4            // v4 = [A_r, A_r]
          ld1     {v5.s}[0], [A_PTR], #4          // v5 lane 0 = A_i
          ld1     {v6.2s}, [X_PTR], #8            // v6 = [X_r, X_i]
          fmla    v2.2s, v4.2s, v6.2s
          ext     v7.8b, v6.8b, v6.8b, #4         // v7 = [X_i, X_r]
          eor     v16.16b, v16.16b, v16.16b
          fsub    s16, s16, s5                    // s16 = 0 - A_i
          mov     v5.s[1], v16.s[0]               // v5 = [A_i, -A_i]
  #if defined(CONJ) == defined(XCONJ)
          ext     v5.8b, v5.8b, v5.8b, #4         // v5 = [-A_i, A_i]
  #endif
          fmla    v2.2s, v5.2s, v7.2s
  #endif
  .endm
  /*
   * INIT_S: one-time setup for the strided-X path. Scales INC_X in place by
   * 2^SHZ so that KERNEL_S1 can use it directly as a post-index byte
   * offset.
   */
  .macro INIT_S
          lsl     INC_X, INC_X, #SHZ
  .endm
  /*
   * KERNEL_S1: accumulate one complex product into v2 for strided X.
   *
   * Same arithmetic as the contiguous single-element kernel; the only
   * difference is that X_PTR is post-incremented by the register INC_X,
   * which must already hold a byte stride (see INIT_S). A_PTR advances by
   * one element.
   *
   *     v2 += [A_r, A_r] * [X_r, X_i]
   *     v2 += S         * [X_i, X_r]
   *
   * where S = [-A_i, A_i] when CONJ and XCONJ are both set or both clear,
   * and S = [A_i, -A_i] otherwise (lane 0 listed first).
   *
   * Scratch registers: v4, v5, v6, v7, v16.
   */
  .macro KERNEL_S1
  #if defined(DOUBLE)
          ld1r    {v4.2d}, [A_PTR], #8            // v4 = [A_r, A_r]
          ld1     {v5.d}[0], [A_PTR], #8          // v5 lane 0 = A_i
          ld1     {v6.2d}, [X_PTR], INC_X         // v6 = [X_r, X_i]
          fmla    v2.2d, v4.2d, v6.2d
          ext     v7.16b, v6.16b, v6.16b, #8      // v7 = [X_i, X_r]
          eor     v16.16b, v16.16b, v16.16b
          fsub    d16, d16, d5                    // d16 = 0 - A_i
          mov     v5.d[1], v16.d[0]               // v5 = [A_i, -A_i]
  #if defined(CONJ) == defined(XCONJ)
          ext     v5.16b, v5.16b, v5.16b, #8      // v5 = [-A_i, A_i]
  #endif
          fmla    v2.2d, v5.2d, v7.2d
  #else
          ld1r    {v4.2s}, [A_PTR], #4            // v4 = [A_r, A_r]
          ld1     {v5.s}[0], [A_PTR], #4          // v5 lane 0 = A_i
          ld1     {v6.2s}, [X_PTR], INC_X         // v6 = [X_r, X_i]
          fmla    v2.2s, v4.2s, v6.2s
          ext     v7.8b, v6.8b, v6.8b, #4         // v7 = [X_i, X_r]
          eor     v16.16b, v16.16b, v16.16b
          fsub    s16, s16, s5                    // s16 = 0 - A_i
          mov     v5.s[1], v16.s[0]               // v5 = [A_i, -A_i]
  #if defined(CONJ) == defined(XCONJ)
          ext     v5.8b, v5.8b, v5.8b, #4         // v5 = [-A_i, A_i]
  #endif
          fmla    v2.2s, v5.2s, v7.2s
  #endif
  .endm
  255. /*******************************************************************************
  256. * End of macro definitions
  257. *******************************************************************************/
  /*
   * ZGEMV_T_UPDATE_Y: fold the finished accumulator v2 = [T_r, T_i] into the
   * current element of Y and step Y by INC_Y bytes.
   *
   *     Y[0] += v0 * [T_r, T_i]
   *     Y[0] += v1 * [T_i, T_r]
   *
   * v0 / v1 are the alpha multiplier vectors prepared by INIT.
   * Scratch registers: v3, v4.
   */
  .macro ZGEMV_T_UPDATE_Y
  #if defined(DOUBLE)
          ld1     {v4.2d}, [Y]
          ext     v3.16b, v2.16b, v2.16b, #8      // v3 = [T_i, T_r]
          fmla    v4.2d, v0.2d, v2.2d
          fmla    v4.2d, v1.2d, v3.2d
          st1     {v4.2d}, [Y], INC_Y
  #else
          ld1     {v4.2s}, [Y]
          ext     v3.8b, v2.8b, v2.8b, #4         // v3 = [T_i, T_r]
          fmla    v4.2s, v0.2s, v2.2s
          fmla    v4.2s, v1.2s, v3.2s
          st1     {v4.2s}, [Y], INC_Y
  #endif
  .endm

  /*
   * Kernel entry point. PROLOGUE / EPILOGUE are supplied by common.h and
   * are not visible here.
   *
   * Flow:
   *   - load INC_Y from the caller stack, save registers;
   *   - return immediately when N <= 0 or M <= 0;
   *   - scale LDA and INC_Y by 2^SHZ, build the alpha vectors (INIT);
   *   - INC_X == 1: contiguous path (F labels), four elements per step with
   *     KERNEL_F4, remainder with KERNEL_F1;
   *   - otherwise: strided path (S labels), KERNEL_S1 unrolled four times,
   *     remainder one at a time;
   *   - after each inner pass, update one element of Y and advance A by LDA.
   *
   * Always returns 0 in w0.
   */
          PROLOGUE

          ldr     INC_Y, [sp]                     // must precede SAVE_REGS, which moves sp
          SAVE_REGS

          cmp     N, #0
          b.le    .Lzgemv_t_kernel_L999
          cmp     M, #0
          b.le    .Lzgemv_t_kernel_L999

          lsl     LDA, LDA, #SHZ
          lsl     INC_Y, INC_Y, #SHZ
          mov     J, N

          INIT

          cmp     INC_X, #1
          b.ne    .Lzgemv_t_kernel_S_BEGIN

          /* ---------------- contiguous X ---------------- */
  .Lzgemv_t_kernel_F_LOOP:
          mov     A_PTR, A
          mov     X_PTR, X
          INIT_LOOP

          asr     I, M, #2                        // number of 4-element steps
          cmp     I, #0
          b.eq    .Lzgemv_t_kernel_F1

  .Lzgemv_t_kernel_F4:
          KERNEL_F4
          subs    I, I, #1
          b.ne    .Lzgemv_t_kernel_F4

          KERNEL_F4_FINALIZE

  .Lzgemv_t_kernel_F1:
          ands    I, M, #3                        // leftover elements
          b.le    .Lzgemv_t_kernel_F_END

  .Lzgemv_t_kernel_F10:
          KERNEL_F1
          subs    I, I, #1
          b.ne    .Lzgemv_t_kernel_F10

  .Lzgemv_t_kernel_F_END:
          ZGEMV_T_UPDATE_Y

          add     A, A, LDA
          subs    J, J, #1
          b.ne    .Lzgemv_t_kernel_F_LOOP

          b       .Lzgemv_t_kernel_L999

          /* ---------------- strided X ---------------- */
  .Lzgemv_t_kernel_S_BEGIN:
          INIT_S

  .Lzgemv_t_kernel_S_LOOP:
          mov     A_PTR, A
          mov     X_PTR, X
          INIT_LOOP

          asr     I, M, #2                        // number of 4-element steps
          cmp     I, #0
          b.le    .Lzgemv_t_kernel_S1

  .Lzgemv_t_kernel_S4:
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          subs    I, I, #1
          b.ne    .Lzgemv_t_kernel_S4

  .Lzgemv_t_kernel_S1:
          ands    I, M, #3                        // leftover elements
          b.le    .Lzgemv_t_kernel_S_END

  .Lzgemv_t_kernel_S10:
          KERNEL_S1
          subs    I, I, #1
          b.ne    .Lzgemv_t_kernel_S10

  .Lzgemv_t_kernel_S_END:
          ZGEMV_T_UPDATE_Y

          add     A, A, LDA
          subs    J, J, #1
          b.ne    .Lzgemv_t_kernel_S_LOOP

          /* ---------------- common exit ---------------- */
  .Lzgemv_t_kernel_L999:
          RESTORE_REGS
          mov     w0, wzr
          ret

          EPILOGUE