You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t.S 7.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register roles.
   *
   * x0, x1 and x3-x7 are read as they arrive from the caller.  x2 is
   * overwritten at entry with the Y stride loaded from the stack.
   * x9-x12 are scratch registers owned by this file.
   *
   * The outer loop runs N times and stores one element of Y per pass;
   * each pass walks M consecutive elements of A together with M elements
   * of X.  M is therefore the X length and N the Y length.
   */
  #define M       x0      /* X vector length (elements per dot product) */
  #define N       x1      /* Y vector length (number of dot products) */
  #define A       x3      /* A matrix address */
  #define LDA     x4      /* A leading dimension, in elements */
  #define X       x5      /* X vector address */
  #define INC_X   x6      /* X stride, in elements */
  #define Y       x7      /* Y vector address */
  #define INC_Y   x2      /* Y stride, in elements (loaded from the stack) */
  #define A_PTR   x9      /* running A pointer inside one dot product */
  #define X_PTR   x10     /* running X pointer inside one dot product */
  #define J       x11     /* outer loop counter, counts down from N */
  #define I       x12     /* inner loop counter */
  41. /*******************************************************************************
  42. * Macro definitions
  43. *******************************************************************************/
  /*
   * Precision-dependent names.
   *
   *   REG0          integer zero register used as the fmov source
   *   ALPHA         scalar multiplier, as passed in the first FP argument
   *   TEMP          scalar view of accumulator v1
   *   TEMP1..TEMP3  scalar views of v2..v4 (used to clear the accumulators)
   *   TMP1 / TMP2   scalar views of v2 / v3 used as single-element operands
   *   TEMPV, TMPV1, TMPV2
   *                 lane-0 list forms of v1, v2, v3 for ld1/st1
   *   SZ            element size in bytes
   *   SHZ           log2(SZ), used to turn element counts into byte offsets
   *
   * TEMP1/TMP1 and TEMP2/TMP2 deliberately name the same registers.
   */
  #if defined(DOUBLE)

  #define REG0    xzr
  #define ALPHA   d0
  #define TEMP    d1
  #define TEMP1   d2
  #define TEMP2   d3
  #define TEMP3   d4
  #define TEMPV   {v1.d}[0]
  #define TMP1    d2
  #define TMPV1   {v2.d}[0]
  #define TMP2    d3
  #define TMPV2   {v3.d}[0]
  #define SZ      8
  #define SHZ     3

  #else /* single precision */

  #define REG0    wzr
  #define ALPHA   s0
  #define TEMP    s1
  #define TEMP1   s2
  #define TEMP2   s3
  #define TEMP3   s4
  #define TEMPV   {v1.s}[0]
  #define TMP1    s2
  #define TMPV1   {v2.s}[0]
  #define TMP2    s3
  #define TMPV2   {v3.s}[0]
  #define SZ      4
  #define SHZ     2

  #endif
  73. /******************************************************************************/
  /*
   * SAVE_REGS / RESTORE_REGS
   *
   * Spill and reload the callee-saved registers this file clobbers.
   *
   * The only such registers are v8-v15, which KERNEL_F32 uses as load
   * targets; AAPCS64 requires their low 64 bits (d8-d15) to survive the
   * call.  v16-v20 are caller-saved, and the general registers written by
   * this file (x2, x3, x6, x7, x9-x12, and x0 for the return value) are
   * all caller-saved as well, so nothing else needs a slot.  In
   * particular x18, the platform register, is neither read nor written
   * here.
   *
   * Frame layout (4 * 16 = 64 bytes, sp stays 16-byte aligned):
   *   [sp +  0]  d8,  d9
   *   [sp + 16]  d10, d11
   *   [sp + 32]  d12, d13
   *   [sp + 48]  d14, d15
   *
   * The two macros must be used as a pair with sp unchanged in between.
   */
  .macro SAVE_REGS
          sub     sp, sp, #(4 * 16)
          stp     d8, d9, [sp, #(0 * 16)]
          stp     d10, d11, [sp, #(1 * 16)]
          stp     d12, d13, [sp, #(2 * 16)]
          stp     d14, d15, [sp, #(3 * 16)]
  .endm

  .macro RESTORE_REGS
          ldp     d8, d9, [sp, #(0 * 16)]
          ldp     d10, d11, [sp, #(1 * 16)]
          ldp     d12, d13, [sp, #(2 * 16)]
          ldp     d14, d15, [sp, #(3 * 16)]
          add     sp, sp, #(4 * 16)
  .endm
  /*
   * KERNEL_F32
   *
   * Unit-stride step over 32 elements: loads 32 elements from A_PTR and
   * 32 from X_PTR (both post-incremented) and accumulates the lane-wise
   * products into the four vector accumulators v1-v4.
   *
   * Loads alternate between two register sets (v5-v12 and v13-v20).
   * Single precision needs one pass over both sets (2 * 16 floats);
   * double precision needs two passes (2 * 2 * 8 doubles).
   *
   * Clobbers v5-v20.  Does not touch the flags.
   */
  .macro KERNEL_F32
  #if defined(DOUBLE)
          .rept   2
          /* register set 1: 8 doubles */
          ld1     {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
          ld1     {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
          fmla    v1.2d, v5.2d, v9.2d
          fmla    v2.2d, v6.2d, v10.2d
          fmla    v3.2d, v7.2d, v11.2d
          fmla    v4.2d, v8.2d, v12.2d
          /* register set 2: 8 doubles */
          ld1     {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
          ld1     {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
          fmla    v1.2d, v13.2d, v17.2d
          fmla    v2.2d, v14.2d, v18.2d
          fmla    v3.2d, v15.2d, v19.2d
          fmla    v4.2d, v16.2d, v20.2d
          .endr
  #else
          /* register set 1: 16 floats */
          ld1     {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
          ld1     {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
          fmla    v1.4s, v5.4s, v9.4s
          fmla    v2.4s, v6.4s, v10.4s
          fmla    v3.4s, v7.4s, v11.4s
          fmla    v4.4s, v8.4s, v12.4s
          /* register set 2: 16 floats */
          ld1     {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
          ld1     {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
          fmla    v1.4s, v13.4s, v17.4s
          fmla    v2.4s, v14.4s, v18.4s
          fmla    v3.4s, v15.4s, v19.4s
          fmla    v4.4s, v16.4s, v20.4s
  #endif
  .endm
  /*
   * KERNEL_F32_FINALIZE
   *
   * Folds the four vector accumulators into v1, lane by lane:
   *   v1 = ((v1 + v2) + v3) + v4
   * The left-to-right order is kept as written because it determines the
   * floating-point rounding of the result.
   */
  .macro KERNEL_F32_FINALIZE
  #if defined(DOUBLE)
          fadd    v1.2d, v1.2d, v2.2d
          fadd    v1.2d, v1.2d, v3.2d
          fadd    v1.2d, v1.2d, v4.2d
  #else
          fadd    v1.4s, v1.4s, v2.4s
          fadd    v1.4s, v1.4s, v3.4s
          fadd    v1.4s, v1.4s, v4.4s
  #endif
  .endm
  /*
   * KERNEL_F4
   *
   * Unit-stride step over 4 elements: loads 4 elements from A_PTR and 4
   * from X_PTR (both post-incremented) and accumulates the lane-wise
   * products into v1.
   *
   * Single precision does this with one 4-lane vector (clobbers v2, v3);
   * double precision with two 2-lane vectors (clobbers v2-v5).
   */
  .macro KERNEL_F4
  #if defined(DOUBLE)
          /* elements 0-1 */
          ld1     {v2.2d}, [A_PTR], #16
          ld1     {v3.2d}, [X_PTR], #16
          fmla    v1.2d, v2.2d, v3.2d
          /* elements 2-3 */
          ld1     {v4.2d}, [A_PTR], #16
          ld1     {v5.2d}, [X_PTR], #16
          fmla    v1.2d, v4.2d, v5.2d
  #else
          ld1     {v2.4s}, [A_PTR], #16
          ld1     {v3.4s}, [X_PTR], #16
          fmla    v1.4s, v2.4s, v3.4s
  #endif
  .endm
  /*
   * KERNEL_F4_FINALIZE
   *
   * Collapses the lanes of vector accumulator v1 into the scalar TEMP
   * (lane 0 of v1).
   *
   *   double: TEMP = lane0 + lane1
   *   single: TEMP = (lane0 + lane2) + (lane1 + lane3)
   *           v2 is used as scratch for the upper half of v1.
   *
   * The pairing is kept as written because it determines the rounding.
   */
  .macro KERNEL_F4_FINALIZE
  #if defined(DOUBLE)
          faddp   TEMP, v1.2d
  #else
          ext     v2.16b, v1.16b, v1.16b, #8      /* v2 low half = lanes 2, 3 of v1 */
          fadd    v1.2s, v1.2s, v2.2s
          faddp   TEMP, v1.2s
  #endif
  .endm
  /*
   * KERNEL_F1
   *
   * Unit-stride step over one element:
   *   TEMP += (*A_PTR) * (*X_PTR)
   * Both pointers advance by SZ bytes.  The operands land in lane 0 of
   * v2 (TMP1) and v3 (TMP2).
   */
  .macro KERNEL_F1
          ld1     TMPV2, [X_PTR], #SZ
          ld1     TMPV1, [A_PTR], #SZ
          fmadd   TEMP, TMP1, TMP2, TEMP
  .endm
  /*
   * INIT_S
   *
   * One-time setup for the strided path: converts INC_X from an element
   * count to a byte offset (INC_X <<= SHZ) so that KERNEL_S1 can use it
   * directly as a post-index register.
   */
  .macro INIT_S
          lsl     INC_X, INC_X, #SHZ
  .endm
  /*
   * KERNEL_S1
   *
   * Strided step over one element:
   *   TEMP += (*A_PTR) * (*X_PTR)
   * A_PTR advances by SZ bytes; X_PTR advances by INC_X, which must
   * already be a byte offset (see INIT_S).  The operands land in lane 0
   * of v2 (TMP1) and v3 (TMP2).
   */
  .macro KERNEL_S1
          ld1     TMPV2, [X_PTR], INC_X
          ld1     TMPV1, [A_PTR], #SZ
          fmadd   TEMP, TMP1, TMP2, TEMP
  .endm
  190. /*******************************************************************************
  191. * End of macro definitions
  192. *******************************************************************************/
  /*
   * Function body (the entry label is emitted by PROLOGUE).
   *
   * For each j in [0, N):
   *     y[j * INC_Y] += ALPHA * sum over i in [0, M) of
   *                     a[j * LDA + i] * x[i * INC_X]
   *
   * Two paths are provided:
   *   F  - INC_X == 1: vectorised in chunks of 32, then 4, then 1 element.
   *   S  - any other INC_X: scalar, unrolled by 4 with a 1-element tail.
   *
   * Nothing is computed when N <= 0 or M <= 0.  Returns 0 in w0.
   */
          PROLOGUE

          ldr     INC_Y, [sp]             /* stack argument; read before SAVE_REGS moves sp */
          SAVE_REGS

          /* Nothing to do for an empty problem. */
          cmp     N, #0
          b.le    gemv_t_kernel_L999
          cmp     M, #0
          b.le    gemv_t_kernel_L999

          /* Element counts -> byte offsets. */
          lsl     LDA, LDA, #SHZ
          lsl     INC_Y, INC_Y, #SHZ

          mov     J, N

          cmp     INC_X, #1
          b.ne    gemv_t_kernel_S_BEGIN

  /* ---------------- unit-stride X ---------------- */

  gemv_t_kernel_F_LOOP:
          /* Clear the scalar sum and all four vector accumulators (v1-v4). */
          fmov    TEMP, REG0
          fmov    TEMP1, REG0
          fmov    TEMP2, REG0
          fmov    TEMP3, REG0

          mov     A_PTR, A
          mov     X_PTR, X

  gemv_t_kernel_F32:
          asr     I, M, #5                /* I = M / 32 */
          cbz     I, gemv_t_kernel_F4

  gemv_t_kernel_F320:
          KERNEL_F32
          sub     I, I, #1
          cbnz    I, gemv_t_kernel_F320

          KERNEL_F32_FINALIZE

  gemv_t_kernel_F4:
          ubfx    I, M, #2, #3            /* I = (M % 32) / 4 */
          cbz     I, gemv_t_kernel_F1

  gemv_t_kernel_F40:
          KERNEL_F4
          sub     I, I, #1
          cbnz    I, gemv_t_kernel_F40

  gemv_t_kernel_F1:
          KERNEL_F4_FINALIZE              /* v1 lanes -> scalar TEMP */

          and     I, M, #3                /* I = M % 4 */
          cbz     I, gemv_t_kernel_F_END

  gemv_t_kernel_F10:
          KERNEL_F1
          sub     I, I, #1
          cbnz    I, gemv_t_kernel_F10

  gemv_t_kernel_F_END:
          /* *Y += ALPHA * TEMP, then step Y, A and the outer counter. */
          ld1     TMPV1, [Y]
          fmadd   TMP1, ALPHA, TEMP, TMP1
          st1     TMPV1, [Y], INC_Y
          add     A, A, LDA
          sub     J, J, #1
          cbnz    J, gemv_t_kernel_F_LOOP

          b       gemv_t_kernel_L999

  /* ---------------- strided X ---------------- */

  gemv_t_kernel_S_BEGIN:
          INIT_S

  gemv_t_kernel_S_LOOP:
          fmov    TEMP, REG0

          mov     A_PTR, A
          mov     X_PTR, X

          asr     I, M, #2                /* I = M / 4; M > 0 here, so I >= 0 */
          cbz     I, gemv_t_kernel_S1

  gemv_t_kernel_S4:
          .rept   4
          KERNEL_S1
          .endr
          sub     I, I, #1
          cbnz    I, gemv_t_kernel_S4

  gemv_t_kernel_S1:
          and     I, M, #3                /* I = M % 4 */
          cbz     I, gemv_t_kernel_S_END

  gemv_t_kernel_S10:
          KERNEL_S1
          sub     I, I, #1
          cbnz    I, gemv_t_kernel_S10

  gemv_t_kernel_S_END:
          /* *Y += ALPHA * TEMP, then step Y, A and the outer counter. */
          ld1     TMPV1, [Y]
          fmadd   TMP1, ALPHA, TEMP, TMP1
          st1     TMPV1, [Y], INC_Y
          add     A, A, LDA
          sub     J, J, #1
          cbnz    J, gemv_t_kernel_S_LOOP

  /* ---------------- common exit ---------------- */

  gemv_t_kernel_L999:
          RESTORE_REGS
          mov     w0, wzr                 /* return 0 */
          ret

          EPILOGUE