You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_n.S 7.0 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases, listed in register-number order.
   *
   * x0-x7 hold values supplied by the caller.  x2 does not arrive holding the
   * Y stride: the entry code overwrites it with the word at [sp].
   */
  #define M       x0      /* elements of Y updated for every element of X */
  #define N       x1      /* elements of X consumed (outer loop count) */
  #define INC_Y   x2      /* Y stride; loaded from the stack on entry */
  #define A       x3      /* start of the current column of A */
  #define LDA     x4      /* distance between consecutive columns of A */
  #define X       x5      /* current X element */
  #define INC_X   x6      /* X stride */
  #define Y       x7      /* start of Y */

  /* Scratch registers owned by the loops. */
  #define A_PTR   x9      /* walks down the current column of A */
  #define Y_IPTR  x10     /* Y read cursor */
  #define J       x11     /* outer loop counter (counts N down to zero) */
  #define I       x12     /* inner loop counter */
  #define Y_OPTR  x13     /* Y write cursor used by the vector kernels */
  42. /*******************************************************************************
  43. * Macro definitions
  44. *******************************************************************************/
  /*
   * Precision-dependent aliases.
   *
   *   ALPHA        scalar multiplier, as passed in register 0 of the FP bank
   *   TEMP/TEMPV   scalar and lane-0 views of v1
   *   TMP1/TMPV1   scalar and lane-0 views of v2
   *   TMP2/TMPV2   scalar and lane-0 views of v3
   *   SZ           element size in bytes
   *   SHZ          log2(SZ), used to turn element counts into byte offsets
   */
  #if defined(DOUBLE)

  #define ALPHA   d0
  #define TEMP    d1
  #define TEMPV   {v1.d}[0]
  #define TMP1    d2
  #define TMPV1   {v2.d}[0]
  #define TMP2    d3
  #define TMPV2   {v3.d}[0]
  #define SZ      8
  #define SHZ     3

  #else /* single precision */

  #define ALPHA   s0
  #define TEMP    s1
  #define TEMPV   {v1.s}[0]
  #define TMP1    s2
  #define TMPV1   {v2.s}[0]
  #define TMP2    s3
  #define TMPV2   {v3.s}[0]
  #define SZ      4
  #define SHZ     2

  #endif
  66. /******************************************************************************/
  /*
   * SAVE_REGS / RESTORE_REGS
   *
   * Spill and reload the callee-saved registers this file clobbers.
   *
   * The only callee-saved state touched here is the low 64 bits of v8-v15,
   * used as vector temporaries by KERNEL_F16.  The integer side of the kernel
   * stays inside x0-x13, so x19-x28 need no protection, and v16/v17 are
   * caller-saved.
   *
   * x18 is deliberately left alone.  It is the platform register, and
   * reloading it from the stack would write a register that several
   * platforms reserve for the OS.
   *
   * Frame: 4 * 16 = 64 bytes, so sp stays 16-byte aligned.
   * The two macros must always be used as a pair; they share the frame size
   * and slot layout.
   */
  .macro SAVE_REGS
          sub     sp, sp, #(4 * 16)
          stp     d8,  d9,  [sp, #(0 * 16)]
          stp     d10, d11, [sp, #(1 * 16)]
          stp     d12, d13, [sp, #(2 * 16)]
          stp     d14, d15, [sp, #(3 * 16)]
  .endm

  .macro RESTORE_REGS
          ldp     d8,  d9,  [sp, #(0 * 16)]
          ldp     d10, d11, [sp, #(1 * 16)]
          ldp     d12, d13, [sp, #(2 * 16)]
          ldp     d14, d15, [sp, #(3 * 16)]
          add     sp, sp, #(4 * 16)
  .endm
  /*
   * GEMV_N_AXPY32 a0, a1, y0, y1, xv
   *
   * One 32-byte step of the unit-stride update:
   *   load 32 bytes of A into a0:a1, load 32 bytes of Y into y0:y1,
   *   y += xv * a, store y0:y1 through the Y write cursor.
   * Advances A_PTR, Y_IPTR and Y_OPTR by 32 bytes each.
   * Arguments are full vector operands including the arrangement
   * (for example v2.4s).
   */
  .macro GEMV_N_AXPY32 a0, a1, y0, y1, xv
          ld1     {\a0, \a1}, [A_PTR], #32
          ld1     {\y0, \y1}, [Y_IPTR], #32
          fmla    \y0, \xv, \a0
          fmla    \y1, \xv, \a1
          st1     {\y0, \y1}, [Y_OPTR], #32
  .endm

  /*
   * KERNEL_F16: update 16 consecutive elements of Y.
   *
   * Expects every lane of v1 to hold the scaled X element.
   * Single precision: 2 steps of 8 floats, clobbers v2-v9.
   * Double precision: 4 steps of 4 doubles, clobbers v2-v17.
   */
  .macro KERNEL_F16
  #if !defined(DOUBLE)
          GEMV_N_AXPY32 v2.4s, v3.4s, v4.4s, v5.4s, v1.4s
          GEMV_N_AXPY32 v6.4s, v7.4s, v8.4s, v9.4s, v1.4s
  #else /* DOUBLE */
          GEMV_N_AXPY32 v2.2d,  v3.2d,  v4.2d,  v5.2d,  v1.2d
          GEMV_N_AXPY32 v6.2d,  v7.2d,  v8.2d,  v9.2d,  v1.2d
          GEMV_N_AXPY32 v10.2d, v11.2d, v12.2d, v13.2d, v1.2d
          GEMV_N_AXPY32 v14.2d, v15.2d, v16.2d, v17.2d, v1.2d
  #endif
  .endm
  /*
   * KERNEL_F4: update 4 consecutive elements of Y.
   *
   * Expects every lane of v1 to hold the scaled X element.
   * Reads through A_PTR and Y_IPTR, writes through Y_OPTR; all three cursors
   * end up 4 elements further on.
   * Single precision uses one 16-byte vector (v2, v3); double precision needs
   * two (v2-v5).
   */
  .macro KERNEL_F4
  #if defined(DOUBLE)
          /* elements 0-1 */
          ld1     {v2.2d}, [A_PTR], #16
          ld1     {v3.2d}, [Y_IPTR], #16
          fmla    v3.2d, v1.2d, v2.2d
          st1     {v3.2d}, [Y_OPTR], #16
          /* elements 2-3 */
          ld1     {v4.2d}, [A_PTR], #16
          ld1     {v5.2d}, [Y_IPTR], #16
          fmla    v5.2d, v1.2d, v4.2d
          st1     {v5.2d}, [Y_OPTR], #16
  #else /* single precision */
          ld1     {v2.4s}, [A_PTR], #16
          ld1     {v3.4s}, [Y_IPTR], #16
          fmla    v3.4s, v1.4s, v2.4s
          st1     {v3.4s}, [Y_OPTR], #16
  #endif
  .endm
  /*
   * KERNEL_F1: update one element of Y on the unit-stride path.
   *
   *   *Y_IPTR = TEMP * *A_PTR + *Y_IPTR
   *
   * Both the read and the write go through Y_IPTR; Y_OPTR is not touched.
   * A_PTR and Y_IPTR each advance by one element.
   */
  .macro KERNEL_F1
          ldr     TMP1, [A_PTR], #SZ      /* a = *A_PTR++ */
          ldr     TMP2, [Y_IPTR]          /* y = *Y_IPTR */
          fmadd   TMP2, TEMP, TMP1, TMP2  /* y = TEMP * a + y */
          str     TMP2, [Y_IPTR], #SZ     /* *Y_IPTR++ = y */
  .endm
  /*
   * INIT_S: one-time setup for the strided path.
   * Converts INC_Y from an element count to a byte offset.
   */
  .macro INIT_S
          lsl     INC_Y, INC_Y, #SHZ
  .endm

  /*
   * KERNEL_S1: update one element of Y on the strided path.
   *
   *   *Y_IPTR = TEMP * *A_PTR + *Y_IPTR
   *
   * A_PTR advances by one element, Y_IPTR by INC_Y bytes (INIT_S must have
   * run first).  The store keeps the lane form because it needs a register
   * post-increment.
   */
  .macro KERNEL_S1
          ldr     TMP1, [A_PTR], #SZ      /* a = *A_PTR++ */
          ldr     TMP2, [Y_IPTR]          /* y = *Y_IPTR */
          fmadd   TMP2, TEMP, TMP1, TMP2  /* y = TEMP * a + y */
          st1     TMPV2, [Y_IPTR], INC_Y  /* *Y_IPTR = y; Y_IPTR += INC_Y */
  .endm
  162. /*******************************************************************************
  163. * End of macro definitions
  164. *******************************************************************************/
  /*
   * Kernel entry.
   *
   * For each of the N elements of X:
   *     TEMP = ALPHA * x
   *     y[i] = TEMP * a[i] + y[i]      for i = 0 .. M-1
   * then A moves on by LDA elements and X by INC_X elements.
   *
   * Two code paths, chosen once on INC_Y:
   *   INC_Y == 1  vector path: blocks of 32, then 4, then single elements
   *   otherwise   scalar path: unrolled by 4, then single elements
   *
   * Returns 0 in w0.  Does nothing (other than returning 0) when M <= 0 or
   * N <= 0.
   */
          PROLOGUE

          ldr     INC_Y, [sp]             /* read before SAVE_REGS moves sp */
          SAVE_REGS

          cmp     N, #0
          b.le    .Lgemv_n_exit
          cmp     M, #0
          b.le    .Lgemv_n_exit

          /* Strides and leading dimension: elements -> bytes. */
          lsl     LDA, LDA, #SHZ
          lsl     INC_X, INC_X, #SHZ

          mov     J, N

          cmp     INC_Y, #1
          b.ne    .Lgemv_n_strided

  /* ---------------- unit-stride Y ---------------- */

  .Lgemv_n_unit_col:
          ld1     TEMPV, [X], INC_X
          fmul    TEMP, ALPHA, TEMP       /* TEMP = alpha * x */

          /* Broadcast lane 0 of v1 to every lane for the vector kernels. */
  #if defined(DOUBLE)
          dup     v1.2d, v1.d[0]
  #else
          dup     v1.4s, v1.s[0]
  #endif

          mov     A_PTR, A
          mov     Y_IPTR, Y
          mov     Y_OPTR, Y

          /* Blocks of 32 elements: M / 32 iterations (M > 0 here). */
          asr     I, M, #5
          cbz     I, .Lgemv_n_unit_x4

  .Lgemv_n_unit_x32_loop:
          KERNEL_F16
          KERNEL_F16
          subs    I, I, #1
          b.ne    .Lgemv_n_unit_x32_loop

  .Lgemv_n_unit_x4:
          /* Blocks of 4 elements: (M % 32) / 4, i.e. bits [4:2] of M. */
          ubfx    I, M, #2, #3
          cbz     I, .Lgemv_n_unit_x1

  .Lgemv_n_unit_x4_loop:
          KERNEL_F4
          subs    I, I, #1
          b.ne    .Lgemv_n_unit_x4_loop

  .Lgemv_n_unit_x1:
          /* Remaining M % 4 elements, one at a time. */
          and     I, M, #3
          cbz     I, .Lgemv_n_unit_next

  .Lgemv_n_unit_x1_loop:
          KERNEL_F1
          subs    I, I, #1
          b.ne    .Lgemv_n_unit_x1_loop

  .Lgemv_n_unit_next:
          add     A, A, LDA
          subs    J, J, #1
          b.ne    .Lgemv_n_unit_col

          b       .Lgemv_n_exit

  /* ---------------- strided Y ---------------- */

  .Lgemv_n_strided:
          INIT_S

  .Lgemv_n_strided_col:
          ld1     TEMPV, [X], INC_X
          fmul    TEMP, ALPHA, TEMP       /* TEMP = alpha * x */

          mov     A_PTR, A
          mov     Y_IPTR, Y

          /* Unrolled by 4: M / 4 iterations (M > 0 here). */
          asr     I, M, #2
          cbz     I, .Lgemv_n_strided_x1

  .Lgemv_n_strided_x4_loop:
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          subs    I, I, #1
          b.ne    .Lgemv_n_strided_x4_loop

  .Lgemv_n_strided_x1:
          /* Remaining M % 4 elements. */
          and     I, M, #3
          cbz     I, .Lgemv_n_strided_next

  .Lgemv_n_strided_x1_loop:
          KERNEL_S1
          subs    I, I, #1
          b.ne    .Lgemv_n_strided_x1_loop

  .Lgemv_n_strided_next:
          add     A, A, LDA
          subs    J, J, #1
          b.ne    .Lgemv_n_strided_col

  /* ---------------- common exit ---------------- */

  .Lgemv_n_exit:
          mov     w0, wzr
          RESTORE_REGS
          ret

          EPILOGUE