You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #define N x0 /* vector length */
  30. #define X x1 /* X vector address */
  31. #define INC_X x2 /* X stride */
  32. #define Y x3 /* Y vector address */
  33. #define INC_Y x4 /* Y stride */
  34. #define I x5 /* loop variable */
  35. /*******************************************************************************
  36. * Macro definitions
  37. *******************************************************************************/
  38. #if !defined(DOUBLE)
  39. #if !defined(DSDOT)
  40. #define REG0 wzr
  41. #define DOTF s0
  42. #else // DSDOT
  43. #define REG0 xzr
  44. #define DOTF d0
  45. #endif
  46. #define DOTI s1
  47. #define TMPX s2
  48. #define LD1VX {v2.s}[0]
  49. #define TMPY s3
  50. #define LD1VY {v3.s}[0]
  51. #define TMPVY v3.s[0]
  52. #define SZ 4
  53. #else
  54. #define REG0 xzr
  55. #define DOTF d0
  56. #define DOTI d1
  57. #define TMPX d2
  58. #define LD1VX {v2.d}[0]
  59. #define TMPY d3
  60. #define LD1VY {v3.d}[0]
  61. #define TMPVY v3.d[0]
  62. #define SZ 8
  63. #endif
  64. /******************************************************************************/
  65. .macro KERNEL_F1
  66. #if !defined(DOUBLE)
  67. ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
  68. ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2
  69. ins v4.s[0], v2.s[1] // V4 = X[ix+1]
  70. #if !defined(CONJ)
  71. fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
  72. fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
  73. fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
  74. fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
  75. #else
  76. fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
  77. fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
  78. fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
  79. fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
  80. #endif
  81. #else // DOUBLE
  82. ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
  83. ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2
  84. ins v4.d[0], v2.d[1] // V4 = X[ix+1]
  85. #if !defined(CONJ)
  86. fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
  87. fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
  88. fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
  89. fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
  90. #else
  91. fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
  92. fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
  93. fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
  94. fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
  95. #endif
  96. #endif
  97. .endm
  98. .macro KERNEL_F4
  99. #if !defined(DOUBLE)
  100. ld2 {v2.4s, v3.4s}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2
  101. ld2 {v4.4s, v5.4s}, [Y], #32 // V2 = X[ix+1], X[ix]; X += 2
  102. fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy]
  103. fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1]
  104. PRFM PLDL1KEEP, [X, #1024]
  105. PRFM PLDL1KEEP, [Y, #1024]
  106. #if !defined(CONJ)
  107. fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1]
  108. fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy]
  109. #else
  110. fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1]
  111. fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy]
  112. #endif
  113. #else // DOUBLE
  114. ld2 {v2.2d, v3.2d}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2
  115. ld2 {v16.2d, v17.2d}, [Y], #32
  116. fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy]
  117. fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1]
  118. ld2 {v4.2d, v5.2d}, [X], #32
  119. ld2 {v18.2d, v19.2d}, [Y], #32
  120. fmla v0.2d, v4.2d, v18.2d // dot[1] += X[ix] * Y[iy+1]
  121. fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1]
  122. PRFM PLDL1KEEP, [X, #1024]
  123. PRFM PLDL1KEEP, [Y, #1024]
  124. #if !defined(CONJ)
  125. fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1]
  126. fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1]
  127. fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy]
  128. fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy]
  129. #else
  130. fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1]
  131. fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1]
  132. fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy]
  133. fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy]
  134. #endif
  135. #endif
  136. .endm
  137. .macro KERNEL_F4_FINALIZE
  138. #if !defined(DOUBLE)
  139. ext v2.16b, v0.16b, v0.16b, #8
  140. fadd v0.2s, v0.2s, v2.2s
  141. faddp DOTF, v0.2s
  142. ext v3.16b, v1.16b, v1.16b, #8
  143. fadd v1.2s, v1.2s, v3.2s
  144. faddp DOTI, v1.2s
  145. #else
  146. fadd v0.2d, v0.2d, v20.2d
  147. faddp DOTF, v0.2d
  148. fadd v1.2d, v1.2d, v21.2d
  149. faddp DOTI, v1.2d
  150. #endif
  151. .endm
  152. .macro INIT_S
  153. #if !defined(DOUBLE)
  154. lsl INC_X, INC_X, #3
  155. lsl INC_Y, INC_Y, #3
  156. #else
  157. lsl INC_X, INC_X, #4
  158. lsl INC_Y, INC_Y, #4
  159. #endif
  160. .endm
  161. .macro KERNEL_S1
  162. #if !defined(DOUBLE)
  163. ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
  164. ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
  165. ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
  166. #if !defined(CONJ)
  167. fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
  168. fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
  169. fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
  170. fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
  171. #else
  172. fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
  173. fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
  174. fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
  175. fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
  176. #endif
  177. #else // DOUBLE
  178. ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
  179. ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
  180. ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
  181. #if !defined(CONJ)
  182. fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
  183. fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
  184. fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
  185. fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
  186. #else
  187. fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
  188. fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
  189. fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
  190. fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
  191. #endif
  192. #endif
  193. .endm
  194. /*******************************************************************************
  195. * End of macro definitions
  196. *******************************************************************************/
  197. PROLOGUE
  198. fmov DOTF, REG0
  199. fmov DOTI, DOTF
  200. #if !defined(DOUBLE)
  201. fmov s20, DOTF
  202. fmov s21, DOTI
  203. #else
  204. fmov d20, DOTF
  205. fmov d21, DOTI
  206. #endif
  207. cmp N, xzr
  208. ble .Lzdot_kernel_L999
  209. cmp INC_X, #1
  210. bne .Lzdot_kernel_S_BEGIN
  211. cmp INC_Y, #1
  212. bne .Lzdot_kernel_S_BEGIN
  213. .Lzdot_kernel_F_BEGIN:
  214. asr I, N, #2
  215. cmp I, xzr
  216. beq .Lzdot_kernel_F1
  217. .Lzdot_kernel_F4:
  218. KERNEL_F4
  219. subs I, I, #1
  220. bne .Lzdot_kernel_F4
  221. KERNEL_F4_FINALIZE
  222. .Lzdot_kernel_F1:
  223. ands I, N, #3
  224. ble .Lzdot_kernel_L999
  225. .Lzdot_kernel_F10:
  226. KERNEL_F1
  227. subs I, I, #1
  228. bne .Lzdot_kernel_F10
  229. ret
  230. .Lzdot_kernel_S_BEGIN:
  231. INIT_S
  232. asr I, N, #2
  233. cmp I, xzr
  234. ble .Lzdot_kernel_S1
  235. .Lzdot_kernel_S4:
  236. KERNEL_S1
  237. KERNEL_S1
  238. KERNEL_S1
  239. KERNEL_S1
  240. subs I, I, #1
  241. bne .Lzdot_kernel_S4
  242. .Lzdot_kernel_S1:
  243. ands I, N, #3
  244. ble .Lzdot_kernel_L999
  245. .Lzdot_kernel_S10:
  246. KERNEL_S1
  247. subs I, I, #1
  248. bne .Lzdot_kernel_S10
  249. .Lzdot_kernel_L999:
  250. ret
  251. EPILOGUE