You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

znrm2.S 6.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used throughout this file.
   *
   * General-purpose registers (N, X and INC_X are read before this file
   * ever writes them, so they hold the values supplied by the caller):
   */
  #define N        x0      /* element count                                 */
  #define X        x1      /* address of the current element                */
  #define INC_X    x2      /* stride; scaled to a byte step by INIT_S       */
  #define I        x3      /* loop counter                                  */

  /*
   * Floating-point registers.  The width follows DOUBLE: d-registers when
   * it is defined, s-registers otherwise.
   */
  #if defined(DOUBLE)
  #define SSQ      d0      /* running scaled sum of squares, then result    */
  #define SCALE    d1      /* running scale factor                          */
  #define REGZERO  d6      /* constant 0.0, written by INIT                 */
  #define REGONE   d7      /* constant 1.0, written by INIT                 */
  #else
  #define SSQ      s0      /* running scaled sum of squares, then result    */
  #define SCALE    s1      /* running scale factor                          */
  #define REGZERO  s6      /* constant 0.0, written by INIT                 */
  #define REGONE   s7      /* constant 1.0, written by INIT                 */
  #endif
  44. /**************************************************************************************
  45. * Macro definitions
  46. **************************************************************************************/
  /*
   * KERNEL_F1 - fold one element (two consecutive values at X) into the
   * running (SCALE, SSQ) pair, advancing X past both values.
   *
   * Each loaded value v is handled as follows:
   *   v == 0         : skipped
   *   SCALE >= |v|   : SSQ = SSQ + (|v| / SCALE)^2
   *   otherwise      : SSQ = 1 + SSQ * (SCALE / |v|)^2, then SCALE = |v|
   *
   * In:     X                 address of the element
   *         SCALE, SSQ        running state
   *         REGZERO, REGONE   constants 0.0 and 1.0 (see INIT)
   * Out:    X advanced by 8 bytes (16 when DOUBLE); SCALE, SSQ updated
   * Clobb:  s2-s5 (d2-d5 when DOUBLE), condition flags
   *
   * Branch targets carry the .L prefix so that the assembler treats them
   * as local symbols: every expansion of this macro would otherwise add
   * its labels to the object file symbol table.
   */
  .macro KERNEL_F1
  #if defined(DOUBLE)
          ldr     d4, [X], #8                  // first value, then X += 8
          fcmp    d4, REGZERO
          b.eq    .Lznrm2_f1_second_\@         // zero adds nothing
          fabs    d4, d4
          fcmp    SCALE, d4
          b.ge    .Lznrm2_f1_first_fits_\@     // current scale still covers it
          fdiv    d2, SCALE, d4                // ratio = SCALE / abs(v)
          fmul    d2, d2, d2                   // ratio^2
          fmul    d3, SSQ, d2
          fadd    SSQ, REGONE, d3              // SSQ = 1 + SSQ * ratio^2
          fmov    SCALE, d4                    // abs(v) becomes the new scale
          b       .Lznrm2_f1_second_\@
  .Lznrm2_f1_first_fits_\@:
          fdiv    d2, d4, SCALE                // ratio = abs(v) / SCALE
          fmla    SSQ, d2, v2.d[0]             // SSQ += ratio * ratio
  .Lznrm2_f1_second_\@:
          ldr     d5, [X], #8                  // second value, then X += 8
          fcmp    d5, REGZERO
          b.eq    .Lznrm2_f1_done_\@
          fabs    d5, d5
          fcmp    SCALE, d5
          b.ge    .Lznrm2_f1_second_fits_\@
          fdiv    d2, SCALE, d5
          fmul    d2, d2, d2
          fmul    d3, SSQ, d2
          fadd    SSQ, REGONE, d3
          fmov    SCALE, d5
          b       .Lznrm2_f1_done_\@
  .Lznrm2_f1_second_fits_\@:
          fdiv    d2, d5, SCALE
          fmla    SSQ, d2, v2.d[0]
  #else
          ldr     s4, [X], #4                  // first value, then X += 4
          fcmp    s4, REGZERO
          b.eq    .Lznrm2_f1_second_\@         // zero adds nothing
          fabs    s4, s4
          fcmp    SCALE, s4
          b.ge    .Lznrm2_f1_first_fits_\@     // current scale still covers it
          fdiv    s2, SCALE, s4                // ratio = SCALE / abs(v)
          fmul    s2, s2, s2                   // ratio^2
          fmul    s3, SSQ, s2
          fadd    SSQ, REGONE, s3              // SSQ = 1 + SSQ * ratio^2
          fmov    SCALE, s4                    // abs(v) becomes the new scale
          b       .Lznrm2_f1_second_\@
  .Lznrm2_f1_first_fits_\@:
          fdiv    s2, s4, SCALE                // ratio = abs(v) / SCALE
          fmla    SSQ, s2, v2.s[0]             // SSQ += ratio * ratio
  .Lznrm2_f1_second_\@:
          ldr     s5, [X], #4                  // second value, then X += 4
          fcmp    s5, REGZERO
          b.eq    .Lznrm2_f1_done_\@
          fabs    s5, s5
          fcmp    SCALE, s5
          b.ge    .Lznrm2_f1_second_fits_\@
          fdiv    s2, SCALE, s5
          fmul    s2, s2, s2
          fmul    s3, SSQ, s2
          fadd    SSQ, REGONE, s3
          fmov    SCALE, s5
          b       .Lznrm2_f1_done_\@
  .Lznrm2_f1_second_fits_\@:
          fdiv    s2, s5, SCALE
          fmla    SSQ, s2, v2.s[0]
  #endif
  .Lznrm2_f1_done_\@:
  .endm
  /*
   * KERNEL_S1 - strided variant: fold the element at X (two consecutive
   * values) into the running (SCALE, SSQ) pair, then step X by INC_X.
   *
   * Each loaded value v is handled as follows:
   *   v == 0         : skipped
   *   SCALE >= |v|   : SSQ = SSQ + (|v| / SCALE)^2
   *   otherwise      : SSQ = 1 + SSQ * (SCALE / |v|)^2, then SCALE = |v|
   *
   * In:     X                 address of the element
   *         INC_X             step added to X (a byte count once INIT_S ran)
   *         SCALE, SSQ        running state
   *         REGZERO, REGONE   constants 0.0 and 1.0 (see INIT)
   * Out:    X = X + INC_X; SCALE, SSQ updated
   * Clobb:  s2-s5 (d2-d5 when DOUBLE), condition flags
   *
   * Branch targets carry the .L prefix so that the assembler treats them
   * as local symbols instead of adding one set of labels per expansion to
   * the object file symbol table.
   */
  .macro KERNEL_S1
  #if defined(DOUBLE)
          ldr     d4, [X]                      // first value of the element
          fcmp    d4, REGZERO
          b.eq    .Lznrm2_s1_second_\@         // zero adds nothing
          fabs    d4, d4
          fcmp    SCALE, d4
          b.ge    .Lznrm2_s1_first_fits_\@     // current scale still covers it
          fdiv    d2, SCALE, d4                // ratio = SCALE / abs(v)
          fmul    d2, d2, d2                   // ratio^2
          fmul    d3, SSQ, d2
          fadd    SSQ, REGONE, d3              // SSQ = 1 + SSQ * ratio^2
          fmov    SCALE, d4                    // abs(v) becomes the new scale
          b       .Lznrm2_s1_second_\@
  .Lznrm2_s1_first_fits_\@:
          fdiv    d2, d4, SCALE                // ratio = abs(v) / SCALE
          fmla    SSQ, d2, v2.d[0]             // SSQ += ratio * ratio
  .Lznrm2_s1_second_\@:
          ldr     d5, [X, #8]                  // second value of the element
          fcmp    d5, REGZERO
          b.eq    .Lznrm2_s1_done_\@
          fabs    d5, d5
          fcmp    SCALE, d5
          b.ge    .Lznrm2_s1_second_fits_\@
          fdiv    d2, SCALE, d5
          fmul    d2, d2, d2
          fmul    d3, SSQ, d2
          fadd    SSQ, REGONE, d3
          fmov    SCALE, d5
          b       .Lznrm2_s1_done_\@
  .Lznrm2_s1_second_fits_\@:
          fdiv    d2, d5, SCALE
          fmla    SSQ, d2, v2.d[0]
  #else
          ldr     s4, [X]                      // first value of the element
          fcmp    s4, REGZERO
          b.eq    .Lznrm2_s1_second_\@         // zero adds nothing
          fabs    s4, s4
          fcmp    SCALE, s4
          b.ge    .Lznrm2_s1_first_fits_\@     // current scale still covers it
          fdiv    s2, SCALE, s4                // ratio = SCALE / abs(v)
          fmul    s2, s2, s2                   // ratio^2
          fmul    s3, SSQ, s2
          fadd    SSQ, REGONE, s3              // SSQ = 1 + SSQ * ratio^2
          fmov    SCALE, s4                    // abs(v) becomes the new scale
          b       .Lznrm2_s1_second_\@
  .Lznrm2_s1_first_fits_\@:
          fdiv    s2, s4, SCALE                // ratio = abs(v) / SCALE
          fmla    SSQ, s2, v2.s[0]             // SSQ += ratio * ratio
  .Lznrm2_s1_second_\@:
          ldr     s5, [X, #4]                  // second value of the element
          fcmp    s5, REGZERO
          b.eq    .Lznrm2_s1_done_\@
          fabs    s5, s5
          fcmp    SCALE, s5
          b.ge    .Lznrm2_s1_second_fits_\@
          fdiv    s2, SCALE, s5
          fmul    s2, s2, s2
          fmul    s3, SSQ, s2
          fadd    SSQ, REGONE, s3
          fmov    SCALE, s5
          b       .Lznrm2_s1_done_\@
  .Lznrm2_s1_second_fits_\@:
          fdiv    s2, s5, SCALE
          fmla    SSQ, s2, v2.s[0]
  #endif
  .Lznrm2_s1_done_\@:
          add     X, X, INC_X                  // step to the next element
  .endm
  /*
   * KERNEL_F8 - process eight consecutive elements by expanding KERNEL_F1
   * eight times back to back.  Inputs, outputs and clobbers are those of
   * KERNEL_F1; X ends up advanced past all eight elements.
   */
  .macro KERNEL_F8
          .rept   8
          KERNEL_F1
          .endr
  .endm
  /*
   * INIT_S - prepare the strided loop: replace INC_X by INC_X * SIZE so it
   * can be added directly to X.  The shift is 4 (x16) when DOUBLE is
   * defined and 3 (x8) otherwise, matching the two values that KERNEL_S1
   * reads per element.
   */
  .macro INIT_S
  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #4             // INC_X * SIZE
  #else
          lsl     INC_X, INC_X, #3             // INC_X * SIZE
  #endif
  .endm
  /*
   * INIT - set up the accumulator state and the two constants:
   *   SCALE = 0.0, REGZERO = 0.0, SSQ = 1.0, REGONE = 1.0
   * All of v1 is cleared; SCALE is the low lane of that register.
   */
  .macro INIT
          eor     v1.16b, v1.16b, v1.16b       // scale=0.0
          fmov    REGZERO, SCALE               // constant 0.0
          fmov    SSQ, #1.0
          fmov    REGONE, #1.0                 // constant 1.0
  .endm
  207. /**************************************************************************************
  208. * End of macro definitions
  209. **************************************************************************************/
  /*
   * Routine body.
   *
   * In:   N      element count
   *       X      address of the first element
   *       INC_X  stride in elements
   * Out:  SSQ    SCALE * sqrt(SSQ) of the accumulated state.  When N <= 0
   *              or INC_X == 0 no element is read and the result is
   *              0.0 * sqrt(1.0) = 0.0.
   *
   * INC_X == 1 takes the contiguous path (blocks of eight elements, then
   * the N & 7 leftovers); any other non-zero stride takes the strided
   * path, one element per iteration.
   *
   * PROLOGUE and EPILOGUE are not defined in this file - presumably they
   * come from common.h and emit the symbol/section boilerplate; confirm
   * there.
   *
   * All branch targets use the .L prefix so that they stay local to the
   * assembler and are not emitted as extra symbols inside the routine.
   */
  PROLOGUE

          .align  5

          INIT                                 // SCALE = 0.0, SSQ = 1.0

          cmp     N, #0
          b.le    .Lznrm2_kernel_L999          // nothing to do

          cmp     INC_X, #0
          b.eq    .Lznrm2_kernel_L999          // zero stride: nothing is read

          cmp     INC_X, #1
          b.ne    .Lznrm2_kernel_S_BEGIN       // non-unit stride

  .Lznrm2_kernel_F_BEGIN:
          asr     I, N, #3                     // I = N / 8
          cmp     I, xzr
          b.le    .Lznrm2_kernel_F1            // fewer than eight elements

  .Lznrm2_kernel_F8:
          KERNEL_F8
          subs    I, I, #1
          b.ne    .Lznrm2_kernel_F8

  .Lznrm2_kernel_F1:
          ands    I, N, #7                     // leftover elements
          b.le    .Lznrm2_kernel_L999

  .Lznrm2_kernel_F10:
          KERNEL_F1
          subs    I, I, #1
          b.ne    .Lznrm2_kernel_F10

          b       .Lznrm2_kernel_L999

  .Lznrm2_kernel_S_BEGIN:
          INIT_S                               // INC_X becomes a byte step
          mov     I, N

          .align  5
  .Lznrm2_kernel_S10:
          KERNEL_S1
          subs    I, I, #1
          b.ne    .Lznrm2_kernel_S10

  .Lznrm2_kernel_L999:
          fsqrt   SSQ, SSQ
          fmul    SSQ, SCALE, SSQ              // result = SCALE * sqrt(SSQ)
          ret

  EPILOGUE