You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

nrm2.S 4.6 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer registers, named by the role they play in the code below. */
  #define N       x0      /* element count */
  #define X       x1      /* address of the element to load next */
  #define INC_X   x2      /* stride; INIT_S converts it from elements to bytes */
  #define I       x3      /* loop counter */

  /*
   * Floating-point registers, chosen by precision.  SSQ and SCALE carry the
   * running pair whose final combination is SCALE * sqrt(SSQ); REGZERO and
   * REGONE hold the constants 0.0 and 1.0 once INIT has run.
   */
  #if defined(DOUBLE)
  #define SSQ     d0
  #define SCALE   d1
  #define REGZERO d5
  #define REGONE  d6
  #else
  #define SSQ     s0
  #define SCALE   s1
  #define REGZERO s5
  #define REGONE  s6
  #endif
  44. /*******************************************************************************
  45. * Macro definitions
  46. *******************************************************************************/
  /*
   * KERNEL_F1: fold one contiguous element into the running (SCALE, SSQ) pair.
   *
   * Loads the element at X and post-increments X by one element.  An element
   * that compares equal to REGZERO is skipped.  Otherwise, with a = |element|:
   *   SCALE >= a : SSQ = SSQ + (a / SCALE)^2
   *   otherwise  : SSQ = 1 + SSQ * (SCALE / a)^2, then SCALE = a
   *
   * Scratch: s2-s4 (single) or d2-d4 (double) and the condition flags.
   */
  .macro KERNEL_F1
  #if !defined(DOUBLE)
      ldr     s4, [X], #4
      fcmp    s4, REGZERO
      b.eq    KERNEL_F1_NEXT_\@
      fabs    s4, s4
      fcmp    SCALE, s4
      b.lt    KERNEL_F1_RESCALE_\@        // exact complement of the GE test
      fdiv    s2, s4, SCALE               // ratio = a / SCALE
      fmla    SSQ, s2, v2.s[0]            // SSQ += ratio * ratio
      b       KERNEL_F1_NEXT_\@
  KERNEL_F1_RESCALE_\@:
      fdiv    s2, SCALE, s4               // ratio = SCALE / a
      fmul    s2, s2, s2
      fmul    s3, SSQ, s2
      fadd    SSQ, REGONE, s3             // SSQ = 1 + SSQ * ratio^2
      fmov    SCALE, s4                   // a becomes the new scale
  #else
      ldr     d4, [X], #8
      fcmp    d4, REGZERO
      b.eq    KERNEL_F1_NEXT_\@
      fabs    d4, d4
      fcmp    SCALE, d4
      b.lt    KERNEL_F1_RESCALE_\@        // exact complement of the GE test
      fdiv    d2, d4, SCALE               // ratio = a / SCALE
      fmla    SSQ, d2, v2.d[0]            // SSQ += ratio * ratio
      b       KERNEL_F1_NEXT_\@
  KERNEL_F1_RESCALE_\@:
      fdiv    d2, SCALE, d4               // ratio = SCALE / a
      fmul    d2, d2, d2
      fmul    d3, SSQ, d2
      fadd    SSQ, REGONE, d3             // SSQ = 1 + SSQ * ratio^2
      fmov    SCALE, d4                   // a becomes the new scale
  #endif
  KERNEL_F1_NEXT_\@:
  .endm
  /*
   * KERNEL_S1: fold one strided element into the running (SCALE, SSQ) pair.
   *
   * Loads the element at X without post-increment and, on every path, finishes
   * by adding INC_X to X.  An element that compares equal to REGZERO is
   * skipped.  Otherwise, with a = |element|:
   *   SCALE >= a : SSQ = SSQ + (a / SCALE)^2
   *   otherwise  : SSQ = 1 + SSQ * (SCALE / a)^2, then SCALE = a
   *
   * Every label carries the per-expansion macro counter suffix, as KERNEL_F1
   * does, so the macro can be expanded more than once without producing
   * duplicate symbol definitions.
   *
   * Scratch: s2-s4 (single) or d2-d4 (double) and the condition flags.
   */
  .macro KERNEL_S1
  #if !defined(DOUBLE)
      ldr     s4, [X]
      fcmp    s4, REGZERO
      beq     KERNEL_S1_NEXT_\@
      fabs    s4, s4
      fcmp    SCALE, s4
      bge     KERNEL_S1_SCALE_GE_X_\@
      fdiv    s2, SCALE, s4               // ratio = SCALE / a
      fmul    s2, s2, s2
      fmul    s3, SSQ, s2
      fadd    SSQ, REGONE, s3             // SSQ = 1 + SSQ * ratio^2
      fmov    SCALE, s4                   // a becomes the new scale
      b       KERNEL_S1_NEXT_\@
  KERNEL_S1_SCALE_GE_X_\@:
      fdiv    s2, s4, SCALE               // ratio = a / SCALE
      fmla    SSQ, s2, v2.s[0]            // SSQ += ratio * ratio
  #else
      ldr     d4, [X]
      fcmp    d4, REGZERO
      beq     KERNEL_S1_NEXT_\@
      fabs    d4, d4
      fcmp    SCALE, d4
      bge     KERNEL_S1_SCALE_GE_X_\@
      fdiv    d2, SCALE, d4               // ratio = SCALE / a
      fmul    d2, d2, d2
      fmul    d3, SSQ, d2
      fadd    SSQ, REGONE, d3             // SSQ = 1 + SSQ * ratio^2
      fmov    SCALE, d4                   // a becomes the new scale
      b       KERNEL_S1_NEXT_\@
  KERNEL_S1_SCALE_GE_X_\@:
      fdiv    d2, d4, SCALE               // ratio = a / SCALE
      fmla    SSQ, d2, v2.d[0]            // SSQ += ratio * ratio
  #endif
  KERNEL_S1_NEXT_\@:
      add     X, X, INC_X                 // step to the next strided element
  .endm
  /*
   * KERNEL_F8: eight consecutive KERNEL_F1 expansions, i.e. eight contiguous
   * elements folded into (SCALE, SSQ) with no loop-counter work in between.
   */
  .macro KERNEL_F8
      .rept   8
      KERNEL_F1
      .endr
  .endm
  /*
   * INIT_S: turn INC_X from an element count into a byte offset by shifting it
   * left by log2 of the element size (3 when DOUBLE is defined, 2 otherwise).
   */
  .macro INIT_S
  #if defined(DOUBLE)
      lsl     INC_X, INC_X, #3            // INC_X * 8
  #else
      lsl     INC_X, INC_X, #2            // INC_X * 4
  #endif
  .endm
  /*
   * INIT: establish the starting state SSQ = 1.0, SCALE = 0.0 and load the
   * constant registers REGONE = 1.0 and REGZERO = 0.0.
   */
  .macro INIT
      fmov    SSQ, #1.0
      fmov    REGONE, #1.0
      eor     v1.16b, v1.16b, v1.16b      // v1 is the vector view of SCALE: 0.0
      fmov    REGZERO, SCALE
  .endm
  143. /*******************************************************************************
  144. * End of macro definitions
  145. *******************************************************************************/
  /*
   * Routine body.
   *
   * Inputs:  N (x0) element count, X (x1) first element, INC_X (x2) stride in
   *          elements.
   * Output:  SCALE * sqrt(SSQ), left in the SSQ register (s0 or d0).
   *
   * When N <= 0 or INC_X == 0 both loops are skipped, so the tail combines the
   * INIT values (SCALE = 0.0, SSQ = 1.0).  A stride of exactly 1 uses the
   * contiguous kernels, eight elements per iteration plus a remainder loop;
   * every other stride uses the strided kernel one element at a time.
   */
  PROLOGUE
      .align  5
      INIT

      cmp     N, #0
      b.le    nrm2_kernel_L999            // no elements to accumulate
      cbz     INC_X, nrm2_kernel_L999     // zero stride skips the loops too
      cmp     INC_X, #1
      b.ne    nrm2_kernel_S_BEGIN

  nrm2_kernel_F_BEGIN:
      asr     I, N, #3                    // I = N / 8
      cbz     I, nrm2_kernel_F1           // N > 0 here, so I is never negative

  nrm2_kernel_F8:
      KERNEL_F8
      subs    I, I, #1
      b.ne    nrm2_kernel_F8

  nrm2_kernel_F1:
      ands    I, N, #7                    // elements left after the blocks of 8
      b.eq    nrm2_kernel_L999

  nrm2_kernel_F10:
      KERNEL_F1
      subs    I, I, #1
      b.ne    nrm2_kernel_F10
      b       nrm2_kernel_L999

  nrm2_kernel_S_BEGIN:
      INIT_S                              // stride becomes a byte offset
      mov     I, N
      .align  5

  nrm2_kernel_S10:
      KERNEL_S1
      subs    I, I, #1
      b.ne    nrm2_kernel_S10

  nrm2_kernel_L999:
      fsqrt   SSQ, SSQ
      fmul    SSQ, SCALE, SSQ             // result = SCALE * sqrt(SSQ)
      ret
  EPILOGUE