You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dnrm2.S 7.5 kB


  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer registers. */
  29. #define N $r4 /* element count; decremented by one after the first load, restored before the sum-of-squares loops */
  30. #define X $r5 /* address of the current element while searching for the maximum */
  31. #define INCX $r6 /* stride; shifted left by BASE_SHIFT at entry, then added to X / XX */
  32. #define XX $r7 /* copy of the initial X, walked by the sum-of-squares loops */
  33. #define I $r17 /* loop counter (N >> 3 for unrolled loops, N & 7 for remainders) */
  34. #define TEMP $r18 /* scratch: holds the bit pattern moved into ALPHA */
  /* Floating-point registers: a1..a8 hold the eight elements loaded per unrolled iteration. */
  35. #define a1 $f10
  36. #define a2 $f11
  37. #define a3 $f12
  38. #define a4 $f13
  39. #define a5 $f14
  40. #define a6 $f15
  41. #define a7 $f16
  42. #define a8 $f17
  /* t1..t4: per-element temporaries (FABS results, later ALPHA * element). */
  43. #define t1 $f0
  44. #define t2 $f1
  45. #define t3 $f2
  46. #define t4 $f3
  /* s1..s4: four independent accumulators (running maxima, later partial sums). */
  /* s1 is $f22, the register copied to $f0 on the .L999 exit path. */
  47. #define s1 $f22
  48. #define s2 $f8
  49. #define s3 $f23
  50. #define s4 $f9
  51. #define ALPHA $f4 /* 1.0 divided by the maximum found */
  52. #define max $f5 /* saved copy of the maximum, multiplied back in at .L998 */
  /*
   * Two-step scaled 2-norm of a strided vector.
   *   Step 1 (.L12 / .L13 / .L16, reduced at .L100): find the largest
   *   FABS(x[i]) over all elements.
   *   Step 2 (.L103 / .L104 / .L106, reduced at .L998): accumulate
   *   (x[i] * (1/max))^2 and return max * sqrt(sum) in $f0.
   * Inputs: N = element count, X = address of the first element, INCX =
   * stride (shifted left by BASE_SHIFT at entry, then added to the pointers).
   * Early exits go through .L999, which returns the current s1 ($f22) in $f0:
   *   N <= 0 or scaled INCX <= 0 -> the value written by "MTC s1, $r0";
   *   N == 1                     -> FABS of the first element;
   *   max compares equal to 0    -> that max.
   * LD, FABS, CMPLT, CMPEQ, CMOVT, MUL, MADD, ADD, MOV, MTC, LDINT, NOP,
   * SIZE and BASE_SHIFT are defined outside this file (common.h); the
   * comments below assume their conventional meanings -- TODO confirm.
   */
  53. PROLOGUE
  54. #ifdef F_INTERFACE
  /* With F_INTERFACE, N and INCX hold addresses; presumably LDINT loads the integers through them. */
  55. LDINT N, 0(N)
  56. LDINT INCX, 0(INCX)
  57. #endif
  58. MTC s1, $r0 /* presumably s1 = 0: the value returned by the early exits below */
  59. bge $r0, N, .L999 /* N <= 0: nothing to do */
  60. slli.d INCX, INCX, BASE_SHIFT
  61. bge $r0, INCX, .L999 /* non-positive stride: return s1 unchanged */
  62. move XX, X /* remember the start of the vector for step 2 */
  63. NOP
  /* Seed all four running maxima with FABS of the first element. */
  64. LD a1, X, 0 * SIZE
  65. addi.d N, N, -1 /* N now counts the elements after the first */
  66. add.d X, X, INCX
  67. FABS s1, a1
  68. FABS s2, a1
  69. bge $r0, N, .L999 /* single element: s1 already holds the result */
  70. FABS s3, a1
  71. srai.d I, N, 3 /* I = number of full groups of eight */
  72. FABS s4, a1
  73. bge $r0, I, .L15 /* fewer than eight left: remainder loop only */
  /* Preload the first group of eight elements into a1..a8. */
  74. LD a1, X, 0 * SIZE
  75. add.d X, X, INCX
  76. LD a2, X, 0 * SIZE
  77. add.d X, X, INCX
  78. LD a3, X, 0 * SIZE
  79. add.d X, X, INCX
  80. LD a4, X, 0 * SIZE
  81. add.d X, X, INCX
  82. LD a5, X, 0 * SIZE
  83. add.d X, X, INCX
  84. LD a6, X, 0 * SIZE
  85. add.d X, X, INCX
  86. LD a7, X, 0 * SIZE
  87. add.d X, X, INCX
  88. LD a8, X, 0 * SIZE
  89. addi.d I, I, -1
  90. add.d X, X, INCX
  91. bge $r0, I, .L13 /* only one group: skip straight to the drain block */
  92. .align 3
  /*
   * Steady state: fold the eight values loaded on the previous iteration
   * into s1..s4 (compare, then select) while loading the next eight.
   * Loads and arithmetic are interleaved; statement order matters because
   * each aN is consumed by FABS before the LD that overwrites it.
   */
  93. .L12:
  94. FABS t1, a1
  95. LD a1, X, 0 * SIZE
  96. FABS t2, a2
  97. add.d X, X, INCX
  98. FABS t3, a3
  99. LD a2, X, 0 * SIZE
  100. FABS t4, a4
  101. add.d X, X, INCX
  102. CMPLT $fcc0, s1, t1
  103. LD a3, X, 0 * SIZE
  104. CMPLT $fcc1, s2, t2
  105. add.d X, X, INCX
  106. CMPLT $fcc2, s3, t3
  107. LD a4, X, 0 * SIZE
  108. CMPLT $fcc3, s4, t4
  109. add.d X, X, INCX
  /* presumably sN = tN where the flag (sN < tN) is set, otherwise sN is kept */
  110. CMOVT s1, s1, t1, $fcc0
  111. CMOVT s2, s2, t2, $fcc1
  112. CMOVT s3, s3, t3, $fcc2
  113. CMOVT s4, s4, t4, $fcc3
  114. FABS t1, a5
  115. LD a5, X, 0 * SIZE
  116. FABS t2, a6
  117. add.d X, X, INCX
  118. FABS t3, a7
  119. LD a6, X, 0 * SIZE
  120. FABS t4, a8
  121. add.d X, X, INCX
  122. CMPLT $fcc0, s1, t1
  123. LD a7, X, 0 * SIZE
  124. CMPLT $fcc1, s2, t2
  125. add.d X, X, INCX
  126. CMPLT $fcc2, s3, t3
  127. LD a8, X, 0 * SIZE
  128. CMPLT $fcc3, s4, t4
  129. add.d X, X, INCX
  130. CMOVT s1, s1, t1, $fcc0
  131. addi.d I, I, -1
  132. CMOVT s2, s2, t2, $fcc1
  133. CMOVT s3, s3, t3, $fcc2
  134. CMOVT s4, s4, t4, $fcc3
  135. blt $r0, I, .L12
  136. .align 3
  /* Drain: fold the last loaded group of eight; no further loads. */
  137. .L13:
  138. FABS t1, a1
  139. FABS t2, a2
  140. FABS t3, a3
  141. FABS t4, a4
  142. CMPLT $fcc0, s1, t1
  143. CMPLT $fcc1, s2, t2
  144. CMPLT $fcc2, s3, t3
  145. CMPLT $fcc3, s4, t4
  146. CMOVT s1, s1, t1, $fcc0
  147. CMOVT s2, s2, t2, $fcc1
  148. CMOVT s3, s3, t3, $fcc2
  149. CMOVT s4, s4, t4, $fcc3
  150. FABS t1, a5
  151. FABS t2, a6
  152. FABS t3, a7
  153. FABS t4, a8
  154. CMPLT $fcc0, s1, t1
  155. CMPLT $fcc1, s2, t2
  156. CMPLT $fcc2, s3, t3
  157. CMPLT $fcc3, s4, t4
  158. CMOVT s1, s1, t1, $fcc0
  159. CMOVT s2, s2, t2, $fcc1
  160. CMOVT s3, s3, t3, $fcc2
  161. CMOVT s4, s4, t4, $fcc3
  162. .align 3
  /* Remainder: the last N & 7 elements, one per iteration, folded into s1 only. */
  163. .L15:
  164. andi I, N, 7
  165. bge $r0, I, .L100
  166. .align 3
  167. .L16:
  168. LD a1, X, 0 * SIZE
  169. addi.d I, I, -1
  170. FABS t1, a1
  171. CMPLT $fcc0, s1, t1
  172. CMOVT s1, s1, t1, $fcc0
  173. add.d X, X, INCX
  174. blt $r0, I, .L16
  175. .align 3
  /* Combine the four running maxima into s1, then set up step 2. */
  176. .L100:
  177. CMPLT $fcc0, s1, s2
  178. CMPLT $fcc1, s3, s4
  179. CMOVT s1, s1, s2, $fcc0
  180. CMOVT s3, s3, s4, $fcc1
  181. CMPLT $fcc0, s1, s3
  182. CMOVT s1, s1, s3, $fcc0
  183. addi.d N, N, 1 /* undo the earlier N - 1: step 2 walks every element from XX */
  184. lu12i.w TEMP, 0x3f800 /* TEMP = 0x3f800000, the single-precision bit pattern of 1.0 */
  185. movgr2fr.d a1, $r0 /* a1 = all-zero bits (+0.0), reused as the zero constant below */
  186. movgr2fr.w ALPHA, TEMP
  187. CMPEQ $fcc0, s1, a1
  188. fcvt.d.s ALPHA, ALPHA /* widen 1.0 to double precision */
  189. bcnez $fcc0, .L999 /* max == 0: return it without dividing */
  /*
   * NOTE(review): if s1 is infinite, ALPHA becomes 0 and 0 * Inf is NaN under
   * IEEE 754; if s1 is a very small subnormal, 1.0 / s1 can overflow to Inf.
   * Confirm whether either case needs handling for callers.
   */
  190. fdiv.d ALPHA, ALPHA, s1 /* ALPHA = 1.0 / max */
  191. MOV max, s1
  /* Clear the four accumulators; they now hold partial sums of squares. */
  192. MOV s1, a1
  193. MOV s2, a1
  194. MOV s3, a1
  195. MOV s4, a1
  196. srai.d I, N, 3 /* I = number of full groups of eight */
  197. bge $r0, I, .L105
  /* Preload the first group of eight elements, this time through XX. */
  198. LD a1, XX, 0 * SIZE
  199. add.d XX, XX, INCX
  200. LD a2, XX, 0 * SIZE
  201. add.d XX, XX, INCX
  202. LD a3, XX, 0 * SIZE
  203. add.d XX, XX, INCX
  204. LD a4, XX, 0 * SIZE
  205. add.d XX, XX, INCX
  206. LD a5, XX, 0 * SIZE
  207. add.d XX, XX, INCX
  208. LD a6, XX, 0 * SIZE
  209. add.d XX, XX, INCX
  210. LD a7, XX, 0 * SIZE
  211. add.d XX, XX, INCX
  212. LD a8, XX, 0 * SIZE
  213. addi.d I, I, -1
  214. add.d XX, XX, INCX
  215. bge $r0, I, .L104 /* only one group: skip straight to the drain block */
  216. .align 3
  /*
   * Steady state: scale the previously loaded eight values by ALPHA and add
   * their squares to s1..s4 (presumably sN = tN * tN + sN) while loading the
   * next eight. As in .L12, each aN is read before the LD that replaces it.
   */
  217. .L103:
  218. MUL t1, ALPHA, a1
  219. LD a1, XX, 0 * SIZE
  220. MUL t2, ALPHA, a2
  221. add.d XX, XX, INCX
  222. MUL t3, ALPHA, a3
  223. LD a2, XX, 0 * SIZE
  224. MUL t4, ALPHA, a4
  225. add.d XX, XX, INCX
  226. MADD s1, t1, t1, s1
  227. LD a3, XX, 0 * SIZE
  228. MADD s2, t2, t2, s2
  229. add.d XX, XX, INCX
  230. MADD s3, t3, t3, s3
  231. LD a4, XX, 0 * SIZE
  232. MADD s4, t4, t4, s4
  233. add.d XX, XX, INCX
  234. MUL t1, ALPHA, a5
  235. LD a5, XX, 0 * SIZE
  236. MUL t2, ALPHA, a6
  237. add.d XX, XX, INCX
  238. MUL t3, ALPHA, a7
  239. LD a6, XX, 0 * SIZE
  240. MUL t4, ALPHA, a8
  241. add.d XX, XX, INCX
  242. MADD s1, t1, t1, s1
  243. LD a7, XX, 0 * SIZE
  244. MADD s2, t2, t2, s2
  245. add.d XX, XX, INCX
  246. MADD s3, t3, t3, s3
  247. LD a8, XX, 0 * SIZE
  248. MADD s4, t4, t4, s4
  249. addi.d I, I, -1
  250. add.d XX, XX, INCX
  251. blt $r0, I, .L103
  252. .align 3
  /* Drain: accumulate the last loaded group of eight; no further loads. */
  253. .L104:
  254. MUL t1, ALPHA, a1
  255. MUL t2, ALPHA, a2
  256. MUL t3, ALPHA, a3
  257. MUL t4, ALPHA, a4
  258. MADD s1, t1, t1, s1
  259. MADD s2, t2, t2, s2
  260. MADD s3, t3, t3, s3
  261. MADD s4, t4, t4, s4
  262. MUL t1, ALPHA, a5
  263. MUL t2, ALPHA, a6
  264. MUL t3, ALPHA, a7
  265. MUL t4, ALPHA, a8
  266. MADD s1, t1, t1, s1
  267. MADD s2, t2, t2, s2
  268. MADD s3, t3, t3, s3
  269. MADD s4, t4, t4, s4
  270. .align 3
  /* Remainder: the last N & 7 elements, one per iteration, accumulated into s1 only. */
  271. .L105:
  272. andi I, N, 7
  273. bge $r0, I, .L998
  274. .align 3
  275. .L106:
  276. LD a1, XX, 0 * SIZE
  277. addi.d I, I, -1
  278. MUL t1, ALPHA, a1
  279. add.d XX, XX, INCX
  280. MADD s1, t1, t1, s1
  281. blt $r0, I, .L106
  282. .align 3
  /* Final reduction: s1 = s1 + s2 + s3 + s4, then $f0 = max * sqrt(s1). */
  283. .L998:
  284. ADD s1, s1, s2
  285. ADD s3, s3, s4
  286. ADD s1, s1, s3
  287. fsqrt.d s1, s1
  288. move $r4, $r17 /* NOTE(review): copies I into $r4; purpose not evident from this file */
  289. MUL $f0, max, s1 /* undo the 1/max scaling applied before squaring */
  290. jirl $r0, $r1, 0x0 /* return to the address in $r1 */
  291. .align 3
  /* Early-exit path: return whatever s1 ($f22) currently holds. */
  292. .L999:
  293. move $r4, $r17 /* NOTE(review): same unexplained copy of I into $r4 as at .L998 */
  294. fmov.d $f0, $f22
  295. jirl $r0, $r1, 0x0
  296. EPILOGUE