You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zscal.S 8.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /*
   * Complex vector scale (LoongArch64):  X[k] <- alpha * X[k],  k = 0 .. N-1,
   * where alpha = ALPHA_R + i*ALPHA_I and every element of X is stored as an
   * interleaved (real, imaginary) pair.
   *
   * Inputs, as bound by the register aliases below:
   *   N        ($r4)  element count; the routine returns at once when N <= 0
   *   X        ($r7)  address of the first element
   *   INCX     ($r8)  stride between elements; shifted left by ZBASE_SHIFT on
   *                   entry (presumably turning an element count into a byte
   *                   stride - ZBASE_SHIFT is defined in common.h)
   *   ALPHA_R  ($f0)  real part of alpha
   *   ALPHA_I  ($f1)  imaginary part of alpha
   * Scratch: I ($r17), TEMP ($r18), XX ($r5), a1-a8, t1-t4.
   *
   * Four code paths:
   *   .L12/.L16  alpha == 0, stride == 2*SIZE : store zeros
   *   .L22/.L26  alpha == 0, other strides    : store zeros
   *   .L52/.L56  alpha != 0, stride == 2*SIZE : software-pipelined multiply
   *   .L62/.L66  alpha != 0, other strides    : software-pipelined multiply
   * Each path handles groups of four elements first (N >> 2 groups) and then
   * the remaining N & 3 elements one at a time.
   *
   * In the alpha == 0 paths X is overwritten without being read.
   *
   * Per-element arithmetic in the alpha != 0 paths:
   *   re' = ALPHA_R * re - ALPHA_I * im
   *   im' = ALPHA_I * re + ALPHA_R * im
   * (assuming MADD d,a,b,c gives a*b + c and NMSUB d,a,b,c gives c - a*b;
   * both macros, like LD/ST/MUL/MTC/CMPEQ/SIZE, come from common.h - confirm
   * there).
   *
   * Every exit copies I into $r4 and a1 into $f0 before returning through
   * $r1.
   */
#define ASSEMBLER
#include "common.h"

/* Integer registers */
#define N       $r4     /* element count */
#define X       $r7     /* current load (and, in most paths, store) address */
#define INCX    $r8     /* stride, scaled by ZBASE_SHIFT at entry */
#define I       $r17    /* loop counter */
#define TEMP    $r18    /* 2 * SIZE, the stride of a contiguous vector */
#define XX      $r5     /* store address in the strided multiply path */

/* Floating-point registers */
#define ALPHA_R $f0
#define ALPHA_I $f1
#define a1      $f22    /* zero constant first, later real part of element 0 */
#define a2      $f8     /* imaginary part of element 0 */
#define a3      $f23    /* real part of element 1 */
#define a4      $f9     /* imaginary part of element 1 */
#define a5      $f10    /* real part of element 2 */
#define a6      $f11    /* imaginary part of element 2 */
#define a7      $f12    /* real part of element 3 */
#define a8      $f13    /* imaginary part of element 3 */
#define t1      $f14    /* result accumulators */
#define t2      $f15
#define t3      $f16
#define t4      $f17

    PROLOGUE

    li.d    TEMP, 2 * SIZE              /* stride of a contiguous vector */
    MTC     a1, $r0                     /* a1 <- $r0 (zero) */
    slli.d  INCX, INCX, ZBASE_SHIFT
    bge     $r0, N, .L999               /* N <= 0: nothing to do */

    /* Take the multiply path unless both parts of alpha compare equal to a1. */
    CMPEQ   $fcc0, ALPHA_R, a1
    CMPEQ   $fcc1, ALPHA_I, a1
    bceqz   $fcc0, .L50
    bceqz   $fcc1, .L50

    /* ---------------- alpha == 0: fill X with zeros ---------------- */
    srai.d  I, N, 2                     /* I = number of 4-element groups */
    bne     INCX, TEMP, .L20            /* not contiguous */
    bge     $r0, I, .L15
    .align 3

.L12:                                   /* contiguous, 4 elements per pass */
    ST      a1, X, 0 * SIZE
    ST      a1, X, 1 * SIZE
    ST      a1, X, 2 * SIZE
    ST      a1, X, 3 * SIZE
    ST      a1, X, 4 * SIZE
    ST      a1, X, 5 * SIZE
    ST      a1, X, 6 * SIZE
    ST      a1, X, 7 * SIZE
    /* I is a full 64-bit count (srai.d above), so it is decremented with
       addi.d like every other counter in this routine; a 32-bit addi.w
       would truncate it and end the loop early for very large N. */
    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L12
    .align 3

.L15:
    andi    I, N, 3                     /* leftover elements */
    bge     $r0, I, .L999
    .align 3

.L16:                                   /* contiguous, 1 element per pass */
    ST      a1, X, 0 * SIZE
    ST      a1, X, 1 * SIZE
    addi.d  I, I, -1
    addi.d  X, X, 2 * SIZE
    blt     $r0, I, .L16

    move    $r4, $r17
    fmov.d  $f0, $f22
    jirl    $r0, $r1, 0x0
    .align 3

.L20:                                   /* alpha == 0, strided */
    srai.d  I, N, 2
    bge     $r0, I, .L25
    .align 3

.L22:                                   /* strided, 4 elements per pass */
    ST      a1, X, 0 * SIZE
    ST      a1, X, 1 * SIZE
    add.d   X, X, INCX
    ST      a1, X, 0 * SIZE
    ST      a1, X, 1 * SIZE
    add.d   X, X, INCX
    ST      a1, X, 0 * SIZE
    ST      a1, X, 1 * SIZE
    add.d   X, X, INCX
    ST      a1, X, 0 * SIZE
    ST      a1, X, 1 * SIZE
    addi.d  I, I, -1
    add.d   X, X, INCX
    blt     $r0, I, .L22
    .align 3

.L25:
    andi    I, N, 3                     /* leftover elements */
    bge     $r0, I, .L999
    .align 3

.L26:                                   /* strided, 1 element per pass */
    ST      a1, X, 0 * SIZE
    addi.d  I, I, -1
    ST      a1, X, 1 * SIZE
    add.d   X, X, INCX
    blt     $r0, I, .L26

    move    $r4, $r17
    fmov.d  $f0, $f22
    jirl    $r0, $r1, 0x0
    .align 3

    /* ---------------- alpha != 0: complex multiply ---------------- */
.L50:
    srai.d  I, N, 2                     /* I = number of 4-element groups */
    bne     INCX, TEMP, .L60            /* not contiguous */
    addi.d  I, I, -1                    /* I = groups after the first one */
    blt     I, $r0, .L55                /* fewer than 4 elements */

    /* Pipeline start-up: load the first group and begin its products. */
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    LD      a3, X, 2 * SIZE
    LD      a4, X, 3 * SIZE
    LD      a5, X, 4 * SIZE
    LD      a6, X, 5 * SIZE
    MUL     t1, ALPHA_R, a1
    LD      a7, X, 6 * SIZE
    MUL     t2, ALPHA_I, a1
    LD      a8, X, 7 * SIZE
    MUL     t3, ALPHA_R, a3
    MUL     t4, ALPHA_I, a3
    bge     $r0, I, .L53                /* only one group: drain it */
    .align 3

.L52:   /* Finish and store the current group while loading the next one. */
    NMSUB   t1, a2, ALPHA_I, t1
    LD      a1, X, 8 * SIZE
    MADD    t2, a2, ALPHA_R, t2
    LD      a2, X, 9 * SIZE
    NMSUB   t3, a4, ALPHA_I, t3
    LD      a3, X, 10 * SIZE
    MADD    t4, a4, ALPHA_R, t4
    LD      a4, X, 11 * SIZE
    ST      t1, X, 0 * SIZE
    MUL     t1, ALPHA_R, a5
    ST      t2, X, 1 * SIZE
    MUL     t2, ALPHA_I, a5
    ST      t3, X, 2 * SIZE
    MUL     t3, ALPHA_R, a7
    ST      t4, X, 3 * SIZE
    MUL     t4, ALPHA_I, a7
    NMSUB   t1, a6, ALPHA_I, t1
    LD      a5, X, 12 * SIZE
    MADD    t2, a6, ALPHA_R, t2
    LD      a6, X, 13 * SIZE
    NMSUB   t3, a8, ALPHA_I, t3
    LD      a7, X, 14 * SIZE
    MADD    t4, a8, ALPHA_R, t4
    LD      a8, X, 15 * SIZE
    ST      t1, X, 4 * SIZE
    MUL     t1, ALPHA_R, a1
    ST      t2, X, 5 * SIZE
    MUL     t2, ALPHA_I, a1
    ST      t3, X, 6 * SIZE
    MUL     t3, ALPHA_R, a3
    ST      t4, X, 7 * SIZE
    MUL     t4, ALPHA_I, a3
    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    blt     $r0, I, .L52
    .align 3

.L53:   /* Pipeline drain: finish the last loaded group, no further loads. */
    NMSUB   t1, a2, ALPHA_I, t1
    MADD    t2, a2, ALPHA_R, t2
    NMSUB   t3, a4, ALPHA_I, t3
    MADD    t4, a4, ALPHA_R, t4
    ST      t1, X, 0 * SIZE
    MUL     t1, ALPHA_R, a5
    ST      t2, X, 1 * SIZE
    MUL     t2, ALPHA_I, a5
    ST      t3, X, 2 * SIZE
    MUL     t3, ALPHA_R, a7
    ST      t4, X, 3 * SIZE
    MUL     t4, ALPHA_I, a7
    NMSUB   t1, a6, ALPHA_I, t1
    MADD    t2, a6, ALPHA_R, t2
    NMSUB   t3, a8, ALPHA_I, t3
    MADD    t4, a8, ALPHA_R, t4
    ST      t1, X, 4 * SIZE
    ST      t2, X, 5 * SIZE
    ST      t3, X, 6 * SIZE
    ST      t4, X, 7 * SIZE
    addi.d  X, X, 8 * SIZE
    .align 3

.L55:
    andi    I, N, 3                     /* leftover elements */
    bge     $r0, I, .L999
    .align 3

.L56:                                   /* contiguous, 1 element per pass */
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    MUL     t1, ALPHA_R, a1
    MUL     t2, ALPHA_I, a1
    NMSUB   t1, a2, ALPHA_I, t1
    MADD    t2, a2, ALPHA_R, t2
    addi.d  X, X, 2 * SIZE
    addi.d  I, I, -1
    ST      t1, X, -2 * SIZE            /* X already advanced: store behind it */
    ST      t2, X, -1 * SIZE
    blt     $r0, I, .L56

    move    $r4, $r17
    fmov.d  $f0, $f22
    jirl    $r0, $r1, 0x0
    .align 3

.L60:                                   /* alpha != 0, strided */
    srai.d  I, N, 2
    move    XX, X                       /* XX = store address; X runs ahead for loads */
    addi.d  I, I, -1                    /* I = groups after the first one */
    blt     I, $r0, .L65                /* fewer than 4 elements */

    /* Pipeline start-up: load the first group and begin its products. */
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      a3, X, 0 * SIZE
    LD      a4, X, 1 * SIZE
    add.d   X, X, INCX
    LD      a5, X, 0 * SIZE
    LD      a6, X, 1 * SIZE
    add.d   X, X, INCX
    MUL     t1, ALPHA_R, a1
    LD      a7, X, 0 * SIZE
    MUL     t2, ALPHA_I, a1
    LD      a8, X, 1 * SIZE
    MUL     t3, ALPHA_R, a3
    add.d   X, X, INCX
    MUL     t4, ALPHA_I, a3
    bge     $r0, I, .L63                /* only one group: drain it */
    .align 3

.L62:   /* Finish and store the current group via XX while loading the next via X. */
    NMSUB   t1, a2, ALPHA_I, t1
    LD      a1, X, 0 * SIZE
    MADD    t2, a2, ALPHA_R, t2
    LD      a2, X, 1 * SIZE
    add.d   X, X, INCX
    NMSUB   t3, a4, ALPHA_I, t3
    LD      a3, X, 0 * SIZE
    MADD    t4, a4, ALPHA_R, t4
    LD      a4, X, 1 * SIZE
    add.d   X, X, INCX
    ST      t1, XX, 0 * SIZE
    MUL     t1, ALPHA_R, a5
    ST      t2, XX, 1 * SIZE
    MUL     t2, ALPHA_I, a5
    add.d   XX, XX, INCX
    ST      t3, XX, 0 * SIZE
    MUL     t3, ALPHA_R, a7
    ST      t4, XX, 1 * SIZE
    MUL     t4, ALPHA_I, a7
    add.d   XX, XX, INCX
    NMSUB   t1, a6, ALPHA_I, t1
    LD      a5, X, 0 * SIZE
    MADD    t2, a6, ALPHA_R, t2
    LD      a6, X, 1 * SIZE
    add.d   X, X, INCX
    NMSUB   t3, a8, ALPHA_I, t3
    LD      a7, X, 0 * SIZE
    MADD    t4, a8, ALPHA_R, t4
    LD      a8, X, 1 * SIZE
    add.d   X, X, INCX
    ST      t1, XX, 0 * SIZE
    MUL     t1, ALPHA_R, a1
    ST      t2, XX, 1 * SIZE
    MUL     t2, ALPHA_I, a1
    add.d   XX, XX, INCX
    ST      t3, XX, 0 * SIZE
    MUL     t3, ALPHA_R, a3
    ST      t4, XX, 1 * SIZE
    MUL     t4, ALPHA_I, a3
    addi.d  I, I, -1
    add.d   XX, XX, INCX
    blt     $r0, I, .L62
    .align 3

.L63:   /* Pipeline drain: finish the last loaded group, no further loads. */
    NMSUB   t1, a2, ALPHA_I, t1
    MADD    t2, a2, ALPHA_R, t2
    NMSUB   t3, a4, ALPHA_I, t3
    MADD    t4, a4, ALPHA_R, t4
    ST      t1, XX, 0 * SIZE
    MUL     t1, ALPHA_R, a5
    ST      t2, XX, 1 * SIZE
    MUL     t2, ALPHA_I, a5
    add.d   XX, XX, INCX
    ST      t3, XX, 0 * SIZE
    MUL     t3, ALPHA_R, a7
    ST      t4, XX, 1 * SIZE
    MUL     t4, ALPHA_I, a7
    add.d   XX, XX, INCX
    NMSUB   t1, a6, ALPHA_I, t1
    MADD    t2, a6, ALPHA_R, t2
    NMSUB   t3, a8, ALPHA_I, t3
    MADD    t4, a8, ALPHA_R, t4
    ST      t1, XX, 0 * SIZE
    ST      t2, XX, 1 * SIZE
    add.d   XX, XX, INCX
    ST      t3, XX, 0 * SIZE
    ST      t4, XX, 1 * SIZE
    add.d   XX, XX, INCX
    .align 3

.L65:
    andi    I, N, 3                     /* leftover elements */
    bge     $r0, I, .L999
    .align 3

.L66:   /* strided, 1 element per pass; X is used for both load and store */
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    MUL     t1, ALPHA_R, a1
    MUL     t2, ALPHA_I, a1
    NMSUB   t1, a2, ALPHA_I, t1
    MADD    t2, a2, ALPHA_R, t2
    addi.d  I, I, -1
    ST      t1, X, 0 * SIZE
    ST      t2, X, 1 * SIZE
    add.d   X, X, INCX
    blt     $r0, I, .L66
    .align 3

.L999:                                  /* common exit */
    move    $r4, $r17
    fmov.d  $f0, $f22
    jirl    $r0, $r1, 0x0

    EPILOGUE