You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

swap.S 8.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used by the swap kernel below.
   * The names are plain preprocessor substitutions for LoongArch registers.
   */

/* Incoming operands: element count, the two vectors and their strides. */
#define N       $r4
#define X       $r7
#define INCX    $r8
#define Y       $r9
#define INCY    $r10

/* Integer scratch: I counts loop trips, TEMP holds SIZE for the stride test. */
#define I       $r17
#define TEMP    $r18

/* Store cursors for the strided path; X and Y themselves run ahead as load cursors. */
#define XX      $r5
#define YY      $r6

/* Eight values in flight from X. */
#define a1      $f22
#define a2      $f8
#define a3      $f23
#define a4      $f9
#define a5      $f10
#define a6      $f11
#define a7      $f12
#define a8      $f13

/* Eight values in flight from Y. */
#define b1      $f14
#define b2      $f15
#define b3      $f16
#define b4      $f17
#define b5      $f0
#define b6      $f1
#define b7      $f2
#define b8      $f3
  /*
   * Vector swap kernel: exchanges N elements between X and Y.
   *
   * In:    N     element count; N <= 0 returns without touching memory
   *        X, Y  the two vectors
   *        INCX, INCY  strides; both are shifted left by BASE_SHIFT on entry
   *                    and then used directly as address increments
   * Out:   $r4 = 0 on every path
   *        $f0 = copy of $f22 (a1); kept from the original code, its purpose
   *              is not evident from this file
   * Clobb: I, TEMP, XX, YY, X, Y, INCX, INCY, a1-a8, b1-b8
   *
   * LD, ST, SIZE and BASE_SHIFT come from common.h; presumably they select
   * the load/store width of the configured precision - confirm there.
   *
   * Structure: a contiguous path (.L12-.L16) taken when both scaled strides
   * equal SIZE, and a strided path (.L20-.L26) otherwise.  Each path handles
   * groups of 8 elements with the loads running one group ahead of the
   * stores, then finishes the remaining N & 7 elements one at a time.
   */
        PROLOGUE

        li.d    TEMP, SIZE
        slli.d  INCX, INCX, BASE_SHIFT
        bge     $r0, N, .L999               /* N <= 0: nothing to swap */
        slli.d  INCY, INCY, BASE_SHIFT
        bne     INCX, TEMP, .L20            /* X not contiguous -> strided path */
        srai.d  I, N, 3                     /* I = N / 8 */
        bne     INCY, TEMP, .L20            /* Y not contiguous -> strided path */

        /* ---- contiguous path ------------------------------------------- */
        addi.d  I, I, -1                    /* I = groups left after the preload */
        blt     I, $r0, .L15                /* fewer than 8 elements: tail only */

        /* Preload the first group of 8 from both vectors. */
        LD      a1, X, 0 * SIZE
        LD      b1, Y, 0 * SIZE
        LD      a2, X, 1 * SIZE
        LD      b2, Y, 1 * SIZE
        LD      a3, X, 2 * SIZE
        LD      b3, Y, 2 * SIZE
        LD      a4, X, 3 * SIZE
        LD      b4, Y, 3 * SIZE
        LD      a5, X, 4 * SIZE
        LD      b5, Y, 4 * SIZE
        LD      a6, X, 5 * SIZE
        LD      b6, Y, 5 * SIZE
        LD      a7, X, 6 * SIZE
        LD      b7, Y, 6 * SIZE
        LD      a8, X, 7 * SIZE
        LD      b8, Y, 7 * SIZE
        bge     $r0, I, .L13                /* exactly one group: just drain it */
        .align 3

        /* Store the group already in registers while loading the next one. */
.L12:
        ST      a1, Y, 0 * SIZE
        LD      a1, X, 8 * SIZE
        ST      b1, X, 0 * SIZE
        LD      b1, Y, 8 * SIZE
        ST      a2, Y, 1 * SIZE
        LD      a2, X, 9 * SIZE
        ST      b2, X, 1 * SIZE
        LD      b2, Y, 9 * SIZE
        ST      a3, Y, 2 * SIZE
        LD      a3, X, 10 * SIZE
        ST      b3, X, 2 * SIZE
        LD      b3, Y, 10 * SIZE
        ST      a4, Y, 3 * SIZE
        LD      a4, X, 11 * SIZE
        ST      b4, X, 3 * SIZE
        LD      b4, Y, 11 * SIZE
        ST      a5, Y, 4 * SIZE
        LD      a5, X, 12 * SIZE
        ST      b5, X, 4 * SIZE
        LD      b5, Y, 12 * SIZE
        ST      a6, Y, 5 * SIZE
        LD      a6, X, 13 * SIZE
        ST      b6, X, 5 * SIZE
        LD      b6, Y, 13 * SIZE
        ST      a7, Y, 6 * SIZE
        LD      a7, X, 14 * SIZE
        ST      b7, X, 6 * SIZE
        LD      b7, Y, 14 * SIZE
        ST      a8, Y, 7 * SIZE
        LD      a8, X, 15 * SIZE
        ST      b8, X, 7 * SIZE
        LD      b8, Y, 15 * SIZE
        addi.d  I, I, -1
        addi.d  X, X, 8 * SIZE
        addi.d  Y, Y, 8 * SIZE
        blt     $r0, I, .L12
        .align 3

        /* Drain: store the last preloaded group, no further loads. */
.L13:
        ST      a1, Y, 0 * SIZE
        ST      b1, X, 0 * SIZE
        ST      a2, Y, 1 * SIZE
        ST      b2, X, 1 * SIZE
        ST      a3, Y, 2 * SIZE
        ST      b3, X, 2 * SIZE
        ST      a4, Y, 3 * SIZE
        ST      b4, X, 3 * SIZE
        ST      a5, Y, 4 * SIZE
        ST      b5, X, 4 * SIZE
        ST      a6, Y, 5 * SIZE
        ST      b6, X, 5 * SIZE
        ST      a7, Y, 6 * SIZE
        ST      b7, X, 6 * SIZE
        ST      a8, Y, 7 * SIZE
        ST      b8, X, 7 * SIZE
        addi.d  X, X, 8 * SIZE
        addi.d  Y, Y, 8 * SIZE
        .align 3

        /* Tail: the remaining N & 7 elements, one per trip. */
.L15:
        andi    I, N, 7
        bge     $r0, I, .L999
        .align 3

.L16:
        LD      a1, X, 0 * SIZE
        LD      b1, Y, 0 * SIZE
        addi.d  X, X, SIZE
        addi.d  I, I, -1
        addi.d  Y, Y, SIZE
        ST      b1, X, -1 * SIZE            /* pointers already advanced */
        ST      a1, Y, -1 * SIZE
        blt     $r0, I, .L16
        b       .L999
        .align 3

        /* ---- strided path ---------------------------------------------- */
        /* X/Y are the load cursors; XX/YY trail behind as store cursors. */
.L20:
        srai.d  I, N, 3
        move    XX, X
        move    YY, Y
        addi.d  I, I, -1                    /* I = groups left after the preload */
        blt     I, $r0, .L25                /* fewer than 8 elements: tail only */

        /* Preload the first group of 8 from both vectors. */
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a2, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b2, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a3, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b3, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a4, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b4, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a5, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b5, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a6, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b6, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a7, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b7, Y, 0 * SIZE
        add.d   Y, Y, INCY
        LD      a8, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b8, Y, 0 * SIZE
        add.d   Y, Y, INCY
        bge     $r0, I, .L23                /* exactly one group: just drain it */
        .align 3

        /* Store the group already in registers while loading the next one. */
.L22:
        ST      a1, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b1, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a2, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a2, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b2, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b2, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a3, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a3, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b3, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b3, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a4, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a4, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b4, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b4, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a5, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a5, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b5, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b5, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a6, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a6, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b6, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b6, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a7, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a7, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b7, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b7, Y, 0 * SIZE
        add.d   Y, Y, INCY
        ST      a8, YY, 0 * SIZE
        add.d   YY, YY, INCY
        LD      a8, X, 0 * SIZE
        add.d   X, X, INCX
        ST      b8, XX, 0 * SIZE
        add.d   XX, XX, INCX
        LD      b8, Y, 0 * SIZE
        addi.d  I, I, -1
        add.d   Y, Y, INCY
        blt     $r0, I, .L22
        .align 3

        /* Drain: store the last preloaded group, no further loads. */
.L23:
        ST      a1, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b1, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a2, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b2, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a3, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b3, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a4, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b4, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a5, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b5, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a6, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b6, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a7, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b7, XX, 0 * SIZE
        add.d   XX, XX, INCX
        ST      a8, YY, 0 * SIZE
        add.d   YY, YY, INCY
        ST      b8, XX, 0 * SIZE
        add.d   XX, XX, INCX
        .align 3

        /* Tail: X/Y already point past every full group, so use them directly. */
.L25:
        andi    I, N, 7
        bge     $r0, I, .L999
        .align 3

.L26:
        LD      a1, X, 0 * SIZE
        LD      b1, Y, 0 * SIZE
        addi.d  I, I, -1
        ST      a1, Y, 0 * SIZE
        ST      b1, X, 0 * SIZE
        add.d   X, X, INCX
        add.d   Y, Y, INCY
        blt     $r0, I, .L26
        .align 3

.L999:
        /*
         * Return 0 explicitly.  The original copied I ($r17) here, which is 0
         * on every path that ran a loop but is never written on the N <= 0
         * early exit, so that path returned whatever the caller left in $r17.
         */
        move    $r4, $r0
        fmov.d  $f0, $f22
        jirl    $r0, $r1, 0x0

        EPILOGUE