You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

ccopy_lsx.S 9.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. /***************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  // Register aliases used throughout this routine.

  // Routine operands: element count, source pointer and stride,
  // destination pointer and stride. The strides are rescaled in place by
  // the entry code before any loop runs.
  #define N       $r4
  #define X       $r5
  #define INCX    $r6
  #define Y       $r7
  #define INCY    $r8

  // I is the loop counter of every loop below; TEMP holds the stride value
  // that the entry code compares INCX and INCY against.
  #define I       $r17
  #define TEMP    $r18

  // Integer scratch: carries values loaded from a strided source until they
  // are inserted into a vector register.
  #define t1      $r14
  #define t2      $r15
  #define t3      $r16
  #define t4      $r19

  // Scalar scratch used with the LD / ST macros in the element-at-a-time loops.
  #define a1      $f12
  #define a2      $f13
  #define a3      $f14
  #define a4      $f15

  // Vector scratch used by the vld / vst / vstelm / vinsgr2vr loops.
  #define VX0     $vr12
  #define VX1     $vr13
  #define VX2     $vr14
  #define VX3     $vr15
  PROLOGUE

      // Nothing to do when N <= 0.
      bge         $r0, N, .L999

      // I = N / 8: number of full 8-element groups for the unrolled loops.
      srai.d      I, N, 3

      // Scale both increments by ZBASE_SHIFT (defined outside this file;
      // presumably log2 of the byte size of one element - confirm).
      slli.d      INCX, INCX, ZBASE_SHIFT
      slli.d      INCY, INCY, ZBASE_SHIFT

      // TEMP = 1 << ZBASE_SHIFT, i.e. what an increment of 1 scales to.
      li.d        TEMP, 1
      slli.d      TEMP, TEMP, ZBASE_SHIFT

      // Pick one of four loops depending on which increments equal 1.
      bne         INCX, TEMP, .L20
      beq         INCY, TEMP, .L11        // INCX==1 and INCY==1
      b           .L12                    // INCX==1 and INCY!=1
  .L20:
      beq         INCY, TEMP, .L21        // INCX!=1 and INCY==1
      b           .L22                    // INCX!=1 and INCY!=1
  .L11: // INCX==1 and INCY==1
      // Both sides are contiguous: move 16 SIZE-sized values (8 elements of
      // two values each) per iteration with whole-vector loads and stores.
      bge         $r0, I, .L112           // fewer than 8 elements: tail only
      .align 3

  .L111:
  #ifdef DOUBLE
      // Two values per vector: four vectors, stored, then four more.
      vld         VX0, X,  0 * SIZE
      vld         VX1, X,  2 * SIZE
      vld         VX2, X,  4 * SIZE
      vld         VX3, X,  6 * SIZE
      vst         VX0, Y,  0 * SIZE
      vst         VX1, Y,  2 * SIZE
      vst         VX2, Y,  4 * SIZE
      vst         VX3, Y,  6 * SIZE
      vld         VX0, X,  8 * SIZE
      vld         VX1, X, 10 * SIZE
      vld         VX2, X, 12 * SIZE
      vld         VX3, X, 14 * SIZE
      vst         VX0, Y,  8 * SIZE
      vst         VX1, Y, 10 * SIZE
      vst         VX2, Y, 12 * SIZE
      vst         VX3, Y, 14 * SIZE
  #else
      // Four values per vector: four vectors cover the whole group.
      vld         VX0, X,  0 * SIZE
      vld         VX1, X,  4 * SIZE
      vld         VX2, X,  8 * SIZE
      vld         VX3, X, 12 * SIZE
      vst         VX0, Y,  0 * SIZE
      vst         VX1, Y,  4 * SIZE
      vst         VX2, Y,  8 * SIZE
      vst         VX3, Y, 12 * SIZE
  #endif
      addi.d      I, I, -1
      addi.d      X, X, 16 * SIZE
      addi.d      Y, Y, 16 * SIZE
      blt         $r0, I, .L111
      .align 3

  .L112:
      // Tail: the N % 8 elements that did not fill a group.
      andi        I, N, 7
      bge         $r0, I, .L999
      .align 3

  .L113:
      // One element (two SIZE-sized values) per iteration.
      LD          a1, X, 0 * SIZE
      LD          a2, X, 1 * SIZE
      ST          a1, Y, 0 * SIZE
      ST          a2, Y, 1 * SIZE
      addi.d      X, X, 2 * SIZE
      addi.d      Y, Y, 2 * SIZE
      addi.d      I, I, -1
      blt         $r0, I, .L113
      b           .L999
      .align 3
  .L12: // INCX==1 and INCY!=1
      // Contiguous source, strided destination: load whole vectors from X and
      // scatter them to Y one element (two lanes) at a time with vstelm.
      bge         $r0, I, .L122           // fewer than 8 elements: tail only
      .align 3

  .L121:
  #ifdef DOUBLE
      // Each vector holds exactly one element (lanes 0 and 1).
      vld         VX0, X,  0 * SIZE
      vld         VX1, X,  2 * SIZE
      vld         VX2, X,  4 * SIZE
      vld         VX3, X,  6 * SIZE
      vstelm.d    VX0, Y, 0 * SIZE, 0
      vstelm.d    VX0, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.d    VX1, Y, 0 * SIZE, 0
      vstelm.d    VX1, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.d    VX2, Y, 0 * SIZE, 0
      vstelm.d    VX2, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.d    VX3, Y, 0 * SIZE, 0
      vstelm.d    VX3, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY

      vld         VX0, X,  8 * SIZE
      vld         VX1, X, 10 * SIZE
      vld         VX2, X, 12 * SIZE
      vld         VX3, X, 14 * SIZE
      vstelm.d    VX0, Y, 0 * SIZE, 0
      vstelm.d    VX0, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.d    VX1, Y, 0 * SIZE, 0
      vstelm.d    VX1, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.d    VX2, Y, 0 * SIZE, 0
      vstelm.d    VX2, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.d    VX3, Y, 0 * SIZE, 0
      vstelm.d    VX3, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
  #else
      // Each vector holds two elements: lanes 0-1 and lanes 2-3.
      vld         VX0, X,  0 * SIZE
      vld         VX1, X,  4 * SIZE
      vld         VX2, X,  8 * SIZE
      vld         VX3, X, 12 * SIZE
      vstelm.w    VX0, Y, 0 * SIZE, 0
      vstelm.w    VX0, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.w    VX0, Y, 0 * SIZE, 2
      vstelm.w    VX0, Y, 1 * SIZE, 3
      add.d       Y, Y, INCY
      vstelm.w    VX1, Y, 0 * SIZE, 0
      vstelm.w    VX1, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.w    VX1, Y, 0 * SIZE, 2
      vstelm.w    VX1, Y, 1 * SIZE, 3
      add.d       Y, Y, INCY
      vstelm.w    VX2, Y, 0 * SIZE, 0
      vstelm.w    VX2, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.w    VX2, Y, 0 * SIZE, 2
      vstelm.w    VX2, Y, 1 * SIZE, 3
      add.d       Y, Y, INCY
      vstelm.w    VX3, Y, 0 * SIZE, 0
      vstelm.w    VX3, Y, 1 * SIZE, 1
      add.d       Y, Y, INCY
      vstelm.w    VX3, Y, 0 * SIZE, 2
      vstelm.w    VX3, Y, 1 * SIZE, 3
      add.d       Y, Y, INCY
  #endif
      addi.d      I, I, -1
      addi.d      X, X, 16 * SIZE
      blt         $r0, I, .L121
      .align 3

  .L122:
      // Tail: the N % 8 elements that did not fill a group.
      andi        I, N, 7
      bge         $r0, I, .L999
      .align 3

  .L123:
      // One element per iteration; X steps by two values, Y by its stride.
      LD          a1, X, 0 * SIZE
      LD          a2, X, 1 * SIZE
      ST          a1, Y, 0 * SIZE
      ST          a2, Y, 1 * SIZE
      addi.d      X, X, 2 * SIZE
      add.d       Y, Y, INCY
      addi.d      I, I, -1
      blt         $r0, I, .L123
      b           .L999
      .align 3
  .L21: // INCX!=1 and INCY==1
      // Strided source, contiguous destination: gather elements from X through
      // integer registers into vector lanes, then store whole vectors to Y.
      bge         $r0, I, .L212           // fewer than 8 elements: tail only
      .align 3

  .L211:
  #ifdef DOUBLE
      // Elements 0 and 1 of the group.
      ld.d        t1, X, 0 * SIZE
      ld.d        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX0, t1, 0
      vinsgr2vr.d VX0, t2, 1
      ld.d        t3, X, 0 * SIZE
      ld.d        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX1, t3, 0
      vinsgr2vr.d VX1, t4, 1
      vst         VX0, Y,  0 * SIZE
      vst         VX1, Y,  2 * SIZE

      // Elements 2 and 3.
      ld.d        t1, X, 0 * SIZE
      ld.d        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX0, t1, 0
      vinsgr2vr.d VX0, t2, 1
      ld.d        t3, X, 0 * SIZE
      ld.d        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX1, t3, 0
      vinsgr2vr.d VX1, t4, 1
      vst         VX0, Y,  4 * SIZE
      vst         VX1, Y,  6 * SIZE

      // Elements 4 and 5.
      ld.d        t1, X, 0 * SIZE
      ld.d        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX0, t1, 0
      vinsgr2vr.d VX0, t2, 1
      ld.d        t3, X, 0 * SIZE
      ld.d        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX1, t3, 0
      vinsgr2vr.d VX1, t4, 1
      vst         VX0, Y,  8 * SIZE
      vst         VX1, Y, 10 * SIZE

      // Elements 6 and 7.
      ld.d        t1, X, 0 * SIZE
      ld.d        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX0, t1, 0
      vinsgr2vr.d VX0, t2, 1
      ld.d        t3, X, 0 * SIZE
      ld.d        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.d VX1, t3, 0
      vinsgr2vr.d VX1, t4, 1
      vst         VX0, Y, 12 * SIZE
      vst         VX1, Y, 14 * SIZE
  #else
      // Elements 0 and 1 fill VX0.
      ld.w        t1, X, 0 * SIZE
      ld.w        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX0, t1, 0
      vinsgr2vr.w VX0, t2, 1
      ld.w        t3, X, 0 * SIZE
      ld.w        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX0, t3, 2
      vinsgr2vr.w VX0, t4, 3

      // Elements 2 and 3 fill VX1.
      ld.w        t1, X, 0 * SIZE
      ld.w        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX1, t1, 0
      vinsgr2vr.w VX1, t2, 1
      ld.w        t3, X, 0 * SIZE
      ld.w        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX1, t3, 2
      vinsgr2vr.w VX1, t4, 3

      // Elements 4 and 5 fill VX2.
      ld.w        t1, X, 0 * SIZE
      ld.w        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX2, t1, 0
      vinsgr2vr.w VX2, t2, 1
      ld.w        t3, X, 0 * SIZE
      ld.w        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX2, t3, 2
      vinsgr2vr.w VX2, t4, 3

      // Elements 6 and 7 fill VX3.
      ld.w        t1, X, 0 * SIZE
      ld.w        t2, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX3, t1, 0
      vinsgr2vr.w VX3, t2, 1
      ld.w        t3, X, 0 * SIZE
      ld.w        t4, X, 1 * SIZE
      add.d       X, X, INCX
      vinsgr2vr.w VX3, t3, 2
      vinsgr2vr.w VX3, t4, 3

      // All sixteen values are gathered; write them out only now, after
      // every load of the group has been issued.
      vst         VX0, Y,  0 * SIZE
      vst         VX1, Y,  4 * SIZE
      vst         VX2, Y,  8 * SIZE
      vst         VX3, Y, 12 * SIZE
  #endif
      addi.d      I, I, -1
      addi.d      Y, Y, 16 * SIZE
      blt         $r0, I, .L211
      .align 3

  .L212:
      // Tail: the N % 8 elements that did not fill a group.
      andi        I, N, 7
      bge         $r0, I, .L999
      .align 3

  .L213:
      // One element per iteration; X steps by its stride, Y by two values.
      LD          a1, X, 0 * SIZE
      LD          a2, X, 1 * SIZE
      ST          a1, Y, 0 * SIZE
      ST          a2, Y, 1 * SIZE
      add.d       X, X, INCX
      addi.d      Y, Y, 2 * SIZE
      addi.d      I, I, -1
      blt         $r0, I, .L213
      b           .L999
      .align 3
  .L22: // INCX!=1 and INCY!=1
      // Both sides strided: no vector instructions are used, every element is
      // moved through the scalar registers a1-a4.
      bge         $r0, I, .L223           // fewer than 8 elements: tail only
      .align 3

  .L222:
      // Eight elements per iteration, emitted as four copies of a two-element
      // group. Within a group both elements are loaded before either is stored.
      .rept 4
      LD          a1, X, 0 * SIZE
      LD          a2, X, 1 * SIZE
      add.d       X, X, INCX
      LD          a3, X, 0 * SIZE
      LD          a4, X, 1 * SIZE
      add.d       X, X, INCX
      ST          a1, Y, 0 * SIZE
      ST          a2, Y, 1 * SIZE
      add.d       Y, Y, INCY
      ST          a3, Y, 0 * SIZE
      ST          a4, Y, 1 * SIZE
      add.d       Y, Y, INCY
      .endr
      addi.d      I, I, -1
      blt         $r0, I, .L222
      .align 3

  .L223:
      // Tail: the N % 8 elements that did not fill a group.
      andi        I, N, 7
      bge         $r0, I, .L999
      .align 3

  .L224:
      // One element per iteration; both pointers step by their own stride.
      LD          a1, X, 0 * SIZE
      LD          a2, X, 1 * SIZE
      ST          a1, Y, 0 * SIZE
      ST          a2, Y, 1 * SIZE
      add.d       X, X, INCX
      add.d       Y, Y, INCY
      addi.d      I, I, -1
      blt         $r0, I, .L224
      // No branch needed here: execution falls through into the exit label
      // that follows this block.
      .align 3
  .L999:
      // Common exit for every path, including the early N <= 0 return.
      //
      // The previous code copied $r12 into $r4 here, but $r12 is never written
      // anywhere in this routine, so whatever the caller had left in that
      // scratch register was handed back. Return a well-defined 0 instead by
      // copying the hard-wired zero register.
      // NOTE(review): assumes $r4 is the return-value register and that
      // callers expect 0 from this kernel - confirm against its C prototype.
      move        $r4, $r0
      jirl        $r0, $r1, 0x0           // jump to the address held in $r1
      .align 3
      EPILOGUE