You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy_loongson3a.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Build setup and register aliases for the kernel that follows.
   * ASSEMBLER is defined before common.h is pulled in; common.h supplies the
   * macros the kernel uses but does not define here (PROLOGUE, EPILOGUE, LD,
   * ST, MADD, NOP, PREFETCHD, SIZE, BASE_SHIFT).
   */
#define ASSEMBLER
#include "common.h"

/* Look-ahead of the PREFETCHD calls in the contiguous loop, counted in
   elements: it is multiplied by SIZE to form the offset from X / Y. */
#define PREFETCH_DISTANCE 48

/* ---- integer registers ------------------------------------------------ */
/* NOTE(review): $4 and $8-$11 look like incoming argument registers of the
   MIPS n32/n64 calling convention -- confirm against the C prototype. */
#define N	$4	/* element count; the kernel returns at once when N <= 0 */
#define X	$8	/* read pointer into the x vector                         */
#define INCX	$9	/* x stride; shifted left by BASE_SHIFT on entry          */
#define Y	$10	/* pointer into the y vector (read and written)           */
#define INCY	$11	/* y stride; shifted left by BASE_SHIFT on entry          */

#define I	$2	/* loop counter                                           */
#define TEMP	$3	/* holds SIZE for the stride comparison at entry          */
#define YY	$5	/* trailing store pointer into y on the strided path      */

/* ---- floating-point registers ----------------------------------------- */
#define ALPHA	$f15	/* multiplier operand of every MADD */

/* a1..a8: values loaded from X. */
#define a1	$f0
#define a2	$f1
#define a3	$f2
#define a4	$f3
#define a5	$f4
#define a6	$f5
#define a7	$f6
#define a8	$f7

/* b1..b8: values loaded from Y.  b8 skips to $f17 because $f15 is ALPHA;
   $f16 is not used by this file. */
#define b1	$f8
#define b2	$f9
#define b3	$f10
#define b4	$f11
#define b5	$f12
#define b6	$f13
#define b7	$f14
#define b8	$f17

/* t1..t4: MADD results waiting to be stored.  $f20/$f21 (t3/t4) are the
   registers the kernel saves and restores when __64BIT__ is not defined. */
#define t1	$f18
#define t2	$f19
#define t3	$f20
#define t4	$f21
  /*
   * AXPY kernel for Loongson 3A (name taken from the file: axpy_loongson3a.S).
   *
   * For each of the N elements the code loads x from X and y from Y, issues
   * "MADD t, y, ALPHA, x" and stores t back into the Y slot that y came from.
   * MADD, LD, ST, NOP, PREFETCHD, SIZE and BASE_SHIFT come from common.h and
   * are not visible here; presumably MADD yields y + ALPHA * x -- confirm
   * against common.h.
   *
   * Inputs (register aliases #defined before PROLOGUE):
   *   N      element count
   *   X/INCX source pointer and stride (stride is shifted by BASE_SHIFT here)
   *   Y/INCY destination pointer and stride (same scaling)
   *   ALPHA  scalar multiplier
   * Scratch: I, TEMP, YY, a1-a8, b1-b8, t1-t4.  When __64BIT__ is not defined,
   * $f20/$f21 (t3/t4) are spilled to a 16-byte stack frame and reloaded at
   * both exits.
   *
   * Paths:
   *   N <= 0                          -> .L999, return immediately
   *   scaled INCX == scaled INCY == SIZE
   *                                   -> contiguous path: .L12/.L13 handle
   *                                      blocks of 8, .L16 the N & 7 leftovers
   *   anything else                   -> strided path: .L22/.L23 blocks of 8,
   *                                      .L26 the N & 7 leftovers
   *
   * Both block loops are software-pipelined: one block of 8 x/y values is
   * loaded up front, each loop iteration computes the loaded block while
   * loading the next one, and .L13/.L23 drain the final block without
   * loading past it.
   *
   * NOTE(review): the code is laid out for explicit branch delay slots (the
   * instruction after every branch is meant to execute whether or not the
   * branch is taken).  That needs ".set noreorder", which is not visible in
   * this block -- presumably PROLOGUE establishes it; confirm.
   */
	PROLOGUE

#ifndef __64BIT__
	/* t3/t4 live in $f20/$f21: keep the incoming values on the stack. */
	daddiu	$sp, $sp, -16
	sdc1	$f20, 0($sp)
	sdc1	$f21, 8($sp)
#endif

	li	TEMP, SIZE

	blez	N, .L999			/* nothing to do */
	dsll	INCX, INCX, BASE_SHIFT		/* (delay slot) scale x stride */

	bne	INCX, TEMP, .L20		/* x not contiguous -> strided path */
	dsll	INCY, INCY, BASE_SHIFT		/* (delay slot) scale y stride */

	bne	INCY, TEMP, .L20		/* y not contiguous -> strided path */
	dsra	I, N, 3				/* (delay slot) I = N >> 3 blocks of 8 */

/* ---------------- contiguous path ---------------- */
	blez	I, .L15				/* fewer than 8 elements */
	daddiu	I, I, -1			/* (delay slot) last block is drained in .L13 */

	/* Preload the first block of 8 from X and Y. */
	LD	a1, 0 * SIZE(X)
	LD	a2, 1 * SIZE(X)
	LD	a3, 2 * SIZE(X)
	LD	a4, 3 * SIZE(X)
	LD	a5, 4 * SIZE(X)
	LD	a6, 5 * SIZE(X)
	LD	a7, 6 * SIZE(X)
	LD	a8, 7 * SIZE(X)

	LD	b1, 0 * SIZE(Y)
	LD	b2, 1 * SIZE(Y)
	LD	b3, 2 * SIZE(Y)
	LD	b4, 3 * SIZE(Y)
	LD	b5, 4 * SIZE(Y)
	LD	b6, 5 * SIZE(Y)
	LD	b7, 6 * SIZE(Y)
	LD	b8, 7 * SIZE(Y)

	blez	I, .L13				/* exactly one block: just drain it */
	NOP
	.align 5

	/* Main loop: finish block k (offsets 0..7) while loading block k+1
	   (offsets 8..15); I counts the blocks that still have a successor. */
.L12:
	PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))

	/* first half: elements 0..3 */
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	LD	b1, 8 * SIZE(Y)
	LD	b2, 9 * SIZE(Y)

	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4
	LD	b3, 10 * SIZE(Y)
	LD	b4, 11 * SIZE(Y)

	LD	a1, 8 * SIZE(X)
	LD	a2, 9 * SIZE(X)
	LD	a3, 10 * SIZE(X)
	LD	a4, 11 * SIZE(X)

	ST	t1, 0 * SIZE(Y)
	ST	t2, 1 * SIZE(Y)
	ST	t3, 2 * SIZE(Y)
	ST	t4, 3 * SIZE(Y)

	PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))

	/* second half: elements 4..7 */
	MADD	t1, b5, ALPHA, a5
	MADD	t2, b6, ALPHA, a6
	LD	b5, 12 * SIZE(Y)
	LD	b6, 13 * SIZE(Y)

	MADD	t3, b7, ALPHA, a7
	MADD	t4, b8, ALPHA, a8
	LD	b7, 14 * SIZE(Y)
	LD	b8, 15 * SIZE(Y)

	LD	a5, 12 * SIZE(X)
	LD	a6, 13 * SIZE(X)
	LD	a7, 14 * SIZE(X)
	LD	a8, 15 * SIZE(X)

	ST	t1, 4 * SIZE(Y)
	ST	t2, 5 * SIZE(Y)
	ST	t3, 6 * SIZE(Y)
	ST	t4, 7 * SIZE(Y)

	daddiu	I, I, -1
	daddiu	Y, Y, 8 * SIZE

	bgtz	I, .L12
	daddiu	X, X, 8 * SIZE			/* (delay slot) */
	.align 5

	/* Drain: compute and store the last preloaded block, no further loads. */
.L13:
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4

	ST	t1, 0 * SIZE(Y)
	MADD	t1, b5, ALPHA, a5
	ST	t2, 1 * SIZE(Y)
	MADD	t2, b6, ALPHA, a6
	ST	t3, 2 * SIZE(Y)
	MADD	t3, b7, ALPHA, a7
	ST	t4, 3 * SIZE(Y)
	MADD	t4, b8, ALPHA, a8

	ST	t1, 4 * SIZE(Y)
	ST	t2, 5 * SIZE(Y)
	ST	t3, 6 * SIZE(Y)
	ST	t4, 7 * SIZE(Y)

	daddiu	X, X, 8 * SIZE
	daddiu	Y, Y, 8 * SIZE
	.align 5

	/* Leftovers: N & 7 elements, one per iteration. */
.L15:
	andi	I, N, 7

	blez	I, .L999
	NOP
	.align 3

.L16:
	LD	a1, 0 * SIZE(X)
	LD	b1, 0 * SIZE(Y)

	daddiu	X, X, SIZE
	daddiu	Y, Y, SIZE

	MADD	t1, b1, ALPHA, a1
	daddiu	I, I, -1

	bgtz	I, .L16
	ST	t1, -1 * SIZE(Y)		/* (delay slot) Y was already advanced, hence -1 */

	/* Exit of the contiguous path (same sequence as .L999). */
#ifndef __64BIT__
	ldc1	$f20, 0($sp)
	ldc1	$f21, 8($sp)
	daddiu	$sp, $sp, 16
#endif

	j	$31
	NOP
	.align 5

/* ---------------- strided path ----------------
   X and Y are the load pointers and run one block ahead; YY starts as a copy
   of Y and trails behind as the store pointer, advancing by INCY per store. */
.L20:
	dsra	I, N, 3				/* I = N >> 3 blocks of 8 */
	move	YY, Y

	blez	I, .L25				/* fewer than 8 elements */
	daddiu	I, I, -1			/* (delay slot) last block is drained in .L23 */

	/* Preload the first block of 8, stepping X by INCX and Y by INCY. */
	LD	a1, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b1, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a2, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b2, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a3, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b3, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a4, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b4, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a5, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b5, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a6, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b6, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a7, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b7, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	LD	a8, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b8, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	blez	I, .L23				/* exactly one block: just drain it */
	NOP
	.align 5

	/* Main loop: each MADD consumes one preloaded pair, which is then
	   immediately reloaded from the next block; results go out through YY. */
.L22:
	MADD	t1, b1, ALPHA, a1
	LD	a1, 0 * SIZE(X)
	LD	b1, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	MADD	t2, b2, ALPHA, a2
	LD	a2, 0 * SIZE(X)
	LD	b2, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	MADD	t3, b3, ALPHA, a3
	LD	a3, 0 * SIZE(X)
	LD	b3, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	MADD	t4, b4, ALPHA, a4
	LD	a4, 0 * SIZE(X)
	LD	b4, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t1, b5, ALPHA, a5

	LD	a5, 0 * SIZE(X)
	LD	b5, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t2, b6, ALPHA, a6

	LD	a6, 0 * SIZE(X)
	LD	b6, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t3, b7, ALPHA, a7

	LD	a7, 0 * SIZE(X)
	LD	b7, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t4, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t4, b8, ALPHA, a8

	LD	a8, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b8, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t4, 0 * SIZE(YY)

	daddiu	I, I, -1

	bgtz	I, .L22
	daddu	YY, YY, INCY			/* (delay slot) step past the 8th store */
	.align 5

	/* Drain: compute and store the last preloaded block, no further loads.
	   Afterwards YY has caught up with Y. */
.L23:
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t1, b5, ALPHA, a5

	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t2, b6, ALPHA, a6

	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t3, b7, ALPHA, a7

	ST	t4, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t4, b8, ALPHA, a8

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t4, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	.align 5

	/* Leftovers: N & 7 elements; Y is both load and store pointer here. */
.L25:
	andi	I, N, 7

	blez	I, .L999
	NOP
	.align 3

.L26:
	LD	a1, 0 * SIZE(X)
	LD	b1, 0 * SIZE(Y)

	MADD	t1, b1, ALPHA, a1
	daddu	X, X, INCX

	ST	t1, 0 * SIZE(Y)
	daddiu	I, I, -1

	bgtz	I, .L26
	daddu	Y, Y, INCY			/* (delay slot) */
	.align 5

	/* Common exit: reload the spilled FP registers and return. */
.L999:
#ifndef __64BIT__
	ldc1	$f20, 0($sp)
	ldc1	$f21, 8($sp)
	daddiu	$sp, $sp, 16
#endif

	j	$31
	NOP

	EPILOGUE