You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

axpy_loongson3a.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. /*****************************************************************************
  2. Copyright (c) 2011-2014, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written
  16. permission.
  17. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  23. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  24. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  25. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  26. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. **********************************************************************************/
  28. /*********************************************************************/
  29. /* Copyright 2009, 2010 The University of Texas at Austin. */
  30. /* All rights reserved. */
  31. /* */
  32. /* Redistribution and use in source and binary forms, with or */
  33. /* without modification, are permitted provided that the following */
  34. /* conditions are met: */
  35. /* */
  36. /* 1. Redistributions of source code must retain the above */
  37. /* copyright notice, this list of conditions and the following */
  38. /* disclaimer. */
  39. /* */
  40. /* 2. Redistributions in binary form must reproduce the above */
  41. /* copyright notice, this list of conditions and the following */
  42. /* disclaimer in the documentation and/or other materials */
  43. /* provided with the distribution. */
  44. /* */
  45. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  46. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  47. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  48. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  49. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  50. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  51. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  52. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  53. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  54. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  55. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  56. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  57. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  58. /* POSSIBILITY OF SUCH DAMAGE. */
  59. /* */
  60. /* The views and conclusions contained in the software and */
  61. /* documentation are those of the authors and should not be */
  62. /* interpreted as representing official policies, either expressed */
  63. /* or implied, of The University of Texas at Austin. */
  64. /*********************************************************************/
  65. #define ASSEMBLER
  66. #include "common.h"
  /* Prefetch look-ahead, in units of SIZE: it appears only as */
  /* PREFETCH_DISTANCE*SIZE inside the PREFETCHD operands of loop .L12. */
  67. #define PREFETCH_DISTANCE 48
  /* Integer registers that the kernel reads on entry without setting */
  /* them first, so they are inputs supplied by the caller. */
  /* N is the element count. X and Y are the base addresses used by LD */
  /* and ST. INCX and INCY are strides, shifted left by BASE_SHIFT at */
  /* entry before they are added to X and Y. */
  68. #define N $4
  69. #define X $8
  70. #define INCX $9
  71. #define Y $10
  72. #define INCY $11
  /* Integer scratch registers. I is the loop counter, TEMP holds SIZE */
  /* for the stride comparison, and YY is the store pointer that trails */
  /* Y in the general-stride path. */
  73. #define I $2
  74. #define TEMP $3
  75. #define YY $5
  /* Floating-point register that is only read, as the scale operand of */
  /* every MADD. */
  76. #define ALPHA $f15
  /* a1 to a8 receive the values loaded from X. */
  77. #define a1 $f0
  78. #define a2 $f1
  79. #define a3 $f2
  80. #define a4 $f3
  81. #define a5 $f4
  82. #define a6 $f5
  83. #define a7 $f6
  84. #define a8 $f7
  /* b1 to b8 receive the values loaded from Y. b8 is f17, not f15, */
  /* because f15 is ALPHA (f16 is left unused). */
  85. #define b1 $f8
  86. #define b2 $f9
  87. #define b3 $f10
  88. #define b4 $f11
  89. #define b5 $f12
  90. #define b6 $f13
  91. #define b7 $f14
  92. #define b8 $f17
  /* t1 to t4 receive MADD results before they are stored. t3 and t4 */
  /* are f20 and f21, the two registers the kernel spills to the stack */
  /* when __64BIT__ is not defined. */
  93. #define t1 $f18
  94. #define t2 $f19
  95. #define t3 $f20
  96. #define t4 $f21
  /*
   * Kernel body. The file name suggests an AXPY kernel for Loongson 3A.
   * For each of N elements it loads x from X and y from Y, applies
   * MADD(result, y, ALPHA, x) and stores the result back to Y.
   *
   * MADD, LD, ST, SIZE, BASE_SHIFT, PREFETCHD, PROLOGUE and EPILOGUE are
   * not defined in this file (presumably they come from common.h).
   * MADD is presumably result = y + ALPHA * x - TODO confirm.
   *
   * Three code paths:
   *   .L12 / .L13 / .L16  both scaled strides equal SIZE (contiguous)
   *   .L22 / .L23 / .L26  any other strides
   *   .L27 / .L28         INCY is zero
   *
   * NOTE(review): the code is written as if the instruction directly
   * after every branch or jump executes in its delay slot, whether or
   * not the branch is taken (several loops only work that way). This
   * assumes assembler reordering is disabled elsewhere, presumably by
   * PROLOGUE - confirm. Statement order must therefore not be changed.
   */
  97. PROLOGUE
  /* When __64BIT__ is not defined, spill f20 and f21 (used as t3 and */
  /* t4) into a 16-byte stack frame. Every exit path below undoes this. */
  98. #ifndef __64BIT__
  99. daddiu $sp, $sp, -16
  100. sdc1 $f20, 0($sp)
  101. sdc1 $f21, 8($sp)
  102. #endif
  /* TEMP = SIZE, the value a scaled stride is compared against. */
  103. li TEMP, SIZE
  /* Nothing to do when N <= 0. The two stride shifts sit in delay */
  /* slots, so INCX and INCY are already scaled when .L20 is reached. */
  104. blez N, .L999
  105. dsll INCX, INCX, BASE_SHIFT
  106. bne INCX, TEMP, .L20
  107. dsll INCY, INCY, BASE_SHIFT
  108. bne INCY, TEMP, .L20
  /* Contiguous path. I = N >> 3 is the number of 8-element groups. */
  109. dsra I, N, 3
  110. blez I, .L15
  /* One group is preloaded here and finished in .L13, so the loop */
  /* .L12 runs I - 1 times. */
  111. daddiu I, I, -1
  /* Preload the first group: 8 values of X into a1 to a8 and 8 values */
  /* of Y into b1 to b8. */
  112. LD a1, 0 * SIZE(X)
  113. LD a2, 1 * SIZE(X)
  114. LD a3, 2 * SIZE(X)
  115. LD a4, 3 * SIZE(X)
  116. LD a5, 4 * SIZE(X)
  117. LD a6, 5 * SIZE(X)
  118. LD a7, 6 * SIZE(X)
  119. LD a8, 7 * SIZE(X)
  120. LD b1, 0 * SIZE(Y)
  121. LD b2, 1 * SIZE(Y)
  122. LD b3, 2 * SIZE(Y)
  123. LD b4, 3 * SIZE(Y)
  124. LD b5, 4 * SIZE(Y)
  125. LD b6, 5 * SIZE(Y)
  126. LD b7, 6 * SIZE(Y)
  127. LD b8, 7 * SIZE(Y)
  /* Only one group in total: skip the loop and drain it in .L13. */
  128. blez I, .L13
  129. NOP
  130. .align 5
  /* Steady-state loop, one 8-element group per iteration. Offsets 0 */
  /* to 7 are the current group (MADD inputs already in registers, */
  /* results stored here). Offsets 8 to 15 are the next group, loaded */
  /* while the current one is computed. */
  131. .L12:
  /* PREFETCHD is presumably a prefetch hint - confirm. It is issued */
  /* PREFETCH_DISTANCE and PREFETCH_DISTANCE+4 units of SIZE ahead of X. */
  132. PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
  133. PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
  /* First half of the group: elements 0 to 3. */
  134. MADD t1, b1, ALPHA, a1
  135. MADD t2, b2, ALPHA, a2
  136. LD b1, 8 * SIZE(Y)
  137. LD b2, 9 * SIZE(Y)
  138. MADD t3, b3, ALPHA, a3
  139. MADD t4, b4, ALPHA, a4
  140. LD b3, 10 * SIZE(Y)
  141. LD b4, 11 * SIZE(Y)
  142. LD a1, 8 * SIZE(X)
  143. LD a2, 9 * SIZE(X)
  144. LD a3, 10 * SIZE(X)
  145. LD a4, 11 * SIZE(X)
  146. ST t1, 0 * SIZE(Y)
  147. ST t2, 1 * SIZE(Y)
  148. ST t3, 2 * SIZE(Y)
  149. ST t4, 3 * SIZE(Y)
  /* Same look-ahead for Y. */
  150. PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
  151. PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
  /* Second half of the group: elements 4 to 7. t1 to t4 are reused */
  /* now that the first four results have been stored. */
  152. MADD t1, b5, ALPHA, a5
  153. MADD t2, b6, ALPHA, a6
  154. LD b5, 12 * SIZE(Y)
  155. LD b6, 13 * SIZE(Y)
  156. MADD t3, b7, ALPHA, a7
  157. MADD t4, b8, ALPHA, a8
  158. LD b7, 14 * SIZE(Y)
  159. LD b8, 15 * SIZE(Y)
  160. LD a5, 12 * SIZE(X)
  161. LD a6, 13 * SIZE(X)
  162. LD a7, 14 * SIZE(X)
  163. LD a8, 15 * SIZE(X)
  164. ST t1, 4 * SIZE(Y)
  165. ST t2, 5 * SIZE(Y)
  166. ST t3, 6 * SIZE(Y)
  167. ST t4, 7 * SIZE(Y)
  /* Advance to the next group. X is advanced in the delay slot of */
  /* the loop branch. */
  168. daddiu I, I, -1
  169. daddiu Y, Y, 8 * SIZE
  170. bgtz I, .L12
  171. daddiu X, X, 8 * SIZE
  172. .align 5
  /* Drain: finish the last preloaded group without loading anything */
  /* further, then step X and Y past it. */
  173. .L13:
  174. MADD t1, b1, ALPHA, a1
  175. MADD t2, b2, ALPHA, a2
  176. MADD t3, b3, ALPHA, a3
  177. MADD t4, b4, ALPHA, a4
  178. ST t1, 0 * SIZE(Y)
  179. MADD t1, b5, ALPHA, a5
  180. ST t2, 1 * SIZE(Y)
  181. MADD t2, b6, ALPHA, a6
  182. ST t3, 2 * SIZE(Y)
  183. MADD t3, b7, ALPHA, a7
  184. ST t4, 3 * SIZE(Y)
  185. MADD t4, b8, ALPHA, a8
  186. ST t1, 4 * SIZE(Y)
  187. ST t2, 5 * SIZE(Y)
  188. ST t3, 6 * SIZE(Y)
  189. ST t4, 7 * SIZE(Y)
  190. daddiu X, X, 8 * SIZE
  191. daddiu Y, Y, 8 * SIZE
  192. .align 5
  /* Remainder of the contiguous path: I = N & 7 elements are left. */
  /* Return through .L999 when there are none. */
  193. .L15:
  194. andi I, N, 7
  195. blez I, .L999
  196. NOP
  197. .align 3
  /* One element per iteration. Y is advanced before the store, which */
  /* is why the ST in the delay slot uses offset -1 * SIZE. */
  198. .L16:
  199. LD a1, 0 * SIZE(X)
  200. LD b1, 0 * SIZE(Y)
  201. daddiu X, X, SIZE
  202. daddiu Y, Y, SIZE
  203. MADD t1, b1, ALPHA, a1
  204. daddiu I, I, -1
  205. bgtz I, .L16
  206. ST t1, -1 * SIZE(Y)
  /* Exit of the contiguous path: restore f20 and f21, release the */
  /* frame and jump to the address held in register 31. */
  207. #ifndef __64BIT__
  208. ldc1 $f20, 0($sp)
  209. ldc1 $f21, 8($sp)
  210. daddiu $sp, $sp, 16
  211. #endif
  212. j $31
  213. NOP
  214. .align 5
  /* General-stride path. INCX and INCY hold the scaled strides. A */
  /* zero INCY is handled separately in .L27. I = N >> 3 is computed */
  /* in the delay slot because this label can be reached before the */
  /* contiguous path has set I. */
  215. .L20:
  216. beqz INCY, .L27
  217. dsra I, N, 3
  /* YY is the store pointer. Y runs one group ahead as the load */
  /* pointer, and YY marks where results are written. */
  218. move YY, Y
  219. blez I, .L25
  220. daddiu I, I, -1
  /* Preload the first group, stepping X and Y by their strides after */
  /* every load. */
  221. LD a1, 0 * SIZE(X)
  222. daddu X, X, INCX
  223. LD b1, 0 * SIZE(Y)
  224. daddu Y, Y, INCY
  225. LD a2, 0 * SIZE(X)
  226. daddu X, X, INCX
  227. LD b2, 0 * SIZE(Y)
  228. daddu Y, Y, INCY
  229. LD a3, 0 * SIZE(X)
  230. daddu X, X, INCX
  231. LD b3, 0 * SIZE(Y)
  232. daddu Y, Y, INCY
  233. LD a4, 0 * SIZE(X)
  234. daddu X, X, INCX
  235. LD b4, 0 * SIZE(Y)
  236. daddu Y, Y, INCY
  237. LD a5, 0 * SIZE(X)
  238. daddu X, X, INCX
  239. LD b5, 0 * SIZE(Y)
  240. daddu Y, Y, INCY
  241. LD a6, 0 * SIZE(X)
  242. daddu X, X, INCX
  243. LD b6, 0 * SIZE(Y)
  244. daddu Y, Y, INCY
  245. LD a7, 0 * SIZE(X)
  246. daddu X, X, INCX
  247. LD b7, 0 * SIZE(Y)
  248. daddu Y, Y, INCY
  249. LD a8, 0 * SIZE(X)
  250. daddu X, X, INCX
  251. LD b8, 0 * SIZE(Y)
  252. daddu Y, Y, INCY
  /* Only one group in total: skip the loop and drain it in .L23. */
  253. blez I, .L23
  254. NOP
  255. .align 5
  /* Steady-state strided loop. Each MADD consumes a preloaded pair, */
  /* and that pair is then reloaded from the next group. Results */
  /* are written through YY, which advances by INCY per store. */
  256. .L22:
  257. MADD t1, b1, ALPHA, a1
  258. LD a1, 0 * SIZE(X)
  259. LD b1, 0 * SIZE(Y)
  260. daddu X, X, INCX
  261. daddu Y, Y, INCY
  262. MADD t2, b2, ALPHA, a2
  263. LD a2, 0 * SIZE(X)
  264. LD b2, 0 * SIZE(Y)
  265. daddu X, X, INCX
  266. daddu Y, Y, INCY
  267. MADD t3, b3, ALPHA, a3
  268. LD a3, 0 * SIZE(X)
  269. LD b3, 0 * SIZE(Y)
  270. daddu X, X, INCX
  271. daddu Y, Y, INCY
  272. MADD t4, b4, ALPHA, a4
  273. LD a4, 0 * SIZE(X)
  274. LD b4, 0 * SIZE(Y)
  275. daddu X, X, INCX
  276. daddu Y, Y, INCY
  /* From here each result register is stored just before it is */
  /* overwritten by the MADD for elements 4 to 7. */
  277. ST t1, 0 * SIZE(YY)
  278. daddu YY, YY, INCY
  279. MADD t1, b5, ALPHA, a5
  280. LD a5, 0 * SIZE(X)
  281. LD b5, 0 * SIZE(Y)
  282. daddu X, X, INCX
  283. daddu Y, Y, INCY
  284. ST t2, 0 * SIZE(YY)
  285. daddu YY, YY, INCY
  286. MADD t2, b6, ALPHA, a6
  287. LD a6, 0 * SIZE(X)
  288. LD b6, 0 * SIZE(Y)
  289. daddu X, X, INCX
  290. daddu Y, Y, INCY
  291. ST t3, 0 * SIZE(YY)
  292. daddu YY, YY, INCY
  293. MADD t3, b7, ALPHA, a7
  294. LD a7, 0 * SIZE(X)
  295. LD b7, 0 * SIZE(Y)
  296. daddu X, X, INCX
  297. daddu Y, Y, INCY
  298. ST t4, 0 * SIZE(YY)
  299. daddu YY, YY, INCY
  300. MADD t4, b8, ALPHA, a8
  301. LD a8, 0 * SIZE(X)
  302. daddu X, X, INCX
  303. LD b8, 0 * SIZE(Y)
  304. daddu Y, Y, INCY
  /* Store results for elements 4 to 7. The last YY step is in the */
  /* delay slot of the loop branch. */
  305. ST t1, 0 * SIZE(YY)
  306. daddu YY, YY, INCY
  307. ST t2, 0 * SIZE(YY)
  308. daddu YY, YY, INCY
  309. ST t3, 0 * SIZE(YY)
  310. daddu YY, YY, INCY
  311. ST t4, 0 * SIZE(YY)
  312. daddiu I, I, -1
  313. bgtz I, .L22
  314. daddu YY, YY, INCY
  315. .align 5
  /* Drain: finish the last preloaded group with no further loads. */
  /* After these 8 stores YY has advanced as many times as Y. */
  316. .L23:
  317. MADD t1, b1, ALPHA, a1
  318. MADD t2, b2, ALPHA, a2
  319. MADD t3, b3, ALPHA, a3
  320. MADD t4, b4, ALPHA, a4
  321. ST t1, 0 * SIZE(YY)
  322. daddu YY, YY, INCY
  323. MADD t1, b5, ALPHA, a5
  324. ST t2, 0 * SIZE(YY)
  325. daddu YY, YY, INCY
  326. MADD t2, b6, ALPHA, a6
  327. ST t3, 0 * SIZE(YY)
  328. daddu YY, YY, INCY
  329. MADD t3, b7, ALPHA, a7
  330. ST t4, 0 * SIZE(YY)
  331. daddu YY, YY, INCY
  332. MADD t4, b8, ALPHA, a8
  333. ST t1, 0 * SIZE(YY)
  334. daddu YY, YY, INCY
  335. ST t2, 0 * SIZE(YY)
  336. daddu YY, YY, INCY
  337. ST t3, 0 * SIZE(YY)
  338. daddu YY, YY, INCY
  339. ST t4, 0 * SIZE(YY)
  340. daddu YY, YY, INCY
  341. .align 5
  /* Remainder of the strided path: I = N & 7 elements are left. Y is */
  /* used for both the load and the store from here on. It addresses */
  /* the first unprocessed element, either because the unrolled code */
  /* was skipped or because YY has caught up with it. */
  342. .L25:
  343. andi I, N, 7
  344. blez I, .L999
  345. NOP
  346. .align 3
  /* One element per iteration. Y is advanced in the delay slot, after */
  /* the store. */
  347. .L26:
  348. LD a1, 0 * SIZE(X)
  349. LD b1, 0 * SIZE(Y)
  350. MADD t1, b1, ALPHA, a1
  351. daddu X, X, INCX
  352. ST t1, 0 * SIZE(Y)
  353. daddiu I, I, -1
  354. bgtz I, .L26
  355. daddu Y, Y, INCY
  356. .align 5
  /* Common exit: restore f20 and f21 and release the frame when */
  /* __64BIT__ is not defined, then jump to the address in register 31. */
  357. .L999:
  358. #ifndef __64BIT__
  359. ldc1 $f20, 0($sp)
  360. ldc1 $f21, 8($sp)
  361. daddiu $sp, $sp, 16
  362. #endif
  363. j $31
  364. NOP
  365. .align 3
  /* INCY is zero: every element updates the same location, so the */
  /* running value is kept in b1. It is loaded from Y once, updated by */
  /* one MADD per element of X, and stored back once at the end. */
  366. .L27:
  367. LD b1, 0 * SIZE(Y)
  /* N itself is used as the down-counter here. The MADD sits in the */
  /* delay slot of the loop branch, so it runs on every iteration, */
  /* including the last one. */
  368. .L28:
  369. daddiu N, N, -1
  370. LD a1, 0 * SIZE(X)
  371. daddu X, X, INCX
  372. bgtz N, .L28
  373. MADD b1, b1, ALPHA, a1
  /* The store of the accumulated value is in the delay slot of the */
  /* jump to the common exit. */
  374. j .L999
  375. ST b1, 0 * SIZE(Y)
  376. EPILOGUE