You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zcopy.S 8.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Symbolic names for the routine arguments.
   *   N    - element count
   *   X    - source pointer
   *   INCX - source stride
   *   Y    - destination pointer
   *   INCY - destination stride
   * ARG1..ARG6 are not defined in this file (presumably they come from
   * common.h).
   * NOTE(review): the register names in the trailing comments presumably
   * describe the non-Windows mapping of ARGn only; confirm in common.h.
   */
  40. #define N ARG1 /* rdi */
  41. #define X ARG2 /* rsi */
  42. #define INCX ARG3 /* rdx */
  43. #define Y ARG4 /* rcx */
  /*
   * Without WINDOWS_ABI, INCY and FLAG alias ARG5 and ARG6.  With
   * WINDOWS_ABI they are bound to the fixed registers r10 and r11 instead.
   * FLAG is not referenced by the code visible in this file.
   */
  44. #ifndef WINDOWS_ABI
  45. #define INCY ARG5 /* r8 */
  46. #define FLAG ARG6
  47. #else
  48. #define INCY %r10
  49. #define FLAG %r11
  50. #endif
  51. #include "l1param.h"
  /*
   * Copy routine: copies N elements from X to Y and returns 0 in rax.
   *
   * Each element occupies 2 * SIZE bytes (the unit-stride test and the
   * one-element loops below both use 2 * SIZE).  Going by the file name,
   * an element is presumably one complex value (real part, imaginary
   * part) - confirm against the caller.
   *
   * Inputs (see the argument aliases defined earlier in this file):
   *   N    - element count; N <= 0 returns immediately
   *   X    - source pointer, advanced as the copy proceeds
   *   INCX - source stride, shifted left by ZBASE_SHIFT on entry
   *          (presumably converting an element stride to bytes)
   *   Y    - destination pointer, advanced as the copy proceeds
   *   INCY - destination stride, scaled the same way as INCX
   * Output:
   *   rax  - always 0
   * Clobbers: rax, X, Y, INCX, INCY, mm0-mm7, flags.
   *
   * Data is moved 8 bytes at a time through the MMX registers mm0-mm7.
   * The EMMS macro is invoked on entry and before each ret.
   *
   * Layout:
   *   .L11  / .L21  - both strides equal 2 * SIZE: 4 elements per
   *                   iteration, then the N & 3 leftover elements
   *   .L111 / .L121 - general strides: same split, but X and Y are
   *                   advanced by INCX / INCY after every element
   *
   * PROLOGUE, PROFCODE, EMMS, ALIGN_2, ALIGN_3, PREFETCH, PREFETCHW,
   * PREFETCHSIZE, PREOFFSET, SIZE, ZBASE_SHIFT and EPILOGUE are macros
   * defined outside this file.
   */
  52. PROLOGUE
  53. PROFCODE
  54. #ifdef WINDOWS_ABI
  /*
   * Under WINDOWS_ABI the destination stride is loaded from the stack.
   * NOTE(review): offset 40 matches return address (8) + shadow space (32)
   * provided PROLOGUE has not moved rsp - confirm.
   */
  55. movq 40(%rsp), INCY
  56. #endif
  57. EMMS
  /* Scale both strides by 2^ZBASE_SHIFT. */
  58. salq $ZBASE_SHIFT, INCX
  59. salq $ZBASE_SHIFT, INCY
  /* Nothing to copy when N <= 0 (jle also catches a negative N). */
  60. testq N, N # if m == 0 goto End
  61. jle .L999
  /*
   * Take the contiguous fast path only when both scaled strides equal the
   * size of one element (2 * SIZE); otherwise use the strided path.
   */
  62. cmpq $2 * SIZE, INCX # if incx != 1
  63. jne .L100
  64. cmpq $2 * SIZE, INCY # if incy != 1
  65. jne .L100
  /* rax = N / 4: iterations of the 4-element loop.  Skip it if N < 4. */
  66. movq N, %rax # i = m
  67. sarq $2, %rax
  68. jle .L20
  69. ALIGN_2
  /*
   * Contiguous main loop: copies 4 elements per iteration and then advances
   * X and Y by 8 * SIZE bytes.
   */
  70. .L11:
  71. #ifdef XDOUBLE
  /*
   * XDOUBLE: 16 moves of 8 bytes = 128 bytes, using literal byte offsets.
   * Presumably SIZE is 16 here so that 128 == 8 * SIZE - confirm.
   */
  72. #ifdef PREFETCH
  73. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  74. #endif
  75. movq 0(X), %mm0
  76. movq %mm0, 0(Y)
  77. movq 8(X), %mm1
  78. movq %mm1, 8(Y)
  79. movq 16(X), %mm2
  80. movq %mm2, 16(Y)
  81. movq 24(X), %mm3
  82. movq %mm3, 24(Y)
  83. #ifdef PREFETCHW
  84. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  85. #endif
  86. movq 32(X), %mm4
  87. movq %mm4, 32(Y)
  88. movq 40(X), %mm5
  89. movq %mm5, 40(Y)
  90. movq 48(X), %mm6
  91. movq %mm6, 48(Y)
  92. movq 56(X), %mm7
  93. movq %mm7, 56(Y)
  /* Second 64 bytes of the 128-byte group, with its own prefetches. */
  94. #ifdef PREFETCH
  95. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  96. #endif
  97. movq 64(X), %mm0
  98. movq %mm0, 64(Y)
  99. movq 72(X), %mm1
  100. movq %mm1, 72(Y)
  101. movq 80(X), %mm2
  102. movq %mm2, 80(Y)
  103. movq 88(X), %mm3
  104. movq %mm3, 88(Y)
  105. #ifdef PREFETCHW
  106. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  107. #endif
  108. movq 96(X), %mm4
  109. movq %mm4, 96(Y)
  110. movq 104(X), %mm5
  111. movq %mm5, 104(Y)
  112. movq 112(X), %mm6
  113. movq %mm6, 112(Y)
  114. movq 120(X), %mm7
  115. movq %mm7, 120(Y)
  116. #elif defined(DOUBLE)
  /*
   * DOUBLE: one 8-byte move per SIZE-sized slot, slots 0..7.
   * Presumably SIZE is 8 here so the moves tile 8 * SIZE bytes - confirm.
   */
  117. #ifdef PREFETCH
  118. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  119. #endif
  120. movq 0 * SIZE(X), %mm0
  121. movq 1 * SIZE(X), %mm1
  122. movq %mm0, 0 * SIZE(Y)
  123. movq %mm1, 1 * SIZE(Y)
  124. movq 2 * SIZE(X), %mm2
  125. movq 3 * SIZE(X), %mm3
  126. movq %mm2, 2 * SIZE(Y)
  127. movq %mm3, 3 * SIZE(Y)
  128. movq 4 * SIZE(X), %mm4
  129. movq 5 * SIZE(X), %mm5
  130. #ifdef PREFETCHW
  131. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  132. #endif
  133. movq %mm4, 4 * SIZE(Y)
  134. movq %mm5, 5 * SIZE(Y)
  135. movq 6 * SIZE(X), %mm6
  136. movq 7 * SIZE(X), %mm7
  137. movq %mm6, 6 * SIZE(Y)
  138. movq %mm7, 7 * SIZE(Y)
  139. #else
  /*
   * Neither XDOUBLE nor DOUBLE: one 8-byte move per pair of SIZE-sized
   * slots (offsets 0, 2, 4, 6).  Presumably SIZE is 4 here, so each move
   * carries one whole element - confirm.
   */
  140. movq 0 * SIZE(X), %mm0
  141. movq 2 * SIZE(X), %mm2
  142. #ifdef PREFETCH
  143. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  144. #endif
  145. movq %mm0, 0 * SIZE(Y)
  146. movq %mm2, 2 * SIZE(Y)
  147. movq 4 * SIZE(X), %mm4
  148. movq 6 * SIZE(X), %mm6
  149. #ifdef PREFETCHW
  150. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  151. #endif
  152. movq %mm4, 4 * SIZE(Y)
  153. movq %mm6, 6 * SIZE(Y)
  154. #endif
  /* Step both pointers past the 4 elements just copied; loop while rax > 0. */
  155. addq $8 * SIZE, X
  156. addq $8 * SIZE, Y
  157. decq %rax
  158. jg .L11
  159. ALIGN_2
  /*
   * Contiguous remainder: rax = N & 3.  The AND result is never negative,
   * so jle is taken exactly when no elements are left.
   */
  160. .L20:
  161. movq N, %rax # i = m
  162. andq $3, %rax
  163. jle .L99
  164. ALIGN_2
  /* Copies one element (2 * SIZE bytes) per iteration. */
  165. .L21:
  166. #ifdef XDOUBLE
  167. movq 0(X), %mm0
  168. movq %mm0, 0(Y)
  169. movq 8(X), %mm1
  170. movq %mm1, 8(Y)
  171. movq 16(X), %mm2
  172. movq %mm2, 16(Y)
  173. movq 24(X), %mm3
  174. movq %mm3, 24(Y)
  175. #elif defined(DOUBLE)
  176. movq 0 * SIZE(X), %mm0
  177. movq %mm0, 0 * SIZE(Y)
  178. movq 1 * SIZE(X), %mm1
  179. movq %mm1, 1 * SIZE(Y)
  180. #else
  181. movq 0 * SIZE(X), %mm0
  182. movq %mm0, 0 * SIZE(Y)
  183. #endif
  184. addq $2 * SIZE, X
  185. addq $2 * SIZE, Y
  186. decq %rax
  187. jg .L21
  /* Exit of the contiguous path: return 0 after EMMS. */
  188. .L99:
  189. xorq %rax,%rax
  190. EMMS
  191. ret
  192. ALIGN_3
  /*
   * Strided path, taken when either scaled stride differs from 2 * SIZE.
   * rax = N / 4 iterations of the 4-element loop; skip it if N < 4.
   */
  193. .L100:
  194. movq N, %rax
  195. sarq $2, %rax
  196. jle .L120
  197. ALIGN_2
  /*
   * Strided main loop: four repetitions of "copy one element, then add
   * INCX to X and INCY to Y".  No prefetching is done on this path.
   */
  198. .L111:
  199. #ifdef XDOUBLE
  /* XDOUBLE: 32 bytes per element, literal byte offsets. */
  200. movq 0(X), %mm0
  201. movq %mm0, 0(Y)
  202. movq 8(X), %mm1
  203. movq %mm1, 8(Y)
  204. movq 16(X), %mm2
  205. movq %mm2, 16(Y)
  206. movq 24(X), %mm3
  207. movq %mm3, 24(Y)
  208. addq INCX, X
  209. addq INCY, Y
  210. movq 0(X), %mm0
  211. movq %mm0, 0(Y)
  212. movq 8(X), %mm1
  213. movq %mm1, 8(Y)
  214. movq 16(X), %mm2
  215. movq %mm2, 16(Y)
  216. movq 24(X), %mm3
  217. movq %mm3, 24(Y)
  218. addq INCX, X
  219. addq INCY, Y
  220. movq 0(X), %mm0
  221. movq %mm0, 0(Y)
  222. movq 8(X), %mm1
  223. movq %mm1, 8(Y)
  224. movq 16(X), %mm2
  225. movq %mm2, 16(Y)
  226. movq 24(X), %mm3
  227. movq %mm3, 24(Y)
  228. addq INCX, X
  229. addq INCY, Y
  230. movq 0(X), %mm0
  231. movq %mm0, 0(Y)
  232. movq 8(X), %mm1
  233. movq %mm1, 8(Y)
  234. movq 16(X), %mm2
  235. movq %mm2, 16(Y)
  236. movq 24(X), %mm3
  237. movq %mm3, 24(Y)
  238. addq INCX, X
  239. addq INCY, Y
  240. #elif defined(DOUBLE)
  /* DOUBLE: two 8-byte moves per element, at slots 0 and 1. */
  241. movq 0 * SIZE(X), %mm0
  242. movq %mm0, 0 * SIZE(Y)
  243. movq 1 * SIZE(X), %mm1
  244. movq %mm1, 1 * SIZE(Y)
  245. addq INCX, X
  246. addq INCY, Y
  247. movq 0 * SIZE(X), %mm2
  248. movq %mm2, 0 * SIZE(Y)
  249. movq 1 * SIZE(X), %mm3
  250. movq %mm3, 1 * SIZE(Y)
  251. addq INCX, X
  252. addq INCY, Y
  253. movq 0 * SIZE(X), %mm4
  254. movq %mm4, 0 * SIZE(Y)
  255. movq 1 * SIZE(X), %mm5
  256. movq %mm5, 1 * SIZE(Y)
  257. addq INCX, X
  258. addq INCY, Y
  259. movq 0 * SIZE(X), %mm6
  260. movq %mm6, 0 * SIZE(Y)
  261. movq 1 * SIZE(X), %mm7
  262. movq %mm7, 1 * SIZE(Y)
  263. addq INCX, X
  264. addq INCY, Y
  265. #else
  /* Otherwise: a single 8-byte move per element. */
  266. movq 0 * SIZE(X), %mm0
  267. movq %mm0, 0 * SIZE(Y)
  268. addq INCX, X
  269. addq INCY, Y
  270. movq 0 * SIZE(X), %mm2
  271. movq %mm2, 0 * SIZE(Y)
  272. addq INCX, X
  273. addq INCY, Y
  274. movq 0 * SIZE(X), %mm4
  275. movq %mm4, 0 * SIZE(Y)
  276. addq INCX, X
  277. addq INCY, Y
  278. movq 0 * SIZE(X), %mm6
  279. movq %mm6, 0 * SIZE(Y)
  280. addq INCX, X
  281. addq INCY, Y
  282. #endif
  283. decq %rax
  284. jg .L111
  /* Strided remainder: rax = N & 3; jle is taken exactly when it is zero. */
  285. .L120:
  286. movq N, %rax
  287. andq $3, %rax
  288. jle .L999
  289. ALIGN_2
  /* Copies one element per iteration, stepping X and Y by their strides. */
  290. .L121:
  291. #ifdef XDOUBLE
  292. movq 0(X), %mm0
  293. movq %mm0, 0(Y)
  294. movq 8(X), %mm1
  295. movq %mm1, 8(Y)
  296. movq 16(X), %mm2
  297. movq %mm2, 16(Y)
  298. movq 24(X), %mm3
  299. movq %mm3, 24(Y)
  300. addq INCX, X
  301. addq INCY, Y
  302. #elif defined(DOUBLE)
  303. movq 0 * SIZE(X), %mm0
  304. movq %mm0, 0 * SIZE(Y)
  305. movq 1 * SIZE(X), %mm1
  306. movq %mm1, 1 * SIZE(Y)
  307. addq INCX, X
  308. addq INCY, Y
  309. #else
  310. movq 0 * SIZE(X), %mm0
  311. movq %mm0, 0 * SIZE(Y)
  312. addq INCX, X
  313. addq INCY, Y
  314. #endif
  315. decq %rax
  316. jg .L121
  /*
   * Common exit for the N <= 0 case and the strided path: return 0 after
   * EMMS.  Same instruction sequence as .L99.
   */
  317. .L999:
  318. xorq %rax,%rax
  319. EMMS
  320. ret
  321. EPILOGUE