You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

asum_sse.S 7.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build preamble for the kernel below.
   * ASSEMBLER is defined ahead of common.h (presumably that header keys
   * off it - confirm).  ARG1..ARG3 are not defined in this file; the
   * register names in the trailing notes are the original annotations.
   */
  #define ASSEMBLER
  #include "common.h"

  #define M       ARG1    /* rdi - element count                          */
  #define X       ARG2    /* rsi - address of the current element         */
  #define INCX    ARG3    /* rdx - stride; rescaled by SIZE in the kernel */

  #define I       %rax    /* scratch loop counter                         */

  #include "l1param.h"
  /*
   * Sum of absolute values of a strided vector, SSE implementation.
   *
   *   M    : number of elements; M <= 0 produces 0.0
   *   X    : address of the first element
   *   INCX : stride in elements; INCX <= 0 produces 0.0
   *
   * The total is left in the low lane of %xmm0.
   * Scratch: I (%rax), %xmm0-%xmm11, %xmm15.  M, X and INCX are modified.
   *
   * PROLOGUE/EPILOGUE, PROFCODE, SAVEREGISTERS/RESTOREREGISTERS, ALIGN_n,
   * SIZE, PREFETCH, PREFETCHSIZE and PREOFFSET are defined outside this file.
   *
   * NOTE(review): the 16-byte movaps loads advance by 4 * SIZE, so this
   * code assumes SIZE == 4 (single precision) - confirm against common.h.
   *
   * Four independent accumulators (%xmm0-%xmm3) are kept and only folded
   * together at .Lreduce.
   */
          PROLOGUE
          PROFCODE

          SAVEREGISTERS

          xorps   %xmm0, %xmm0            /* result = 0 for the early exits */
          testq   M, M
          jle     .Ldone                  /* nothing to sum */
          testq   INCX, INCX
          jle     .Ldone                  /* non-positive stride */

          xorps   %xmm1, %xmm1
          xorps   %xmm2, %xmm2
          xorps   %xmm3, %xmm3

          /* %xmm15 = 0x7fffffff in every dword: AND-ing clears the sign bit */
          pcmpeqb %xmm15, %xmm15
          psrld   $1, %xmm15

          leaq    (, INCX, SIZE), INCX    /* stride: elements -> bytes */

          cmpq    $SIZE, INCX
          jne     .Lstrided               /* not contiguous */

          /*
           * Contiguous data.  X is biased forward by 32 elements and every
           * access below uses an offset relative to that biased pointer.
           */
          subq    $-32 * SIZE, X

          cmpq    $3, M
          jle     .Lunit_tail2            /* <= 3 elements: no aligned loads needed */

          /* Peel one element if X is not 8-byte aligned. */
          testq   $4, X
          je      .Lalign8

          movss   -32 * SIZE(X), %xmm0
          andps   %xmm15, %xmm0
          addq    $SIZE, X
          decq    M
          jle     .Lreduce
          ALIGN_3

  .Lalign8:
          /* Peel two elements if X is not 16-byte aligned. */
          testq   $8, X
          je      .Lunit_bulk

          movsd   -32 * SIZE(X), %xmm1
          andps   %xmm15, %xmm1
          addq    $2 * SIZE, X
          subq    $2, M
          jle     .Lreduce
          ALIGN_3

  .Lunit_bulk:
          movq    M, I
          sarq    $5, I                   /* I = number of 32-element blocks */
          jle     .Lunit_tail16

          /* Prime the pipeline: load the first block (8 vectors). */
          movaps  -32 * SIZE(X), %xmm4
          movaps  -28 * SIZE(X), %xmm5
          movaps  -24 * SIZE(X), %xmm6
          movaps  -20 * SIZE(X), %xmm7

          movaps  -16 * SIZE(X), %xmm8
          movaps  -12 * SIZE(X), %xmm9
          movaps   -8 * SIZE(X), %xmm10
          movaps   -4 * SIZE(X), %xmm11
          decq    I
          jle     .Lunit_drain            /* exactly one block: just drain it */
          ALIGN_3

  .Lunit_loop:
          /*
           * Each pass accumulates the block loaded previously while loading
           * the next one into the same registers.
           */
  #ifdef PREFETCH
          PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  #endif

          andps   %xmm15, %xmm4
          addps   %xmm4, %xmm0
          movaps    0 * SIZE(X), %xmm4

          andps   %xmm15, %xmm5
          addps   %xmm5, %xmm1
          movaps    4 * SIZE(X), %xmm5

          andps   %xmm15, %xmm6
          addps   %xmm6, %xmm2
          movaps    8 * SIZE(X), %xmm6

          andps   %xmm15, %xmm7
          addps   %xmm7, %xmm3
          movaps   12 * SIZE(X), %xmm7

  #if defined(PREFETCH) && !defined(FETCH128)
          PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  #endif

          andps   %xmm15, %xmm8
          addps   %xmm8, %xmm0
          movaps   16 * SIZE(X), %xmm8

          andps   %xmm15, %xmm9
          addps   %xmm9, %xmm1
          movaps   20 * SIZE(X), %xmm9

          andps   %xmm15, %xmm10
          addps   %xmm10, %xmm2
          movaps   24 * SIZE(X), %xmm10

          andps   %xmm15, %xmm11
          addps   %xmm11, %xmm3
          movaps   28 * SIZE(X), %xmm11

          subq    $-32 * SIZE, X          /* X += 32 elements */
          decq    I
          jg      .Lunit_loop
          ALIGN_3

  .Lunit_drain:
          /* Accumulate the last block that is still sitting in registers. */
          andps   %xmm15, %xmm4
          addps   %xmm4, %xmm0
          andps   %xmm15, %xmm5
          addps   %xmm5, %xmm1

          andps   %xmm15, %xmm6
          addps   %xmm6, %xmm2
          andps   %xmm15, %xmm7
          addps   %xmm7, %xmm3

          andps   %xmm15, %xmm8
          addps   %xmm8, %xmm0
          andps   %xmm15, %xmm9
          addps   %xmm9, %xmm1

          andps   %xmm15, %xmm10
          addps   %xmm10, %xmm2
          andps   %xmm15, %xmm11
          addps   %xmm11, %xmm3

          subq    $-32 * SIZE, X
          ALIGN_3

  .Lunit_tail16:
          /* The remainder (M mod 32) is handled one bit of M at a time. */
          testq   $16, M
          je      .Lunit_tail8

          movaps  -32 * SIZE(X), %xmm4
          andps   %xmm15, %xmm4
          addps   %xmm4, %xmm0

          movaps  -28 * SIZE(X), %xmm5
          andps   %xmm15, %xmm5
          addps   %xmm5, %xmm1

          movaps  -24 * SIZE(X), %xmm4
          andps   %xmm15, %xmm4
          addps   %xmm4, %xmm0

          movaps  -20 * SIZE(X), %xmm5
          andps   %xmm15, %xmm5
          addps   %xmm5, %xmm1

          addq    $16 * SIZE, X
          ALIGN_3

  .Lunit_tail8:
          testq   $8, M
          je      .Lunit_tail4

          movaps  -32 * SIZE(X), %xmm4
          andps   %xmm15, %xmm4
          addps   %xmm4, %xmm0

          movaps  -28 * SIZE(X), %xmm5
          andps   %xmm15, %xmm5
          addps   %xmm5, %xmm1

          addq    $8 * SIZE, X
          ALIGN_3

  .Lunit_tail4:
          testq   $4, M
          je      .Lunit_tail2

          movaps  -32 * SIZE(X), %xmm6
          andps   %xmm15, %xmm6
          addps   %xmm6, %xmm2
          addq    $4 * SIZE, X
          ALIGN_3

  .Lunit_tail2:
          testq   $2, M
          je      .Lunit_tail1

          /*
           * If movsd is a preprocessor macro here, clear %xmm7 first;
           * presumably the substituted instruction leaves the upper lanes
           * untouched, and they are added to the sum below - TODO confirm.
           */
  #ifdef movsd
          xorps   %xmm7, %xmm7
  #endif
          movsd   -32 * SIZE(X), %xmm7
          andps   %xmm15, %xmm7
          addps   %xmm7, %xmm3
          addq    $2 * SIZE, X
          ALIGN_3

  .Lunit_tail1:
          testq   $1, M
          je      .Lreduce

          movss   -32 * SIZE(X), %xmm6
          andps   %xmm15, %xmm6
          addps   %xmm6, %xmm2
          jmp     .Lreduce
          ALIGN_4

  .Lstrided:
          /* Non-unit stride: scalar loads, X advanced by INCX bytes each time. */
          movq    M, I
          sarq    $3, I                   /* I = number of 8-element groups */
          jle     .Lstrided_tail
          ALIGN_4

  .Lstrided_loop:
          movss   0 * SIZE(X), %xmm4
          addq    INCX, X
          andps   %xmm15, %xmm4
          addss   %xmm4, %xmm0

          movss   0 * SIZE(X), %xmm5
          addq    INCX, X
          andps   %xmm15, %xmm5
          addss   %xmm5, %xmm1

          movss   0 * SIZE(X), %xmm6
          addq    INCX, X
          andps   %xmm15, %xmm6
          addss   %xmm6, %xmm2

          movss   0 * SIZE(X), %xmm7
          addq    INCX, X
          andps   %xmm15, %xmm7
          addss   %xmm7, %xmm3

          movss   0 * SIZE(X), %xmm8
          addq    INCX, X
          andps   %xmm15, %xmm8
          addss   %xmm8, %xmm0

          movss   0 * SIZE(X), %xmm4
          addq    INCX, X
          andps   %xmm15, %xmm4
          addss   %xmm4, %xmm1

          movss   0 * SIZE(X), %xmm5
          addq    INCX, X
          andps   %xmm15, %xmm5
          addss   %xmm5, %xmm2

          movss   0 * SIZE(X), %xmm6
          addq    INCX, X
          andps   %xmm15, %xmm6
          addss   %xmm6, %xmm3

          decq    I
          jg      .Lstrided_loop
          ALIGN_4

  .Lstrided_tail:
          andq    $7, M                   /* M = leftover elements (0..7) */
          jle     .Lreduce
          ALIGN_4

  .Lstrided_tail_loop:
          movss   0 * SIZE(X), %xmm4
          andps   %xmm15, %xmm4
          addps   %xmm4, %xmm0
          addq    INCX, X
          decq    M
          jg      .Lstrided_tail_loop
          ALIGN_4

  .Lreduce:
          /* Fold the four accumulators into %xmm0 ... */
          addps   %xmm1, %xmm0
          addps   %xmm3, %xmm2
          addps   %xmm2, %xmm0

          /* ... then add its four lanes into lane 0. */
  #ifndef HAVE_SSE3
          movhlps %xmm0, %xmm1            /* low half of xmm1 = high half of xmm0 */
          addps   %xmm1, %xmm0            /* lanes 0,1 = two pairwise sums */

          movaps  %xmm0, %xmm1
          shufps  $1, %xmm0, %xmm0        /* bring lane 1 down to lane 0 */
          addss   %xmm1, %xmm0
  #else
          haddps  %xmm0, %xmm0
          haddps  %xmm0, %xmm0
  #endif
          ALIGN_4

  .Ldone:
          RESTOREREGISTERS

          ret

          EPILOGUE