
zasum_atom.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

#include "l1param.h"
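
/*
 * Kernel body.  Judging from the argument names above and the operations
 * below, this routine computes sum_i (|Re(x[i])| + |Im(x[i])|) over a
 * double-complex vector of length M with stride INCX (the zasum kernel),
 * returning the result in %xmm0.  A vectorized path handles contiguous
 * data (incx == 1); a scalar path at .L20 handles any other stride.
 */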
	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	xorps	%xmm0, %xmm0		/* clear first partial sum / return value */
	testq	M, M
	jle	.L999			/* n <= 0: return 0 */
	testq	INCX, INCX
	jle	.L999			/* incx <= 0: return 0 */

	xorps	%xmm1, %xmm1		/* three more partial-sum accumulators */
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	pcmpeqb	%xmm15, %xmm15		/* all ones ...                            */
	psrlq	$1, %xmm15		/* ... >> 1 = 0x7fffffffffffffff per lane: */
					/* the sign-clearing (|x|) mask            */

	salq	$ZBASE_SHIFT, INCX	/* element stride -> byte stride */
	xorps	%xmm13, %xmm13

	cmpq	$2 * SIZE, INCX		/* contiguous data (incx == 1)? */
	jne	.L20			/* no: general strided path */
	addq	M, M			/* M now counts doubles (2 per complex element) */

	testq	$SIZE, X
	je	.L05

	/* X is not 16-byte aligned: consume one double so that the
	   movaps loads below are aligned */
	movsd	(X), %xmm0
	addq	$SIZE, X
	andps	%xmm15, %xmm0
	decq	M
	ALIGN_3

.L05:
	subq	$-16 * SIZE, X		/* bias X; loads use -16*SIZE..-2*SIZE offsets */

	movq	M, I
	sarq	$4, I			/* I = number of 16-double blocks */
	jle	.L12
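
/*
 * Main unrolled loop: 16 doubles (8 complex elements) per iteration.
 * Each movaps load brings in one real/imaginary pair; the sign bits are
 * cleared with the %xmm15 mask, and the low/high doubles are folded into
 * the four scalar accumulators %xmm0..%xmm3.  Spreading the work over
 * four accumulators presumably helps hide addsd latency on this target.
 */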
	movaps	-16 * SIZE(X), %xmm4	/* preload the first 16 doubles */
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7
	movaps	 -8 * SIZE(X), %xmm8
	movaps	 -6 * SIZE(X), %xmm9
	movaps	 -4 * SIZE(X), %xmm10
	movaps	 -2 * SIZE(X), %xmm11

	decq	I
	jle	.L11
	ALIGN_4

.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm4		/* clear sign bits of both halves */
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm4, %xmm12	/* swap the two 64-bit halves */
	addsd	%xmm4, %xmm0
	movaps	 0 * SIZE(X), %xmm4

	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	movaps	 2 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0
	movaps	 4 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2
	movaps	 6 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm8, %xmm12
	addsd	%xmm8, %xmm0
	movaps	 8 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm9, %xmm13
	addsd	%xmm9, %xmm2
	movaps	10 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm10, %xmm12
	addsd	%xmm10, %xmm0
	movaps	12 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm11, %xmm13
	addsd	%xmm11, %xmm2
	movaps	14 * SIZE(X), %xmm11

	subq	$-16 * SIZE, X
	decq	I
	jg	.L10
	ALIGN_4
.L11:
	andps	%xmm15, %xmm4		/* drain the last unrolled block: no further loads */
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0

	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2

	andps	%xmm15, %xmm6
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0

	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2

	andps	%xmm15, %xmm8
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm8, %xmm12
	addsd	%xmm8, %xmm0

	andps	%xmm15, %xmm9
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm9, %xmm13
	addsd	%xmm9, %xmm2

	andps	%xmm15, %xmm10
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm10, %xmm12
	addsd	%xmm10, %xmm0

	andps	%xmm15, %xmm11
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm11, %xmm13
	addsd	%xmm11, %xmm2
	addsd	%xmm13, %xmm3

	subq	$-16 * SIZE, X
	ALIGN_3
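
/* Fewer than 16 doubles remain: the low bits of M select tail blocks of
   8, 4, 2 and finally 1 double. */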
.L12:
	andq	$15, M			/* remaining doubles (0..15) */
	jle	.L998

	testq	$8, M			/* 8 leftover doubles? */
	je	.L13

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7
	addq	$8 * SIZE, X

	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	addsd	%xmm13, %xmm3

	andps	%xmm15, %xmm6
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0
	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2
	addsd	%xmm13, %xmm3
	ALIGN_3

.L13:
	testq	$4, M			/* 4 leftover doubles? */
	je	.L14

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	addq	$4 * SIZE, X

	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	addsd	%xmm13, %xmm3
	ALIGN_3

.L14:
	testq	$2, M			/* 2 leftover doubles? */
	je	.L15

	movaps	-16 * SIZE(X), %xmm4
	addq	$2 * SIZE, X
	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm5
	addsd	%xmm4, %xmm2
	addsd	%xmm5, %xmm3
	ALIGN_3

.L15:
	testq	$1, M			/* 1 leftover double? */
	je	.L998

	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	jmp	.L998
	ALIGN_3
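
/*
 * General strided path (incx != 1): real and imaginary parts are loaded
 * with scalar movsd, 4 complex elements per iteration, followed by
 * 2- and 1-element tails.
 */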
.L20:
	movq	M, I
	sarq	$2, I			/* I = number of 4-element blocks */
	jle	.L25

	movsd	0 * SIZE(X), %xmm4	/* preload 4 complex elements */
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm8
	movsd	1 * SIZE(X), %xmm9
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm10
	movsd	1 * SIZE(X), %xmm11

	decq	I
	jle	.L23
	ALIGN_4

.L22:
	andps	%xmm15, %xmm4
	addq	INCX, X
	addsd	%xmm4, %xmm0
	movsd	0 * SIZE(X), %xmm4
	andps	%xmm15, %xmm5
	addsd	%xmm5, %xmm1
	movsd	1 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addq	INCX, X
	addsd	%xmm6, %xmm2
	movsd	0 * SIZE(X), %xmm6
	andps	%xmm15, %xmm7
	addsd	%xmm7, %xmm3
	movsd	1 * SIZE(X), %xmm7

	andps	%xmm15, %xmm8
	addq	INCX, X
	addsd	%xmm8, %xmm0
	movsd	0 * SIZE(X), %xmm8
	andps	%xmm15, %xmm9
	addsd	%xmm9, %xmm1
	movsd	1 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addq	INCX, X
	addsd	%xmm10, %xmm2
	movsd	0 * SIZE(X), %xmm10
	andps	%xmm15, %xmm11
	addsd	%xmm11, %xmm3
	movsd	1 * SIZE(X), %xmm11

	decq	I
	jg	.L22
	ALIGN_4

.L23:
	andps	%xmm15, %xmm4		/* drain the last preloaded block */
	addq	INCX, X
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm5, %xmm1
	andps	%xmm15, %xmm6
	addsd	%xmm6, %xmm2
	andps	%xmm15, %xmm7
	addsd	%xmm7, %xmm3
	andps	%xmm15, %xmm8
	addsd	%xmm8, %xmm0
	andps	%xmm15, %xmm9
	addsd	%xmm9, %xmm1
	andps	%xmm15, %xmm10
	addsd	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addsd	%xmm11, %xmm3
	ALIGN_3
.L25:
	testq	$2, M			/* 2 remaining complex elements? */
	je	.L26

	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	movsd	1 * SIZE(X), %xmm7
	andps	%xmm15, %xmm5
	addsd	%xmm5, %xmm1
	addq	INCX, X
	andps	%xmm15, %xmm6
	addsd	%xmm6, %xmm2
	andps	%xmm15, %xmm7
	addsd	%xmm7, %xmm3
	ALIGN_3

.L26:
	testq	$1, M			/* 1 remaining complex element? */
	je	.L998

	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5
	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1
	ALIGN_3
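
/* Fold the four partial sums into %xmm0, the double-precision return value. */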
.L998:
	addsd	%xmm1, %xmm0
	addsd	%xmm3, %xmm2
	addsd	%xmm2, %xmm0
	ALIGN_4

.L999:
	RESTOREREGISTERS
	ret

	EPILOGUE