
dot_sse2.S
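The assembly below is a double-precision dot-product (BLAS DOT) kernel for x86-64 using SSE2: it accumulates sum_i x[i*incx] * y[i*incy] in four packed registers and reduces them to a scalar at the end. As a reading aid, here is a minimal C sketch of the same computation under that reading of the code; the function name ddot_ref and the plain long types are illustrative and not part of this file.

    /* Hypothetical C reference for the kernel's semantics (not from this file). */
    double ddot_ref(long n, const double *x, long incx,
                    const double *y, long incy)
    {
        /* Four partial sums mirror the kernel's %xmm0-%xmm3 accumulators. */
        double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
        long i = 0;

        if (incx == 1 && incy == 1) {
            for (; i + 4 <= n; i += 4) {
                s0 += x[i + 0] * y[i + 0];
                s1 += x[i + 1] * y[i + 1];
                s2 += x[i + 2] * y[i + 2];
                s3 += x[i + 3] * y[i + 3];
            }
            for (; i < n; i++)
                s0 += x[i] * y[i];
        } else {
            /* General strides, as in the kernel's .L50 path. */
            for (; i < n; i++)
                s0 += x[i * incx] * y[i * incy];
        }
        return (s0 + s1) + (s2 + s3);
    }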

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

/* Kernel arguments: vector length N and the two source vectors X and Y
   with their strides INCX and INCY (given in elements). */
#define N    ARG1   /* rdi */
#define X    ARG2   /* rsi */
#define INCX ARG3   /* rdx */
#define Y    ARG4   /* rcx */

#ifndef WINDOWS_ABI
#define INCY ARG5   /* r8  */
#else
#define INCY %r10
#endif

#include "l1param.h"
        PROLOGUE
        PROFCODE

#ifdef WINDOWS_ABI
        movq 40(%rsp), INCY
#endif

        SAVEREGISTERS

        /* Convert the increments from elements to bytes. */
        leaq (, INCX, SIZE), INCX
        leaq (, INCY, SIZE), INCY

        /* Zero the four partial-sum accumulators. */
        xorps %xmm0, %xmm0
        xorps %xmm1, %xmm1
        xorps %xmm2, %xmm2
        xorps %xmm3, %xmm3

        cmpq $0, N
        jle .L999

        /* Non-unit strides are handled by the scalar loop at .L50. */
        cmpq $SIZE, INCX
        jne .L50
        cmpq $SIZE, INCY
        jne .L50

        /* Bias both pointers by 16 elements so the unrolled loop can use
           negative displacements. */
        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y

        /* If Y is not 16-byte aligned, peel off one element first. */
        testq $SIZE, Y
        je .L10

        movsd -16 * SIZE(X), %xmm0
        mulsd -16 * SIZE(Y), %xmm0
        addq $1 * SIZE, X
        addq $1 * SIZE, Y
        decq N
        ALIGN_2
.L10:
        /* Y is now 16-byte aligned.  If X is aligned as well, process
           16 elements per iteration with aligned loads; otherwise go
           to the shifted-load path at .L20. */
        testq $SIZE, X
        jne .L20

        movq N, %rax
        sarq $4, %rax
        jle .L14

        /* Preload eight pairs of X ahead of the main loop. */
        movaps -16 * SIZE(X), %xmm4
        movaps -14 * SIZE(X), %xmm5
        movaps -12 * SIZE(X), %xmm6
        movaps -10 * SIZE(X), %xmm7
        movaps  -8 * SIZE(X), %xmm8
        movaps  -6 * SIZE(X), %xmm9
        movaps  -4 * SIZE(X), %xmm10
        movaps  -2 * SIZE(X), %xmm11

        decq %rax
        jle .L12
        ALIGN_3

.L11:
        /* Main loop: 16 elements per iteration, aligned X and Y. */
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        movaps  0 * SIZE(X), %xmm4

        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        movaps  2 * SIZE(X), %xmm5

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        movaps  4 * SIZE(X), %xmm6

        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3
        movaps  6 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        mulpd  -8 * SIZE(Y), %xmm8
        addpd %xmm8, %xmm0
        movaps  8 * SIZE(X), %xmm8

        mulpd  -6 * SIZE(Y), %xmm9
        addpd %xmm9, %xmm1
        movaps 10 * SIZE(X), %xmm9

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        mulpd  -4 * SIZE(Y), %xmm10
        addpd %xmm10, %xmm2
        movaps 12 * SIZE(X), %xmm10

        mulpd  -2 * SIZE(Y), %xmm11
        addpd %xmm11, %xmm3
        movaps 14 * SIZE(X), %xmm11

        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y

        decq %rax
        jg .L11
        ALIGN_3

.L12:
        /* Drain the last set of preloaded registers. */
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3

        mulpd  -8 * SIZE(Y), %xmm8
        addpd %xmm8, %xmm0
        mulpd  -6 * SIZE(Y), %xmm9
        addpd %xmm9, %xmm1
        mulpd  -4 * SIZE(Y), %xmm10
        addpd %xmm10, %xmm2
        mulpd  -2 * SIZE(Y), %xmm11
        addpd %xmm11, %xmm3

        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y
        ALIGN_3

.L14:
        /* Handle the remaining N mod 16 elements. */
        testq $15, N
        jle .L999

        testq $8, N
        jle .L15

        movaps -16 * SIZE(X), %xmm4
        movaps -14 * SIZE(X), %xmm5
        movaps -12 * SIZE(X), %xmm6
        movaps -10 * SIZE(X), %xmm7

        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3

        addq $8 * SIZE, X
        addq $8 * SIZE, Y
        ALIGN_3

.L15:
        testq $4, N
        jle .L16

        movaps -16 * SIZE(X), %xmm4
        movaps -14 * SIZE(X), %xmm5

        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1

        addq $4 * SIZE, X
        addq $4 * SIZE, Y
        ALIGN_3

.L16:
        testq $2, N
        jle .L17

        movaps -16 * SIZE(X), %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        addq $2 * SIZE, X
        addq $2 * SIZE, Y
        ALIGN_3

.L17:
        testq $1, N
        jle .L999

        movsd -16 * SIZE(X), %xmm4
        mulsd -16 * SIZE(Y), %xmm4
        addsd %xmm4, %xmm0
        jmp .L999
        ALIGN_3
.L20:
#ifdef ALIGNED_ACCESS
        /* X is odd-aligned while Y is 16-byte aligned: carry one element of
           X in the high half of %xmm4 and stitch pairs together with SHUFPD,
           so Y can still be read with aligned packed operations. */
        movhps -16 * SIZE(X), %xmm4
        addq $SIZE, X

        movq N, %rax
        sarq $4, %rax
        jle .L24

        movaps -16 * SIZE(X), %xmm5
        movaps -14 * SIZE(X), %xmm6
        movaps -12 * SIZE(X), %xmm7
        movaps -10 * SIZE(X), %xmm8
        movaps  -8 * SIZE(X), %xmm9
        movaps  -6 * SIZE(X), %xmm10
        movaps  -4 * SIZE(X), %xmm11

        decq %rax
        jle .L22
        ALIGN_3

.L21:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        SHUFPD_1 %xmm5, %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        movaps -2 * SIZE(X), %xmm4

        SHUFPD_1 %xmm6, %xmm5
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        movaps  0 * SIZE(X), %xmm5

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        SHUFPD_1 %xmm7, %xmm6
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        movaps  2 * SIZE(X), %xmm6

        SHUFPD_1 %xmm8, %xmm7
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3
        movaps  4 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        SHUFPD_1 %xmm9, %xmm8
        mulpd  -8 * SIZE(Y), %xmm8
        addpd %xmm8, %xmm0
        movaps  6 * SIZE(X), %xmm8

        SHUFPD_1 %xmm10, %xmm9
        mulpd  -6 * SIZE(Y), %xmm9
        addpd %xmm9, %xmm1
        movaps  8 * SIZE(X), %xmm9

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        SHUFPD_1 %xmm11, %xmm10
        mulpd  -4 * SIZE(Y), %xmm10
        addpd %xmm10, %xmm2
        movaps 10 * SIZE(X), %xmm10

        SHUFPD_1 %xmm4, %xmm11
        mulpd  -2 * SIZE(Y), %xmm11
        addpd %xmm11, %xmm3
        movaps 12 * SIZE(X), %xmm11

        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y

        decq %rax
        jg .L21
        ALIGN_3

.L22:
        SHUFPD_1 %xmm5, %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        movaps -2 * SIZE(X), %xmm4

        SHUFPD_1 %xmm6, %xmm5
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1

        SHUFPD_1 %xmm7, %xmm6
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2

        SHUFPD_1 %xmm8, %xmm7
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3

        SHUFPD_1 %xmm9, %xmm8
        mulpd  -8 * SIZE(Y), %xmm8
        addpd %xmm8, %xmm0

        SHUFPD_1 %xmm10, %xmm9
        mulpd  -6 * SIZE(Y), %xmm9
        addpd %xmm9, %xmm1

        SHUFPD_1 %xmm11, %xmm10
        mulpd  -4 * SIZE(Y), %xmm10
        addpd %xmm10, %xmm2

        SHUFPD_1 %xmm4, %xmm11
        mulpd  -2 * SIZE(Y), %xmm11
        addpd %xmm11, %xmm3

        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y
        ALIGN_3

.L24:
        /* Remaining N mod 16 elements in the shifted-load path. */
        testq $15, N
        jle .L999

        testq $8, N
        jle .L25

        movaps -16 * SIZE(X), %xmm5
        movaps -14 * SIZE(X), %xmm6
        movaps -12 * SIZE(X), %xmm7

        SHUFPD_1 %xmm5, %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        movaps -10 * SIZE(X), %xmm4

        SHUFPD_1 %xmm6, %xmm5
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1

        SHUFPD_1 %xmm7, %xmm6
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2

        SHUFPD_1 %xmm4, %xmm7
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3

        addq $8 * SIZE, X
        addq $8 * SIZE, Y
        ALIGN_3

.L25:
        testq $4, N
        jle .L26

        movaps -16 * SIZE(X), %xmm5
        movaps -14 * SIZE(X), %xmm6

        SHUFPD_1 %xmm5, %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0

        SHUFPD_1 %xmm6, %xmm5
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1

        movapd %xmm6, %xmm4
        addq $4 * SIZE, X
        addq $4 * SIZE, Y
        ALIGN_3

.L26:
        testq $2, N
        jle .L27

        movaps -16 * SIZE(X), %xmm5
        SHUFPD_1 %xmm5, %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        movapd %xmm5, %xmm4
        addq $2 * SIZE, X
        addq $2 * SIZE, Y
        ALIGN_3

.L27:
        testq $1, N
        jle .L999

        SHUFPD_1 %xmm4, %xmm4
        mulsd -16 * SIZE(Y), %xmm4
        addsd %xmm4, %xmm0
        jmp .L999
        ALIGN_3
#else
        /* Without ALIGNED_ACCESS: load the misaligned X element-wise with
           movlps/movhps and keep packed arithmetic against the aligned Y. */
        movq N, %rax
        sarq $4, %rax
        jle .L24

        movlps -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movlps -14 * SIZE(X), %xmm5
        movhps -13 * SIZE(X), %xmm5
        movlps -12 * SIZE(X), %xmm6
        movhps -11 * SIZE(X), %xmm6
        movlps -10 * SIZE(X), %xmm7
        movhps  -9 * SIZE(X), %xmm7
        movlps  -8 * SIZE(X), %xmm8
        movhps  -7 * SIZE(X), %xmm8
        movlps  -6 * SIZE(X), %xmm9
        movhps  -5 * SIZE(X), %xmm9
        movlps  -4 * SIZE(X), %xmm10
        movhps  -3 * SIZE(X), %xmm10
        movlps  -2 * SIZE(X), %xmm11
        movhps  -1 * SIZE(X), %xmm11

        decq %rax
        jle .L22
        ALIGN_3

.L21:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        movlps  0 * SIZE(X), %xmm4
        movhps  1 * SIZE(X), %xmm4

        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        movlps  2 * SIZE(X), %xmm5
        movhps  3 * SIZE(X), %xmm5

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        movlps  4 * SIZE(X), %xmm6
        movhps  5 * SIZE(X), %xmm6

        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3
        movlps  6 * SIZE(X), %xmm7
        movhps  7 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        mulpd  -8 * SIZE(Y), %xmm8
        addpd %xmm8, %xmm0
        movlps  8 * SIZE(X), %xmm8
        movhps  9 * SIZE(X), %xmm8

        mulpd  -6 * SIZE(Y), %xmm9
        addpd %xmm9, %xmm1
        movlps 10 * SIZE(X), %xmm9
        movhps 11 * SIZE(X), %xmm9

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        mulpd  -4 * SIZE(Y), %xmm10
        addpd %xmm10, %xmm2
        movlps 12 * SIZE(X), %xmm10
        movhps 13 * SIZE(X), %xmm10

        mulpd  -2 * SIZE(Y), %xmm11
        addpd %xmm11, %xmm3
        movlps 14 * SIZE(X), %xmm11
        movhps 15 * SIZE(X), %xmm11

        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y

        decq %rax
        jg .L21
        ALIGN_3

.L22:
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3

        mulpd  -8 * SIZE(Y), %xmm8
        addpd %xmm8, %xmm0
        mulpd  -6 * SIZE(Y), %xmm9
        addpd %xmm9, %xmm1
        mulpd  -4 * SIZE(Y), %xmm10
        addpd %xmm10, %xmm2
        mulpd  -2 * SIZE(Y), %xmm11
        addpd %xmm11, %xmm3

        subq $-16 * SIZE, X
        subq $-16 * SIZE, Y
        ALIGN_3

.L24:
        /* Remaining N mod 16 elements. */
        testq $15, N
        jle .L999

        testq $8, N
        jle .L25

        movlps -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movlps -14 * SIZE(X), %xmm5
        movhps -13 * SIZE(X), %xmm5
        movlps -12 * SIZE(X), %xmm6
        movhps -11 * SIZE(X), %xmm6
        movlps -10 * SIZE(X), %xmm7
        movhps  -9 * SIZE(X), %xmm7

        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1
        mulpd -12 * SIZE(Y), %xmm6
        addpd %xmm6, %xmm2
        mulpd -10 * SIZE(Y), %xmm7
        addpd %xmm7, %xmm3

        addq $8 * SIZE, X
        addq $8 * SIZE, Y
        ALIGN_3

.L25:
        testq $4, N
        jle .L26

        movlps -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movlps -14 * SIZE(X), %xmm5
        movhps -13 * SIZE(X), %xmm5

        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        mulpd -14 * SIZE(Y), %xmm5
        addpd %xmm5, %xmm1

        addq $4 * SIZE, X
        addq $4 * SIZE, Y
        ALIGN_3

.L26:
        testq $2, N
        jle .L27

        movlps -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        mulpd -16 * SIZE(Y), %xmm4
        addpd %xmm4, %xmm0
        addq $2 * SIZE, X
        addq $2 * SIZE, Y
        ALIGN_3

.L27:
        testq $1, N
        jle .L999

        movsd -16 * SIZE(X), %xmm4
        mulsd -16 * SIZE(Y), %xmm4
        addsd %xmm4, %xmm0
        jmp .L999
        ALIGN_3
#endif
.L50:
        /* General strides: scalar loop, unrolled by four. */
        movq N, %rax
        sarq $2, %rax
        jle .L55
        ALIGN_3

.L53:
        movsd 0 * SIZE(X), %xmm4
        addq INCX, X
        mulsd 0 * SIZE(Y), %xmm4
        addq INCY, Y

        movsd 0 * SIZE(X), %xmm5
        addq INCX, X
        mulsd 0 * SIZE(Y), %xmm5
        addq INCY, Y

        movsd 0 * SIZE(X), %xmm6
        addq INCX, X
        mulsd 0 * SIZE(Y), %xmm6
        addq INCY, Y

        movsd 0 * SIZE(X), %xmm7
        addq INCX, X
        mulsd 0 * SIZE(Y), %xmm7
        addq INCY, Y

        addsd %xmm4, %xmm0
        addsd %xmm5, %xmm1
        addsd %xmm6, %xmm2
        addsd %xmm7, %xmm3

        decq %rax
        jg .L53
        ALIGN_3

.L55:
        /* Remaining N mod 4 elements. */
        movq N, %rax
        andq $3, %rax
        jle .L999
        ALIGN_3

.L56:
        movsd 0 * SIZE(X), %xmm4
        addq INCX, X
        mulsd 0 * SIZE(Y), %xmm4
        addq INCY, Y
        addsd %xmm4, %xmm0
        decq %rax
        jg .L56
        ALIGN_3
.L999:
        /* Reduce the four accumulators into the low lane of %xmm0. */
        addpd %xmm1, %xmm0
        addpd %xmm3, %xmm2
        addpd %xmm2, %xmm0

#ifndef HAVE_SSE3
        pshufd $0xe, %xmm0, %xmm1
        addsd %xmm1, %xmm0
#else
        haddpd %xmm0, %xmm0
#endif

        RESTOREREGISTERS

        ret
        EPILOGUE
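For reference, the .L999 epilogue folds %xmm1-%xmm3 into %xmm0 and then performs a horizontal add, using haddpd when HAVE_SSE3 is defined and a pshufd $0xe plus addsd sequence otherwise. A rough SSE-intrinsics sketch of that reduction follows; the helper name reduce_accumulators is made up for illustration only.

    #include <emmintrin.h>      /* SSE2 */
    #ifdef __SSE3__
    #include <pmmintrin.h>      /* SSE3: _mm_hadd_pd */
    #endif

    /* Illustrative helper, not part of this file. */
    static double reduce_accumulators(__m128d acc0, __m128d acc1,
                                      __m128d acc2, __m128d acc3)
    {
        /* Fold the four packed accumulators into one register. */
        __m128d sum = _mm_add_pd(_mm_add_pd(acc0, acc1),
                                 _mm_add_pd(acc2, acc3));
    #ifdef __SSE3__
        /* Horizontal add of the two lanes, as haddpd does. */
        sum = _mm_hadd_pd(sum, sum);
    #else
        /* Move the high double down and add, mirroring pshufd + addsd. */
        __m128d hi = _mm_unpackhi_pd(sum, sum);
        sum = _mm_add_sd(sum, hi);
    #endif
        return _mm_cvtsd_f64(sum);
    }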