/* rot_sse2.S */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Build-time setup: mark this translation unit as assembler input for
   * the shared header, then give the argument registers and the two
   * coefficient registers readable names.
   */
#define ASSEMBLER
#include "common.h"

/*
 * Argument aliases. ARG1..ARG5 are defined outside this file; the register
 * names in the trailing comments are the original author's annotations
 * (presumably the non-Windows mapping - confirm against common.h).
 */
#define N	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */

/*
 * With WINDOWS_ABI defined, INCY is not taken from ARG5: the routine loads
 * it from the stack into %r10 right after its prologue.
 */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8 */
#else
#define INCY	%r10
#endif

/* The routine keeps its two scalar coefficients, copied into both 64-bit lanes, here. */
#define C	%xmm14
#define S	%xmm15

#include "l1param.h"
  /*
   * In-place plane rotation of two vectors, double precision, SSE2:
   *
   *     x[i] <- c * x[i] + s * y[i]
   *     y[i] <- c * y[i] - s * x[i]      (computed from the old x[i])
   *
   * Inputs: N, X, INCX, Y, INCY under the names #defined earlier in this
   * file; c arrives in %xmm0 and s in %xmm1 (both are first read from the
   * stack when WINDOWS_ABI is defined).
   * Register use: %rax is the loop counter, C and S hold c and s in both
   * lanes, %xmm0-%xmm11 are scratch.
   *
   * Layout:
   *   entry .. .L10  setup and an optional one-element peel on X
   *   .L11 - .L17    unit stride, Y passes the same address test as X
   *   .L20 - .L27    unit stride, Y fails that test (off by one element)
   *   .L50 - .L56    every other stride, including zero
   *   .L999          common exit
   *
   * PROLOGUE, PROFCODE, SAVEREGISTERS, RESTOREREGISTERS, EPILOGUE, ALIGN_n,
   * PREFETCHW, PREFETCHSIZE, PREOFFSET, SHUFPD_1, SIZE and ARGn are not
   * defined in this file; presumably they come from the included headers.
   * The notes below assume SIZE is 8 (the arithmetic is on 64-bit
   * elements) and that X and Y are SIZE-aligned - confirm with callers.
   *
   * Several branches use jle directly after testq/andq with a small
   * positive mask. Those instructions clear OF and the masked result has
   * its sign bit clear, so jle is taken exactly when the result is zero.
   */
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	/* INCY and the two scalars are fetched from the stack in this configuration. */
	movq	40(%rsp), INCY
	movsd	48(%rsp), %xmm0
	movsd	56(%rsp), %xmm1
#endif

	SAVEREGISTERS

	/* Turn the element increments into byte increments. */
	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	/* 0x44 selects dwords 0,1,0,1: the low 64 bits are copied to both lanes. */
	pshufd	$0x44, %xmm0, C
	pshufd	$0x44, %xmm1, S

	/* Nothing to do for N <= 0. */
	cmpq	$0, N
	jle	.L999

	/* Anything but unit stride on both vectors takes the generic path. */
	cmpq	$SIZE, INCX
	jne	.L50
	cmpq	$SIZE, INCY
	jne	.L50

	/*
	 * The packed code below uses movaps on X, which needs a 16-byte
	 * aligned address. If the SIZE bit of X is set, rotate one element
	 * with scalar instructions and step both pointers past it.
	 */
	testq	$SIZE, X
	je	.L10

	movsd	0 * SIZE(Y), %xmm1
	movsd	0 * SIZE(X), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulsd	C, %xmm0		/* c * x */
	mulsd	S, %xmm1		/* s * y */
	mulsd	C, %xmm2		/* c * y */
	mulsd	S, %xmm3		/* s * x */

	addsd	%xmm1, %xmm0		/* new x = c * x + s * y */
	subsd	%xmm3, %xmm2		/* new y = c * y - s * x */

	movsd	%xmm0, 0 * SIZE(X)
	movsd	%xmm2, 0 * SIZE(Y)

	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	N
	jle	.L999
	ALIGN_2

.L10:
	/* Y with its SIZE bit set cannot use movaps directly: handled at .L20. */
	testq	$SIZE, Y
	jne	.L20

	/* %rax = number of whole 16-element blocks. */
	movq	N, %rax
	sarq	$4, %rax
	jle	.L14

	/* Preload elements 0-7 of both vectors for the pipelined loop. */
	movaps	0 * SIZE(Y), %xmm1
	movaps	2 * SIZE(Y), %xmm3
	movaps	4 * SIZE(Y), %xmm9
	movaps	6 * SIZE(Y), %xmm11

	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(X), %xmm2
	movaps	4 * SIZE(X), %xmm8
	movaps	6 * SIZE(X), %xmm10

	/* The last block is finished at .L12, which loads nothing beyond it. */
	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
	/*
	 * One iteration rotates 16 elements. Each quarter consumes four
	 * already-loaded elements and reloads its registers with the four
	 * elements that lie eight positions further on.
	 */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	/* Elements 0-3; reload with 8-11. */
	movaps	%xmm1, %xmm4
	mulpd	S, %xmm1
	movaps	%xmm3, %xmm6
	mulpd	S, %xmm3
	movaps	%xmm0, %xmm5
	mulpd	C, %xmm0
	movaps	%xmm2, %xmm7
	mulpd	C, %xmm2

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	movaps	8 * SIZE(Y), %xmm1
	addpd	%xmm3, %xmm2
	movaps	10 * SIZE(Y), %xmm3
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, 0 * SIZE(X)
	movaps	8 * SIZE(X), %xmm0
	movaps	%xmm2, 2 * SIZE(X)
	movaps	10 * SIZE(X), %xmm2
	movaps	%xmm4, 0 * SIZE(Y)
	movaps	%xmm6, 2 * SIZE(Y)

	/* Elements 4-7; reload with 12-15. */
	movaps	%xmm9, %xmm4
	mulpd	S, %xmm9
	movaps	%xmm8, %xmm5
	mulpd	C, %xmm8
	movaps	%xmm11, %xmm6
	mulpd	S, %xmm11
	movaps	%xmm10, %xmm7
	mulpd	C, %xmm10

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm9, %xmm8
	movaps	12 * SIZE(Y), %xmm9
	addpd	%xmm11, %xmm10
	movaps	14 * SIZE(Y), %xmm11
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm8, 4 * SIZE(X)
	movaps	12 * SIZE(X), %xmm8
	movaps	%xmm10, 6 * SIZE(X)
	movaps	14 * SIZE(X), %xmm10
	movaps	%xmm4, 4 * SIZE(Y)
	movaps	%xmm6, 6 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	/* Elements 8-11; reload with 16-19 (start of the next block). */
	movaps	%xmm1, %xmm4
	mulpd	S, %xmm1
	movaps	%xmm3, %xmm6
	mulpd	S, %xmm3
	movaps	%xmm0, %xmm5
	mulpd	C, %xmm0
	movaps	%xmm2, %xmm7
	mulpd	C, %xmm2

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	movaps	16 * SIZE(Y), %xmm1
	addpd	%xmm3, %xmm2
	movaps	18 * SIZE(Y), %xmm3
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 8 * SIZE(X)
	movaps	16 * SIZE(X), %xmm0
	movaps	%xmm2, 10 * SIZE(X)
	movaps	18 * SIZE(X), %xmm2
	movaps	%xmm4, 8 * SIZE(Y)
	movaps	%xmm6, 10 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	/* Elements 12-15; reload with 20-23. */
	movaps	%xmm9, %xmm4
	mulpd	S, %xmm9
	movaps	%xmm8, %xmm5
	mulpd	C, %xmm8
	movaps	%xmm11, %xmm6
	mulpd	S, %xmm11
	movaps	%xmm10, %xmm7
	mulpd	C, %xmm10

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm9, %xmm8
	movaps	20 * SIZE(Y), %xmm9
	addpd	%xmm11, %xmm10
	movaps	22 * SIZE(Y), %xmm11
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm8, 12 * SIZE(X)
	movaps	20 * SIZE(X), %xmm8
	movaps	%xmm10, 14 * SIZE(X)
	movaps	22 * SIZE(X), %xmm10
	movaps	%xmm4, 12 * SIZE(Y)
	movaps	%xmm6, 14 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	/*
	 * Final 16-element block. Same arithmetic as .L11, but the second
	 * half performs no loads, so nothing past element 15 is read.
	 */

	/* Elements 0-3; reload with 8-11. */
	movaps	%xmm1, %xmm4
	mulpd	S, %xmm1
	movaps	%xmm3, %xmm6
	mulpd	S, %xmm3
	movaps	%xmm0, %xmm5
	mulpd	C, %xmm0
	movaps	%xmm2, %xmm7
	mulpd	C, %xmm2

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	movaps	8 * SIZE(Y), %xmm1
	addpd	%xmm3, %xmm2
	movaps	10 * SIZE(Y), %xmm3
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(X)
	movaps	8 * SIZE(X), %xmm0
	movaps	%xmm2, 2 * SIZE(X)
	movaps	10 * SIZE(X), %xmm2
	movaps	%xmm4, 0 * SIZE(Y)
	movaps	%xmm6, 2 * SIZE(Y)

	/* Elements 4-7; reload with 12-15. */
	movaps	%xmm9, %xmm4
	mulpd	S, %xmm9
	movaps	%xmm8, %xmm5
	mulpd	C, %xmm8
	movaps	%xmm11, %xmm6
	mulpd	S, %xmm11
	movaps	%xmm10, %xmm7
	mulpd	C, %xmm10

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm9, %xmm8
	movaps	12 * SIZE(Y), %xmm9
	addpd	%xmm11, %xmm10
	movaps	14 * SIZE(Y), %xmm11
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm8, 4 * SIZE(X)
	movaps	12 * SIZE(X), %xmm8
	movaps	%xmm10, 6 * SIZE(X)
	movaps	14 * SIZE(X), %xmm10
	movaps	%xmm4, 4 * SIZE(Y)
	movaps	%xmm6, 6 * SIZE(Y)

	/* Elements 8-11; no further loads. */
	movaps	%xmm1, %xmm4
	mulpd	S, %xmm1
	movaps	%xmm3, %xmm6
	mulpd	S, %xmm3
	movaps	%xmm0, %xmm5
	mulpd	C, %xmm0
	movaps	%xmm2, %xmm7
	mulpd	C, %xmm2

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 8 * SIZE(X)
	movaps	%xmm2, 10 * SIZE(X)
	movaps	%xmm4, 8 * SIZE(Y)
	movaps	%xmm6, 10 * SIZE(Y)

	/* Elements 12-15; no further loads. */
	movaps	%xmm9, %xmm4
	mulpd	S, %xmm9
	movaps	%xmm8, %xmm5
	mulpd	C, %xmm8
	movaps	%xmm11, %xmm6
	mulpd	S, %xmm11
	movaps	%xmm10, %xmm7
	mulpd	C, %xmm10

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm8, 12 * SIZE(X)
	movaps	%xmm10, 14 * SIZE(X)
	movaps	%xmm4, 12 * SIZE(Y)
	movaps	%xmm6, 14 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	/* Remainder: the low four bits of N select chunks of 8, 4, 2 and 1. */
	testq	$15, N
	jle	.L999

	testq	$8, N
	jle	.L15

	/* Eight elements, as two independent groups of four. */
	movaps	0 * SIZE(Y), %xmm1
	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(Y), %xmm3
	movaps	2 * SIZE(X), %xmm2

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(X)
	movaps	%xmm2, 2 * SIZE(X)
	movaps	%xmm4, 0 * SIZE(Y)
	movaps	%xmm6, 2 * SIZE(Y)

	movaps	4 * SIZE(Y), %xmm1
	movaps	4 * SIZE(X), %xmm0
	movaps	6 * SIZE(Y), %xmm3
	movaps	6 * SIZE(X), %xmm2

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 4 * SIZE(X)
	movaps	%xmm2, 6 * SIZE(X)
	movaps	%xmm4, 4 * SIZE(Y)
	movaps	%xmm6, 6 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, N
	jle	.L16

	/* Four elements. */
	movaps	0 * SIZE(Y), %xmm1
	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(Y), %xmm3
	movaps	2 * SIZE(X), %xmm2

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(X)
	movaps	%xmm2, 2 * SIZE(X)
	movaps	%xmm4, 0 * SIZE(Y)
	movaps	%xmm6, 2 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, N
	jle	.L17

	/* Two elements. */
	movaps	0 * SIZE(Y), %xmm1
	movaps	0 * SIZE(X), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	addpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movaps	%xmm0, 0 * SIZE(X)
	movaps	%xmm2, 0 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, N
	jle	.L999

	/* Last single element, scalar. */
	movsd	0 * SIZE(Y), %xmm1
	movsd	0 * SIZE(X), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulsd	C, %xmm0
	mulsd	S, %xmm1
	mulsd	C, %xmm2
	mulsd	S, %xmm3

	addsd	%xmm1, %xmm0
	subsd	%xmm3, %xmm2

	movsd	%xmm0, 0 * SIZE(X)
	movsd	%xmm2, 0 * SIZE(Y)
	jmp	.L999
	ALIGN_3

.L20:
	/*
	 * Unit stride, X usable with movaps but Y one element off. Y is read
	 * with movaps at odd element offsets (-1, 1, 3, ...), so every load
	 * straddles two logical pairs; SHUFPD_1 joins the high lane of the
	 * previous load with the low lane of the next one (presumably
	 * shufpd $1 - confirm the macro). The register holding the most
	 * recent load always carries the next unprocessed y in its high lane.
	 * Results go back to Y as two 64-bit halves (movlpd/movhps).
	 */
	movaps	-1 * SIZE(Y), %xmm1	/* high lane = y[0] */

	movq	N, %rax
	sarq	$4, %rax
	jle	.L24
	ALIGN_3

.L21:
	/* 16 elements per iteration, four groups of four. */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	/* Elements 0-3. */
	movaps	1 * SIZE(Y), %xmm3
	movaps	3 * SIZE(Y), %xmm8
	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(X), %xmm2

	SHUFPD_1 %xmm3, %xmm1
	SHUFPD_1 %xmm8, %xmm3

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(X)
	movaps	%xmm2, 2 * SIZE(X)
	movlpd	%xmm4, 0 * SIZE(Y)
	movhps	%xmm4, 1 * SIZE(Y)
	movlpd	%xmm6, 2 * SIZE(Y)
	movhps	%xmm6, 3 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	/* Elements 4-7; %xmm8 still holds the load from 3 * SIZE(Y). */
	movaps	5 * SIZE(Y), %xmm9
	movaps	7 * SIZE(Y), %xmm1
	movaps	4 * SIZE(X), %xmm0
	movaps	6 * SIZE(X), %xmm2

	SHUFPD_1 %xmm9, %xmm8
	SHUFPD_1 %xmm1, %xmm9

	movaps	%xmm8, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm9, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm8
	mulpd	C, %xmm2
	mulpd	S, %xmm9

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 4 * SIZE(X)
	movaps	%xmm2, 6 * SIZE(X)
	movlpd	%xmm4, 4 * SIZE(Y)
	movhps	%xmm4, 5 * SIZE(Y)
	movlpd	%xmm6, 6 * SIZE(Y)
	movhps	%xmm6, 7 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	/* Elements 8-11; %xmm1 still holds the load from 7 * SIZE(Y). */
	movaps	9 * SIZE(Y), %xmm3
	movaps	11 * SIZE(Y), %xmm8
	movaps	8 * SIZE(X), %xmm0
	movaps	10 * SIZE(X), %xmm2

	SHUFPD_1 %xmm3, %xmm1
	SHUFPD_1 %xmm8, %xmm3

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 8 * SIZE(X)
	movaps	%xmm2, 10 * SIZE(X)
	movlpd	%xmm4, 8 * SIZE(Y)
	movhps	%xmm4, 9 * SIZE(Y)
	movlpd	%xmm6, 10 * SIZE(Y)
	movhps	%xmm6, 11 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	/* Elements 12-15; the load into %xmm1 is carried to the next iteration. */
	movaps	13 * SIZE(Y), %xmm9
	movaps	15 * SIZE(Y), %xmm1
	movaps	12 * SIZE(X), %xmm0
	movaps	14 * SIZE(X), %xmm2

	SHUFPD_1 %xmm9, %xmm8
	SHUFPD_1 %xmm1, %xmm9

	movaps	%xmm8, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm9, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm8
	mulpd	C, %xmm2
	mulpd	S, %xmm9

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 12 * SIZE(X)
	movaps	%xmm2, 14 * SIZE(X)
	movlpd	%xmm4, 12 * SIZE(Y)
	movhps	%xmm4, 13 * SIZE(Y)
	movlpd	%xmm6, 14 * SIZE(Y)
	movhps	%xmm6, 15 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L24:
	/* Remainder for the offset-Y case: chunks of 8, 4, 2 and 1. */
	testq	$15, N
	jle	.L999

	testq	$8, N
	jle	.L25

	/* Eight elements: 0-3 ... */
	movaps	1 * SIZE(Y), %xmm3
	movaps	3 * SIZE(Y), %xmm8
	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(X), %xmm2

	SHUFPD_1 %xmm3, %xmm1
	SHUFPD_1 %xmm8, %xmm3

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(X)
	movaps	%xmm2, 2 * SIZE(X)
	movlpd	%xmm4, 0 * SIZE(Y)
	movhps	%xmm4, 1 * SIZE(Y)
	movlpd	%xmm6, 2 * SIZE(Y)
	movhps	%xmm6, 3 * SIZE(Y)

	/* ... then 4-7; %xmm1 ends up holding the load from 7 * SIZE(Y). */
	movaps	5 * SIZE(Y), %xmm9
	movaps	7 * SIZE(Y), %xmm1
	movaps	4 * SIZE(X), %xmm0
	movaps	6 * SIZE(X), %xmm2

	SHUFPD_1 %xmm9, %xmm8
	SHUFPD_1 %xmm1, %xmm9

	movaps	%xmm8, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm9, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm8
	mulpd	C, %xmm2
	mulpd	S, %xmm9

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 4 * SIZE(X)
	movaps	%xmm2, 6 * SIZE(X)
	movlpd	%xmm4, 4 * SIZE(Y)
	movhps	%xmm4, 5 * SIZE(Y)
	movlpd	%xmm6, 6 * SIZE(Y)
	movhps	%xmm6, 7 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, N
	jle	.L26

	/* Four elements. */
	movaps	1 * SIZE(Y), %xmm3
	movaps	3 * SIZE(Y), %xmm8
	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(X), %xmm2

	SHUFPD_1 %xmm3, %xmm1
	SHUFPD_1 %xmm8, %xmm3

	movaps	%xmm1, %xmm4
	movaps	%xmm0, %xmm5
	movaps	%xmm3, %xmm6
	movaps	%xmm2, %xmm7

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	mulpd	C, %xmm4
	mulpd	S, %xmm5
	mulpd	C, %xmm6
	mulpd	S, %xmm7

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(X)
	movaps	%xmm2, 2 * SIZE(X)
	movlpd	%xmm4, 0 * SIZE(Y)
	movhps	%xmm4, 1 * SIZE(Y)
	movlpd	%xmm6, 2 * SIZE(Y)
	movhps	%xmm6, 3 * SIZE(Y)

	/* Keep the last raw Y load in %xmm1 for the chunks that follow. */
	movaps	%xmm8, %xmm1

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	testq	$2, N
	jle	.L27

	/* Two elements. */
	movaps	1 * SIZE(Y), %xmm4
	movaps	0 * SIZE(X), %xmm0

	SHUFPD_1 %xmm4, %xmm1

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	addpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movaps	%xmm0, 0 * SIZE(X)
	movlpd	%xmm2, 0 * SIZE(Y)
	movhps	%xmm2, 1 * SIZE(Y)

	/* Keep the last raw Y load in %xmm1 for the final element. */
	movaps	%xmm4, %xmm1

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, N
	jle	.L999

	/* The remaining y is already in the high lane of %xmm1; no Y load here. */
	unpckhpd %xmm1, %xmm1
	movsd	0 * SIZE(X), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulsd	C, %xmm0
	mulsd	S, %xmm1
	mulsd	C, %xmm2
	mulsd	S, %xmm3

	addsd	%xmm1, %xmm0
	subsd	%xmm3, %xmm2

	movsd	%xmm0, 0 * SIZE(X)
	movsd	%xmm2, 0 * SIZE(Y)
	jmp	.L999
	ALIGN_3

.L50:
	/*
	 * Generic stride. When either byte increment is zero, every element
	 * goes through the scalar loop at .L56 with %rax = N, one at a time
	 * in order.
	 */
	movq	N, %rax

	cmpq	$0, INCX
	je	.L56
	cmpq	$0, INCY
	je	.L56

	/* %rax = number of 4-element groups. */
	sarq	$2, %rax
	jle	.L55
	ALIGN_3

.L53:
	/* Four elements per iteration, gathered and scattered as two pairs. */
	movsd	(Y), %xmm1
	movhps	(Y, INCY), %xmm1
	movsd	(X), %xmm0
	movhps	(X, INCX), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	addpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movlpd	%xmm0, (X)
	movhps	%xmm0, (X, INCX)
	movlpd	%xmm2, (Y)
	movhps	%xmm2, (Y, INCY)

	leaq	(X, INCX, 2), X
	leaq	(Y, INCY, 2), Y

	movsd	(Y), %xmm1
	movhps	(Y, INCY), %xmm1
	movsd	(X), %xmm0
	movhps	(X, INCX), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulpd	C, %xmm0
	mulpd	S, %xmm1
	mulpd	C, %xmm2
	mulpd	S, %xmm3

	addpd	%xmm1, %xmm0
	subpd	%xmm3, %xmm2

	movlpd	%xmm0, (X)
	movhps	%xmm0, (X, INCX)
	movlpd	%xmm2, (Y)
	movhps	%xmm2, (Y, INCY)

	leaq	(X, INCX, 2), X
	leaq	(Y, INCY, 2), Y

	decq	%rax
	jg	.L53
	ALIGN_3

.L55:
	/* %rax = N mod 4 leftover elements; exit when there are none. */
	movq	N, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L56:
	/* Scalar loop: one element per iteration, %rax iterations. */
	movsd	(Y), %xmm1
	movsd	(X), %xmm0

	movaps	%xmm1, %xmm2
	movaps	%xmm0, %xmm3

	mulsd	C, %xmm0
	mulsd	S, %xmm1
	mulsd	C, %xmm2
	mulsd	S, %xmm3

	addsd	%xmm1, %xmm0
	subsd	%xmm3, %xmm2

	movsd	%xmm0, (X)
	movsd	%xmm2, (Y)

	addq	INCX, X
	addq	INCY, Y

	decq	%rax
	jg	.L56
	ALIGN_3

.L999:
	/* Common exit. */
	RESTOREREGISTERS

	ret

	EPILOGUE