/* rot_sse.S */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /* Build configuration and register roles used by this kernel. */
#define ASSEMBLER
#include "common.h"

/*
 * Integer arguments.  The register in each trailing comment is the one noted
 * in the original source; the real mapping is whatever ARGn expands to
 * (presumably supplied by common.h -- confirm there).
 */
#define N       ARG1    /* rdi */
#define X       ARG2    /* rsi */
#define INCX    ARG3    /* rdx */
#define Y       ARG4    /* rcx */

/*
 * INCY is ARG5, except under WINDOWS_ABI where the entry code loads it from
 * the stack into %r10.
 */
#ifndef WINDOWS_ABI
#define INCY    ARG5    /* r8 */
#else
#define INCY    %r10
#endif

/* Rotation coefficients; the entry code broadcasts them to all four lanes. */
#define C       %xmm14
#define S       %xmm15

#include "l1param.h"
  /*
   * Entry point: apply a plane rotation to N single-precision elements,
   *
   *     x[i] <- C * x[i] + S * y[i]
   *     y[i] <- C * y[i] - S * x[i]
   *
   * In:    N, X, INCX, Y, INCY (see the #defines above); the two scalars are
   *        taken from lane 0 of %xmm0 (C) and %xmm1 (S).
   * Out:   X and Y are updated in place.
   * Uses:  %rax, %xmm0-%xmm11, %xmm14, %xmm15; N, X, Y, INCX, INCY are
   *        modified.  SAVEREGISTERS/RESTOREREGISTERS are project macros;
   *        presumably they preserve whatever the ABI requires -- confirm.
   *
   * This block sets up the constants, rejects N <= 0, sends any non-unit
   * stride to .L50, and otherwise peels one element when bit SIZE of the X
   * address is set.
   */
        PROLOGUE
        PROFCODE

#ifdef WINDOWS_ABI
        /* INCY and the two scalars are fetched from the caller's stack. */
        movq    40(%rsp), INCY
        movss   48(%rsp), %xmm0
        movss   56(%rsp), %xmm1
#endif

        SAVEREGISTERS

        /* Scale both strides by SIZE (elements -> bytes). */
        leaq    (, INCX, SIZE), INCX
        leaq    (, INCY, SIZE), INCY

        /* Broadcast lane 0 of xmm0 / xmm1 into every lane of C / S. */
        pshufd  $0x0, %xmm0, C
        pshufd  $0x0, %xmm1, S

        /* Nothing to do for N <= 0. */
        testq   N, N
        jle     .L999

        /* Anything other than unit stride on both vectors goes to .L50. */
        cmpq    $SIZE, INCX
        jne     .L50
        cmpq    $SIZE, INCY
        jne     .L50

        /* Bit SIZE of X clear: no single-element peel needed. */
        testq   $SIZE, X
        je      .L05

        /* Rotate one element with scalar ops and step both pointers. */
        movss   0 * SIZE(Y), %xmm1
        movss   0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0                /* C * x */
        mulss   S, %xmm1                /* S * y */
        mulss   C, %xmm2                /* C * y */
        mulss   S, %xmm3                /* S * x */
        addss   %xmm1, %xmm0            /* new x */
        subss   %xmm3, %xmm2            /* new y */
        movss   %xmm0, 0 * SIZE(X)
        movss   %xmm2, 0 * SIZE(Y)
        addq    $1 * SIZE, X
        addq    $1 * SIZE, Y
        decq    N
        jle     .L999
  /*
   * .L05: second alignment peel.  If bit 2*SIZE of the X address is set,
   * rotate two elements so X advances by 2*SIZE bytes before the vector
   * code.  With exactly one element left the pair cannot be formed, so
   * control goes to the tail code at .L17 (which only uses 8-byte and scalar
   * accesses).
   */
.L05:
        testq   $2 * SIZE, X
        je      .L10
        cmpq    $1, N
        je      .L17

        /* movsd fills the low two lanes; only those lanes are stored back. */
        movsd   0 * SIZE(Y), %xmm1
        movsd   0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulps   C, %xmm0                /* C * x */
        mulps   S, %xmm1                /* S * y */
        mulps   C, %xmm2                /* C * y */
        mulps   S, %xmm3                /* S * x */
        addps   %xmm1, %xmm0            /* new x */
        subps   %xmm3, %xmm2            /* new y */
        movlps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        addq    $2 * SIZE, X
        addq    $2 * SIZE, Y
        subq    $2, N
        jle     .L999
        ALIGN_2
  /*
   * .L10: unit-stride path after the X peels.  If either of the 3*SIZE bits
   * of the Y address is set, the movaps-on-Y code below is skipped in favour
   * of .L20.  Otherwise %rax = N / 32 blocks are processed by the
   * software-pipelined loop .L11/.L12, and the first 16 elements of X and Y
   * are preloaded here (x in xmm0/2/8/10, y in xmm1/3/9/11).
   */
.L10:
        testq   $3 * SIZE, Y
        jne     .L20

        movq    N, %rax
        sarq    $5, %rax
        /* OF is undefined after a multi-bit shift, so set flags explicitly
         * before the signed branch instead of relying on sarq's flags. */
        testq   %rax, %rax
        jle     .L14

        movaps  0 * SIZE(Y), %xmm1
        movaps  4 * SIZE(Y), %xmm3
        movaps  8 * SIZE(Y), %xmm9
        movaps  12 * SIZE(Y), %xmm11
        movaps  0 * SIZE(X), %xmm0
        movaps  4 * SIZE(X), %xmm2
        movaps  8 * SIZE(X), %xmm8
        movaps  12 * SIZE(X), %xmm10

        /* Exactly one block: go straight to the drain code. */
        decq    %rax
        jle     .L12
        ALIGN_3
  /*
   * .L11: steady-state loop, 32 elements of X and Y per iteration, all
   * accesses via movaps.  On entry the first 16 elements are already in
   * registers (x: xmm0/2/8/10, y: xmm1/3/9/11); each group of 8 computes
   *     x' = C*x + S*y   (stored to X)
   *     y' = C*y - S*x   (stored to Y)
   * and reloads its registers with data 16 elements further on, so on exit
   * the next 16 elements are preloaded again.  xmm4-xmm7 are temporaries.
   */
.L11:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        /* Elements 0..7; reload with 16..23. */
        movaps  %xmm1, %xmm4
        mulps   S, %xmm1
        movaps  %xmm3, %xmm6
        mulps   S, %xmm3
        movaps  %xmm0, %xmm5
        mulps   C, %xmm0
        movaps  %xmm2, %xmm7
        mulps   C, %xmm2
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        movaps  16 * SIZE(Y), %xmm1
        addps   %xmm3, %xmm2
        movaps  20 * SIZE(Y), %xmm3
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        movaps  %xmm0, 0 * SIZE(X)
        movaps  16 * SIZE(X), %xmm0
        movaps  %xmm2, 4 * SIZE(X)
        movaps  20 * SIZE(X), %xmm2
        movaps  %xmm4, 0 * SIZE(Y)
        movaps  %xmm6, 4 * SIZE(Y)

        /* Elements 8..15; reload with 24..31. */
        movaps  %xmm9, %xmm4
        mulps   S, %xmm9
        movaps  %xmm8, %xmm5
        mulps   C, %xmm8
        movaps  %xmm11, %xmm6
        mulps   S, %xmm11
        movaps  %xmm10, %xmm7
        mulps   C, %xmm10
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm9, %xmm8
        movaps  24 * SIZE(Y), %xmm9
        addps   %xmm11, %xmm10
        movaps  28 * SIZE(Y), %xmm11
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm8, 8 * SIZE(X)
        movaps  24 * SIZE(X), %xmm8
        movaps  %xmm10, 12 * SIZE(X)
        movaps  28 * SIZE(X), %xmm10
        movaps  %xmm4, 8 * SIZE(Y)
        movaps  %xmm6, 12 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        /* Elements 16..23; reload with 32..39 (next iteration's 0..7). */
        movaps  %xmm1, %xmm4
        mulps   S, %xmm1
        movaps  %xmm3, %xmm6
        mulps   S, %xmm3
        movaps  %xmm0, %xmm5
        mulps   C, %xmm0
        movaps  %xmm2, %xmm7
        mulps   C, %xmm2
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        movaps  32 * SIZE(Y), %xmm1
        addps   %xmm3, %xmm2
        movaps  36 * SIZE(Y), %xmm3
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 16 * SIZE(X)
        movaps  32 * SIZE(X), %xmm0
        movaps  %xmm2, 20 * SIZE(X)
        movaps  36 * SIZE(X), %xmm2
        movaps  %xmm4, 16 * SIZE(Y)
        movaps  %xmm6, 20 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        /* Elements 24..31; reload with 40..47 (next iteration's 8..15). */
        movaps  %xmm9, %xmm4
        mulps   S, %xmm9
        movaps  %xmm8, %xmm5
        mulps   C, %xmm8
        movaps  %xmm11, %xmm6
        mulps   S, %xmm11
        movaps  %xmm10, %xmm7
        mulps   C, %xmm10
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm9, %xmm8
        movaps  40 * SIZE(Y), %xmm9
        addps   %xmm11, %xmm10
        movaps  44 * SIZE(Y), %xmm11
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm8, 24 * SIZE(X)
        movaps  40 * SIZE(X), %xmm8
        movaps  %xmm10, 28 * SIZE(X)
        movaps  44 * SIZE(X), %xmm10
        movaps  %xmm4, 24 * SIZE(Y)
        movaps  %xmm6, 28 * SIZE(Y)

        addq    $32 * SIZE, X
        addq    $32 * SIZE, Y
        decq    %rax
        jg      .L11
        ALIGN_3
  /*
   * .L12: drain of the pipelined loop -- the last 32-element block.  The
   * first 16 elements are already in registers (x: xmm0/2/8/10,
   * y: xmm1/3/9/11); elements 16..31 are loaded while the first half is
   * processed, and nothing beyond offset 31 is read.
   */
.L12:
        /* Elements 0..7; reload with 16..23. */
        movaps  %xmm1, %xmm4
        mulps   S, %xmm1
        movaps  %xmm3, %xmm6
        mulps   S, %xmm3
        movaps  %xmm0, %xmm5
        mulps   C, %xmm0
        movaps  %xmm2, %xmm7
        mulps   C, %xmm2
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        movaps  16 * SIZE(Y), %xmm1
        addps   %xmm3, %xmm2
        movaps  20 * SIZE(Y), %xmm3
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 0 * SIZE(X)
        movaps  16 * SIZE(X), %xmm0
        movaps  %xmm2, 4 * SIZE(X)
        movaps  20 * SIZE(X), %xmm2
        movaps  %xmm4, 0 * SIZE(Y)
        movaps  %xmm6, 4 * SIZE(Y)

        /* Elements 8..15; reload with 24..31. */
        movaps  %xmm9, %xmm4
        mulps   S, %xmm9
        movaps  %xmm8, %xmm5
        mulps   C, %xmm8
        movaps  %xmm11, %xmm6
        mulps   S, %xmm11
        movaps  %xmm10, %xmm7
        mulps   C, %xmm10
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm9, %xmm8
        movaps  24 * SIZE(Y), %xmm9
        addps   %xmm11, %xmm10
        movaps  28 * SIZE(Y), %xmm11
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm8, 8 * SIZE(X)
        movaps  24 * SIZE(X), %xmm8
        movaps  %xmm10, 12 * SIZE(X)
        movaps  28 * SIZE(X), %xmm10
        movaps  %xmm4, 8 * SIZE(Y)
        movaps  %xmm6, 12 * SIZE(Y)

        /* Elements 16..23; no further loads. */
        movaps  %xmm1, %xmm4
        mulps   S, %xmm1
        movaps  %xmm3, %xmm6
        mulps   S, %xmm3
        movaps  %xmm0, %xmm5
        mulps   C, %xmm0
        movaps  %xmm2, %xmm7
        mulps   C, %xmm2
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 16 * SIZE(X)
        movaps  %xmm2, 20 * SIZE(X)
        movaps  %xmm4, 16 * SIZE(Y)
        movaps  %xmm6, 20 * SIZE(Y)

        /* Elements 24..31; no further loads. */
        movaps  %xmm9, %xmm4
        mulps   S, %xmm9
        movaps  %xmm8, %xmm5
        mulps   C, %xmm8
        movaps  %xmm11, %xmm6
        mulps   S, %xmm11
        movaps  %xmm10, %xmm7
        mulps   C, %xmm10
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm9, %xmm8
        addps   %xmm11, %xmm10
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm8, 24 * SIZE(X)
        movaps  %xmm10, 28 * SIZE(X)
        movaps  %xmm4, 24 * SIZE(Y)
        movaps  %xmm6, 28 * SIZE(Y)

        addq    $32 * SIZE, X
        addq    $32 * SIZE, Y
        ALIGN_3
  /*
   * .L14: tail of the movaps path.  The low five bits of N give the elements
   * still to do; they are handled as 16 (here), 8 (.L15), 4 (.L16), 2 (.L17)
   * and 1 (.L18).  After testq with a small positive mask only ZF can make a
   * "less or equal" branch fire, so the branches are written as je.
   */
.L14:
        testq   $31, N
        je      .L999

        testq   $16, N
        je      .L15

        /* Elements 0..7. */
        movaps  0 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0
        movaps  4 * SIZE(Y), %xmm3
        movaps  4 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0            /* C*x + S*y */
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4            /* C*y - S*x */
        subps   %xmm7, %xmm6
        movaps  %xmm0, 0 * SIZE(X)
        movaps  %xmm2, 4 * SIZE(X)
        movaps  %xmm4, 0 * SIZE(Y)
        movaps  %xmm6, 4 * SIZE(Y)

        /* Elements 8..15. */
        movaps  8 * SIZE(Y), %xmm1
        movaps  8 * SIZE(X), %xmm0
        movaps  12 * SIZE(Y), %xmm3
        movaps  12 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 8 * SIZE(X)
        movaps  %xmm2, 12 * SIZE(X)
        movaps  %xmm4, 8 * SIZE(Y)
        movaps  %xmm6, 12 * SIZE(Y)

        addq    $16 * SIZE, X
        addq    $16 * SIZE, Y
        ALIGN_3
  /* .L15: movaps-path tail, 8 elements when bit 3 of N is set. */
.L15:
        testq   $8, N
        je      .L16                    /* testq leaves SF=OF=0: only ZF matters */

        movaps  0 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0
        movaps  4 * SIZE(Y), %xmm3
        movaps  4 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0            /* C*x + S*y */
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4            /* C*y - S*x */
        subps   %xmm7, %xmm6
        movaps  %xmm0, 0 * SIZE(X)
        movaps  %xmm2, 4 * SIZE(X)
        movaps  %xmm4, 0 * SIZE(Y)
        movaps  %xmm6, 4 * SIZE(Y)

        addq    $8 * SIZE, X
        addq    $8 * SIZE, Y
        ALIGN_3
  /* .L16: movaps-path tail, 4 elements when bit 2 of N is set. */
.L16:
        testq   $4, N
        je      .L17                    /* testq leaves SF=OF=0: only ZF matters */

        movaps  0 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulps   C, %xmm0                /* C * x */
        mulps   S, %xmm1                /* S * y */
        mulps   C, %xmm2                /* C * y */
        mulps   S, %xmm3                /* S * x */
        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2
        movaps  %xmm0, 0 * SIZE(X)
        movaps  %xmm2, 0 * SIZE(Y)

        addq    $4 * SIZE, X
        addq    $4 * SIZE, Y
        ALIGN_3
  /*
   * .L17: tail, 2 elements when bit 1 of N is set.  Uses 8-byte movsd/movlps
   * accesses only.  Also reached directly from .L05 when a single element
   * remains (bit 1 is clear in that case, so control falls to .L18).
   */
.L17:
        testq   $2, N
        je      .L18                    /* testq leaves SF=OF=0: only ZF matters */

        movsd   0 * SIZE(Y), %xmm1
        movsd   0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulps   C, %xmm0                /* C * x */
        mulps   S, %xmm1                /* S * y */
        mulps   C, %xmm2                /* C * y */
        mulps   S, %xmm3                /* S * x */
        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2
        movlps  %xmm0, 0 * SIZE(X)      /* only the low two lanes are stored */
        movlps  %xmm2, 0 * SIZE(Y)

        addq    $2 * SIZE, X
        addq    $2 * SIZE, Y
        ALIGN_3
  /* .L18: tail, final single element when bit 0 of N is set; then exit. */
.L18:
        testq   $1, N
        je      .L999                   /* testq leaves SF=OF=0: only ZF matters */

        movss   0 * SIZE(Y), %xmm1
        movss   0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0                /* C * x */
        mulss   S, %xmm1                /* S * y */
        mulss   C, %xmm2                /* C * y */
        mulss   S, %xmm3                /* S * x */
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, 0 * SIZE(X)
        movss   %xmm2, 0 * SIZE(Y)
        jmp     .L999
        ALIGN_3
  /*
   * .L20: unit-stride path taken from .L10 when the 3*SIZE bits of the Y
   * address are not both clear.  %rax = N / 32 blocks go through .L21, which
   * accesses Y in 8-byte halves instead of movaps.
   */
.L20:
        movq    N, %rax
        sarq    $5, %rax
        /* OF is undefined after a multi-bit shift, so set flags explicitly
         * before the signed branch instead of relying on sarq's flags. */
        testq   %rax, %rax
        jle     .L24
        ALIGN_3
  /*
   * .L21: main loop when Y cannot use movaps, 32 elements per iteration in
   * four groups of 8.  X is accessed with movaps; each Y vector is assembled
   * with movsd (low half) + movhps (high half) and written back with
   * movlps + movhps.  Per group:
   *     x' = C*x + S*y   (xmm0, xmm2)
   *     y' = C*y - S*x   (xmm4, xmm6)
   */
.L21:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        /* Elements 0..7. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movsd   4 * SIZE(Y), %xmm3
        movhps  6 * SIZE(Y), %xmm3
        movaps  0 * SIZE(X), %xmm0
        movaps  4 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 0 * SIZE(X)
        movaps  %xmm2, 4 * SIZE(X)
        movlps  %xmm4, 0 * SIZE(Y)
        movhps  %xmm4, 2 * SIZE(Y)
        movlps  %xmm6, 4 * SIZE(Y)
        movhps  %xmm6, 6 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        /* Elements 8..15. */
        movsd   8 * SIZE(Y), %xmm1
        movhps  10 * SIZE(Y), %xmm1
        movsd   12 * SIZE(Y), %xmm3
        movhps  14 * SIZE(Y), %xmm3
        movaps  8 * SIZE(X), %xmm0
        movaps  12 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 8 * SIZE(X)
        movaps  %xmm2, 12 * SIZE(X)
        movlps  %xmm4, 8 * SIZE(Y)
        movhps  %xmm4, 10 * SIZE(Y)
        movlps  %xmm6, 12 * SIZE(Y)
        movhps  %xmm6, 14 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        /* Elements 16..23. */
        movsd   16 * SIZE(Y), %xmm1
        movhps  18 * SIZE(Y), %xmm1
        movsd   20 * SIZE(Y), %xmm3
        movhps  22 * SIZE(Y), %xmm3
        movaps  16 * SIZE(X), %xmm0
        movaps  20 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 16 * SIZE(X)
        movaps  %xmm2, 20 * SIZE(X)
        movlps  %xmm4, 16 * SIZE(Y)
        movhps  %xmm4, 18 * SIZE(Y)
        movlps  %xmm6, 20 * SIZE(Y)
        movhps  %xmm6, 22 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        /* Elements 24..31. */
        movsd   24 * SIZE(Y), %xmm1
        movhps  26 * SIZE(Y), %xmm1
        movsd   28 * SIZE(Y), %xmm3
        movhps  30 * SIZE(Y), %xmm3
        movaps  24 * SIZE(X), %xmm0
        movaps  28 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 24 * SIZE(X)
        movaps  %xmm2, 28 * SIZE(X)
        movlps  %xmm4, 24 * SIZE(Y)
        movhps  %xmm4, 26 * SIZE(Y)
        movlps  %xmm6, 28 * SIZE(Y)
        movhps  %xmm6, 30 * SIZE(Y)

        addq    $32 * SIZE, X
        addq    $32 * SIZE, Y
        decq    %rax
        jg      .L21
        ALIGN_3
  /*
   * .L24: tail of the split-Y path.  The low five bits of N give the
   * elements still to do; they are handled as 16 (here), 8 (.L25), 4 (.L26),
   * 2 (.L27) and 1 (.L28).  After testq with a small positive mask only ZF
   * can make a "less or equal" branch fire, so the branches are written as je.
   */
.L24:
        testq   $31, N
        je      .L999

        testq   $16, N
        je      .L25

        /* Elements 0..7. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movsd   4 * SIZE(Y), %xmm3
        movhps  6 * SIZE(Y), %xmm3
        movaps  0 * SIZE(X), %xmm0
        movaps  4 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0            /* C*x + S*y */
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4            /* C*y - S*x */
        subps   %xmm7, %xmm6
        movaps  %xmm0, 0 * SIZE(X)
        movaps  %xmm2, 4 * SIZE(X)
        movlps  %xmm4, 0 * SIZE(Y)
        movhps  %xmm4, 2 * SIZE(Y)
        movlps  %xmm6, 4 * SIZE(Y)
        movhps  %xmm6, 6 * SIZE(Y)

        /* Elements 8..15. */
        movsd   8 * SIZE(Y), %xmm1
        movhps  10 * SIZE(Y), %xmm1
        movsd   12 * SIZE(Y), %xmm3
        movhps  14 * SIZE(Y), %xmm3
        movaps  8 * SIZE(X), %xmm0
        movaps  12 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4
        subps   %xmm7, %xmm6
        movaps  %xmm0, 8 * SIZE(X)
        movaps  %xmm2, 12 * SIZE(X)
        movlps  %xmm4, 8 * SIZE(Y)
        movhps  %xmm4, 10 * SIZE(Y)
        movlps  %xmm6, 12 * SIZE(Y)
        movhps  %xmm6, 14 * SIZE(Y)

        addq    $16 * SIZE, X
        addq    $16 * SIZE, Y
        ALIGN_3
  /* .L25: split-Y tail, 8 elements when bit 3 of N is set. */
.L25:
        testq   $8, N
        je      .L26                    /* testq leaves SF=OF=0: only ZF matters */

        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movsd   4 * SIZE(Y), %xmm3
        movhps  6 * SIZE(Y), %xmm3
        movaps  0 * SIZE(X), %xmm0
        movaps  4 * SIZE(X), %xmm2
        movaps  %xmm1, %xmm4
        movaps  %xmm0, %xmm5
        movaps  %xmm3, %xmm6
        movaps  %xmm2, %xmm7
        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3
        mulps   C, %xmm4
        mulps   S, %xmm5
        mulps   C, %xmm6
        mulps   S, %xmm7
        addps   %xmm1, %xmm0            /* C*x + S*y */
        addps   %xmm3, %xmm2
        subps   %xmm5, %xmm4            /* C*y - S*x */
        subps   %xmm7, %xmm6
        movaps  %xmm0, 0 * SIZE(X)
        movaps  %xmm2, 4 * SIZE(X)
        movlps  %xmm4, 0 * SIZE(Y)
        movhps  %xmm4, 2 * SIZE(Y)
        movlps  %xmm6, 4 * SIZE(Y)
        movhps  %xmm6, 6 * SIZE(Y)

        addq    $8 * SIZE, X
        addq    $8 * SIZE, Y
        ALIGN_3
  /* .L26: split-Y tail, 4 elements when bit 2 of N is set. */
.L26:
        testq   $4, N
        je      .L27                    /* testq leaves SF=OF=0: only ZF matters */

        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulps   C, %xmm0                /* C * x */
        mulps   S, %xmm1                /* S * y */
        mulps   C, %xmm2                /* C * y */
        mulps   S, %xmm3                /* S * x */
        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2
        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        addq    $4 * SIZE, X
        addq    $4 * SIZE, Y
        ALIGN_3
  /* .L27: split-Y tail, 2 elements when bit 1 of N is set (8-byte accesses). */
.L27:
        testq   $2, N
        je      .L28                    /* testq leaves SF=OF=0: only ZF matters */

        movsd   0 * SIZE(Y), %xmm1
        movsd   0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulps   C, %xmm0                /* C * x */
        mulps   S, %xmm1                /* S * y */
        mulps   C, %xmm2                /* C * y */
        mulps   S, %xmm3                /* S * x */
        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2
        movlps  %xmm0, 0 * SIZE(X)      /* only the low two lanes are stored */
        movlps  %xmm2, 0 * SIZE(Y)

        addq    $2 * SIZE, X
        addq    $2 * SIZE, Y
        ALIGN_3
  /* .L28: split-Y tail, final single element when bit 0 of N is set; then exit. */
.L28:
        testq   $1, N
        je      .L999                   /* testq leaves SF=OF=0: only ZF matters */

        movss   0 * SIZE(Y), %xmm1
        movss   0 * SIZE(X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0                /* C * x */
        mulss   S, %xmm1                /* S * y */
        mulss   C, %xmm2                /* C * y */
        mulss   S, %xmm3                /* S * x */
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, 0 * SIZE(X)
        movss   %xmm2, 0 * SIZE(Y)
        jmp     .L999
        ALIGN_3
  /*
   * .L50: strided path, taken when INCX or INCY differs from SIZE bytes.
   * %rax = N / 4 iterations of the four-way unrolled scalar loop .L53.
   */
.L50:
        movq    N, %rax
        sarq    $2, %rax
        /* OF is undefined after a multi-bit shift, so set flags explicitly
         * before the signed branch instead of relying on sarq's flags. */
        testq   %rax, %rax
        jle     .L55
        ALIGN_3
  /*
   * .L53: strided scalar loop, four elements per iteration.  Each step
   * rotates one (x, y) pair with scalar ops and then advances X by INCX and
   * Y by INCY (both already scaled to bytes by the entry code).
   */
.L53:
        /* Element 0. */
        movss   (Y), %xmm1
        movss   (X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0                /* C * x */
        mulss   S, %xmm1                /* S * y */
        mulss   C, %xmm2                /* C * y */
        mulss   S, %xmm3                /* S * x */
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, (X)
        movss   %xmm2, (Y)
        addq    INCX, X
        addq    INCY, Y

        /* Element 1. */
        movss   (Y), %xmm1
        movss   (X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, (X)
        movss   %xmm2, (Y)
        addq    INCX, X
        addq    INCY, Y

        /* Element 2. */
        movss   (Y), %xmm1
        movss   (X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, (X)
        movss   %xmm2, (Y)
        addq    INCX, X
        addq    INCY, Y

        /* Element 3. */
        movss   (Y), %xmm1
        movss   (X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, (X)
        movss   %xmm2, (Y)
        addq    INCX, X
        addq    INCY, Y

        decq    %rax
        jg      .L53
        ALIGN_3
  /*
   * .L55: strided tail.  %rax = N & 3 leftover elements for .L56; exit when
   * there are none.  andq with a positive mask leaves SF=OF=0, so the
   * branch is written as je.
   */
.L55:
        movq    N, %rax
        andq    $3, %rax
        je      .L999
        ALIGN_3
  /* .L56: strided tail loop, one element per iteration, %rax iterations. */
.L56:
        movss   (Y), %xmm1
        movss   (X), %xmm0
        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3
        mulss   C, %xmm0                /* C * x */
        mulss   S, %xmm1                /* S * y */
        mulss   C, %xmm2                /* C * y */
        mulss   S, %xmm3                /* S * x */
        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2
        movss   %xmm0, (X)
        movss   %xmm2, (Y)
        addq    INCX, X
        addq    INCY, Y
        decq    %rax
        jg      .L56
        ALIGN_3
  /* .L999: common exit for every path. */
.L999:
        RESTOREREGISTERS
        ret

        EPILOGUE