
copy_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
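
/*
 * Single-precision vector copy (SCOPY-style kernel, SSE): copies M floats
 * from X (stride INCX) to Y (stride INCY). In C terms this is roughly
 * (a sketch; the exported name and argument types come from common.h):
 *
 *     for (i = 0; i < m; i++) y[i * incy] = x[i * incx];
 *
 * When both strides are 1, the code aligns Y to 16 bytes and then picks
 * one of four vector loops based on X's offset within its 16-byte block;
 * otherwise it falls back to a scalar strided loop (.L50).
 */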

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */

#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10
#endif
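
/*
 * The Windows x64 ABI passes only four integer arguments in registers
 * (rcx, rdx, r8, r9), so INCY arrives on the stack above the 32-byte
 * shadow space; the prologue below reloads it into %r10 from 40(%rsp).
 */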

#include "l1param.h"

#ifdef OPTERON
#define LOAD(OFFSET, ADDR, REG)	xorps	REG, REG; addps	OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)	movaps	OFFSET(ADDR), REG
#endif
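
/*
 * On Opteron, a vector load is done as xorps (a dependency-breaking zero
 * idiom) followed by addps from memory, which behaves like a plain load;
 * presumably this measured faster on that core. All other targets use a
 * straight 16-byte aligned movaps load.
 */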

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	/* Scale both strides from elements to bytes. */
	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	/* Anything other than unit stride goes to the generic loop. */
	cmpq	$SIZE, INCX
	jne	.L50
	cmpq	$SIZE, INCY
	jne	.L50

	/* Very short vectors go straight to the scalar remainder loop. */
	cmpq	$3, M
	jle	.L55

	/* Bias both pointers by +32 * SIZE (subtracting a negative) so the
	   unrolled loop addresses a full block with displacements in
	   [-32 * SIZE, -SIZE]. */
	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	/* Copy up to three leading elements until Y is 16-byte aligned. */
	testq	$SIZE, Y
	je	.L05

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	ALIGN_4

.L05:
	testq	$2 * SIZE, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)
	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_4

.L10:
	/* Dispatch on X's offset within its 16-byte block: offset 0 falls
	   through; offsets 8, 4 and 12 go to .L20, .L30 and .L40. */
	testq	$3 * SIZE, X
	jne	.L20

	movq	M, %rax
	sarq	$5, %rax
	jle	.L13

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5
	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
	ALIGN_3
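
/*
 * Main aligned loop: 32 floats (eight XMM vectors) per iteration,
 * software-pipelined so each store of the current block is interleaved
 * with loads of the next block. PREFETCH, PREFETCHW, PREFETCHSIZE and
 * PREOFFSET are tuned per core in l1param.h.
 */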
.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -32 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -28 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -24 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm2)
	movaps	%xmm3, -20 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	%xmm4, -16 * SIZE(Y)
	LOAD(16 * SIZE, X, %xmm4)
	movaps	%xmm5, -12 * SIZE(Y)
	LOAD(20 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	%xmm6,  -8 * SIZE(Y)
	LOAD(24 * SIZE, X, %xmm6)
	movaps	%xmm7,  -4 * SIZE(Y)
	LOAD(28 * SIZE, X, %xmm7)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	/* Drain the pipeline: store the eight vectors already loaded. */
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	ALIGN_3

.L13:
	/* Tail: copy remaining 16/8/4/2/1 elements. */
	testq	$16, M
	jle	.L14

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	testq	$8, M
	jle	.L15

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, M
	jle	.L16

	movaps	-32 * SIZE(X), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, M
	jle	.L17

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, M
	jle	.L19

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_3
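
/*
 * X is 8 bytes past a 16-byte boundary. Each output vector is stitched
 * from two aligned loads: shufps $0x4e concatenates the upper half of
 * the destination register with the lower half of the source register.
 */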
.L20:
	testq	$SIZE, X
	jne	.L30

	/* Preload: movhps puts the first two floats in the upper half of
	   %xmm0 so the first shufps produces a complete output vector. */
	movhps	-32 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6
	movaps	 -6 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 2 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 6 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	10 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	14 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	18 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	22 * SIZE(X), %xmm6

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	26 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-2 * SIZE(X), %xmm0
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	testq	$16, M
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	testq	$8, M
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, M
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	testq	$2, M
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, M
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_3
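
/*
 * X is 4 bytes past a 16-byte boundary. movss injects lane 0 of the next
 * aligned vector into the current one, and shufps $0x39 rotates the lanes
 * (lane i takes lane i+1 mod 4), yielding four consecutive elements.
 */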
.L30:
	testq	$2 * SIZE, X
	jne	.L40

	/* Preload the aligned vector containing the first element; lane 0
	   holds the float just before X and is shuffled away below. */
	movaps	-33 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6
	movaps	 -5 * SIZE(X), %xmm7

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 3 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 7 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	11 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	15 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	19 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	23 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	27 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	testq	$16, M
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	testq	$8, M
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	testq	$4, M
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, M
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	testq	$1, M
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_3
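
/*
 * X is 12 bytes past a 16-byte boundary. movss injects lane 0 of the next
 * aligned vector, and shufps $0x93 forms { cur[3], next[0], next[1],
 * next[2] }: one float from the current aligned block, three from the next.
 */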
.L40:
	/* Preload the aligned vector whose last lane is the first element. */
	movaps	-35 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6
	movaps	 -7 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 1 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 5 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 9 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	13 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	17 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	21 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	25 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	testq	$16, M
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	testq	$8, M
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	testq	$4, M
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	testq	$2, M
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	testq	$1, M
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_4
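
/*
 * Generic strided copy: eight scalars per unrolled iteration, then a
 * remainder loop for M mod 8. Vectors with M <= 3 on the unit-stride
 * path also land in the remainder loop (.L55).
 */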
.L50:
	movq	M, %rax
	sarq	$3, %rax
	jle	.L55
	ALIGN_3

.L51:
	movss	(X), %xmm0
	addq	INCX, X
	movss	(X), %xmm1
	addq	INCX, X
	movss	(X), %xmm2
	addq	INCX, X
	movss	(X), %xmm3
	addq	INCX, X
	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X
	movss	(X), %xmm6
	addq	INCX, X
	movss	(X), %xmm7
	addq	INCX, X

	movss	%xmm0, (Y)
	addq	INCY, Y
	movss	%xmm1, (Y)
	addq	INCY, Y
	movss	%xmm2, (Y)
	addq	INCY, Y
	movss	%xmm3, (Y)
	addq	INCY, Y
	movss	%xmm4, (Y)
	addq	INCY, Y
	movss	%xmm5, (Y)
	addq	INCY, Y
	movss	%xmm6, (Y)
	addq	INCY, Y
	movss	%xmm7, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L51
	ALIGN_3

.L55:
	movq	M, %rax
	andq	$7, %rax
	jle	.L57
	ALIGN_3

.L56:
	movss	(X), %xmm0
	addq	INCX, X
	movss	%xmm0, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L56
	ALIGN_3

.L57:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret

	EPILOGUE