You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

symv_L_sse.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Per-CPU prefetch configuration. */
  /* PREFETCH / PREFETCHW select the prefetch mnemonics used in the main loop; */
  /* PREFETCHSIZE is the displacement (in bytes, written as 16 * k) that is */
  /* added to the address being prefetched. */
  /* NOTE(review): if none of the CPU macros below is defined, nothing in this */
  /* file defines PREFETCH -- presumably common.h guarantees one of them. */
  40. #ifdef ATOM
  41. #define PREFETCH prefetcht0
  42. #define PREFETCHW prefetcht0
  43. #define PREFETCHSIZE (16 * 12)
  44. #endif
  45. #ifdef CORE2
  46. #define PREFETCH prefetcht0
  47. #define PREFETCHW prefetcht0
  48. #define PREFETCHSIZE (16 * 12)
  49. #endif
  50. #if defined(PENRYN) || defined(DUNNINGTON)
  51. #define PREFETCH prefetcht0
  52. #define PREFETCHW prefetcht0
  53. #define PREFETCHSIZE (16 * 12)
  54. #endif
  55. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHW prefetcht0
  58. #define PREFETCHSIZE (16 * 12)
  59. #endif
  60. #ifdef PENTIUM4
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #define PREFETCHSIZE (16 * 20)
  64. #endif
  /* OPTERON uses the AMD prefetch / prefetchw mnemonics and additionally */
  /* remaps every later movsd in this file to movlps. */
  /* NOTE(review): movlps does not clear the upper 64 bits of the destination, */
  /* which is presumably why the 2-column path zeroes xsum1 under OPTERON. */
  65. #ifdef OPTERON
  66. #define PREFETCH prefetch
  67. #define PREFETCHW prefetchw
  68. #define PREFETCHSIZE (16 * 8)
  69. #define movsd movlps
  70. #endif
  71. #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
  72. #define PREFETCH prefetch
  73. #define PREFETCHW prefetchw
  74. #define PREFETCHSIZE (16 * 16)
  75. #endif
  76. #ifdef NANO
  77. #define PREFETCH prefetcht0
  78. #define PREFETCHW prefetcht0
  79. #define PREFETCHSIZE (16 * 24)
  80. #endif
  81. #ifdef GENERIC
  82. #define PREFETCH prefetcht0
  83. #define PREFETCHW prefetcht0
  84. #define PREFETCHSIZE (16 * 20)
  85. #endif
  /* Argument and register naming. */
  /* OLD_* are stack-passed arguments, addressed relative to %rsp after the */
  /* routine has subtracted STACKSIZE. ARG1..ARG6 come from common.h */
  /* (presumably the integer argument registers of the active ABI). */
  86. #ifndef WINDOWS_ABI
  /* Non-Windows: M, N, A, LDA, X, INCX arrive as ARG1..ARG6; */
  /* Y, INCY and BUFFER are read from the stack. */
  87. #define STACKSIZE 80
  88. #define OLD_Y 8 + STACKSIZE(%rsp)
  89. #define OLD_INCY 16 + STACKSIZE(%rsp)
  90. #define OLD_BUFFER 24 + STACKSIZE(%rsp)
  91. #define M ARG1
  92. #define N ARG2
  93. #define A ARG3
  94. #define LDA ARG4
  95. #define X ARG5
  96. #define INCX ARG6
  97. #else
  /* Windows: only M, N and A arrive in ARG registers (A is ARG4); LDA, X, */
  /* INCX, Y, INCY and BUFFER are read from the stack. LDA is kept in ARG3, */
  /* X and INCX in %rdi / %rsi (both saved and restored by the routine). */
  98. #define STACKSIZE 256
  99. #define OLD_LDA 40 + STACKSIZE(%rsp)
  100. #define OLD_X 48 + STACKSIZE(%rsp)
  101. #define OLD_INCX 56 + STACKSIZE(%rsp)
  102. #define OLD_Y 64 + STACKSIZE(%rsp)
  103. #define OLD_INCY 72 + STACKSIZE(%rsp)
  104. #define OLD_BUFFER 80 + STACKSIZE(%rsp)
  105. #define M ARG1
  106. #define N ARG2
  107. #define A ARG4
  108. #define LDA ARG3
  109. #define X %rdi
  110. #define INCX %rsi
  111. #endif
  /* Integer working registers shared by both ABIs. */
  /* TEMP and I are the same register (%rax). */
  112. #define Y %r10
  113. #define INCY %r11
  114. #define BUFFER %r12
  115. #define TEMP %rax
  116. #define I %rax
  117. #define A1 %rbx
  118. #define A2 %rbp
  119. #define XX %r13
  120. #define YY %r14
  121. #define IS %r15
  /* NEW_X aliases BUFFER (the contiguous alpha * x copy lives there). */
  /* NEW_Y aliases the X register, which is reused once x has been copied. */
  122. #define NEW_X BUFFER
  123. #define NEW_Y X
  /* SSE register roles. ALPHA and atemp1 share %xmm0: alpha is only needed */
  /* while x is being copied, before atemp1 is first written. */
  124. #define ALPHA %xmm0
  125. #define atemp1 %xmm0
  126. #define atemp2 %xmm1
  127. #define atemp3 %xmm2
  128. #define atemp4 %xmm3
  129. #define xsum1 %xmm4
  130. #define xsum2 %xmm5
  131. #define xsum3 %xmm6
  132. #define xsum4 %xmm7
  133. #define xtemp1 %xmm8
  134. #define xtemp2 %xmm9
  135. #define yy1 %xmm10
  136. #define xt1 %xmm11
  137. #define a1 %xmm12
  138. #define a2 %xmm13
  139. #define a3 %xmm14
  140. #define a4 %xmm15
  /* Entry: reserve STACKSIZE bytes of stack and save the callee-saved */
  /* integer registers this routine uses (rbx, rbp, r12-r15). */
  141. PROLOGUE
  142. PROFCODE
  143. subq $STACKSIZE, %rsp
  144. movq %rbx, 0(%rsp)
  145. movq %rbp, 8(%rsp)
  146. movq %r12, 16(%rsp)
  147. movq %r13, 24(%rsp)
  148. movq %r14, 32(%rsp)
  149. movq %r15, 40(%rsp)
  150. #ifdef WINDOWS_ABI
  /* Windows only: rdi, rsi and xmm6-xmm15 are also saved; movups is used, */
  /* so the save area does not need to be 16-byte aligned. */
  151. movq %rdi, 48(%rsp)
  152. movq %rsi, 56(%rsp)
  153. movups %xmm6, 64(%rsp)
  154. movups %xmm7, 80(%rsp)
  155. movups %xmm8, 96(%rsp)
  156. movups %xmm9, 112(%rsp)
  157. movups %xmm10, 128(%rsp)
  158. movups %xmm11, 144(%rsp)
  159. movups %xmm12, 160(%rsp)
  160. movups %xmm13, 176(%rsp)
  161. movups %xmm14, 192(%rsp)
  162. movups %xmm15, 208(%rsp)
  /* Fetch the stack-passed LDA, X, INCX and move the value in xmm2 into */
  /* xmm0 (ALPHA). NOTE(review): presumably alpha arrives in xmm2 as the */
  /* third argument under the Windows convention -- confirm against caller. */
  163. movq OLD_LDA, LDA
  164. movq OLD_X, X
  165. movq OLD_INCX, INCX
  166. movaps %xmm2, %xmm0
  167. #endif
  168. movq OLD_Y, Y
  169. movq OLD_INCY, INCY
  170. movq OLD_BUFFER, BUFFER
  /* Scale the two increments and LDA by SIZE (element size from common.h) */
  /* so they can be used directly as byte strides. */
  171. leaq (,INCX, SIZE), INCX
  172. leaq (,INCY, SIZE), INCY
  173. leaq (,LDA, SIZE), LDA
  /* Nothing to do when M <= 0: go straight to the register restore. */
  174. testq M, M
  175. jle .L999
  /* Broadcast lane 0 of ALPHA to all four lanes. */
  176. shufps $0, ALPHA, ALPHA
  /* XX walks the buffer; %rax = M / 8 is the trip count of the unrolled */
  /* x copy loop (skipped when it is zero). */
  177. movq BUFFER, XX
  178. movq M, %rax
  179. sarq $3, %rax
  180. jle .L02
  181. ALIGN_3
  /* .L01: copy eight elements of x per iteration, reading with stride INCX, */
  /* multiplying each by alpha and storing them contiguously at XX. */
  182. .L01:
  183. movss 0 * SIZE(X), %xmm1
  184. addq INCX, X
  185. movss 0 * SIZE(X), %xmm2
  186. addq INCX, X
  187. movss 0 * SIZE(X), %xmm3
  188. addq INCX, X
  189. movss 0 * SIZE(X), %xmm4
  190. addq INCX, X
  191. movss 0 * SIZE(X), %xmm5
  192. addq INCX, X
  193. movss 0 * SIZE(X), %xmm6
  194. addq INCX, X
  195. movss 0 * SIZE(X), %xmm7
  196. addq INCX, X
  197. movss 0 * SIZE(X), %xmm8
  198. addq INCX, X
  199. mulss ALPHA, %xmm1
  200. mulss ALPHA, %xmm2
  201. mulss ALPHA, %xmm3
  202. mulss ALPHA, %xmm4
  203. mulss ALPHA, %xmm5
  204. mulss ALPHA, %xmm6
  205. mulss ALPHA, %xmm7
  206. mulss ALPHA, %xmm8
  207. movss %xmm1, 0 * SIZE(XX)
  208. movss %xmm2, 1 * SIZE(XX)
  209. movss %xmm3, 2 * SIZE(XX)
  210. movss %xmm4, 3 * SIZE(XX)
  211. movss %xmm5, 4 * SIZE(XX)
  212. movss %xmm6, 5 * SIZE(XX)
  213. movss %xmm7, 6 * SIZE(XX)
  214. movss %xmm8, 7 * SIZE(XX)
  215. addq $8 * SIZE, XX
  216. decq %rax
  217. jg .L01
  218. ALIGN_3
  /* .L02: the remaining M mod 8 elements are copied one at a time in .L03. */
  219. .L02:
  220. movq M, %rax
  221. andq $7, %rax
  222. jle .L05
  223. ALIGN_3
  224. .L03:
  225. movss 0 * SIZE(X), %xmm1
  226. addq INCX, X
  227. mulss ALPHA, %xmm1
  228. movss %xmm1, 0 * SIZE(XX)
  229. addq $1 * SIZE, XX
  230. decq %rax
  231. jg .L03
  232. ALIGN_3
  /* .L05: choose where y is updated. The X register is free from here on */
  /* and is reused as NEW_Y. */
  233. .L05:
  234. /* now we don't need original X */
  235. movq Y, NEW_Y
  /* Move XX to the next 512-byte boundary strictly above its current value */
  /* (the area just past the alpha * x copy). */
  236. addq $512, XX
  237. andq $-512, XX
  /* If INCY equals SIZE (unit stride), y is updated in place via NEW_Y = Y. */
  238. cmpq $SIZE, INCY
  239. je .L10
  /* Otherwise y is gathered into the buffer at XX and NEW_Y points there; */
  /* the Y register keeps the caller's pointer for the copy-back at the end. */
  240. movq Y, YY
  241. movq XX, NEW_Y
  242. movq M, %rax
  243. sarq $3, %rax
  244. jle .L07
  245. ALIGN_3
  /* .L06: gather eight strided y elements per iteration into the buffer. */
  /* This overwrites %xmm0, so alpha is no longer available afterwards. */
  246. .L06:
  247. movss 0 * SIZE(YY), %xmm0
  248. addq INCY, YY
  249. movss 0 * SIZE(YY), %xmm1
  250. addq INCY, YY
  251. movss 0 * SIZE(YY), %xmm2
  252. addq INCY, YY
  253. movss 0 * SIZE(YY), %xmm3
  254. addq INCY, YY
  255. movss 0 * SIZE(YY), %xmm4
  256. addq INCY, YY
  257. movss 0 * SIZE(YY), %xmm5
  258. addq INCY, YY
  259. movss 0 * SIZE(YY), %xmm6
  260. addq INCY, YY
  261. movss 0 * SIZE(YY), %xmm7
  262. addq INCY, YY
  263. movss %xmm0, 0 * SIZE(XX)
  264. movss %xmm1, 1 * SIZE(XX)
  265. movss %xmm2, 2 * SIZE(XX)
  266. movss %xmm3, 3 * SIZE(XX)
  267. movss %xmm4, 4 * SIZE(XX)
  268. movss %xmm5, 5 * SIZE(XX)
  269. movss %xmm6, 6 * SIZE(XX)
  270. movss %xmm7, 7 * SIZE(XX)
  271. addq $8 * SIZE, XX
  272. decq %rax
  273. jg .L06
  274. ALIGN_3
  /* .L07 / .L08: gather the remaining M mod 8 elements of y one at a time. */
  275. .L07:
  276. movq M, %rax
  277. andq $7, %rax
  278. jle .L10
  279. ALIGN_3
  280. .L08:
  281. movss 0 * SIZE(YY), %xmm0
  282. addq INCY, YY
  283. movss %xmm0, 0 * SIZE(XX)
  284. addq $1 * SIZE, XX
  285. decq %rax
  286. jg .L08
  287. ALIGN_3
  /* .L10: IS is the index of the first column of the current block. */
  /* With fewer than four columns, skip directly to the remainder code. */
  288. .L10:
  289. xorq IS, IS # is = 0
  290. cmpq $4, N
  291. jl .L20
  292. ALIGN_3
  /* .L11: set up one block of four columns. */
  /* Below, a(r,c) means the element r * SIZE bytes and c * LDA bytes from */
  /* the block's starting A pointer. */
  293. .L11:
  /* A1 -> columns 0/1, A2 -> columns 2/3; A then advances by four columns */
  /* and four elements for the next block. */
  294. movq A, A1
  295. leaq (A, LDA, 2), A2
  296. leaq 4 * SIZE(A, LDA, 4), A
  /* XX -> NEW_X[IS], YY -> NEW_Y[IS + 4]. */
  297. leaq (NEW_X, IS, SIZE), XX
  298. leaq 4 * SIZE(NEW_Y, IS, SIZE), YY
  /* atemp4 = NEW_X[IS .. IS+3]. movaps is an aligned load. */
  /* NOTE(review): assumes NEW_X + IS * SIZE is 16-byte aligned -- confirm. */
  299. movaps 0 * SIZE(XX), atemp4
  /* 4x4 diagonal block: xsum1..xsum4 are the four vectors of the */
  /* symmetrized block, built only from entries with r >= c, each */
  /* multiplied elementwise by atemp4. */
  /* xsum1 = { a(0,0), a(1,0), a(2,0), a(3,0) } */
  300. movsd 0 * SIZE(A1), xsum1
  301. movhps 2 * SIZE(A1), xsum1
  302. mulps atemp4, xsum1
  /* xsum2 = { a(1,0), a(1,1), a(2,1), a(3,1) } */
  303. movss 1 * SIZE(A1), xsum2
  304. movss 1 * SIZE(A1, LDA, 1), a2
  305. movss 2 * SIZE(A1, LDA, 1), a3
  306. movss 3 * SIZE(A1, LDA, 1), a4
  307. unpcklps a3, xsum2
  308. unpcklps a4, a2
  309. unpcklps a2, xsum2
  310. mulps atemp4, xsum2
  /* xsum3 = { a(2,0), a(2,1), a(2,2), a(3,2) } */
  311. movss 2 * SIZE(A1), xsum3
  312. movss 2 * SIZE(A1, LDA, 1), a2
  313. movss 2 * SIZE(A2), a3
  314. movss 3 * SIZE(A2), a4
  315. unpcklps a3, xsum3
  316. unpcklps a4, a2
  317. unpcklps a2, xsum3
  318. mulps atemp4, xsum3
  /* xsum4 = { a(3,0), a(3,1), a(3,2), a(3,3) } */
  319. movss 3 * SIZE(A1), xsum4
  320. movss 3 * SIZE(A1, LDA, 1), a2
  321. movss 3 * SIZE(A2), a3
  322. movss 3 * SIZE(A2, LDA, 1), a4
  323. unpcklps a3, xsum4
  324. unpcklps a4, a2
  325. unpcklps a2, xsum4
  326. mulps atemp4, xsum4
  /* Broadcast each of the four x values to a full register: */
  /* atemp1..atemp4 = NEW_X[IS], NEW_X[IS+1], NEW_X[IS+2], NEW_X[IS+3]. */
  /* atemp4 is overwritten last because it is the source of all four. */
  327. pshufd $0x00, atemp4, atemp1
  328. pshufd $0x55, atemp4, atemp2
  329. pshufd $0xaa, atemp4, atemp3
  330. pshufd $0xff, atemp4, atemp4
  /* Preload operands for the rows below the diagonal block: two x vectors, */
  /* the first y vector, and rows 4-7 of each of the four columns. */
  331. movaps 4 * SIZE(XX), xtemp1
  332. movaps 8 * SIZE(XX), xtemp2
  333. movsd 0 * SIZE(YY), yy1
  334. movhps 2 * SIZE(YY), yy1
  335. movsd 4 * SIZE(A1), a1
  336. movhps 6 * SIZE(A1), a1
  337. movsd 4 * SIZE(A1, LDA, 1), a2
  338. movhps 6 * SIZE(A1, LDA, 1), a2
  339. movsd 4 * SIZE(A2), a3
  340. movhps 6 * SIZE(A2), a3
  341. movsd 4 * SIZE(A2, LDA, 1), a4
  342. movhps 6 * SIZE(A2, LDA, 1), a4
  343. addq $4 * SIZE, XX
  344. addq $4 * SIZE, A1
  345. addq $4 * SIZE, A2
  /* I = (M - IS - 4) / 16: number of full 16-row iterations below the */
  /* diagonal block; none -> go to the tail handling. */
  346. movq M, I
  347. subq IS, I
  348. subq $4, I
  349. sarq $4, I
  350. jle .L14
  351. ALIGN_3
  /* .L12: main loop, 16 rows x 4 columns per iteration, software pipelined */
  /* (operands for the next step are loaded while the current one is used). */
  /* For every group of four rows and every column j: */
  /*   xsum_j += x_rows * a_j     (reduced later and added to NEW_Y[IS+j]) */
  /*   yy1    += a_j * atemp_j    (update of y for these four rows) */
  /* xtemp1 and xtemp2 alternate as the x_rows vector. */
  352. .L12:
  /* rows 0-3 of this iteration (x in xtemp1) */
  353. movaps xtemp1, xt1
  354. mulps a1, xt1
  355. mulps atemp1, a1
  356. addps xt1, xsum1
  357. addps a1, yy1
  358. movsd 4 * SIZE(A1), a1
  359. movhps 6 * SIZE(A1), a1
  360. PREFETCH PREFETCHSIZE(A1)
  361. movaps xtemp1, xt1
  362. mulps a2, xt1
  363. mulps atemp2, a2
  364. addps xt1, xsum2
  365. addps a2, yy1
  366. movsd 4 * SIZE(A1, LDA, 1), a2
  367. movhps 6 * SIZE(A1, LDA, 1), a2
  368. movaps xtemp1, xt1
  369. mulps a3, xt1
  370. mulps atemp3, a3
  371. addps xt1, xsum3
  372. addps a3, yy1
  373. movsd 4 * SIZE(A2), a3
  374. movhps 6 * SIZE(A2), a3
  375. #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
  376. PREFETCH PREFETCHSIZE(XX)
  377. #endif
  378. movaps xtemp1, xt1
  379. movaps 8 * SIZE(XX), xtemp1
  380. mulps a4, xt1
  381. mulps atemp4, a4
  382. addps xt1, xsum4
  383. addps a4, yy1
  384. movsd 4 * SIZE(A2, LDA, 1), a4
  385. movhps 6 * SIZE(A2, LDA, 1), a4
  /* write back the four updated y values and fetch the next four */
  386. movlps yy1, 0 * SIZE(YY)
  387. movhps yy1, 2 * SIZE(YY)
  388. movsd 4 * SIZE(YY), yy1
  389. movhps 6 * SIZE(YY), yy1
  /* rows 4-7 (x in xtemp2) */
  390. movaps xtemp2, xt1
  391. mulps a1, xt1
  392. mulps atemp1, a1
  393. addps xt1, xsum1
  394. addps a1, yy1
  395. movsd 8 * SIZE(A1), a1
  396. movhps 10 * SIZE(A1), a1
  397. PREFETCH PREFETCHSIZE(A1, LDA, 1)
  398. movaps xtemp2, xt1
  399. mulps a2, xt1
  400. mulps atemp2, a2
  401. addps xt1, xsum2
  402. addps a2, yy1
  403. movsd 8 * SIZE(A1, LDA, 1), a2
  404. movhps 10 * SIZE(A1, LDA, 1), a2
  405. movaps xtemp2, xt1
  406. mulps a3, xt1
  407. mulps atemp3, a3
  408. addps xt1, xsum3
  409. addps a3, yy1
  410. movsd 8 * SIZE(A2), a3
  411. movhps 10 * SIZE(A2), a3
  412. movaps xtemp2, xt1
  413. movaps 12 * SIZE(XX), xtemp2
  414. mulps a4, xt1
  415. mulps atemp4, a4
  416. addps xt1, xsum4
  417. addps a4, yy1
  418. movsd 8 * SIZE(A2, LDA, 1), a4
  419. movhps 10 * SIZE(A2, LDA, 1), a4
  420. movlps yy1, 4 * SIZE(YY)
  421. movhps yy1, 6 * SIZE(YY)
  422. movsd 8 * SIZE(YY), yy1
  423. movhps 10 * SIZE(YY), yy1
  /* rows 8-11 (x in xtemp1) */
  424. movaps xtemp1, xt1
  425. mulps a1, xt1
  426. mulps atemp1, a1
  427. addps xt1, xsum1
  428. addps a1, yy1
  429. movsd 12 * SIZE(A1), a1
  430. movhps 14 * SIZE(A1), a1
  431. PREFETCH PREFETCHSIZE(A2)
  432. movaps xtemp1, xt1
  433. mulps a2, xt1
  434. mulps atemp2, a2
  435. addps xt1, xsum2
  436. addps a2, yy1
  437. movsd 12 * SIZE(A1, LDA, 1), a2
  438. movhps 14 * SIZE(A1, LDA, 1), a2
  439. movaps xtemp1, xt1
  440. mulps a3, xt1
  441. mulps atemp3, a3
  442. addps xt1, xsum3
  443. addps a3, yy1
  444. movsd 12 * SIZE(A2), a3
  445. movhps 14 * SIZE(A2), a3
  446. #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
  447. PREFETCHW PREFETCHSIZE(YY)
  448. #endif
  449. movaps xtemp1, xt1
  450. movaps 16 * SIZE(XX), xtemp1
  451. mulps a4, xt1
  452. mulps atemp4, a4
  453. addps xt1, xsum4
  454. addps a4, yy1
  455. movsd 12 * SIZE(A2, LDA, 1), a4
  456. movhps 14 * SIZE(A2, LDA, 1), a4
  457. movlps yy1, 8 * SIZE(YY)
  458. movhps yy1, 10 * SIZE(YY)
  459. movsd 12 * SIZE(YY), yy1
  460. movhps 14 * SIZE(YY), yy1
  /* rows 12-15 (x in xtemp2) */
  461. movaps xtemp2, xt1
  462. mulps a1, xt1
  463. mulps atemp1, a1
  464. addps xt1, xsum1
  465. addps a1, yy1
  466. movsd 16 * SIZE(A1), a1
  467. movhps 18 * SIZE(A1), a1
  468. PREFETCH PREFETCHSIZE(A2, LDA, 1)
  469. movaps xtemp2, xt1
  470. mulps a2, xt1
  471. mulps atemp2, a2
  472. addps xt1, xsum2
  473. addps a2, yy1
  474. movsd 16 * SIZE(A1, LDA, 1), a2
  475. movhps 18 * SIZE(A1, LDA, 1), a2
  476. movaps xtemp2, xt1
  477. mulps a3, xt1
  478. mulps atemp3, a3
  479. addps xt1, xsum3
  480. addps a3, yy1
  481. movsd 16 * SIZE(A2), a3
  482. movhps 18 * SIZE(A2), a3
  483. movaps xtemp2, xt1
  484. movaps 20 * SIZE(XX), xtemp2
  485. mulps a4, xt1
  486. mulps atemp4, a4
  487. addps xt1, xsum4
  488. addps a4, yy1
  489. movsd 16 * SIZE(A2, LDA, 1), a4
  490. movhps 18 * SIZE(A2, LDA, 1), a4
  491. movlps yy1, 12 * SIZE(YY)
  492. movhps yy1, 14 * SIZE(YY)
  493. movsd 16 * SIZE(YY), yy1
  494. movhps 18 * SIZE(YY), yy1
  /* advance all four pointers by 16 elements and loop while I > 0 */
  495. addq $16 * SIZE, XX
  496. addq $16 * SIZE, YY
  497. addq $16 * SIZE, A1
  498. addq $16 * SIZE, A2
  499. decq I
  500. jg .L12
  501. ALIGN_3
  /* .L14: tail handling. I is recomputed as M - IS - 4 (rows below the */
  /* diagonal block); if bit 3 of I is set, eight rows are processed here */
  /* with the same multiply/accumulate pattern as the main loop, without */
  /* prefetches. After test with a positive mask, jle is taken exactly */
  /* when the tested bit is clear. */
  502. .L14:
  503. movq M, I
  504. subq IS, I
  505. subq $4, I
  506. test $8, I
  507. jle .L15
  /* rows 0-3 (x in xtemp1) */
  508. movaps xtemp1, xt1
  509. mulps a1, xt1
  510. mulps atemp1, a1
  511. addps xt1, xsum1
  512. addps a1, yy1
  513. movsd 4 * SIZE(A1), a1
  514. movhps 6 * SIZE(A1), a1
  515. movaps xtemp1, xt1
  516. mulps a2, xt1
  517. mulps atemp2, a2
  518. addps xt1, xsum2
  519. addps a2, yy1
  520. movsd 4 * SIZE(A1, LDA, 1), a2
  521. movhps 6 * SIZE(A1, LDA, 1), a2
  522. movaps xtemp1, xt1
  523. mulps a3, xt1
  524. mulps atemp3, a3
  525. addps xt1, xsum3
  526. addps a3, yy1
  527. movsd 4 * SIZE(A2), a3
  528. movhps 6 * SIZE(A2), a3
  529. movaps xtemp1, xt1
  530. movaps 8 * SIZE(XX), xtemp1
  531. mulps a4, xt1
  532. mulps atemp4, a4
  533. addps xt1, xsum4
  534. addps a4, yy1
  535. movsd 4 * SIZE(A2, LDA, 1), a4
  536. movhps 6 * SIZE(A2, LDA, 1), a4
  537. movlps yy1, 0 * SIZE(YY)
  538. movhps yy1, 2 * SIZE(YY)
  539. movsd 4 * SIZE(YY), yy1
  540. movhps 6 * SIZE(YY), yy1
  /* rows 4-7 (x in xtemp2) */
  541. movaps xtemp2, xt1
  542. mulps a1, xt1
  543. mulps atemp1, a1
  544. addps xt1, xsum1
  545. addps a1, yy1
  546. movsd 8 * SIZE(A1), a1
  547. movhps 10 * SIZE(A1), a1
  548. movaps xtemp2, xt1
  549. mulps a2, xt1
  550. mulps atemp2, a2
  551. addps xt1, xsum2
  552. addps a2, yy1
  553. movsd 8 * SIZE(A1, LDA, 1), a2
  554. movhps 10 * SIZE(A1, LDA, 1), a2
  555. movaps xtemp2, xt1
  556. mulps a3, xt1
  557. mulps atemp3, a3
  558. addps xt1, xsum3
  559. addps a3, yy1
  560. movsd 8 * SIZE(A2), a3
  561. movhps 10 * SIZE(A2), a3
  562. movaps xtemp2, xt1
  563. movaps 12 * SIZE(XX), xtemp2
  564. mulps a4, xt1
  565. mulps atemp4, a4
  566. addps xt1, xsum4
  567. addps a4, yy1
  568. movsd 8 * SIZE(A2, LDA, 1), a4
  569. movhps 10 * SIZE(A2, LDA, 1), a4
  570. movlps yy1, 4 * SIZE(YY)
  571. movhps yy1, 6 * SIZE(YY)
  572. movsd 8 * SIZE(YY), yy1
  573. movhps 10 * SIZE(YY), yy1
  /* advance all four pointers by eight elements */
  574. addq $8 * SIZE, XX
  575. addq $8 * SIZE, YY
  576. addq $8 * SIZE, A1
  577. addq $8 * SIZE, A2
  578. ALIGN_3
  /* .L15: if bit 2 of I (= M - IS - 4) is set, process four more rows using */
  /* the operands already held in xtemp1, a1..a4 and yy1. The reloads in */
  /* this block fetch only two elements each (movsd, no movhps), i.e. just */
  /* enough for the following two-row step. */
  579. .L15:
  580. test $4, I
  581. jle .L17
  582. movaps xtemp1, xt1
  583. mulps a1, xt1
  584. mulps atemp1, a1
  585. addps xt1, xsum1
  586. addps a1, yy1
  587. movsd 4 * SIZE(A1), a1
  588. movaps xtemp1, xt1
  589. mulps a2, xt1
  590. mulps atemp2, a2
  591. addps xt1, xsum2
  592. addps a2, yy1
  593. movsd 4 * SIZE(A1, LDA, 1), a2
  594. movaps xtemp1, xt1
  595. mulps a3, xt1
  596. mulps atemp3, a3
  597. addps xt1, xsum3
  598. addps a3, yy1
  599. movsd 4 * SIZE(A2), a3
  600. movaps xtemp1, xt1
  601. movsd 4 * SIZE(XX), xtemp1
  602. mulps a4, xt1
  603. mulps atemp4, a4
  604. addps xt1, xsum4
  605. addps a4, yy1
  606. movsd 4 * SIZE(A2, LDA, 1), a4
  /* store the four updated y values, then fetch the next two */
  607. movlps yy1, 0 * SIZE(YY)
  608. movhps yy1, 2 * SIZE(YY)
  609. movsd 4 * SIZE(YY), yy1
  610. addq $4 * SIZE, XX
  611. addq $4 * SIZE, YY
  612. addq $4 * SIZE, A1
  613. addq $4 * SIZE, A2
  614. ALIGN_3
  /* .L17: if bit 1 of M is set, process two more rows. The test is on M */
  /* rather than I; IS only ever grows in steps of 4 in the loop that reaches */
  /* this point, so bit 1 of M equals bit 1 of M - IS - 4. */
  615. .L17:
  616. testq $2, M
  617. jle .L18
  /* xtemp2 is zeroed and movlhps copies its low half into the upper half of */
  /* each a_j, so only the two valid rows contribute to xsum_j. */
  618. pxor xtemp2, xtemp2
  619. movlhps xtemp2, a1
  620. movaps xtemp1, xt1
  621. mulps a1, xt1
  622. mulps atemp1, a1
  623. addps xt1, xsum1
  624. addps a1, yy1
  625. movss 2 * SIZE(A1), a1
  626. movlhps xtemp2, a2
  627. movaps xtemp1, xt1
  628. mulps a2, xt1
  629. mulps atemp2, a2
  630. addps xt1, xsum2
  631. addps a2, yy1
  632. movss 2 * SIZE(A1, LDA, 1), a2
  633. movlhps xtemp2, a3
  634. movaps xtemp1, xt1
  635. mulps a3, xt1
  636. mulps atemp3, a3
  637. addps xt1, xsum3
  638. addps a3, yy1
  639. movss 2 * SIZE(A2), a3
  640. movlhps xtemp2, a4
  641. movaps xtemp1, xt1
  642. movss 2 * SIZE(XX), xtemp1
  643. mulps a4, xt1
  644. mulps atemp4, a4
  645. addps xt1, xsum4
  646. addps a4, yy1
  647. movss 2 * SIZE(A2, LDA, 1), a4
  /* only the low two lanes of yy1 are stored back */
  648. movlps yy1, 0 * SIZE(YY)
  649. movss 2 * SIZE(YY), yy1
  650. addq $2 * SIZE, XX
  651. addq $2 * SIZE, YY
  652. addq $2 * SIZE, A1
  653. addq $2 * SIZE, A2
  654. ALIGN_3
  /* .L18: if M is odd, process the final single row with scalar */
  /* instructions. All operands are reloaded from the current pointers, so */
  /* this step does not depend on values preloaded by earlier steps. */
  655. .L18:
  656. testq $1, M
  657. jle .L19
  658. movss 0 * SIZE(XX), xtemp1
  659. movss 0 * SIZE(YY), yy1
  660. movss 0 * SIZE(A1), a1
  661. movss 0 * SIZE(A1, LDA, 1), a2
  662. movss 0 * SIZE(A2), a3
  663. movss 0 * SIZE(A2, LDA, 1), a4
  /* per column j: xsum_j[0] += x * a_j and yy1[0] += a_j * atemp_j */
  664. movaps xtemp1, xt1
  665. mulss a1, xt1
  666. mulss atemp1, a1
  667. addss xt1, xsum1
  668. addss a1, yy1
  669. movaps xtemp1, xt1
  670. mulss a2, xt1
  671. mulss atemp2, a2
  672. addss xt1, xsum2
  673. addss a2, yy1
  674. movaps xtemp1, xt1
  675. mulss a3, xt1
  676. mulss atemp3, a3
  677. addss xt1, xsum3
  678. addss a3, yy1
  679. movaps xtemp1, xt1
  680. mulss a4, xt1
  681. mulss atemp4, a4
  682. addss xt1, xsum4
  683. addss a4, yy1
  684. movss yy1, 0 * SIZE(YY)
  685. ALIGN_3
  /* .L19: horizontally reduce the four accumulators so that */
  /* xsum1 = { sum(xsum1), sum(xsum2), sum(xsum3), sum(xsum4) }. */
  686. .L19:
  687. #ifndef HAVE_SSE3
  /* Without SSE3: transpose the 4x4 arrangement with unpcklps / unpckhps, */
  /* then add the four transposed vectors. */
  688. movaps xsum1, xtemp1
  689. unpcklps xsum3, xsum1
  690. unpckhps xsum3, xtemp1
  691. movaps xsum2, xtemp2
  692. unpcklps xsum4, xsum2
  693. unpckhps xsum4, xtemp2
  694. movaps xsum1, xsum3
  695. unpcklps xsum2, xsum1
  696. unpckhps xsum2, xsum3
  697. movaps xtemp1, xsum4
  698. unpcklps xtemp2, xtemp1
  699. unpckhps xtemp2, xsum4
  700. addps xsum3, xsum1
  701. addps xtemp1, xsum4
  702. addps xsum4, xsum1
  703. #else
  /* With SSE3: three horizontal adds produce the same four sums. */
  704. haddps xsum2, xsum1
  705. haddps xsum4, xsum3
  706. haddps xsum3, xsum1
  707. #endif
  /* NEW_Y[IS .. IS+3] += the four reduced sums. */
  708. movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
  709. movhps 2 * SIZE(NEW_Y, IS, SIZE), yy1
  710. addps xsum1, yy1
  711. movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
  712. movhps yy1, 2 * SIZE(NEW_Y, IS, SIZE)
  /* Advance to the next block; repeat while IS + 4 <= N, i.e. while another */
  /* full group of four columns remains. */
  713. addq $4, IS
  714. movq IS, I
  715. addq $4, I
  716. cmpq N, I
  717. jle .L11
  718. ALIGN_3
  /* .L20: remainder of two columns, taken when bit 1 of N is set. */
  /* Below, a(r,c) means the element r * SIZE bytes and c * LDA bytes from A1. */
  719. .L20:
  720. testq $2, N
  721. jle .L30
  /* A1 -> current diagonal element; A advances by two columns and two rows. */
  722. movq A, A1
  723. leaq 2 * SIZE(A, LDA, 2), A
  /* atemp4 = NEW_X[IS .. IS+3] (aligned load); only lanes 0 and 1 meet */
  /* non-zero matrix data below. */
  724. movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4
  725. #if defined(OPTERON)
  /* OPTERON remaps movsd to movlps, which leaves the upper half of the */
  /* destination unchanged, so xsum1 is cleared explicitly first. */
  726. pxor xsum1, xsum1
  727. #endif
  /* xsum1 = { a(0,0), a(1,0), 0, 0 } * x */
  728. movsd 0 * SIZE(A1), xsum1
  729. mulps atemp4, xsum1
  /* xsum2 = { a(1,0), a(1,1), 0, 0 } * x */
  730. movss 1 * SIZE(A1), xsum2
  731. movss 1 * SIZE(A1, LDA, 1), a2
  732. unpcklps a2, xsum2
  733. mulps atemp4, xsum2
  /* atemp1 / atemp2 = NEW_X[IS] / NEW_X[IS+1] broadcast to all lanes */
  734. pshufd $0x00, atemp4, atemp1
  735. pshufd $0x55, atemp4, atemp2
  /* If M is odd, one extra row (offset 2) is handled with scalar code. */
  736. testq $1, M
  737. jle .L29
  738. movss 2 * SIZE(A1), a1
  739. movss 2 * SIZE(A1, LDA, 1), a2
  740. movss 2 * SIZE(NEW_X, IS, SIZE), xtemp1
  741. movss 2 * SIZE(NEW_Y, IS, SIZE), yy1
  742. movaps xtemp1, xt1
  743. mulss a1, xt1
  744. mulss atemp1, a1
  745. addss xt1, xsum1
  /* Scalar add, matching the rest of this sequence. Only lane 0 of yy1 is */
  /* stored below, and the upper lanes of a1 and yy1 are zero after their */
  /* movss loads, so the stored value is the same as with a packed add. */
  746. addss a1, yy1
  747. movaps xtemp1, xt1
  748. mulss a2, xt1
  749. mulss atemp2, a2
  750. addss xt1, xsum2
  751. addss a2, yy1
  752. movss yy1, 2 * SIZE(NEW_Y, IS, SIZE)
  753. ALIGN_3
  /* .L29: reduce so that the low two lanes of xsum1 hold */
  /* { sum(xsum1), sum(xsum2) }, then add them to NEW_Y[IS], NEW_Y[IS+1]. */
  754. .L29:
  755. #ifndef HAVE_SSE3
  756. unpcklps xsum2, xsum1
  757. movhlps xsum1, xsum2
  758. addps xsum2, xsum1
  759. #else
  760. haddps xsum2, xsum1
  761. haddps xsum1, xsum1
  762. #endif
  763. movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
  764. addps xsum1, yy1
  765. movlps yy1, 0 * SIZE(NEW_Y, IS, SIZE)
  766. addq $2, IS
  767. ALIGN_3
  /* .L30: remainder of one column, taken when N is odd: */
  /* NEW_Y[IS] += NEW_X[IS] * (element at A). */
  768. .L30:
  769. testq $1, N
  770. jle .L990
  771. movss 0 * SIZE(NEW_X, IS, SIZE), xsum1
  772. mulss 0 * SIZE(A), xsum1
  773. addss 0 * SIZE(NEW_Y, IS, SIZE), xsum1
  774. movss xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
  775. ALIGN_3
  /* .L990: if INCY equals SIZE, y was updated in place and nothing remains. */
  /* Otherwise scatter the M results from the contiguous staging area */
  /* (NEW_Y) back to the caller's y with stride INCY. */
  776. .L990:
  777. cmpq $SIZE, INCY
  778. je .L999
  779. movq M, %rax
  780. sarq $3, %rax
  781. jle .L997
  782. ALIGN_3
  /* .L996: eight elements per iteration. */
  783. .L996:
  784. movss 0 * SIZE(NEW_Y), %xmm0
  785. movss 1 * SIZE(NEW_Y), %xmm1
  786. movss 2 * SIZE(NEW_Y), %xmm2
  787. movss 3 * SIZE(NEW_Y), %xmm3
  788. movss 4 * SIZE(NEW_Y), %xmm4
  789. movss 5 * SIZE(NEW_Y), %xmm5
  790. movss 6 * SIZE(NEW_Y), %xmm6
  791. movss 7 * SIZE(NEW_Y), %xmm7
  792. movss %xmm0, 0 * SIZE(Y)
  793. addq INCY, Y
  794. movss %xmm1, 0 * SIZE(Y)
  795. addq INCY, Y
  796. movss %xmm2, 0 * SIZE(Y)
  797. addq INCY, Y
  798. movss %xmm3, 0 * SIZE(Y)
  799. addq INCY, Y
  800. movss %xmm4, 0 * SIZE(Y)
  801. addq INCY, Y
  802. movss %xmm5, 0 * SIZE(Y)
  803. addq INCY, Y
  804. movss %xmm6, 0 * SIZE(Y)
  805. addq INCY, Y
  806. movss %xmm7, 0 * SIZE(Y)
  807. addq INCY, Y
  808. addq $8 * SIZE, NEW_Y
  809. decq %rax
  810. jg .L996
  811. ALIGN_3
  /* .L997 / .L998: the remaining M mod 8 elements, one at a time. */
  812. .L997:
  813. movq M, %rax
  814. andq $7, %rax
  815. jle .L999
  816. ALIGN_3
  817. .L998:
  818. movss 0 * SIZE(NEW_Y), %xmm0
  819. movss %xmm0, 0 * SIZE(Y)
  820. addq INCY, Y
  821. addq $1 * SIZE, NEW_Y
  822. decq %rax
  823. jg .L998
  824. ALIGN_3
  /* .L999: common exit. Restore every register saved on entry (same stack */
  /* slots as in the prologue), release the stack frame and return. */
  825. .L999:
  826. movq 0(%rsp), %rbx
  827. movq 8(%rsp), %rbp
  828. movq 16(%rsp), %r12
  829. movq 24(%rsp), %r13
  830. movq 32(%rsp), %r14
  831. movq 40(%rsp), %r15
  832. #ifdef WINDOWS_ABI
  /* Windows only: also restore rdi, rsi and xmm6-xmm15. */
  833. movq 48(%rsp), %rdi
  834. movq 56(%rsp), %rsi
  835. movups 64(%rsp), %xmm6
  836. movups 80(%rsp), %xmm7
  837. movups 96(%rsp), %xmm8
  838. movups 112(%rsp), %xmm9
  839. movups 128(%rsp), %xmm10
  840. movups 144(%rsp), %xmm11
  841. movups 160(%rsp), %xmm12
  842. movups 176(%rsp), %xmm13
  843. movups 192(%rsp), %xmm14
  844. movups 208(%rsp), %xmm15
  845. #endif
  846. addq $STACKSIZE, %rsp
  847. ret
  848. EPILOGUE