
dot_sse.S 24 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
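
/* Single-precision dot product kernel (SSE): accumulates the sum of x[i] * y[i]. */
/* When DSDOT is defined, the accumulated result is widened to double before returning. */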
#define ASSEMBLER
#include "common.h"

#define N ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */

#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
#define INCY %r10
#endif

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq 40(%rsp), INCY
#endif

	SAVEREGISTERS

	leaq (, INCX, SIZE), INCX
	leaq (, INCY, SIZE), INCY

	xorps %xmm0, %xmm0
	xorps %xmm1, %xmm1
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3

	cmpq $0, N
	jle .L999
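
	/* Take the vectorized path only when both strides are one element
	   (INCX == INCY == SIZE after scaling); otherwise use the scalar
	   loop at .L50. */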
	cmpq $SIZE, INCX
	jne .L50
	cmpq $SIZE, INCY
	jne .L50
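
	/* Bias both pointers forward by 32 elements so the unrolled loops can
	   address everything with negative displacements. */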
	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y

	cmpq $3, N
	jle .L17
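
	/* Peel up to three leading elements so that Y becomes 16-byte aligned. */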
	testq $SIZE, Y
	je .L05

	movss -32 * SIZE(X), %xmm0
	mulss -32 * SIZE(Y), %xmm0
	addq $1 * SIZE, X
	addq $1 * SIZE, Y
	decq N
	ALIGN_2

.L05:
	testq $2 * SIZE, Y
	je .L10

#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(X), %xmm4
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd -32 * SIZE(Y), %xmm1
	mulps %xmm4, %xmm1
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	subq $2, N
	jle .L999
	ALIGN_2

.L10:
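	/* Y is now 16-byte aligned.  Dispatch on the alignment of X: when X is
	   also aligned the code falls through to the fast loop; the misaligned
	   cases are handled below (with ALIGNED_ACCESS, .L20/.L30/.L40 realign
	   X in registers; otherwise unaligned loads are used). */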
#ifdef ALIGNED_ACCESS
	testq $2 * SIZE, X
	jne .L30
	testq $SIZE, X
	jne .L20
#else
	testq $3 * SIZE, X
	jne .L20
#endif
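
	/* X and Y both 16-byte aligned: main loop, 32 elements per iteration. */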
	movq N, %rax
	sarq $5, %rax
	jle .L14

	movaps -32 * SIZE(X), %xmm4
	movaps -28 * SIZE(X), %xmm5
	movaps -24 * SIZE(X), %xmm6
	movaps -20 * SIZE(X), %xmm7
	movaps -16 * SIZE(X), %xmm8
	movaps -12 * SIZE(X), %xmm9
	movaps -8 * SIZE(X), %xmm10
	movaps -4 * SIZE(X), %xmm11

	decq %rax
	jle .L12
	ALIGN_3

.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps 0 * SIZE(X), %xmm4
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	movaps 4 * SIZE(X), %xmm5
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	movaps 8 * SIZE(X), %xmm6
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	movaps 12 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	movaps 16 * SIZE(X), %xmm8
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	movaps 20 * SIZE(X), %xmm9
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	movaps 24 * SIZE(X), %xmm10
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3
	movaps 28 * SIZE(X), %xmm11

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L11
	ALIGN_3

.L12:
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L14:
	testq $31, N
	jle .L999

	testq $16, N
	jle .L15

	movaps -32 * SIZE(X), %xmm4
	movaps -28 * SIZE(X), %xmm5
	movaps -24 * SIZE(X), %xmm6
	movaps -20 * SIZE(X), %xmm7
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L15:
	testq $8, N
	jle .L16

	movaps -32 * SIZE(X), %xmm4
	movaps -28 * SIZE(X), %xmm5
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L16:
	testq $4, N
	jle .L17

	movaps -32 * SIZE(X), %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm2
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L17:
	testq $2, N
	jle .L18

#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(X), %xmm4
#ifdef movsd
	xorps %xmm8, %xmm8
#endif
	movsd -32 * SIZE(Y), %xmm8
	mulps %xmm8, %xmm4
	addps %xmm4, %xmm3
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L18:
	testq $1, N
	jle .L999

	movss -32 * SIZE(X), %xmm4
	mulss -32 * SIZE(Y), %xmm4
	addss %xmm4, %xmm0
	jmp .L999
	ALIGN_3
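
	/* X is one float past a 16-byte boundary (Y is aligned): load X from
	   the preceding aligned address and realign in registers with
	   movss/pshufd. */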
.L20:
#ifdef ALIGNED_ACCESS
	movaps -33 * SIZE(X), %xmm4
	addq $3 * SIZE, X

	movq N, %rax
	sarq $5, %rax
	jle .L24

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6
	movaps -24 * SIZE(X), %xmm7
	movaps -20 * SIZE(X), %xmm8
	movaps -16 * SIZE(X), %xmm9
	movaps -12 * SIZE(X), %xmm10
	movaps -8 * SIZE(X), %xmm11

	decq %rax
	jle .L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm5, %xmm4
	pshufd $0x39, %xmm4, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -4 * SIZE(X), %xmm4

	movss %xmm6, %xmm5
	pshufd $0x39, %xmm5, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	movaps 0 * SIZE(X), %xmm5
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	pshufd $0x39, %xmm6, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	movaps 4 * SIZE(X), %xmm6

	movss %xmm8, %xmm7
	pshufd $0x39, %xmm7, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	movaps 8 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm9, %xmm8
	pshufd $0x39, %xmm8, %xmm8
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	movaps 12 * SIZE(X), %xmm8

	movss %xmm10, %xmm9
	pshufd $0x39, %xmm9, %xmm9
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	movaps 16 * SIZE(X), %xmm9
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm11, %xmm10
	pshufd $0x39, %xmm10, %xmm10
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	movaps 20 * SIZE(X), %xmm10

	movss %xmm4, %xmm11
	pshufd $0x39, %xmm11, %xmm11
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3
	movaps 24 * SIZE(X), %xmm11

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L21
	ALIGN_3

.L22:
	movss %xmm5, %xmm4
	pshufd $0x39, %xmm4, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -4 * SIZE(X), %xmm4

	movss %xmm6, %xmm5
	pshufd $0x39, %xmm5, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movss %xmm7, %xmm6
	pshufd $0x39, %xmm6, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2

	movss %xmm8, %xmm7
	pshufd $0x39, %xmm7, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	movss %xmm9, %xmm8
	pshufd $0x39, %xmm8, %xmm8
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0

	movss %xmm10, %xmm9
	pshufd $0x39, %xmm9, %xmm9
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1

	movss %xmm11, %xmm10
	pshufd $0x39, %xmm10, %xmm10
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2

	movss %xmm4, %xmm11
	pshufd $0x39, %xmm11, %xmm11
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L24:
	testq $31, N
	jle .L999

	testq $16, N
	jle .L25

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6
	movaps -24 * SIZE(X), %xmm7

	movss %xmm5, %xmm4
	pshufd $0x39, %xmm4, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -20 * SIZE(X), %xmm4

	movss %xmm6, %xmm5
	pshufd $0x39, %xmm5, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movss %xmm7, %xmm6
	pshufd $0x39, %xmm6, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2

	movss %xmm4, %xmm7
	pshufd $0x39, %xmm7, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L25:
	testq $8, N
	jle .L26

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6

	movss %xmm5, %xmm4
	pshufd $0x39, %xmm4, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0

	movss %xmm6, %xmm5
	pshufd $0x39, %xmm5, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movaps %xmm6, %xmm4
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L26:
	testq $4, N
	jle .L27

	movaps -32 * SIZE(X), %xmm5
	movss %xmm5, %xmm4
	pshufd $0x39, %xmm4, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm2

	movaps %xmm5, %xmm4
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L27:
	testq $2, N
	jle .L28

#ifdef movsd
	xorps %xmm8, %xmm8
#endif
	movsd -32 * SIZE(Y), %xmm8
	pshufd $0x29, %xmm4, %xmm5
	mulps %xmm8, %xmm5
	addps %xmm5, %xmm3

	movhlps %xmm4, %xmm4
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L28:
	testq $1, N
	jle .L999

	pshufd $0x39, %xmm4, %xmm4
	mulss -32 * SIZE(Y), %xmm4
	addss %xmm4, %xmm0
	jmp .L999
	ALIGN_3
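
	/* X is two floats past a 16-byte boundary (the three-float case
	   branches to .L40): realign the aligned X loads with SHUFPD_1. */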
.L30:
	testq $SIZE, X
	jne .L40

	movhps -32 * SIZE(X), %xmm4
	addq $2 * SIZE, X

	movq N, %rax
	sarq $5, %rax
	jle .L34

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6
	movaps -24 * SIZE(X), %xmm7
	movaps -20 * SIZE(X), %xmm8
	movaps -16 * SIZE(X), %xmm9
	movaps -12 * SIZE(X), %xmm10
	movaps -8 * SIZE(X), %xmm11

	decq %rax
	jle .L32
	ALIGN_3

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	SHUFPD_1 %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -4 * SIZE(X), %xmm4

	SHUFPD_1 %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	movaps 0 * SIZE(X), %xmm5
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	SHUFPD_1 %xmm7, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	movaps 4 * SIZE(X), %xmm6

	SHUFPD_1 %xmm8, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	movaps 8 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	SHUFPD_1 %xmm9, %xmm8
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	movaps 12 * SIZE(X), %xmm8

	SHUFPD_1 %xmm10, %xmm9
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	movaps 16 * SIZE(X), %xmm9
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	SHUFPD_1 %xmm11, %xmm10
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	movaps 20 * SIZE(X), %xmm10

	SHUFPD_1 %xmm4, %xmm11
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3
	movaps 24 * SIZE(X), %xmm11

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L31
	ALIGN_3

.L32:
	SHUFPD_1 %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -4 * SIZE(X), %xmm4

	SHUFPD_1 %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	SHUFPD_1 %xmm7, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2

	SHUFPD_1 %xmm8, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	SHUFPD_1 %xmm9, %xmm8
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0

	SHUFPD_1 %xmm10, %xmm9
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1

	SHUFPD_1 %xmm11, %xmm10
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2

	SHUFPD_1 %xmm4, %xmm11
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L34:
	testq $31, N
	jle .L999

	testq $16, N
	jle .L35

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6
	movaps -24 * SIZE(X), %xmm7

	SHUFPD_1 %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -20 * SIZE(X), %xmm4

	SHUFPD_1 %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	SHUFPD_1 %xmm7, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2

	SHUFPD_1 %xmm4, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L35:
	testq $8, N
	jle .L36

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6

	SHUFPD_1 %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0

	SHUFPD_1 %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movapd %xmm6, %xmm4
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L36:
	testq $4, N
	jle .L37

	movaps -32 * SIZE(X), %xmm5
	SHUFPD_1 %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0

	movaps %xmm5, %xmm4
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L37:
	testq $2, N
	jle .L38

	xorps %xmm5, %xmm5
	movhlps %xmm4, %xmm5
	movlps -32 * SIZE(Y), %xmm4
	mulps %xmm4, %xmm5
	addps %xmm5, %xmm0

	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L38:
	testq $1, N
	jle .L999

	movss -34 * SIZE(X), %xmm4
	mulss -32 * SIZE(Y), %xmm4
	addss %xmm4, %xmm0
	jmp .L999
	ALIGN_3
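
	/* X is three floats past a 16-byte boundary: realign the aligned X
	   loads with movss/shufps. */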
.L40:
	movaps -35 * SIZE(X), %xmm4
	addq $SIZE, X

	movq N, %rax
	sarq $5, %rax
	jle .L44

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6
	movaps -24 * SIZE(X), %xmm7
	movaps -20 * SIZE(X), %xmm8
	movaps -16 * SIZE(X), %xmm9
	movaps -12 * SIZE(X), %xmm10
	movaps -8 * SIZE(X), %xmm11

	decq %rax
	jle .L42
	ALIGN_3

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -4 * SIZE(X), %xmm4

	movss %xmm6, %xmm5
	shufps $0x93, %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	movaps 0 * SIZE(X), %xmm5
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	shufps $0x93, %xmm7, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	movaps 4 * SIZE(X), %xmm6

	movss %xmm8, %xmm7
	shufps $0x93, %xmm8, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	movaps 8 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm9, %xmm8
	shufps $0x93, %xmm9, %xmm8
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	movaps 12 * SIZE(X), %xmm8

	movss %xmm10, %xmm9
	shufps $0x93, %xmm10, %xmm9
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	movaps 16 * SIZE(X), %xmm9
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm11, %xmm10
	shufps $0x93, %xmm11, %xmm10
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	movaps 20 * SIZE(X), %xmm10

	movss %xmm4, %xmm11
	shufps $0x93, %xmm4, %xmm11
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3
	movaps 24 * SIZE(X), %xmm11

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L41
	ALIGN_3

.L42:
	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -4 * SIZE(X), %xmm4

	movss %xmm6, %xmm5
	shufps $0x93, %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movss %xmm7, %xmm6
	shufps $0x93, %xmm7, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2

	movss %xmm8, %xmm7
	shufps $0x93, %xmm8, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	movss %xmm9, %xmm8
	shufps $0x93, %xmm9, %xmm8
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0

	movss %xmm10, %xmm9
	shufps $0x93, %xmm10, %xmm9
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1

	movss %xmm11, %xmm10
	shufps $0x93, %xmm11, %xmm10
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2

	movss %xmm4, %xmm11
	shufps $0x93, %xmm4, %xmm11
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L44:
	testq $31, N
	jle .L999

	testq $16, N
	jle .L45

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6
	movaps -24 * SIZE(X), %xmm7

	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movaps -20 * SIZE(X), %xmm4

	movss %xmm6, %xmm5
	shufps $0x93, %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movss %xmm7, %xmm6
	shufps $0x93, %xmm7, %xmm6
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2

	movss %xmm4, %xmm7
	shufps $0x93, %xmm4, %xmm7
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L45:
	testq $8, N
	jle .L46

	movaps -32 * SIZE(X), %xmm5
	movaps -28 * SIZE(X), %xmm6

	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0

	movss %xmm6, %xmm5
	shufps $0x93, %xmm6, %xmm5
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	movaps %xmm6, %xmm4
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L46:
	testq $4, N
	jle .L47

	movaps -32 * SIZE(X), %xmm5
	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm2

	movaps %xmm5, %xmm4
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L47:
	testq $2, N
	jle .L48

	movaps -32 * SIZE(X), %xmm5
#ifdef movsd
	xorps %xmm8, %xmm8
#endif
	movsd -32 * SIZE(Y), %xmm8
	movss %xmm5, %xmm4
	shufps $0x93, %xmm4, %xmm4
	mulps %xmm8, %xmm4
	addps %xmm4, %xmm3

	movlhps %xmm5, %xmm4
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L48:
	testq $1, N
	jle .L999

	pshufd $0x93, %xmm4, %xmm4
	mulss -32 * SIZE(Y), %xmm4
	addss %xmm4, %xmm0
	jmp .L999
	ALIGN_4

#else
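	/* Without ALIGNED_ACCESS, the misaligned-X case simply uses unaligned
	   movlps/movhps pair loads. */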
	movq N, %rax
	sarq $5, %rax
	jle .L24

	movlps -32 * SIZE(X), %xmm4
	movhps -30 * SIZE(X), %xmm4
	movlps -28 * SIZE(X), %xmm5
	movhps -26 * SIZE(X), %xmm5
	movlps -24 * SIZE(X), %xmm6
	movhps -22 * SIZE(X), %xmm6
	movlps -20 * SIZE(X), %xmm7
	movhps -18 * SIZE(X), %xmm7
	movlps -16 * SIZE(X), %xmm8
	movhps -14 * SIZE(X), %xmm8
	movlps -12 * SIZE(X), %xmm9
	movhps -10 * SIZE(X), %xmm9
	movlps -8 * SIZE(X), %xmm10
	movhps -6 * SIZE(X), %xmm10
	movlps -4 * SIZE(X), %xmm11
	movhps -2 * SIZE(X), %xmm11

	decq %rax
	jle .L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	movlps 0 * SIZE(X), %xmm4
	movhps 2 * SIZE(X), %xmm4

	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	movlps 4 * SIZE(X), %xmm5
	movhps 6 * SIZE(X), %xmm5
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	movlps 8 * SIZE(X), %xmm6
	movhps 10 * SIZE(X), %xmm6

	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	movlps 12 * SIZE(X), %xmm7
	movhps 14 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	movlps 16 * SIZE(X), %xmm8
	movhps 18 * SIZE(X), %xmm8

	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	movlps 20 * SIZE(X), %xmm9
	movhps 22 * SIZE(X), %xmm9
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	movlps 24 * SIZE(X), %xmm10
	movhps 26 * SIZE(X), %xmm10

	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3
	movlps 28 * SIZE(X), %xmm11
	movhps 30 * SIZE(X), %xmm11

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L21
	ALIGN_3

.L22:
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3
	mulps -16 * SIZE(Y), %xmm8
	addps %xmm8, %xmm0
	mulps -12 * SIZE(Y), %xmm9
	addps %xmm9, %xmm1
	mulps -8 * SIZE(Y), %xmm10
	addps %xmm10, %xmm2
	mulps -4 * SIZE(Y), %xmm11
	addps %xmm11, %xmm3

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L24:
	testq $31, N
	jle .L999

	testq $16, N
	jle .L25

	movlps -32 * SIZE(X), %xmm4
	movhps -30 * SIZE(X), %xmm4
	movlps -28 * SIZE(X), %xmm5
	movhps -26 * SIZE(X), %xmm5
	movlps -24 * SIZE(X), %xmm6
	movhps -22 * SIZE(X), %xmm6
	movlps -20 * SIZE(X), %xmm7
	movhps -18 * SIZE(X), %xmm7

	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1
	mulps -24 * SIZE(Y), %xmm6
	addps %xmm6, %xmm2
	mulps -20 * SIZE(Y), %xmm7
	addps %xmm7, %xmm3

	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L25:
	testq $8, N
	jle .L26

	movlps -32 * SIZE(X), %xmm4
	movhps -30 * SIZE(X), %xmm4
	movlps -28 * SIZE(X), %xmm5
	movhps -26 * SIZE(X), %xmm5

	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm0
	mulps -28 * SIZE(Y), %xmm5
	addps %xmm5, %xmm1

	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L26:
	testq $4, N
	jle .L27

	movlps -32 * SIZE(X), %xmm4
	movhps -30 * SIZE(X), %xmm4
	mulps -32 * SIZE(Y), %xmm4
	addps %xmm4, %xmm2

	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L27:
	testq $2, N
	jle .L28

#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(X), %xmm4
#ifdef movsd
	xorps %xmm8, %xmm8
#endif
	movsd -32 * SIZE(Y), %xmm8
	mulps %xmm8, %xmm4
	addps %xmm4, %xmm3

	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L28:
	testq $1, N
	jle .L999

	movss -32 * SIZE(X), %xmm4
	mulss -32 * SIZE(Y), %xmm4
	addss %xmm4, %xmm0
	jmp .L999
	ALIGN_3
#endif
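
	/* INCX and/or INCY is not 1: plain scalar loop, unrolled by four. */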
.L50:
	movq N, %rax
	sarq $2, %rax
	jle .L55
	ALIGN_3

.L53:
	movss 0 * SIZE(X), %xmm4
	addq INCX, X
	mulss 0 * SIZE(Y), %xmm4
	addq INCY, Y

	movss 0 * SIZE(X), %xmm5
	addq INCX, X
	mulss 0 * SIZE(Y), %xmm5
	addq INCY, Y

	movss 0 * SIZE(X), %xmm6
	addq INCX, X
	mulss 0 * SIZE(Y), %xmm6
	addq INCY, Y

	movss 0 * SIZE(X), %xmm7
	addq INCX, X
	mulss 0 * SIZE(Y), %xmm7
	addq INCY, Y

	addss %xmm4, %xmm0
	addss %xmm5, %xmm1
	addss %xmm6, %xmm2
	addss %xmm7, %xmm3

	decq %rax
	jg .L53
	ALIGN_3

.L55:
	movq N, %rax
	andq $3, %rax
	jle .L999
	ALIGN_3

.L56:
	movss 0 * SIZE(X), %xmm4
	addq INCX, X
	mulss 0 * SIZE(Y), %xmm4
	addq INCY, Y
	addss %xmm4, %xmm0

	decq %rax
	jg .L56
	ALIGN_3
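
	/* Reduce the four partial sums to a single scalar (haddps when SSE3
	   is available), then widen to double for DSDOT. */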
.L999:
	addps %xmm1, %xmm0
	addps %xmm3, %xmm2
	addps %xmm2, %xmm0

#ifndef HAVE_SSE3
	movhlps %xmm0, %xmm1
	addps %xmm1, %xmm0

	movaps %xmm0, %xmm1
	shufps $1, %xmm0, %xmm0
	addss %xmm1, %xmm0
#else
	haddps %xmm0, %xmm0
	haddps %xmm0, %xmm0
#endif

#ifdef DSDOT
	cvtss2sd %xmm0, %xmm0
#endif

	RESTOREREGISTERS
	ret
	EPILOGUE
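
For reference, the computation implemented by this kernel corresponds to the C sketch below. The function and parameter names are illustrative only (they are not the exact OpenBLAS entry point or calling convention), and the sketch assumes positive element strides; the assembly above is an SSE-optimized, alignment-aware unrolling of this loop.

    /* Reference sketch: single-precision dot product with element strides.
       Names and signature are illustrative, not the library's interface. */
    static float sdot_ref(long n, const float *x, long incx,
                          const float *y, long incy)
    {
        float sum = 0.0f;  /* the kernel keeps four such partial sums in xmm0-xmm3 */
        for (long i = 0; i < n; i++)
            sum += x[i * incx] * y[i * incy];
        return sum;        /* with DSDOT the kernel widens this value to double */
    }

Because the kernel accumulates into four partial sums and reduces them at .L999, its floating-point rounding can differ slightly from this strictly sequential loop, which is normal for vectorized BLAS kernels.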