
zgemv_n_atom.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif
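
/* Argument passing differs between the System V and Windows x86-64
   ABIs, so the operand names below are mapped per ABI; later
   arguments arrive on the stack above the local frame. */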
#ifndef WINDOWS_ABI

#define STACKSIZE	64

#define OLD_INCX	 8 + STACKSIZE(%rsp)
#define OLD_Y		16 + STACKSIZE(%rsp)
#define OLD_INCY	24 + STACKSIZE(%rsp)
#define OLD_BUFFER	32 + STACKSIZE(%rsp)

#define M	%rdi
#define N	%rsi
#define A	%rcx
#define LDA	%r8
#define X	%r9
#define INCX	%rdx

#define Y	%rbp
#define INCY	%r10

#else

#define STACKSIZE	256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_LDA		56 + STACKSIZE(%rsp)
#define OLD_X		64 + STACKSIZE(%rsp)
#define OLD_INCX	72 + STACKSIZE(%rsp)
#define OLD_Y		80 + STACKSIZE(%rsp)
#define OLD_INCY	88 + STACKSIZE(%rsp)
#define OLD_BUFFER	96 + STACKSIZE(%rsp)

#define M	%rcx
#define N	%rdx
#define A	%r8
#define LDA	%r9
#define X	%rdi
#define INCX	%rsi

#define Y	%rbp
#define INCY	%r10

#endif

#define I	%rax
#define J	%r11

#define A1	%r12
#define A2	%r13

#define Y1	%r14
#define BUFFER	%r15

#define ALPHA_R	%xmm14
#define ALPHA_I	%xmm15
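
/* ADD1..ADD4 select the add/subtract pattern of the complex
   multiply-accumulate for the four CONJ/XCONJ combinations. */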
#if !defined(CONJ) && !defined(XCONJ)
#define ADD1	addsd
#define ADD2	addsd
#define ADD3	subsd
#define ADD4	addsd
#endif

#if defined(CONJ) && !defined(XCONJ)
#define ADD1	addsd
#define ADD2	addsd
#define ADD3	addsd
#define ADD4	subsd
#endif

#if !defined(CONJ) && defined(XCONJ)
#define ADD1	addsd
#define ADD2	subsd
#define ADD3	addsd
#define ADD4	addsd
#endif

#if defined(CONJ) && defined(XCONJ)
#define ADD1	addsd
#define ADD2	subsd
#define ADD3	subsd
#define ADD4	subsd
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_A,   A
	movq	OLD_LDA, LDA
	movq	OLD_X,   X

	movapd	%xmm3, %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#endif

	movq	OLD_INCX,   INCX
	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, LDA
	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	movaps	%xmm0, ALPHA_R
	movaps	%xmm1, ALPHA_I

	subq	$-16 * SIZE, A

	testq	M, M
	jle	.L999
	testq	N, N
	jle	.L999
	ALIGN_3
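
/* Zero the unit-stride accumulation buffer, 16 doubles (8 complex
   elements) per pass; y is updated from it at the end. */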
	movq	BUFFER, Y1

	pxor	%xmm4, %xmm4

	movq	M, %rax
	addq	$8, %rax
	sarq	$3, %rax
	ALIGN_3

.L01:
	movapd	%xmm4,  0 * SIZE(Y1)
	movapd	%xmm4,  2 * SIZE(Y1)
	movapd	%xmm4,  4 * SIZE(Y1)
	movapd	%xmm4,  6 * SIZE(Y1)
	movapd	%xmm4,  8 * SIZE(Y1)
	movapd	%xmm4, 10 * SIZE(Y1)
	movapd	%xmm4, 12 * SIZE(Y1)
	movapd	%xmm4, 14 * SIZE(Y1)
	subq	$-16 * SIZE, Y1
	decq	%rax
	jg	.L01
	ALIGN_3
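
/* Column loop: process two columns of A per iteration (J = N / 2),
   with A1 and A2 walking the two columns. */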
.L10:
	movq	N, J
	sarq	$1, J
	jle	.L20
	ALIGN_3

.L11:
	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	(A, LDA, 2), A

	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addq	INCX, X

	movapd	%xmm4, %xmm8
	mulsd	ALPHA_R, %xmm4
	mulsd	ALPHA_I, %xmm8
	movapd	%xmm6, %xmm10
	mulsd	ALPHA_R, %xmm6
	mulsd	ALPHA_I, %xmm10

	movapd	%xmm5, %xmm9
	mulsd	ALPHA_I, %xmm9
	mulsd	ALPHA_R, %xmm5
	movapd	%xmm7, %xmm11
	mulsd	ALPHA_I, %xmm11
	mulsd	ALPHA_R, %xmm7

#ifndef XCONJ
	subsd	%xmm9,  %xmm4
	addsd	%xmm8,  %xmm5
	subsd	%xmm11, %xmm6
	addsd	%xmm10, %xmm7
#else
	addsd	%xmm9,  %xmm4
	subsd	%xmm8,  %xmm5
	addsd	%xmm11, %xmm6
	subsd	%xmm10, %xmm7
#endif
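
/* xmm4/xmm5 now hold the real/imaginary parts of alpha * x[j], and
   xmm6/xmm7 those of alpha * x[j + 1], ready to scale A1 and A2. */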
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-15 * SIZE(Y1), %xmm1
	movsd	-14 * SIZE(Y1), %xmm2
	movsd	-13 * SIZE(Y1), %xmm3
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L15

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-15 * SIZE(A1), %xmm9
	movsd	-14 * SIZE(A1), %xmm10
	movsd	-13 * SIZE(A1), %xmm11

	movapd	%xmm8, %xmm12
	mulsd	%xmm4, %xmm8
	mulsd	%xmm5, %xmm12
	movapd	%xmm10, %xmm13
	mulsd	%xmm4, %xmm10

	ADD1	%xmm8, %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	mulsd	%xmm5, %xmm13
	ADD2	%xmm12, %xmm1

	decq	I
	jle	.L14
	ALIGN_3
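
/* Main loop: accumulate both columns into four complex elements of
   the buffer per iteration, with loads software-pipelined ahead of
   their use to suit Atom's in-order pipeline. */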
.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A2)
#endif

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-14 * SIZE(A2), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-15 * SIZE(A2), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	mulsd	%xmm6, %xmm8
	ADD3	%xmm11, %xmm2
	movsd	-13 * SIZE(A2), %xmm11
	mulsd	%xmm7, %xmm12
	ADD4	%xmm13, %xmm3

	movapd	%xmm10, %xmm13
	mulsd	%xmm6, %xmm10
	ADD1	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	mulsd	%xmm7, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm7, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-10 * SIZE(A1), %xmm10
	mulsd	%xmm6, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm7, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-11 * SIZE(A1), %xmm9
	mulsd	%xmm6, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	movlpd	%xmm0, -16 * SIZE(Y1)
	mulsd	%xmm4, %xmm8
	movsd	-12 * SIZE(Y1), %xmm0
	ADD3	%xmm11, %xmm2
	movsd	-9 * SIZE(A1), %xmm11
	mulsd	%xmm5, %xmm12
	movlpd	%xmm1, -15 * SIZE(Y1)
	ADD4	%xmm13, %xmm3
	movsd	-11 * SIZE(Y1), %xmm1

	movapd	%xmm10, %xmm13
	movlpd	%xmm2, -14 * SIZE(Y1)
	mulsd	%xmm4, %xmm10
	movlpd	%xmm3, -13 * SIZE(Y1)
	ADD1	%xmm8, %xmm0
	movsd	-12 * SIZE(A2), %xmm8
	mulsd	%xmm5, %xmm13
	movsd	-10 * SIZE(Y1), %xmm2
	ADD2	%xmm12, %xmm1
	movsd	-9 * SIZE(Y1), %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-10 * SIZE(A2), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-11 * SIZE(A2), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	mulsd	%xmm6, %xmm8
	ADD3	%xmm11, %xmm2
	movsd	-9 * SIZE(A2), %xmm11
	mulsd	%xmm7, %xmm12
	ADD4	%xmm13, %xmm3

	movapd	%xmm10, %xmm13
	mulsd	%xmm6, %xmm10
	ADD1	%xmm8, %xmm0
	movsd	-8 * SIZE(A1), %xmm8
	mulsd	%xmm7, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm7, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-6 * SIZE(A1), %xmm10
	mulsd	%xmm6, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm7, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-7 * SIZE(A1), %xmm9
	mulsd	%xmm6, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	movlpd	%xmm0, -12 * SIZE(Y1)
	mulsd	%xmm4, %xmm8
	movsd	-8 * SIZE(Y1), %xmm0
	ADD3	%xmm11, %xmm2
	movsd	-5 * SIZE(A1), %xmm11
	mulsd	%xmm5, %xmm12
	movlpd	%xmm1, -11 * SIZE(Y1)
	ADD4	%xmm13, %xmm3
	movsd	-7 * SIZE(Y1), %xmm1

	movapd	%xmm10, %xmm13
	movlpd	%xmm2, -10 * SIZE(Y1)
	mulsd	%xmm4, %xmm10
	movsd	-6 * SIZE(Y1), %xmm2
	ADD1	%xmm8, %xmm0
	movsd	-8 * SIZE(A2), %xmm8
	mulsd	%xmm5, %xmm13
	movlpd	%xmm3, -9 * SIZE(Y1)
	ADD2	%xmm12, %xmm1
	movsd	-5 * SIZE(Y1), %xmm3

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L13
	ALIGN_3
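
/* Loop tail: drain the values preloaded for the final unrolled
   iteration. */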
.L14:
	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-14 * SIZE(A2), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-15 * SIZE(A2), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	mulsd	%xmm6, %xmm8
	ADD3	%xmm11, %xmm2
	movsd	-13 * SIZE(A2), %xmm11
	mulsd	%xmm7, %xmm12
	ADD4	%xmm13, %xmm3

	movapd	%xmm10, %xmm13
	mulsd	%xmm6, %xmm10
	ADD1	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	mulsd	%xmm7, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm7, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-10 * SIZE(A1), %xmm10
	mulsd	%xmm6, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm7, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-11 * SIZE(A1), %xmm9
	mulsd	%xmm6, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	movlpd	%xmm0, -16 * SIZE(Y1)
	mulsd	%xmm4, %xmm8
	movsd	-12 * SIZE(Y1), %xmm0
	ADD3	%xmm11, %xmm2
	movsd	-9 * SIZE(A1), %xmm11
	mulsd	%xmm5, %xmm12
	movlpd	%xmm1, -15 * SIZE(Y1)
	ADD4	%xmm13, %xmm3
	movsd	-11 * SIZE(Y1), %xmm1

	movapd	%xmm10, %xmm13
	movlpd	%xmm2, -14 * SIZE(Y1)
	mulsd	%xmm4, %xmm10
	movlpd	%xmm3, -13 * SIZE(Y1)
	ADD1	%xmm8, %xmm0
	movsd	-12 * SIZE(A2), %xmm8
	mulsd	%xmm5, %xmm13
	movsd	-10 * SIZE(Y1), %xmm2
	ADD2	%xmm12, %xmm1
	movsd	-9 * SIZE(Y1), %xmm3

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-10 * SIZE(A2), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-11 * SIZE(A2), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	mulsd	%xmm6, %xmm8
	ADD3	%xmm11, %xmm2
	movsd	-9 * SIZE(A2), %xmm11
	mulsd	%xmm7, %xmm12
	ADD4	%xmm13, %xmm3

	movapd	%xmm10, %xmm13
	mulsd	%xmm6, %xmm10
	ADD1	%xmm8, %xmm0
	mulsd	%xmm7, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm7, %xmm9
	ADD1	%xmm10, %xmm2
	mulsd	%xmm6, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm7, %xmm11
	ADD3	%xmm9, %xmm0
	mulsd	%xmm6, %xmm13
	ADD4	%xmm12, %xmm1
	ADD3	%xmm11, %xmm2

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-8 * SIZE(Y1), %xmm0
	ADD4	%xmm13, %xmm3
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	-7 * SIZE(Y1), %xmm1
	movlpd	%xmm2, -10 * SIZE(Y1)
	movsd	-6 * SIZE(Y1), %xmm2
	movlpd	%xmm3, -9 * SIZE(Y1)
	movsd	-5 * SIZE(Y1), %xmm3

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, A2
	subq	$-8 * SIZE, Y1
	ALIGN_3
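
/* Remainder: two complex elements when M % 4 >= 2. */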
.L15:
	testq	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-15 * SIZE(A1), %xmm9
	movsd	-14 * SIZE(A1), %xmm10
	movsd	-13 * SIZE(A1), %xmm11

	movapd	%xmm8, %xmm12
	mulsd	%xmm4, %xmm8
	mulsd	%xmm5, %xmm12
	movapd	%xmm10, %xmm13
	mulsd	%xmm4, %xmm10

	ADD1	%xmm8, %xmm0
	movsd	-16 * SIZE(A2), %xmm8
	mulsd	%xmm5, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-14 * SIZE(A2), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-15 * SIZE(A2), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	mulsd	%xmm6, %xmm8
	ADD3	%xmm11, %xmm2
	movsd	-13 * SIZE(A2), %xmm11
	mulsd	%xmm7, %xmm12
	ADD4	%xmm13, %xmm3

	movapd	%xmm10, %xmm13
	mulsd	%xmm6, %xmm10
	ADD1	%xmm8, %xmm0
	mulsd	%xmm7, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm7, %xmm9
	ADD1	%xmm10, %xmm2
	mulsd	%xmm6, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm7, %xmm11
	ADD3	%xmm9, %xmm0
	mulsd	%xmm6, %xmm13
	ADD4	%xmm12, %xmm1
	ADD3	%xmm11, %xmm2
	ADD4	%xmm13, %xmm3

	movlpd	%xmm0, -16 * SIZE(Y1)
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movsd	-11 * SIZE(Y1), %xmm1
	movlpd	%xmm2, -14 * SIZE(Y1)
	movlpd	%xmm3, -13 * SIZE(Y1)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, Y1
	ALIGN_3
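
/* Remainder: one complex element when M is odd. */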
.L17:
	testq	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-15 * SIZE(A1), %xmm9
	movsd	-16 * SIZE(A2), %xmm10
	movsd	-15 * SIZE(A2), %xmm11

	movapd	%xmm8, %xmm12
	mulsd	%xmm4, %xmm8
	mulsd	%xmm5, %xmm12
	movapd	%xmm9, %xmm13
	mulsd	%xmm5, %xmm9

	ADD1	%xmm8, %xmm0
	mulsd	%xmm4, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm10, %xmm12
	mulsd	%xmm6, %xmm10
	ADD3	%xmm9, %xmm0
	mulsd	%xmm7, %xmm12
	ADD4	%xmm13, %xmm1

	movapd	%xmm11, %xmm13
	mulsd	%xmm7, %xmm11
	ADD1	%xmm10, %xmm0
	mulsd	%xmm6, %xmm13
	ADD2	%xmm12, %xmm1
	ADD3	%xmm11, %xmm0
	ADD4	%xmm13, %xmm1

	movlpd	%xmm0, -16 * SIZE(Y1)
	movlpd	%xmm1, -15 * SIZE(Y1)
	ALIGN_3

.L19:
	decq	J
	jg	.L11
	ALIGN_3
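
/* Single remaining column when N is odd: same pattern, A1 only. */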
.L20:
	testq	$1, N
	jle	.L90

	leaq	16 * SIZE(BUFFER), Y1
	movq	A, A1

	movsd	0 * SIZE(X), %xmm4
	movsd	1 * SIZE(X), %xmm5

	movapd	%xmm4, %xmm8
	mulsd	ALPHA_R, %xmm4
	mulsd	ALPHA_I, %xmm8

	movapd	%xmm5, %xmm9
	mulsd	ALPHA_I, %xmm9
	mulsd	ALPHA_R, %xmm5

#ifndef XCONJ
	subsd	%xmm9, %xmm4
	addsd	%xmm8, %xmm5
#else
	addsd	%xmm9, %xmm4
	subsd	%xmm8, %xmm5
#endif

	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-15 * SIZE(Y1), %xmm1
	movsd	-14 * SIZE(Y1), %xmm2
	movsd	-13 * SIZE(Y1), %xmm3
	ALIGN_3

	movq	M, I
	sarq	$2, I
	jle	.L25

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-15 * SIZE(A1), %xmm9
	movsd	-14 * SIZE(A1), %xmm10
	movsd	-13 * SIZE(A1), %xmm11

	movapd	%xmm8, %xmm12
	mulsd	%xmm4, %xmm8
	mulsd	%xmm5, %xmm12
	movapd	%xmm10, %xmm13
	mulsd	%xmm4, %xmm10

	ADD1	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	mulsd	%xmm5, %xmm13
	ADD2	%xmm12, %xmm1

	decq	I
	jle	.L24
	ALIGN_3
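
/* Inner loop for the last column: four complex elements of the
   buffer per iteration, all from A1. */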
.L23:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif
	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-10 * SIZE(A1), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-11 * SIZE(A1), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	movlpd	%xmm0, -16 * SIZE(Y1)
	mulsd	%xmm4, %xmm8
	movsd	-12 * SIZE(Y1), %xmm0
	ADD3	%xmm11, %xmm2
	movsd	-9 * SIZE(A1), %xmm11
	mulsd	%xmm5, %xmm12
	movlpd	%xmm1, -15 * SIZE(Y1)
	ADD4	%xmm13, %xmm3
	movsd	-11 * SIZE(Y1), %xmm1

	movapd	%xmm10, %xmm13
	movlpd	%xmm2, -14 * SIZE(Y1)
	mulsd	%xmm4, %xmm10
	movsd	-10 * SIZE(Y1), %xmm2
	ADD1	%xmm8, %xmm0
	movsd	-8 * SIZE(A1), %xmm8
	mulsd	%xmm5, %xmm13
	movlpd	%xmm3, -13 * SIZE(Y1)
	ADD2	%xmm12, %xmm1
	movsd	-9 * SIZE(Y1), %xmm3

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-6 * SIZE(A1), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-7 * SIZE(A1), %xmm9
	mulsd	%xmm4, %xmm13
	subq	$-8 * SIZE, A1
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	movlpd	%xmm0, -12 * SIZE(Y1)
	mulsd	%xmm4, %xmm8
	movsd	-8 * SIZE(Y1), %xmm0
	ADD3	%xmm11, %xmm2
	movsd	-13 * SIZE(A1), %xmm11
	mulsd	%xmm5, %xmm12
	movlpd	%xmm1, -11 * SIZE(Y1)
	ADD4	%xmm13, %xmm3
	movsd	-7 * SIZE(Y1), %xmm1

	movapd	%xmm10, %xmm13
	movlpd	%xmm2, -10 * SIZE(Y1)
	mulsd	%xmm4, %xmm10
	movsd	-6 * SIZE(Y1), %xmm2
	ADD1	%xmm8, %xmm0
	movsd	-12 * SIZE(A1), %xmm8
	mulsd	%xmm5, %xmm13
	movlpd	%xmm3, -9 * SIZE(Y1)
	ADD2	%xmm12, %xmm1
	movsd	-5 * SIZE(Y1), %xmm3

	subq	$-8 * SIZE, Y1

	subq	$1, I
	BRANCH
	jg	.L23
	ALIGN_3
.L24:
	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	movsd	-10 * SIZE(A1), %xmm10
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	movsd	-11 * SIZE(A1), %xmm9
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1

	movapd	%xmm8, %xmm12
	movlpd	%xmm0, -16 * SIZE(Y1)
	mulsd	%xmm4, %xmm8
	movsd	-12 * SIZE(Y1), %xmm0
	ADD3	%xmm11, %xmm2
	movsd	-9 * SIZE(A1), %xmm11
	mulsd	%xmm5, %xmm12
	movlpd	%xmm1, -15 * SIZE(Y1)
	ADD4	%xmm13, %xmm3
	movsd	-11 * SIZE(Y1), %xmm1

	movapd	%xmm10, %xmm13
	mulsd	%xmm4, %xmm10
	movlpd	%xmm2, -14 * SIZE(Y1)
	ADD1	%xmm8, %xmm0
	movsd	-10 * SIZE(Y1), %xmm2
	mulsd	%xmm5, %xmm13
	movlpd	%xmm3, -13 * SIZE(Y1)
	ADD2	%xmm12, %xmm1
	movsd	-9 * SIZE(Y1), %xmm3

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1
	ADD3	%xmm11, %xmm2

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-8 * SIZE(Y1), %xmm0
	ADD4	%xmm13, %xmm3
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	-7 * SIZE(Y1), %xmm1
	movlpd	%xmm2, -10 * SIZE(Y1)
	movlpd	%xmm3, -9 * SIZE(Y1)
	movsd	-6 * SIZE(Y1), %xmm2
	movsd	-5 * SIZE(Y1), %xmm3

	subq	$-8 * SIZE, A1
	subq	$-8 * SIZE, Y1
	ALIGN_3

.L25:
	testq	$2, M
	je	.L27

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-15 * SIZE(A1), %xmm9
	movsd	-14 * SIZE(A1), %xmm10
	movsd	-13 * SIZE(A1), %xmm11

	movapd	%xmm8, %xmm12
	mulsd	%xmm4, %xmm8
	mulsd	%xmm5, %xmm12
	movapd	%xmm10, %xmm13
	mulsd	%xmm4, %xmm10

	ADD1	%xmm8, %xmm0
	mulsd	%xmm5, %xmm13
	ADD2	%xmm12, %xmm1

	movapd	%xmm9, %xmm12
	mulsd	%xmm5, %xmm9
	ADD1	%xmm10, %xmm2
	mulsd	%xmm4, %xmm12
	ADD2	%xmm13, %xmm3

	movapd	%xmm11, %xmm13
	mulsd	%xmm5, %xmm11
	ADD3	%xmm9, %xmm0
	mulsd	%xmm4, %xmm13
	ADD4	%xmm12, %xmm1
	ADD3	%xmm11, %xmm2

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	ADD4	%xmm13, %xmm3
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1
	movlpd	%xmm2, -14 * SIZE(Y1)
	movlpd	%xmm3, -13 * SIZE(Y1)

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, Y1
	ALIGN_3

.L27:
	testq	$1, M
	je	.L90

	movsd	-16 * SIZE(A1), %xmm8
	movsd	-15 * SIZE(A1), %xmm9

	movapd	%xmm8, %xmm12
	mulsd	%xmm4, %xmm8
	mulsd	%xmm5, %xmm12
	movapd	%xmm9, %xmm13
	mulsd	%xmm5, %xmm9

	ADD1	%xmm8, %xmm0
	mulsd	%xmm4, %xmm13
	ADD2	%xmm12, %xmm1
	ADD3	%xmm9, %xmm0
	ADD4	%xmm13, %xmm1

	movlpd	%xmm0, -16 * SIZE(Y1)
	movlpd	%xmm1, -15 * SIZE(Y1)
	ALIGN_3
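
/* Merge pass: add the buffered result into y with stride INCY,
   eight complex elements per iteration. */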
.L90:
	movq	Y, Y1

	movq	M, %rax
	sarq	$3, %rax
	jle	.L94
	ALIGN_3

.L92:
	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm2
	movhpd	1 * SIZE(Y), %xmm2
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm5
	movhpd	1 * SIZE(Y), %xmm5
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm6
	movhpd	1 * SIZE(Y), %xmm6
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm7
	movhpd	1 * SIZE(Y), %xmm7
	addq	INCY, Y

	addpd	 0 * SIZE(BUFFER), %xmm0
	addpd	 2 * SIZE(BUFFER), %xmm1
	addpd	 4 * SIZE(BUFFER), %xmm2
	addpd	 6 * SIZE(BUFFER), %xmm3
	addpd	 8 * SIZE(BUFFER), %xmm4
	addpd	10 * SIZE(BUFFER), %xmm5
	addpd	12 * SIZE(BUFFER), %xmm6
	addpd	14 * SIZE(BUFFER), %xmm7

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm3, 0 * SIZE(Y1)
	movhpd	%xmm3, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm4, 0 * SIZE(Y1)
	movhpd	%xmm4, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm5, 0 * SIZE(Y1)
	movhpd	%xmm5, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm6, 0 * SIZE(Y1)
	movhpd	%xmm6, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm7, 0 * SIZE(Y1)
	movhpd	%xmm7, 1 * SIZE(Y1)
	addq	INCY, Y1

	subq	$-16 * SIZE, BUFFER
	decq	%rax
	jg	.L92
	ALIGN_3

.L94:
	testq	$7, M
	jle	.L999

	testq	$4, M
	jle	.L95

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm2
	movhpd	1 * SIZE(Y), %xmm2
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm3
	movhpd	1 * SIZE(Y), %xmm3
	addq	INCY, Y

	addpd	0 * SIZE(BUFFER), %xmm0
	addpd	2 * SIZE(BUFFER), %xmm1
	addpd	4 * SIZE(BUFFER), %xmm2
	addpd	6 * SIZE(BUFFER), %xmm3

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm2, 0 * SIZE(Y1)
	movhpd	%xmm2, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm3, 0 * SIZE(Y1)
	movhpd	%xmm3, 1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$8 * SIZE, BUFFER
	ALIGN_3

.L95:
	testq	$2, M
	jle	.L96

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0
	addq	INCY, Y
	movsd	0 * SIZE(Y), %xmm1
	movhpd	1 * SIZE(Y), %xmm1
	addq	INCY, Y

	addpd	0 * SIZE(BUFFER), %xmm0
	addpd	2 * SIZE(BUFFER), %xmm1

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addq	INCY, Y1
	movlpd	%xmm1, 0 * SIZE(Y1)
	movhpd	%xmm1, 1 * SIZE(Y1)
	addq	INCY, Y1

	addq	$4 * SIZE, BUFFER
	ALIGN_3

.L96:
	testq	$1, M
	jle	.L999

	movsd	0 * SIZE(Y), %xmm0
	movhpd	1 * SIZE(Y), %xmm0

	addpd	0 * SIZE(BUFFER), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	ALIGN_3
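
/* Restore callee-saved registers (and, on Windows, xmm6-xmm15) and
   return. */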
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE