zgemv_n.S

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
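/* Annotation (added, not part of the original source): this appears to be the
   non-transposed double-precision complex GEMV kernel (zgemv_n) from the
   GotoBLAS/OpenBLAS family. It accumulates alpha * A * x into the contiguous
   BUFFER work area; the result is presumably copied back to Y (honouring INCY)
   after the loops shown here. The macro blocks below map the arguments to
   registers and stack slots for the System V and Windows x86-64 ABIs. */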
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "l2param.h"
  41. #ifndef WINDOWS_ABI
  42. #define STACKSIZE 128
  43. #define OLD_INCX 8 + STACKSIZE(%rsp)
  44. #define OLD_Y 16 + STACKSIZE(%rsp)
  45. #define OLD_INCY 24 + STACKSIZE(%rsp)
  46. #define OLD_BUFFER 32 + STACKSIZE(%rsp)
  47. #define ALPHA_R 48 (%rsp)
  48. #define ALPHA_I 56 (%rsp)
  49. #define MMM 64(%rsp)
  50. #define NN 72(%rsp)
  51. #define AA 80(%rsp)
  52. #define XX 88(%rsp)
  53. #define LDAX 96(%rsp)
  54. #define ALPHAR 104(%rsp)
  55. #define ALPHAI 112(%rsp)
  56. #define M %rdi
  57. #define N %rsi
  58. #define A %rcx
  59. #define LDA %r8
  60. #define X %r9
  61. #define INCX %rdx
  62. #define Y %rbp
  63. #define INCY %r10
  64. #else
  65. #define STACKSIZE 304
  66. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  67. #define OLD_A 48 + STACKSIZE(%rsp)
  68. #define OLD_LDA 56 + STACKSIZE(%rsp)
  69. #define OLD_X 64 + STACKSIZE(%rsp)
  70. #define OLD_INCX 72 + STACKSIZE(%rsp)
  71. #define OLD_Y 80 + STACKSIZE(%rsp)
  72. #define OLD_INCY 88 + STACKSIZE(%rsp)
  73. #define OLD_BUFFER 96 + STACKSIZE(%rsp)
  74. #define ALPHA_R 224 (%rsp)
  75. #define ALPHA_I 232 (%rsp)
  76. #define MMM 240(%rsp)
  77. #define NN 248(%rsp)
  78. #define AA 256(%rsp)
  79. #define XX 264(%rsp)
  80. #define LDAX 272(%rsp)
  81. #define ALPHAR 280(%rsp)
  82. #define ALPHAI 288(%rsp)
  83. #define M %rcx
  84. #define N %rdx
  85. #define A %r8
  86. #define LDA %r9
  87. #define X %rdi
  88. #define INCX %rsi
  89. #define Y %rbp
  90. #define INCY %r10
  91. #endif
  92. #define I %rax
  93. #define A1 %r12
  94. #define A2 %r13
  95. #define Y1 %r14
  96. #define BUFFER %r15
  97. #define J %r11
  98. #undef SUBPD
  99. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  100. #define SUBPD subpd
  101. #else
  102. #define SUBPD addpd
  103. #endif
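/* Annotation (added): SUBPD resolves to subpd when neither or both of CONJ and
   XCONJ are defined, and to addpd when exactly one of them is, so the same
   multiply-accumulate sequences below implement all four conjugation variants
   of the complex product. */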
  104. PROLOGUE
  105. PROFCODE
  106. subq $STACKSIZE, %rsp
  107. movq %rbx, 0(%rsp)
  108. movq %rbp, 8(%rsp)
  109. movq %r12, 16(%rsp)
  110. movq %r13, 24(%rsp)
  111. movq %r14, 32(%rsp)
  112. movq %r15, 40(%rsp)
  113. #ifdef WINDOWS_ABI
  114. movq %rdi, 48(%rsp)
  115. movq %rsi, 56(%rsp)
  116. movups %xmm6, 64(%rsp)
  117. movups %xmm7, 80(%rsp)
  118. movups %xmm8, 96(%rsp)
  119. movups %xmm9, 112(%rsp)
  120. movups %xmm10, 128(%rsp)
  121. movups %xmm11, 144(%rsp)
  122. movups %xmm12, 160(%rsp)
  123. movups %xmm13, 176(%rsp)
  124. movups %xmm14, 192(%rsp)
  125. movups %xmm15, 208(%rsp)
  126. movq OLD_A, A
  127. movq OLD_LDA, LDA
  128. movq OLD_X, X
  129. movapd %xmm3, %xmm0
  130. movsd OLD_ALPHA_I, %xmm1
  131. #endif
  132. movq A, AA
  133. movq N, NN
  134. movq M, MMM
  135. movq LDA, LDAX
  136. movq X, XX
  137. movq OLD_Y, Y
  138. movsd %xmm0,ALPHAR
  139. movsd %xmm1,ALPHAI
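/* Annotation (added): the .L0t loop walks the matrix in blocks of at most
   1 << 18 rows per pass (the salq $18), reloading the saved arguments (AA, NN,
   LDAX, XX, ALPHAR/ALPHAI) each time; presumably this keeps the BUFFER work
   area bounded for very tall matrices. */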
  140. .L0t:
  141. xorq I,I
  142. addq $1,I
  143. salq $18,I
  144. subq I,MMM
  145. movq I,M
  146. movsd ALPHAR,%xmm0
  147. movsd ALPHAI,%xmm1
  148. jge .L00t
  149. movq MMM,M
  150. addq I,M
  151. jle .L999x
  152. .L00t:
  153. movq AA, A
  154. movq NN, N
  155. movq LDAX, LDA
  156. movq XX, X
  157. movq OLD_INCX, INCX
  158. # movq OLD_Y, Y
  159. movq OLD_INCY, INCY
  160. movq OLD_BUFFER, BUFFER
  161. salq $ZBASE_SHIFT, LDA
  162. salq $ZBASE_SHIFT, INCX
  163. salq $ZBASE_SHIFT, INCY
  164. movlpd %xmm0, ALPHA_R
  165. movlpd %xmm1, ALPHA_I
  166. subq $-16 * SIZE, A
  167. testq M, M
  168. jle .L999
  169. testq N, N
  170. jle .L999
  171. ALIGN_3
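/* Annotation (added): the .L01 loop below zeroes the accumulation buffer,
   clearing eight complex elements (16 doubles) per iteration. */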
  172. movq BUFFER, Y1
  173. pxor %xmm4, %xmm4
  174. movq M, %rax
  175. addq $8, %rax
  176. sarq $3, %rax
  177. ALIGN_3
  178. .L01:
  179. movapd %xmm4, 0 * SIZE(Y1)
  180. movapd %xmm4, 2 * SIZE(Y1)
  181. movapd %xmm4, 4 * SIZE(Y1)
  182. movapd %xmm4, 6 * SIZE(Y1)
  183. movapd %xmm4, 8 * SIZE(Y1)
  184. movapd %xmm4, 10 * SIZE(Y1)
  185. movapd %xmm4, 12 * SIZE(Y1)
  186. movapd %xmm4, 14 * SIZE(Y1)
  187. subq $-16 * SIZE, Y1
  188. decq %rax
  189. jg .L01
  190. ALIGN_3
  191. .L10:
  192. #ifdef ALIGNED_ACCESS
  193. testq $SIZE, A
  194. jne .L100
  195. #endif
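/* Annotation (added): with ALIGNED_ACCESS, the testq checks whether A is
   16-byte aligned; if it is not, control jumps to .L100, which loads the
   matrix with movsd/movhpd pairs instead of the MOVUPS_A* macros used on the
   path below. */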
  196. #if GEMV_UNROLL >= 4
  197. cmpq $4, N
  198. jl .L20
  199. ALIGN_3
  200. .L11:
  201. subq $4, N
  202. leaq 16 * SIZE(BUFFER), Y1
  203. movq A, A1
  204. leaq (A, LDA, 2), A2
  205. leaq (A, LDA, 4), A
  206. movsd 0 * SIZE(X), %xmm8
  207. movhpd 1 * SIZE(X), %xmm8
  208. addq INCX, X
  209. movsd 0 * SIZE(X), %xmm10
  210. movhpd 1 * SIZE(X), %xmm10
  211. addq INCX, X
  212. movsd 0 * SIZE(X), %xmm12
  213. movhpd 1 * SIZE(X), %xmm12
  214. addq INCX, X
  215. movsd 0 * SIZE(X), %xmm14
  216. movhpd 1 * SIZE(X), %xmm14
  217. addq INCX, X
  218. pcmpeqb %xmm5, %xmm5
  219. psllq $63, %xmm5
  220. shufps $0xc0, %xmm5, %xmm5
  221. pshufd $0x4e, %xmm8, %xmm9
  222. pshufd $0x4e, %xmm10, %xmm11
  223. pshufd $0x4e, %xmm12, %xmm13
  224. pshufd $0x4e, %xmm14, %xmm15
  225. #ifdef HAVE_SSE3
  226. movddup ALPHA_R, %xmm6
  227. movddup ALPHA_I, %xmm7
  228. #else
  229. movsd ALPHA_R, %xmm6
  230. unpcklpd %xmm6, %xmm6
  231. movsd ALPHA_I, %xmm7
  232. unpcklpd %xmm7, %xmm7
  233. #endif
  234. xorpd %xmm5, %xmm9
  235. xorpd %xmm5, %xmm11
  236. xorpd %xmm5, %xmm13
  237. xorpd %xmm5, %xmm15
  238. mulpd %xmm6, %xmm8
  239. mulpd %xmm7, %xmm9
  240. mulpd %xmm6, %xmm10
  241. mulpd %xmm7, %xmm11
  242. mulpd %xmm6, %xmm12
  243. mulpd %xmm7, %xmm13
  244. mulpd %xmm6, %xmm14
  245. mulpd %xmm7, %xmm15
  246. #ifndef XCONJ
  247. subpd %xmm9, %xmm8
  248. subpd %xmm11, %xmm10
  249. subpd %xmm13, %xmm12
  250. subpd %xmm15, %xmm14
  251. #else
  252. addpd %xmm9, %xmm8
  253. addpd %xmm11, %xmm10
  254. addpd %xmm13, %xmm12
  255. addpd %xmm15, %xmm14
  256. #endif
  257. pshufd $0xee, %xmm8, %xmm9
  258. pshufd $0x44, %xmm8, %xmm8
  259. pshufd $0xee, %xmm10, %xmm11
  260. pshufd $0x44, %xmm10, %xmm10
  261. pshufd $0xee, %xmm12, %xmm13
  262. pshufd $0x44, %xmm12, %xmm12
  263. pshufd $0xee, %xmm14, %xmm15
  264. pshufd $0x44, %xmm14, %xmm14
  265. #ifndef CONJ
  266. xorpd %xmm5, %xmm9
  267. xorpd %xmm5, %xmm11
  268. xorpd %xmm5, %xmm13
  269. xorpd %xmm5, %xmm15
  270. #else
  271. xorpd %xmm5, %xmm8
  272. xorpd %xmm5, %xmm10
  273. xorpd %xmm5, %xmm12
  274. xorpd %xmm5, %xmm14
  275. #endif
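/* Annotation (added): at this point xmm8/xmm10/xmm12/xmm14 hold the real part
   of alpha * x_j broadcast to both lanes and xmm9/xmm11/xmm13/xmm15 the
   corresponding imaginary part with a lane-wise sign adjustment for the
   CONJ/XCONJ variant, so the inner loops need only mulpd, a pshufd lane swap,
   addpd and SUBPD per matrix element. */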
  276. MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
  277. MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
  278. MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
  279. MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
  280. ALIGN_3
  281. movq M, I
  282. sarq $2, I
  283. jle .L15
  284. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  285. MOVUPS_A1(-14 * SIZE, A1, %xmm6)
  286. decq I
  287. jle .L14
  288. ALIGN_3
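/* Annotation (added): each pass of .L13 accumulates four complex y elements
   against the four active columns (A1, A1 + LDA, A2, A2 + LDA), stores them
   back with MOVUPS_YS1, and reloads the next four, with prefetch hints
   interleaved. */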
  289. .L13:
  290. #ifdef PREFETCH
  291. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  292. #endif
  293. pshufd $0x4e, %xmm4, %xmm5
  294. mulpd %xmm8, %xmm4
  295. addpd %xmm4, %xmm0
  296. MOVUPS_A1(-12 * SIZE, A1, %xmm4)
  297. pshufd $0x4e, %xmm6, %xmm7
  298. mulpd %xmm8, %xmm6
  299. addpd %xmm6, %xmm1
  300. MOVUPS_A1(-10 * SIZE, A1, %xmm6)
  301. mulpd %xmm9, %xmm5
  302. SUBPD %xmm5, %xmm0
  303. mulpd %xmm9, %xmm7
  304. SUBPD %xmm7, %xmm1
  305. pshufd $0x4e, %xmm4, %xmm5
  306. mulpd %xmm8, %xmm4
  307. addpd %xmm4, %xmm2
  308. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
  309. pshufd $0x4e, %xmm6, %xmm7
  310. mulpd %xmm8, %xmm6
  311. addpd %xmm6, %xmm3
  312. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)
  313. mulpd %xmm9, %xmm5
  314. SUBPD %xmm5, %xmm2
  315. mulpd %xmm9, %xmm7
  316. SUBPD %xmm7, %xmm3
  317. #ifdef PREFETCH
  318. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  319. #endif
  320. pshufd $0x4e, %xmm4, %xmm5
  321. mulpd %xmm10, %xmm4
  322. addpd %xmm4, %xmm0
  323. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4)
  324. pshufd $0x4e, %xmm6, %xmm7
  325. mulpd %xmm10, %xmm6
  326. addpd %xmm6, %xmm1
  327. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6)
  328. mulpd %xmm11, %xmm5
  329. SUBPD %xmm5, %xmm0
  330. mulpd %xmm11, %xmm7
  331. SUBPD %xmm7, %xmm1
  332. pshufd $0x4e, %xmm4, %xmm5
  333. mulpd %xmm10, %xmm4
  334. addpd %xmm4, %xmm2
  335. MOVUPS_A1(-16 * SIZE, A2, %xmm4)
  336. pshufd $0x4e, %xmm6, %xmm7
  337. mulpd %xmm10, %xmm6
  338. addpd %xmm6, %xmm3
  339. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  340. mulpd %xmm11, %xmm5
  341. SUBPD %xmm5, %xmm2
  342. mulpd %xmm11, %xmm7
  343. SUBPD %xmm7, %xmm3
  344. #ifdef PREFETCH
  345. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  346. #endif
  347. pshufd $0x4e, %xmm4, %xmm5
  348. mulpd %xmm12, %xmm4
  349. addpd %xmm4, %xmm0
  350. MOVUPS_A1(-12 * SIZE, A2, %xmm4)
  351. pshufd $0x4e, %xmm6, %xmm7
  352. mulpd %xmm12, %xmm6
  353. addpd %xmm6, %xmm1
  354. MOVUPS_A1(-10 * SIZE, A2, %xmm6)
  355. mulpd %xmm13, %xmm5
  356. SUBPD %xmm5, %xmm0
  357. mulpd %xmm13, %xmm7
  358. SUBPD %xmm7, %xmm1
  359. pshufd $0x4e, %xmm4, %xmm5
  360. mulpd %xmm12, %xmm4
  361. addpd %xmm4, %xmm2
  362. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
  363. pshufd $0x4e, %xmm6, %xmm7
  364. mulpd %xmm12, %xmm6
  365. addpd %xmm6, %xmm3
  366. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)
  367. mulpd %xmm13, %xmm5
  368. SUBPD %xmm5, %xmm2
  369. mulpd %xmm13, %xmm7
  370. SUBPD %xmm7, %xmm3
  371. #ifdef PREFETCH
  372. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  373. #endif
  374. pshufd $0x4e, %xmm4, %xmm5
  375. mulpd %xmm14, %xmm4
  376. addpd %xmm4, %xmm0
  377. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4)
  378. pshufd $0x4e, %xmm6, %xmm7
  379. mulpd %xmm14, %xmm6
  380. addpd %xmm6, %xmm1
  381. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6)
  382. mulpd %xmm15, %xmm5
  383. SUBPD %xmm5, %xmm0
  384. mulpd %xmm15, %xmm7
  385. SUBPD %xmm7, %xmm1
  386. pshufd $0x4e, %xmm4, %xmm5
  387. mulpd %xmm14, %xmm4
  388. addpd %xmm4, %xmm2
  389. MOVUPS_A1( -8 * SIZE, A1, %xmm4)
  390. pshufd $0x4e, %xmm6, %xmm7
  391. mulpd %xmm14, %xmm6
  392. addpd %xmm6, %xmm3
  393. MOVUPS_A1( -6 * SIZE, A1, %xmm6)
  394. mulpd %xmm15, %xmm5
  395. SUBPD %xmm5, %xmm2
  396. mulpd %xmm15, %xmm7
  397. SUBPD %xmm7, %xmm3
  398. #ifdef PREFETCHW
  399. PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
  400. #endif
  401. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  402. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  403. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  404. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  405. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  406. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  407. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  408. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  409. subq $-8 * SIZE, A1
  410. subq $-8 * SIZE, A2
  411. subq $-8 * SIZE, Y1
  412. subq $1, I
  413. BRANCH
  414. jg .L13
  415. ALIGN_3
  416. .L14:
  417. pshufd $0x4e, %xmm4, %xmm5
  418. mulpd %xmm8, %xmm4
  419. addpd %xmm4, %xmm0
  420. MOVUPS_A1(-12 * SIZE, A1, %xmm4)
  421. pshufd $0x4e, %xmm6, %xmm7
  422. mulpd %xmm8, %xmm6
  423. addpd %xmm6, %xmm1
  424. MOVUPS_A1(-10 * SIZE, A1, %xmm6)
  425. mulpd %xmm9, %xmm5
  426. SUBPD %xmm5, %xmm0
  427. mulpd %xmm9, %xmm7
  428. SUBPD %xmm7, %xmm1
  429. pshufd $0x4e, %xmm4, %xmm5
  430. mulpd %xmm8, %xmm4
  431. addpd %xmm4, %xmm2
  432. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
  433. pshufd $0x4e, %xmm6, %xmm7
  434. mulpd %xmm8, %xmm6
  435. addpd %xmm6, %xmm3
  436. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)
  437. mulpd %xmm9, %xmm5
  438. SUBPD %xmm5, %xmm2
  439. mulpd %xmm9, %xmm7
  440. SUBPD %xmm7, %xmm3
  441. pshufd $0x4e, %xmm4, %xmm5
  442. mulpd %xmm10, %xmm4
  443. addpd %xmm4, %xmm0
  444. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4)
  445. pshufd $0x4e, %xmm6, %xmm7
  446. mulpd %xmm10, %xmm6
  447. addpd %xmm6, %xmm1
  448. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6)
  449. mulpd %xmm11, %xmm5
  450. SUBPD %xmm5, %xmm0
  451. mulpd %xmm11, %xmm7
  452. SUBPD %xmm7, %xmm1
  453. pshufd $0x4e, %xmm4, %xmm5
  454. mulpd %xmm10, %xmm4
  455. addpd %xmm4, %xmm2
  456. MOVUPS_A1(-16 * SIZE, A2, %xmm4)
  457. pshufd $0x4e, %xmm6, %xmm7
  458. mulpd %xmm10, %xmm6
  459. addpd %xmm6, %xmm3
  460. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  461. mulpd %xmm11, %xmm5
  462. SUBPD %xmm5, %xmm2
  463. mulpd %xmm11, %xmm7
  464. SUBPD %xmm7, %xmm3
  465. pshufd $0x4e, %xmm4, %xmm5
  466. mulpd %xmm12, %xmm4
  467. addpd %xmm4, %xmm0
  468. MOVUPS_A1(-12 * SIZE, A2, %xmm4)
  469. pshufd $0x4e, %xmm6, %xmm7
  470. mulpd %xmm12, %xmm6
  471. addpd %xmm6, %xmm1
  472. MOVUPS_A1(-10 * SIZE, A2, %xmm6)
  473. mulpd %xmm13, %xmm5
  474. SUBPD %xmm5, %xmm0
  475. mulpd %xmm13, %xmm7
  476. SUBPD %xmm7, %xmm1
  477. pshufd $0x4e, %xmm4, %xmm5
  478. mulpd %xmm12, %xmm4
  479. addpd %xmm4, %xmm2
  480. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
  481. pshufd $0x4e, %xmm6, %xmm7
  482. mulpd %xmm12, %xmm6
  483. addpd %xmm6, %xmm3
  484. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)
  485. mulpd %xmm13, %xmm5
  486. SUBPD %xmm5, %xmm2
  487. mulpd %xmm13, %xmm7
  488. SUBPD %xmm7, %xmm3
  489. pshufd $0x4e, %xmm4, %xmm5
  490. mulpd %xmm14, %xmm4
  491. addpd %xmm4, %xmm0
  492. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4)
  493. pshufd $0x4e, %xmm6, %xmm7
  494. mulpd %xmm14, %xmm6
  495. addpd %xmm6, %xmm1
  496. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6)
  497. mulpd %xmm15, %xmm5
  498. SUBPD %xmm5, %xmm0
  499. mulpd %xmm15, %xmm7
  500. SUBPD %xmm7, %xmm1
  501. pshufd $0x4e, %xmm4, %xmm5
  502. mulpd %xmm14, %xmm4
  503. addpd %xmm4, %xmm2
  504. pshufd $0x4e, %xmm6, %xmm7
  505. mulpd %xmm14, %xmm6
  506. addpd %xmm6, %xmm3
  507. mulpd %xmm15, %xmm5
  508. SUBPD %xmm5, %xmm2
  509. mulpd %xmm15, %xmm7
  510. SUBPD %xmm7, %xmm3
  511. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  512. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  513. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  514. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  515. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  516. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  517. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  518. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  519. subq $-8 * SIZE, A1
  520. subq $-8 * SIZE, A2
  521. subq $-8 * SIZE, Y1
  522. ALIGN_3
  523. .L15:
  524. testq $2, M
  525. je .L17
  526. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  527. MOVUPS_A1(-14 * SIZE, A1, %xmm6)
  528. pshufd $0x4e, %xmm4, %xmm5
  529. mulpd %xmm8, %xmm4
  530. addpd %xmm4, %xmm0
  531. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
  532. pshufd $0x4e, %xmm6, %xmm7
  533. mulpd %xmm8, %xmm6
  534. addpd %xmm6, %xmm1
  535. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6)
  536. mulpd %xmm9, %xmm5
  537. SUBPD %xmm5, %xmm0
  538. mulpd %xmm9, %xmm7
  539. SUBPD %xmm7, %xmm1
  540. pshufd $0x4e, %xmm4, %xmm5
  541. mulpd %xmm10, %xmm4
  542. addpd %xmm4, %xmm0
  543. MOVUPS_A1(-16 * SIZE, A2, %xmm4)
  544. pshufd $0x4e, %xmm6, %xmm7
  545. mulpd %xmm10, %xmm6
  546. addpd %xmm6, %xmm1
  547. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  548. mulpd %xmm11, %xmm5
  549. SUBPD %xmm5, %xmm0
  550. mulpd %xmm11, %xmm7
  551. SUBPD %xmm7, %xmm1
  552. pshufd $0x4e, %xmm4, %xmm5
  553. mulpd %xmm12, %xmm4
  554. addpd %xmm4, %xmm0
  555. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
  556. pshufd $0x4e, %xmm6, %xmm7
  557. mulpd %xmm12, %xmm6
  558. addpd %xmm6, %xmm1
  559. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6)
  560. mulpd %xmm13, %xmm5
  561. SUBPD %xmm5, %xmm0
  562. mulpd %xmm13, %xmm7
  563. SUBPD %xmm7, %xmm1
  564. pshufd $0x4e, %xmm4, %xmm5
  565. mulpd %xmm14, %xmm4
  566. addpd %xmm4, %xmm0
  567. mulpd %xmm15, %xmm5
  568. SUBPD %xmm5, %xmm0
  569. pshufd $0x4e, %xmm6, %xmm7
  570. mulpd %xmm14, %xmm6
  571. addpd %xmm6, %xmm1
  572. mulpd %xmm15, %xmm7
  573. SUBPD %xmm7, %xmm1
  574. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  575. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  576. movapd %xmm2, %xmm0
  577. addq $4 * SIZE, A1
  578. addq $4 * SIZE, A2
  579. addq $4 * SIZE, Y1
  580. ALIGN_3
  581. .L17:
  582. testq $1, M
  583. je .L19
  584. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  585. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6)
  586. pshufd $0x4e, %xmm4, %xmm5
  587. mulpd %xmm8, %xmm4
  588. addpd %xmm4, %xmm0
  589. MOVUPS_A1(-16 * SIZE, A2, %xmm4)
  590. mulpd %xmm9, %xmm5
  591. SUBPD %xmm5, %xmm0
  592. pshufd $0x4e, %xmm6, %xmm7
  593. mulpd %xmm10, %xmm6
  594. addpd %xmm6, %xmm0
  595. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6)
  596. mulpd %xmm11, %xmm7
  597. SUBPD %xmm7, %xmm0
  598. pshufd $0x4e, %xmm4, %xmm5
  599. mulpd %xmm12, %xmm4
  600. addpd %xmm4, %xmm0
  601. mulpd %xmm13, %xmm5
  602. SUBPD %xmm5, %xmm0
  603. pshufd $0x4e, %xmm6, %xmm7
  604. mulpd %xmm14, %xmm6
  605. addpd %xmm6, %xmm0
  606. mulpd %xmm15, %xmm7
  607. SUBPD %xmm7, %xmm0
  608. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  609. ALIGN_3
  610. .L19:
  611. cmpq $4, N
  612. jge .L11
  613. ALIGN_3
  614. .L20:
  615. #endif
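/* Annotation (added): the remaining columns are handled below; .L21 processes
   two columns per pass and .L31 a single column, repeating the same alpha * x
   preparation and complex multiply-accumulate pattern. */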
  616. #if GEMV_UNROLL >= 2
  617. cmpq $2, N
  618. jl .L30
  619. #if GEMV_UNROLL == 2
  620. ALIGN_3
  621. .L21:
  622. #endif
  623. subq $2, N
  624. leaq 16 * SIZE(BUFFER), Y1
  625. movq A, A1
  626. leaq (A, LDA, 1), A2
  627. leaq (A, LDA, 2), A
  628. movsd 0 * SIZE(X), %xmm12
  629. movhpd 1 * SIZE(X), %xmm12
  630. addq INCX, X
  631. movsd 0 * SIZE(X), %xmm14
  632. movhpd 1 * SIZE(X), %xmm14
  633. addq INCX, X
  634. pcmpeqb %xmm11, %xmm11
  635. psllq $63, %xmm11
  636. shufps $0xc0, %xmm11, %xmm11
  637. pshufd $0x4e, %xmm12, %xmm13
  638. pshufd $0x4e, %xmm14, %xmm15
  639. #ifdef HAVE_SSE3
  640. movddup ALPHA_R, %xmm8
  641. movddup ALPHA_I, %xmm9
  642. #else
  643. movsd ALPHA_R, %xmm8
  644. unpcklpd %xmm8, %xmm8
  645. movsd ALPHA_I, %xmm9
  646. unpcklpd %xmm9, %xmm9
  647. #endif
  648. xorpd %xmm11, %xmm13
  649. xorpd %xmm11, %xmm15
  650. mulpd %xmm8, %xmm12
  651. mulpd %xmm9, %xmm13
  652. mulpd %xmm8, %xmm14
  653. mulpd %xmm9, %xmm15
  654. #ifndef XCONJ
  655. subpd %xmm13, %xmm12
  656. subpd %xmm15, %xmm14
  657. #else
  658. addpd %xmm13, %xmm12
  659. addpd %xmm15, %xmm14
  660. #endif
  661. pshufd $0xee, %xmm12, %xmm13
  662. pshufd $0x44, %xmm12, %xmm12
  663. pshufd $0xee, %xmm14, %xmm15
  664. pshufd $0x44, %xmm14, %xmm14
  665. #ifndef CONJ
  666. xorpd %xmm11, %xmm13
  667. xorpd %xmm11, %xmm15
  668. #else
  669. xorpd %xmm11, %xmm12
  670. xorpd %xmm11, %xmm14
  671. #endif
  672. MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
  673. MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
  674. MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
  675. MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
  676. ALIGN_3
  677. movq M, I
  678. sarq $2, I
  679. jle .L25
  680. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  681. MOVUPS_A1(-14 * SIZE, A1, %xmm6)
  682. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  683. MOVUPS_A1(-10 * SIZE, A1, %xmm10)
  684. decq I
  685. jle .L24
  686. ALIGN_3
  687. .L23:
  688. #ifdef PREFETCH
  689. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  690. #endif
  691. pshufd $0x4e, %xmm4, %xmm5
  692. mulpd %xmm12, %xmm4
  693. addpd %xmm4, %xmm0
  694. MOVUPS_A1(-16 * SIZE, A2, %xmm4)
  695. pshufd $0x4e, %xmm6, %xmm7
  696. mulpd %xmm12, %xmm6
  697. addpd %xmm6, %xmm1
  698. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  699. pshufd $0x4e, %xmm8, %xmm9
  700. mulpd %xmm12, %xmm8
  701. addpd %xmm8, %xmm2
  702. MOVUPS_A1(-12 * SIZE, A2, %xmm8)
  703. pshufd $0x4e, %xmm10, %xmm11
  704. mulpd %xmm12, %xmm10
  705. addpd %xmm10, %xmm3
  706. MOVUPS_A1(-10 * SIZE, A2, %xmm10)
  707. mulpd %xmm13, %xmm5
  708. SUBPD %xmm5, %xmm0
  709. mulpd %xmm13, %xmm7
  710. SUBPD %xmm7, %xmm1
  711. mulpd %xmm13, %xmm9
  712. SUBPD %xmm9, %xmm2
  713. mulpd %xmm13, %xmm11
  714. SUBPD %xmm11, %xmm3
  715. #ifdef PREFETCH
  716. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  717. #endif
  718. pshufd $0x4e, %xmm4, %xmm5
  719. mulpd %xmm14, %xmm4
  720. addpd %xmm4, %xmm0
  721. MOVUPS_A1( -8 * SIZE, A1, %xmm4)
  722. pshufd $0x4e, %xmm6, %xmm7
  723. mulpd %xmm14, %xmm6
  724. addpd %xmm6, %xmm1
  725. MOVUPS_A1( -6 * SIZE, A1, %xmm6)
  726. pshufd $0x4e, %xmm8, %xmm9
  727. mulpd %xmm14, %xmm8
  728. addpd %xmm8, %xmm2
  729. MOVUPS_A1( -4 * SIZE, A1, %xmm8)
  730. pshufd $0x4e, %xmm10, %xmm11
  731. mulpd %xmm14, %xmm10
  732. addpd %xmm10, %xmm3
  733. MOVUPS_A1( -2 * SIZE, A1, %xmm10)
  734. mulpd %xmm15, %xmm5
  735. SUBPD %xmm5, %xmm0
  736. mulpd %xmm15, %xmm7
  737. SUBPD %xmm7, %xmm1
  738. mulpd %xmm15, %xmm9
  739. SUBPD %xmm9, %xmm2
  740. mulpd %xmm15, %xmm11
  741. SUBPD %xmm11, %xmm3
  742. #ifdef PREFETCHW
  743. PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
  744. #endif
  745. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  746. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  747. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  748. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  749. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  750. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  751. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  752. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  753. subq $-8 * SIZE, A1
  754. subq $-8 * SIZE, A2
  755. subq $-8 * SIZE, Y1
  756. subq $1, I
  757. BRANCH
  758. jg .L23
  759. ALIGN_3
  760. .L24:
  761. pshufd $0x4e, %xmm4, %xmm5
  762. mulpd %xmm12, %xmm4
  763. addpd %xmm4, %xmm0
  764. MOVUPS_A1(-16 * SIZE, A2, %xmm4)
  765. pshufd $0x4e, %xmm6, %xmm7
  766. mulpd %xmm12, %xmm6
  767. addpd %xmm6, %xmm1
  768. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  769. pshufd $0x4e, %xmm8, %xmm9
  770. mulpd %xmm12, %xmm8
  771. addpd %xmm8, %xmm2
  772. MOVUPS_A1(-12 * SIZE, A2, %xmm8)
  773. pshufd $0x4e, %xmm10, %xmm11
  774. mulpd %xmm12, %xmm10
  775. addpd %xmm10, %xmm3
  776. MOVUPS_A1(-10 * SIZE, A2, %xmm10)
  777. mulpd %xmm13, %xmm5
  778. SUBPD %xmm5, %xmm0
  779. mulpd %xmm13, %xmm7
  780. SUBPD %xmm7, %xmm1
  781. mulpd %xmm13, %xmm9
  782. SUBPD %xmm9, %xmm2
  783. mulpd %xmm13, %xmm11
  784. SUBPD %xmm11, %xmm3
  785. pshufd $0x4e, %xmm4, %xmm5
  786. mulpd %xmm14, %xmm4
  787. addpd %xmm4, %xmm0
  788. pshufd $0x4e, %xmm6, %xmm7
  789. mulpd %xmm14, %xmm6
  790. addpd %xmm6, %xmm1
  791. pshufd $0x4e, %xmm8, %xmm9
  792. mulpd %xmm14, %xmm8
  793. addpd %xmm8, %xmm2
  794. pshufd $0x4e, %xmm10, %xmm11
  795. mulpd %xmm14, %xmm10
  796. addpd %xmm10, %xmm3
  797. mulpd %xmm15, %xmm5
  798. SUBPD %xmm5, %xmm0
  799. mulpd %xmm15, %xmm7
  800. SUBPD %xmm7, %xmm1
  801. mulpd %xmm15, %xmm9
  802. SUBPD %xmm9, %xmm2
  803. mulpd %xmm15, %xmm11
  804. SUBPD %xmm11, %xmm3
  805. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  806. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  807. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  808. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  809. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  810. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  811. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  812. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  813. subq $-8 * SIZE, A1
  814. subq $-8 * SIZE, A2
  815. subq $-8 * SIZE, Y1
  816. ALIGN_3
  817. .L25:
  818. testq $2, M
  819. je .L27
  820. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  821. MOVUPS_A1(-14 * SIZE, A1, %xmm6)
  822. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  823. MOVUPS_A1(-14 * SIZE, A2, %xmm10)
  824. pshufd $0x4e, %xmm4, %xmm5
  825. mulpd %xmm12, %xmm4
  826. addpd %xmm4, %xmm0
  827. pshufd $0x4e, %xmm6, %xmm7
  828. mulpd %xmm12, %xmm6
  829. addpd %xmm6, %xmm1
  830. mulpd %xmm13, %xmm5
  831. SUBPD %xmm5, %xmm0
  832. mulpd %xmm13, %xmm7
  833. SUBPD %xmm7, %xmm1
  834. pshufd $0x4e, %xmm8, %xmm9
  835. mulpd %xmm14, %xmm8
  836. addpd %xmm8, %xmm0
  837. pshufd $0x4e, %xmm10, %xmm11
  838. mulpd %xmm14, %xmm10
  839. addpd %xmm10, %xmm1
  840. mulpd %xmm15, %xmm9
  841. SUBPD %xmm9, %xmm0
  842. mulpd %xmm15, %xmm11
  843. SUBPD %xmm11, %xmm1
  844. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  845. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  846. movapd %xmm2, %xmm0
  847. addq $4 * SIZE, A1
  848. addq $4 * SIZE, A2
  849. addq $4 * SIZE, Y1
  850. ALIGN_3
  851. .L27:
  852. testq $1, M
  853. #if GEMV_UNROLL == 2
  854. je .L29
  855. #else
  856. je .L30
  857. #endif
  858. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  859. MOVUPS_A1(-16 * SIZE, A2, %xmm6)
  860. pshufd $0x4e, %xmm4, %xmm5
  861. mulpd %xmm12, %xmm4
  862. addpd %xmm4, %xmm0
  863. mulpd %xmm13, %xmm5
  864. SUBPD %xmm5, %xmm0
  865. pshufd $0x4e, %xmm6, %xmm7
  866. mulpd %xmm14, %xmm6
  867. addpd %xmm6, %xmm0
  868. mulpd %xmm15, %xmm7
  869. SUBPD %xmm7, %xmm0
  870. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  871. #if GEMV_UNROLL == 2
  872. ALIGN_3
  873. .L29:
  874. cmpq $2, N
  875. jge .L21
  876. #endif
  877. ALIGN_3
  878. .L30:
  879. #endif
  880. cmpq $1, N
  881. jl .L980
  882. #if GEMV_UNROLL == 1
  883. .L31:
  884. decq N
  885. #endif
  886. leaq 16 * SIZE(BUFFER), Y1
  887. movq A, A1
  888. #if GEMV_UNROLL == 1
  889. addq LDA, A
  890. #endif
  891. movsd 0 * SIZE(X), %xmm12
  892. movhpd 1 * SIZE(X), %xmm12
  893. addq INCX, X
  894. pcmpeqb %xmm11, %xmm11
  895. psllq $63, %xmm11
  896. shufps $0xc0, %xmm11, %xmm11
  897. pshufd $0x4e, %xmm12, %xmm13
  898. #ifdef HAVE_SSE3
  899. movddup ALPHA_R, %xmm8
  900. movddup ALPHA_I, %xmm9
  901. #else
  902. movsd ALPHA_R, %xmm8
  903. unpcklpd %xmm8, %xmm8
  904. movsd ALPHA_I, %xmm9
  905. unpcklpd %xmm9, %xmm9
  906. #endif
  907. xorpd %xmm11, %xmm13
  908. mulpd %xmm8, %xmm12
  909. mulpd %xmm9, %xmm13
  910. #ifndef XCONJ
  911. subpd %xmm13, %xmm12
  912. #else
  913. addpd %xmm13, %xmm12
  914. #endif
  915. pshufd $0xee, %xmm12, %xmm13
  916. pshufd $0x44, %xmm12, %xmm12
  917. #ifndef CONJ
  918. xorpd %xmm11, %xmm13
  919. #else
  920. xorpd %xmm11, %xmm12
  921. #endif
  922. MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
  923. MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
  924. MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
  925. MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
  926. movq M, I
  927. sarq $2, I
  928. jle .L35
  929. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  930. MOVUPS_A1(-14 * SIZE, A1, %xmm6)
  931. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  932. MOVUPS_A1(-10 * SIZE, A1, %xmm10)
  933. decq I
  934. jle .L34
  935. ALIGN_3
  936. .L33:
  937. #ifdef PREFETCH
  938. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  939. #endif
  940. pshufd $0x4e, %xmm4, %xmm5
  941. mulpd %xmm12, %xmm4
  942. addpd %xmm4, %xmm0
  943. MOVUPS_A1( -8 * SIZE, A1, %xmm4)
  944. pshufd $0x4e, %xmm6, %xmm7
  945. mulpd %xmm12, %xmm6
  946. addpd %xmm6, %xmm1
  947. MOVUPS_A1( -6 * SIZE, A1, %xmm6)
  948. pshufd $0x4e, %xmm8, %xmm9
  949. mulpd %xmm12, %xmm8
  950. addpd %xmm8, %xmm2
  951. MOVUPS_A1( -4 * SIZE, A1, %xmm8)
  952. pshufd $0x4e, %xmm10, %xmm11
  953. mulpd %xmm12, %xmm10
  954. addpd %xmm10, %xmm3
  955. MOVUPS_A1( -2 * SIZE, A1, %xmm10)
  956. mulpd %xmm13, %xmm5
  957. SUBPD %xmm5, %xmm0
  958. mulpd %xmm13, %xmm7
  959. SUBPD %xmm7, %xmm1
  960. mulpd %xmm13, %xmm9
  961. SUBPD %xmm9, %xmm2
  962. mulpd %xmm13, %xmm11
  963. SUBPD %xmm11, %xmm3
  964. #ifdef PREFETCHW
  965. PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
  966. #endif
  967. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  968. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  969. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  970. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  971. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  972. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  973. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  974. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  975. subq $-8 * SIZE, A1
  976. subq $-8 * SIZE, Y1
  977. subq $1, I
  978. BRANCH
  979. jg .L33
  980. ALIGN_3
  981. .L34:
  982. pshufd $0x4e, %xmm4, %xmm5
  983. mulpd %xmm12, %xmm4
  984. addpd %xmm4, %xmm0
  985. pshufd $0x4e, %xmm6, %xmm7
  986. mulpd %xmm12, %xmm6
  987. addpd %xmm6, %xmm1
  988. pshufd $0x4e, %xmm8, %xmm9
  989. mulpd %xmm12, %xmm8
  990. addpd %xmm8, %xmm2
  991. pshufd $0x4e, %xmm10, %xmm11
  992. mulpd %xmm12, %xmm10
  993. addpd %xmm10, %xmm3
  994. mulpd %xmm13, %xmm5
  995. SUBPD %xmm5, %xmm0
  996. mulpd %xmm13, %xmm7
  997. SUBPD %xmm7, %xmm1
  998. mulpd %xmm13, %xmm9
  999. SUBPD %xmm9, %xmm2
  1000. mulpd %xmm13, %xmm11
  1001. SUBPD %xmm11, %xmm3
  1002. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1003. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  1004. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  1005. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  1006. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  1007. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  1008. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  1009. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  1010. subq $-8 * SIZE, A1
  1011. subq $-8 * SIZE, Y1
  1012. ALIGN_3
  1013. .L35:
  1014. testq $2, M
  1015. je .L37
  1016. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  1017. MOVUPS_A1(-14 * SIZE, A1, %xmm6)
  1018. pshufd $0x4e, %xmm4, %xmm5
  1019. mulpd %xmm12, %xmm4
  1020. addpd %xmm4, %xmm0
  1021. pshufd $0x4e, %xmm6, %xmm7
  1022. mulpd %xmm12, %xmm6
  1023. addpd %xmm6, %xmm1
  1024. mulpd %xmm13, %xmm5
  1025. SUBPD %xmm5, %xmm0
  1026. mulpd %xmm13, %xmm7
  1027. SUBPD %xmm7, %xmm1
  1028. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1029. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  1030. movapd %xmm2, %xmm0
  1031. addq $4 * SIZE, A1
  1032. addq $4 * SIZE, Y1
  1033. ALIGN_3
  1034. .L37:
  1035. testq $1, M
  1036. #if GEMV_UNROLL == 1
  1037. je .L39
  1038. #else
  1039. je .L980
  1040. #endif
  1041. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  1042. pshufd $0x4e, %xmm4, %xmm5
  1043. mulpd %xmm12, %xmm4
  1044. addpd %xmm4, %xmm0
  1045. mulpd %xmm13, %xmm5
  1046. SUBPD %xmm5, %xmm0
  1047. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1048. #if GEMV_UNROLL == 1
  1049. ALIGN_3
  1050. .L39:
  1051. cmpq $1, N
  1052. jge .L31
  1053. #endif
  1054. #ifdef ALIGNED_ACCESS
  1055. jmp .L980
  1056. ALIGN_3
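/* Annotation (added): .L100 is the unaligned-A variant of the code above; it
   uses the same column blocking (.L101 handles four columns at a time, .L111
   two) but loads A with movsd/movhpd pairs rather than the MOVUPS_A* macros. */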
  1057. .L100:
  1058. #if GEMV_UNROLL >= 4
  1059. cmpq $4, N
  1060. jl .L110
  1061. ALIGN_3
  1062. .L101:
  1063. subq $4, N
  1064. leaq 16 * SIZE(BUFFER), Y1
  1065. movq A, A1
  1066. leaq (A, LDA, 2), A2
  1067. leaq (A, LDA, 4), A
  1068. movsd 0 * SIZE(X), %xmm8
  1069. movhpd 1 * SIZE(X), %xmm8
  1070. addq INCX, X
  1071. movsd 0 * SIZE(X), %xmm10
  1072. movhpd 1 * SIZE(X), %xmm10
  1073. addq INCX, X
  1074. movsd 0 * SIZE(X), %xmm12
  1075. movhpd 1 * SIZE(X), %xmm12
  1076. addq INCX, X
  1077. movsd 0 * SIZE(X), %xmm14
  1078. movhpd 1 * SIZE(X), %xmm14
  1079. addq INCX, X
  1080. pcmpeqb %xmm5, %xmm5
  1081. psllq $63, %xmm5
  1082. shufps $0xc0, %xmm5, %xmm5
  1083. pshufd $0x4e, %xmm8, %xmm9
  1084. pshufd $0x4e, %xmm10, %xmm11
  1085. pshufd $0x4e, %xmm12, %xmm13
  1086. pshufd $0x4e, %xmm14, %xmm15
  1087. #ifdef HAVE_SSE3
  1088. movddup ALPHA_R, %xmm6
  1089. movddup ALPHA_I, %xmm7
  1090. #else
  1091. movsd ALPHA_R, %xmm6
  1092. unpcklpd %xmm6, %xmm6
  1093. movsd ALPHA_I, %xmm7
  1094. unpcklpd %xmm7, %xmm7
  1095. #endif
  1096. xorpd %xmm5, %xmm9
  1097. xorpd %xmm5, %xmm11
  1098. xorpd %xmm5, %xmm13
  1099. xorpd %xmm5, %xmm15
  1100. mulpd %xmm6, %xmm8
  1101. mulpd %xmm7, %xmm9
  1102. mulpd %xmm6, %xmm10
  1103. mulpd %xmm7, %xmm11
  1104. mulpd %xmm6, %xmm12
  1105. mulpd %xmm7, %xmm13
  1106. mulpd %xmm6, %xmm14
  1107. mulpd %xmm7, %xmm15
  1108. #ifndef XCONJ
  1109. subpd %xmm9, %xmm8
  1110. subpd %xmm11, %xmm10
  1111. subpd %xmm13, %xmm12
  1112. subpd %xmm15, %xmm14
  1113. #else
  1114. addpd %xmm9, %xmm8
  1115. addpd %xmm11, %xmm10
  1116. addpd %xmm13, %xmm12
  1117. addpd %xmm15, %xmm14
  1118. #endif
  1119. pshufd $0xee, %xmm8, %xmm9
  1120. pshufd $0x44, %xmm8, %xmm8
  1121. pshufd $0xee, %xmm10, %xmm11
  1122. pshufd $0x44, %xmm10, %xmm10
  1123. pshufd $0xee, %xmm12, %xmm13
  1124. pshufd $0x44, %xmm12, %xmm12
  1125. pshufd $0xee, %xmm14, %xmm15
  1126. pshufd $0x44, %xmm14, %xmm14
  1127. #ifndef CONJ
  1128. xorpd %xmm5, %xmm9
  1129. xorpd %xmm5, %xmm11
  1130. xorpd %xmm5, %xmm13
  1131. xorpd %xmm5, %xmm15
  1132. #else
  1133. xorpd %xmm5, %xmm8
  1134. xorpd %xmm5, %xmm10
  1135. xorpd %xmm5, %xmm12
  1136. xorpd %xmm5, %xmm14
  1137. #endif
  1138. MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
  1139. MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
  1140. MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
  1141. MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
  1142. ALIGN_3
  1143. movq M, I
  1144. sarq $2, I
  1145. jle .L105
  1146. movsd -16 * SIZE(A1), %xmm4
  1147. movhpd -15 * SIZE(A1), %xmm4
  1148. movsd -14 * SIZE(A1), %xmm6
  1149. movhpd -13 * SIZE(A1), %xmm6
  1150. decq I
  1151. jle .L104
  1152. ALIGN_3
  1153. .L103:
  1154. #ifdef PREFETCH
  1155. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  1156. #endif
  1157. pshufd $0x4e, %xmm4, %xmm5
  1158. mulpd %xmm8, %xmm4
  1159. addpd %xmm4, %xmm0
  1160. movsd -12 * SIZE(A1), %xmm4
  1161. movhpd -11 * SIZE(A1), %xmm4
  1162. pshufd $0x4e, %xmm6, %xmm7
  1163. mulpd %xmm8, %xmm6
  1164. addpd %xmm6, %xmm1
  1165. movsd -10 * SIZE(A1), %xmm6
  1166. movhpd -9 * SIZE(A1), %xmm6
  1167. mulpd %xmm9, %xmm5
  1168. SUBPD %xmm5, %xmm0
  1169. mulpd %xmm9, %xmm7
  1170. SUBPD %xmm7, %xmm1
  1171. pshufd $0x4e, %xmm4, %xmm5
  1172. mulpd %xmm8, %xmm4
  1173. addpd %xmm4, %xmm2
  1174. movsd -16 * SIZE(A1, LDA), %xmm4
  1175. movhpd -15 * SIZE(A1, LDA), %xmm4
  1176. pshufd $0x4e, %xmm6, %xmm7
  1177. mulpd %xmm8, %xmm6
  1178. addpd %xmm6, %xmm3
  1179. movsd -14 * SIZE(A1, LDA), %xmm6
  1180. movhpd -13 * SIZE(A1, LDA), %xmm6
  1181. mulpd %xmm9, %xmm5
  1182. SUBPD %xmm5, %xmm2
  1183. mulpd %xmm9, %xmm7
  1184. SUBPD %xmm7, %xmm3
  1185. #ifdef PREFETCH
  1186. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  1187. #endif
  1188. pshufd $0x4e, %xmm4, %xmm5
  1189. mulpd %xmm10, %xmm4
  1190. addpd %xmm4, %xmm0
  1191. movsd -12 * SIZE(A1, LDA), %xmm4
  1192. movhpd -11 * SIZE(A1, LDA), %xmm4
  1193. pshufd $0x4e, %xmm6, %xmm7
  1194. mulpd %xmm10, %xmm6
  1195. addpd %xmm6, %xmm1
  1196. movsd -10 * SIZE(A1, LDA), %xmm6
  1197. movhpd -9 * SIZE(A1, LDA), %xmm6
  1198. mulpd %xmm11, %xmm5
  1199. SUBPD %xmm5, %xmm0
  1200. mulpd %xmm11, %xmm7
  1201. SUBPD %xmm7, %xmm1
  1202. pshufd $0x4e, %xmm4, %xmm5
  1203. mulpd %xmm10, %xmm4
  1204. addpd %xmm4, %xmm2
  1205. movsd -16 * SIZE(A2), %xmm4
  1206. movhpd -15 * SIZE(A2), %xmm4
  1207. pshufd $0x4e, %xmm6, %xmm7
  1208. mulpd %xmm10, %xmm6
  1209. addpd %xmm6, %xmm3
  1210. movsd -14 * SIZE(A2), %xmm6
  1211. movhpd -13 * SIZE(A2), %xmm6
  1212. mulpd %xmm11, %xmm5
  1213. SUBPD %xmm5, %xmm2
  1214. mulpd %xmm11, %xmm7
  1215. SUBPD %xmm7, %xmm3
  1216. #ifdef PREFETCH
  1217. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  1218. #endif
  1219. pshufd $0x4e, %xmm4, %xmm5
  1220. mulpd %xmm12, %xmm4
  1221. addpd %xmm4, %xmm0
  1222. movsd -12 * SIZE(A2), %xmm4
  1223. movhpd -11 * SIZE(A2), %xmm4
  1224. pshufd $0x4e, %xmm6, %xmm7
  1225. mulpd %xmm12, %xmm6
  1226. addpd %xmm6, %xmm1
  1227. movsd -10 * SIZE(A2), %xmm6
  1228. movhpd -9 * SIZE(A2), %xmm6
  1229. mulpd %xmm13, %xmm5
  1230. SUBPD %xmm5, %xmm0
  1231. mulpd %xmm13, %xmm7
  1232. SUBPD %xmm7, %xmm1
  1233. pshufd $0x4e, %xmm4, %xmm5
  1234. mulpd %xmm12, %xmm4
  1235. addpd %xmm4, %xmm2
  1236. movsd -16 * SIZE(A2, LDA), %xmm4
  1237. movhpd -15 * SIZE(A2, LDA), %xmm4
  1238. pshufd $0x4e, %xmm6, %xmm7
  1239. mulpd %xmm12, %xmm6
  1240. addpd %xmm6, %xmm3
  1241. movsd -14 * SIZE(A2, LDA), %xmm6
  1242. movhpd -13 * SIZE(A2, LDA), %xmm6
  1243. mulpd %xmm13, %xmm5
  1244. SUBPD %xmm5, %xmm2
  1245. mulpd %xmm13, %xmm7
  1246. SUBPD %xmm7, %xmm3
  1247. #ifdef PREFETCH
  1248. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  1249. #endif
  1250. pshufd $0x4e, %xmm4, %xmm5
  1251. mulpd %xmm14, %xmm4
  1252. addpd %xmm4, %xmm0
  1253. movsd -12 * SIZE(A2, LDA), %xmm4
  1254. movhpd -11 * SIZE(A2, LDA), %xmm4
  1255. pshufd $0x4e, %xmm6, %xmm7
  1256. mulpd %xmm14, %xmm6
  1257. addpd %xmm6, %xmm1
  1258. movsd -10 * SIZE(A2, LDA), %xmm6
  1259. movhpd -9 * SIZE(A2, LDA), %xmm6
  1260. mulpd %xmm15, %xmm5
  1261. SUBPD %xmm5, %xmm0
  1262. mulpd %xmm15, %xmm7
  1263. SUBPD %xmm7, %xmm1
  1264. pshufd $0x4e, %xmm4, %xmm5
  1265. mulpd %xmm14, %xmm4
  1266. addpd %xmm4, %xmm2
  1267. movsd -8 * SIZE(A1), %xmm4
  1268. movhpd -7 * SIZE(A1), %xmm4
  1269. pshufd $0x4e, %xmm6, %xmm7
  1270. mulpd %xmm14, %xmm6
  1271. addpd %xmm6, %xmm3
  1272. movsd -6 * SIZE(A1), %xmm6
  1273. movhpd -5 * SIZE(A1), %xmm6
  1274. mulpd %xmm15, %xmm5
  1275. SUBPD %xmm5, %xmm2
  1276. mulpd %xmm15, %xmm7
  1277. SUBPD %xmm7, %xmm3
  1278. #ifdef PREFETCHW
  1279. PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
  1280. #endif
  1281. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1282. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  1283. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  1284. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  1285. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  1286. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  1287. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  1288. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  1289. subq $-8 * SIZE, A1
  1290. subq $-8 * SIZE, A2
  1291. subq $-8 * SIZE, Y1
  1292. subq $1, I
  1293. BRANCH
  1294. jg .L103
  1295. ALIGN_3
  1296. .L104:
  1297. pshufd $0x4e, %xmm4, %xmm5
  1298. mulpd %xmm8, %xmm4
  1299. addpd %xmm4, %xmm0
  1300. movsd -12 * SIZE(A1), %xmm4
  1301. movhpd -11 * SIZE(A1), %xmm4
  1302. pshufd $0x4e, %xmm6, %xmm7
  1303. mulpd %xmm8, %xmm6
  1304. addpd %xmm6, %xmm1
  1305. movsd -10 * SIZE(A1), %xmm6
  1306. movhpd -9 * SIZE(A1), %xmm6
  1307. mulpd %xmm9, %xmm5
  1308. SUBPD %xmm5, %xmm0
  1309. mulpd %xmm9, %xmm7
  1310. SUBPD %xmm7, %xmm1
  1311. pshufd $0x4e, %xmm4, %xmm5
  1312. mulpd %xmm8, %xmm4
  1313. addpd %xmm4, %xmm2
  1314. movsd -16 * SIZE(A1, LDA), %xmm4
  1315. movhpd -15 * SIZE(A1, LDA), %xmm4
  1316. pshufd $0x4e, %xmm6, %xmm7
  1317. mulpd %xmm8, %xmm6
  1318. addpd %xmm6, %xmm3
  1319. movsd -14 * SIZE(A1, LDA), %xmm6
  1320. movhpd -13 * SIZE(A1, LDA), %xmm6
  1321. mulpd %xmm9, %xmm5
  1322. SUBPD %xmm5, %xmm2
  1323. mulpd %xmm9, %xmm7
  1324. SUBPD %xmm7, %xmm3
  1325. pshufd $0x4e, %xmm4, %xmm5
  1326. mulpd %xmm10, %xmm4
  1327. addpd %xmm4, %xmm0
  1328. movsd -12 * SIZE(A1, LDA), %xmm4
  1329. movhpd -11 * SIZE(A1, LDA), %xmm4
  1330. pshufd $0x4e, %xmm6, %xmm7
  1331. mulpd %xmm10, %xmm6
  1332. addpd %xmm6, %xmm1
  1333. movsd -10 * SIZE(A1, LDA), %xmm6
  1334. movhpd -9 * SIZE(A1, LDA), %xmm6
  1335. mulpd %xmm11, %xmm5
  1336. SUBPD %xmm5, %xmm0
  1337. mulpd %xmm11, %xmm7
  1338. SUBPD %xmm7, %xmm1
  1339. pshufd $0x4e, %xmm4, %xmm5
  1340. mulpd %xmm10, %xmm4
  1341. addpd %xmm4, %xmm2
  1342. movsd -16 * SIZE(A2), %xmm4
  1343. movhpd -15 * SIZE(A2), %xmm4
  1344. pshufd $0x4e, %xmm6, %xmm7
  1345. mulpd %xmm10, %xmm6
  1346. addpd %xmm6, %xmm3
  1347. movsd -14 * SIZE(A2), %xmm6
  1348. movhpd -13 * SIZE(A2), %xmm6
  1349. mulpd %xmm11, %xmm5
  1350. SUBPD %xmm5, %xmm2
  1351. mulpd %xmm11, %xmm7
  1352. SUBPD %xmm7, %xmm3
  1353. pshufd $0x4e, %xmm4, %xmm5
  1354. mulpd %xmm12, %xmm4
  1355. addpd %xmm4, %xmm0
  1356. movsd -12 * SIZE(A2), %xmm4
  1357. movhpd -11 * SIZE(A2), %xmm4
  1358. pshufd $0x4e, %xmm6, %xmm7
  1359. mulpd %xmm12, %xmm6
  1360. addpd %xmm6, %xmm1
  1361. movsd -10 * SIZE(A2), %xmm6
  1362. movhpd -9 * SIZE(A2), %xmm6
  1363. mulpd %xmm13, %xmm5
  1364. SUBPD %xmm5, %xmm0
  1365. mulpd %xmm13, %xmm7
  1366. SUBPD %xmm7, %xmm1
  1367. pshufd $0x4e, %xmm4, %xmm5
  1368. mulpd %xmm12, %xmm4
  1369. addpd %xmm4, %xmm2
  1370. movsd -16 * SIZE(A2, LDA), %xmm4
  1371. movhpd -15 * SIZE(A2, LDA), %xmm4
  1372. pshufd $0x4e, %xmm6, %xmm7
  1373. mulpd %xmm12, %xmm6
  1374. addpd %xmm6, %xmm3
  1375. movsd -14 * SIZE(A2, LDA), %xmm6
  1376. movhpd -13 * SIZE(A2, LDA), %xmm6
  1377. mulpd %xmm13, %xmm5
  1378. SUBPD %xmm5, %xmm2
  1379. mulpd %xmm13, %xmm7
  1380. SUBPD %xmm7, %xmm3
  1381. pshufd $0x4e, %xmm4, %xmm5
  1382. mulpd %xmm14, %xmm4
  1383. addpd %xmm4, %xmm0
  1384. movsd -12 * SIZE(A2, LDA), %xmm4
  1385. movhpd -11 * SIZE(A2, LDA), %xmm4
  1386. pshufd $0x4e, %xmm6, %xmm7
  1387. mulpd %xmm14, %xmm6
  1388. addpd %xmm6, %xmm1
  1389. movsd -10 * SIZE(A2, LDA), %xmm6
  1390. movhpd -9 * SIZE(A2, LDA), %xmm6
  1391. mulpd %xmm15, %xmm5
  1392. SUBPD %xmm5, %xmm0
  1393. mulpd %xmm15, %xmm7
  1394. SUBPD %xmm7, %xmm1
  1395. pshufd $0x4e, %xmm4, %xmm5
  1396. mulpd %xmm14, %xmm4
  1397. addpd %xmm4, %xmm2
  1398. pshufd $0x4e, %xmm6, %xmm7
  1399. mulpd %xmm14, %xmm6
  1400. addpd %xmm6, %xmm3
  1401. mulpd %xmm15, %xmm5
  1402. SUBPD %xmm5, %xmm2
  1403. mulpd %xmm15, %xmm7
  1404. SUBPD %xmm7, %xmm3
  1405. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1406. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  1407. MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
  1408. MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
  1409. MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
  1410. MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
  1411. MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
  1412. MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
  1413. subq $-8 * SIZE, A1
  1414. subq $-8 * SIZE, A2
  1415. subq $-8 * SIZE, Y1
  1416. ALIGN_3
  1417. .L105:
  1418. testq $2, M
  1419. je .L107
  1420. movsd -16 * SIZE(A1), %xmm4
  1421. movhpd -15 * SIZE(A1), %xmm4
  1422. movsd -14 * SIZE(A1), %xmm6
  1423. movhpd -13 * SIZE(A1), %xmm6
  1424. pshufd $0x4e, %xmm4, %xmm5
  1425. mulpd %xmm8, %xmm4
  1426. addpd %xmm4, %xmm0
  1427. movsd -16 * SIZE(A1, LDA), %xmm4
  1428. movhpd -15 * SIZE(A1, LDA), %xmm4
  1429. pshufd $0x4e, %xmm6, %xmm7
  1430. mulpd %xmm8, %xmm6
  1431. addpd %xmm6, %xmm1
  1432. movsd -14 * SIZE(A1, LDA), %xmm6
  1433. movhpd -13 * SIZE(A1, LDA), %xmm6
  1434. mulpd %xmm9, %xmm5
  1435. SUBPD %xmm5, %xmm0
  1436. mulpd %xmm9, %xmm7
  1437. SUBPD %xmm7, %xmm1
  1438. pshufd $0x4e, %xmm4, %xmm5
  1439. mulpd %xmm10, %xmm4
  1440. addpd %xmm4, %xmm0
  1441. movsd -16 * SIZE(A2), %xmm4
  1442. movhpd -15 * SIZE(A2), %xmm4
  1443. pshufd $0x4e, %xmm6, %xmm7
  1444. mulpd %xmm10, %xmm6
  1445. addpd %xmm6, %xmm1
  1446. movsd -14 * SIZE(A2), %xmm6
  1447. movhpd -13 * SIZE(A2), %xmm6
  1448. mulpd %xmm11, %xmm5
  1449. SUBPD %xmm5, %xmm0
  1450. mulpd %xmm11, %xmm7
  1451. SUBPD %xmm7, %xmm1
  1452. pshufd $0x4e, %xmm4, %xmm5
  1453. mulpd %xmm12, %xmm4
  1454. addpd %xmm4, %xmm0
  1455. movsd -16 * SIZE(A2, LDA), %xmm4
  1456. movhpd -15 * SIZE(A2, LDA), %xmm4
  1457. pshufd $0x4e, %xmm6, %xmm7
  1458. mulpd %xmm12, %xmm6
  1459. addpd %xmm6, %xmm1
  1460. movsd -14 * SIZE(A2, LDA), %xmm6
  1461. movhpd -13 * SIZE(A2, LDA), %xmm6
  1462. mulpd %xmm13, %xmm5
  1463. SUBPD %xmm5, %xmm0
  1464. mulpd %xmm13, %xmm7
  1465. SUBPD %xmm7, %xmm1
  1466. pshufd $0x4e, %xmm4, %xmm5
  1467. mulpd %xmm14, %xmm4
  1468. addpd %xmm4, %xmm0
  1469. mulpd %xmm15, %xmm5
  1470. SUBPD %xmm5, %xmm0
  1471. pshufd $0x4e, %xmm6, %xmm7
  1472. mulpd %xmm14, %xmm6
  1473. addpd %xmm6, %xmm1
  1474. mulpd %xmm15, %xmm7
  1475. SUBPD %xmm7, %xmm1
  1476. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1477. MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
  1478. movapd %xmm2, %xmm0
  1479. addq $4 * SIZE, A1
  1480. addq $4 * SIZE, A2
  1481. addq $4 * SIZE, Y1
  1482. ALIGN_3
  1483. .L107:
  1484. testq $1, M
  1485. je .L109
  1486. movsd -16 * SIZE(A1), %xmm4
  1487. movhpd -15 * SIZE(A1), %xmm4
  1488. movsd -16 * SIZE(A1, LDA), %xmm6
  1489. movhpd -15 * SIZE(A1, LDA), %xmm6
  1490. pshufd $0x4e, %xmm4, %xmm5
  1491. mulpd %xmm8, %xmm4
  1492. addpd %xmm4, %xmm0
  1493. movsd -16 * SIZE(A2), %xmm4
  1494. movhpd -15 * SIZE(A2), %xmm4
  1495. mulpd %xmm9, %xmm5
  1496. SUBPD %xmm5, %xmm0
  1497. pshufd $0x4e, %xmm6, %xmm7
  1498. mulpd %xmm10, %xmm6
  1499. addpd %xmm6, %xmm0
  1500. movsd -16 * SIZE(A2, LDA), %xmm6
  1501. movhpd -15 * SIZE(A2, LDA), %xmm6
  1502. mulpd %xmm11, %xmm7
  1503. SUBPD %xmm7, %xmm0
  1504. pshufd $0x4e, %xmm4, %xmm5
  1505. mulpd %xmm12, %xmm4
  1506. addpd %xmm4, %xmm0
  1507. mulpd %xmm13, %xmm5
  1508. SUBPD %xmm5, %xmm0
  1509. pshufd $0x4e, %xmm6, %xmm7
  1510. mulpd %xmm14, %xmm6
  1511. addpd %xmm6, %xmm0
  1512. mulpd %xmm15, %xmm7
  1513. SUBPD %xmm7, %xmm0
  1514. MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
  1515. ALIGN_3
  1516. .L109:
  1517. cmpq $4, N
  1518. jge .L101
  1519. ALIGN_3
  1520. .L110:
  1521. #endif
#if GEMV_UNROLL >= 2
cmpq $2, N
jl .L120
#if GEMV_UNROLL == 2
ALIGN_3
.L111:
#endif
subq $2, N
leaq 16 * SIZE(BUFFER), Y1
movq A, A1
leaq (A, LDA, 1), A2
leaq (A, LDA, 2), A
movsd 0 * SIZE(X), %xmm12
movhpd 1 * SIZE(X), %xmm12
addq INCX, X
movsd 0 * SIZE(X), %xmm14
movhpd 1 * SIZE(X), %xmm14
addq INCX, X
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
shufps $0xc0, %xmm11, %xmm11
pshufd $0x4e, %xmm12, %xmm13
pshufd $0x4e, %xmm14, %xmm15
#ifdef HAVE_SSE3
movddup ALPHA_R, %xmm8
movddup ALPHA_I, %xmm9
#else
movsd ALPHA_R, %xmm8
unpcklpd %xmm8, %xmm8
movsd ALPHA_I, %xmm9
unpcklpd %xmm9, %xmm9
#endif
xorpd %xmm11, %xmm13
xorpd %xmm11, %xmm15
mulpd %xmm8, %xmm12
mulpd %xmm9, %xmm13
mulpd %xmm8, %xmm14
mulpd %xmm9, %xmm15
#ifndef XCONJ
subpd %xmm13, %xmm12
subpd %xmm15, %xmm14
#else
addpd %xmm13, %xmm12
addpd %xmm15, %xmm14
#endif
pshufd $0xee, %xmm12, %xmm13
pshufd $0x44, %xmm12, %xmm12
pshufd $0xee, %xmm14, %xmm15
pshufd $0x44, %xmm14, %xmm14
#ifndef CONJ
xorpd %xmm11, %xmm13
xorpd %xmm11, %xmm15
#else
xorpd %xmm11, %xmm12
xorpd %xmm11, %xmm14
#endif
MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
ALIGN_3
movq M, I
sarq $2, I
jle .L115
movsd -16 * SIZE(A1), %xmm4
movhpd -15 * SIZE(A1), %xmm4
movsd -14 * SIZE(A1), %xmm6
movhpd -13 * SIZE(A1), %xmm6
movsd -12 * SIZE(A1), %xmm8
movhpd -11 * SIZE(A1), %xmm8
movsd -10 * SIZE(A1), %xmm10
movhpd -9 * SIZE(A1), %xmm10
decq I
jle .L114
ALIGN_3
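/* .L113: main two-column loop, four complex elements of Y per iteration,
   with software prefetch of A1, A2 and Y1. */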
.L113:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
movsd -16 * SIZE(A2), %xmm4
movhpd -15 * SIZE(A2), %xmm4
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm12, %xmm6
addpd %xmm6, %xmm1
movsd -14 * SIZE(A2), %xmm6
movhpd -13 * SIZE(A2), %xmm6
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm12, %xmm8
addpd %xmm8, %xmm2
movsd -12 * SIZE(A2), %xmm8
movhpd -11 * SIZE(A2), %xmm8
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm12, %xmm10
addpd %xmm10, %xmm3
movsd -10 * SIZE(A2), %xmm10
movhpd -9 * SIZE(A2), %xmm10
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm13, %xmm7
SUBPD %xmm7, %xmm1
mulpd %xmm13, %xmm9
SUBPD %xmm9, %xmm2
mulpd %xmm13, %xmm11
SUBPD %xmm11, %xmm3
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm14, %xmm4
addpd %xmm4, %xmm0
movsd -8 * SIZE(A1), %xmm4
movhpd -7 * SIZE(A1), %xmm4
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm14, %xmm6
addpd %xmm6, %xmm1
movsd -6 * SIZE(A1), %xmm6
movhpd -5 * SIZE(A1), %xmm6
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm14, %xmm8
addpd %xmm8, %xmm2
movsd -4 * SIZE(A1), %xmm8
movhpd -3 * SIZE(A1), %xmm8
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm14, %xmm10
addpd %xmm10, %xmm3
movsd -2 * SIZE(A1), %xmm10
movhpd -1 * SIZE(A1), %xmm10
mulpd %xmm15, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm15, %xmm7
SUBPD %xmm7, %xmm1
mulpd %xmm15, %xmm9
SUBPD %xmm9, %xmm2
mulpd %xmm15, %xmm11
SUBPD %xmm11, %xmm3
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
subq $-8 * SIZE, A1
subq $-8 * SIZE, A2
subq $-8 * SIZE, Y1
subq $1, I
BRANCH
jg .L113
ALIGN_3
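/* .L114: loop tail, the same computation as .L113 for the last unrolled
   iteration, without the prefetches and without reloading A1 for a
   following iteration. */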
.L114:
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
movsd -16 * SIZE(A2), %xmm4
movhpd -15 * SIZE(A2), %xmm4
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm12, %xmm6
addpd %xmm6, %xmm1
movsd -14 * SIZE(A2), %xmm6
movhpd -13 * SIZE(A2), %xmm6
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm12, %xmm8
addpd %xmm8, %xmm2
movsd -12 * SIZE(A2), %xmm8
movhpd -11 * SIZE(A2), %xmm8
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm12, %xmm10
addpd %xmm10, %xmm3
movsd -10 * SIZE(A2), %xmm10
movhpd -9 * SIZE(A2), %xmm10
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm13, %xmm7
SUBPD %xmm7, %xmm1
mulpd %xmm13, %xmm9
SUBPD %xmm9, %xmm2
mulpd %xmm13, %xmm11
SUBPD %xmm11, %xmm3
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm14, %xmm4
addpd %xmm4, %xmm0
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm14, %xmm6
addpd %xmm6, %xmm1
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm14, %xmm8
addpd %xmm8, %xmm2
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm14, %xmm10
addpd %xmm10, %xmm3
mulpd %xmm15, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm15, %xmm7
SUBPD %xmm7, %xmm1
mulpd %xmm15, %xmm9
SUBPD %xmm9, %xmm2
mulpd %xmm15, %xmm11
SUBPD %xmm11, %xmm3
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
subq $-8 * SIZE, A1
subq $-8 * SIZE, A2
subq $-8 * SIZE, Y1
ALIGN_3
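/* .L115: remainder when M & 2, two complex elements for the current pair
   of columns. */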
.L115:
testq $2, M
je .L117
movsd -16 * SIZE(A1), %xmm4
movhpd -15 * SIZE(A1), %xmm4
movsd -14 * SIZE(A1), %xmm6
movhpd -13 * SIZE(A1), %xmm6
movsd -16 * SIZE(A2), %xmm8
movhpd -15 * SIZE(A2), %xmm8
movsd -14 * SIZE(A2), %xmm10
movhpd -13 * SIZE(A2), %xmm10
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm12, %xmm6
addpd %xmm6, %xmm1
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm13, %xmm7
SUBPD %xmm7, %xmm1
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm14, %xmm8
addpd %xmm8, %xmm0
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm14, %xmm10
addpd %xmm10, %xmm1
mulpd %xmm15, %xmm9
SUBPD %xmm9, %xmm0
mulpd %xmm15, %xmm11
SUBPD %xmm11, %xmm1
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
movapd %xmm2, %xmm0
addq $4 * SIZE, A1
addq $4 * SIZE, A2
addq $4 * SIZE, Y1
ALIGN_3
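/* .L117: remainder when M & 1, the final complex element for the current
   pair of columns. */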
.L117:
testq $1, M
#if GEMV_UNROLL == 2
je .L119
#else
je .L120
#endif
movsd -16 * SIZE(A1), %xmm4
movhpd -15 * SIZE(A1), %xmm4
movsd -16 * SIZE(A2), %xmm6
movhpd -15 * SIZE(A2), %xmm6
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm14, %xmm6
addpd %xmm6, %xmm0
mulpd %xmm15, %xmm7
SUBPD %xmm7, %xmm0
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
#if GEMV_UNROLL == 2
ALIGN_3
.L119:
cmpq $2, N
jge .L111
#endif
ALIGN_3
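/* .L120: single-column path, used for the last remaining column (or for
   every column when GEMV_UNROLL == 1). */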
.L120:
#endif
cmpq $1, N
jl .L980
#if GEMV_UNROLL == 1
.L121:
decq N
#endif
leaq 16 * SIZE(BUFFER), Y1
movq A, A1
#if GEMV_UNROLL == 1
addq LDA, A
#endif
movsd 0 * SIZE(X), %xmm12
movhpd 1 * SIZE(X), %xmm12
addq INCX, X
pcmpeqb %xmm11, %xmm11
psllq $63, %xmm11
shufps $0xc0, %xmm11, %xmm11
pshufd $0x4e, %xmm12, %xmm13
#ifdef HAVE_SSE3
movddup ALPHA_R, %xmm8
movddup ALPHA_I, %xmm9
#else
movsd ALPHA_R, %xmm8
unpcklpd %xmm8, %xmm8
movsd ALPHA_I, %xmm9
unpcklpd %xmm9, %xmm9
#endif
xorpd %xmm11, %xmm13
mulpd %xmm8, %xmm12
mulpd %xmm9, %xmm13
#ifndef XCONJ
subpd %xmm13, %xmm12
#else
addpd %xmm13, %xmm12
#endif
pshufd $0xee, %xmm12, %xmm13
pshufd $0x44, %xmm12, %xmm12
#ifndef CONJ
xorpd %xmm11, %xmm13
#else
xorpd %xmm11, %xmm12
#endif
MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
movq M, I
sarq $2, I
jle .L125
movsd -16 * SIZE(A1), %xmm4
movhpd -15 * SIZE(A1), %xmm4
movsd -14 * SIZE(A1), %xmm6
movhpd -13 * SIZE(A1), %xmm6
movsd -12 * SIZE(A1), %xmm8
movhpd -11 * SIZE(A1), %xmm8
movsd -10 * SIZE(A1), %xmm10
movhpd -9 * SIZE(A1), %xmm10
decq I
jle .L124
ALIGN_3
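/* .L123: single-column main loop, four complex elements per iteration,
   prefetching A1 and Y1. */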
.L123:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
movsd -8 * SIZE(A1), %xmm4
movhpd -7 * SIZE(A1), %xmm4
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm12, %xmm6
addpd %xmm6, %xmm1
movsd -6 * SIZE(A1), %xmm6
movhpd -5 * SIZE(A1), %xmm6
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm12, %xmm8
addpd %xmm8, %xmm2
movsd -4 * SIZE(A1), %xmm8
movhpd -3 * SIZE(A1), %xmm8
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm12, %xmm10
addpd %xmm10, %xmm3
movsd -2 * SIZE(A1), %xmm10
movhpd -1 * SIZE(A1), %xmm10
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm13, %xmm7
SUBPD %xmm7, %xmm1
mulpd %xmm13, %xmm9
SUBPD %xmm9, %xmm2
mulpd %xmm13, %xmm11
SUBPD %xmm11, %xmm3
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
subq $-8 * SIZE, A1
subq $-8 * SIZE, Y1
subq $1, I
BRANCH
jg .L123
ALIGN_3
.L124:
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm12, %xmm6
addpd %xmm6, %xmm1
pshufd $0x4e, %xmm8, %xmm9
mulpd %xmm12, %xmm8
addpd %xmm8, %xmm2
pshufd $0x4e, %xmm10, %xmm11
mulpd %xmm12, %xmm10
addpd %xmm10, %xmm3
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm13, %xmm7
SUBPD %xmm7, %xmm1
mulpd %xmm13, %xmm9
SUBPD %xmm9, %xmm2
mulpd %xmm13, %xmm11
SUBPD %xmm11, %xmm3
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
subq $-8 * SIZE, A1
subq $-8 * SIZE, Y1
ALIGN_3
.L125:
testq $2, M
je .L127
movsd -16 * SIZE(A1), %xmm4
movhpd -15 * SIZE(A1), %xmm4
movsd -14 * SIZE(A1), %xmm6
movhpd -13 * SIZE(A1), %xmm6
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
pshufd $0x4e, %xmm6, %xmm7
mulpd %xmm12, %xmm6
addpd %xmm6, %xmm1
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
mulpd %xmm13, %xmm7
SUBPD %xmm7, %xmm1
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
movapd %xmm2, %xmm0
addq $4 * SIZE, A1
addq $4 * SIZE, Y1
ALIGN_3
.L127:
testq $1, M
#if GEMV_UNROLL == 1
je .L129
#else
je .L980
#endif
movsd -16 * SIZE(A1), %xmm4
movhpd -15 * SIZE(A1), %xmm4
pshufd $0x4e, %xmm4, %xmm5
mulpd %xmm12, %xmm4
addpd %xmm4, %xmm0
mulpd %xmm13, %xmm5
SUBPD %xmm5, %xmm0
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
#if GEMV_UNROLL == 1
ALIGN_3
.L129:
cmpq $1, N
jge .L121
#endif
#endif
ALIGN_3
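/* Write-back: add the accumulated BUFFER contents into Y. If Y is 16-byte
   aligned, take the movapd path below; otherwise branch to the unaligned
   path at .L990. */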
.L980:
testq $SIZE, Y
jne .L990
movq Y, Y1
movq M, %rax
sarq $3, %rax
jle .L184
ALIGN_3
.L182:
movapd (Y), %xmm0
addq INCY, Y
movapd (Y), %xmm1
addq INCY, Y
movapd (Y), %xmm2
addq INCY, Y
movapd (Y), %xmm3
addq INCY, Y
movapd (Y), %xmm4
addq INCY, Y
movapd (Y), %xmm5
addq INCY, Y
movapd (Y), %xmm6
addq INCY, Y
movapd (Y), %xmm7
addq INCY, Y
addpd 0 * SIZE(BUFFER), %xmm0
addpd 2 * SIZE(BUFFER), %xmm1
addpd 4 * SIZE(BUFFER), %xmm2
addpd 6 * SIZE(BUFFER), %xmm3
addpd 8 * SIZE(BUFFER), %xmm4
addpd 10 * SIZE(BUFFER), %xmm5
addpd 12 * SIZE(BUFFER), %xmm6
addpd 14 * SIZE(BUFFER), %xmm7
movapd %xmm0, (Y1)
addq INCY, Y1
movapd %xmm1, (Y1)
addq INCY, Y1
movapd %xmm2, (Y1)
addq INCY, Y1
movapd %xmm3, (Y1)
addq INCY, Y1
movapd %xmm4, (Y1)
addq INCY, Y1
movapd %xmm5, (Y1)
addq INCY, Y1
movapd %xmm6, (Y1)
addq INCY, Y1
movapd %xmm7, (Y1)
addq INCY, Y1
subq $-16 * SIZE, BUFFER
decq %rax
jg .L182
ALIGN_3
.L184:
testq $7, M
jle .L999
testq $4, M
jle .L185
movapd (Y), %xmm0
addq INCY, Y
movapd (Y), %xmm1
addq INCY, Y
movapd (Y), %xmm2
addq INCY, Y
movapd (Y), %xmm3
addq INCY, Y
addpd 0 * SIZE(BUFFER), %xmm0
addpd 2 * SIZE(BUFFER), %xmm1
addpd 4 * SIZE(BUFFER), %xmm2
addpd 6 * SIZE(BUFFER), %xmm3
movapd %xmm0, (Y1)
addq INCY, Y1
movapd %xmm1, (Y1)
addq INCY, Y1
movapd %xmm2, (Y1)
addq INCY, Y1
movapd %xmm3, (Y1)
addq INCY, Y1
addq $8 * SIZE, BUFFER
ALIGN_3
.L185:
testq $2, M
jle .L186
movapd (Y), %xmm0
addq INCY, Y
movapd (Y), %xmm1
addq INCY, Y
addpd 0 * SIZE(BUFFER), %xmm0
addpd 2 * SIZE(BUFFER), %xmm1
movapd %xmm0, (Y1)
addq INCY, Y1
movapd %xmm1, (Y1)
addq INCY, Y1
addq $4 * SIZE, BUFFER
ALIGN_3
.L186:
testq $1, M
jle .L999
movapd (Y), %xmm0
addpd (BUFFER), %xmm0
movapd %xmm0, (Y1)
jmp .L999
ALIGN_3
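/* .L990: unaligned write-back, the same BUFFER-into-Y accumulation using
   element-wise movsd/movhpd loads and movlpd/movhpd stores. */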
.L990:
movq Y, Y1
movq M, %rax
sarq $3, %rax
jle .L994
ALIGN_3
.L992:
movsd 0 * SIZE(Y), %xmm0
movhpd 1 * SIZE(Y), %xmm0
addq INCY, Y
movsd 0 * SIZE(Y), %xmm1
movhpd 1 * SIZE(Y), %xmm1
addq INCY, Y
movsd 0 * SIZE(Y), %xmm2
movhpd 1 * SIZE(Y), %xmm2
addq INCY, Y
movsd 0 * SIZE(Y), %xmm3
movhpd 1 * SIZE(Y), %xmm3
addq INCY, Y
movsd 0 * SIZE(Y), %xmm4
movhpd 1 * SIZE(Y), %xmm4
addq INCY, Y
movsd 0 * SIZE(Y), %xmm5
movhpd 1 * SIZE(Y), %xmm5
addq INCY, Y
movsd 0 * SIZE(Y), %xmm6
movhpd 1 * SIZE(Y), %xmm6
addq INCY, Y
movsd 0 * SIZE(Y), %xmm7
movhpd 1 * SIZE(Y), %xmm7
addq INCY, Y
addpd 0 * SIZE(BUFFER), %xmm0
addpd 2 * SIZE(BUFFER), %xmm1
addpd 4 * SIZE(BUFFER), %xmm2
addpd 6 * SIZE(BUFFER), %xmm3
addpd 8 * SIZE(BUFFER), %xmm4
addpd 10 * SIZE(BUFFER), %xmm5
addpd 12 * SIZE(BUFFER), %xmm6
addpd 14 * SIZE(BUFFER), %xmm7
movlpd %xmm0, 0 * SIZE(Y1)
movhpd %xmm0, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm1, 0 * SIZE(Y1)
movhpd %xmm1, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm2, 0 * SIZE(Y1)
movhpd %xmm2, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm3, 0 * SIZE(Y1)
movhpd %xmm3, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm4, 0 * SIZE(Y1)
movhpd %xmm4, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm5, 0 * SIZE(Y1)
movhpd %xmm5, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm6, 0 * SIZE(Y1)
movhpd %xmm6, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm7, 0 * SIZE(Y1)
movhpd %xmm7, 1 * SIZE(Y1)
addq INCY, Y1
subq $-16 * SIZE, BUFFER
decq %rax
jg .L992
ALIGN_3
.L994:
testq $7, M
jle .L999
testq $4, M
jle .L995
movsd 0 * SIZE(Y), %xmm0
movhpd 1 * SIZE(Y), %xmm0
addq INCY, Y
movsd 0 * SIZE(Y), %xmm1
movhpd 1 * SIZE(Y), %xmm1
addq INCY, Y
movsd 0 * SIZE(Y), %xmm2
movhpd 1 * SIZE(Y), %xmm2
addq INCY, Y
movsd 0 * SIZE(Y), %xmm3
movhpd 1 * SIZE(Y), %xmm3
addq INCY, Y
addpd 0 * SIZE(BUFFER), %xmm0
addpd 2 * SIZE(BUFFER), %xmm1
addpd 4 * SIZE(BUFFER), %xmm2
addpd 6 * SIZE(BUFFER), %xmm3
movlpd %xmm0, 0 * SIZE(Y1)
movhpd %xmm0, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm1, 0 * SIZE(Y1)
movhpd %xmm1, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm2, 0 * SIZE(Y1)
movhpd %xmm2, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm3, 0 * SIZE(Y1)
movhpd %xmm3, 1 * SIZE(Y1)
addq INCY, Y1
addq $8 * SIZE, BUFFER
ALIGN_3
.L995:
testq $2, M
jle .L996
movsd 0 * SIZE(Y), %xmm0
movhpd 1 * SIZE(Y), %xmm0
addq INCY, Y
movsd 0 * SIZE(Y), %xmm1
movhpd 1 * SIZE(Y), %xmm1
addq INCY, Y
addpd 0 * SIZE(BUFFER), %xmm0
addpd 2 * SIZE(BUFFER), %xmm1
movlpd %xmm0, 0 * SIZE(Y1)
movhpd %xmm0, 1 * SIZE(Y1)
addq INCY, Y1
movlpd %xmm1, 0 * SIZE(Y1)
movhpd %xmm1, 1 * SIZE(Y1)
addq INCY, Y1
addq $4 * SIZE, BUFFER
ALIGN_3
.L996:
testq $1, M
jle .L999
movsd 0 * SIZE(Y), %xmm0
movhpd 1 * SIZE(Y), %xmm0
addpd 0 * SIZE(BUFFER), %xmm0
movlpd %xmm0, 0 * SIZE(Y1)
movhpd %xmm0, 1 * SIZE(Y1)
ALIGN_3
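/* .L999: advance AA by M complex elements (M << ZBASE_SHIFT bytes) and jump
   back to .L0t, presumably the outer blocking loop defined earlier in this
   file. */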
.L999:
movq M, I
salq $ZBASE_SHIFT, I
addq I, AA
jmp .L0t
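/* .L999x: epilogue: restore the callee-saved general-purpose registers
   (and, under WINDOWS_ABI, %rdi, %rsi and %xmm6-%xmm15), release the stack
   frame and return. */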
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
movq 24(%rsp), %r13
movq 32(%rsp), %r14
movq 40(%rsp), %r15
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi
movups 64(%rsp), %xmm6
movups 80(%rsp), %xmm7
movups 96(%rsp), %xmm8
movups 112(%rsp), %xmm9
movups 128(%rsp), %xmm10
movups 144(%rsp), %xmm11
movups 160(%rsp), %xmm12
movups 176(%rsp), %xmm13
movups 192(%rsp), %xmm14
movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp
ret
EPILOGUE