
dgemv_t.S 48 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
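/* dgemv_t.S: double-precision GEMV kernel for the transposed case.  For an
 * M x N column-major matrix A it accumulates y[j] += alpha * dot(A(:,j), x),
 * i.e. y += alpha * A^T * x.  X is first gathered (stride INCX) into a
 * contiguous BUFFER, then GEMV_UNROLL columns are reduced per pass.
 *
 * A rough C sketch of the operation performed (reference only, not part of
 * the original source; variable names are illustrative):
 *
 *     for (long j = 0; j < n; j++) {
 *         double sum = 0.0;
 *         for (long i = 0; i < m; i++)
 *             sum += a[i + j * lda] * x[i * incx];
 *         y[j * incy] += alpha * sum;
 *     }
 */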
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "l2param.h"
  41. #if GEMV_UNROLL < 2
  42. #undef GEMV_UNROLL
  43. #define GEMV_UNROLL 2
  44. #endif
  45. #ifndef WINDOWS_ABI
  46. #define STACKSIZE 128
  47. #define OLD_M %rdi
  48. #define OLD_N %rsi
  49. #define OLD_A %rcx
  50. #define OLD_LDA %r8
  51. #define STACK_INCX 8 + STACKSIZE(%rsp)
  52. #define STACK_Y 16 + STACKSIZE(%rsp)
  53. #define STACK_INCY 24 + STACKSIZE(%rsp)
  54. #define STACK_BUFFER 32 + STACKSIZE(%rsp)
  55. #define MMM 56(%rsp)
  56. #define NN 64(%rsp)
  57. #define AA 72(%rsp)
  58. #define LDAX 80(%rsp)
  59. #else
  60. #define STACKSIZE 256
  61. #define OLD_M %rcx
  62. #define OLD_N %rdx
  63. #define OLD_A 40 + STACKSIZE(%rsp)
  64. #define OLD_LDA 48 + STACKSIZE(%rsp)
  65. #define OLD_X 56 + STACKSIZE(%rsp)
  66. #define STACK_INCX 64 + STACKSIZE(%rsp)
  67. #define STACK_Y 72 + STACKSIZE(%rsp)
  68. #define STACK_INCY 80 + STACKSIZE(%rsp)
  69. #define STACK_BUFFER 88 + STACKSIZE(%rsp)
  70. //Temp variables for M,N,A,LDA
  71. #define MMM 224(%rsp)
  72. #define NN 232(%rsp)
  73. #define AA 240(%rsp)
  74. #define LDAX 248(%rsp)
  75. #endif
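// Two argument layouts are defined above: without WINDOWS_ABI, m/n/a/lda
// arrive in registers and incx/y/incy/buffer are read from the caller's stack
// above the 128-byte frame; with WINDOWS_ABI, alpha arrives in %xmm3, most
// pointer/stride arguments are read from the caller's stack, and the 256-byte
// frame below also preserves %xmm6-%xmm15.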
  76. #define LDA %r8
  77. #define X %r9
  78. #define INCX %rsi
  79. #define INCY %rdi
  80. #define M %r10
  81. #define N %r11
  82. #define A %r12
  83. #define Y %r14
  84. #define BUFFER %r13
  85. #define I %rax
  86. #define A1 %rbx
  87. #define A2 %rcx
  88. #define LDA3 %rdx
  89. #define Y1 %rbp
  90. #define X1 %r15
  91. #ifdef ALIGNED_ACCESS
  92. #define MM INCX
  93. #else
  94. #define MM M
  95. #endif
  96. #define ALPHA %xmm15
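// Register roles, per the defines above: A1/A2 walk the current group of
// columns of A, LDA3 = 3 * LDA serves the strided column addressing, X1 walks
// the contiguous copy of X in BUFFER, Y1 walks Y, I is the row-loop counter,
// and ALPHA (%xmm15, broadcast below) scales the finished dot products.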
  97. PROLOGUE
  98. PROFCODE
  99. subq $STACKSIZE, %rsp
  100. movq %rbx, 0(%rsp)
  101. movq %rbp, 8(%rsp)
  102. movq %r12, 16(%rsp)
  103. movq %r13, 24(%rsp)
  104. movq %r14, 32(%rsp)
  105. movq %r15, 40(%rsp)
  106. #ifdef WINDOWS_ABI
  107. movq %rdi, 48(%rsp)
  108. movq %rsi, 56(%rsp)
  109. movups %xmm6, 64(%rsp)
  110. movups %xmm7, 80(%rsp)
  111. movups %xmm8, 96(%rsp)
  112. movups %xmm9, 112(%rsp)
  113. movups %xmm10, 128(%rsp)
  114. movups %xmm11, 144(%rsp)
  115. movups %xmm12, 160(%rsp)
  116. movups %xmm13, 176(%rsp)
  117. movups %xmm14, 192(%rsp)
  118. movups %xmm15, 208(%rsp)
  119. movq OLD_M, M
  120. movq OLD_N, N
  121. movq OLD_A, A
  122. movq OLD_LDA, LDA
  123. movq OLD_X, X
  124. movq M, MMM
  125. movq N, NN
  126. movq A, AA
  127. movq LDA, LDAX
  128. #else
  129. movq OLD_M, MMM
  130. movq OLD_N, NN
  131. movq OLD_A, AA
  132. movq OLD_LDA, LDAX
  133. #endif
  134. #ifdef HAVE_SSE3
  135. #ifndef WINDOWS_ABI
  136. movddup %xmm0, ALPHA
  137. #else
  138. movddup %xmm3, ALPHA
  139. #endif
  140. #else
  141. #ifndef WINDOWS_ABI
  142. movapd %xmm0, ALPHA
  143. #else
  144. movapd %xmm3, ALPHA
  145. #endif
  146. unpcklpd ALPHA, ALPHA
  147. #endif
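// ALPHA now holds alpha in both 64-bit lanes of %xmm15 (movddup with SSE3,
// otherwise movapd + unpcklpd), so packed mulpd can scale two column results
// at a time in the reduction code below.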
  148. .L0x:
  149. xorq M,M
  150. addq $1,M
  151. salq $21,M
  152. subq M,MMM
  153. jge .L00
  154. movq MMM,%rax
  155. addq M,%rax
  156. jle .L999x
  157. movq %rax,M
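// The .L0x block above splits the row dimension into passes of 1 << 21 rows,
// with MMM tracking what is left, presumably so the contiguous copy of X
// built in BUFFER stays a bounded size; the final pass uses the remainder.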
  158. .L00:
  159. movq LDAX,LDA
  160. movq NN,N
  161. movq AA,A
  162. movq STACK_INCX, INCX
  163. movq STACK_Y, Y
  164. movq STACK_INCY, INCY
  165. movq STACK_BUFFER, BUFFER
  166. leaq -1(INCX), %rax
  167. leaq (,LDA, SIZE), LDA
  168. leaq (,INCX, SIZE), INCX
  169. leaq (,INCY, SIZE), INCY
  170. leaq (LDA, LDA, 2), LDA3
  171. subq $-16 * SIZE, A
  172. testq M, M
  173. jle .L999
  174. testq N, N
  175. jle .L999
  176. movq BUFFER, X1
  177. #ifdef ALIGNED_ACCESS
  178. testq $SIZE, A
  179. je .L01
  180. movsd (X), %xmm0
  181. addq INCX, X
  182. movsd %xmm0, 1 * SIZE(BUFFER)
  183. addq $1 * SIZE, BUFFER
  184. addq $2 * SIZE, X1
  185. decq M
  186. jle .L10
  187. ALIGN_4
  188. .L01:
  189. #endif
  190. movq M, I
  191. sarq $3, I
  192. jle .L05
  193. ALIGN_4
  194. .L02:
  195. movsd (X), %xmm0
  196. addq INCX, X
  197. movhpd (X), %xmm0
  198. addq INCX, X
  199. movsd (X), %xmm1
  200. addq INCX, X
  201. movhpd (X), %xmm1
  202. addq INCX, X
  203. movsd (X), %xmm2
  204. addq INCX, X
  205. movhpd (X), %xmm2
  206. addq INCX, X
  207. movsd (X), %xmm3
  208. addq INCX, X
  209. movhpd (X), %xmm3
  210. addq INCX, X
  211. movapd %xmm0, 0 * SIZE(X1)
  212. movapd %xmm1, 2 * SIZE(X1)
  213. movapd %xmm2, 4 * SIZE(X1)
  214. movapd %xmm3, 6 * SIZE(X1)
  215. addq $8 * SIZE, X1
  216. decq I
  217. jg .L02
  218. ALIGN_4
  219. .L05:
  220. movq M, I
  221. andq $7, I
  222. jle .L10
  223. ALIGN_2
  224. .L06:
  225. movsd (X), %xmm0
  226. addq INCX, X
  227. movsd %xmm0, 0 * SIZE(X1)
  228. addq $SIZE, X1
  229. decq I
  230. jg .L06
  231. ALIGN_4
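// The loops above (.L02 / .L06) gather X with stride INCX into the contiguous
// BUFFER, eight elements at a time plus a scalar remainder, so the reduction
// loops below can use packed sequential loads through X1.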
  232. .L10:
  233. movq Y, Y1
  234. #ifdef ALIGNED_ACCESS
  235. testq $SIZE, LDA
  236. jne .L50
  237. #endif
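// Under ALIGNED_ACCESS, an LDA that is an odd number of doubles branches to
// the .L50 path, which keeps the A loads aligned and realigns the odd columns
// on the fly with movhpd/shufpd instead of the straight packed loads used
// below.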
  238. #if GEMV_UNROLL >= 8
  239. cmpq $8, N
  240. jl .L20
  241. ALIGN_3
  242. .L11:
  243. subq $8, N
  244. leaq 16 * SIZE(BUFFER), X1
  245. movq A, A1
  246. leaq (A1, LDA, 4), A2
  247. leaq (A1, LDA, 8), A
  248. xorps %xmm0, %xmm0
  249. xorps %xmm1, %xmm1
  250. xorps %xmm2, %xmm2
  251. xorps %xmm3, %xmm3
  252. xorps %xmm4, %xmm4
  253. xorps %xmm5, %xmm5
  254. xorps %xmm6, %xmm6
  255. xorps %xmm7, %xmm7
  256. #ifdef PREFETCHW
  257. PREFETCHW 7 * SIZE(Y1)
  258. #endif
  259. #ifdef ALIGNED_ACCESS
  260. testq $SIZE, A
  261. je .L1X
  262. movsd -16 * SIZE(X1), %xmm12
  263. movsd -16 * SIZE(A1), %xmm8
  264. mulsd %xmm12, %xmm8
  265. addsd %xmm8, %xmm0
  266. movsd -16 * SIZE(A1, LDA), %xmm9
  267. mulsd %xmm12, %xmm9
  268. addsd %xmm9, %xmm1
  269. movsd -16 * SIZE(A1, LDA, 2), %xmm10
  270. mulsd %xmm12, %xmm10
  271. addsd %xmm10, %xmm2
  272. movsd -16 * SIZE(A1, LDA3), %xmm11
  273. mulsd %xmm12, %xmm11
  274. addsd %xmm11, %xmm3
  275. movsd -16 * SIZE(A2), %xmm8
  276. mulsd %xmm12, %xmm8
  277. addsd %xmm8, %xmm4
  278. movsd -16 * SIZE(A2, LDA), %xmm9
  279. mulsd %xmm12, %xmm9
  280. addsd %xmm9, %xmm5
  281. movsd -16 * SIZE(A2, LDA, 2), %xmm10
  282. mulsd %xmm12, %xmm10
  283. addsd %xmm10, %xmm6
  284. movsd -16 * SIZE(A2, LDA3), %xmm11
  285. mulsd %xmm12, %xmm11
  286. addsd %xmm11, %xmm7
  287. addq $SIZE, A1
  288. addq $SIZE, A2
  289. addq $SIZE, X1
  290. ALIGN_3
  291. .L1X:
  292. #endif
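// If A was misaligned, one scalar row was peeled just above (joining at .L1X)
// so the packed column loads below are 16-byte aligned; the same peel is
// repeated in the 4-, 2- and 1-column paths (.L2X, .L3X, .L4X).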
  293. movq M, I
  294. sarq $3, I
  295. jle .L15
  296. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  297. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  298. MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10)
  299. MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11)
  300. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  301. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  302. decq I
  303. jle .L13
  304. ALIGN_4
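// Main 8-column by 8-row loop: %xmm0-%xmm7 each hold a packed partial sum for
// one column, %xmm12/%xmm13 stream the buffered X values two at a time, and
// the next A/X vectors are loaded while the current ones are multiplied
// (software pipelining), with PREFETCH hints spread over the eight column
// streams and X1.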
  305. .L12:
  306. #ifdef PREFETCH
  307. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  308. #endif
  309. mulpd %xmm12, %xmm8
  310. addpd %xmm8, %xmm0
  311. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  312. mulpd %xmm12, %xmm9
  313. addpd %xmm9, %xmm1
  314. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9)
  315. mulpd %xmm12, %xmm10
  316. addpd %xmm10, %xmm2
  317. MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10)
  318. mulpd %xmm12, %xmm11
  319. addpd %xmm11, %xmm3
  320. MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11)
  321. #ifdef PREFETCH
  322. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  323. #endif
  324. mulpd %xmm12, %xmm8
  325. addpd %xmm8, %xmm4
  326. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  327. mulpd %xmm12, %xmm9
  328. addpd %xmm9, %xmm5
  329. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9)
  330. mulpd %xmm12, %xmm10
  331. addpd %xmm10, %xmm6
  332. MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10)
  333. mulpd %xmm12, %xmm11
  334. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  335. addpd %xmm11, %xmm7
  336. MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11)
  337. #ifdef PREFETCH
  338. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2)
  339. #endif
  340. mulpd %xmm13, %xmm8
  341. addpd %xmm8, %xmm0
  342. MOVUPS_A1(-14 * SIZE, A2, %xmm8)
  343. mulpd %xmm13, %xmm9
  344. addpd %xmm9, %xmm1
  345. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9)
  346. mulpd %xmm13, %xmm10
  347. addpd %xmm10, %xmm2
  348. MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10)
  349. mulpd %xmm13, %xmm11
  350. addpd %xmm11, %xmm3
  351. MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11)
  352. #ifdef PREFETCH
  353. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3)
  354. #endif
  355. mulpd %xmm13, %xmm8
  356. addpd %xmm8, %xmm4
  357. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  358. mulpd %xmm13, %xmm9
  359. addpd %xmm9, %xmm5
  360. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9)
  361. mulpd %xmm13, %xmm10
  362. addpd %xmm10, %xmm6
  363. MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10)
  364. mulpd %xmm13, %xmm11
  365. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  366. addpd %xmm11, %xmm7
  367. MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11)
  368. #ifdef PREFETCH
  369. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  370. #endif
  371. mulpd %xmm12, %xmm8
  372. addpd %xmm8, %xmm0
  373. MOVUPS_A1(-12 * SIZE, A2, %xmm8)
  374. mulpd %xmm12, %xmm9
  375. addpd %xmm9, %xmm1
  376. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9)
  377. mulpd %xmm12, %xmm10
  378. addpd %xmm10, %xmm2
  379. MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10)
  380. mulpd %xmm12, %xmm11
  381. addpd %xmm11, %xmm3
  382. MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11)
  383. #ifdef PREFETCH
  384. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  385. #endif
  386. mulpd %xmm12, %xmm8
  387. addpd %xmm8, %xmm4
  388. MOVUPS_A1(-10 * SIZE, A1, %xmm8)
  389. mulpd %xmm12, %xmm9
  390. addpd %xmm9, %xmm5
  391. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9)
  392. mulpd %xmm12, %xmm10
  393. addpd %xmm10, %xmm6
  394. MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10)
  395. mulpd %xmm12, %xmm11
  396. MOVUPS_XL1(-8 * SIZE, X1, %xmm12)
  397. addpd %xmm11, %xmm7
  398. MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11)
  399. #ifdef PREFETCH
  400. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2)
  401. #endif
  402. mulpd %xmm13, %xmm8
  403. addpd %xmm8, %xmm0
  404. MOVUPS_A1(-10 * SIZE, A2, %xmm8)
  405. mulpd %xmm13, %xmm9
  406. addpd %xmm9, %xmm1
  407. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9)
  408. mulpd %xmm13, %xmm10
  409. addpd %xmm10, %xmm2
  410. MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10)
  411. mulpd %xmm13, %xmm11
  412. addpd %xmm11, %xmm3
  413. MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11)
  414. #ifdef PREFETCH
  415. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3)
  416. #endif
  417. mulpd %xmm13, %xmm8
  418. addpd %xmm8, %xmm4
  419. MOVUPS_A1(-8 * SIZE, A1, %xmm8)
  420. mulpd %xmm13, %xmm9
  421. addpd %xmm9, %xmm5
  422. MOVUPS_A2(-8 * SIZE, A1, LDA, 1, %xmm9)
  423. #ifdef PREFETCHW
  424. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1)
  425. #endif
  426. mulpd %xmm13, %xmm10
  427. addpd %xmm10, %xmm6
  428. MOVUPS_A2(-8 * SIZE, A1, LDA, 2, %xmm10)
  429. mulpd %xmm13, %xmm11
  430. MOVUPS_XL1(-6 * SIZE, X1, %xmm13)
  431. addpd %xmm11, %xmm7
  432. MOVUPS_A2(-8 * SIZE, A1, LDA3, 1, %xmm11)
  433. addq $8 * SIZE, A1
  434. addq $8 * SIZE, A2
  435. addq $8 * SIZE, X1
  436. decq I
  437. jg .L12
  438. ALIGN_4
  439. .L13:
  440. mulpd %xmm12, %xmm8
  441. addpd %xmm8, %xmm0
  442. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  443. mulpd %xmm12, %xmm9
  444. addpd %xmm9, %xmm1
  445. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9)
  446. mulpd %xmm12, %xmm10
  447. addpd %xmm10, %xmm2
  448. MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10)
  449. mulpd %xmm12, %xmm11
  450. addpd %xmm11, %xmm3
  451. MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11)
  452. mulpd %xmm12, %xmm8
  453. addpd %xmm8, %xmm4
  454. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  455. mulpd %xmm12, %xmm9
  456. addpd %xmm9, %xmm5
  457. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9)
  458. mulpd %xmm12, %xmm10
  459. addpd %xmm10, %xmm6
  460. MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10)
  461. mulpd %xmm12, %xmm11
  462. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  463. addpd %xmm11, %xmm7
  464. MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11)
  465. mulpd %xmm13, %xmm8
  466. addpd %xmm8, %xmm0
  467. MOVUPS_A1(-14 * SIZE, A2, %xmm8)
  468. mulpd %xmm13, %xmm9
  469. addpd %xmm9, %xmm1
  470. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9)
  471. mulpd %xmm13, %xmm10
  472. addpd %xmm10, %xmm2
  473. MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10)
  474. mulpd %xmm13, %xmm11
  475. addpd %xmm11, %xmm3
  476. MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11)
  477. mulpd %xmm13, %xmm8
  478. addpd %xmm8, %xmm4
  479. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  480. mulpd %xmm13, %xmm9
  481. addpd %xmm9, %xmm5
  482. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9)
  483. mulpd %xmm13, %xmm10
  484. addpd %xmm10, %xmm6
  485. MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10)
  486. mulpd %xmm13, %xmm11
  487. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  488. addpd %xmm11, %xmm7
  489. MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11)
  490. mulpd %xmm12, %xmm8
  491. addpd %xmm8, %xmm0
  492. MOVUPS_A1(-12 * SIZE, A2, %xmm8)
  493. mulpd %xmm12, %xmm9
  494. addpd %xmm9, %xmm1
  495. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9)
  496. mulpd %xmm12, %xmm10
  497. addpd %xmm10, %xmm2
  498. MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10)
  499. mulpd %xmm12, %xmm11
  500. addpd %xmm11, %xmm3
  501. MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11)
  502. mulpd %xmm12, %xmm8
  503. addpd %xmm8, %xmm4
  504. MOVUPS_A1(-10 * SIZE, A1, %xmm8)
  505. mulpd %xmm12, %xmm9
  506. addpd %xmm9, %xmm5
  507. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9)
  508. mulpd %xmm12, %xmm10
  509. addpd %xmm10, %xmm6
  510. MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10)
  511. mulpd %xmm12, %xmm11
  512. addpd %xmm11, %xmm7
  513. MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11)
  514. mulpd %xmm13, %xmm8
  515. addpd %xmm8, %xmm0
  516. MOVUPS_A1(-10 * SIZE, A2, %xmm8)
  517. mulpd %xmm13, %xmm9
  518. addpd %xmm9, %xmm1
  519. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9)
  520. mulpd %xmm13, %xmm10
  521. addpd %xmm10, %xmm2
  522. MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10)
  523. mulpd %xmm13, %xmm11
  524. addpd %xmm11, %xmm3
  525. MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11)
  526. mulpd %xmm13, %xmm8
  527. addpd %xmm8, %xmm4
  528. mulpd %xmm13, %xmm9
  529. addpd %xmm9, %xmm5
  530. mulpd %xmm13, %xmm10
  531. addpd %xmm10, %xmm6
  532. mulpd %xmm13, %xmm11
  533. addpd %xmm11, %xmm7
  534. addq $8 * SIZE, A1
  535. addq $8 * SIZE, A2
  536. addq $8 * SIZE, X1
  537. ALIGN_4
  538. .L15:
  539. testq $4, M
  540. jle .L16
  541. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  542. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  543. MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10)
  544. MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11)
  545. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  546. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  547. mulpd %xmm12, %xmm8
  548. addpd %xmm8, %xmm0
  549. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  550. mulpd %xmm12, %xmm9
  551. addpd %xmm9, %xmm1
  552. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9)
  553. mulpd %xmm12, %xmm10
  554. addpd %xmm10, %xmm2
  555. MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10)
  556. mulpd %xmm12, %xmm11
  557. addpd %xmm11, %xmm3
  558. MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11)
  559. mulpd %xmm12, %xmm8
  560. addpd %xmm8, %xmm4
  561. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  562. mulpd %xmm12, %xmm9
  563. addpd %xmm9, %xmm5
  564. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9)
  565. mulpd %xmm12, %xmm10
  566. addpd %xmm10, %xmm6
  567. MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10)
  568. mulpd %xmm12, %xmm11
  569. addpd %xmm11, %xmm7
  570. MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11)
  571. mulpd %xmm13, %xmm8
  572. addpd %xmm8, %xmm0
  573. MOVUPS_A1(-14 * SIZE, A2, %xmm8)
  574. mulpd %xmm13, %xmm9
  575. addpd %xmm9, %xmm1
  576. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9)
  577. mulpd %xmm13, %xmm10
  578. addpd %xmm10, %xmm2
  579. MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10)
  580. mulpd %xmm13, %xmm11
  581. addpd %xmm11, %xmm3
  582. MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11)
  583. mulpd %xmm13, %xmm8
  584. addpd %xmm8, %xmm4
  585. mulpd %xmm13, %xmm9
  586. addpd %xmm9, %xmm5
  587. mulpd %xmm13, %xmm10
  588. addpd %xmm10, %xmm6
  589. mulpd %xmm13, %xmm11
  590. addpd %xmm11, %xmm7
  591. addq $4 * SIZE, A1
  592. addq $4 * SIZE, A2
  593. addq $4 * SIZE, X1
  594. ALIGN_4
  595. .L16:
  596. testq $2, M
  597. jle .L17
  598. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  599. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  600. MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10)
  601. MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11)
  602. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  603. mulpd %xmm12, %xmm8
  604. addpd %xmm8, %xmm0
  605. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  606. mulpd %xmm12, %xmm9
  607. addpd %xmm9, %xmm1
  608. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9)
  609. mulpd %xmm12, %xmm10
  610. addpd %xmm10, %xmm2
  611. MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10)
  612. mulpd %xmm12, %xmm11
  613. addpd %xmm11, %xmm3
  614. MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11)
  615. mulpd %xmm12, %xmm8
  616. addpd %xmm8, %xmm4
  617. mulpd %xmm12, %xmm9
  618. addpd %xmm9, %xmm5
  619. mulpd %xmm12, %xmm10
  620. addpd %xmm10, %xmm6
  621. mulpd %xmm12, %xmm11
  622. addpd %xmm11, %xmm7
  623. addq $2 * SIZE, A1
  624. addq $2 * SIZE, A2
  625. addq $2 * SIZE, X1
  626. ALIGN_4
  627. .L17:
  628. testq $1, M
  629. je .L18
  630. movsd -16 * SIZE(X1), %xmm12
  631. movsd -16 * SIZE(A1), %xmm8
  632. mulsd %xmm12, %xmm8
  633. addsd %xmm8, %xmm0
  634. movsd -16 * SIZE(A1, LDA), %xmm9
  635. mulsd %xmm12, %xmm9
  636. addsd %xmm9, %xmm1
  637. movsd -16 * SIZE(A1, LDA, 2), %xmm10
  638. mulsd %xmm12, %xmm10
  639. addsd %xmm10, %xmm2
  640. movsd -16 * SIZE(A1, LDA3), %xmm11
  641. mulsd %xmm12, %xmm11
  642. addsd %xmm11, %xmm3
  643. movsd -16 * SIZE(A2), %xmm8
  644. mulsd %xmm12, %xmm8
  645. addsd %xmm8, %xmm4
  646. movsd -16 * SIZE(A2, LDA), %xmm9
  647. mulsd %xmm12, %xmm9
  648. addsd %xmm9, %xmm5
  649. movsd -16 * SIZE(A2, LDA, 2), %xmm10
  650. mulsd %xmm12, %xmm10
  651. addsd %xmm10, %xmm6
  652. movsd -16 * SIZE(A2, LDA3), %xmm11
  653. mulsd %xmm12, %xmm11
  654. addsd %xmm11, %xmm7
  655. ALIGN_4
  656. .L18:
  657. #ifdef HAVE_SSE3
  658. haddpd %xmm1, %xmm0
  659. haddpd %xmm3, %xmm2
  660. haddpd %xmm5, %xmm4
  661. haddpd %xmm7, %xmm6
  662. #else
  663. movapd %xmm0, %xmm8
  664. unpcklpd %xmm1, %xmm0
  665. unpckhpd %xmm1, %xmm8
  666. movapd %xmm2, %xmm9
  667. unpcklpd %xmm3, %xmm2
  668. unpckhpd %xmm3, %xmm9
  669. movapd %xmm4, %xmm10
  670. unpcklpd %xmm5, %xmm4
  671. unpckhpd %xmm5, %xmm10
  672. movapd %xmm6, %xmm11
  673. unpcklpd %xmm7, %xmm6
  674. unpckhpd %xmm7, %xmm11
  675. addpd %xmm8, %xmm0
  676. addpd %xmm9, %xmm2
  677. addpd %xmm10, %xmm4
  678. addpd %xmm11, %xmm6
  679. #endif
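// Each accumulator still holds two partial sums per column; haddpd (or the
// unpcklpd/unpckhpd + addpd fallback) folds them into one dot product per
// column, which is then scaled by alpha and added into Y: consecutive
// elements when INCY is one element wide, otherwise element by element along
// INCY in the .L19 path.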
  680. mulpd ALPHA, %xmm0
  681. mulpd ALPHA, %xmm2
  682. mulpd ALPHA, %xmm4
  683. mulpd ALPHA, %xmm6
  684. cmpq $SIZE, INCY
  685. jne .L19
  686. movsd 0 * SIZE(Y), %xmm8
  687. movhpd 1 * SIZE(Y), %xmm8
  688. movsd 2 * SIZE(Y), %xmm9
  689. movhpd 3 * SIZE(Y), %xmm9
  690. movsd 4 * SIZE(Y), %xmm10
  691. movhpd 5 * SIZE(Y), %xmm10
  692. movsd 6 * SIZE(Y), %xmm11
  693. movhpd 7 * SIZE(Y), %xmm11
  694. addq $8 * SIZE, Y
  695. addpd %xmm8, %xmm0
  696. addpd %xmm9, %xmm2
  697. addpd %xmm10, %xmm4
  698. addpd %xmm11, %xmm6
  699. movlpd %xmm0, 0 * SIZE(Y1)
  700. movhpd %xmm0, 1 * SIZE(Y1)
  701. movlpd %xmm2, 2 * SIZE(Y1)
  702. movhpd %xmm2, 3 * SIZE(Y1)
  703. movlpd %xmm4, 4 * SIZE(Y1)
  704. movhpd %xmm4, 5 * SIZE(Y1)
  705. movlpd %xmm6, 6 * SIZE(Y1)
  706. movhpd %xmm6, 7 * SIZE(Y1)
  707. addq $8 * SIZE, Y1
  708. cmpq $8, N
  709. jge .L11
  710. jmp .L20
  711. ALIGN_4
  712. .L19:
  713. movsd (Y), %xmm8
  714. addq INCY, Y
  715. movhpd (Y), %xmm8
  716. addq INCY, Y
  717. movsd (Y), %xmm9
  718. addq INCY, Y
  719. movhpd (Y), %xmm9
  720. addq INCY, Y
  721. movsd (Y), %xmm10
  722. addq INCY, Y
  723. movhpd (Y), %xmm10
  724. addq INCY, Y
  725. movsd (Y), %xmm11
  726. addq INCY, Y
  727. movhpd (Y), %xmm11
  728. addq INCY, Y
  729. addpd %xmm8, %xmm0
  730. addpd %xmm9, %xmm2
  731. addpd %xmm10, %xmm4
  732. addpd %xmm11, %xmm6
  733. movlpd %xmm0, (Y1)
  734. addq INCY, Y1
  735. movhpd %xmm0, (Y1)
  736. addq INCY, Y1
  737. movlpd %xmm2, (Y1)
  738. addq INCY, Y1
  739. movhpd %xmm2, (Y1)
  740. addq INCY, Y1
  741. movlpd %xmm4, (Y1)
  742. addq INCY, Y1
  743. movhpd %xmm4, (Y1)
  744. addq INCY, Y1
  745. movlpd %xmm6, (Y1)
  746. addq INCY, Y1
  747. movhpd %xmm6, (Y1)
  748. addq INCY, Y1
  749. cmpq $8, N
  750. jge .L11
  751. ALIGN_4
  752. .L20:
  753. #endif
  754. #if GEMV_UNROLL >= 4
  755. cmpq $4, N
  756. jl .L30
  757. #if GEMV_UNROLL == 4
  758. ALIGN_3
  759. .L21:
  760. #endif
  761. subq $4, N
  762. leaq 16 * SIZE(BUFFER), X1
  763. movq A, A1
  764. leaq (A1, LDA, 2), A2
  765. leaq (A1, LDA, 4), A
  766. xorps %xmm0, %xmm0
  767. xorps %xmm1, %xmm1
  768. xorps %xmm2, %xmm2
  769. xorps %xmm3, %xmm3
  770. #if (GEMV_UNROLL == 4 ) && defined(PREFETCHW)
  771. PREFETCHW 3 * SIZE(Y1)
  772. #endif
  773. #ifdef ALIGNED_ACCESS
  774. testq $SIZE, A
  775. je .L2X
  776. movsd -16 * SIZE(X1), %xmm12
  777. movsd -16 * SIZE(A1), %xmm8
  778. mulsd %xmm12, %xmm8
  779. addsd %xmm8, %xmm0
  780. movsd -16 * SIZE(A1, LDA), %xmm9
  781. mulsd %xmm12, %xmm9
  782. addsd %xmm9, %xmm1
  783. movsd -16 * SIZE(A2), %xmm10
  784. mulsd %xmm12, %xmm10
  785. addsd %xmm10, %xmm2
  786. movsd -16 * SIZE(A2, LDA), %xmm11
  787. mulsd %xmm12, %xmm11
  788. addsd %xmm11, %xmm3
  789. addq $SIZE, A1
  790. addq $SIZE, A2
  791. addq $SIZE, X1
  792. ALIGN_3
  793. .L2X:
  794. #endif
  795. movq M, I
  796. sarq $3, I
  797. jle .L25
  798. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  799. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  800. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  801. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)
  802. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  803. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  804. decq I
  805. jle .L23
  806. ALIGN_4
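// Four-column variant of the same pipelined loop; the prefetch distance is
// doubled ((PREFETCHSIZE) * 2) now that each pass touches half as many column
// streams.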
  807. .L22:
  808. #ifdef PREFETCH
  809. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  810. #endif
  811. mulpd %xmm12, %xmm8
  812. addpd %xmm8, %xmm0
  813. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  814. mulpd %xmm12, %xmm9
  815. addpd %xmm9, %xmm1
  816. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9)
  817. mulpd %xmm12, %xmm10
  818. addpd %xmm10, %xmm2
  819. MOVUPS_A1(-14 * SIZE, A2, %xmm10)
  820. mulpd %xmm12, %xmm11
  821. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  822. addpd %xmm11, %xmm3
  823. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11)
  824. #ifdef PREFETCH
  825. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA)
  826. #endif
  827. mulpd %xmm13, %xmm8
  828. addpd %xmm8, %xmm0
  829. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  830. mulpd %xmm13, %xmm9
  831. addpd %xmm9, %xmm1
  832. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9)
  833. mulpd %xmm13, %xmm10
  834. addpd %xmm10, %xmm2
  835. MOVUPS_A1(-12 * SIZE, A2, %xmm10)
  836. mulpd %xmm13, %xmm11
  837. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  838. addpd %xmm11, %xmm3
  839. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11)
  840. #ifdef PREFETCH
  841. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  842. #endif
  843. mulpd %xmm12, %xmm8
  844. addpd %xmm8, %xmm0
  845. MOVUPS_A1(-10 * SIZE, A1, %xmm8)
  846. mulpd %xmm12, %xmm9
  847. addpd %xmm9, %xmm1
  848. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9)
  849. mulpd %xmm12, %xmm10
  850. addpd %xmm10, %xmm2
  851. MOVUPS_A1(-10 * SIZE, A2, %xmm10)
  852. mulpd %xmm12, %xmm11
  853. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  854. addpd %xmm11, %xmm3
  855. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11)
  856. #ifdef PREFETCH
  857. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA)
  858. #endif
  859. mulpd %xmm13, %xmm8
  860. addpd %xmm8, %xmm0
  861. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  862. mulpd %xmm13, %xmm9
  863. addpd %xmm9, %xmm1
  864. MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm9)
  865. #ifdef PREFETCHW
  866. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
  867. #endif
  868. mulpd %xmm13, %xmm10
  869. addpd %xmm10, %xmm2
  870. MOVUPS_A1( -8 * SIZE, A2, %xmm10)
  871. mulpd %xmm13, %xmm11
  872. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  873. addpd %xmm11, %xmm3
  874. MOVUPS_A2( -8 * SIZE, A2, LDA, 1, %xmm11)
  875. addq $8 * SIZE, A1
  876. addq $8 * SIZE, A2
  877. addq $8 * SIZE, X1
  878. decq I
  879. jg .L22
  880. ALIGN_4
  881. .L23:
  882. mulpd %xmm12, %xmm8
  883. addpd %xmm8, %xmm0
  884. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  885. mulpd %xmm12, %xmm9
  886. addpd %xmm9, %xmm1
  887. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9)
  888. mulpd %xmm12, %xmm10
  889. addpd %xmm10, %xmm2
  890. MOVUPS_A1(-14 * SIZE, A2, %xmm10)
  891. mulpd %xmm12, %xmm11
  892. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  893. addpd %xmm11, %xmm3
  894. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11)
  895. mulpd %xmm13, %xmm8
  896. addpd %xmm8, %xmm0
  897. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  898. mulpd %xmm13, %xmm9
  899. addpd %xmm9, %xmm1
  900. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9)
  901. mulpd %xmm13, %xmm10
  902. addpd %xmm10, %xmm2
  903. MOVUPS_A1(-12 * SIZE, A2, %xmm10)
  904. mulpd %xmm13, %xmm11
  905. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  906. addpd %xmm11, %xmm3
  907. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11)
  908. mulpd %xmm12, %xmm8
  909. addpd %xmm8, %xmm0
  910. MOVUPS_A1(-10 * SIZE, A1, %xmm8)
  911. mulpd %xmm12, %xmm9
  912. addpd %xmm9, %xmm1
  913. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9)
  914. mulpd %xmm12, %xmm10
  915. addpd %xmm10, %xmm2
  916. MOVUPS_A1(-10 * SIZE, A2, %xmm10)
  917. mulpd %xmm12, %xmm11
  918. addpd %xmm11, %xmm3
  919. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11)
  920. mulpd %xmm13, %xmm8
  921. addpd %xmm8, %xmm0
  922. mulpd %xmm13, %xmm9
  923. addpd %xmm9, %xmm1
  924. mulpd %xmm13, %xmm10
  925. addpd %xmm10, %xmm2
  926. mulpd %xmm13, %xmm11
  927. addpd %xmm11, %xmm3
  928. addq $8 * SIZE, A1
  929. addq $8 * SIZE, A2
  930. addq $8 * SIZE, X1
  931. ALIGN_4
  932. .L25:
  933. testq $4, M
  934. jle .L26
  935. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  936. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  937. mulpd %xmm12, %xmm8
  938. addpd %xmm8, %xmm0
  939. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  940. mulpd %xmm12, %xmm9
  941. addpd %xmm9, %xmm1
  942. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  943. mulpd %xmm12, %xmm10
  944. addpd %xmm10, %xmm2
  945. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)
  946. mulpd %xmm12, %xmm11
  947. addpd %xmm11, %xmm3
  948. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  949. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  950. mulpd %xmm13, %xmm8
  951. addpd %xmm8, %xmm0
  952. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9)
  953. mulpd %xmm13, %xmm9
  954. addpd %xmm9, %xmm1
  955. MOVUPS_A1(-14 * SIZE, A2, %xmm10)
  956. mulpd %xmm13, %xmm10
  957. addpd %xmm10, %xmm2
  958. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11)
  959. mulpd %xmm13, %xmm11
  960. addpd %xmm11, %xmm3
  961. addq $4 * SIZE, A1
  962. addq $4 * SIZE, A2
  963. addq $4 * SIZE, X1
  964. ALIGN_4
  965. .L26:
  966. testq $2, M
  967. jle .L27
  968. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  969. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  970. mulpd %xmm12, %xmm8
  971. addpd %xmm8, %xmm0
  972. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
  973. mulpd %xmm12, %xmm9
  974. addpd %xmm9, %xmm1
  975. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  976. mulpd %xmm12, %xmm10
  977. addpd %xmm10, %xmm2
  978. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)
  979. mulpd %xmm12, %xmm11
  980. addpd %xmm11, %xmm3
  981. addq $2 * SIZE, A1
  982. addq $2 * SIZE, A2
  983. addq $2 * SIZE, X1
  984. ALIGN_4
  985. .L27:
  986. testq $1, M
  987. je .L28
  988. movsd -16 * SIZE(X1), %xmm12
  989. movsd -16 * SIZE(A1), %xmm8
  990. mulsd %xmm12, %xmm8
  991. addsd %xmm8, %xmm0
  992. movsd -16 * SIZE(A1, LDA), %xmm9
  993. mulsd %xmm12, %xmm9
  994. addsd %xmm9, %xmm1
  995. movsd -16 * SIZE(A2), %xmm10
  996. mulsd %xmm12, %xmm10
  997. addsd %xmm10, %xmm2
  998. movsd -16 * SIZE(A2, LDA), %xmm11
  999. mulsd %xmm12, %xmm11
  1000. addsd %xmm11, %xmm3
  1001. ALIGN_4
  1002. .L28:
  1003. #ifdef HAVE_SSE3
  1004. haddpd %xmm1, %xmm0
  1005. haddpd %xmm3, %xmm2
  1006. #else
  1007. movapd %xmm0, %xmm8
  1008. unpcklpd %xmm1, %xmm0
  1009. unpckhpd %xmm1, %xmm8
  1010. movapd %xmm2, %xmm9
  1011. unpcklpd %xmm3, %xmm2
  1012. unpckhpd %xmm3, %xmm9
  1013. addpd %xmm8, %xmm0
  1014. addpd %xmm9, %xmm2
  1015. #endif
  1016. mulpd ALPHA, %xmm0
  1017. mulpd ALPHA, %xmm2
  1018. cmpq $SIZE, INCY
  1019. jne .L29
  1020. movsd 0 * SIZE(Y), %xmm4
  1021. movhpd 1 * SIZE(Y), %xmm4
  1022. movsd 2 * SIZE(Y), %xmm5
  1023. movhpd 3 * SIZE(Y), %xmm5
  1024. addq $4 * SIZE, Y
  1025. addpd %xmm4, %xmm0
  1026. addpd %xmm5, %xmm2
  1027. movlpd %xmm0, 0 * SIZE(Y1)
  1028. movhpd %xmm0, 1 * SIZE(Y1)
  1029. movlpd %xmm2, 2 * SIZE(Y1)
  1030. movhpd %xmm2, 3 * SIZE(Y1)
  1031. addq $4 * SIZE, Y1
  1032. #if GEMV_UNROLL == 4
  1033. cmpq $4, N
  1034. jge .L21
  1035. #endif
  1036. jmp .L30
  1037. ALIGN_4
  1038. .L29:
  1039. movsd (Y), %xmm4
  1040. addq INCY, Y
  1041. movhpd (Y), %xmm4
  1042. addq INCY, Y
  1043. movsd (Y), %xmm5
  1044. addq INCY, Y
  1045. movhpd (Y), %xmm5
  1046. addq INCY, Y
  1047. addpd %xmm4, %xmm0
  1048. addpd %xmm5, %xmm2
  1049. movlpd %xmm0, (Y1)
  1050. addq INCY, Y1
  1051. movhpd %xmm0, (Y1)
  1052. addq INCY, Y1
  1053. movlpd %xmm2, (Y1)
  1054. addq INCY, Y1
  1055. movhpd %xmm2, (Y1)
  1056. addq INCY, Y1
  1057. #if GEMV_UNROLL == 4
  1058. cmpq $4, N
  1059. jge .L21
  1060. #endif
  1061. ALIGN_4
  1062. .L30:
  1063. #endif
  1064. #if GEMV_UNROLL >= 2
  1065. cmpq $2, N
  1066. jl .L40
  1067. #if GEMV_UNROLL == 2
  1068. ALIGN_3
  1069. .L31:
  1070. #endif
  1071. subq $2, N
  1072. leaq 16 * SIZE(BUFFER), X1
  1073. movq A, A1
  1074. leaq (A1, LDA), A2
  1075. leaq (A1, LDA, 2), A
  1076. xorps %xmm0, %xmm0
  1077. xorps %xmm1, %xmm1
  1078. xorps %xmm2, %xmm2
  1079. xorps %xmm3, %xmm3
  1080. #if (GEMV_UNROLL == 2 ) && defined(PREFETCHW)
  1081. PREFETCHW 2 * SIZE(Y1)
  1082. #endif
  1083. #ifdef ALIGNED_ACCESS
  1084. testq $SIZE, A
  1085. je .L3X
  1086. movsd -16 * SIZE(X1), %xmm12
  1087. movsd -16 * SIZE(A1), %xmm8
  1088. mulsd %xmm12, %xmm8
  1089. addsd %xmm8, %xmm0
  1090. movsd -16 * SIZE(A2), %xmm9
  1091. mulsd %xmm12, %xmm9
  1092. addsd %xmm9, %xmm1
  1093. addq $SIZE, A1
  1094. addq $SIZE, A2
  1095. addq $SIZE, X1
  1096. ALIGN_3
  1097. .L3X:
  1098. #endif
  1099. movq M, I
  1100. sarq $3, I
  1101. jle .L35
  1102. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1103. MOVUPS_A1(-16 * SIZE, A2, %xmm9)
  1104. MOVUPS_A1(-14 * SIZE, A1, %xmm10)
  1105. MOVUPS_A1(-14 * SIZE, A2, %xmm11)
  1106. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1107. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  1108. decq I
  1109. jle .L33
  1110. ALIGN_4
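// Two-column variant: %xmm0/%xmm2 and %xmm1/%xmm3 each accumulate one column
// two-wide (combined at .L38), and the prefetch distance grows again to
// (PREFETCHSIZE) * 4.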
  1111. .L32:
  1112. #ifdef PREFETCH
  1113. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  1114. #endif
  1115. mulpd %xmm12, %xmm8
  1116. addpd %xmm8, %xmm0
  1117. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  1118. mulpd %xmm12, %xmm9
  1119. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1120. addpd %xmm9, %xmm1
  1121. MOVUPS_A1(-12 * SIZE, A2, %xmm9)
  1122. mulpd %xmm13, %xmm10
  1123. addpd %xmm10, %xmm2
  1124. MOVUPS_A1(-10 * SIZE, A1, %xmm10)
  1125. mulpd %xmm13, %xmm11
  1126. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1127. addpd %xmm11, %xmm3
  1128. MOVUPS_A1(-10 * SIZE, A2, %xmm11)
  1129. #ifdef PREFETCH
  1130. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2)
  1131. #endif
  1132. mulpd %xmm12, %xmm8
  1133. addpd %xmm8, %xmm0
  1134. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  1135. mulpd %xmm12, %xmm9
  1136. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  1137. addpd %xmm9, %xmm1
  1138. MOVUPS_A1( -8 * SIZE, A2, %xmm9)
  1139. #ifdef PREFETCHW
  1140. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
  1141. #endif
  1142. mulpd %xmm13, %xmm10
  1143. addpd %xmm10, %xmm2
  1144. MOVUPS_A1( -6 * SIZE, A1, %xmm10)
  1145. mulpd %xmm13, %xmm11
  1146. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  1147. addpd %xmm11, %xmm3
  1148. MOVUPS_A1( -6 * SIZE, A2, %xmm11)
  1149. addq $8 * SIZE, A1
  1150. addq $8 * SIZE, A2
  1151. addq $8 * SIZE, X1
  1152. decq I
  1153. jg .L32
  1154. ALIGN_4
  1155. .L33:
  1156. mulpd %xmm12, %xmm8
  1157. addpd %xmm8, %xmm0
  1158. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  1159. mulpd %xmm12, %xmm9
  1160. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1161. addpd %xmm9, %xmm1
  1162. MOVUPS_A1(-12 * SIZE, A2, %xmm9)
  1163. mulpd %xmm13, %xmm10
  1164. addpd %xmm10, %xmm2
  1165. MOVUPS_A1(-10 * SIZE, A1, %xmm10)
  1166. mulpd %xmm13, %xmm11
  1167. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1168. addpd %xmm11, %xmm3
  1169. MOVUPS_A1(-10 * SIZE, A2, %xmm11)
  1170. mulpd %xmm12, %xmm8
  1171. addpd %xmm8, %xmm0
  1172. mulpd %xmm12, %xmm9
  1173. addpd %xmm9, %xmm1
  1174. mulpd %xmm13, %xmm10
  1175. addpd %xmm10, %xmm2
  1176. mulpd %xmm13, %xmm11
  1177. addpd %xmm11, %xmm3
  1178. addq $8 * SIZE, A1
  1179. addq $8 * SIZE, A2
  1180. addq $8 * SIZE, X1
  1181. ALIGN_4
  1182. .L35:
  1183. testq $4, M
  1184. jle .L36
  1185. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1186. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1187. mulpd %xmm12, %xmm8
  1188. addpd %xmm8, %xmm0
  1189. MOVUPS_A1(-16 * SIZE, A2, %xmm9)
  1190. mulpd %xmm12, %xmm9
  1191. addpd %xmm9, %xmm1
  1192. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  1193. MOVUPS_A1(-14 * SIZE, A1, %xmm10)
  1194. mulpd %xmm13, %xmm10
  1195. addpd %xmm10, %xmm2
  1196. MOVUPS_A1(-14 * SIZE, A2, %xmm11)
  1197. mulpd %xmm13, %xmm11
  1198. addpd %xmm11, %xmm3
  1199. addq $4 * SIZE, A1
  1200. addq $4 * SIZE, A2
  1201. addq $4 * SIZE, X1
  1202. ALIGN_4
  1203. .L36:
  1204. testq $2, M
  1205. jle .L37
  1206. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1207. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1208. mulpd %xmm12, %xmm8
  1209. addpd %xmm8, %xmm0
  1210. MOVUPS_A1(-16 * SIZE, A2, %xmm9)
  1211. mulpd %xmm12, %xmm9
  1212. addpd %xmm9, %xmm1
  1213. addq $2 * SIZE, A1
  1214. addq $2 * SIZE, A2
  1215. addq $2 * SIZE, X1
  1216. ALIGN_4
  1217. .L37:
  1218. testq $1, M
  1219. je .L38
  1220. movsd -16 * SIZE(X1), %xmm12
  1221. movsd -16 * SIZE(A1), %xmm8
  1222. mulsd %xmm12, %xmm8
  1223. addsd %xmm8, %xmm0
  1224. movsd -16 * SIZE(A2), %xmm9
  1225. mulsd %xmm12, %xmm9
  1226. addsd %xmm9, %xmm1
  1227. ALIGN_4
  1228. .L38:
  1229. addpd %xmm2, %xmm0
  1230. addpd %xmm3, %xmm1
  1231. #ifdef HAVE_SSE3
  1232. haddpd %xmm1, %xmm0
  1233. #else
  1234. movapd %xmm0, %xmm8
  1235. unpcklpd %xmm1, %xmm0
  1236. unpckhpd %xmm1, %xmm8
  1237. addpd %xmm8, %xmm0
  1238. #endif
  1239. mulpd ALPHA, %xmm0
  1240. movsd (Y), %xmm4
  1241. addq INCY, Y
  1242. movhpd (Y), %xmm4
  1243. addq INCY, Y
  1244. addpd %xmm4, %xmm0
  1245. movlpd %xmm0, (Y1)
  1246. addq INCY, Y1
  1247. movhpd %xmm0, (Y1)
  1248. addq INCY, Y1
  1249. #if GEMV_UNROLL == 2
  1250. cmpq $2, N
  1251. jge .L31
  1252. #endif
  1253. ALIGN_4
  1254. .L40:
  1255. cmpq $1, N
  1256. jl .L999
  1257. #endif
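// Single remaining column (N == 1): a plain dot product of the column against
// the buffered X, accumulated two-wide in %xmm0/%xmm2, then scaled by alpha
// and added to one element of Y.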
  1258. leaq 16 * SIZE(BUFFER), X1
  1259. movq A, A1
  1260. xorps %xmm0, %xmm0
  1261. xorps %xmm1, %xmm1
  1262. xorps %xmm2, %xmm2
  1263. xorps %xmm3, %xmm3
  1264. #ifdef ALIGNED_ACCESS
  1265. testq $SIZE, A
  1266. je .L4X
  1267. movsd -16 * SIZE(X1), %xmm12
  1268. movsd -16 * SIZE(A1), %xmm8
  1269. mulsd %xmm12, %xmm8
  1270. addsd %xmm8, %xmm0
  1271. addq $SIZE, A1
  1272. addq $SIZE, X1
  1273. ALIGN_3
  1274. .L4X:
  1275. #endif
  1276. movq M, I
  1277. sarq $3, I
  1278. jle .L45
  1279. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1280. MOVUPS_A1(-14 * SIZE, A1, %xmm9)
  1281. MOVUPS_A1(-12 * SIZE, A1, %xmm10)
  1282. MOVUPS_A1(-10 * SIZE, A1, %xmm11)
  1283. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1284. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  1285. decq I
  1286. jle .L43
  1287. ALIGN_4
  1288. .L42:
  1289. #ifdef PREFETCH
  1290. PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
  1291. #endif
  1292. mulpd %xmm12, %xmm8
  1293. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1294. addpd %xmm8, %xmm0
  1295. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  1296. mulpd %xmm13, %xmm9
  1297. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1298. addpd %xmm9, %xmm2
  1299. MOVUPS_A1( -6 * SIZE, A1, %xmm9)
  1300. #ifdef PREFETCHW
  1301. PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
  1302. #endif
  1303. mulpd %xmm12, %xmm10
  1304. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  1305. addpd %xmm10, %xmm0
  1306. MOVUPS_A1( -4 * SIZE, A1, %xmm10)
  1307. mulpd %xmm13, %xmm11
  1308. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  1309. addpd %xmm11, %xmm2
  1310. MOVUPS_A1( -2 * SIZE, A1, %xmm11)
  1311. addq $8 * SIZE, A1
  1312. addq $8 * SIZE, X1
  1313. decq I
  1314. jg .L42
  1315. ALIGN_4
  1316. .L43:
  1317. mulpd %xmm12, %xmm8
  1318. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1319. addpd %xmm8, %xmm0
  1320. mulpd %xmm13, %xmm9
  1321. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1322. addpd %xmm9, %xmm2
  1323. mulpd %xmm12, %xmm10
  1324. addpd %xmm10, %xmm0
  1325. mulpd %xmm13, %xmm11
  1326. addpd %xmm11, %xmm2
  1327. addq $8 * SIZE, A1
  1328. addq $8 * SIZE, X1
  1329. ALIGN_4
  1330. .L45:
  1331. testq $4, M
  1332. jle .L46
  1333. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1334. MOVUPS_A1(-14 * SIZE, A1, %xmm9)
  1335. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1336. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  1337. mulpd %xmm12, %xmm8
  1338. addpd %xmm8, %xmm0
  1339. mulpd %xmm13, %xmm9
  1340. addpd %xmm9, %xmm2
  1341. addq $4 * SIZE, A1
  1342. addq $4 * SIZE, X1
  1343. ALIGN_4
  1344. .L46:
  1345. testq $2, M
  1346. jle .L47
  1347. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1348. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1349. mulpd %xmm12, %xmm8
  1350. addpd %xmm8, %xmm0
  1351. addq $2 * SIZE, A1
  1352. addq $2 * SIZE, X1
  1353. ALIGN_4
  1354. .L47:
  1355. testq $1, M
  1356. je .L48
  1357. movsd -16 * SIZE(X1), %xmm12
  1358. movsd -16 * SIZE(A1), %xmm8
  1359. mulsd %xmm12, %xmm8
  1360. addsd %xmm8, %xmm0
  1361. ALIGN_4
  1362. .L48:
  1363. addpd %xmm2, %xmm0
  1364. addpd %xmm3, %xmm1
  1365. addpd %xmm1, %xmm0
  1366. #ifdef HAVE_SSE3
  1367. haddpd %xmm1, %xmm0
  1368. #else
  1369. movapd %xmm0, %xmm8
  1370. unpcklpd %xmm1, %xmm0
  1371. unpckhpd %xmm1, %xmm8
  1372. addsd %xmm8, %xmm0
  1373. #endif
  1374. mulsd ALPHA, %xmm0
  1375. movsd (Y), %xmm4
  1376. addq INCY, Y
  1377. addsd %xmm4, %xmm0
  1378. movlpd %xmm0, (Y1)
  1379. addq INCY, Y1
  1380. #ifdef ALIGNED_ACCESS
  1381. jmp .L999
  1382. ALIGN_4
  1383. .L50:
  1384. #if GEMV_UNROLL >= 4
  1385. cmpq $4, N
  1386. jl .L60
  1387. ALIGN_3
  1388. .L51:
  1389. subq $4, N
  1390. leaq 16 * SIZE(BUFFER), X1
  1391. movq A, A1
  1392. leaq (A1, LDA, 2), A2
  1393. leaq (A1, LDA, 4), A
  1394. xorps %xmm0, %xmm0
  1395. xorps %xmm1, %xmm1
  1396. xorps %xmm2, %xmm2
  1397. xorps %xmm3, %xmm3
  1398. #ifdef PREFETCHW
  1399. PREFETCHW 3 * SIZE(Y1)
  1400. #endif
  1401. #ifdef ALIGNED_ACCESS
  1402. testq $SIZE, A
  1403. je .L5X
  1404. movsd -16 * SIZE(X1), %xmm12
  1405. movsd -16 * SIZE(A1), %xmm4
  1406. mulsd %xmm12, %xmm4
  1407. addsd %xmm4, %xmm0
  1408. movsd -16 * SIZE(A1, LDA), %xmm5
  1409. mulsd %xmm12, %xmm5
  1410. addsd %xmm5, %xmm1
  1411. movsd -16 * SIZE(A2), %xmm6
  1412. mulsd %xmm12, %xmm6
  1413. addsd %xmm6, %xmm2
  1414. movsd -16 * SIZE(A2, LDA), %xmm7
  1415. mulsd %xmm12, %xmm7
  1416. addsd %xmm7, %xmm3
  1417. addq $SIZE, A1
  1418. addq $SIZE, A2
  1419. addq $SIZE, X1
  1420. ALIGN_3
  1421. .L5X:
  1422. #endif
  1423. movhpd -16 * SIZE(A1, LDA), %xmm8
  1424. movhpd -16 * SIZE(A2, LDA), %xmm9
  1425. movq M, I
  1426. sarq $3, I
  1427. jle .L55
  1428. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  1429. MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
  1430. MOVUPS_A1(-16 * SIZE, A2, %xmm6)
  1431. MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
  1432. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1433. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  1434. decq I
  1435. jle .L53
  1436. ALIGN_4
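// .L52 inner loop for the odd-LDA case: columns A1 and A2 use aligned packed
// loads, while their neighbours (A1 + LDA, A2 + LDA) are read one element
// ahead and stitched back together with shufpd against the carried high
// halves preloaded into %xmm8/%xmm9.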
  1437. .L52:
  1438. #ifdef PREFETCH
  1439. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  1440. #endif
  1441. mulpd %xmm12, %xmm4
  1442. addpd %xmm4, %xmm0
  1443. MOVUPS_A1(-14 * SIZE, A1, %xmm4)
  1444. shufpd $1, %xmm5, %xmm8
  1445. mulpd %xmm12, %xmm8
  1446. addpd %xmm8, %xmm1
  1447. MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
  1448. mulpd %xmm12, %xmm6
  1449. addpd %xmm6, %xmm2
  1450. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  1451. shufpd $1, %xmm7, %xmm9
  1452. mulpd %xmm12, %xmm9
  1453. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1454. addpd %xmm9, %xmm3
  1455. MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
  1456. #ifdef PREFETCH
  1457. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA)
  1458. #endif
  1459. mulpd %xmm13, %xmm4
  1460. addpd %xmm4, %xmm0
  1461. MOVUPS_A1(-12 * SIZE, A1, %xmm4)
  1462. shufpd $1, %xmm8, %xmm5
  1463. mulpd %xmm13, %xmm5
  1464. addpd %xmm5, %xmm1
  1465. MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5)
  1466. mulpd %xmm13, %xmm6
  1467. addpd %xmm6, %xmm2
  1468. MOVUPS_A1(-12 * SIZE, A2, %xmm6)
  1469. shufpd $1, %xmm9, %xmm7
  1470. mulpd %xmm13, %xmm7
  1471. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1472. addpd %xmm7, %xmm3
  1473. MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7)
  1474. #ifdef PREFETCH
  1475. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  1476. #endif
  1477. mulpd %xmm12, %xmm4
  1478. addpd %xmm4, %xmm0
  1479. MOVUPS_A1(-10 * SIZE, A1, %xmm4)
  1480. shufpd $1, %xmm5, %xmm8
  1481. mulpd %xmm12, %xmm8
  1482. addpd %xmm8, %xmm1
  1483. MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
  1484. mulpd %xmm12, %xmm6
  1485. addpd %xmm6, %xmm2
  1486. MOVUPS_A1(-10 * SIZE, A2, %xmm6)
  1487. shufpd $1, %xmm7, %xmm9
  1488. mulpd %xmm12, %xmm9
  1489. MOVUPS_XL1(-8 * SIZE, X1, %xmm12)
  1490. addpd %xmm9, %xmm3
  1491. MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
  1492. #ifdef PREFETCH
  1493. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA)
  1494. #endif
  1495. mulpd %xmm13, %xmm4
  1496. addpd %xmm4, %xmm0
  1497. MOVUPS_A1(-8 * SIZE, A1, %xmm4)
  1498. shufpd $1, %xmm8, %xmm5
  1499. mulpd %xmm13, %xmm5
  1500. addpd %xmm5, %xmm1
  1501. MOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5)
  1502. #ifdef PREFETCHW
  1503. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(X1)
  1504. #endif
  1505. mulpd %xmm13, %xmm6
  1506. addpd %xmm6, %xmm2
  1507. MOVUPS_A1(-8 * SIZE, A2, %xmm6)
  1508. shufpd $1, %xmm9, %xmm7
  1509. mulpd %xmm13, %xmm7
  1510. MOVUPS_XL1(-6 * SIZE, X1, %xmm13)
  1511. addpd %xmm7, %xmm3
  1512. MOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7)
  1513. addq $8 * SIZE, A1
  1514. addq $8 * SIZE, A2
  1515. addq $8 * SIZE, X1
  1516. decq I
  1517. jg .L52
  1518. ALIGN_4
.L53:	/* drain of the software-pipelined 8-row loop: last pre-loaded data */
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-14 * SIZE, A1, %xmm4)

	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)

	shufpd	$1, %xmm7, %xmm9
	mulpd	%xmm12, %xmm9
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd	%xmm9, %xmm3
	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm4)

	shufpd	$1, %xmm8, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm1
	MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-12 * SIZE, A2, %xmm6)

	shufpd	$1, %xmm9, %xmm7
	mulpd	%xmm13, %xmm7
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd	%xmm7, %xmm3
	MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm4)

	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-10 * SIZE, A2, %xmm6)

	shufpd	$1, %xmm7, %xmm9
	mulpd	%xmm12, %xmm9
	addpd	%xmm9, %xmm3
	MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm8, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm1

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm2
	shufpd	$1, %xmm9, %xmm7
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm3

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

.L55:	/* M & 4: four leftover rows */
	testq	$4, M
	jle	.L56

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
	MOVUPS_A1(-16 * SIZE, A2, %xmm6)
	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-14 * SIZE, A1, %xmm4)
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	MOVUPS_A1(-14 * SIZE, A2, %xmm6)
	shufpd	$1, %xmm7, %xmm9
	mulpd	%xmm12, %xmm9
	addpd	%xmm9, %xmm3
	MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)

	mulpd	%xmm13, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm8, %xmm5
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm1

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm2
	shufpd	$1, %xmm9, %xmm7
	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm3

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

.L56:	/* M & 2: two leftover rows */
	testq	$2, M
	jle	.L57

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
	MOVUPS_A1(-16 * SIZE, A2, %xmm6)
	MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	movaps	%xmm5, %xmm8

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm2
	shufpd	$1, %xmm7, %xmm9
	mulpd	%xmm12, %xmm9
	addpd	%xmm9, %xmm3
	movaps	%xmm7, %xmm9

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

.L57:	/* M & 1: last row */
	testq	$1, M
	je	.L58

	movsd	-16 * SIZE(X1), %xmm12

	movsd	-16 * SIZE(A1), %xmm4
	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0

	shufpd	$1, %xmm8, %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1

	movsd	-16 * SIZE(A2), %xmm6
	mulsd	%xmm12, %xmm6
	addsd	%xmm6, %xmm2

	shufpd	$1, %xmm9, %xmm9
	mulsd	%xmm12, %xmm9
	addsd	%xmm9, %xmm3
	ALIGN_4

.L58:	/* reduce the four packed accumulators to scalar dot products */
#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
#else
	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4

	movapd	%xmm2, %xmm5
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm5

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2
#endif
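/*
 * Reference note (added comment, not original code): both branches above
 * collapse the packed partial sums into scalar dot products.  With SSE3,
 * haddpd %xmm1, %xmm0 leaves lo(xmm0)+hi(xmm0) in the low half of %xmm0 and
 * lo(xmm1)+hi(xmm1) in its high half; the SSE2 fallback reaches the same
 * layout with unpcklpd/unpckhpd followed by addpd.  A scalar C sketch, with
 * s0..s3 standing in for the two-element accumulators (names assumed):
 *
 *     double y0 = s0[0] + s0[1], y1 = s1[0] + s1[1];
 *     double y2 = s2[0] + s2[1], y3 = s3[0] + s3[1];
 */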
	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm2

	cmpq	$SIZE, INCY
	jne	.L59

	movsd	0 * SIZE(Y), %xmm4
	movhpd	1 * SIZE(Y), %xmm4
	movsd	2 * SIZE(Y), %xmm5
	movhpd	3 * SIZE(Y), %xmm5
	addq	$4 * SIZE, Y

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	movlpd	%xmm2, 2 * SIZE(Y1)
	movhpd	%xmm2, 3 * SIZE(Y1)
	addq	$4 * SIZE, Y1

	cmpq	$4, N
	jge	.L51
	jmp	.L60
	ALIGN_4

.L59:	/* INCY != 1: update y element by element with stride INCY */
	movsd	(Y), %xmm4
	addq	INCY, Y
	movhpd	(Y), %xmm4
	addq	INCY, Y
	movsd	(Y), %xmm5
	addq	INCY, Y
	movhpd	(Y), %xmm5
	addq	INCY, Y

	addpd	%xmm4, %xmm0
	addpd	%xmm5, %xmm2

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1
	movlpd	%xmm2, (Y1)
	addq	INCY, Y1
	movhpd	%xmm2, (Y1)
	addq	INCY, Y1

	cmpq	$4, N
	jge	.L51
	ALIGN_4
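/*
 * Reference note (added comment, not original code): the contiguous path and
 * the .L59 path above perform the same update, y[j] += alpha * dot[j] for the
 * four columns of this pass; .L59 only differs in walking Y and Y1 with the
 * byte stride INCY.  Sketched in C, with incy in elements and dot[] an
 * illustrative name for the four reduced sums:
 *
 *     for (int j = 0; j < 4; j++)
 *         y[j * incy] += alpha * dot[j];
 */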
.L60:
#endif

#if GEMV_UNROLL >= 2

	cmpq	$2, N
	jl	.L70

#if GEMV_UNROLL == 2
	ALIGN_3

.L61:
#endif

	subq	$2, N

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1
	leaq	(A1, LDA), A2
	leaq	(A1, LDA, 2), A

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#if (GEMV_UNROLL == 2 ) && defined(PREFETCHW)
	PREFETCHW	2 * SIZE(Y1)
#endif

#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	je	.L6X

	movsd	-16 * SIZE(X1), %xmm12

	movsd	-16 * SIZE(A1), %xmm4
	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	movsd	-16 * SIZE(A2), %xmm5
	mulsd	%xmm12, %xmm5
	addsd	%xmm5, %xmm1

	addq	$SIZE, A1
	addq	$SIZE, A2
	addq	$SIZE, X1
	ALIGN_3

.L6X:
#endif
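/*
 * Reference note (added comment, not original code): with ALIGNED_ACCESS,
 * testq $SIZE, A checks whether the column starts on an odd double boundary;
 * if so, the block above handles that first row with scalar movsd/mulsd/addsd
 * so that the packed loads which follow can be 16-byte aligned.  Sketched in
 * C (a, a1, a2, x, s0, s1 are illustrative names):
 *
 *     if (((uintptr_t) a) & sizeof(double)) {
 *         s0 += a1[0] * x[0];
 *         s1 += a2[0] * x[0];
 *         a1 += 1; a2 += 1; x += 1;
 *     }
 */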
	movhpd	-16 * SIZE(A2), %xmm8

	movq	M, I
	sarq	$3, I
	jle	.L65

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-15 * SIZE, A2, %xmm5)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
	MOVUPS_A1(-13 * SIZE, A2, %xmm7)

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)

	decq	I
	jle	.L63
	ALIGN_4
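/*
 * Reference sketch (added comment, not original code): each pass of the .L62
 * loop below consumes eight rows and keeps one packed accumulator per column,
 * %xmm0 for the A1 column and %xmm1 for the A2 column (A2 is loaded at an odd
 * offset and realigned with shufpd).  Per iteration, roughly, in C:
 *
 *     for (int k = 0; k < 8; k++) {
 *         s0 += a1[i + k] * x[i + k];
 *         s1 += a2[i + k] * x[i + k];
 *     }
 *     i += 8;
 */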
.L62:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm4)
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd	%xmm8, %xmm1
	MOVUPS_A1(-11 * SIZE, A2, %xmm9)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm6)
	shufpd	$1, %xmm7, %xmm5
	mulpd	%xmm13, %xmm5
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd	%xmm5, %xmm1
	MOVUPS_A1( -9 * SIZE, A2, %xmm8)

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2)
#endif

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-8 * SIZE, A1, %xmm4)
	shufpd	$1, %xmm9, %xmm7
	mulpd	%xmm12, %xmm7
	MOVUPS_XL1(-8 * SIZE, X1, %xmm12)
	addpd	%xmm7, %xmm1
	MOVUPS_A1(-7 * SIZE, A2, %xmm5)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(X1)
#endif

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A1(-6 * SIZE, A1, %xmm6)
	shufpd	$1, %xmm8, %xmm9
	mulpd	%xmm13, %xmm9
	MOVUPS_XL1(-6 * SIZE, X1, %xmm13)
	addpd	%xmm9, %xmm1
	MOVUPS_A1(-5 * SIZE, A2, %xmm7)

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1

	decq	I
	jg	.L62
	ALIGN_4

.L63:	/* drain of the software-pipelined 8-row loop */
	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	MOVUPS_A1(-12 * SIZE, A1, %xmm4)
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd	%xmm8, %xmm1
	MOVUPS_A1(-11 * SIZE, A2, %xmm9)

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm0
	MOVUPS_A1(-10 * SIZE, A1, %xmm6)
	shufpd	$1, %xmm7, %xmm5
	mulpd	%xmm13, %xmm5
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd	%xmm5, %xmm1
	MOVUPS_A1( -9 * SIZE, A2, %xmm8)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm9, %xmm7
	mulpd	%xmm12, %xmm7
	addpd	%xmm7, %xmm1

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm0
	shufpd	$1, %xmm8, %xmm9
	mulpd	%xmm13, %xmm9
	addpd	%xmm9, %xmm1

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	addq	$8 * SIZE, X1
	ALIGN_4

.L65:	/* M & 4: four leftover rows */
	testq	$4, M
	jle	.L66

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-15 * SIZE, A2, %xmm5)
	MOVUPS_A1(-14 * SIZE, A1, %xmm6)
	MOVUPS_A1(-13 * SIZE, A2, %xmm7)

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	mulpd	%xmm13, %xmm6
	addpd	%xmm6, %xmm0
	shufpd	$1, %xmm7, %xmm5
	movaps	%xmm7, %xmm8
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm1

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	addq	$4 * SIZE, X1
	ALIGN_4

.L66:	/* M & 2: two leftover rows */
	testq	$2, M
	jle	.L67

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-15 * SIZE, A2, %xmm5)
	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	shufpd	$1, %xmm5, %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	movaps	%xmm5, %xmm8

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	addq	$2 * SIZE, X1
	ALIGN_4

.L67:	/* M & 1: last row */
	testq	$1, M
	je	.L68

	movsd	-16 * SIZE(X1), %xmm12

	movsd	-16 * SIZE(A1), %xmm4
	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	shufpd	$1, %xmm8, %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1
	ALIGN_4
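/*
 * Reference note (added comment, not original code): .L68 folds the (possibly
 * zero) second accumulator pair into the first, horizontally sums each
 * register to get the two dot products, scales them by alpha and adds them
 * into y at stride INCY.  In C, with s0 and s1 illustrative names for the
 * reduced sums:
 *
 *     y[0 * incy] += alpha * s0;
 *     y[1 * incy] += alpha * s1;
 */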
.L68:
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
#else
	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4
	addpd	%xmm4, %xmm0
#endif

	mulpd	ALPHA, %xmm0

	movsd	(Y), %xmm4
	addq	INCY, Y
	movhpd	(Y), %xmm4
	addq	INCY, Y

	addpd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
	movhpd	%xmm0, (Y1)
	addq	INCY, Y1

#if GEMV_UNROLL == 2
	cmpq	$2, N
	jge	.L61
#endif
	ALIGN_4

.L70:	/* last remaining column (N odd) */
	cmpq	$1, N
	jl	.L999
#endif

	leaq	16 * SIZE(BUFFER), X1

	movq	A, A1

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#ifdef ALIGNED_ACCESS
	testq	$SIZE, A
	je	.L7X

	movsd	-16 * SIZE(X1), %xmm12
	movsd	-16 * SIZE(A1), %xmm4
	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0

	addq	$SIZE, A1
	addq	$SIZE, X1
	ALIGN_3

.L7X:
#endif

	movq	M, I
	sarq	$3, I
	jle	.L75

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)
	MOVUPS_A1(-12 * SIZE, A1, %xmm6)
	MOVUPS_A1(-10 * SIZE, A1, %xmm7)

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)

	decq	I
	jle	.L73
	ALIGN_4
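/*
 * Reference sketch (added comment, not original code): the .L72 loop below is
 * a plain dot product of the last column with x, unrolled by eight rows and
 * using two packed accumulators (%xmm0 and %xmm2) that are combined at .L78.
 * In C, with a1, x, m, alpha and y as illustrative stand-ins:
 *
 *     double s = 0.0;
 *     for (long i = 0; i < m; i++)
 *         s += a1[i] * x[i];
 *     y[0] += alpha * s;
 */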
.L72:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1)
#endif

	mulpd	%xmm12, %xmm4
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd	%xmm4, %xmm0
	MOVUPS_A1( -8 * SIZE, A1, %xmm4)

	mulpd	%xmm13, %xmm5
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd	%xmm5, %xmm2
	MOVUPS_A1( -6 * SIZE, A1, %xmm5)

#ifdef PREFETCHW
	PREFETCH	(PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1)
#endif

	mulpd	%xmm12, %xmm6
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd	%xmm6, %xmm0
	MOVUPS_A1( -4 * SIZE, A1, %xmm6)

	mulpd	%xmm13, %xmm7
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd	%xmm7, %xmm2
	MOVUPS_A1( -2 * SIZE, A1, %xmm7)

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, X1

	decq	I
	jg	.L72
	ALIGN_4

.L73:	/* drain of the software-pipelined 8-row loop */
	mulpd	%xmm12, %xmm4
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd	%xmm4, %xmm0

	mulpd	%xmm13, %xmm5
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd	%xmm5, %xmm2

	mulpd	%xmm12, %xmm6
	addpd	%xmm6, %xmm0

	mulpd	%xmm13, %xmm7
	addpd	%xmm7, %xmm2

	addq	$8 * SIZE, A1
	addq	$8 * SIZE, X1
	ALIGN_4

.L75:	/* M & 4: four leftover rows */
	testq	$4, M
	jle	.L76

	MOVUPS_A1(-16 * SIZE, A1, %xmm4)
	MOVUPS_A1(-14 * SIZE, A1, %xmm5)

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm13, %xmm5
	addpd	%xmm5, %xmm2

	addq	$4 * SIZE, A1
	addq	$4 * SIZE, X1
	ALIGN_4

.L76:	/* M & 2: two leftover rows */
	testq	$2, M
	jle	.L77

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	MOVUPS_A1(-16 * SIZE, A1, %xmm4)

	mulpd	%xmm12, %xmm4
	addpd	%xmm4, %xmm0

	addq	$2 * SIZE, A1
	addq	$2 * SIZE, X1
	ALIGN_4

.L77:	/* M & 1: last row */
	testq	$1, M
	je	.L78

	movsd	-16 * SIZE(X1), %xmm12
	movsd	-16 * SIZE(A1), %xmm4
	mulsd	%xmm12, %xmm4
	addsd	%xmm4, %xmm0
	ALIGN_4

.L78:	/* reduce, scale by alpha and update the last y element */
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1
	addpd	%xmm1, %xmm0

#ifdef HAVE_SSE3
	haddpd	%xmm1, %xmm0
#else
	movapd	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4
	addsd	%xmm4, %xmm0
#endif

	mulsd	ALPHA, %xmm0

	movsd	(Y), %xmm4
	addq	INCY, Y

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	addq	INCY, Y1
#endif
	ALIGN_4

.L999:
	leaq	(, M, SIZE), %rax
	addq	%rax, AA
	jmp	.L0x
	ALIGN_4

.L999x:	/* restore callee-saved registers, release the stack frame and return */
	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	ALIGN_4

	EPILOGUE