You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zdot_sse.S 67 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preamble: build-mode switch, shared headers, and symbolic names for */
  /* the five arguments used by the routine that follows.                */
  /* ASSEMBLER is defined before common.h is included; presumably it     */
  /* selects the assembler-only part of that header - TODO confirm.      */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument aliases. ARG1..ARG5 are not defined in this file           */
  /* (presumably they come from common.h). The register names in the     */
  /* trailing comments look like the non-Windows mapping - confirm       */
  /* against common.h before relying on them.                            */
  /* N is tested with testq/jle right after the accumulators are         */
  /* cleared, so a non-positive N branches straight to .L999.            */
  40. #define N ARG1 /* rdi */
  41. #define X ARG2 /* rsi */
  /* INCX and INCY are shifted left by ZBASE_SHIFT at entry and then     */
  /* compared against 2 * SIZE to choose the contiguous code path.       */
  42. #define INCX ARG3 /* rdx */
  43. #define Y ARG4 /* rcx */
  44. #ifndef WINDOWS_ABI
  45. #define INCY ARG5 /* r8 */
  46. #else
  /* Under WINDOWS_ABI, INCY is kept in %r10; the entry code loads it    */
  /* from 40(%rsp) before SAVEREGISTERS.                                 */
  47. #define INCY %r10
  48. #endif
  /* NOTE(review): l1param.h presumably supplies the PREFETCH,           */
  /* PREFETCHSIZE and PREOFFSET settings tested further down - confirm.  */
  49. #include "l1param.h"
  50. PROLOGUE
  51. PROFCODE
  52. #ifdef WINDOWS_ABI
  53. movq 40(%rsp), INCY
  54. #endif
  55. SAVEREGISTERS
  56. salq $ZBASE_SHIFT, INCX
  57. salq $ZBASE_SHIFT, INCY
  58. xorps %xmm0, %xmm0
  59. xorps %xmm1, %xmm1
  60. xorps %xmm2, %xmm2
  61. xorps %xmm3, %xmm3
  62. testq N, N
  63. jle .L999
  64. cmpq $2 * SIZE, INCX
  65. jne .L200
  66. cmpq $2 * SIZE, INCY
  67. jne .L200
  68. subq $-32 * SIZE, X
  69. subq $-32 * SIZE, Y
  70. testq $SIZE, X
  71. jne .L50
  72. .L0x:
  73. testq $2 * SIZE, X
  74. je .L10
  75. #ifdef movsd
  76. xorps %xmm4, %xmm4
  77. #endif
  78. movsd -32 * SIZE(X), %xmm4
  79. movsd -32 * SIZE(Y), %xmm0
  80. pshufd $0xb1, %xmm0, %xmm1
  81. mulps %xmm4, %xmm0
  82. mulps %xmm4, %xmm1
  83. addq $2 * SIZE, X
  84. addq $2 * SIZE, Y
  85. decq N
  86. ALIGN_3
  87. .L10:
  88. testq $3 * SIZE, Y
  89. jne .L20
  90. movq N, %rax
  91. sarq $4, %rax
  92. jle .L15
  93. movaps -32 * SIZE(X), %xmm4
  94. movaps -28 * SIZE(X), %xmm5
  95. movaps -32 * SIZE(Y), %xmm8
  96. movaps -28 * SIZE(Y), %xmm9
  97. movaps -24 * SIZE(X), %xmm6
  98. movaps -20 * SIZE(X), %xmm7
  99. movaps -24 * SIZE(Y), %xmm10
  100. movaps -20 * SIZE(Y), %xmm11
  101. decq %rax
  102. jle .L12
  103. ALIGN_3
  104. .L11:
  105. #ifdef PREFETCH
  106. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  107. #endif
  108. pshufd $0xb1, %xmm8, %xmm12
  109. mulps %xmm4, %xmm8
  110. addps %xmm8, %xmm0
  111. movaps -16 * SIZE(Y), %xmm8
  112. mulps %xmm4, %xmm12
  113. movaps -16 * SIZE(X), %xmm4
  114. addps %xmm12, %xmm1
  115. pshufd $0xb1, %xmm9, %xmm12
  116. mulps %xmm5, %xmm9
  117. addps %xmm9, %xmm2
  118. movaps -12 * SIZE(Y), %xmm9
  119. mulps %xmm5, %xmm12
  120. movaps -12 * SIZE(X), %xmm5
  121. addps %xmm12, %xmm3
  122. #ifdef PREFETCH
  123. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  124. #endif
  125. pshufd $0xb1, %xmm10, %xmm12
  126. mulps %xmm6, %xmm10
  127. addps %xmm10, %xmm0
  128. movaps -8 * SIZE(Y), %xmm10
  129. mulps %xmm6, %xmm12
  130. movaps -8 * SIZE(X), %xmm6
  131. addps %xmm12, %xmm1
  132. pshufd $0xb1, %xmm11, %xmm12
  133. mulps %xmm7, %xmm11
  134. addps %xmm11, %xmm2
  135. movaps -4 * SIZE(Y), %xmm11
  136. mulps %xmm7, %xmm12
  137. movaps -4 * SIZE(X), %xmm7
  138. addps %xmm12, %xmm3
  139. #if defined(PREFETCH) && !defined(FETCH128)
  140. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  141. #endif
  142. pshufd $0xb1, %xmm8, %xmm12
  143. mulps %xmm4, %xmm8
  144. addps %xmm8, %xmm0
  145. movaps 0 * SIZE(Y), %xmm8
  146. mulps %xmm4, %xmm12
  147. movaps 0 * SIZE(X), %xmm4
  148. addps %xmm12, %xmm1
  149. pshufd $0xb1, %xmm9, %xmm12
  150. mulps %xmm5, %xmm9
  151. addps %xmm9, %xmm2
  152. movaps 4 * SIZE(Y), %xmm9
  153. mulps %xmm5, %xmm12
  154. movaps 4 * SIZE(X), %xmm5
  155. addps %xmm12, %xmm3
  156. #if defined(PREFETCH) && !defined(FETCH128)
  157. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  158. #endif
  159. pshufd $0xb1, %xmm10, %xmm12
  160. mulps %xmm6, %xmm10
  161. addps %xmm10, %xmm0
  162. movaps 8 * SIZE(Y), %xmm10
  163. mulps %xmm6, %xmm12
  164. movaps 8 * SIZE(X), %xmm6
  165. addps %xmm12, %xmm1
  166. pshufd $0xb1, %xmm11, %xmm12
  167. mulps %xmm7, %xmm11
  168. addps %xmm11, %xmm2
  169. movaps 12 * SIZE(Y), %xmm11
  170. mulps %xmm7, %xmm12
  171. movaps 12 * SIZE(X), %xmm7
  172. addps %xmm12, %xmm3
  173. subq $-32 * SIZE, X
  174. subq $-32 * SIZE, Y
  175. decq %rax
  176. jg .L11
  177. ALIGN_3
  178. .L12:
  179. pshufd $0xb1, %xmm8, %xmm12
  180. mulps %xmm4, %xmm8
  181. addps %xmm8, %xmm0
  182. movaps -16 * SIZE(Y), %xmm8
  183. mulps %xmm4, %xmm12
  184. movaps -16 * SIZE(X), %xmm4
  185. addps %xmm12, %xmm1
  186. pshufd $0xb1, %xmm9, %xmm12
  187. mulps %xmm5, %xmm9
  188. addps %xmm9, %xmm2
  189. movaps -12 * SIZE(Y), %xmm9
  190. mulps %xmm5, %xmm12
  191. movaps -12 * SIZE(X), %xmm5
  192. addps %xmm12, %xmm3
  193. pshufd $0xb1, %xmm10, %xmm12
  194. mulps %xmm6, %xmm10
  195. addps %xmm10, %xmm0
  196. movaps -8 * SIZE(Y), %xmm10
  197. mulps %xmm6, %xmm12
  198. movaps -8 * SIZE(X), %xmm6
  199. addps %xmm12, %xmm1
  200. pshufd $0xb1, %xmm11, %xmm12
  201. mulps %xmm7, %xmm11
  202. addps %xmm11, %xmm2
  203. movaps -4 * SIZE(Y), %xmm11
  204. mulps %xmm7, %xmm12
  205. movaps -4 * SIZE(X), %xmm7
  206. addps %xmm12, %xmm3
  207. pshufd $0xb1, %xmm8, %xmm12
  208. mulps %xmm4, %xmm8
  209. addps %xmm8, %xmm0
  210. mulps %xmm4, %xmm12
  211. addps %xmm12, %xmm1
  212. pshufd $0xb1, %xmm9, %xmm12
  213. mulps %xmm5, %xmm9
  214. addps %xmm9, %xmm2
  215. mulps %xmm5, %xmm12
  216. addps %xmm12, %xmm3
  217. pshufd $0xb1, %xmm10, %xmm12
  218. mulps %xmm6, %xmm10
  219. addps %xmm10, %xmm0
  220. mulps %xmm6, %xmm12
  221. addps %xmm12, %xmm1
  222. pshufd $0xb1, %xmm11, %xmm12
  223. mulps %xmm7, %xmm11
  224. addps %xmm11, %xmm2
  225. mulps %xmm7, %xmm12
  226. addps %xmm12, %xmm3
  227. subq $-32 * SIZE, X
  228. subq $-32 * SIZE, Y
  229. ALIGN_3
  230. .L15:
  231. testq $8, N
  232. jle .L16
  233. movaps -32 * SIZE(X), %xmm4
  234. movaps -32 * SIZE(Y), %xmm8
  235. pshufd $0xb1, %xmm8, %xmm12
  236. mulps %xmm4, %xmm8
  237. addps %xmm8, %xmm0
  238. mulps %xmm4, %xmm12
  239. addps %xmm12, %xmm1
  240. movaps -28 * SIZE(X), %xmm5
  241. movaps -28 * SIZE(Y), %xmm9
  242. pshufd $0xb1, %xmm9, %xmm12
  243. mulps %xmm5, %xmm9
  244. addps %xmm9, %xmm2
  245. mulps %xmm5, %xmm12
  246. addps %xmm12, %xmm3
  247. movaps -24 * SIZE(X), %xmm6
  248. movaps -24 * SIZE(Y), %xmm10
  249. pshufd $0xb1, %xmm10, %xmm12
  250. mulps %xmm6, %xmm10
  251. addps %xmm10, %xmm0
  252. mulps %xmm6, %xmm12
  253. addps %xmm12, %xmm1
  254. movaps -20 * SIZE(X), %xmm7
  255. movaps -20 * SIZE(Y), %xmm11
  256. pshufd $0xb1, %xmm11, %xmm12
  257. mulps %xmm7, %xmm11
  258. addps %xmm11, %xmm2
  259. mulps %xmm7, %xmm12
  260. addps %xmm12, %xmm3
  261. addq $16 * SIZE, X
  262. addq $16 * SIZE, Y
  263. ALIGN_3
  264. .L16:
  265. testq $4, N
  266. jle .L17
  267. movaps -32 * SIZE(X), %xmm4
  268. movaps -32 * SIZE(Y), %xmm8
  269. movaps -28 * SIZE(X), %xmm5
  270. movaps -28 * SIZE(Y), %xmm9
  271. pshufd $0xb1, %xmm8, %xmm12
  272. mulps %xmm4, %xmm8
  273. addps %xmm8, %xmm0
  274. mulps %xmm4, %xmm12
  275. addps %xmm12, %xmm1
  276. pshufd $0xb1, %xmm9, %xmm12
  277. mulps %xmm5, %xmm9
  278. addps %xmm9, %xmm2
  279. mulps %xmm5, %xmm12
  280. addps %xmm12, %xmm3
  281. addq $8 * SIZE, X
  282. addq $8 * SIZE, Y
  283. ALIGN_3
  284. .L17:
  285. testq $2, N
  286. jle .L18
  287. movaps -32 * SIZE(X), %xmm4
  288. movaps -32 * SIZE(Y), %xmm8
  289. pshufd $0xb1, %xmm8, %xmm12
  290. mulps %xmm4, %xmm8
  291. addps %xmm8, %xmm0
  292. mulps %xmm4, %xmm12
  293. addps %xmm12, %xmm1
  294. addq $4 * SIZE, X
  295. addq $4 * SIZE, Y
  296. ALIGN_3
  297. .L18:
  298. testq $1, N
  299. jle .L98
  300. #ifdef movsd
  301. xorps %xmm4, %xmm4
  302. #endif
  303. movsd -32 * SIZE(X), %xmm4
  304. #ifdef movsd
  305. xorps %xmm8, %xmm8
  306. #endif
  307. movsd -32 * SIZE(Y), %xmm8
  308. pshufd $0xb1, %xmm8, %xmm12
  309. mulps %xmm4, %xmm8
  310. addps %xmm8, %xmm0
  311. mulps %xmm4, %xmm12
  312. addps %xmm12, %xmm1
  313. jmp .L98
  314. ALIGN_3
  315. .L20:
  316. #ifdef ALIGNED_ACCESS
  317. testq $2 * SIZE, Y
  318. jne .L30
  319. movaps -33 * SIZE(Y), %xmm8
  320. addq $3 * SIZE, Y
  321. shufps $0xb1, %xmm1, %xmm1
  322. movq N, %rax
  323. sarq $4, %rax
  324. jle .L25
  325. movaps -32 * SIZE(X), %xmm4
  326. movaps -32 * SIZE(Y), %xmm9
  327. movaps -28 * SIZE(X), %xmm5
  328. movaps -28 * SIZE(Y), %xmm10
  329. movaps -24 * SIZE(X), %xmm6
  330. movaps -24 * SIZE(Y), %xmm11
  331. movaps -20 * SIZE(X), %xmm7
  332. decq %rax
  333. jle .L22
  334. ALIGN_3
  335. .L21:
  336. #ifdef PREFETCH
  337. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  338. #endif
  339. movss %xmm9, %xmm8
  340. pshufd $0xb1, %xmm4, %xmm12
  341. shufps $0x39, %xmm8, %xmm8
  342. mulps %xmm8, %xmm4
  343. addps %xmm4, %xmm0
  344. movaps -16 * SIZE(X), %xmm4
  345. mulps %xmm8, %xmm12
  346. movaps -20 * SIZE(Y), %xmm8
  347. addps %xmm12, %xmm1
  348. movss %xmm10, %xmm9
  349. pshufd $0xb1, %xmm5, %xmm12
  350. shufps $0x39, %xmm9, %xmm9
  351. mulps %xmm9, %xmm5
  352. addps %xmm5, %xmm0
  353. movaps -12 * SIZE(X), %xmm5
  354. mulps %xmm9, %xmm12
  355. movaps -16 * SIZE(Y), %xmm9
  356. addps %xmm12, %xmm1
  357. #ifdef PREFETCH
  358. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  359. #endif
  360. movss %xmm11, %xmm10
  361. pshufd $0xb1, %xmm6, %xmm12
  362. shufps $0x39, %xmm10, %xmm10
  363. mulps %xmm10, %xmm6
  364. addps %xmm6, %xmm0
  365. movaps -8 * SIZE(X), %xmm6
  366. mulps %xmm10, %xmm12
  367. movaps -12 * SIZE(Y), %xmm10
  368. addps %xmm12, %xmm1
  369. movss %xmm8, %xmm11
  370. pshufd $0xb1, %xmm7, %xmm12
  371. shufps $0x39, %xmm11, %xmm11
  372. mulps %xmm11, %xmm7
  373. addps %xmm7, %xmm0
  374. movaps -4 * SIZE(X), %xmm7
  375. mulps %xmm11, %xmm12
  376. movaps -8 * SIZE(Y), %xmm11
  377. addps %xmm12, %xmm1
  378. #if defined(PREFETCH) && !defined(FETCH128)
  379. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  380. #endif
  381. movss %xmm9, %xmm8
  382. pshufd $0xb1, %xmm4, %xmm12
  383. shufps $0x39, %xmm8, %xmm8
  384. mulps %xmm8, %xmm4
  385. addps %xmm4, %xmm0
  386. movaps 0 * SIZE(X), %xmm4
  387. mulps %xmm8, %xmm12
  388. movaps -4 * SIZE(Y), %xmm8
  389. addps %xmm12, %xmm1
  390. movss %xmm10, %xmm9
  391. pshufd $0xb1, %xmm5, %xmm12
  392. shufps $0x39, %xmm9, %xmm9
  393. mulps %xmm9, %xmm5
  394. addps %xmm5, %xmm0
  395. movaps 4 * SIZE(X), %xmm5
  396. mulps %xmm9, %xmm12
  397. movaps 0 * SIZE(Y), %xmm9
  398. addps %xmm12, %xmm1
  399. #if defined(PREFETCH) && !defined(FETCH128)
  400. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  401. #endif
  402. movss %xmm11, %xmm10
  403. pshufd $0xb1, %xmm6, %xmm12
  404. shufps $0x39, %xmm10, %xmm10
  405. mulps %xmm10, %xmm6
  406. addps %xmm6, %xmm0
  407. movaps 8 * SIZE(X), %xmm6
  408. mulps %xmm10, %xmm12
  409. movaps 4 * SIZE(Y), %xmm10
  410. addps %xmm12, %xmm1
  411. movss %xmm8, %xmm11
  412. pshufd $0xb1, %xmm7, %xmm12
  413. shufps $0x39, %xmm11, %xmm11
  414. mulps %xmm11, %xmm7
  415. addps %xmm7, %xmm0
  416. movaps 12 * SIZE(X), %xmm7
  417. mulps %xmm11, %xmm12
  418. movaps 8 * SIZE(Y), %xmm11
  419. addps %xmm12, %xmm1
  420. subq $-32 * SIZE, X
  421. subq $-32 * SIZE, Y
  422. decq %rax
  423. jg .L21
  424. ALIGN_3
  425. .L22:
  426. movss %xmm9, %xmm8
  427. pshufd $0xb1, %xmm4, %xmm12
  428. shufps $0x39, %xmm8, %xmm8
  429. mulps %xmm8, %xmm4
  430. addps %xmm4, %xmm0
  431. movaps -16 * SIZE(X), %xmm4
  432. mulps %xmm8, %xmm12
  433. movaps -20 * SIZE(Y), %xmm8
  434. addps %xmm12, %xmm1
  435. movss %xmm10, %xmm9
  436. pshufd $0xb1, %xmm5, %xmm12
  437. shufps $0x39, %xmm9, %xmm9
  438. mulps %xmm9, %xmm5
  439. addps %xmm5, %xmm0
  440. movaps -12 * SIZE(X), %xmm5
  441. mulps %xmm9, %xmm12
  442. movaps -16 * SIZE(Y), %xmm9
  443. addps %xmm12, %xmm1
  444. movss %xmm11, %xmm10
  445. pshufd $0xb1, %xmm6, %xmm12
  446. shufps $0x39, %xmm10, %xmm10
  447. mulps %xmm10, %xmm6
  448. addps %xmm6, %xmm0
  449. movaps -8 * SIZE(X), %xmm6
  450. mulps %xmm10, %xmm12
  451. movaps -12 * SIZE(Y), %xmm10
  452. addps %xmm12, %xmm1
  453. movss %xmm8, %xmm11
  454. pshufd $0xb1, %xmm7, %xmm12
  455. shufps $0x39, %xmm11, %xmm11
  456. mulps %xmm11, %xmm7
  457. addps %xmm7, %xmm0
  458. movaps -4 * SIZE(X), %xmm7
  459. mulps %xmm11, %xmm12
  460. movaps -8 * SIZE(Y), %xmm11
  461. addps %xmm12, %xmm1
  462. movss %xmm9, %xmm8
  463. pshufd $0xb1, %xmm4, %xmm12
  464. shufps $0x39, %xmm8, %xmm8
  465. mulps %xmm8, %xmm4
  466. addps %xmm4, %xmm0
  467. mulps %xmm8, %xmm12
  468. movaps -4 * SIZE(Y), %xmm8
  469. addps %xmm12, %xmm1
  470. movss %xmm10, %xmm9
  471. pshufd $0xb1, %xmm5, %xmm12
  472. shufps $0x39, %xmm9, %xmm9
  473. mulps %xmm9, %xmm5
  474. addps %xmm5, %xmm0
  475. mulps %xmm9, %xmm12
  476. addps %xmm12, %xmm1
  477. movss %xmm11, %xmm10
  478. pshufd $0xb1, %xmm6, %xmm12
  479. shufps $0x39, %xmm10, %xmm10
  480. mulps %xmm10, %xmm6
  481. addps %xmm6, %xmm0
  482. mulps %xmm10, %xmm12
  483. addps %xmm12, %xmm1
  484. movss %xmm8, %xmm11
  485. pshufd $0xb1, %xmm7, %xmm12
  486. shufps $0x39, %xmm11, %xmm11
  487. mulps %xmm11, %xmm7
  488. addps %xmm7, %xmm0
  489. mulps %xmm11, %xmm12
  490. addps %xmm12, %xmm1
  491. subq $-32 * SIZE, X
  492. subq $-32 * SIZE, Y
  493. ALIGN_3
  494. .L25:
  495. testq $8, N
  496. jle .L26
  497. movaps -32 * SIZE(X), %xmm4
  498. movaps -32 * SIZE(Y), %xmm9
  499. movaps -28 * SIZE(X), %xmm5
  500. movaps -28 * SIZE(Y), %xmm10
  501. movss %xmm9, %xmm8
  502. pshufd $0xb1, %xmm4, %xmm12
  503. shufps $0x39, %xmm8, %xmm8
  504. mulps %xmm8, %xmm4
  505. addps %xmm4, %xmm0
  506. mulps %xmm8, %xmm12
  507. addps %xmm12, %xmm1
  508. movaps -24 * SIZE(X), %xmm6
  509. movaps -24 * SIZE(Y), %xmm11
  510. movss %xmm10, %xmm9
  511. pshufd $0xb1, %xmm5, %xmm12
  512. shufps $0x39, %xmm9, %xmm9
  513. mulps %xmm9, %xmm5
  514. addps %xmm5, %xmm0
  515. mulps %xmm9, %xmm12
  516. addps %xmm12, %xmm1
  517. movaps -20 * SIZE(X), %xmm7
  518. movaps -20 * SIZE(Y), %xmm8
  519. movss %xmm11, %xmm10
  520. pshufd $0xb1, %xmm6, %xmm12
  521. shufps $0x39, %xmm10, %xmm10
  522. mulps %xmm10, %xmm6
  523. addps %xmm6, %xmm0
  524. mulps %xmm10, %xmm12
  525. addps %xmm12, %xmm1
  526. movss %xmm8, %xmm11
  527. pshufd $0xb1, %xmm7, %xmm12
  528. shufps $0x39, %xmm11, %xmm11
  529. mulps %xmm11, %xmm7
  530. addps %xmm7, %xmm0
  531. mulps %xmm11, %xmm12
  532. addps %xmm12, %xmm1
  533. addq $16 * SIZE, X
  534. addq $16 * SIZE, Y
  535. ALIGN_3
  536. .L26:
  537. testq $4, N
  538. jle .L27
  539. movaps -32 * SIZE(X), %xmm4
  540. movaps -32 * SIZE(Y), %xmm9
  541. movss %xmm9, %xmm8
  542. pshufd $0xb1, %xmm4, %xmm12
  543. shufps $0x39, %xmm8, %xmm8
  544. mulps %xmm8, %xmm4
  545. addps %xmm4, %xmm0
  546. mulps %xmm8, %xmm12
  547. addps %xmm12, %xmm1
  548. movaps -28 * SIZE(X), %xmm5
  549. movaps -28 * SIZE(Y), %xmm10
  550. movss %xmm10, %xmm9
  551. pshufd $0xb1, %xmm5, %xmm12
  552. shufps $0x39, %xmm9, %xmm9
  553. mulps %xmm9, %xmm5
  554. addps %xmm5, %xmm0
  555. mulps %xmm9, %xmm12
  556. addps %xmm12, %xmm1
  557. movaps %xmm10, %xmm8
  558. addq $8 * SIZE, X
  559. addq $8 * SIZE, Y
  560. ALIGN_3
  561. .L27:
  562. testq $2, N
  563. jle .L28
  564. movaps -32 * SIZE(X), %xmm4
  565. movaps -32 * SIZE(Y), %xmm9
  566. movss %xmm9, %xmm8
  567. pshufd $0xb1, %xmm4, %xmm12
  568. shufps $0x39, %xmm8, %xmm8
  569. mulps %xmm8, %xmm4
  570. addps %xmm4, %xmm0
  571. mulps %xmm8, %xmm12
  572. addps %xmm12, %xmm1
  573. movaps %xmm9, %xmm8
  574. addq $4 * SIZE, X
  575. addq $4 * SIZE, Y
  576. ALIGN_3
  577. .L28:
  578. testq $1, N
  579. jle .L29
  580. #ifdef movsd
  581. xorps %xmm4, %xmm4
  582. #endif
  583. movsd -32 * SIZE(X), %xmm4
  584. pshufd $0xb1, %xmm4, %xmm12
  585. shufps $0x59, %xmm8, %xmm8
  586. mulps %xmm8, %xmm4
  587. addps %xmm4, %xmm0
  588. mulps %xmm8, %xmm12
  589. addps %xmm12, %xmm1
  590. ALIGN_3
  591. .L29:
  592. shufps $0xb1, %xmm1, %xmm1
  593. shufps $0xb1, %xmm3, %xmm3
  594. jmp .L98
  595. ALIGN_3
  596. .L30:
  597. testq $SIZE, Y
  598. jne .L40
  599. #endif
  600. movq N, %rax
  601. sarq $4, %rax
  602. jle .L35
  603. movaps -32 * SIZE(X), %xmm4
  604. movsd -32 * SIZE(Y), %xmm8
  605. movhps -30 * SIZE(Y), %xmm8
  606. movaps -28 * SIZE(X), %xmm5
  607. movsd -28 * SIZE(Y), %xmm9
  608. movhps -26 * SIZE(Y), %xmm9
  609. movaps -24 * SIZE(X), %xmm6
  610. movsd -24 * SIZE(Y), %xmm10
  611. movhps -22 * SIZE(Y), %xmm10
  612. movaps -20 * SIZE(X), %xmm7
  613. movsd -20 * SIZE(Y), %xmm11
  614. movhps -18 * SIZE(Y), %xmm11
  615. decq %rax
  616. jle .L32
  617. ALIGN_3
  /* .L31: steady-state loop of the .L30 path. */
  /* Each pass runs eight multiply-accumulate steps and advances X and Y by 32*SIZE. */
  /* Every step: xmm12 = pair-swapped copy of the Y vector (pshufd 0xb1); */
  /*   Y*X is added to xmm0 or xmm2, swapped(Y)*X to xmm1 or xmm3 (the steps alternate). */
  /* The loop is software-pipelined: each step also reloads its X/Y registers for a step four positions later. */
  /* PREFETCH, PREFETCHSIZE and PREOFFSET are macros defined outside this chunk. */
  618. .L31:
  619. #ifdef PREFETCH
  620. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  621. #endif
  622. pshufd $0xb1, %xmm8, %xmm12
  623. mulps %xmm4, %xmm8
  624. addps %xmm8, %xmm0
  625. movsd -16 * SIZE(Y), %xmm8
  626. movhps -14 * SIZE(Y), %xmm8
  627. mulps %xmm4, %xmm12
  628. movaps -16 * SIZE(X), %xmm4
  629. addps %xmm12, %xmm1
  630. pshufd $0xb1, %xmm9, %xmm12
  631. mulps %xmm5, %xmm9
  632. addps %xmm9, %xmm2
  633. movsd -12 * SIZE(Y), %xmm9
  634. movhps -10 * SIZE(Y), %xmm9
  635. mulps %xmm5, %xmm12
  636. movaps -12 * SIZE(X), %xmm5
  637. addps %xmm12, %xmm3
  638. #ifdef PREFETCH
  639. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  640. #endif
  641. pshufd $0xb1, %xmm10, %xmm12
  642. mulps %xmm6, %xmm10
  643. addps %xmm10, %xmm0
  644. movsd -8 * SIZE(Y), %xmm10
  645. movhps -6 * SIZE(Y), %xmm10
  646. mulps %xmm6, %xmm12
  647. movaps -8 * SIZE(X), %xmm6
  648. addps %xmm12, %xmm1
  649. pshufd $0xb1, %xmm11, %xmm12
  650. mulps %xmm7, %xmm11
  651. addps %xmm11, %xmm2
  652. movsd -4 * SIZE(Y), %xmm11
  653. movhps -2 * SIZE(Y), %xmm11
  654. mulps %xmm7, %xmm12
  655. movaps -4 * SIZE(X), %xmm7
  656. addps %xmm12, %xmm3
  657. #if defined(PREFETCH) && !defined(FETCH128)
  658. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  659. #endif
  /* Second half of the pass: consumes the vectors loaded above and loads the first four of the next pass. */
  660. pshufd $0xb1, %xmm8, %xmm12
  661. mulps %xmm4, %xmm8
  662. addps %xmm8, %xmm0
  663. movsd 0 * SIZE(Y), %xmm8
  664. movhps 2 * SIZE(Y), %xmm8
  665. mulps %xmm4, %xmm12
  666. movaps 0 * SIZE(X), %xmm4
  667. addps %xmm12, %xmm1
  668. pshufd $0xb1, %xmm9, %xmm12
  669. mulps %xmm5, %xmm9
  670. addps %xmm9, %xmm2
  671. movsd 4 * SIZE(Y), %xmm9
  672. movhps 6 * SIZE(Y), %xmm9
  673. mulps %xmm5, %xmm12
  674. movaps 4 * SIZE(X), %xmm5
  675. addps %xmm12, %xmm3
  676. #if defined(PREFETCH) && !defined(FETCH128)
  677. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  678. #endif
  679. pshufd $0xb1, %xmm10, %xmm12
  680. mulps %xmm6, %xmm10
  681. addps %xmm10, %xmm0
  682. movsd 8 * SIZE(Y), %xmm10
  683. movhps 10 * SIZE(Y), %xmm10
  684. mulps %xmm6, %xmm12
  685. movaps 8 * SIZE(X), %xmm6
  686. addps %xmm12, %xmm1
  687. pshufd $0xb1, %xmm11, %xmm12
  688. mulps %xmm7, %xmm11
  689. addps %xmm11, %xmm2
  690. movsd 12 * SIZE(Y), %xmm11
  691. movhps 14 * SIZE(Y), %xmm11
  692. mulps %xmm7, %xmm12
  693. movaps 12 * SIZE(X), %xmm7
  694. addps %xmm12, %xmm3
  /* Subtracting -32*SIZE advances both pointers by 32*SIZE. */
  695. subq $-32 * SIZE, X
  696. subq $-32 * SIZE, Y
  697. decq %rax
  698. jg .L31
  699. ALIGN_3
  /* .L32: drain of the pipelined loop (last 32*SIZE pass of the .L30 path). */
  /* The first four steps consume the preloaded registers and load the remaining four vectors; */
  /* the last four steps only compute. Nothing beyond this pass is loaded. */
  700. .L32:
  701. pshufd $0xb1, %xmm8, %xmm12
  702. mulps %xmm4, %xmm8
  703. addps %xmm8, %xmm0
  704. movsd -16 * SIZE(Y), %xmm8
  705. movhps -14 * SIZE(Y), %xmm8
  706. mulps %xmm4, %xmm12
  707. movaps -16 * SIZE(X), %xmm4
  708. addps %xmm12, %xmm1
  709. pshufd $0xb1, %xmm9, %xmm12
  710. mulps %xmm5, %xmm9
  711. addps %xmm9, %xmm2
  712. movsd -12 * SIZE(Y), %xmm9
  713. movhps -10 * SIZE(Y), %xmm9
  714. mulps %xmm5, %xmm12
  715. movaps -12 * SIZE(X), %xmm5
  716. addps %xmm12, %xmm3
  717. pshufd $0xb1, %xmm10, %xmm12
  718. mulps %xmm6, %xmm10
  719. addps %xmm10, %xmm0
  720. movsd -8 * SIZE(Y), %xmm10
  721. movhps -6 * SIZE(Y), %xmm10
  722. mulps %xmm6, %xmm12
  723. movaps -8 * SIZE(X), %xmm6
  724. addps %xmm12, %xmm1
  725. pshufd $0xb1, %xmm11, %xmm12
  726. mulps %xmm7, %xmm11
  727. addps %xmm11, %xmm2
  728. movsd -4 * SIZE(Y), %xmm11
  729. movhps -2 * SIZE(Y), %xmm11
  730. mulps %xmm7, %xmm12
  731. movaps -4 * SIZE(X), %xmm7
  732. addps %xmm12, %xmm3
  /* Compute-only steps: no further loads. */
  733. pshufd $0xb1, %xmm8, %xmm12
  734. mulps %xmm4, %xmm8
  735. addps %xmm8, %xmm0
  736. mulps %xmm4, %xmm12
  737. addps %xmm12, %xmm1
  738. pshufd $0xb1, %xmm9, %xmm12
  739. mulps %xmm5, %xmm9
  740. addps %xmm9, %xmm2
  741. mulps %xmm5, %xmm12
  742. addps %xmm12, %xmm3
  743. pshufd $0xb1, %xmm10, %xmm12
  744. mulps %xmm6, %xmm10
  745. addps %xmm10, %xmm0
  746. mulps %xmm6, %xmm12
  747. addps %xmm12, %xmm1
  748. pshufd $0xb1, %xmm11, %xmm12
  749. mulps %xmm7, %xmm11
  750. addps %xmm11, %xmm2
  751. mulps %xmm7, %xmm12
  752. addps %xmm12, %xmm3
  /* Advance both pointers by 32*SIZE (subtract of a negative constant). */
  753. subq $-32 * SIZE, X
  754. subq $-32 * SIZE, Y
  755. ALIGN_3
  /* .L35: remainder step of the .L30 path for bit 3 of N. */
  /* Processes four vectors (16*SIZE bytes from each of X and Y) without pipelining. */
  /* testq leaves OF clear and a non-negative result, so jle skips the step exactly when the bit is 0. */
  756. .L35:
  757. testq $8, N
  758. jle .L36
  759. movaps -32 * SIZE(X), %xmm4
  760. movsd -32 * SIZE(Y), %xmm8
  761. movhps -30 * SIZE(Y), %xmm8
  762. pshufd $0xb1, %xmm8, %xmm12
  763. mulps %xmm4, %xmm8
  764. addps %xmm8, %xmm0
  765. mulps %xmm4, %xmm12
  766. addps %xmm12, %xmm1
  767. movaps -28 * SIZE(X), %xmm5
  768. movsd -28 * SIZE(Y), %xmm9
  769. movhps -26 * SIZE(Y), %xmm9
  770. pshufd $0xb1, %xmm9, %xmm12
  771. mulps %xmm5, %xmm9
  772. addps %xmm9, %xmm2
  773. mulps %xmm5, %xmm12
  774. addps %xmm12, %xmm3
  775. movaps -24 * SIZE(X), %xmm6
  776. movsd -24 * SIZE(Y), %xmm10
  777. movhps -22 * SIZE(Y), %xmm10
  778. pshufd $0xb1, %xmm10, %xmm12
  779. mulps %xmm6, %xmm10
  780. addps %xmm10, %xmm0
  781. mulps %xmm6, %xmm12
  782. addps %xmm12, %xmm1
  783. movaps -20 * SIZE(X), %xmm7
  784. movsd -20 * SIZE(Y), %xmm11
  785. movhps -18 * SIZE(Y), %xmm11
  786. pshufd $0xb1, %xmm11, %xmm12
  787. mulps %xmm7, %xmm11
  788. addps %xmm11, %xmm2
  789. mulps %xmm7, %xmm12
  790. addps %xmm12, %xmm3
  791. addq $16 * SIZE, X
  792. addq $16 * SIZE, Y
  793. ALIGN_3
  /* .L36: remainder step of the .L30 path for bit 2 of N. */
  /* Processes two vectors (8*SIZE bytes from each of X and Y); */
  /* the first accumulates into xmm0/xmm1, the second into xmm2/xmm3. */
  794. .L36:
  795. testq $4, N
  796. jle .L37
  797. movaps -32 * SIZE(X), %xmm4
  798. movsd -32 * SIZE(Y), %xmm8
  799. movhps -30 * SIZE(Y), %xmm8
  800. pshufd $0xb1, %xmm8, %xmm12
  801. mulps %xmm4, %xmm8
  802. addps %xmm8, %xmm0
  803. mulps %xmm4, %xmm12
  804. addps %xmm12, %xmm1
  805. movaps -28 * SIZE(X), %xmm5
  806. movsd -28 * SIZE(Y), %xmm9
  807. movhps -26 * SIZE(Y), %xmm9
  808. pshufd $0xb1, %xmm9, %xmm12
  809. mulps %xmm5, %xmm9
  810. addps %xmm9, %xmm2
  811. mulps %xmm5, %xmm12
  812. addps %xmm12, %xmm3
  813. addq $8 * SIZE, X
  814. addq $8 * SIZE, Y
  815. ALIGN_3
  /* .L37: remainder step of the .L30 path for bit 1 of N. */
  /* Processes one vector (4*SIZE bytes from each of X and Y) into xmm0/xmm1. */
  816. .L37:
  817. testq $2, N
  818. jle .L38
  819. movaps -32 * SIZE(X), %xmm4
  820. movsd -32 * SIZE(Y), %xmm8
  821. movhps -30 * SIZE(Y), %xmm8
  822. pshufd $0xb1, %xmm8, %xmm12
  823. mulps %xmm4, %xmm8
  824. addps %xmm8, %xmm0
  825. mulps %xmm4, %xmm12
  826. addps %xmm12, %xmm1
  827. addq $4 * SIZE, X
  828. addq $4 * SIZE, Y
  829. ALIGN_3
  /* .L38: last remainder step of the .L30 path, for bit 0 of N. */
  /* Loads only the low 8 bytes of X and of Y with movsd, accumulates into xmm0/xmm1, */
  /* then jumps to .L98 (outside this chunk). When bit 0 is clear it jumps there directly. */
  830. .L38:
  831. testq $1, N
  832. jle .L98
  /* If `movsd` is a macro (defined outside this chunk) the target register is zeroed first; */
  /* presumably the replacement instruction leaves the upper half untouched - TODO confirm. */
  833. #ifdef movsd
  834. xorps %xmm4, %xmm4
  835. #endif
  836. movsd -32 * SIZE(X), %xmm4
  837. #ifdef movsd
  838. xorps %xmm8, %xmm8
  839. #endif
  840. movsd -32 * SIZE(Y), %xmm8
  841. pshufd $0xb1, %xmm8, %xmm12
  842. mulps %xmm4, %xmm8
  843. addps %xmm8, %xmm0
  844. mulps %xmm4, %xmm12
  845. addps %xmm12, %xmm1
  846. jmp .L98
  847. ALIGN_3
  /* .L40 (only assembled with ALIGNED_ACCESS): entry of the path reached from .L30 when the SIZE bit of Y is set. */
  /* Y is read only with movaps; the vector needed for each product is rebuilt from two neighbouring loads. */
  /* movaps requires a 16-byte aligned address, so the code relies on Y - 35*SIZE being aligned here */
  /* (presumably guaranteed by alignment tests before this chunk - TODO confirm). */
  848. #ifdef ALIGNED_ACCESS
  849. .L40:
  /* xmm8 = vector that contains the first needed lane in its top position; then re-base Y by one SIZE. */
  850. movaps -35 * SIZE(Y), %xmm8
  851. addq $1 * SIZE, Y
  /* Swap adjacent lanes of xmm1; the matching swap back is done at .L49. */
  852. shufps $0xb1, %xmm1, %xmm1
  853. movq N, %rax
  854. sarq $4, %rax
  855. jle .L45
  /* Preload X into xmm4-xmm7 and the next three Y vectors into xmm9-xmm11. */
  856. movaps -32 * SIZE(X), %xmm4
  857. movaps -32 * SIZE(Y), %xmm9
  858. movaps -28 * SIZE(X), %xmm5
  859. movaps -28 * SIZE(Y), %xmm10
  860. movaps -24 * SIZE(X), %xmm6
  861. movaps -24 * SIZE(Y), %xmm11
  862. movaps -20 * SIZE(X), %xmm7
  863. decq %rax
  864. jle .L42
  865. ALIGN_3
  /* .L41: steady-state loop of the .L40 path; eight steps per pass, 32*SIZE per pointer. */
  /* Step pattern (registers rotate through xmm8-xmm11 for Y and xmm4-xmm7 for X): */
  /*   movss next,prev       -> prev = {next[0], prev[1], prev[2], prev[3]} */
  /*   shufps $0x93,next,prev -> prev = {prev[3], next[0], next[1], next[2]}  (Y shifted by one lane) */
  /*   xmm12 = pair-swapped X (pshufd 0xb1); xmm0 += shiftedY * X; xmm1 += shiftedY * swapped X. */
  /* Only xmm0 and xmm1 are used as accumulators in this loop. */
  /* PREFETCH, PREFETCHSIZE and PREOFFSET are macros defined outside this chunk. */
  866. .L41:
  867. #ifdef PREFETCH
  868. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  869. #endif
  870. movss %xmm9, %xmm8
  871. pshufd $0xb1, %xmm4, %xmm12
  872. shufps $0x93, %xmm9, %xmm8
  873. mulps %xmm8, %xmm4
  874. addps %xmm4, %xmm0
  875. movaps -16 * SIZE(X), %xmm4
  876. mulps %xmm8, %xmm12
  877. movaps -20 * SIZE(Y), %xmm8
  878. addps %xmm12, %xmm1
  879. movss %xmm10, %xmm9
  880. pshufd $0xb1, %xmm5, %xmm12
  881. shufps $0x93, %xmm10, %xmm9
  882. mulps %xmm9, %xmm5
  883. addps %xmm5, %xmm0
  884. movaps -12 * SIZE(X), %xmm5
  885. mulps %xmm9, %xmm12
  886. movaps -16 * SIZE(Y), %xmm9
  887. addps %xmm12, %xmm1
  888. #ifdef PREFETCH
  889. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  890. #endif
  891. movss %xmm11, %xmm10
  892. pshufd $0xb1, %xmm6, %xmm12
  893. shufps $0x93, %xmm11, %xmm10
  894. mulps %xmm10, %xmm6
  895. addps %xmm6, %xmm0
  896. movaps -8 * SIZE(X), %xmm6
  897. mulps %xmm10, %xmm12
  898. movaps -12 * SIZE(Y), %xmm10
  899. addps %xmm12, %xmm1
  900. movss %xmm8, %xmm11
  901. pshufd $0xb1, %xmm7, %xmm12
  902. shufps $0x93, %xmm8, %xmm11
  903. mulps %xmm11, %xmm7
  904. addps %xmm7, %xmm0
  905. movaps -4 * SIZE(X), %xmm7
  906. mulps %xmm11, %xmm12
  907. movaps -8 * SIZE(Y), %xmm11
  908. addps %xmm12, %xmm1
  909. #if defined(PREFETCH) && !defined(FETCH128)
  910. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  911. #endif
  912. movss %xmm9, %xmm8
  913. pshufd $0xb1, %xmm4, %xmm12
  914. shufps $0x93, %xmm9, %xmm8
  915. mulps %xmm8, %xmm4
  916. addps %xmm4, %xmm0
  917. movaps 0 * SIZE(X), %xmm4
  918. mulps %xmm8, %xmm12
  919. movaps -4 * SIZE(Y), %xmm8
  920. addps %xmm12, %xmm1
  921. movss %xmm10, %xmm9
  922. pshufd $0xb1, %xmm5, %xmm12
  923. shufps $0x93, %xmm10, %xmm9
  924. mulps %xmm9, %xmm5
  925. addps %xmm5, %xmm0
  926. movaps 4 * SIZE(X), %xmm5
  927. mulps %xmm9, %xmm12
  928. movaps 0 * SIZE(Y), %xmm9
  929. addps %xmm12, %xmm1
  930. #if defined(PREFETCH) && !defined(FETCH128)
  931. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  932. #endif
  933. movss %xmm11, %xmm10
  934. pshufd $0xb1, %xmm6, %xmm12
  935. shufps $0x93, %xmm11, %xmm10
  936. mulps %xmm10, %xmm6
  937. addps %xmm6, %xmm0
  938. movaps 8 * SIZE(X), %xmm6
  939. mulps %xmm10, %xmm12
  940. movaps 4 * SIZE(Y), %xmm10
  941. addps %xmm12, %xmm1
  942. movss %xmm8, %xmm11
  943. pshufd $0xb1, %xmm7, %xmm12
  944. shufps $0x93, %xmm8, %xmm11
  945. mulps %xmm11, %xmm7
  946. addps %xmm7, %xmm0
  947. movaps 12 * SIZE(X), %xmm7
  948. mulps %xmm11, %xmm12
  949. movaps 8 * SIZE(Y), %xmm11
  950. addps %xmm12, %xmm1
  /* Subtracting -32*SIZE advances both pointers by 32*SIZE. */
  951. subq $-32 * SIZE, X
  952. subq $-32 * SIZE, Y
  953. decq %rax
  954. jg .L41
  955. ALIGN_3
  /* .L42: drain of the .L41 loop (last 32*SIZE pass of the .L40 path). */
  /* Same movss + shufps $0x93 step as .L41; loads stop once the pass's data is in registers. */
  /* The last Y load (-4*SIZE) ends up in xmm8, where the remainder blocks expect the carried vector. */
  956. .L42:
  957. movss %xmm9, %xmm8
  958. pshufd $0xb1, %xmm4, %xmm12
  959. shufps $0x93, %xmm9, %xmm8
  960. mulps %xmm8, %xmm4
  961. addps %xmm4, %xmm0
  962. movaps -16 * SIZE(X), %xmm4
  963. mulps %xmm8, %xmm12
  964. movaps -20 * SIZE(Y), %xmm8
  965. addps %xmm12, %xmm1
  966. movss %xmm10, %xmm9
  967. pshufd $0xb1, %xmm5, %xmm12
  968. shufps $0x93, %xmm10, %xmm9
  969. mulps %xmm9, %xmm5
  970. addps %xmm5, %xmm0
  971. movaps -12 * SIZE(X), %xmm5
  972. mulps %xmm9, %xmm12
  973. movaps -16 * SIZE(Y), %xmm9
  974. addps %xmm12, %xmm1
  975. movss %xmm11, %xmm10
  976. pshufd $0xb1, %xmm6, %xmm12
  977. shufps $0x93, %xmm11, %xmm10
  978. mulps %xmm10, %xmm6
  979. addps %xmm6, %xmm0
  980. movaps -8 * SIZE(X), %xmm6
  981. mulps %xmm10, %xmm12
  982. movaps -12 * SIZE(Y), %xmm10
  983. addps %xmm12, %xmm1
  984. movss %xmm8, %xmm11
  985. pshufd $0xb1, %xmm7, %xmm12
  986. shufps $0x93, %xmm8, %xmm11
  987. mulps %xmm11, %xmm7
  988. addps %xmm7, %xmm0
  989. movaps -4 * SIZE(X), %xmm7
  990. mulps %xmm11, %xmm12
  991. movaps -8 * SIZE(Y), %xmm11
  992. addps %xmm12, %xmm1
  993. movss %xmm9, %xmm8
  994. pshufd $0xb1, %xmm4, %xmm12
  995. shufps $0x93, %xmm9, %xmm8
  996. mulps %xmm8, %xmm4
  997. addps %xmm4, %xmm0
  998. mulps %xmm8, %xmm12
  999. movaps -4 * SIZE(Y), %xmm8
  1000. addps %xmm12, %xmm1
  /* Remaining steps only compute. */
  1001. movss %xmm10, %xmm9
  1002. pshufd $0xb1, %xmm5, %xmm12
  1003. shufps $0x93, %xmm10, %xmm9
  1004. mulps %xmm9, %xmm5
  1005. addps %xmm5, %xmm0
  1006. mulps %xmm9, %xmm12
  1007. addps %xmm12, %xmm1
  1008. movss %xmm11, %xmm10
  1009. pshufd $0xb1, %xmm6, %xmm12
  1010. shufps $0x93, %xmm11, %xmm10
  1011. mulps %xmm10, %xmm6
  1012. addps %xmm6, %xmm0
  1013. mulps %xmm10, %xmm12
  1014. addps %xmm12, %xmm1
  1015. movss %xmm8, %xmm11
  1016. pshufd $0xb1, %xmm7, %xmm12
  1017. shufps $0x93, %xmm8, %xmm11
  1018. mulps %xmm11, %xmm7
  1019. addps %xmm7, %xmm0
  1020. mulps %xmm11, %xmm12
  1021. addps %xmm12, %xmm1
  /* Advance both pointers by 32*SIZE (subtract of a negative constant). */
  1022. subq $-32 * SIZE, X
  1023. subq $-32 * SIZE, Y
  1024. ALIGN_3
  /* .L45: remainder step of the .L40 path for bit 3 of N (four vectors, 16*SIZE per pointer). */
  /* Uses the same movss + shufps $0x93 lane shift of Y as the main loop. */
  /* The last Y vector loaded (-20*SIZE) is left in xmm8 as the carried vector for the next block. */
  1025. .L45:
  1026. testq $8, N
  1027. jle .L46
  1028. movaps -32 * SIZE(X), %xmm4
  1029. movaps -32 * SIZE(Y), %xmm9
  1030. movaps -28 * SIZE(X), %xmm5
  1031. movaps -28 * SIZE(Y), %xmm10
  1032. movss %xmm9, %xmm8
  1033. pshufd $0xb1, %xmm4, %xmm12
  1034. shufps $0x93, %xmm9, %xmm8
  1035. mulps %xmm8, %xmm4
  1036. addps %xmm4, %xmm0
  1037. mulps %xmm8, %xmm12
  1038. addps %xmm12, %xmm1
  1039. movaps -24 * SIZE(X), %xmm6
  1040. movaps -24 * SIZE(Y), %xmm11
  1041. movss %xmm10, %xmm9
  1042. pshufd $0xb1, %xmm5, %xmm12
  1043. shufps $0x93, %xmm10, %xmm9
  1044. mulps %xmm9, %xmm5
  1045. addps %xmm5, %xmm0
  1046. mulps %xmm9, %xmm12
  1047. addps %xmm12, %xmm1
  1048. movaps -20 * SIZE(X), %xmm7
  1049. movaps -20 * SIZE(Y), %xmm8
  1050. movss %xmm11, %xmm10
  1051. pshufd $0xb1, %xmm6, %xmm12
  1052. shufps $0x93, %xmm11, %xmm10
  1053. mulps %xmm10, %xmm6
  1054. addps %xmm6, %xmm0
  1055. mulps %xmm10, %xmm12
  1056. addps %xmm12, %xmm1
  1057. movss %xmm8, %xmm11
  1058. pshufd $0xb1, %xmm7, %xmm12
  1059. shufps $0x93, %xmm8, %xmm11
  1060. mulps %xmm11, %xmm7
  1061. addps %xmm7, %xmm0
  1062. mulps %xmm11, %xmm12
  1063. addps %xmm12, %xmm1
  1064. addq $16 * SIZE, X
  1065. addq $16 * SIZE, Y
  1066. ALIGN_3
  /* .L46: remainder step of the .L40 path for bit 2 of N (two vectors, 8*SIZE per pointer). */
  /* After the step the most recently loaded Y vector (xmm10) is copied to xmm8 as the carried vector. */
  1067. .L46:
  1068. testq $4, N
  1069. jle .L47
  1070. movaps -32 * SIZE(X), %xmm4
  1071. movaps -32 * SIZE(Y), %xmm9
  1072. movss %xmm9, %xmm8
  1073. pshufd $0xb1, %xmm4, %xmm12
  1074. shufps $0x93, %xmm9, %xmm8
  1075. mulps %xmm8, %xmm4
  1076. addps %xmm4, %xmm0
  1077. mulps %xmm8, %xmm12
  1078. addps %xmm12, %xmm1
  1079. movaps -28 * SIZE(X), %xmm5
  1080. movaps -28 * SIZE(Y), %xmm10
  1081. movss %xmm10, %xmm9
  1082. pshufd $0xb1, %xmm5, %xmm12
  1083. shufps $0x93, %xmm10, %xmm9
  1084. mulps %xmm9, %xmm5
  1085. addps %xmm5, %xmm0
  1086. mulps %xmm9, %xmm12
  1087. addps %xmm12, %xmm1
  1088. movaps %xmm10, %xmm8
  1089. addq $8 * SIZE, X
  1090. addq $8 * SIZE, Y
  1091. ALIGN_3
  /* .L47: remainder step of the .L40 path for bit 1 of N (one vector, 4*SIZE per pointer). */
  /* The freshly loaded Y vector (xmm9) becomes the carried vector in xmm8 afterwards. */
  1092. .L47:
  1093. testq $2, N
  1094. jle .L48
  1095. movaps -32 * SIZE(X), %xmm4
  1096. movaps -32 * SIZE(Y), %xmm9
  1097. movss %xmm9, %xmm8
  1098. pshufd $0xb1, %xmm4, %xmm12
  1099. shufps $0x93, %xmm9, %xmm8
  1100. mulps %xmm8, %xmm4
  1101. addps %xmm4, %xmm0
  1102. mulps %xmm8, %xmm12
  1103. addps %xmm12, %xmm1
  1104. movaps %xmm9, %xmm8
  1105. addq $4 * SIZE, X
  1106. addq $4 * SIZE, Y
  1107. ALIGN_3
  /* .L48: last remainder step of the .L40 path, for bit 0 of N. */
  /* Loads 8 bytes of X (movsd) and a single 32-bit lane of Y (movss), combined with the top lane of the carried xmm8. */
  1108. .L48:
  1109. testq $1, N
  1110. jle .L49
  /* `movsd` may be a macro defined outside this chunk; presumably its replacement keeps the upper half, hence the xorps - TODO confirm. */
  1111. #ifdef movsd
  1112. xorps %xmm4, %xmm4
  1113. #endif
  1114. movsd -32 * SIZE(X), %xmm4
  1115. movss -32 * SIZE(Y), %xmm9
  1116. movss %xmm9, %xmm8
  1117. pshufd $0xb1, %xmm4, %xmm12
  /* 0x03: xmm8 = {old[3], old[0], old[0], old[0]} = {carried top lane, newly loaded lane, ...}. */
  1118. shufps $0x03, %xmm8, %xmm8
  1119. mulps %xmm8, %xmm4
  1120. addps %xmm4, %xmm0
  1121. mulps %xmm8, %xmm12
  1122. addps %xmm12, %xmm1
  1123. ALIGN_3
  /* .L49: exit of the .L40 path. Swaps adjacent 32-bit lanes of xmm1 and xmm3 (0xb1 = lane order 1,0,3,2) */
  /* and jumps to .L98, which lies outside this chunk. The #endif closes the ALIGNED_ACCESS block opened at .L40. */
  1124. .L49:
  1125. shufps $0xb1, %xmm1, %xmm1
  1126. shufps $0xb1, %xmm3, %xmm3
  1127. jmp .L98
  1128. ALIGN_3
  1129. #endif
  /* .L50: entry of another alignment case (the branch that reaches it is outside this chunk). */
  /* If the SIZE bit of Y is set, control goes to .L70 (outside this chunk). */
  /* With ALIGNED_ACCESS: if the 2*SIZE bit of Y is set, one 8-byte element of X and Y is handled up front, */
  /* so that Y advances by 2*SIZE before the vector code at .L50x. */
  1130. .L50:
  1131. testq $SIZE, Y
  1132. jne .L70
  1133. #ifdef ALIGNED_ACCESS
  1134. testq $2 * SIZE, Y
  1135. je .L50x
  /* `movsd` may be a macro defined outside this chunk; the xorps presumably clears lanes it would leave stale - TODO confirm. */
  1136. #ifdef movsd
  1137. xorps %xmm0, %xmm0
  1138. #endif
  1139. movsd -32 * SIZE(X), %xmm0
  1140. #ifdef movsd
  1141. xorps %xmm4, %xmm4
  1142. #endif
  1143. movsd -32 * SIZE(Y), %xmm4
  /* xmm0 and xmm1 are overwritten here (not accumulated): xmm0 = X*Y, xmm1 = pair-swapped X * Y. */
  1144. pshufd $0xb1, %xmm0, %xmm1
  1145. mulps %xmm4, %xmm0
  1146. mulps %xmm4, %xmm1
  1147. addq $2 * SIZE, X
  1148. addq $2 * SIZE, Y
  /* One element consumed. */
  1149. decq N
  1150. ALIGN_3
  /* .L50x (ALIGNED_ACCESS variant): chooses between two X offsets. */
  /* If the 2*SIZE bit of X is set, go to .L60; otherwise continue here. */
  /* Here X is read only with movaps after being re-based by 3*SIZE; the vector needed for each product */
  /* is rebuilt from two neighbouring loads (see .L51). */
  /* movaps requires 16-byte alignment, so the code relies on X - 33*SIZE being aligned - TODO confirm against the tests before this chunk. */
  1151. .L50x:
  1152. testq $2 * SIZE, X
  1153. jne .L60
  1154. movaps -33 * SIZE(X), %xmm8
  1155. addq $3 * SIZE, X
  /* Swap adjacent lanes of xmm1 (it may hold the head product computed at .L50). */
  1156. shufps $0xb1, %xmm1, %xmm1
  1157. movq N, %rax
  1158. sarq $4, %rax
  1159. jle .L55
  /* Preload Y into xmm4-xmm7 and the next three X vectors into xmm9-xmm11. */
  1160. movaps -32 * SIZE(Y), %xmm4
  1161. movaps -32 * SIZE(X), %xmm9
  1162. movaps -28 * SIZE(Y), %xmm5
  1163. movaps -28 * SIZE(X), %xmm10
  1164. movaps -24 * SIZE(Y), %xmm6
  1165. movaps -24 * SIZE(X), %xmm11
  1166. movaps -20 * SIZE(Y), %xmm7
  1167. decq %rax
  1168. jle .L52
  1169. ALIGN_3
  /* .L51 (ALIGNED_ACCESS variant): steady-state loop; eight steps per pass, 32*SIZE per pointer. */
  /* Step pattern (registers rotate through xmm8-xmm11 for X and xmm4-xmm7 for Y): */
  /*   movss next,prev        -> prev = {next[0], prev[1], prev[2], prev[3]} */
  /*   shufps $0x39,prev,prev -> prev = {prev[1], prev[2], prev[3], next[0]}  (X shifted by three lanes) */
  /*   xmm12 = pair-swapped Y (pshufd 0xb1); xmm0 += shiftedX * Y; xmm1 += shiftedX * swapped Y. */
  /* PREFETCH, PREFETCHSIZE and PREOFFSET are macros defined outside this chunk. */
  1170. .L51:
  1171. #ifdef PREFETCH
  1172. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  1173. #endif
  1174. movss %xmm9, %xmm8
  1175. pshufd $0xb1, %xmm4, %xmm12
  1176. shufps $0x39, %xmm8, %xmm8
  1177. mulps %xmm8, %xmm4
  1178. addps %xmm4, %xmm0
  1179. movaps -16 * SIZE(Y), %xmm4
  1180. mulps %xmm8, %xmm12
  1181. movaps -20 * SIZE(X), %xmm8
  1182. addps %xmm12, %xmm1
  1183. movss %xmm10, %xmm9
  1184. pshufd $0xb1, %xmm5, %xmm12
  1185. shufps $0x39, %xmm9, %xmm9
  1186. mulps %xmm9, %xmm5
  1187. addps %xmm5, %xmm0
  1188. movaps -12 * SIZE(Y), %xmm5
  1189. mulps %xmm9, %xmm12
  1190. movaps -16 * SIZE(X), %xmm9
  1191. addps %xmm12, %xmm1
  1192. #ifdef PREFETCH
  1193. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  1194. #endif
  1195. movss %xmm11, %xmm10
  1196. pshufd $0xb1, %xmm6, %xmm12
  1197. shufps $0x39, %xmm10, %xmm10
  1198. mulps %xmm10, %xmm6
  1199. addps %xmm6, %xmm0
  1200. movaps -8 * SIZE(Y), %xmm6
  1201. mulps %xmm10, %xmm12
  1202. movaps -12 * SIZE(X), %xmm10
  1203. addps %xmm12, %xmm1
  1204. movss %xmm8, %xmm11
  1205. pshufd $0xb1, %xmm7, %xmm12
  1206. shufps $0x39, %xmm11, %xmm11
  1207. mulps %xmm11, %xmm7
  1208. addps %xmm7, %xmm0
  1209. movaps -4 * SIZE(Y), %xmm7
  1210. mulps %xmm11, %xmm12
  1211. movaps -8 * SIZE(X), %xmm11
  1212. addps %xmm12, %xmm1
  1213. #if defined(PREFETCH) && !defined(FETCH128)
  1214. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  1215. #endif
  1216. movss %xmm9, %xmm8
  1217. pshufd $0xb1, %xmm4, %xmm12
  1218. shufps $0x39, %xmm8, %xmm8
  1219. mulps %xmm8, %xmm4
  1220. addps %xmm4, %xmm0
  1221. movaps 0 * SIZE(Y), %xmm4
  1222. mulps %xmm8, %xmm12
  1223. movaps -4 * SIZE(X), %xmm8
  1224. addps %xmm12, %xmm1
  1225. movss %xmm10, %xmm9
  1226. pshufd $0xb1, %xmm5, %xmm12
  1227. shufps $0x39, %xmm9, %xmm9
  1228. mulps %xmm9, %xmm5
  1229. addps %xmm5, %xmm0
  1230. movaps 4 * SIZE(Y), %xmm5
  1231. mulps %xmm9, %xmm12
  1232. movaps 0 * SIZE(X), %xmm9
  1233. addps %xmm12, %xmm1
  1234. #if defined(PREFETCH) && !defined(FETCH128)
  1235. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  1236. #endif
  1237. movss %xmm11, %xmm10
  1238. pshufd $0xb1, %xmm6, %xmm12
  1239. shufps $0x39, %xmm10, %xmm10
  1240. mulps %xmm10, %xmm6
  1241. addps %xmm6, %xmm0
  1242. movaps 8 * SIZE(Y), %xmm6
  1243. mulps %xmm10, %xmm12
  1244. movaps 4 * SIZE(X), %xmm10
  1245. addps %xmm12, %xmm1
  1246. movss %xmm8, %xmm11
  1247. pshufd $0xb1, %xmm7, %xmm12
  1248. shufps $0x39, %xmm11, %xmm11
  1249. mulps %xmm11, %xmm7
  1250. addps %xmm7, %xmm0
  1251. movaps 12 * SIZE(Y), %xmm7
  1252. mulps %xmm11, %xmm12
  1253. movaps 8 * SIZE(X), %xmm11
  1254. addps %xmm12, %xmm1
  /* Subtracting -32*SIZE advances both pointers by 32*SIZE. */
  1255. subq $-32 * SIZE, X
  1256. subq $-32 * SIZE, Y
  1257. decq %rax
  1258. jg .L51
  1259. ALIGN_3
  /* .L52 (ALIGNED_ACCESS variant): drain of the .L51 loop (last 32*SIZE pass). */
  /* Same movss + shufps $0x39 step as .L51; loads stop once the pass's data is in registers. */
  /* The last X load (-4*SIZE) ends up in xmm8, the carried vector for the remainder blocks. */
  1260. .L52:
  1261. movss %xmm9, %xmm8
  1262. pshufd $0xb1, %xmm4, %xmm12
  1263. shufps $0x39, %xmm8, %xmm8
  1264. mulps %xmm8, %xmm4
  1265. addps %xmm4, %xmm0
  1266. movaps -16 * SIZE(Y), %xmm4
  1267. mulps %xmm8, %xmm12
  1268. movaps -20 * SIZE(X), %xmm8
  1269. addps %xmm12, %xmm1
  1270. movss %xmm10, %xmm9
  1271. pshufd $0xb1, %xmm5, %xmm12
  1272. shufps $0x39, %xmm9, %xmm9
  1273. mulps %xmm9, %xmm5
  1274. addps %xmm5, %xmm0
  1275. movaps -12 * SIZE(Y), %xmm5
  1276. mulps %xmm9, %xmm12
  1277. movaps -16 * SIZE(X), %xmm9
  1278. addps %xmm12, %xmm1
  1279. movss %xmm11, %xmm10
  1280. pshufd $0xb1, %xmm6, %xmm12
  1281. shufps $0x39, %xmm10, %xmm10
  1282. mulps %xmm10, %xmm6
  1283. addps %xmm6, %xmm0
  1284. movaps -8 * SIZE(Y), %xmm6
  1285. mulps %xmm10, %xmm12
  1286. movaps -12 * SIZE(X), %xmm10
  1287. addps %xmm12, %xmm1
  1288. movss %xmm8, %xmm11
  1289. pshufd $0xb1, %xmm7, %xmm12
  1290. shufps $0x39, %xmm11, %xmm11
  1291. mulps %xmm11, %xmm7
  1292. addps %xmm7, %xmm0
  1293. movaps -4 * SIZE(Y), %xmm7
  1294. mulps %xmm11, %xmm12
  1295. movaps -8 * SIZE(X), %xmm11
  1296. addps %xmm12, %xmm1
  1297. movss %xmm9, %xmm8
  1298. pshufd $0xb1, %xmm4, %xmm12
  1299. shufps $0x39, %xmm8, %xmm8
  1300. mulps %xmm8, %xmm4
  1301. addps %xmm4, %xmm0
  1302. mulps %xmm8, %xmm12
  1303. movaps -4 * SIZE(X), %xmm8
  1304. addps %xmm12, %xmm1
  /* Remaining steps only compute. */
  1305. movss %xmm10, %xmm9
  1306. pshufd $0xb1, %xmm5, %xmm12
  1307. shufps $0x39, %xmm9, %xmm9
  1308. mulps %xmm9, %xmm5
  1309. addps %xmm5, %xmm0
  1310. mulps %xmm9, %xmm12
  1311. addps %xmm12, %xmm1
  1312. movss %xmm11, %xmm10
  1313. pshufd $0xb1, %xmm6, %xmm12
  1314. shufps $0x39, %xmm10, %xmm10
  1315. mulps %xmm10, %xmm6
  1316. addps %xmm6, %xmm0
  1317. mulps %xmm10, %xmm12
  1318. addps %xmm12, %xmm1
  1319. movss %xmm8, %xmm11
  1320. pshufd $0xb1, %xmm7, %xmm12
  1321. shufps $0x39, %xmm11, %xmm11
  1322. mulps %xmm11, %xmm7
  1323. addps %xmm7, %xmm0
  1324. mulps %xmm11, %xmm12
  1325. addps %xmm12, %xmm1
  /* Advance both pointers by 32*SIZE (subtract of a negative constant). */
  1326. subq $-32 * SIZE, X
  1327. subq $-32 * SIZE, Y
  1328. ALIGN_3
  /* .L55: remainder step for bit 3 of N (four vectors, 16*SIZE per pointer), */
  /* using the same movss + shufps $0x39 lane shift of X as .L51. */
  /* The last X vector loaded (-20*SIZE) stays in xmm8 as the carried vector. */
  1329. .L55:
  1330. testq $8, N
  1331. jle .L56
  1332. movaps -32 * SIZE(Y), %xmm4
  1333. movaps -32 * SIZE(X), %xmm9
  1334. movaps -28 * SIZE(Y), %xmm5
  1335. movaps -28 * SIZE(X), %xmm10
  1336. movss %xmm9, %xmm8
  1337. pshufd $0xb1, %xmm4, %xmm12
  1338. shufps $0x39, %xmm8, %xmm8
  1339. mulps %xmm8, %xmm4
  1340. addps %xmm4, %xmm0
  1341. mulps %xmm8, %xmm12
  1342. addps %xmm12, %xmm1
  1343. movaps -24 * SIZE(Y), %xmm6
  1344. movaps -24 * SIZE(X), %xmm11
  1345. movss %xmm10, %xmm9
  1346. pshufd $0xb1, %xmm5, %xmm12
  1347. shufps $0x39, %xmm9, %xmm9
  1348. mulps %xmm9, %xmm5
  1349. addps %xmm5, %xmm0
  1350. mulps %xmm9, %xmm12
  1351. addps %xmm12, %xmm1
  1352. movaps -20 * SIZE(Y), %xmm7
  1353. movaps -20 * SIZE(X), %xmm8
  1354. movss %xmm11, %xmm10
  1355. pshufd $0xb1, %xmm6, %xmm12
  1356. shufps $0x39, %xmm10, %xmm10
  1357. mulps %xmm10, %xmm6
  1358. addps %xmm6, %xmm0
  1359. mulps %xmm10, %xmm12
  1360. addps %xmm12, %xmm1
  1361. movss %xmm8, %xmm11
  1362. pshufd $0xb1, %xmm7, %xmm12
  1363. shufps $0x39, %xmm11, %xmm11
  1364. mulps %xmm11, %xmm7
  1365. addps %xmm7, %xmm0
  1366. mulps %xmm11, %xmm12
  1367. addps %xmm12, %xmm1
  1368. addq $16 * SIZE, X
  1369. addq $16 * SIZE, Y
  1370. ALIGN_3
  /* .L56: remainder step for bit 2 of N (two vectors, 8*SIZE per pointer). */
  /* Afterwards the most recently loaded X vector (xmm10) is copied to xmm8 as the carried vector. */
  1371. .L56:
  1372. testq $4, N
  1373. jle .L57
  1374. movaps -32 * SIZE(Y), %xmm4
  1375. movaps -32 * SIZE(X), %xmm9
  1376. movss %xmm9, %xmm8
  1377. pshufd $0xb1, %xmm4, %xmm12
  1378. shufps $0x39, %xmm8, %xmm8
  1379. mulps %xmm8, %xmm4
  1380. addps %xmm4, %xmm0
  1381. mulps %xmm8, %xmm12
  1382. addps %xmm12, %xmm1
  1383. movaps -28 * SIZE(Y), %xmm5
  1384. movaps -28 * SIZE(X), %xmm10
  1385. movss %xmm10, %xmm9
  1386. pshufd $0xb1, %xmm5, %xmm12
  1387. shufps $0x39, %xmm9, %xmm9
  1388. mulps %xmm9, %xmm5
  1389. addps %xmm5, %xmm0
  1390. mulps %xmm9, %xmm12
  1391. addps %xmm12, %xmm1
  1392. movaps %xmm10, %xmm8
  1393. addq $8 * SIZE, X
  1394. addq $8 * SIZE, Y
  1395. ALIGN_3
  /* .L57: remainder step for bit 1 of N (one vector, 4*SIZE per pointer). */
  /* The freshly loaded X vector (xmm9) becomes the carried vector in xmm8 afterwards. */
  1396. .L57:
  1397. testq $2, N
  1398. jle .L58
  1399. movaps -32 * SIZE(Y), %xmm4
  1400. movaps -32 * SIZE(X), %xmm9
  1401. movss %xmm9, %xmm8
  1402. pshufd $0xb1, %xmm4, %xmm12
  1403. shufps $0x39, %xmm8, %xmm8
  1404. mulps %xmm8, %xmm4
  1405. addps %xmm4, %xmm0
  1406. mulps %xmm8, %xmm12
  1407. addps %xmm12, %xmm1
  1408. movaps %xmm9, %xmm8
  1409. addq $4 * SIZE, X
  1410. addq $4 * SIZE, Y
  1411. ALIGN_3
  /* .L58: last remainder step, for bit 0 of N; then jumps to .L98 (outside this chunk). */
  /* No X load is needed: the two remaining X lanes are already in lanes 1 and 2 of the carried xmm8. */
  1412. .L58:
  1413. testq $1, N
  1414. jle .L98
  /* `movsd` may be a macro defined outside this chunk; presumably its replacement keeps the upper half, hence the xorps - TODO confirm. */
  1415. #ifdef movsd
  1416. xorps %xmm4, %xmm4
  1417. #endif
  1418. movsd -32 * SIZE(Y), %xmm4
  1419. pshufd $0xb1, %xmm4, %xmm12
  /* 0xa9: xmm8 = {old[1], old[2], old[2], old[2]}. */
  1420. shufps $0xa9, %xmm8, %xmm8
  1421. mulps %xmm8, %xmm4
  1422. addps %xmm4, %xmm0
  1423. mulps %xmm8, %xmm12
  1424. addps %xmm12, %xmm1
  1425. jmp .L98
  1426. ALIGN_3
  /* .L60: entry of the path taken from .L50x when the 2*SIZE bit of X is set. */
  /* X is read only with movaps after being re-based by one SIZE; the needed vectors are rebuilt with */
  /* movss + shufps $0x93 in .L61. movaps requires 16-byte alignment, so the code relies on */
  /* X - 35*SIZE being aligned - TODO confirm against the tests before this chunk. */
  1427. .L60:
  1428. movaps -35 * SIZE(X), %xmm8
  1429. addq $1 * SIZE, X
  /* Swap adjacent lanes of xmm1 (it may hold the head product computed at .L50). */
  1430. shufps $0xb1, %xmm1, %xmm1
  1431. movq N, %rax
  1432. sarq $4, %rax
  1433. jle .L65
  /* Preload Y into xmm4-xmm7 and the next three X vectors into xmm9-xmm11. */
  1434. movaps -32 * SIZE(Y), %xmm4
  1435. movaps -32 * SIZE(X), %xmm9
  1436. movaps -28 * SIZE(Y), %xmm5
  1437. movaps -28 * SIZE(X), %xmm10
  1438. movaps -24 * SIZE(Y), %xmm6
  1439. movaps -24 * SIZE(X), %xmm11
  1440. movaps -20 * SIZE(Y), %xmm7
  1441. decq %rax
  1442. jle .L62
  1443. ALIGN_3
  /* .L61: steady-state loop of the .L60 path; eight steps per pass, 32*SIZE per pointer. */
  /* Step pattern (registers rotate through xmm8-xmm11 for X and xmm4-xmm7 for Y): */
  /*   movss next,prev        -> prev = {next[0], prev[1], prev[2], prev[3]} */
  /*   shufps $0x93,next,prev -> prev = {prev[3], next[0], next[1], next[2]}  (X shifted by one lane) */
  /*   xmm12 = pair-swapped Y (pshufd 0xb1); xmm0 += shiftedX * Y; xmm1 += shiftedX * swapped Y. */
  /* PREFETCH, PREFETCHSIZE and PREOFFSET are macros defined outside this chunk. */
  1444. .L61:
  1445. #ifdef PREFETCH
  1446. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  1447. #endif
  1448. movss %xmm9, %xmm8
  1449. pshufd $0xb1, %xmm4, %xmm12
  1450. shufps $0x93, %xmm9, %xmm8
  1451. mulps %xmm8, %xmm4
  1452. addps %xmm4, %xmm0
  1453. movaps -16 * SIZE(Y), %xmm4
  1454. mulps %xmm8, %xmm12
  1455. movaps -20 * SIZE(X), %xmm8
  1456. addps %xmm12, %xmm1
  1457. movss %xmm10, %xmm9
  1458. pshufd $0xb1, %xmm5, %xmm12
  1459. shufps $0x93, %xmm10, %xmm9
  1460. mulps %xmm9, %xmm5
  1461. addps %xmm5, %xmm0
  1462. movaps -12 * SIZE(Y), %xmm5
  1463. mulps %xmm9, %xmm12
  1464. movaps -16 * SIZE(X), %xmm9
  1465. addps %xmm12, %xmm1
  1466. #ifdef PREFETCH
  1467. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  1468. #endif
  1469. movss %xmm11, %xmm10
  1470. pshufd $0xb1, %xmm6, %xmm12
  1471. shufps $0x93, %xmm11, %xmm10
  1472. mulps %xmm10, %xmm6
  1473. addps %xmm6, %xmm0
  1474. movaps -8 * SIZE(Y), %xmm6
  1475. mulps %xmm10, %xmm12
  1476. movaps -12 * SIZE(X), %xmm10
  1477. addps %xmm12, %xmm1
  1478. movss %xmm8, %xmm11
  1479. pshufd $0xb1, %xmm7, %xmm12
  1480. shufps $0x93, %xmm8, %xmm11
  1481. mulps %xmm11, %xmm7
  1482. addps %xmm7, %xmm0
  1483. movaps -4 * SIZE(Y), %xmm7
  1484. mulps %xmm11, %xmm12
  1485. movaps -8 * SIZE(X), %xmm11
  1486. addps %xmm12, %xmm1
  1487. #if defined(PREFETCH) && !defined(FETCH128)
  1488. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  1489. #endif
  1490. movss %xmm9, %xmm8
  1491. pshufd $0xb1, %xmm4, %xmm12
  1492. shufps $0x93, %xmm9, %xmm8
  1493. mulps %xmm8, %xmm4
  1494. addps %xmm4, %xmm0
  1495. movaps 0 * SIZE(Y), %xmm4
  1496. mulps %xmm8, %xmm12
  1497. movaps -4 * SIZE(X), %xmm8
  1498. addps %xmm12, %xmm1
  1499. movss %xmm10, %xmm9
  1500. pshufd $0xb1, %xmm5, %xmm12
  1501. shufps $0x93, %xmm10, %xmm9
  1502. mulps %xmm9, %xmm5
  1503. addps %xmm5, %xmm0
  1504. movaps 4 * SIZE(Y), %xmm5
  1505. mulps %xmm9, %xmm12
  1506. movaps 0 * SIZE(X), %xmm9
  1507. addps %xmm12, %xmm1
  1508. #if defined(PREFETCH) && !defined(FETCH128)
  1509. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  1510. #endif
  1511. movss %xmm11, %xmm10
  1512. pshufd $0xb1, %xmm6, %xmm12
  1513. shufps $0x93, %xmm11, %xmm10
  1514. mulps %xmm10, %xmm6
  1515. addps %xmm6, %xmm0
  1516. movaps 8 * SIZE(Y), %xmm6
  1517. mulps %xmm10, %xmm12
  1518. movaps 4 * SIZE(X), %xmm10
  1519. addps %xmm12, %xmm1
  1520. movss %xmm8, %xmm11
  1521. pshufd $0xb1, %xmm7, %xmm12
  1522. shufps $0x93, %xmm8, %xmm11
  1523. mulps %xmm11, %xmm7
  1524. addps %xmm7, %xmm0
  1525. movaps 12 * SIZE(Y), %xmm7
  1526. mulps %xmm11, %xmm12
  1527. movaps 8 * SIZE(X), %xmm11
  1528. addps %xmm12, %xmm1
  /* Subtracting -32*SIZE advances both pointers by 32*SIZE. */
  1529. subq $-32 * SIZE, X
  1530. subq $-32 * SIZE, Y
  1531. decq %rax
  1532. jg .L61
  1533. ALIGN_3
  /* .L62: drain of the .L61 loop (last 32*SIZE pass of the .L60 path). */
  /* Same movss + shufps $0x93 step as .L61; loads stop once the pass's data is in registers. */
  /* The last X load (-4*SIZE) ends up in xmm8, the carried vector for the remainder blocks. */
  1534. .L62:
  1535. movss %xmm9, %xmm8
  1536. pshufd $0xb1, %xmm4, %xmm12
  1537. shufps $0x93, %xmm9, %xmm8
  1538. mulps %xmm8, %xmm4
  1539. addps %xmm4, %xmm0
  1540. movaps -16 * SIZE(Y), %xmm4
  1541. mulps %xmm8, %xmm12
  1542. movaps -20 * SIZE(X), %xmm8
  1543. addps %xmm12, %xmm1
  1544. movss %xmm10, %xmm9
  1545. pshufd $0xb1, %xmm5, %xmm12
  1546. shufps $0x93, %xmm10, %xmm9
  1547. mulps %xmm9, %xmm5
  1548. addps %xmm5, %xmm0
  1549. movaps -12 * SIZE(Y), %xmm5
  1550. mulps %xmm9, %xmm12
  1551. movaps -16 * SIZE(X), %xmm9
  1552. addps %xmm12, %xmm1
  1553. movss %xmm11, %xmm10
  1554. pshufd $0xb1, %xmm6, %xmm12
  1555. shufps $0x93, %xmm11, %xmm10
  1556. mulps %xmm10, %xmm6
  1557. addps %xmm6, %xmm0
  1558. movaps -8 * SIZE(Y), %xmm6
  1559. mulps %xmm10, %xmm12
  1560. movaps -12 * SIZE(X), %xmm10
  1561. addps %xmm12, %xmm1
  1562. movss %xmm8, %xmm11
  1563. pshufd $0xb1, %xmm7, %xmm12
  1564. shufps $0x93, %xmm8, %xmm11
  1565. mulps %xmm11, %xmm7
  1566. addps %xmm7, %xmm0
  1567. movaps -4 * SIZE(Y), %xmm7
  1568. mulps %xmm11, %xmm12
  1569. movaps -8 * SIZE(X), %xmm11
  1570. addps %xmm12, %xmm1
  1571. movss %xmm9, %xmm8
  1572. pshufd $0xb1, %xmm4, %xmm12
  1573. shufps $0x93, %xmm9, %xmm8
  1574. mulps %xmm8, %xmm4
  1575. addps %xmm4, %xmm0
  1576. mulps %xmm8, %xmm12
  1577. movaps -4 * SIZE(X), %xmm8
  1578. addps %xmm12, %xmm1
  /* Remaining steps only compute. */
  1579. movss %xmm10, %xmm9
  1580. pshufd $0xb1, %xmm5, %xmm12
  1581. shufps $0x93, %xmm10, %xmm9
  1582. mulps %xmm9, %xmm5
  1583. addps %xmm5, %xmm0
  1584. mulps %xmm9, %xmm12
  1585. addps %xmm12, %xmm1
  1586. movss %xmm11, %xmm10
  1587. pshufd $0xb1, %xmm6, %xmm12
  1588. shufps $0x93, %xmm11, %xmm10
  1589. mulps %xmm10, %xmm6
  1590. addps %xmm6, %xmm0
  1591. mulps %xmm10, %xmm12
  1592. addps %xmm12, %xmm1
  1593. movss %xmm8, %xmm11
  1594. pshufd $0xb1, %xmm7, %xmm12
  1595. shufps $0x93, %xmm8, %xmm11
  1596. mulps %xmm11, %xmm7
  1597. addps %xmm7, %xmm0
  1598. mulps %xmm11, %xmm12
  1599. addps %xmm12, %xmm1
  /* Advance both pointers by 32*SIZE (subtract of a negative constant). */
  1600. subq $-32 * SIZE, X
  1601. subq $-32 * SIZE, Y
  1602. ALIGN_3
  /* .L65: remainder step of the .L60 path for bit 3 of N (four vectors, 16*SIZE per pointer), */
  /* using the same movss + shufps $0x93 lane shift of X as .L61. */
  /* The last X vector loaded (-20*SIZE) stays in xmm8 as the carried vector. */
  1603. .L65:
  1604. testq $8, N
  1605. jle .L66
  1606. movaps -32 * SIZE(Y), %xmm4
  1607. movaps -32 * SIZE(X), %xmm9
  1608. movaps -28 * SIZE(Y), %xmm5
  1609. movaps -28 * SIZE(X), %xmm10
  1610. movss %xmm9, %xmm8
  1611. pshufd $0xb1, %xmm4, %xmm12
  1612. shufps $0x93, %xmm9, %xmm8
  1613. mulps %xmm8, %xmm4
  1614. addps %xmm4, %xmm0
  1615. mulps %xmm8, %xmm12
  1616. addps %xmm12, %xmm1
  1617. movaps -24 * SIZE(Y), %xmm6
  1618. movaps -24 * SIZE(X), %xmm11
  1619. movss %xmm10, %xmm9
  1620. pshufd $0xb1, %xmm5, %xmm12
  1621. shufps $0x93, %xmm10, %xmm9
  1622. mulps %xmm9, %xmm5
  1623. addps %xmm5, %xmm0
  1624. mulps %xmm9, %xmm12
  1625. addps %xmm12, %xmm1
  1626. movaps -20 * SIZE(Y), %xmm7
  1627. movaps -20 * SIZE(X), %xmm8
  1628. movss %xmm11, %xmm10
  1629. pshufd $0xb1, %xmm6, %xmm12
  1630. shufps $0x93, %xmm11, %xmm10
  1631. mulps %xmm10, %xmm6
  1632. addps %xmm6, %xmm0
  1633. mulps %xmm10, %xmm12
  1634. addps %xmm12, %xmm1
  1635. movss %xmm8, %xmm11
  1636. pshufd $0xb1, %xmm7, %xmm12
  1637. shufps $0x93, %xmm8, %xmm11
  1638. mulps %xmm11, %xmm7
  1639. addps %xmm7, %xmm0
  1640. mulps %xmm11, %xmm12
  1641. addps %xmm12, %xmm1
  1642. addq $16 * SIZE, X
  1643. addq $16 * SIZE, Y
  1644. ALIGN_3
  /* .L66: remainder step of the .L60 path for bit 2 of N (two vectors, 8*SIZE per pointer). */
  /* Afterwards the most recently loaded X vector (xmm10) is copied to xmm8 as the carried vector. */
  1645. .L66:
  1646. testq $4, N
  1647. jle .L67
  1648. movaps -32 * SIZE(Y), %xmm4
  1649. movaps -32 * SIZE(X), %xmm9
  1650. movss %xmm9, %xmm8
  1651. pshufd $0xb1, %xmm4, %xmm12
  1652. shufps $0x93, %xmm9, %xmm8
  1653. mulps %xmm8, %xmm4
  1654. addps %xmm4, %xmm0
  1655. mulps %xmm8, %xmm12
  1656. addps %xmm12, %xmm1
  1657. movaps -28 * SIZE(Y), %xmm5
  1658. movaps -28 * SIZE(X), %xmm10
  1659. movss %xmm10, %xmm9
  1660. pshufd $0xb1, %xmm5, %xmm12
  1661. shufps $0x93, %xmm10, %xmm9
  1662. mulps %xmm9, %xmm5
  1663. addps %xmm5, %xmm0
  1664. mulps %xmm9, %xmm12
  1665. addps %xmm12, %xmm1
  1666. movaps %xmm10, %xmm8
  1667. addq $8 * SIZE, X
  1668. addq $8 * SIZE, Y
  1669. ALIGN_3
  /* .L67: remainder step of the .L60 path for bit 1 of N (one vector, 4*SIZE per pointer). */
  /* The freshly loaded X vector (xmm9) becomes the carried vector in xmm8 afterwards. */
  1670. .L67:
  1671. testq $2, N
  1672. jle .L68
  1673. movaps -32 * SIZE(Y), %xmm4
  1674. movaps -32 * SIZE(X), %xmm9
  1675. movss %xmm9, %xmm8
  1676. pshufd $0xb1, %xmm4, %xmm12
  1677. shufps $0x93, %xmm9, %xmm8
  1678. mulps %xmm8, %xmm4
  1679. addps %xmm4, %xmm0
  1680. mulps %xmm8, %xmm12
  1681. addps %xmm12, %xmm1
  1682. movaps %xmm9, %xmm8
  1683. addq $4 * SIZE, X
  1684. addq $4 * SIZE, Y
  1685. ALIGN_3
  /* .L68: last remainder step of the .L60 path, for bit 0 of N; then jumps to .L98 (outside this chunk). */
  /* Loads 8 bytes of Y (movsd) and a single 32-bit lane of X (movss), combined with the top lane of the carried xmm8. */
  1686. .L68:
  1687. testq $1, N
  1688. jle .L98
  /* `movsd` may be a macro defined outside this chunk; presumably its replacement keeps the upper half, hence the xorps - TODO confirm. */
  1689. #ifdef movsd
  1690. xorps %xmm4, %xmm4
  1691. #endif
  1692. movsd -32 * SIZE(Y), %xmm4
  1693. movss -32 * SIZE(X), %xmm9
  1694. movss %xmm9, %xmm8
  1695. pshufd $0xb1, %xmm4, %xmm12
  /* 0x03: xmm8 = {old[3], old[0], old[0], old[0]} = {carried top lane, newly loaded lane, ...}. */
  1696. shufps $0x03, %xmm8, %xmm8
  1697. mulps %xmm8, %xmm4
  1698. addps %xmm4, %xmm0
  1699. mulps %xmm8, %xmm12
  1700. addps %xmm12, %xmm1
  1701. jmp .L98
  1702. ALIGN_3
  /* Non-ALIGNED_ACCESS continuation of .L50: if the 2*SIZE bit of Y is set, one 8-byte element */
  /* of X and Y is handled up front so that Y advances by 2*SIZE before the vector code at .L50x. */
  /* Unlike the ALIGNED_ACCESS variant, Y goes into xmm0 (and is pair-swapped into xmm1) and X into xmm4. */
  1703. #else
  1704. testq $2 * SIZE, Y
  1705. je .L50x
  /* `movsd` may be a macro defined outside this chunk; the xorps presumably clears lanes it would leave stale - TODO confirm. */
  1706. #ifdef movsd
  1707. xorps %xmm0, %xmm0
  1708. #endif
  1709. movsd -32 * SIZE(Y), %xmm0
  1710. #ifdef movsd
  1711. xorps %xmm4, %xmm4
  1712. #endif
  1713. movsd -32 * SIZE(X), %xmm4
  /* xmm0 and xmm1 are overwritten here (not accumulated): xmm0 = Y*X, xmm1 = pair-swapped Y * X. */
  1714. pshufd $0xb1, %xmm0, %xmm1
  1715. mulps %xmm4, %xmm0
  1716. mulps %xmm4, %xmm1
  1717. addq $2 * SIZE, X
  1718. addq $2 * SIZE, Y
  /* One element consumed. */
  1719. decq N
  1720. ALIGN_3
  /* .L50x (non-ALIGNED_ACCESS variant): sets up the main loop that reads Y with movaps */
  /* and X in two 8-byte halves (movlps + movhps), so no lane shifting is needed. */
  /* rax = N >> 4 = number of full 32*SIZE passes; none -> go to the remainder handling at .L55. */
  1721. .L50x:
  1722. movq N, %rax
  1723. sarq $4, %rax
  1724. jle .L55
  /* Preload four vectors of Y (xmm4-xmm7) and four of X (xmm8-xmm11). */
  1725. movaps -32 * SIZE(Y), %xmm4
  1726. movlps -32 * SIZE(X), %xmm8
  1727. movhps -30 * SIZE(X), %xmm8
  1728. movaps -28 * SIZE(Y), %xmm5
  1729. movlps -28 * SIZE(X), %xmm9
  1730. movhps -26 * SIZE(X), %xmm9
  1731. movaps -24 * SIZE(Y), %xmm6
  1732. movlps -24 * SIZE(X), %xmm10
  1733. movhps -22 * SIZE(X), %xmm10
  1734. movaps -20 * SIZE(Y), %xmm7
  1735. movlps -20 * SIZE(X), %xmm11
  1736. movhps -18 * SIZE(X), %xmm11
  /* Only one pass requested: skip the steady-state loop and drain at .L52. */
  1737. decq %rax
  1738. jle .L52
  1739. ALIGN_3
  1740. .L51:
  1741. #ifdef PREFETCH
  1742. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  1743. #endif
  1744. #ifdef PREFETCH
  1745. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  1746. #endif
  1747. #if defined(PREFETCH) && !defined(FETCH128)
  1748. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  1749. #endif
  1750. #if defined(PREFETCH) && !defined(FETCH128)
  1751. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  1752. #endif
  1753. pshufd $0xb1, %xmm4, %xmm12
  1754. mulps %xmm8, %xmm4
  1755. addps %xmm4, %xmm0
  1756. movaps -16 * SIZE(Y), %xmm4
  1757. mulps %xmm8, %xmm12
  1758. movlps -16 * SIZE(X), %xmm8
  1759. movhps -14 * SIZE(X), %xmm8
  1760. addps %xmm12, %xmm1
  1761. pshufd $0xb1, %xmm5, %xmm12
  1762. mulps %xmm9, %xmm5
  1763. addps %xmm5, %xmm0
  1764. movaps -12 * SIZE(Y), %xmm5
  1765. mulps %xmm9, %xmm12
  1766. movlps -12 * SIZE(X), %xmm9
  1767. movhps -10 * SIZE(X), %xmm9
  1768. addps %xmm12, %xmm1
  1769. pshufd $0xb1, %xmm6, %xmm12
  1770. mulps %xmm10, %xmm6
  1771. addps %xmm6, %xmm0
  1772. movaps -8 * SIZE(Y), %xmm6
  1773. mulps %xmm10, %xmm12
  1774. movlps -8 * SIZE(X), %xmm10
  1775. movhps -6 * SIZE(X), %xmm10
  1776. addps %xmm12, %xmm1
  1777. pshufd $0xb1, %xmm7, %xmm12
  1778. mulps %xmm11, %xmm7
  1779. addps %xmm7, %xmm0
  1780. movaps -4 * SIZE(Y), %xmm7
  1781. mulps %xmm11, %xmm12
  1782. movlps -4 * SIZE(X), %xmm11
  1783. movhps -2 * SIZE(X), %xmm11
  1784. addps %xmm12, %xmm1
  1785. pshufd $0xb1, %xmm4, %xmm12
  1786. mulps %xmm8, %xmm4
  1787. addps %xmm4, %xmm0
  1788. movaps 0 * SIZE(Y), %xmm4
  1789. mulps %xmm8, %xmm12
  1790. movlps 0 * SIZE(X), %xmm8
  1791. movhps 2 * SIZE(X), %xmm8
  1792. addps %xmm12, %xmm1
  1793. pshufd $0xb1, %xmm5, %xmm12
  1794. mulps %xmm9, %xmm5
  1795. addps %xmm5, %xmm0
  1796. movaps 4 * SIZE(Y), %xmm5
  1797. mulps %xmm9, %xmm12
  1798. movlps 4 * SIZE(X), %xmm9
  1799. movhps 6 * SIZE(X), %xmm9
  1800. addps %xmm12, %xmm1
  1801. pshufd $0xb1, %xmm6, %xmm12
  1802. mulps %xmm10, %xmm6
  1803. addps %xmm6, %xmm0
  1804. movaps 8 * SIZE(Y), %xmm6
  1805. mulps %xmm10, %xmm12
  1806. movlps 8 * SIZE(X), %xmm10
  1807. movhps 10 * SIZE(X), %xmm10
  1808. addps %xmm12, %xmm1
  1809. pshufd $0xb1, %xmm7, %xmm12
  1810. mulps %xmm11, %xmm7
  1811. addps %xmm7, %xmm0
  1812. movaps 12 * SIZE(Y), %xmm7
  1813. mulps %xmm11, %xmm12
  1814. movlps 12 * SIZE(X), %xmm11
  1815. movhps 14 * SIZE(X), %xmm11
  1816. addps %xmm12, %xmm1
  1817. subq $-32 * SIZE, X
  1818. subq $-32 * SIZE, Y
  1819. decq %rax
  1820. jg .L51
  1821. ALIGN_3
  1822. .L52:
  1823. pshufd $0xb1, %xmm4, %xmm12
  1824. mulps %xmm8, %xmm4
  1825. addps %xmm4, %xmm0
  1826. movaps -16 * SIZE(Y), %xmm4
  1827. mulps %xmm8, %xmm12
  1828. movlps -16 * SIZE(X), %xmm8
  1829. movhps -14 * SIZE(X), %xmm8
  1830. addps %xmm12, %xmm1
  1831. pshufd $0xb1, %xmm5, %xmm12
  1832. mulps %xmm9, %xmm5
  1833. addps %xmm5, %xmm0
  1834. movaps -12 * SIZE(Y), %xmm5
  1835. mulps %xmm9, %xmm12
  1836. movlps -12 * SIZE(X), %xmm9
  1837. movhps -10 * SIZE(X), %xmm9
  1838. addps %xmm12, %xmm1
  1839. pshufd $0xb1, %xmm6, %xmm12
  1840. mulps %xmm10, %xmm6
  1841. addps %xmm6, %xmm0
  1842. movaps -8 * SIZE(Y), %xmm6
  1843. mulps %xmm10, %xmm12
  1844. movlps -8 * SIZE(X), %xmm10
  1845. movhps -6 * SIZE(X), %xmm10
  1846. addps %xmm12, %xmm1
  1847. pshufd $0xb1, %xmm7, %xmm12
  1848. mulps %xmm11, %xmm7
  1849. addps %xmm7, %xmm0
  1850. movaps -4 * SIZE(Y), %xmm7
  1851. mulps %xmm11, %xmm12
  1852. movlps -4 * SIZE(X), %xmm11
  1853. movhps -2 * SIZE(X), %xmm11
  1854. addps %xmm12, %xmm1
  1855. pshufd $0xb1, %xmm4, %xmm12
  1856. mulps %xmm8, %xmm4
  1857. addps %xmm4, %xmm0
  1858. mulps %xmm8, %xmm12
  1859. addps %xmm12, %xmm1
  1860. pshufd $0xb1, %xmm5, %xmm12
  1861. mulps %xmm9, %xmm5
  1862. addps %xmm5, %xmm0
  1863. mulps %xmm9, %xmm12
  1864. addps %xmm12, %xmm1
  1865. pshufd $0xb1, %xmm6, %xmm12
  1866. mulps %xmm10, %xmm6
  1867. addps %xmm6, %xmm0
  1868. mulps %xmm10, %xmm12
  1869. addps %xmm12, %xmm1
  1870. pshufd $0xb1, %xmm7, %xmm12
  1871. mulps %xmm11, %xmm7
  1872. addps %xmm7, %xmm0
  1873. mulps %xmm11, %xmm12
  1874. addps %xmm12, %xmm1
  1875. subq $-32 * SIZE, X
  1876. subq $-32 * SIZE, Y
  1877. ALIGN_3
  /*
   * .L55 - remainder step for bit 3 of N: eight elements = four 16-byte
   * vectors from each of X and Y.  For every vector: xmm12 = Y with the values
   * of each pair swapped; X*Y is added to xmm0 and X*swapped(Y) to xmm1.
   */
  1878. .L55:
  1879. testq $8, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  1880. jle .L56
  /* vectors 1 and 2 */
  1881. movaps -32 * SIZE(Y), %xmm4
  1882. movlps -32 * SIZE(X), %xmm8
  1883. movhps -30 * SIZE(X), %xmm8
  1884. movaps -28 * SIZE(Y), %xmm5
  1885. movlps -28 * SIZE(X), %xmm9
  1886. movhps -26 * SIZE(X), %xmm9
  1887. pshufd $0xb1, %xmm4, %xmm12
  1888. mulps %xmm8, %xmm4
  1889. addps %xmm4, %xmm0
  1890. mulps %xmm8, %xmm12
  1891. addps %xmm12, %xmm1
  /* vector 3 is loaded while vector 2 is being accumulated */
  1892. movaps -24 * SIZE(Y), %xmm6
  1893. movlps -24 * SIZE(X), %xmm10
  1894. movhps -22 * SIZE(X), %xmm10
  1895. pshufd $0xb1, %xmm5, %xmm12
  1896. mulps %xmm9, %xmm5
  1897. addps %xmm5, %xmm0
  1898. mulps %xmm9, %xmm12
  1899. addps %xmm12, %xmm1
  /* vector 4 */
  1900. movaps -20 * SIZE(Y), %xmm7
  1901. movlps -20 * SIZE(X), %xmm11
  1902. movhps -18 * SIZE(X), %xmm11
  1903. pshufd $0xb1, %xmm6, %xmm12
  1904. mulps %xmm10, %xmm6
  1905. addps %xmm6, %xmm0
  1906. mulps %xmm10, %xmm12
  1907. addps %xmm12, %xmm1
  1908. pshufd $0xb1, %xmm7, %xmm12
  1909. mulps %xmm11, %xmm7
  1910. addps %xmm7, %xmm0
  1911. mulps %xmm11, %xmm12
  1912. addps %xmm12, %xmm1
  /* step both pointers past the four vectors just consumed */
  1913. addq $16 * SIZE, X
  1914. addq $16 * SIZE, Y
  1915. ALIGN_3
  /*
   * .L56 - remainder step for bit 2 of N: four elements = two 16-byte vectors
   * from each of X and Y, accumulated into xmm0 (X*Y) and xmm1 (X*swapped Y).
   */
  1916. .L56:
  1917. testq $4, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  1918. jle .L57
  /* first vector: Y aligned load, X assembled from two 8-byte halves */
  1919. movaps -32 * SIZE(Y), %xmm4
  1920. movlps -32 * SIZE(X), %xmm8
  1921. movhps -30 * SIZE(X), %xmm8
  /* xmm12 = Y with the two values of each pair swapped */
  1922. pshufd $0xb1, %xmm4, %xmm12
  1923. mulps %xmm8, %xmm4
  1924. addps %xmm4, %xmm0
  1925. mulps %xmm8, %xmm12
  1926. addps %xmm12, %xmm1
  /* second vector */
  1927. movaps -28 * SIZE(Y), %xmm5
  1928. movlps -28 * SIZE(X), %xmm9
  1929. movhps -26 * SIZE(X), %xmm9
  1930. pshufd $0xb1, %xmm5, %xmm12
  1931. mulps %xmm9, %xmm5
  1932. addps %xmm5, %xmm0
  1933. mulps %xmm9, %xmm12
  1934. addps %xmm12, %xmm1
  /* step both pointers past the two vectors just consumed */
  1935. addq $8 * SIZE, X
  1936. addq $8 * SIZE, Y
  1937. ALIGN_3
  /*
   * .L57 - remainder step for bit 1 of N: two elements = one 16-byte vector
   * from each of X and Y, accumulated into xmm0 (X*Y) and xmm1 (X*swapped Y).
   *
   * The former trailing "movaps %xmm9, %xmm8" was removed: xmm9 is not loaded
   * in this block, and the only code that can run next (.L58) either jumps to
   * .L98, which does not read xmm8, or reloads xmm8 from memory before using
   * it.  The copy was dead.
   */
  1938. .L57:
  1939. testq $2, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  1940. jle .L58
  /* Y with an aligned load, X assembled from two 8-byte halves */
  1941. movaps -32 * SIZE(Y), %xmm4
  1942. movlps -32 * SIZE(X), %xmm8
  1943. movhps -30 * SIZE(X), %xmm8
  /* xmm12 = Y with the two values of each pair swapped */
  1944. pshufd $0xb1, %xmm4, %xmm12
  1945. mulps %xmm8, %xmm4
  1946. addps %xmm4, %xmm0
  1947. mulps %xmm8, %xmm12
  1948. addps %xmm12, %xmm1
  /* step both pointers past the vector just consumed */
  1950. addq $4 * SIZE, X
  1951. addq $4 * SIZE, Y
  1952. ALIGN_3
  /*
   * .L58 - odd-element tail (N & 1) of the #else variant: one pair from each
   * vector, accumulated into xmm0/xmm1, then on to the shared reduction .L98.
   */
  1953. .L58:
  1954. testq $1, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  1955. jle .L98
  /* when movsd is a preprocessor macro, clear the destination before the load */
  1956. #ifdef movsd
  1957. xorps %xmm4, %xmm4
  1958. #endif
  /* xmm4 lanes 0-1 = last pair from Y */
  1959. movsd -32 * SIZE(Y), %xmm4
  1960. #ifdef movsd
  1961. xorps %xmm8, %xmm8
  1962. #endif
  /* xmm8 lanes 0-1 = last pair from X */
  1963. movsd -32 * SIZE(X), %xmm8
  /* xmm12 = Y pair with its two values swapped */
  1964. pshufd $0xb1, %xmm4, %xmm12
  1965. mulps %xmm8, %xmm4
  1966. addps %xmm4, %xmm0
  1967. mulps %xmm8, %xmm12
  1968. addps %xmm12, %xmm1
  1969. jmp .L98
  1970. ALIGN_3
  1971. #endif
  /*
   * .L70 - entry of the variant whose accumulators are kept rotated by one
   * lane (they are rotated back with shufps $0x39 at .L79 / .L89).
   * If the (2 * SIZE) bit of the Y address is set, one element is consumed
   * first.  xmm0/xmm1 are overwritten here, not accumulated into.
   */
  1972. .L70:
  1973. testq $2 * SIZE, Y
  1974. je .L70x
  /* when movsd is a preprocessor macro, clear the destination before the load */
  1975. #ifdef movsd
  1976. xorps %xmm4, %xmm4
  1977. #endif
  /* xmm4 lanes 0-1 = first pair from X */
  1978. movsd -32 * SIZE(X), %xmm4
  1979. addq $2 * SIZE, X
  1980. #ifdef movsd
  1981. xorps %xmm1, %xmm1
  1982. #endif
  /* xmm1 lanes 0-1 = first pair from Y */
  1983. movsd -32 * SIZE(Y), %xmm1
  1984. addq $2 * SIZE, Y
  /* xmm0 = Y pair swapped */
  1985. pshufd $0xb1, %xmm1, %xmm0
  /* xmm4 = X pair swapped */
  1986. shufps $0xb1, %xmm4, %xmm4
  /* xmm0 = swapped X * swapped Y (straight products, lanes 0/1 exchanged) */
  1987. mulps %xmm4, %xmm0
  /* xmm1 = swapped X * Y (cross products) */
  1988. mulps %xmm4, %xmm1
  1989. decq N
  1990. ALIGN_3
  /*
   * .L70x - choose how X is loaded and prime the rotated-lane pipeline.
   * If the (2 * SIZE) bit of the X address is set, .L80 is used instead (it
   * loads X in 8-byte halves); otherwise both X and Y are read with movaps.
   */
  1991. .L70x:
  1992. testq $2 * SIZE, X
  1993. jne .L80
  /* Load 16 bytes starting one value BEFORE the current position, then step
     the pointer by three values; the same is done for Y.  Presumably this puts
     both pointers on a 16-byte boundary for the movaps loads that follow
     (TODO confirm with the dispatch code above this chunk). */
  1994. movaps -33 * SIZE(X), %xmm4
  1995. addq $3 * SIZE, X
  1996. movaps -33 * SIZE(Y), %xmm8
  1997. addq $3 * SIZE, Y
  /* rax = N >> 4 = number of 16-element chunks; none -> remainder code .L75 */
  1998. movq N, %rax
  1999. sarq $4, %rax
  2000. jle .L75
  /* preload three more vectors of X (xmm5-xmm7) and of Y (xmm9-xmm11) */
  2001. movaps -32 * SIZE(X), %xmm5
  2002. movaps -32 * SIZE(Y), %xmm9
  2003. movaps -28 * SIZE(X), %xmm6
  2004. movaps -28 * SIZE(Y), %xmm10
  2005. movaps -24 * SIZE(X), %xmm7
  2006. movaps -24 * SIZE(Y), %xmm11
  /* exactly one chunk: skip the steady-state loop .L71 and drain in .L72 */
  2007. decq %rax
  2008. jle .L72
  2009. ALIGN_3
  2010. .L71:
  2011. #ifdef PREFETCH
  2012. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  2013. #endif
  2014. movss %xmm9, %xmm8
  2015. pshufd $0x1b, %xmm8, %xmm12
  2016. movss %xmm5, %xmm4
  2017. mulps %xmm4, %xmm8
  2018. addps %xmm8, %xmm0
  2019. movaps -20 * SIZE(Y), %xmm8
  2020. mulps %xmm4, %xmm12
  2021. movaps -20 * SIZE(X), %xmm4
  2022. addps %xmm12, %xmm1
  2023. movss %xmm10, %xmm9
  2024. pshufd $0x1b, %xmm9, %xmm12
  2025. movss %xmm6, %xmm5
  2026. mulps %xmm5, %xmm9
  2027. addps %xmm9, %xmm2
  2028. movaps -16 * SIZE(Y), %xmm9
  2029. mulps %xmm5, %xmm12
  2030. movaps -16 * SIZE(X), %xmm5
  2031. addps %xmm12, %xmm3
  2032. #ifdef PREFETCH
  2033. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  2034. #endif
  2035. movss %xmm11, %xmm10
  2036. pshufd $0x1b, %xmm10, %xmm12
  2037. movss %xmm7, %xmm6
  2038. mulps %xmm6, %xmm10
  2039. addps %xmm10, %xmm0
  2040. movaps -12 * SIZE(Y), %xmm10
  2041. mulps %xmm6, %xmm12
  2042. movaps -12 * SIZE(X), %xmm6
  2043. addps %xmm12, %xmm1
  2044. movss %xmm8, %xmm11
  2045. pshufd $0x1b, %xmm11, %xmm12
  2046. movss %xmm4, %xmm7
  2047. mulps %xmm7, %xmm11
  2048. addps %xmm11, %xmm2
  2049. movaps -8 * SIZE(Y), %xmm11
  2050. mulps %xmm7, %xmm12
  2051. movaps -8 * SIZE(X), %xmm7
  2052. addps %xmm12, %xmm3
  2053. #if defined(PREFETCH) && !defined(FETCH128)
  2054. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  2055. #endif
  2056. movss %xmm9, %xmm8
  2057. pshufd $0x1b, %xmm8, %xmm12
  2058. movss %xmm5, %xmm4
  2059. mulps %xmm4, %xmm8
  2060. addps %xmm8, %xmm0
  2061. movaps -4 * SIZE(Y), %xmm8
  2062. mulps %xmm4, %xmm12
  2063. movaps -4 * SIZE(X), %xmm4
  2064. addps %xmm12, %xmm1
  2065. movss %xmm10, %xmm9
  2066. pshufd $0x1b, %xmm9, %xmm12
  2067. movss %xmm6, %xmm5
  2068. mulps %xmm5, %xmm9
  2069. addps %xmm9, %xmm2
  2070. movaps 0 * SIZE(Y), %xmm9
  2071. mulps %xmm5, %xmm12
  2072. movaps 0 * SIZE(X), %xmm5
  2073. addps %xmm12, %xmm3
  2074. #if defined(PREFETCH) && !defined(FETCH128)
  2075. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  2076. #endif
  2077. movss %xmm11, %xmm10
  2078. pshufd $0x1b, %xmm10, %xmm12
  2079. movss %xmm7, %xmm6
  2080. mulps %xmm6, %xmm10
  2081. addps %xmm10, %xmm0
  2082. movaps 4 * SIZE(Y), %xmm10
  2083. mulps %xmm6, %xmm12
  2084. movaps 4 * SIZE(X), %xmm6
  2085. addps %xmm12, %xmm1
  2086. movss %xmm8, %xmm11
  2087. pshufd $0x1b, %xmm11, %xmm12
  2088. movss %xmm4, %xmm7
  2089. mulps %xmm7, %xmm11
  2090. addps %xmm11, %xmm2
  2091. movaps 8 * SIZE(Y), %xmm11
  2092. mulps %xmm7, %xmm12
  2093. movaps 8 * SIZE(X), %xmm7
  2094. addps %xmm12, %xmm3
  2095. subq $-32 * SIZE, X
  2096. subq $-32 * SIZE, Y
  2097. decq %rax
  2098. jg .L71
  2099. ALIGN_3
  2100. .L72:
  2101. movss %xmm9, %xmm8
  2102. pshufd $0x1b, %xmm8, %xmm12
  2103. movss %xmm5, %xmm4
  2104. mulps %xmm4, %xmm8
  2105. addps %xmm8, %xmm0
  2106. movaps -20 * SIZE(Y), %xmm8
  2107. mulps %xmm4, %xmm12
  2108. movaps -20 * SIZE(X), %xmm4
  2109. addps %xmm12, %xmm1
  2110. movss %xmm10, %xmm9
  2111. pshufd $0x1b, %xmm9, %xmm12
  2112. movss %xmm6, %xmm5
  2113. mulps %xmm5, %xmm9
  2114. addps %xmm9, %xmm2
  2115. movaps -16 * SIZE(Y), %xmm9
  2116. mulps %xmm5, %xmm12
  2117. movaps -16 * SIZE(X), %xmm5
  2118. addps %xmm12, %xmm3
  2119. movss %xmm11, %xmm10
  2120. pshufd $0x1b, %xmm10, %xmm12
  2121. movss %xmm7, %xmm6
  2122. mulps %xmm6, %xmm10
  2123. addps %xmm10, %xmm0
  2124. movaps -12 * SIZE(Y), %xmm10
  2125. mulps %xmm6, %xmm12
  2126. movaps -12 * SIZE(X), %xmm6
  2127. addps %xmm12, %xmm1
  2128. movss %xmm8, %xmm11
  2129. pshufd $0x1b, %xmm11, %xmm12
  2130. movss %xmm4, %xmm7
  2131. mulps %xmm7, %xmm11
  2132. addps %xmm11, %xmm2
  2133. movaps -8 * SIZE(Y), %xmm11
  2134. mulps %xmm7, %xmm12
  2135. movaps -8 * SIZE(X), %xmm7
  2136. addps %xmm12, %xmm3
  2137. movss %xmm9, %xmm8
  2138. pshufd $0x1b, %xmm8, %xmm12
  2139. movss %xmm5, %xmm4
  2140. mulps %xmm4, %xmm8
  2141. addps %xmm8, %xmm0
  2142. movaps -4 * SIZE(Y), %xmm8
  2143. mulps %xmm4, %xmm12
  2144. movaps -4 * SIZE(X), %xmm4
  2145. addps %xmm12, %xmm1
  2146. movss %xmm10, %xmm9
  2147. pshufd $0x1b, %xmm9, %xmm12
  2148. movss %xmm6, %xmm5
  2149. mulps %xmm5, %xmm9
  2150. addps %xmm9, %xmm2
  2151. mulps %xmm5, %xmm12
  2152. addps %xmm12, %xmm3
  2153. movss %xmm11, %xmm10
  2154. pshufd $0x1b, %xmm10, %xmm12
  2155. movss %xmm7, %xmm6
  2156. mulps %xmm6, %xmm10
  2157. addps %xmm10, %xmm0
  2158. mulps %xmm6, %xmm12
  2159. addps %xmm12, %xmm1
  2160. movss %xmm8, %xmm11
  2161. pshufd $0x1b, %xmm11, %xmm12
  2162. movss %xmm4, %xmm7
  2163. mulps %xmm7, %xmm11
  2164. addps %xmm11, %xmm2
  2165. mulps %xmm7, %xmm12
  2166. addps %xmm12, %xmm3
  2167. subq $-32 * SIZE, X
  2168. subq $-32 * SIZE, Y
  2169. ALIGN_3
  /*
   * .L75 - remainder step for bit 3 of N (eight elements, four vectors) in the
   * rotated-lane variant with movaps loads of X.
   * Pattern per vector: the freshly loaded "next" vector donates its lane 0 to
   * the "current" one (movss), so the current register holds four consecutive
   * values rotated by one lane.  pshufd $0x1b gives the lane-reversed Y copy
   * used for the second accumulator.  Sums alternate between xmm0/xmm1 and
   * xmm2/xmm3.
   */
  2170. .L75:
  2171. testq $8, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2172. jle .L76
  /* current = xmm4/xmm8, next = xmm5/xmm9 */
  2173. movaps -32 * SIZE(X), %xmm5
  2174. movaps -32 * SIZE(Y), %xmm9
  2175. movss %xmm9, %xmm8
  2176. pshufd $0x1b, %xmm8, %xmm12
  2177. movss %xmm5, %xmm4
  2178. mulps %xmm4, %xmm8
  2179. addps %xmm8, %xmm0
  2180. mulps %xmm4, %xmm12
  2181. addps %xmm12, %xmm1
  /* current = xmm5/xmm9, next = xmm6/xmm10 */
  2182. movaps -28 * SIZE(X), %xmm6
  2183. movaps -28 * SIZE(Y), %xmm10
  2184. movss %xmm10, %xmm9
  2185. pshufd $0x1b, %xmm9, %xmm12
  2186. movss %xmm6, %xmm5
  2187. mulps %xmm5, %xmm9
  2188. addps %xmm9, %xmm2
  2189. mulps %xmm5, %xmm12
  2190. addps %xmm12, %xmm3
  /* current = xmm6/xmm10, next = xmm7/xmm11 */
  2191. movaps -24 * SIZE(X), %xmm7
  2192. movaps -24 * SIZE(Y), %xmm11
  2193. movss %xmm11, %xmm10
  2194. pshufd $0x1b, %xmm10, %xmm12
  2195. movss %xmm7, %xmm6
  2196. mulps %xmm6, %xmm10
  2197. addps %xmm10, %xmm0
  2198. mulps %xmm6, %xmm12
  2199. addps %xmm12, %xmm1
  /* current = xmm7/xmm11, next = xmm4/xmm8 (these stay live as the carry) */
  2200. movaps -20 * SIZE(X), %xmm4
  2201. movaps -20 * SIZE(Y), %xmm8
  2202. movss %xmm8, %xmm11
  2203. pshufd $0x1b, %xmm11, %xmm12
  2204. movss %xmm4, %xmm7
  2205. mulps %xmm7, %xmm11
  2206. addps %xmm11, %xmm2
  2207. mulps %xmm7, %xmm12
  2208. addps %xmm12, %xmm3
  2209. addq $16 * SIZE, X
  2210. addq $16 * SIZE, Y
  2211. ALIGN_3
  /*
   * .L76 - remainder step for bit 2 of N (four elements, two vectors) in the
   * rotated-lane variant with movaps loads of X.  Same movss / pshufd $0x1b
   * pattern as the blocks around it; first vector -> xmm0/xmm1, second ->
   * xmm2/xmm3.
   */
  2212. .L76:
  2213. testq $4, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2214. jle .L77
  /* load the next two vectors of each operand up front */
  2215. movaps -32 * SIZE(X), %xmm5
  2216. movaps -32 * SIZE(Y), %xmm9
  2217. movaps -28 * SIZE(X), %xmm6
  2218. movaps -28 * SIZE(Y), %xmm10
  /* current = xmm4/xmm8 completed with lane 0 of xmm5/xmm9 */
  2219. movss %xmm9, %xmm8
  2220. pshufd $0x1b, %xmm8, %xmm12
  2221. movss %xmm5, %xmm4
  2222. mulps %xmm4, %xmm8
  2223. addps %xmm8, %xmm0
  2224. mulps %xmm4, %xmm12
  2225. addps %xmm12, %xmm1
  /* current = xmm5/xmm9 completed with lane 0 of xmm6/xmm10 */
  2226. movss %xmm10, %xmm9
  2227. pshufd $0x1b, %xmm9, %xmm12
  2228. movss %xmm6, %xmm5
  2229. mulps %xmm5, %xmm9
  2230. addps %xmm9, %xmm2
  2231. mulps %xmm5, %xmm12
  2232. addps %xmm12, %xmm3
  /* the last loaded vectors become the carry (xmm4/xmm8) for the next step */
  2233. movaps %xmm6, %xmm4
  2234. movaps %xmm10, %xmm8
  2235. addq $8 * SIZE, X
  2236. addq $8 * SIZE, Y
  2237. ALIGN_3
  /*
   * .L77 - remainder step for bit 1 of N (two elements, one vector) in the
   * rotated-lane variant with movaps loads of X.
   * X and Y are not advanced here: the code that follows (.L78, .L79) works on
   * registers only and does not read through X or Y again.
   */
  2238. .L77:
  2239. testq $2, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2240. jle .L78
  2241. movaps -32 * SIZE(X), %xmm5
  2242. movaps -32 * SIZE(Y), %xmm9
  /* current = xmm4/xmm8 completed with lane 0 of xmm5/xmm9 */
  2243. movss %xmm9, %xmm8
  2244. pshufd $0x1b, %xmm8, %xmm12
  2245. movss %xmm5, %xmm4
  2246. mulps %xmm4, %xmm8
  2247. addps %xmm8, %xmm0
  2248. mulps %xmm4, %xmm12
  2249. addps %xmm12, %xmm1
  /* the vectors just loaded become the carry (xmm4/xmm8) for .L78 */
  2250. movaps %xmm5, %xmm4
  2251. movaps %xmm9, %xmm8
  2252. ALIGN_3
  /*
   * .L78 - odd-element tail (N & 1) of the rotated-lane variant.  No memory is
   * read: the last element is taken from lanes 1-2 of the carry registers
   * xmm4 (X) and xmm8 (Y); lanes 0 and 3 are forced to zero.
   */
  2253. .L78:
  2254. testq $1, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2255. jle .L79
  /* zero lane 0 of both carry registers */
  2256. xorps %xmm5, %xmm5
  2257. movss %xmm5, %xmm4
  2258. movss %xmm5, %xmm8
  /* xmm4  = { 0, x1, x2, 0 } */
  2259. shufps $0x24, %xmm4, %xmm4
  /* xmm12 = { 0, y2, y1, 0 } - Y with lanes 1 and 2 exchanged */
  2260. pshufd $0x18, %xmm8, %xmm12
  /* xmm8  = { 0, y1, y2, 0 } */
  2261. shufps $0x24, %xmm8, %xmm8
  /* straight products -> xmm0, cross products -> xmm1 */
  2262. mulps %xmm4, %xmm8
  2263. addps %xmm8, %xmm0
  2264. mulps %xmm4, %xmm12
  2265. addps %xmm12, %xmm1
  2266. ALIGN_3
  /*
   * .L79 - exit of the rotated-lane variant (movaps X loads).  shufps $0x39 of
   * a register with itself yields { lane1, lane2, lane3, lane0 }, i.e. it
   * undoes the one-lane rotation of the four accumulators before the shared
   * reduction at .L98.
   */
  2267. .L79:
  2268. shufps $0x39, %xmm0, %xmm0
  2269. shufps $0x39, %xmm1, %xmm1
  2270. shufps $0x39, %xmm2, %xmm2
  2271. shufps $0x39, %xmm3, %xmm3
  2272. jmp .L98
  2273. ALIGN_3
  /*
   * .L80 - same rotated-lane scheme as .L70x, taken when the (2 * SIZE) bit of
   * the X address is set.  X vectors are assembled from two 8-byte halves
   * (movsd + movhps) instead of movaps; Y is still read with movaps.
   */
  2274. .L80:
  /* 16 bytes of X starting one value before the current position */
  2275. movsd -33 * SIZE(X), %xmm4
  2276. movhps -31 * SIZE(X), %xmm4
  2277. addq $3 * SIZE, X
  /* same for Y, with one aligned load */
  2278. movaps -33 * SIZE(Y), %xmm8
  2279. addq $3 * SIZE, Y
  /* rax = N >> 4 = number of 16-element chunks; none -> remainder code .L85 */
  2280. movq N, %rax
  2281. sarq $4, %rax
  2282. jle .L85
  /* preload three more vectors of X (xmm5-xmm7) and of Y (xmm9-xmm11) */
  2283. movsd -32 * SIZE(X), %xmm5
  2284. movhps -30 * SIZE(X), %xmm5
  2285. movaps -32 * SIZE(Y), %xmm9
  2286. movsd -28 * SIZE(X), %xmm6
  2287. movhps -26 * SIZE(X), %xmm6
  2288. movaps -28 * SIZE(Y), %xmm10
  2289. movsd -24 * SIZE(X), %xmm7
  2290. movhps -22 * SIZE(X), %xmm7
  2291. movaps -24 * SIZE(Y), %xmm11
  /* exactly one chunk: skip the steady-state loop .L81 and drain in .L82 */
  2292. decq %rax
  2293. jle .L82
  2294. ALIGN_3
  2295. .L81:
  2296. #ifdef PREFETCH
  2297. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  2298. #endif
  2299. movss %xmm9, %xmm8
  2300. pshufd $0x1b, %xmm8, %xmm12
  2301. movss %xmm5, %xmm4
  2302. mulps %xmm4, %xmm8
  2303. addps %xmm8, %xmm0
  2304. movaps -20 * SIZE(Y), %xmm8
  2305. mulps %xmm4, %xmm12
  2306. movsd -20 * SIZE(X), %xmm4
  2307. movhps -18 * SIZE(X), %xmm4
  2308. addps %xmm12, %xmm1
  2309. movss %xmm10, %xmm9
  2310. pshufd $0x1b, %xmm9, %xmm12
  2311. movss %xmm6, %xmm5
  2312. mulps %xmm5, %xmm9
  2313. addps %xmm9, %xmm2
  2314. movaps -16 * SIZE(Y), %xmm9
  2315. mulps %xmm5, %xmm12
  2316. movsd -16 * SIZE(X), %xmm5
  2317. movhps -14 * SIZE(X), %xmm5
  2318. addps %xmm12, %xmm3
  2319. #ifdef PREFETCH
  2320. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  2321. #endif
  2322. movss %xmm11, %xmm10
  2323. pshufd $0x1b, %xmm10, %xmm12
  2324. movss %xmm7, %xmm6
  2325. mulps %xmm6, %xmm10
  2326. addps %xmm10, %xmm0
  2327. movaps -12 * SIZE(Y), %xmm10
  2328. mulps %xmm6, %xmm12
  2329. movsd -12 * SIZE(X), %xmm6
  2330. movhps -10 * SIZE(X), %xmm6
  2331. addps %xmm12, %xmm1
  2332. movss %xmm8, %xmm11
  2333. pshufd $0x1b, %xmm11, %xmm12
  2334. movss %xmm4, %xmm7
  2335. mulps %xmm7, %xmm11
  2336. addps %xmm11, %xmm2
  2337. movaps -8 * SIZE(Y), %xmm11
  2338. mulps %xmm7, %xmm12
  2339. movsd -8 * SIZE(X), %xmm7
  2340. movhps -6 * SIZE(X), %xmm7
  2341. addps %xmm12, %xmm3
  2342. #if defined(PREFETCH) && !defined(FETCH128)
  2343. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  2344. #endif
  2345. movss %xmm9, %xmm8
  2346. pshufd $0x1b, %xmm8, %xmm12
  2347. movss %xmm5, %xmm4
  2348. mulps %xmm4, %xmm8
  2349. addps %xmm8, %xmm0
  2350. movaps -4 * SIZE(Y), %xmm8
  2351. mulps %xmm4, %xmm12
  2352. movsd -4 * SIZE(X), %xmm4
  2353. movhps -2 * SIZE(X), %xmm4
  2354. addps %xmm12, %xmm1
  2355. movss %xmm10, %xmm9
  2356. pshufd $0x1b, %xmm9, %xmm12
  2357. movss %xmm6, %xmm5
  2358. mulps %xmm5, %xmm9
  2359. addps %xmm9, %xmm2
  2360. movaps 0 * SIZE(Y), %xmm9
  2361. mulps %xmm5, %xmm12
  2362. movsd 0 * SIZE(X), %xmm5
  2363. movhps 2 * SIZE(X), %xmm5
  2364. addps %xmm12, %xmm3
  2365. #if defined(PREFETCH) && !defined(FETCH128)
  2366. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  2367. #endif
  2368. movss %xmm11, %xmm10
  2369. pshufd $0x1b, %xmm10, %xmm12
  2370. movss %xmm7, %xmm6
  2371. mulps %xmm6, %xmm10
  2372. addps %xmm10, %xmm0
  2373. movaps 4 * SIZE(Y), %xmm10
  2374. mulps %xmm6, %xmm12
  2375. movsd 4 * SIZE(X), %xmm6
  2376. movhps 6 * SIZE(X), %xmm6
  2377. addps %xmm12, %xmm1
  2378. movss %xmm8, %xmm11
  2379. pshufd $0x1b, %xmm11, %xmm12
  2380. movss %xmm4, %xmm7
  2381. mulps %xmm7, %xmm11
  2382. addps %xmm11, %xmm2
  2383. movaps 8 * SIZE(Y), %xmm11
  2384. mulps %xmm7, %xmm12
  2385. movsd 8 * SIZE(X), %xmm7
  2386. movhps 10 * SIZE(X), %xmm7
  2387. addps %xmm12, %xmm3
  2388. subq $-32 * SIZE, X
  2389. subq $-32 * SIZE, Y
  2390. decq %rax
  2391. jg .L81
  2392. ALIGN_3
  2393. .L82:
  2394. movss %xmm9, %xmm8
  2395. pshufd $0x1b, %xmm8, %xmm12
  2396. movss %xmm5, %xmm4
  2397. mulps %xmm4, %xmm8
  2398. addps %xmm8, %xmm0
  2399. movaps -20 * SIZE(Y), %xmm8
  2400. mulps %xmm4, %xmm12
  2401. movsd -20 * SIZE(X), %xmm4
  2402. movhps -18 * SIZE(X), %xmm4
  2403. addps %xmm12, %xmm1
  2404. movss %xmm10, %xmm9
  2405. pshufd $0x1b, %xmm9, %xmm12
  2406. movss %xmm6, %xmm5
  2407. mulps %xmm5, %xmm9
  2408. addps %xmm9, %xmm2
  2409. movaps -16 * SIZE(Y), %xmm9
  2410. mulps %xmm5, %xmm12
  2411. movsd -16 * SIZE(X), %xmm5
  2412. movhps -14 * SIZE(X), %xmm5
  2413. addps %xmm12, %xmm3
  2414. movss %xmm11, %xmm10
  2415. pshufd $0x1b, %xmm10, %xmm12
  2416. movss %xmm7, %xmm6
  2417. mulps %xmm6, %xmm10
  2418. addps %xmm10, %xmm0
  2419. movaps -12 * SIZE(Y), %xmm10
  2420. mulps %xmm6, %xmm12
  2421. movsd -12 * SIZE(X), %xmm6
  2422. movhps -10 * SIZE(X), %xmm6
  2423. addps %xmm12, %xmm1
  2424. movss %xmm8, %xmm11
  2425. pshufd $0x1b, %xmm11, %xmm12
  2426. movss %xmm4, %xmm7
  2427. mulps %xmm7, %xmm11
  2428. addps %xmm11, %xmm2
  2429. movaps -8 * SIZE(Y), %xmm11
  2430. mulps %xmm7, %xmm12
  2431. movsd -8 * SIZE(X), %xmm7
  2432. movhps -6 * SIZE(X), %xmm7
  2433. addps %xmm12, %xmm3
  2434. movss %xmm9, %xmm8
  2435. pshufd $0x1b, %xmm8, %xmm12
  2436. movss %xmm5, %xmm4
  2437. mulps %xmm4, %xmm8
  2438. addps %xmm8, %xmm0
  2439. movaps -4 * SIZE(Y), %xmm8
  2440. mulps %xmm4, %xmm12
  2441. movsd -4 * SIZE(X), %xmm4
  2442. movhps -2 * SIZE(X), %xmm4
  2443. addps %xmm12, %xmm1
  2444. movss %xmm10, %xmm9
  2445. pshufd $0x1b, %xmm9, %xmm12
  2446. movss %xmm6, %xmm5
  2447. mulps %xmm5, %xmm9
  2448. addps %xmm9, %xmm2
  2449. mulps %xmm5, %xmm12
  2450. addps %xmm12, %xmm3
  2451. movss %xmm11, %xmm10
  2452. pshufd $0x1b, %xmm10, %xmm12
  2453. movss %xmm7, %xmm6
  2454. mulps %xmm6, %xmm10
  2455. addps %xmm10, %xmm0
  2456. mulps %xmm6, %xmm12
  2457. addps %xmm12, %xmm1
  2458. movss %xmm8, %xmm11
  2459. pshufd $0x1b, %xmm11, %xmm12
  2460. movss %xmm4, %xmm7
  2461. mulps %xmm7, %xmm11
  2462. addps %xmm11, %xmm2
  2463. mulps %xmm7, %xmm12
  2464. addps %xmm12, %xmm3
  2465. subq $-32 * SIZE, X
  2466. subq $-32 * SIZE, Y
  2467. ALIGN_3
  /*
   * .L85 - remainder step for bit 3 of N (eight elements, four vectors) in the
   * rotated-lane variant that loads X in 8-byte halves.
   * Pattern per vector: the freshly loaded "next" vector donates its lane 0 to
   * the "current" one (movss); pshufd $0x1b gives the lane-reversed Y copy for
   * the second accumulator.  Sums alternate between xmm0/xmm1 and xmm2/xmm3.
   */
  2468. .L85:
  2469. testq $8, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2470. jle .L86
  /* current = xmm4/xmm8, next = xmm5/xmm9 */
  2471. movsd -32 * SIZE(X), %xmm5
  2472. movhps -30 * SIZE(X), %xmm5
  2473. movaps -32 * SIZE(Y), %xmm9
  2474. movss %xmm9, %xmm8
  2475. pshufd $0x1b, %xmm8, %xmm12
  2476. movss %xmm5, %xmm4
  2477. mulps %xmm4, %xmm8
  2478. addps %xmm8, %xmm0
  2479. mulps %xmm4, %xmm12
  2480. addps %xmm12, %xmm1
  /* current = xmm5/xmm9, next = xmm6/xmm10 */
  2481. movsd -28 * SIZE(X), %xmm6
  2482. movhps -26 * SIZE(X), %xmm6
  2483. movaps -28 * SIZE(Y), %xmm10
  2484. movss %xmm10, %xmm9
  2485. pshufd $0x1b, %xmm9, %xmm12
  2486. movss %xmm6, %xmm5
  2487. mulps %xmm5, %xmm9
  2488. addps %xmm9, %xmm2
  2489. mulps %xmm5, %xmm12
  2490. addps %xmm12, %xmm3
  /* current = xmm6/xmm10, next = xmm7/xmm11 */
  2491. movsd -24 * SIZE(X), %xmm7
  2492. movhps -22 * SIZE(X), %xmm7
  2493. movaps -24 * SIZE(Y), %xmm11
  2494. movss %xmm11, %xmm10
  2495. pshufd $0x1b, %xmm10, %xmm12
  2496. movss %xmm7, %xmm6
  2497. mulps %xmm6, %xmm10
  2498. addps %xmm10, %xmm0
  2499. mulps %xmm6, %xmm12
  2500. addps %xmm12, %xmm1
  /* current = xmm7/xmm11, next = xmm4/xmm8 (these stay live as the carry) */
  2501. movsd -20 * SIZE(X), %xmm4
  2502. movhps -18 * SIZE(X), %xmm4
  2503. movaps -20 * SIZE(Y), %xmm8
  2504. movss %xmm8, %xmm11
  2505. pshufd $0x1b, %xmm11, %xmm12
  2506. movss %xmm4, %xmm7
  2507. mulps %xmm7, %xmm11
  2508. addps %xmm11, %xmm2
  2509. mulps %xmm7, %xmm12
  2510. addps %xmm12, %xmm3
  2511. addq $16 * SIZE, X
  2512. addq $16 * SIZE, Y
  2513. ALIGN_3
  /*
   * .L86 - remainder step for bit 2 of N (four elements, two vectors) in the
   * rotated-lane variant that loads X in 8-byte halves.  First vector ->
   * xmm0/xmm1, second -> xmm2/xmm3.
   */
  2514. .L86:
  2515. testq $4, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2516. jle .L87
  /* current = xmm4/xmm8 completed with lane 0 of xmm5/xmm9 */
  2517. movsd -32 * SIZE(X), %xmm5
  2518. movhps -30 * SIZE(X), %xmm5
  2519. movaps -32 * SIZE(Y), %xmm9
  2520. movss %xmm9, %xmm8
  2521. pshufd $0x1b, %xmm8, %xmm12
  2522. movss %xmm5, %xmm4
  2523. mulps %xmm4, %xmm8
  2524. addps %xmm8, %xmm0
  2525. mulps %xmm4, %xmm12
  2526. addps %xmm12, %xmm1
  /* current = xmm5/xmm9 completed with lane 0 of xmm6/xmm10 */
  2527. movsd -28 * SIZE(X), %xmm6
  2528. movhps -26 * SIZE(X), %xmm6
  2529. movaps -28 * SIZE(Y), %xmm10
  2530. movss %xmm10, %xmm9
  2531. pshufd $0x1b, %xmm9, %xmm12
  2532. movss %xmm6, %xmm5
  2533. mulps %xmm5, %xmm9
  2534. addps %xmm9, %xmm2
  2535. mulps %xmm5, %xmm12
  2536. addps %xmm12, %xmm3
  /* the last loaded vectors become the carry (xmm4/xmm8) for the next step */
  2537. movaps %xmm6, %xmm4
  2538. movaps %xmm10, %xmm8
  2539. addq $8 * SIZE, X
  2540. addq $8 * SIZE, Y
  2541. ALIGN_3
  /*
   * .L87 - remainder step for bit 1 of N (two elements, one vector) in the
   * rotated-lane variant that loads X in 8-byte halves.
   * X and Y are not advanced here: the code that follows (.L88, .L89) works on
   * registers only and does not read through X or Y again.
   */
  2542. .L87:
  2543. testq $2, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2544. jle .L88
  2545. movsd -32 * SIZE(X), %xmm5
  2546. movhps -30 * SIZE(X), %xmm5
  2547. movaps -32 * SIZE(Y), %xmm9
  /* current = xmm4/xmm8 completed with lane 0 of xmm5/xmm9 */
  2548. movss %xmm9, %xmm8
  2549. pshufd $0x1b, %xmm8, %xmm12
  2550. movss %xmm5, %xmm4
  2551. mulps %xmm4, %xmm8
  2552. addps %xmm8, %xmm0
  2553. mulps %xmm4, %xmm12
  2554. addps %xmm12, %xmm1
  /* the vectors just loaded become the carry (xmm4/xmm8) for .L88 */
  2555. movaps %xmm5, %xmm4
  2556. movaps %xmm9, %xmm8
  2557. ALIGN_3
  /*
   * .L88 - odd-element tail (N & 1) of the rotated-lane variant that loads X
   * in halves.  No memory is read: the last element is taken from lanes 1-2 of
   * the carry registers xmm4 (X) and xmm8 (Y); lanes 0 and 3 are zeroed.
   */
  2558. .L88:
  2559. testq $1, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2560. jle .L89
  /* zero lane 0 of both carry registers */
  2561. xorps %xmm5, %xmm5
  2562. movss %xmm5, %xmm4
  2563. movss %xmm5, %xmm8
  /* xmm4  = { 0, x1, x2, 0 } */
  2564. shufps $0x24, %xmm4, %xmm4
  /* xmm12 = { 0, y2, y1, 0 } - Y with lanes 1 and 2 exchanged */
  2565. pshufd $0x18, %xmm8, %xmm12
  /* xmm8  = { 0, y1, y2, 0 } */
  2566. shufps $0x24, %xmm8, %xmm8
  /* straight products -> xmm0, cross products -> xmm1 */
  2567. mulps %xmm4, %xmm8
  2568. addps %xmm8, %xmm0
  2569. mulps %xmm4, %xmm12
  2570. addps %xmm12, %xmm1
  2571. ALIGN_3
  /*
   * .L89 - exit of the rotated-lane variant (X loaded in halves).  shufps
   * $0x39 of a register with itself yields { lane1, lane2, lane3, lane0 },
   * undoing the one-lane rotation of the four accumulators before the shared
   * reduction at .L98.
   */
  2572. .L89:
  2573. shufps $0x39, %xmm0, %xmm0
  2574. shufps $0x39, %xmm1, %xmm1
  2575. shufps $0x39, %xmm2, %xmm2
  2576. shufps $0x39, %xmm3, %xmm3
  2577. jmp .L98
  2578. ALIGN_3
  /*
   * .L200 - strided variant.  Elements are fetched one pair at a time from (X)
   * and (Y) with no displacement; after every pair the pointer is advanced by
   * INCX / INCY, which are added to the pointers as-is (i.e. used as byte
   * strides here; any scaling must happen outside this chunk).  Two elements
   * are packed per register: movsd fills lanes 0-1, movhps lanes 2-3.
   */
  2579. .L200:
  /* rax = N >> 4 = number of 16-element chunks; none -> remainder code .L205 */
  2580. movq N, %rax
  2581. sarq $4, %rax
  2582. jle .L205
  /* preload: xmm4/xmm8 = elements 0-1 of X/Y */
  2583. movsd (X), %xmm4
  2584. addq INCX, X
  2585. movhps (X), %xmm4
  2586. addq INCX, X
  2587. movsd (Y), %xmm8
  2588. addq INCY, Y
  2589. movhps (Y), %xmm8
  2590. addq INCY, Y
  /* xmm5/xmm9 = elements 2-3 */
  2591. movsd (X), %xmm5
  2592. addq INCX, X
  2593. movhps (X), %xmm5
  2594. addq INCX, X
  2595. movsd (Y), %xmm9
  2596. addq INCY, Y
  2597. movhps (Y), %xmm9
  2598. addq INCY, Y
  /* xmm6/xmm10 = elements 4-5 */
  2599. movsd (X), %xmm6
  2600. addq INCX, X
  2601. movhps (X), %xmm6
  2602. addq INCX, X
  2603. movsd (Y), %xmm10
  2604. addq INCY, Y
  2605. movhps (Y), %xmm10
  2606. addq INCY, Y
  /* xmm7/xmm11 = elements 6-7 */
  2607. movsd (X), %xmm7
  2608. addq INCX, X
  2609. movhps (X), %xmm7
  2610. addq INCX, X
  2611. movsd (Y), %xmm11
  2612. addq INCY, Y
  2613. movhps (Y), %xmm11
  2614. addq INCY, Y
  /* exactly one chunk: skip the steady-state loop .L203 and drain in .L204 */
  2615. decq %rax
  2616. jle .L204
  2617. ALIGN_3
  2618. .L203:
  2619. pshufd $0xb1, %xmm8, %xmm12
  2620. mulps %xmm4, %xmm8
  2621. addps %xmm8, %xmm0
  2622. movsd (Y), %xmm8
  2623. addq INCY, Y
  2624. movhps (Y), %xmm8
  2625. addq INCY, Y
  2626. mulps %xmm4, %xmm12
  2627. movsd (X), %xmm4
  2628. addq INCX, X
  2629. movhps (X), %xmm4
  2630. addq INCX, X
  2631. addps %xmm12, %xmm1
  2632. pshufd $0xb1, %xmm9, %xmm12
  2633. mulps %xmm5, %xmm9
  2634. addps %xmm9, %xmm2
  2635. movsd (Y), %xmm9
  2636. addq INCY, Y
  2637. movhps (Y), %xmm9
  2638. addq INCY, Y
  2639. mulps %xmm5, %xmm12
  2640. movsd (X), %xmm5
  2641. addq INCX, X
  2642. movhps (X), %xmm5
  2643. addq INCX, X
  2644. addps %xmm12, %xmm3
  2645. pshufd $0xb1, %xmm10, %xmm12
  2646. mulps %xmm6, %xmm10
  2647. addps %xmm10, %xmm0
  2648. movsd (Y), %xmm10
  2649. addq INCY, Y
  2650. movhps (Y), %xmm10
  2651. addq INCY, Y
  2652. mulps %xmm6, %xmm12
  2653. movsd (X), %xmm6
  2654. addq INCX, X
  2655. movhps (X), %xmm6
  2656. addq INCX, X
  2657. addps %xmm12, %xmm1
  2658. pshufd $0xb1, %xmm11, %xmm12
  2659. mulps %xmm7, %xmm11
  2660. addps %xmm11, %xmm2
  2661. movsd (Y), %xmm11
  2662. addq INCY, Y
  2663. movhps (Y), %xmm11
  2664. addq INCY, Y
  2665. mulps %xmm7, %xmm12
  2666. movsd (X), %xmm7
  2667. addq INCX, X
  2668. movhps (X), %xmm7
  2669. addq INCX, X
  2670. addps %xmm12, %xmm3
  2671. pshufd $0xb1, %xmm8, %xmm12
  2672. mulps %xmm4, %xmm8
  2673. addps %xmm8, %xmm0
  2674. movsd (Y), %xmm8
  2675. addq INCY, Y
  2676. movhps (Y), %xmm8
  2677. addq INCY, Y
  2678. mulps %xmm4, %xmm12
  2679. movsd (X), %xmm4
  2680. addq INCX, X
  2681. movhps (X), %xmm4
  2682. addq INCX, X
  2683. addps %xmm12, %xmm1
  2684. pshufd $0xb1, %xmm9, %xmm12
  2685. mulps %xmm5, %xmm9
  2686. addps %xmm9, %xmm2
  2687. movsd (Y), %xmm9
  2688. addq INCY, Y
  2689. movhps (Y), %xmm9
  2690. addq INCY, Y
  2691. mulps %xmm5, %xmm12
  2692. movsd (X), %xmm5
  2693. addq INCX, X
  2694. movhps (X), %xmm5
  2695. addq INCX, X
  2696. addps %xmm12, %xmm3
  2697. pshufd $0xb1, %xmm10, %xmm12
  2698. mulps %xmm6, %xmm10
  2699. addps %xmm10, %xmm0
  2700. movsd (Y), %xmm10
  2701. addq INCY, Y
  2702. movhps (Y), %xmm10
  2703. addq INCY, Y
  2704. mulps %xmm6, %xmm12
  2705. movsd (X), %xmm6
  2706. addq INCX, X
  2707. movhps (X), %xmm6
  2708. addq INCX, X
  2709. addps %xmm12, %xmm1
  2710. pshufd $0xb1, %xmm11, %xmm12
  2711. mulps %xmm7, %xmm11
  2712. addps %xmm11, %xmm2
  2713. movsd (Y), %xmm11
  2714. addq INCY, Y
  2715. movhps (Y), %xmm11
  2716. addq INCY, Y
  2717. mulps %xmm7, %xmm12
  2718. movsd (X), %xmm7
  2719. addq INCX, X
  2720. movhps (X), %xmm7
  2721. addq INCX, X
  2722. addps %xmm12, %xmm3
  2723. decq %rax
  2724. jg .L203
  2725. ALIGN_3
  2726. .L204:
  2727. pshufd $0xb1, %xmm8, %xmm12
  2728. mulps %xmm4, %xmm8
  2729. addps %xmm8, %xmm0
  2730. movsd (Y), %xmm8
  2731. addq INCY, Y
  2732. movhps (Y), %xmm8
  2733. addq INCY, Y
  2734. mulps %xmm4, %xmm12
  2735. movsd (X), %xmm4
  2736. addq INCX, X
  2737. movhps (X), %xmm4
  2738. addq INCX, X
  2739. addps %xmm12, %xmm1
  2740. pshufd $0xb1, %xmm9, %xmm12
  2741. mulps %xmm5, %xmm9
  2742. addps %xmm9, %xmm2
  2743. movsd (Y), %xmm9
  2744. addq INCY, Y
  2745. movhps (Y), %xmm9
  2746. addq INCY, Y
  2747. mulps %xmm5, %xmm12
  2748. movsd (X), %xmm5
  2749. addq INCX, X
  2750. movhps (X), %xmm5
  2751. addq INCX, X
  2752. addps %xmm12, %xmm3
  2753. pshufd $0xb1, %xmm10, %xmm12
  2754. mulps %xmm6, %xmm10
  2755. addps %xmm10, %xmm0
  2756. movsd (Y), %xmm10
  2757. addq INCY, Y
  2758. movhps (Y), %xmm10
  2759. addq INCY, Y
  2760. mulps %xmm6, %xmm12
  2761. movsd (X), %xmm6
  2762. addq INCX, X
  2763. movhps (X), %xmm6
  2764. addq INCX, X
  2765. addps %xmm12, %xmm1
  2766. pshufd $0xb1, %xmm11, %xmm12
  2767. mulps %xmm7, %xmm11
  2768. addps %xmm11, %xmm2
  2769. movsd (Y), %xmm11
  2770. addq INCY, Y
  2771. movhps (Y), %xmm11
  2772. addq INCY, Y
  2773. mulps %xmm7, %xmm12
  2774. movsd (X), %xmm7
  2775. addq INCX, X
  2776. movhps (X), %xmm7
  2777. addq INCX, X
  2778. addps %xmm12, %xmm3
  2779. pshufd $0xb1, %xmm8, %xmm12
  2780. mulps %xmm4, %xmm8
  2781. addps %xmm8, %xmm0
  2782. mulps %xmm4, %xmm12
  2783. addps %xmm12, %xmm1
  2784. pshufd $0xb1, %xmm9, %xmm12
  2785. mulps %xmm5, %xmm9
  2786. addps %xmm9, %xmm2
  2787. mulps %xmm5, %xmm12
  2788. addps %xmm12, %xmm3
  2789. pshufd $0xb1, %xmm10, %xmm12
  2790. mulps %xmm6, %xmm10
  2791. addps %xmm10, %xmm0
  2792. mulps %xmm6, %xmm12
  2793. addps %xmm12, %xmm1
  2794. pshufd $0xb1, %xmm11, %xmm12
  2795. mulps %xmm7, %xmm11
  2796. addps %xmm11, %xmm2
  2797. mulps %xmm7, %xmm12
  2798. addps %xmm12, %xmm3
  2799. ALIGN_3
  /*
   * .L205 - strided remainder step for bit 3 of N: eight elements, gathered
   * two per register.  For every register pair: xmm12 = Y with the values of
   * each pair swapped; X*Y and X*swapped(Y) are added alternately to
   * xmm0/xmm1 and xmm2/xmm3.
   */
  2800. .L205:
  2801. testq $8, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2802. jle .L206
  /* elements 0-1 -> xmm0/xmm1 */
  2803. movsd (X), %xmm4
  2804. addq INCX, X
  2805. movhps (X), %xmm4
  2806. addq INCX, X
  2807. movsd (Y), %xmm8
  2808. addq INCY, Y
  2809. movhps (Y), %xmm8
  2810. addq INCY, Y
  2811. pshufd $0xb1, %xmm8, %xmm12
  2812. mulps %xmm4, %xmm8
  2813. addps %xmm8, %xmm0
  2814. mulps %xmm4, %xmm12
  2815. addps %xmm12, %xmm1
  /* elements 2-3 -> xmm2/xmm3 */
  2816. movsd (X), %xmm5
  2817. addq INCX, X
  2818. movhps (X), %xmm5
  2819. addq INCX, X
  2820. movsd (Y), %xmm9
  2821. addq INCY, Y
  2822. movhps (Y), %xmm9
  2823. addq INCY, Y
  2824. pshufd $0xb1, %xmm9, %xmm12
  2825. mulps %xmm5, %xmm9
  2826. addps %xmm9, %xmm2
  2827. mulps %xmm5, %xmm12
  2828. addps %xmm12, %xmm3
  /* elements 4-5 -> xmm0/xmm1 */
  2829. movsd (X), %xmm6
  2830. addq INCX, X
  2831. movhps (X), %xmm6
  2832. addq INCX, X
  2833. movsd (Y), %xmm10
  2834. addq INCY, Y
  2835. movhps (Y), %xmm10
  2836. addq INCY, Y
  2837. pshufd $0xb1, %xmm10, %xmm12
  2838. mulps %xmm6, %xmm10
  2839. addps %xmm10, %xmm0
  2840. mulps %xmm6, %xmm12
  2841. addps %xmm12, %xmm1
  /* elements 6-7 -> xmm2/xmm3 */
  2842. movsd (X), %xmm7
  2843. addq INCX, X
  2844. movhps (X), %xmm7
  2845. addq INCX, X
  2846. movsd (Y), %xmm11
  2847. addq INCY, Y
  2848. movhps (Y), %xmm11
  2849. addq INCY, Y
  2850. pshufd $0xb1, %xmm11, %xmm12
  2851. mulps %xmm7, %xmm11
  2852. addps %xmm11, %xmm2
  2853. mulps %xmm7, %xmm12
  2854. addps %xmm12, %xmm3
  2855. ALIGN_3
  /*
   * .L206 - strided remainder step for bit 2 of N: four elements, gathered two
   * per register; first register pair -> xmm0/xmm1, second -> xmm2/xmm3.
   */
  2856. .L206:
  2857. testq $4, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2858. jle .L207
  /* elements 0-1 */
  2859. movsd (X), %xmm4
  2860. addq INCX, X
  2861. movhps (X), %xmm4
  2862. addq INCX, X
  2863. movsd (Y), %xmm8
  2864. addq INCY, Y
  2865. movhps (Y), %xmm8
  2866. addq INCY, Y
  /* xmm12 = Y with the two values of each pair swapped */
  2867. pshufd $0xb1, %xmm8, %xmm12
  2868. mulps %xmm4, %xmm8
  2869. addps %xmm8, %xmm0
  2870. mulps %xmm4, %xmm12
  2871. addps %xmm12, %xmm1
  /* elements 2-3 */
  2872. movsd (X), %xmm5
  2873. addq INCX, X
  2874. movhps (X), %xmm5
  2875. addq INCX, X
  2876. movsd (Y), %xmm9
  2877. addq INCY, Y
  2878. movhps (Y), %xmm9
  2879. addq INCY, Y
  2880. pshufd $0xb1, %xmm9, %xmm12
  2881. mulps %xmm5, %xmm9
  2882. addps %xmm9, %xmm2
  2883. mulps %xmm5, %xmm12
  2884. addps %xmm12, %xmm3
  2885. ALIGN_3
  /*
   * .L207 - strided remainder step for bit 1 of N: two elements gathered into
   * one register per operand and accumulated into xmm0/xmm1.
   */
  2886. .L207:
  2887. testq $2, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2888. jle .L208
  /* lanes 0-1 from the first element, lanes 2-3 from the one a stride later */
  2889. movsd (X), %xmm4
  2890. addq INCX, X
  2891. movhps (X), %xmm4
  2892. addq INCX, X
  2893. movsd (Y), %xmm8
  2894. addq INCY, Y
  2895. movhps (Y), %xmm8
  2896. addq INCY, Y
  /* xmm12 = Y with the two values of each pair swapped */
  2897. pshufd $0xb1, %xmm8, %xmm12
  2898. mulps %xmm4, %xmm8
  2899. addps %xmm8, %xmm0
  2900. mulps %xmm4, %xmm12
  2901. addps %xmm12, %xmm1
  2902. ALIGN_3
  /*
   * .L208 - strided odd-element tail (N & 1): one pair from each operand,
   * accumulated into xmm0/xmm1; control then falls through into .L98.
   */
  2903. .L208:
  2904. testq $1, N
  /* after testq, jle is taken exactly when the tested bit is clear */
  2905. jle .L98
  /* when movsd is a preprocessor macro, clear the destination before the load */
  2906. #ifdef movsd
  2907. xorps %xmm4, %xmm4
  2908. #endif
  /* xmm4 lanes 0-1 = last pair from X */
  2909. movsd (X), %xmm4
  2910. #ifdef movsd
  2911. xorps %xmm8, %xmm8
  2912. #endif
  /* xmm8 lanes 0-1 = last pair from Y */
  2913. movsd (Y), %xmm8
  /* xmm12 = Y pair with its two values swapped */
  2914. pshufd $0xb1, %xmm8, %xmm12
  2915. mulps %xmm4, %xmm8
  2916. addps %xmm8, %xmm0
  2917. mulps %xmm4, %xmm12
  2918. addps %xmm12, %xmm1
  2919. ALIGN_3
  /*
   * .L98 - shared horizontal reduction, reached from every variant above.
   * In:  xmm0/xmm2 = partial sums of straight products,
   *      xmm1/xmm3 = partial sums of cross products.
   * Out: lanes 0 and 1 of xmm0 and xmm1 hold the folded sums; lane 0 of
   *      xmm2 / xmm3 holds a copy of lane 1 of xmm0 / xmm1 for .L999.
   */
  2920. .L98:
  /* merge the secondary accumulators into the primary ones */
  2921. addps %xmm2, %xmm0
  2922. addps %xmm3, %xmm1
  /* fold the upper 64 bits onto the lower: lane0 += lane2, lane1 += lane3 */
  2923. movhlps %xmm0, %xmm2
  2924. movhlps %xmm1, %xmm3
  2925. addps %xmm2, %xmm0
  2926. addps %xmm3, %xmm1
  /* bring lane 1 down to lane 0 of a scratch register */
  2927. pshufd $1, %xmm0, %xmm2
  2928. pshufd $1, %xmm1, %xmm3
  2929. ALIGN_3
  /*
   * .L999 - combine the two lane sums into the final pair and return.
   * Without CONJ: xmm0 = lane0 - lane1 of the straight products,
   *               xmm1 = lane0 + lane1 of the cross products.
   * With CONJ the two signs are exchanged.
   * (For interleaved re/im input these are presumably the real and imaginary
   * parts of the unconjugated / conjugated dot product - confirm with the
   * routine header above this chunk.)
   */
  2930. .L999:
  2931. #ifndef CONJ
  2932. subss %xmm2, %xmm0
  2933. addss %xmm3, %xmm1
  2934. #else
  2935. addss %xmm2, %xmm0
  2936. subss %xmm3, %xmm1
  2937. #endif
  /* pack both scalars into the low 64 bits of xmm0: lane 0 = xmm0, lane 1 = xmm1 */
  2938. unpcklps %xmm1, %xmm0
  /* with WINDOWS_ABI defined, the packed pair is also copied to rax */
  2939. #ifdef WINDOWS_ABI
  2940. movq %xmm0, %rax
  2941. #endif
  /* RESTOREREGISTERS and EPILOGUE are macros defined outside this chunk */
  2942. RESTOREREGISTERS
  2943. ret
  2944. ALIGN_3
  2945. EPILOGUE
  2945. EPILOGUE