You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LT_4x4_penryn.S 55 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preprocessor setup for this kernel: stack-frame layout, names for the */
  /* incoming arguments and local scratch slots, per-CPU prefetch tuning,  */
  /* and register aliases used by the code that follows.                   */
  /* ASSEMBLER is defined before common.h is included; presumably this     */
  /* selects the assembly-side view of that header (TODO confirm).         */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK: 16 bytes, matching the four 4-byte pushl instructions          */
  /* (ebp, edi, esi, ebx) in the prologue.                                 */
  40. #define STACK 16
  /* ARGS: 16 bytes, the amount the prologue subtracts from esp; the four  */
  /* local slots J, KK, KKK and AORIG defined below live in this area.     */
  41. #define ARGS 16
  /* Incoming arguments. Each name expands to an esp-relative memory       */
  /* operand above the saved registers and the local area, so every use is */
  /* a memory access and is only valid while esp stays where the prologue  */
  /* left it. Offsets start at 4; the slot at 0 is presumably the return   */
  /* address (32-bit stack-argument convention, TODO confirm).             */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 20 + STACK + ARGS(%esp)
  /* ARG_B and ARG_LDC are only the stack copies; the prologue loads them  */
  /* into the registers aliased as B and LDC further down.                 */
  47. #define ARG_B 24 + STACK + ARGS(%esp)
  48. #define C 28 + STACK + ARGS(%esp)
  49. #define ARG_LDC 32 + STACK + ARGS(%esp)
  50. #define OFFSET 36 + STACK + ARGS(%esp)
  /* Local scratch slots, located just above the four saved registers.     */
  /* J receives N >> 2 before the column-block loop. KK is initialised     */
  /* from OFFSET and adjusted per variant (LN/LT/RN/RT). AORIG keeps a     */
  /* copy of the A pointer for the LN/RT paths. KKK is not referenced in   */
  /* the portion of the file reviewed here.                                */
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  /* Per-CPU prefetch tuning. PREFETCH is the instruction to emit and      */
  /* PREFETCHSIZE is a look-ahead distance in elements: the loops use it   */
  /* as (PREFETCHSIZE + n) * SIZE(AA).                                     */
  /* NOTE(review): no fallback definition is visible in this file, so a    */
  /* target outside the lists below relies on PREFETCH being defined       */
  /* elsewhere (e.g. common.h) -- confirm.                                 */
  55. #if defined(PENRYN) || defined(DUNNINGTON)
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHSIZE (8 * 21 + 4)
  58. #endif
  /* Same values as the PENRYN/DUNNINGTON block above.                     */
  59. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
  60. #define PREFETCH prefetcht0
  61. #define PREFETCHSIZE (8 * 21 + 4)
  62. #endif
  /* ATOM and NANO use shorter prefetch distances.                         */
  63. #ifdef ATOM
  64. #define PREFETCH prefetcht0
  65. #define PREFETCHSIZE (8 * 8 + 4)
  66. #endif
  67. #ifdef NANO
  68. #define PREFETCH prefetcht0
  69. #define PREFETCHSIZE (16 * 2)
  70. #endif
  /* Register aliases. B holds the pointer loaded from ARG_B; AA and BB    */
  /* are the working pointers the inner loops advance through the A and B  */
  /* data; LDC holds ARG_LDC scaled by SIZE (a byte stride); CO1 is the    */
  /* current output pointer loaded from C.                                 */
  71. #define B %edi
  72. #define AA %edx
  73. #define BB %ecx
  74. #define LDC %ebp
  75. #define CO1 %esi
  76. PROLOGUE
  77. subl $ARGS, %esp
  78. pushl %ebp
  79. pushl %edi
  80. pushl %esi
  81. pushl %ebx
  82. PROFCODE
  83. movl ARG_B, B
  84. movl ARG_LDC, LDC
  85. movl OFFSET, %eax
  86. #ifdef RN
  87. negl %eax
  88. #endif
  89. movl %eax, KK
  90. leal (, LDC, SIZE), LDC
  91. subl $-32 * SIZE, A
  92. subl $-32 * SIZE, B
  93. #ifdef LN
  94. movl M, %eax
  95. leal (, %eax, SIZE), %eax
  96. addl %eax, C
  97. imull K, %eax
  98. addl %eax, A
  99. #endif
  100. #ifdef RT
  101. movl N, %eax
  102. leal (, %eax, SIZE), %eax
  103. imull K, %eax
  104. addl %eax, B
  105. movl N, %eax
  106. imull LDC, %eax
  107. addl %eax, C
  108. #endif
  109. #ifdef RT
  110. movl N, %eax
  111. subl OFFSET, %eax
  112. movl %eax, KK
  113. #endif
  114. movl N, %eax
  115. sarl $2, %eax
  116. movl %eax, J
  117. jle .L40
  118. .L10:
  119. #if defined(LT) || defined(RN)
  120. movl A, AA
  121. #else
  122. movl A, %eax
  123. movl %eax, AORIG
  124. #endif
  125. #ifdef RT
  126. movl K, %eax
  127. sall $2 + BASE_SHIFT, %eax
  128. subl %eax, B
  129. #endif
  130. leal (, LDC, 4), %eax
  131. #ifdef RT
  132. subl %eax, C
  133. #endif
  134. movl C, CO1
  135. #ifndef RT
  136. addl %eax, C
  137. #endif
  138. #ifdef LN
  139. movl OFFSET, %eax
  140. addl M, %eax
  141. movl %eax, KK
  142. #endif
  143. #ifdef LT
  144. movl OFFSET, %eax
  145. movl %eax, KK
  146. #endif
  147. movl M, %ebx
  148. sarl $2, %ebx # i = (m >> 2)
  149. jle .L20
  150. ALIGN_4
  151. .L11:
  152. #ifdef LN
  153. movl K, %eax
  154. sall $2 + BASE_SHIFT, %eax
  155. subl %eax, AORIG
  156. #endif
  157. #if defined(LN) || defined(RT)
  158. movl KK, %eax
  159. movl AORIG, AA
  160. leal (, %eax, SIZE), %eax
  161. leal (AA, %eax, 4), AA
  162. #endif
  163. movl B, BB
  164. #if defined(LN) || defined(RT)
  165. movl KK, %eax
  166. sall $2 + BASE_SHIFT, %eax
  167. addl %eax, BB
  168. #endif
  169. leal (CO1, LDC, 2), %eax
  170. movaps -32 * SIZE(AA), %xmm0
  171. pxor %xmm2, %xmm2
  172. movaps -32 * SIZE(BB), %xmm1
  173. pxor %xmm3, %xmm3
  174. #ifdef LN
  175. pxor %xmm4, %xmm4
  176. prefetcht0 -4 * SIZE(CO1)
  177. pxor %xmm5, %xmm5
  178. prefetcht0 -4 * SIZE(CO1, LDC)
  179. pxor %xmm6, %xmm6
  180. prefetcht0 -4 * SIZE(%eax)
  181. pxor %xmm7, %xmm7
  182. prefetcht0 -4 * SIZE(%eax, LDC)
  183. #else
  184. pxor %xmm4, %xmm4
  185. prefetcht0 3 * SIZE(CO1)
  186. pxor %xmm5, %xmm5
  187. prefetcht0 3 * SIZE(CO1, LDC)
  188. pxor %xmm6, %xmm6
  189. prefetcht0 3 * SIZE(%eax)
  190. pxor %xmm7, %xmm7
  191. prefetcht0 3 * SIZE(%eax, LDC)
  192. #endif
  193. #if defined(LT) || defined(RN)
  194. movl KK, %eax
  195. #else
  196. movl K, %eax
  197. subl KK, %eax
  198. #endif
  199. sarl $3, %eax
  200. je .L15
  201. ALIGN_4
  202. .L12:
  203. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  204. addps %xmm2, %xmm7
  205. pshufd $0x93, %xmm1, %xmm2
  206. mulps %xmm0, %xmm1
  207. addps %xmm3, %xmm6
  208. pshufd $0x93, %xmm2, %xmm3
  209. mulps %xmm0, %xmm2
  210. addps %xmm2, %xmm5
  211. pshufd $0x93, %xmm3, %xmm2
  212. mulps %xmm0, %xmm3
  213. addps %xmm1, %xmm4
  214. movaps -28 * SIZE(BB), %xmm1
  215. mulps %xmm0, %xmm2
  216. movaps -28 * SIZE(AA), %xmm0
  217. addps %xmm2, %xmm7
  218. pshufd $0x93, %xmm1, %xmm2
  219. mulps %xmm0, %xmm1
  220. addps %xmm3, %xmm6
  221. pshufd $0x93, %xmm2, %xmm3
  222. mulps %xmm0, %xmm2
  223. addps %xmm2, %xmm5
  224. pshufd $0x93, %xmm3, %xmm2
  225. mulps %xmm0, %xmm3
  226. addps %xmm1, %xmm4
  227. movaps -24 * SIZE(BB), %xmm1
  228. mulps %xmm0, %xmm2
  229. movaps -24 * SIZE(AA), %xmm0
  230. addps %xmm2, %xmm7
  231. pshufd $0x93, %xmm1, %xmm2
  232. mulps %xmm0, %xmm1
  233. addps %xmm3, %xmm6
  234. pshufd $0x93, %xmm2, %xmm3
  235. mulps %xmm0, %xmm2
  236. addps %xmm2, %xmm5
  237. pshufd $0x93, %xmm3, %xmm2
  238. mulps %xmm0, %xmm3
  239. addps %xmm1, %xmm4
  240. movaps -20 * SIZE(BB), %xmm1
  241. mulps %xmm0, %xmm2
  242. movaps -20 * SIZE(AA), %xmm0
  243. addps %xmm2, %xmm7
  244. pshufd $0x93, %xmm1, %xmm2
  245. mulps %xmm0, %xmm1
  246. addps %xmm3, %xmm6
  247. pshufd $0x93, %xmm2, %xmm3
  248. mulps %xmm0, %xmm2
  249. addps %xmm2, %xmm5
  250. pshufd $0x93, %xmm3, %xmm2
  251. mulps %xmm0, %xmm3
  252. addps %xmm1, %xmm4
  253. movaps -16 * SIZE(BB), %xmm1
  254. mulps %xmm0, %xmm2
  255. movaps -16 * SIZE(AA), %xmm0
  256. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  257. addps %xmm2, %xmm7
  258. pshufd $0x93, %xmm1, %xmm2
  259. mulps %xmm0, %xmm1
  260. addps %xmm3, %xmm6
  261. pshufd $0x93, %xmm2, %xmm3
  262. mulps %xmm0, %xmm2
  263. addps %xmm2, %xmm5
  264. pshufd $0x93, %xmm3, %xmm2
  265. mulps %xmm0, %xmm3
  266. addps %xmm1, %xmm4
  267. movaps -12 * SIZE(BB), %xmm1
  268. mulps %xmm0, %xmm2
  269. movaps -12 * SIZE(AA), %xmm0
  270. addps %xmm2, %xmm7
  271. pshufd $0x93, %xmm1, %xmm2
  272. mulps %xmm0, %xmm1
  273. addps %xmm3, %xmm6
  274. pshufd $0x93, %xmm2, %xmm3
  275. mulps %xmm0, %xmm2
  276. addps %xmm2, %xmm5
  277. pshufd $0x93, %xmm3, %xmm2
  278. mulps %xmm0, %xmm3
  279. addps %xmm1, %xmm4
  280. movaps -8 * SIZE(BB), %xmm1
  281. mulps %xmm0, %xmm2
  282. movaps -8 * SIZE(AA), %xmm0
  283. addps %xmm2, %xmm7
  284. pshufd $0x93, %xmm1, %xmm2
  285. mulps %xmm0, %xmm1
  286. addps %xmm3, %xmm6
  287. pshufd $0x93, %xmm2, %xmm3
  288. mulps %xmm0, %xmm2
  289. addps %xmm2, %xmm5
  290. pshufd $0x93, %xmm3, %xmm2
  291. mulps %xmm0, %xmm3
  292. addps %xmm1, %xmm4
  293. movaps -4 * SIZE(BB), %xmm1
  294. mulps %xmm0, %xmm2
  295. movaps -4 * SIZE(AA), %xmm0
  296. addps %xmm2, %xmm7
  297. subl $-32 * SIZE, BB
  298. pshufd $0x93, %xmm1, %xmm2
  299. mulps %xmm0, %xmm1
  300. addps %xmm3, %xmm6
  301. pshufd $0x93, %xmm2, %xmm3
  302. mulps %xmm0, %xmm2
  303. addps %xmm2, %xmm5
  304. subl $-32 * SIZE, AA
  305. pshufd $0x93, %xmm3, %xmm2
  306. mulps %xmm0, %xmm3
  307. addps %xmm1, %xmm4
  308. movaps -32 * SIZE(BB), %xmm1
  309. mulps %xmm0, %xmm2
  310. movaps -32 * SIZE(AA), %xmm0
  311. subl $1, %eax
  312. jne .L12
  313. ALIGN_4
  314. .L15:
  315. #if defined(LT) || defined(RN)
  316. movl KK, %eax
  317. #else
  318. movl K, %eax
  319. subl KK, %eax
  320. #endif
  321. andl $7, %eax # if (k & 1)
  322. BRANCH
  323. je .L18
  324. ALIGN_4
  325. .L16:
  326. addps %xmm2, %xmm7
  327. pshufd $0x93, %xmm1, %xmm2
  328. mulps %xmm0, %xmm1
  329. addps %xmm3, %xmm6
  330. pshufd $0x93, %xmm2, %xmm3
  331. mulps %xmm0, %xmm2
  332. addps %xmm2, %xmm5
  333. pshufd $0x93, %xmm3, %xmm2
  334. mulps %xmm0, %xmm3
  335. addps %xmm1, %xmm4
  336. movaps -28 * SIZE(BB), %xmm1
  337. mulps %xmm0, %xmm2
  338. movaps -28 * SIZE(AA), %xmm0
  339. addl $4 * SIZE, AA
  340. addl $4 * SIZE, BB
  341. decl %eax
  342. jg .L16
  343. ALIGN_4
  344. .L18:
  345. #if defined(LN) || defined(RT)
  346. movl KK, %eax
  347. #ifdef LN
  348. subl $4, %eax
  349. #else
  350. subl $4, %eax
  351. #endif
  352. movl AORIG, AA
  353. leal (, %eax, SIZE), %eax
  354. leal (AA, %eax, 4), AA
  355. leal (B, %eax, 4), BB
  356. #endif
  357. addps %xmm3, %xmm6
  358. addps %xmm2, %xmm7
  359. #if defined(LN) || defined(LT)
  360. movaps %xmm4, %xmm0
  361. unpcklps %xmm7, %xmm0
  362. unpckhps %xmm7, %xmm4
  363. movaps %xmm6, %xmm2
  364. unpcklps %xmm5, %xmm2
  365. unpckhps %xmm5, %xmm6
  366. movaps %xmm0, %xmm1
  367. movlhps %xmm2, %xmm0
  368. movhlps %xmm2, %xmm1
  369. movaps %xmm6, %xmm7
  370. movlhps %xmm4, %xmm6
  371. movhlps %xmm4, %xmm7
  372. pshufd $0x39, %xmm1, %xmm2
  373. pshufd $0x39, %xmm7, %xmm4
  374. movaps -32 * SIZE(BB), %xmm1
  375. movaps -28 * SIZE(BB), %xmm3
  376. movaps -24 * SIZE(BB), %xmm5
  377. movaps -20 * SIZE(BB), %xmm7
  378. subps %xmm0, %xmm1
  379. subps %xmm2, %xmm3
  380. subps %xmm6, %xmm5
  381. subps %xmm4, %xmm7
  382. #else
  383. pshufd $0x39, %xmm5, %xmm2
  384. pshufd $0x4e, %xmm6, %xmm0
  385. pshufd $0x93, %xmm7, %xmm7
  386. movaps %xmm4, %xmm6
  387. unpcklps %xmm0, %xmm4
  388. unpckhps %xmm0, %xmm6
  389. movaps %xmm2, %xmm1
  390. unpcklps %xmm7, %xmm2
  391. unpckhps %xmm7, %xmm1
  392. movaps %xmm4, %xmm5
  393. unpcklps %xmm2, %xmm4
  394. unpckhps %xmm2, %xmm5
  395. movaps %xmm6, %xmm7
  396. unpcklps %xmm1, %xmm6
  397. unpckhps %xmm1, %xmm7
  398. pshufd $0x93, %xmm5, %xmm5
  399. pshufd $0x4e, %xmm6, %xmm6
  400. pshufd $0x39, %xmm7, %xmm7
  401. movaps -32 * SIZE(AA), %xmm0
  402. movaps -28 * SIZE(AA), %xmm1
  403. movaps -24 * SIZE(AA), %xmm2
  404. movaps -20 * SIZE(AA), %xmm3
  405. subps %xmm4, %xmm0
  406. subps %xmm5, %xmm1
  407. subps %xmm6, %xmm2
  408. subps %xmm7, %xmm3
  409. #endif
  410. #ifdef LN
  411. movaps -20 * SIZE(AA), %xmm4
  412. pshufd $0xff, %xmm4, %xmm6
  413. mulps %xmm6, %xmm7
  414. pshufd $0xaa, %xmm4, %xmm6
  415. mulps %xmm7, %xmm6
  416. subps %xmm6, %xmm5
  417. pshufd $0x55, %xmm4, %xmm6
  418. mulps %xmm7, %xmm6
  419. subps %xmm6, %xmm3
  420. pshufd $0x00, %xmm4, %xmm6
  421. mulps %xmm7, %xmm6
  422. subps %xmm6, %xmm1
  423. movaps -24 * SIZE(AA), %xmm4
  424. pshufd $0xaa, %xmm4, %xmm6
  425. mulps %xmm6, %xmm5
  426. pshufd $0x55, %xmm4, %xmm6
  427. mulps %xmm5, %xmm6
  428. subps %xmm6, %xmm3
  429. pshufd $0x00, %xmm4, %xmm6
  430. mulps %xmm5, %xmm6
  431. subps %xmm6, %xmm1
  432. movaps -28 * SIZE(AA), %xmm4
  433. pshufd $0x55, %xmm4, %xmm6
  434. mulps %xmm6, %xmm3
  435. pshufd $0x00, %xmm4, %xmm6
  436. mulps %xmm3, %xmm6
  437. subps %xmm6, %xmm1
  438. movaps -32 * SIZE(AA), %xmm4
  439. pshufd $0x00, %xmm4, %xmm6
  440. mulps %xmm6, %xmm1
  441. #endif
  442. #ifdef LT
  443. movaps -32 * SIZE(AA), %xmm4
  444. pshufd $0x00, %xmm4, %xmm6
  445. mulps %xmm6, %xmm1
  446. pshufd $0x55, %xmm4, %xmm6
  447. mulps %xmm1, %xmm6
  448. subps %xmm6, %xmm3
  449. pshufd $0xaa, %xmm4, %xmm6
  450. mulps %xmm1, %xmm6
  451. subps %xmm6, %xmm5
  452. pshufd $0xff, %xmm4, %xmm6
  453. mulps %xmm1, %xmm6
  454. subps %xmm6, %xmm7
  455. movaps -28 * SIZE(AA), %xmm4
  456. pshufd $0x55, %xmm4, %xmm6
  457. mulps %xmm6, %xmm3
  458. pshufd $0xaa, %xmm4, %xmm6
  459. mulps %xmm3, %xmm6
  460. subps %xmm6, %xmm5
  461. pshufd $0xff, %xmm4, %xmm6
  462. mulps %xmm3, %xmm6
  463. subps %xmm6, %xmm7
  464. movaps -24 * SIZE(AA), %xmm4
  465. pshufd $0xaa, %xmm4, %xmm6
  466. mulps %xmm6, %xmm5
  467. pshufd $0xff, %xmm4, %xmm6
  468. mulps %xmm5, %xmm6
  469. subps %xmm6, %xmm7
  470. movaps -20 * SIZE(AA), %xmm4
  471. pshufd $0xff, %xmm4, %xmm6
  472. mulps %xmm6, %xmm7
  473. #endif
  474. #ifdef RN
  475. movaps -32 * SIZE(BB), %xmm6
  476. pshufd $0x00, %xmm6, %xmm7
  477. mulps %xmm7, %xmm0
  478. pshufd $0x55, %xmm6, %xmm7
  479. mulps %xmm0, %xmm7
  480. subps %xmm7, %xmm1
  481. pshufd $0xaa, %xmm6, %xmm7
  482. mulps %xmm0, %xmm7
  483. subps %xmm7, %xmm2
  484. pshufd $0xff, %xmm6, %xmm7
  485. mulps %xmm0, %xmm7
  486. subps %xmm7, %xmm3
  487. movaps -28 * SIZE(BB), %xmm6
  488. pshufd $0x55, %xmm6, %xmm7
  489. mulps %xmm7, %xmm1
  490. pshufd $0xaa, %xmm6, %xmm7
  491. mulps %xmm1, %xmm7
  492. subps %xmm7, %xmm2
  493. pshufd $0xff, %xmm6, %xmm7
  494. mulps %xmm1, %xmm7
  495. subps %xmm7, %xmm3
  496. movaps -24 * SIZE(BB), %xmm6
  497. pshufd $0xaa, %xmm6, %xmm7
  498. mulps %xmm7, %xmm2
  499. pshufd $0xff, %xmm6, %xmm7
  500. mulps %xmm2, %xmm7
  501. subps %xmm7, %xmm3
  502. movaps -20 * SIZE(BB), %xmm6
  503. pshufd $0xff, %xmm6, %xmm7
  504. mulps %xmm7, %xmm3
  505. #endif
  506. #ifdef RT
  507. movaps -20 * SIZE(BB), %xmm6
  508. pshufd $0xff, %xmm6, %xmm7
  509. mulps %xmm7, %xmm3
  510. pshufd $0xaa, %xmm6, %xmm7
  511. mulps %xmm3, %xmm7
  512. subps %xmm7, %xmm2
  513. pshufd $0x55, %xmm6, %xmm7
  514. mulps %xmm3, %xmm7
  515. subps %xmm7, %xmm1
  516. pshufd $0x00, %xmm6, %xmm7
  517. mulps %xmm3, %xmm7
  518. subps %xmm7, %xmm0
  519. movaps -24 * SIZE(BB), %xmm6
  520. pshufd $0xaa, %xmm6, %xmm7
  521. mulps %xmm7, %xmm2
  522. pshufd $0x55, %xmm6, %xmm7
  523. mulps %xmm2, %xmm7
  524. subps %xmm7, %xmm1
  525. pshufd $0x00, %xmm6, %xmm7
  526. mulps %xmm2, %xmm7
  527. subps %xmm7, %xmm0
  528. movaps -28 * SIZE(BB), %xmm6
  529. pshufd $0x55, %xmm6, %xmm7
  530. mulps %xmm7, %xmm1
  531. pshufd $0x00, %xmm6, %xmm7
  532. mulps %xmm1, %xmm7
  533. subps %xmm7, %xmm0
  534. movaps -32 * SIZE(BB), %xmm6
  535. pshufd $0x00, %xmm6, %xmm7
  536. mulps %xmm7, %xmm0
  537. #endif
  538. #if defined(LN) || defined(LT)
  539. movaps %xmm1, -32 * SIZE(BB)
  540. movaps %xmm3, -28 * SIZE(BB)
  541. movaps %xmm5, -24 * SIZE(BB)
  542. movaps %xmm7, -20 * SIZE(BB)
  543. #else
  544. movaps %xmm0, -32 * SIZE(AA)
  545. movaps %xmm1, -28 * SIZE(AA)
  546. movaps %xmm2, -24 * SIZE(AA)
  547. movaps %xmm3, -20 * SIZE(AA)
  548. #endif
  549. #ifdef LN
  550. subl $4 * SIZE, CO1
  551. #endif
  552. leal (LDC, LDC, 2), %eax
  553. #if defined(LN) || defined(LT)
  554. movaps %xmm1, %xmm0
  555. unpcklps %xmm5, %xmm1
  556. unpckhps %xmm5, %xmm0
  557. movaps %xmm3, %xmm4
  558. unpcklps %xmm7, %xmm3
  559. unpckhps %xmm7, %xmm4
  560. movaps %xmm1, %xmm2
  561. unpcklps %xmm3, %xmm1
  562. unpckhps %xmm3, %xmm2
  563. movaps %xmm0, %xmm6
  564. unpcklps %xmm4, %xmm0
  565. unpckhps %xmm4, %xmm6
  566. movlps %xmm1, 0 * SIZE(CO1)
  567. movhps %xmm1, 2 * SIZE(CO1)
  568. movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
  569. movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
  570. movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
  571. movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
  572. movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
  573. movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
  574. #else
  575. movlps %xmm0, 0 * SIZE(CO1)
  576. movhps %xmm0, 2 * SIZE(CO1)
  577. movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
  578. movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
  579. movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
  580. movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
  581. movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
  582. movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
  583. #endif
  584. #ifndef LN
  585. addl $4 * SIZE, CO1
  586. #endif
  587. #if defined(LT) || defined(RN)
  588. movl K, %eax
  589. subl KK, %eax
  590. leal (,%eax, SIZE), %eax
  591. leal (AA, %eax, 4), AA
  592. leal (BB, %eax, 4), BB
  593. #endif
  594. #ifdef LN
  595. subl $4, KK
  596. #endif
  597. #ifdef LT
  598. addl $4, KK
  599. #endif
  600. #ifdef RT
  601. movl K, %eax
  602. sall $2 + BASE_SHIFT, %eax
  603. addl %eax, AORIG
  604. #endif
  605. decl %ebx # i --
  606. jg .L11
  607. ALIGN_4
  608. .L20:
  609. testl $2, M
  610. je .L30
  611. #ifdef LN
  612. movl K, %eax
  613. sall $1 + BASE_SHIFT, %eax
  614. subl %eax, AORIG
  615. #endif
  616. #if defined(LN) || defined(RT)
  617. movl KK, %eax
  618. movl AORIG, AA
  619. leal (, %eax, SIZE), %eax
  620. leal (AA, %eax, 2), AA
  621. #endif
  622. movl B, BB
  623. #if defined(LN) || defined(RT)
  624. movl KK, %eax
  625. sall $2 + BASE_SHIFT, %eax
  626. addl %eax, BB
  627. #endif
  628. pxor %xmm4, %xmm4
  629. movaps -32 * SIZE(AA), %xmm0
  630. pxor %xmm5, %xmm5
  631. movaps -32 * SIZE(BB), %xmm1
  632. pxor %xmm6, %xmm6
  633. pxor %xmm7, %xmm7
  634. #if defined(LT) || defined(RN)
  635. movl KK, %eax
  636. #else
  637. movl K, %eax
  638. subl KK, %eax
  639. #endif
  640. sarl $3, %eax
  641. je .L25
  642. ALIGN_4
  /*
   * 2-row x 4-column tile: main multiply-accumulate loop, unrolled over
   * eight k-steps per pass.  %eax holds the pass count.
   *
   * %xmm0 carries four A values (two k-steps x two rows) and %xmm1 four
   * B values (one k-step x four columns).  Even k-steps accumulate into
   * %xmm4/%xmm6, odd k-steps into %xmm5/%xmm7; the two pairs are summed
   * after the loop.  Lane layout of %xmm4 (and %xmm5) is
   * (a0*b0, a1*b0, a0*b1, a1*b1); %xmm6 (and %xmm7) holds the same for
   * columns b2/b3.  Each pass consumes 16 elements of AA and 32 of BB.
   */
  643. .L22:
  644. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  645. pshufd $0x44, %xmm0, %xmm2 # xmm2 = (a0, a1, a0, a1): A pair of the even k-step
  646. pshufd $0x50, %xmm1, %xmm3 # xmm3 = (b0, b0, b1, b1)
  647. mulps %xmm2, %xmm3
  648. addps %xmm3, %xmm4
  649. pshufd $0xfa, %xmm1, %xmm3 # xmm3 = (b2, b2, b3, b3)
  650. movaps -28 * SIZE(BB), %xmm1 # B values for the next k-step
  651. mulps %xmm2, %xmm3
  652. addps %xmm3, %xmm6
  653. pshufd $0xee, %xmm0, %xmm2 # xmm2 = (a2, a3, a2, a3): A pair of the odd k-step
  654. movaps -28 * SIZE(AA), %xmm0 # A values for the next two k-steps
  655. pshufd $0x50, %xmm1, %xmm3
  656. mulps %xmm2, %xmm3
  657. addps %xmm3, %xmm5
  658. pshufd $0xfa, %xmm1, %xmm3
  659. movaps -24 * SIZE(BB), %xmm1
  660. mulps %xmm2, %xmm3
  661. addps %xmm3, %xmm7
  662. pshufd $0x44, %xmm0, %xmm2
  663. pshufd $0x50, %xmm1, %xmm3
  664. mulps %xmm2, %xmm3
  665. addps %xmm3, %xmm4
  666. pshufd $0xfa, %xmm1, %xmm3
  667. movaps -20 * SIZE(BB), %xmm1
  668. mulps %xmm2, %xmm3
  669. addps %xmm3, %xmm6
  670. pshufd $0xee, %xmm0, %xmm2
  671. movaps -24 * SIZE(AA), %xmm0
  672. pshufd $0x50, %xmm1, %xmm3
  673. mulps %xmm2, %xmm3
  674. addps %xmm3, %xmm5
  675. pshufd $0xfa, %xmm1, %xmm3
  676. movaps -16 * SIZE(BB), %xmm1
  677. mulps %xmm2, %xmm3
  678. addps %xmm3, %xmm7
  679. pshufd $0x44, %xmm0, %xmm2
  680. pshufd $0x50, %xmm1, %xmm3
  681. mulps %xmm2, %xmm3
  682. addps %xmm3, %xmm4
  683. pshufd $0xfa, %xmm1, %xmm3
  684. movaps -12 * SIZE(BB), %xmm1
  685. mulps %xmm2, %xmm3
  686. addps %xmm3, %xmm6
  687. pshufd $0xee, %xmm0, %xmm2
  688. movaps -20 * SIZE(AA), %xmm0
  689. pshufd $0x50, %xmm1, %xmm3
  690. mulps %xmm2, %xmm3
  691. addps %xmm3, %xmm5
  692. pshufd $0xfa, %xmm1, %xmm3
  693. movaps -8 * SIZE(BB), %xmm1
  694. mulps %xmm2, %xmm3
  695. addps %xmm3, %xmm7
  696. pshufd $0x44, %xmm0, %xmm2
  697. pshufd $0x50, %xmm1, %xmm3
  698. mulps %xmm2, %xmm3
  699. addps %xmm3, %xmm4
  700. pshufd $0xfa, %xmm1, %xmm3
  701. movaps -4 * SIZE(BB), %xmm1
  702. mulps %xmm2, %xmm3
  703. addps %xmm3, %xmm6
  704. pshufd $0xee, %xmm0, %xmm2
  705. movaps -16 * SIZE(AA), %xmm0 # preload A for the first k-steps of the next pass
  706. pshufd $0x50, %xmm1, %xmm3
  707. mulps %xmm2, %xmm3
  708. addps %xmm3, %xmm5
  709. pshufd $0xfa, %xmm1, %xmm3
  710. movaps 0 * SIZE(BB), %xmm1 # preload B for the first k-step of the next pass
  711. mulps %xmm2, %xmm3
  712. addps %xmm3, %xmm7
  713. subl $-16 * SIZE, AA # AA += 16 elements (8 k-steps x 2 rows)
  714. subl $-32 * SIZE, BB # BB += 32 elements (8 k-steps x 4 columns)
  715. subl $1, %eax
  716. jne .L22
  717. ALIGN_4
  /*
   * 2x4 tile: remainder loop for the k-steps left over by the 8x unrolled
   * loop.  The k count is KK for LT/RN and K - KK for LN/RT; its low three
   * bits give the number of single k-step iterations to run.
   */
  718. .L25:
  719. #if defined(LT) || defined(RN)
  720. movl KK, %eax
  721. #else
  722. movl K, %eax
  723. subl KK, %eax
  724. #endif
  725. andl $7, %eax # eax = k & 7 (k-steps not covered by the 8x unrolled loop)
  726. BRANCH
  727. je .L28
  728. ALIGN_4
  729. .L26:
  730. pshufd $0x44, %xmm0, %xmm2 # xmm2 = (a0, a1, a0, a1)
  731. movsd -30 * SIZE(AA), %xmm0 # next two A values (upper half of xmm0 cleared)
  732. pshufd $0x50, %xmm1, %xmm3
  733. mulps %xmm2, %xmm3
  734. addps %xmm3, %xmm4
  735. pshufd $0xfa, %xmm1, %xmm3
  736. movaps -28 * SIZE(BB), %xmm1 # next four B values
  737. mulps %xmm2, %xmm3
  738. addps %xmm3, %xmm6
  739. addl $2 * SIZE, AA # one k-step: 2 A elements
  740. addl $4 * SIZE, BB # one k-step: 4 B elements
  741. decl %eax
  742. jg .L26
  743. ALIGN_4
  /*
   * 2x4 tile: finish the accumulation, subtract it from the packed
   * right-hand side, run the small triangular solve for the selected
   * variant, write the result back to the packed buffer and to C, then
   * advance the pointers and KK for the next tile.
   *
   * LN/LT work on rows:    %xmm1 = row 0, %xmm3 = row 1 (4 columns each).
   * RN/RT work on columns: %xmm0..%xmm3 = columns 0..3 (2 rows each, low half).
   *
   * NOTE(review): the diagonal terms are applied with mulps, never divps,
   * so the diagonal is presumably stored pre-inverted by the packing code;
   * confirm against the packing routines.
   */
  744. .L28:
  745. #if defined(LN) || defined(RT)
  746. movl KK, %eax
  747. #ifdef LN
  748. subl $2, %eax
  749. #else
  750. subl $4, %eax
  751. #endif
  752. movl AORIG, AA
  753. leal (, %eax, SIZE), %eax
  754. leal (AA, %eax, 2), AA # AA = AORIG + eax * 2 elements
  755. leal (B, %eax, 4), BB # BB = B + eax * 4 elements
  756. #endif
  757. addps %xmm5, %xmm4 # fold odd-k partial sums into the even-k ones
  758. addps %xmm7, %xmm6
  759. movhlps %xmm4, %xmm5 # split out column 1 (high half of xmm4)
  760. movhlps %xmm6, %xmm7 # split out column 3 (high half of xmm6)
  761. #if defined(LN) || defined(LT)
  762. unpcklps %xmm6, %xmm4 # transpose column pairs into row vectors
  763. unpcklps %xmm7, %xmm5
  764. movaps %xmm4, %xmm6
  765. unpcklps %xmm5, %xmm4 # xmm4 = row 0 of the accumulated product
  766. unpckhps %xmm5, %xmm6 # xmm6 = row 1 of the accumulated product
  767. movaps -32 * SIZE(BB), %xmm1
  768. movaps -28 * SIZE(BB), %xmm3
  769. subps %xmm4, %xmm1 # right-hand side minus accumulated product
  770. subps %xmm6, %xmm3
  771. #else
  772. movsd -32 * SIZE(AA), %xmm0
  773. movsd -30 * SIZE(AA), %xmm1
  774. movsd -28 * SIZE(AA), %xmm2
  775. movsd -26 * SIZE(AA), %xmm3
  776. subps %xmm4, %xmm0 # right-hand side minus accumulated product, per column
  777. subps %xmm5, %xmm1
  778. subps %xmm6, %xmm2
  779. subps %xmm7, %xmm3
  780. #endif
  781. #ifdef LN
  782. movaps -32 * SIZE(AA), %xmm4 # 2x2 block of A; lanes 3, 2 and 0 are used
  783. pshufd $0xff, %xmm4, %xmm6
  784. mulps %xmm6, %xmm3 # row 1 scaled by lane 3
  785. pshufd $0xaa, %xmm4, %xmm6
  786. mulps %xmm3, %xmm6
  787. subps %xmm6, %xmm1 # eliminate row 1 from row 0
  788. pshufd $0x00, %xmm4, %xmm6
  789. mulps %xmm6, %xmm1 # row 0 scaled by lane 0
  790. #endif
  791. #ifdef LT
  792. movaps -32 * SIZE(AA), %xmm4 # 2x2 block of A; lanes 0, 1 and 3 are used
  793. pshufd $0x00, %xmm4, %xmm6
  794. mulps %xmm6, %xmm1 # row 0 scaled by lane 0
  795. pshufd $0x55, %xmm4, %xmm6
  796. mulps %xmm1, %xmm6
  797. subps %xmm6, %xmm3 # eliminate row 0 from row 1
  798. pshufd $0xff, %xmm4, %xmm6
  799. mulps %xmm6, %xmm3 # row 1 scaled by lane 3
  800. #endif
  801. #ifdef RN
  802. movaps -32 * SIZE(BB), %xmm6 # first 4 elements of the 4x4 B block
  803. pshufd $0x00, %xmm6, %xmm7
  804. mulps %xmm7, %xmm0 # solve column 0, then eliminate it from columns 1..3
  805. pshufd $0x55, %xmm6, %xmm7
  806. mulps %xmm0, %xmm7
  807. subps %xmm7, %xmm1
  808. pshufd $0xaa, %xmm6, %xmm7
  809. mulps %xmm0, %xmm7
  810. subps %xmm7, %xmm2
  811. pshufd $0xff, %xmm6, %xmm7
  812. mulps %xmm0, %xmm7
  813. subps %xmm7, %xmm3
  814. movaps -28 * SIZE(BB), %xmm6 # next 4 elements of the B block
  815. pshufd $0x55, %xmm6, %xmm7
  816. mulps %xmm7, %xmm1 # solve column 1, then eliminate it from columns 2..3
  817. pshufd $0xaa, %xmm6, %xmm7
  818. mulps %xmm1, %xmm7
  819. subps %xmm7, %xmm2
  820. pshufd $0xff, %xmm6, %xmm7
  821. mulps %xmm1, %xmm7
  822. subps %xmm7, %xmm3
  823. movaps -24 * SIZE(BB), %xmm6
  824. pshufd $0xaa, %xmm6, %xmm7
  825. mulps %xmm7, %xmm2 # solve column 2, then eliminate it from column 3
  826. pshufd $0xff, %xmm6, %xmm7
  827. mulps %xmm2, %xmm7
  828. subps %xmm7, %xmm3
  829. movaps -20 * SIZE(BB), %xmm6
  830. pshufd $0xff, %xmm6, %xmm7
  831. mulps %xmm7, %xmm3 # solve column 3
  832. #endif
  833. #ifdef RT
  834. movaps -20 * SIZE(BB), %xmm6 # last 4 elements of the 4x4 B block
  835. pshufd $0xff, %xmm6, %xmm7
  836. mulps %xmm7, %xmm3 # solve column 3, then eliminate it from columns 2..0
  837. pshufd $0xaa, %xmm6, %xmm7
  838. mulps %xmm3, %xmm7
  839. subps %xmm7, %xmm2
  840. pshufd $0x55, %xmm6, %xmm7
  841. mulps %xmm3, %xmm7
  842. subps %xmm7, %xmm1
  843. pshufd $0x00, %xmm6, %xmm7
  844. mulps %xmm3, %xmm7
  845. subps %xmm7, %xmm0
  846. movaps -24 * SIZE(BB), %xmm6
  847. pshufd $0xaa, %xmm6, %xmm7
  848. mulps %xmm7, %xmm2 # solve column 2, then eliminate it from columns 1..0
  849. pshufd $0x55, %xmm6, %xmm7
  850. mulps %xmm2, %xmm7
  851. subps %xmm7, %xmm1
  852. pshufd $0x00, %xmm6, %xmm7
  853. mulps %xmm2, %xmm7
  854. subps %xmm7, %xmm0
  855. movaps -28 * SIZE(BB), %xmm6
  856. pshufd $0x55, %xmm6, %xmm7
  857. mulps %xmm7, %xmm1 # solve column 1, then eliminate it from column 0
  858. pshufd $0x00, %xmm6, %xmm7
  859. mulps %xmm1, %xmm7
  860. subps %xmm7, %xmm0
  861. movaps -32 * SIZE(BB), %xmm6
  862. pshufd $0x00, %xmm6, %xmm7
  863. mulps %xmm7, %xmm0 # solve column 0
  864. #endif
  865. #if defined(LN) || defined(LT)
  866. movaps %xmm1, -32 * SIZE(BB) # write solved rows back to the packed B buffer
  867. movaps %xmm3, -28 * SIZE(BB)
  868. #else
  869. movlps %xmm0, -32 * SIZE(AA) # write solved columns back to the packed A buffer
  870. movlps %xmm1, -30 * SIZE(AA)
  871. movlps %xmm2, -28 * SIZE(AA)
  872. movlps %xmm3, -26 * SIZE(AA)
  873. #endif
  874. #ifdef LN
  875. subl $2 * SIZE, CO1 # LN steps CO1 back by 2 elements before the store
  876. #endif
  877. leal (LDC, LDC, 2), %eax # eax = 3 * LDC, offset of the fourth column
  878. #if defined(LN) || defined(LT)
  879. movaps %xmm1, %xmm0 # transpose the two rows back into four column pairs;
  880. unpcklps %xmm5, %xmm1 # only the low two lanes of each result are stored
  881. unpckhps %xmm5, %xmm0
  882. movaps %xmm3, %xmm4
  883. unpcklps %xmm7, %xmm3
  884. unpckhps %xmm7, %xmm4
  885. movaps %xmm1, %xmm2
  886. unpcklps %xmm3, %xmm1 # low half = column 0
  887. unpckhps %xmm3, %xmm2 # low half = column 1
  888. movaps %xmm0, %xmm6
  889. unpcklps %xmm4, %xmm0 # low half = column 2
  890. unpckhps %xmm4, %xmm6 # low half = column 3
  891. movlps %xmm1, 0 * SIZE(CO1)
  892. movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
  893. movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
  894. movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
  895. #else
  896. movlps %xmm0, 0 * SIZE(CO1)
  897. movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
  898. movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
  899. movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
  900. #endif
  901. #ifndef LN
  902. addl $2 * SIZE, CO1 # all other variants step CO1 forward by 2 elements
  903. #endif
  904. #if defined(LT) || defined(RN)
  905. movl K, %eax
  906. subl KK, %eax
  907. leal (,%eax, SIZE), %eax
  908. leal (AA, %eax, 2), AA # skip the remaining K - KK k-steps of this A panel
  909. leal (BB, %eax, 4), BB # and of the B panel
  910. #endif
  911. #ifdef LN
  912. subl $2, KK
  913. #endif
  914. #ifdef LT
  915. addl $2, KK
  916. #endif
  917. #ifdef RT
  918. movl K, %eax
  919. sall $1 + BASE_SHIFT, %eax
  920. addl %eax, AORIG # AORIG += K << (1 + BASE_SHIFT)
  921. #endif
  922. ALIGN_4
  /*
   * 1-row x 4-column tile, taken only when bit 0 of M is set.
   * Positions AA/BB for the selected variant, clears the accumulators,
   * preloads the first A and B values and computes the number of
   * 8x-unrolled passes (k >> 3, with k = KK for LT/RN and K - KK otherwise).
   */
  923. .L30:
  924. testl $1, M
  925. je .L39
  926. #ifdef LN
  927. movl K, %eax
  928. sall $BASE_SHIFT, %eax
  929. subl %eax, AORIG # AORIG -= K << BASE_SHIFT
  930. #endif
  931. #if defined(LN) || defined(RT)
  932. movl KK, %eax
  933. movl AORIG, AA
  934. leal (AA, %eax, SIZE), AA # AA = AORIG + KK elements
  935. #endif
  936. movl B, BB
  937. #if defined(LN) || defined(RT)
  938. movl KK, %eax
  939. sall $2 + BASE_SHIFT, %eax
  940. addl %eax, BB # BB = B + (KK << (2 + BASE_SHIFT))
  941. #endif
  942. pxor %xmm4, %xmm4
  943. movsd -32 * SIZE(AA), %xmm0 # first two A values
  944. pxor %xmm5, %xmm5
  945. movaps -32 * SIZE(BB), %xmm1 # first four B values
  946. pxor %xmm6, %xmm6
  947. pxor %xmm7, %xmm7
  948. #if defined(LT) || defined(RN)
  949. movl KK, %eax
  950. #else
  951. movl K, %eax
  952. subl KK, %eax
  953. #endif
  954. sarl $3, %eax # eax = k >> 3 unrolled passes
  955. je .L35
  956. ALIGN_4
  /*
   * 1x4 tile: main multiply-accumulate loop, unrolled over eight k-steps.
   * %xmm0 holds two consecutive A values (two k-steps), each broadcast in
   * turn with pshufd $0x00 / $0x55 and multiplied by the four B values of
   * that k-step in %xmm1.  Everything accumulates into %xmm4.
   * Each pass consumes 8 elements of AA and 32 of BB.
   */
  957. .L32:
  958. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  959. pshufd $0x00, %xmm0, %xmm2 # broadcast the A value of the even k-step
  960. mulps %xmm2, %xmm1
  961. addps %xmm1, %xmm4
  962. movaps -28 * SIZE(BB), %xmm1
  963. pshufd $0x55, %xmm0, %xmm2 # broadcast the A value of the odd k-step
  964. movsd -30 * SIZE(AA), %xmm0 # next two A values
  965. mulps %xmm2, %xmm1
  966. addps %xmm1, %xmm4
  967. movaps -24 * SIZE(BB), %xmm1
  968. pshufd $0x00, %xmm0, %xmm2
  969. mulps %xmm2, %xmm1
  970. addps %xmm1, %xmm4
  971. movaps -20 * SIZE(BB), %xmm1
  972. pshufd $0x55, %xmm0, %xmm2
  973. movsd -28 * SIZE(AA), %xmm0
  974. mulps %xmm2, %xmm1
  975. addps %xmm1, %xmm4
  976. movaps -16 * SIZE(BB), %xmm1
  977. pshufd $0x00, %xmm0, %xmm2
  978. mulps %xmm2, %xmm1
  979. addps %xmm1, %xmm4
  980. movaps -12 * SIZE(BB), %xmm1
  981. pshufd $0x55, %xmm0, %xmm2
  982. movsd -26 * SIZE(AA), %xmm0
  983. mulps %xmm2, %xmm1
  984. addps %xmm1, %xmm4
  985. movaps -8 * SIZE(BB), %xmm1
  986. pshufd $0x00, %xmm0, %xmm2
  987. mulps %xmm2, %xmm1
  988. addps %xmm1, %xmm4
  989. movaps -4 * SIZE(BB), %xmm1
  990. pshufd $0x55, %xmm0, %xmm2
  991. movsd -24 * SIZE(AA), %xmm0 # preload A for the next pass
  992. mulps %xmm2, %xmm1
  993. addps %xmm1, %xmm4
  994. movaps 0 * SIZE(BB), %xmm1 # preload B for the next pass
  995. subl $ -8 * SIZE, AA # AA += 8 elements
  996. subl $-32 * SIZE, BB # BB += 32 elements
  997. subl $1, %eax
  998. jne .L32
  999. ALIGN_4
  /*
   * 1x4 tile: remainder loop for the k-steps left over by the 8x unrolled
   * loop (low three bits of the k count), one k-step per iteration.
   */
  1000. .L35:
  1001. #if defined(LT) || defined(RN)
  1002. movl KK, %eax
  1003. #else
  1004. movl K, %eax
  1005. subl KK, %eax
  1006. #endif
  1007. andl $7, %eax # eax = k & 7 (k-steps not covered by the 8x unrolled loop)
  1008. BRANCH
  1009. je .L38
  1010. ALIGN_4
  1011. .L36:
  1012. pshufd $0x00, %xmm0, %xmm2 # broadcast the current A value
  1013. movss -31 * SIZE(AA), %xmm0 # next A value
  1014. mulps %xmm2, %xmm1
  1015. addps %xmm1, %xmm4
  1016. movaps -28 * SIZE(BB), %xmm1 # next four B values
  1017. addl $1 * SIZE, AA
  1018. addl $4 * SIZE, BB
  1019. decl %eax
  1020. jg .L36
  1021. ALIGN_4
  /*
   * 1x4 tile: subtract the accumulated product (%xmm4) from the packed
   * right-hand side, solve, write back to the packed buffer and to the
   * four C columns, then advance pointers and KK.
   *
   * LN/LT: the four results stay packed in %xmm1 and are scaled by the
   *        single A element at AA.
   * RN/RT: the four values are spread into %xmm0..%xmm3 (one column each)
   *        and solved with scalar mulss/subss against the 4x4 B block.
   *
   * NOTE(review): diagonal terms are multiplied, not divided, so they are
   * presumably stored pre-inverted; confirm against the packing code.
   */
  1022. .L38:
  1023. #if defined(LN) || defined(RT)
  1024. movl KK, %eax
  1025. #ifdef LN
  1026. subl $1, %eax
  1027. #else
  1028. subl $4, %eax
  1029. #endif
  1030. movl AORIG, AA
  1031. leal (, %eax, SIZE), %eax
  1032. leal (AA, %eax, 1), AA # AA = AORIG + eax elements
  1033. leal (B, %eax, 4), BB # BB = B + eax * 4 elements
  1034. #endif
  1035. #if defined(LN) || defined(LT)
  1036. movaps -32 * SIZE(BB), %xmm1
  1037. subps %xmm4, %xmm1 # right-hand side minus accumulated product
  1038. #else
  1039. movsd -32 * SIZE(AA), %xmm0
  1040. movhps -30 * SIZE(AA), %xmm0 # xmm0 = four right-hand-side values from AA
  1041. subps %xmm4, %xmm0
  1042. pshufd $0xff, %xmm0, %xmm3 # spread the four lanes into xmm3..xmm0 (columns 3..0)
  1043. pshufd $0xaa, %xmm0, %xmm2
  1044. pshufd $0x55, %xmm0, %xmm1
  1045. pshufd $0x00, %xmm0, %xmm0
  1046. #endif
  1047. #if defined(LN) || defined(LT)
  1048. movss -32 * SIZE(AA), %xmm4
  1049. pshufd $0x00, %xmm4, %xmm6
  1050. mulps %xmm6, %xmm1 # scale all four columns by the single A element
  1051. #endif
  1052. #ifdef RN
  1053. movaps -32 * SIZE(BB), %xmm6
  1054. pshufd $0x00, %xmm6, %xmm7
  1055. mulss %xmm7, %xmm0 # solve column 0, then eliminate it from columns 1..3
  1056. pshufd $0x55, %xmm6, %xmm7
  1057. mulss %xmm0, %xmm7
  1058. subss %xmm7, %xmm1
  1059. pshufd $0xaa, %xmm6, %xmm7
  1060. mulss %xmm0, %xmm7
  1061. subss %xmm7, %xmm2
  1062. pshufd $0xff, %xmm6, %xmm7
  1063. mulss %xmm0, %xmm7
  1064. subss %xmm7, %xmm3
  1065. movaps -28 * SIZE(BB), %xmm6
  1066. pshufd $0x55, %xmm6, %xmm7
  1067. mulss %xmm7, %xmm1 # solve column 1, then eliminate it from columns 2..3
  1068. pshufd $0xaa, %xmm6, %xmm7
  1069. mulss %xmm1, %xmm7
  1070. subss %xmm7, %xmm2
  1071. pshufd $0xff, %xmm6, %xmm7
  1072. mulss %xmm1, %xmm7
  1073. subss %xmm7, %xmm3
  1074. movaps -24 * SIZE(BB), %xmm6
  1075. pshufd $0xaa, %xmm6, %xmm7
  1076. mulss %xmm7, %xmm2 # solve column 2, then eliminate it from column 3
  1077. pshufd $0xff, %xmm6, %xmm7
  1078. mulss %xmm2, %xmm7
  1079. subss %xmm7, %xmm3
  1080. movaps -20 * SIZE(BB), %xmm6
  1081. pshufd $0xff, %xmm6, %xmm7
  1082. mulss %xmm7, %xmm3 # solve column 3
  1083. #endif
  1084. #ifdef RT
  1085. movaps -20 * SIZE(BB), %xmm6
  1086. pshufd $0xff, %xmm6, %xmm7
  1087. mulss %xmm7, %xmm3 # solve column 3, then eliminate it from columns 2..0
  1088. pshufd $0xaa, %xmm6, %xmm7
  1089. mulss %xmm3, %xmm7
  1090. subss %xmm7, %xmm2
  1091. pshufd $0x55, %xmm6, %xmm7
  1092. mulss %xmm3, %xmm7
  1093. subss %xmm7, %xmm1
  1094. pshufd $0x00, %xmm6, %xmm7
  1095. mulss %xmm3, %xmm7
  1096. subss %xmm7, %xmm0
  1097. movaps -24 * SIZE(BB), %xmm6
  1098. pshufd $0xaa, %xmm6, %xmm7
  1099. mulss %xmm7, %xmm2 # solve column 2, then eliminate it from columns 1..0
  1100. pshufd $0x55, %xmm6, %xmm7
  1101. mulss %xmm2, %xmm7
  1102. subss %xmm7, %xmm1
  1103. pshufd $0x00, %xmm6, %xmm7
  1104. mulss %xmm2, %xmm7
  1105. subss %xmm7, %xmm0
  1106. movaps -28 * SIZE(BB), %xmm6
  1107. pshufd $0x55, %xmm6, %xmm7
  1108. mulss %xmm7, %xmm1 # solve column 1, then eliminate it from column 0
  1109. pshufd $0x00, %xmm6, %xmm7
  1110. mulss %xmm1, %xmm7
  1111. subss %xmm7, %xmm0
  1112. movaps -32 * SIZE(BB), %xmm6
  1113. pshufd $0x00, %xmm6, %xmm7
  1114. mulss %xmm7, %xmm0 # solve column 0
  1115. #endif
  1116. #if defined(LN) || defined(LT)
  1117. movaps %xmm1, -32 * SIZE(BB) # write the solved row back to the packed B buffer
  1118. #else
  1119. movss %xmm0, -32 * SIZE(AA) # write the solved values back to the packed A buffer
  1120. movss %xmm1, -31 * SIZE(AA)
  1121. movss %xmm2, -30 * SIZE(AA)
  1122. movss %xmm3, -29 * SIZE(AA)
  1123. #endif
  1124. #ifdef LN
  1125. subl $1 * SIZE, CO1 # LN steps CO1 back by 1 element before the store
  1126. #endif
  1127. leal (LDC, LDC, 2), %eax # eax = 3 * LDC, offset of the fourth column
  1128. #if defined(LN) || defined(LT)
  1129. movaps %xmm1, %xmm0 # shuffle so lane 0 of xmm1/xmm2/xmm0/xmm6 holds
  1130. unpcklps %xmm5, %xmm1 # columns 0/1/2/3; only lane 0 of each is stored
  1131. unpckhps %xmm5, %xmm0
  1132. movaps %xmm3, %xmm4
  1133. unpcklps %xmm7, %xmm3
  1134. unpckhps %xmm7, %xmm4
  1135. movaps %xmm1, %xmm2
  1136. unpcklps %xmm3, %xmm1
  1137. unpckhps %xmm3, %xmm2
  1138. movaps %xmm0, %xmm6
  1139. unpcklps %xmm4, %xmm0
  1140. unpckhps %xmm4, %xmm6
  1141. movss %xmm1, 0 * SIZE(CO1)
  1142. movss %xmm2, 0 * SIZE(CO1, LDC, 1)
  1143. movss %xmm0, 0 * SIZE(CO1, LDC, 2)
  1144. movss %xmm6, 0 * SIZE(CO1, %eax, 1)
  1145. #else
  1146. movss %xmm0, 0 * SIZE(CO1)
  1147. movss %xmm1, 0 * SIZE(CO1, LDC, 1)
  1148. movss %xmm2, 0 * SIZE(CO1, LDC, 2)
  1149. movss %xmm3, 0 * SIZE(CO1, %eax, 1)
  1150. #endif
  1151. #ifndef LN
  1152. addl $1 * SIZE, CO1
  1153. #endif
  1154. #if defined(LT) || defined(RN)
  1155. movl K, %eax
  1156. subl KK, %eax
  1157. leal (,%eax, SIZE), %eax
  1158. leal (AA, %eax, 1), AA # skip the remaining K - KK k-steps of the A panel
  1159. leal (BB, %eax, 4), BB # and of the B panel
  1160. #endif
  1161. #ifdef LN
  1162. subl $1, KK
  1163. #endif
  1164. #ifdef LT
  1165. addl $1, KK
  1166. #endif
  1167. #ifdef RT
  1168. movl K, %eax
  1169. sall $BASE_SHIFT, %eax
  1170. addl %eax, AORIG # AORIG += K << BASE_SHIFT
  1171. #endif
  1172. ALIGN_4
  /*
   * End of one 4-column panel: move B to the next panel, adjust KK for the
   * right-side variants, and loop back to .L10 while J is still positive.
   */
  1173. .L39:
  1174. #ifdef LN
  1175. movl K, %eax
  1176. leal (, %eax, SIZE), %eax
  1177. leal (B, %eax, 4), B # B += K * 4 elements
  1178. #endif
  1179. #if defined(LT) || defined(RN)
  1180. movl BB, B # BB already points past this panel
  1181. #endif
  1182. #ifdef RN
  1183. addl $4, KK
  1184. #endif
  1185. #ifdef RT
  1186. subl $4, KK
  1187. #endif
  1188. decl J # j --
  1189. jg .L10
  1190. ALIGN_4
  /*
   * 2-column panel, taken only when bit 1 of N is set.
   * Resets the A pointer (AA or AORIG), positions B and C for the variant,
   * initialises KK for the left-side variants, and sets %ebx to the number
   * of 4-row tiles (M >> 2).
   */
  1191. .L40:
  1192. testl $2, N
  1193. je .L80
  1194. #if defined(LT) || defined(RN)
  1195. movl A, AA
  1196. #else
  1197. movl A, %eax
  1198. movl %eax, AORIG
  1199. #endif
  1200. #ifdef RT
  1201. movl K, %eax
  1202. sall $1 + BASE_SHIFT, %eax
  1203. subl %eax, B # RT: B -= K << (1 + BASE_SHIFT)
  1204. #endif
  1205. leal (, LDC, 2), %eax # eax = 2 * LDC
  1206. #ifdef RT
  1207. subl %eax, C # RT: step C back by 2 * LDC before using it
  1208. #endif
  1209. movl C, CO1
  1210. #ifndef RT
  1211. addl %eax, C # other variants: step C forward by 2 * LDC afterwards
  1212. #endif
  1213. #ifdef LN
  1214. movl OFFSET, %eax
  1215. addl M, %eax
  1216. movl %eax, KK # LN: KK = OFFSET + M
  1217. #endif
  1218. #ifdef LT
  1219. movl OFFSET, %eax
  1220. movl %eax, KK # LT: KK = OFFSET
  1221. #endif
  1222. movl M, %ebx
  1223. sarl $2, %ebx # i = (m >> 2)
  1224. jle .L60
  1225. ALIGN_4
  /*
   * 4-row x 2-column tile: per-tile setup.  Positions AA/BB for the
   * variant, preloads the first A and B vectors, clears %xmm2..%xmm7,
   * prefetches the two C columns and computes the number of 8x-unrolled
   * passes (k >> 3, with k = KK for LT/RN and K - KK otherwise).
   */
  1226. .L51:
  1227. #ifdef LN
  1228. movl K, %eax
  1229. sall $2 + BASE_SHIFT, %eax
  1230. subl %eax, AORIG # AORIG -= K << (2 + BASE_SHIFT)
  1231. #endif
  1232. #if defined(LN) || defined(RT)
  1233. movl KK, %eax
  1234. movl AORIG, AA
  1235. leal (, %eax, SIZE), %eax
  1236. leal (AA, %eax, 4), AA # AA = AORIG + KK * 4 elements
  1237. #endif
  1238. movl B, BB
  1239. #if defined(LN) || defined(RT)
  1240. movl KK, %eax
  1241. sall $1 + BASE_SHIFT, %eax
  1242. addl %eax, BB # BB = B + (KK << (1 + BASE_SHIFT))
  1243. #endif
  1244. movaps -32 * SIZE(AA), %xmm0 # first four A values (one k-step, four rows)
  1245. pxor %xmm2, %xmm2
  1246. movaps -32 * SIZE(BB), %xmm1 # first four B values (two k-steps, two columns)
  1247. pxor %xmm3, %xmm3
  1248. #ifdef LN
  1249. pxor %xmm4, %xmm4
  1250. prefetcht0 -4 * SIZE(CO1) # LN stores below CO1, so prefetch backwards
  1251. pxor %xmm5, %xmm5
  1252. prefetcht0 -4 * SIZE(CO1, LDC)
  1253. #else
  1254. pxor %xmm4, %xmm4
  1255. prefetcht0 3 * SIZE(CO1)
  1256. pxor %xmm5, %xmm5
  1257. prefetcht0 3 * SIZE(CO1, LDC)
  1258. #endif
  1259. pxor %xmm6, %xmm6
  1260. pxor %xmm7, %xmm7
  1261. #if defined(LT) || defined(RN)
  1262. movl KK, %eax
  1263. #else
  1264. movl K, %eax
  1265. subl KK, %eax
  1266. #endif
  1267. sarl $3, %eax # eax = k >> 3 unrolled passes
  1268. je .L55
  1269. ALIGN_4
  /*
   * 4x2 tile: main multiply-accumulate loop, unrolled over eight k-steps.
   * %xmm0 holds four A values (one k-step, four rows); %xmm1 holds four B
   * values (two k-steps x two columns), broadcast one lane at a time.
   * Products are formed in %xmm2 (column 0) and %xmm3 (column 1) and are
   * added to an accumulator one step later, so a product is still pending
   * in %xmm2/%xmm3 when the loop exits.  Column 0 is spread over
   * %xmm4/%xmm6 and column 1 over %xmm5/%xmm7.
   * Each pass consumes 32 elements of AA and 16 of BB.
   */
  1270. .L52:
  1271. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1272. addps %xmm2, %xmm4 # fold in the product left pending by the previous step
  1273. pshufd $0x00, %xmm1, %xmm2 # broadcast B lane 0 (column 0, first k-step)
  1274. mulps %xmm0, %xmm2
  1275. addps %xmm3, %xmm5
  1276. pshufd $0x55, %xmm1, %xmm3 # broadcast B lane 1 (column 1, first k-step)
  1277. mulps %xmm0, %xmm3
  1278. movaps -28 * SIZE(AA), %xmm0 # A values of the next k-step
  1279. addps %xmm2, %xmm6
  1280. pshufd $0xaa, %xmm1, %xmm2 # broadcast B lane 2 (column 0, second k-step)
  1281. mulps %xmm0, %xmm2
  1282. addps %xmm3, %xmm7
  1283. pshufd $0xff, %xmm1, %xmm3 # broadcast B lane 3 (column 1, second k-step)
  1284. movaps -28 * SIZE(BB), %xmm1 # B values of the next two k-steps
  1285. mulps %xmm0, %xmm3
  1286. movaps -24 * SIZE(AA), %xmm0
  1287. addps %xmm2, %xmm4
  1288. pshufd $0x00, %xmm1, %xmm2
  1289. mulps %xmm0, %xmm2
  1290. addps %xmm3, %xmm5
  1291. pshufd $0x55, %xmm1, %xmm3
  1292. mulps %xmm0, %xmm3
  1293. movaps -20 * SIZE(AA), %xmm0
  1294. addps %xmm2, %xmm6
  1295. pshufd $0xaa, %xmm1, %xmm2
  1296. mulps %xmm0, %xmm2
  1297. addps %xmm3, %xmm7
  1298. pshufd $0xff, %xmm1, %xmm3
  1299. movaps -24 * SIZE(BB), %xmm1
  1300. mulps %xmm0, %xmm3
  1301. movaps -16 * SIZE(AA), %xmm0
  1302. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  1303. addps %xmm2, %xmm4
  1304. pshufd $0x00, %xmm1, %xmm2
  1305. mulps %xmm0, %xmm2
  1306. addps %xmm3, %xmm5
  1307. pshufd $0x55, %xmm1, %xmm3
  1308. mulps %xmm0, %xmm3
  1309. movaps -12 * SIZE(AA), %xmm0
  1310. addps %xmm2, %xmm6
  1311. pshufd $0xaa, %xmm1, %xmm2
  1312. mulps %xmm0, %xmm2
  1313. addps %xmm3, %xmm7
  1314. pshufd $0xff, %xmm1, %xmm3
  1315. movaps -20 * SIZE(BB), %xmm1
  1316. mulps %xmm0, %xmm3
  1317. movaps -8 * SIZE(AA), %xmm0
  1318. addps %xmm2, %xmm4
  1319. pshufd $0x00, %xmm1, %xmm2
  1320. mulps %xmm0, %xmm2
  1321. addps %xmm3, %xmm5
  1322. pshufd $0x55, %xmm1, %xmm3
  1323. mulps %xmm0, %xmm3
  1324. movaps -4 * SIZE(AA), %xmm0
  1325. addps %xmm2, %xmm6
  1326. pshufd $0xaa, %xmm1, %xmm2
  1327. mulps %xmm0, %xmm2
  1328. addps %xmm3, %xmm7
  1329. pshufd $0xff, %xmm1, %xmm3
  1330. movaps -16 * SIZE(BB), %xmm1 # preload B for the next pass
  1331. mulps %xmm0, %xmm3
  1332. movaps 0 * SIZE(AA), %xmm0 # preload A for the next pass
  1333. subl $-32 * SIZE, AA # AA += 32 elements (8 k-steps x 4 rows)
  1334. subl $-16 * SIZE, BB # BB += 16 elements (8 k-steps x 2 columns)
  1335. subl $1, %eax
  1336. jne .L52
  1337. ALIGN_4
  /*
   * 4x2 tile: remainder loop for the k-steps left over by the 8x unrolled
   * loop (low three bits of the k count).  Uses the same deferred-add
   * pattern as the main loop, accumulating into %xmm4/%xmm5 only.
   */
  1338. .L55:
  1339. #if defined(LT) || defined(RN)
  1340. movl KK, %eax
  1341. #else
  1342. movl K, %eax
  1343. subl KK, %eax
  1344. #endif
  1345. andl $7, %eax # eax = k & 7 (k-steps not covered by the 8x unrolled loop)
  1346. BRANCH
  1347. je .L58
  1348. ALIGN_4
  1349. .L56:
  1350. addps %xmm2, %xmm4 # fold in the product left pending by the previous step
  1351. pshufd $0x00, %xmm1, %xmm2 # broadcast B lane 0 (column 0)
  1352. mulps %xmm0, %xmm2
  1353. addps %xmm3, %xmm5
  1354. pshufd $0x55, %xmm1, %xmm3 # broadcast B lane 1 (column 1)
  1355. movsd -30 * SIZE(BB), %xmm1 # next two B values
  1356. mulps %xmm0, %xmm3
  1357. movaps -28 * SIZE(AA), %xmm0 # next four A values
  1358. addl $4 * SIZE, AA
  1359. addl $2 * SIZE, BB
  1360. decl %eax
  1361. jg .L56
  1362. ALIGN_4
  /*
   * 4x2 tile: finish the accumulation, subtract it from the packed
   * right-hand side, run the triangular solve for the selected variant,
   * write back to the packed buffer and to the two C columns, advance
   * pointers and KK, and loop over the remaining 4-row tiles.
   *
   * LN/LT work on rows:    %xmm1/%xmm3/%xmm5/%xmm7 = rows 0..3 (two columns
   *                        each, low half), solved against the 4x4 A block.
   * RN/RT work on columns: %xmm0 = column 0, %xmm1 = column 1 (four rows
   *                        each), solved against the 2x2 B block.
   *
   * NOTE(review): diagonal terms are multiplied, not divided, so they are
   * presumably stored pre-inverted; confirm against the packing code.
   */
  1363. .L58:
  1364. #if defined(LN) || defined(RT)
  1365. movl KK, %eax
  1366. #ifdef LN
  1367. subl $4, %eax
  1368. #else
  1369. subl $2, %eax
  1370. #endif
  1371. movl AORIG, AA
  1372. leal (, %eax, SIZE), %eax
  1373. leal (AA, %eax, 4), AA # AA = AORIG + eax * 4 elements
  1374. leal (B, %eax, 2), BB # BB = B + eax * 2 elements
  1375. #endif
  1376. addps %xmm6, %xmm4 # combine the split accumulators per column
  1377. addps %xmm7, %xmm5
  1378. addps %xmm2, %xmm4 # and fold in the products still pending from the loop
  1379. addps %xmm3, %xmm5
  1380. #if defined(LN) || defined(LT)
  1381. movaps %xmm4, %xmm0 # transpose the two 4-row columns into four 2-wide rows;
  1382. unpcklps %xmm6, %xmm4 # only the low two lanes of each result are meaningful
  1383. unpckhps %xmm6, %xmm0
  1384. movaps %xmm5, %xmm1
  1385. unpcklps %xmm7, %xmm5
  1386. unpckhps %xmm7, %xmm1
  1387. movaps %xmm4, %xmm6
  1388. unpcklps %xmm5, %xmm4 # low half = row 0
  1389. unpckhps %xmm5, %xmm6 # low half = row 1
  1390. movaps %xmm0, %xmm2
  1391. unpcklps %xmm1, %xmm0 # low half = row 2
  1392. unpckhps %xmm1, %xmm2 # low half = row 3
  1393. movsd -32 * SIZE(BB), %xmm1
  1394. movsd -30 * SIZE(BB), %xmm3
  1395. movsd -28 * SIZE(BB), %xmm5
  1396. movsd -26 * SIZE(BB), %xmm7
  1397. subps %xmm4, %xmm1 # right-hand side minus accumulated product, per row
  1398. subps %xmm6, %xmm3
  1399. subps %xmm0, %xmm5
  1400. subps %xmm2, %xmm7
  1401. #else
  1402. movaps -32 * SIZE(AA), %xmm0
  1403. movaps -28 * SIZE(AA), %xmm1
  1404. subps %xmm4, %xmm0 # right-hand side minus accumulated product, per column
  1405. subps %xmm5, %xmm1
  1406. #endif
  1407. #ifdef LN
  1408. movaps -20 * SIZE(AA), %xmm4 # last 4 elements of the 4x4 A block
  1409. pshufd $0xff, %xmm4, %xmm6
  1410. mulps %xmm6, %xmm7 # solve row 3, then eliminate it from rows 2..0
  1411. pshufd $0xaa, %xmm4, %xmm6
  1412. mulps %xmm7, %xmm6
  1413. subps %xmm6, %xmm5
  1414. pshufd $0x55, %xmm4, %xmm6
  1415. mulps %xmm7, %xmm6
  1416. subps %xmm6, %xmm3
  1417. pshufd $0x00, %xmm4, %xmm6
  1418. mulps %xmm7, %xmm6
  1419. subps %xmm6, %xmm1
  1420. movaps -24 * SIZE(AA), %xmm4
  1421. pshufd $0xaa, %xmm4, %xmm6
  1422. mulps %xmm6, %xmm5 # solve row 2, then eliminate it from rows 1..0
  1423. pshufd $0x55, %xmm4, %xmm6
  1424. mulps %xmm5, %xmm6
  1425. subps %xmm6, %xmm3
  1426. pshufd $0x00, %xmm4, %xmm6
  1427. mulps %xmm5, %xmm6
  1428. subps %xmm6, %xmm1
  1429. movaps -28 * SIZE(AA), %xmm4
  1430. pshufd $0x55, %xmm4, %xmm6
  1431. mulps %xmm6, %xmm3 # solve row 1, then eliminate it from row 0
  1432. pshufd $0x00, %xmm4, %xmm6
  1433. mulps %xmm3, %xmm6
  1434. subps %xmm6, %xmm1
  1435. movaps -32 * SIZE(AA), %xmm4
  1436. pshufd $0x00, %xmm4, %xmm6
  1437. mulps %xmm6, %xmm1 # solve row 0
  1438. #endif
  1439. #ifdef LT
  1440. movaps -32 * SIZE(AA), %xmm4 # first 4 elements of the 4x4 A block
  1441. pshufd $0x00, %xmm4, %xmm6
  1442. mulps %xmm6, %xmm1 # solve row 0, then eliminate it from rows 1..3
  1443. pshufd $0x55, %xmm4, %xmm6
  1444. mulps %xmm1, %xmm6
  1445. subps %xmm6, %xmm3
  1446. pshufd $0xaa, %xmm4, %xmm6
  1447. mulps %xmm1, %xmm6
  1448. subps %xmm6, %xmm5
  1449. pshufd $0xff, %xmm4, %xmm6
  1450. mulps %xmm1, %xmm6
  1451. subps %xmm6, %xmm7
  1452. movaps -28 * SIZE(AA), %xmm4
  1453. pshufd $0x55, %xmm4, %xmm6
  1454. mulps %xmm6, %xmm3 # solve row 1, then eliminate it from rows 2..3
  1455. pshufd $0xaa, %xmm4, %xmm6
  1456. mulps %xmm3, %xmm6
  1457. subps %xmm6, %xmm5
  1458. pshufd $0xff, %xmm4, %xmm6
  1459. mulps %xmm3, %xmm6
  1460. subps %xmm6, %xmm7
  1461. movaps -24 * SIZE(AA), %xmm4
  1462. pshufd $0xaa, %xmm4, %xmm6
  1463. mulps %xmm6, %xmm5 # solve row 2, then eliminate it from row 3
  1464. pshufd $0xff, %xmm4, %xmm6
  1465. mulps %xmm5, %xmm6
  1466. subps %xmm6, %xmm7
  1467. movaps -20 * SIZE(AA), %xmm4
  1468. pshufd $0xff, %xmm4, %xmm6
  1469. mulps %xmm6, %xmm7 # solve row 3
  1470. #endif
  1471. #ifdef RN
  1472. movaps -32 * SIZE(BB), %xmm6 # 2x2 block of B; lanes 0, 1 and 3 are used
  1473. pshufd $0x00, %xmm6, %xmm7
  1474. mulps %xmm7, %xmm0 # solve column 0
  1475. pshufd $0x55, %xmm6, %xmm7
  1476. mulps %xmm0, %xmm7
  1477. subps %xmm7, %xmm1 # eliminate column 0 from column 1
  1478. pshufd $0xff, %xmm6, %xmm7
  1479. mulps %xmm7, %xmm1 # solve column 1
  1480. #endif
  1481. #ifdef RT
  1482. movaps -32 * SIZE(BB), %xmm6 # 2x2 block of B; lanes 3, 2 and 0 are used
  1483. pshufd $0xff, %xmm6, %xmm7
  1484. mulps %xmm7, %xmm1 # solve column 1
  1485. pshufd $0xaa, %xmm6, %xmm7
  1486. mulps %xmm1, %xmm7
  1487. subps %xmm7, %xmm0 # eliminate column 1 from column 0
  1488. pshufd $0x00, %xmm6, %xmm7
  1489. mulps %xmm7, %xmm0 # solve column 0
  1490. #endif
  1491. #if defined(LN) || defined(LT)
  1492. movlps %xmm1, -32 * SIZE(BB) # write solved rows back to the packed B buffer
  1493. movlps %xmm3, -30 * SIZE(BB)
  1494. movlps %xmm5, -28 * SIZE(BB)
  1495. movlps %xmm7, -26 * SIZE(BB)
  1496. #else
  1497. movaps %xmm0, -32 * SIZE(AA) # write solved columns back to the packed A buffer
  1498. movaps %xmm1, -28 * SIZE(AA)
  1499. #endif
  1500. #ifdef LN
  1501. subl $4 * SIZE, CO1 # LN steps CO1 back by 4 elements before the store
  1502. #endif
  1503. #if defined(LN) || defined(LT)
  1504. unpcklps %xmm5, %xmm1 # transpose the four rows back into two columns
  1505. unpcklps %xmm7, %xmm3
  1506. movaps %xmm1, %xmm2
  1507. unpcklps %xmm3, %xmm1 # xmm1 = column 0 (rows 0..3)
  1508. unpckhps %xmm3, %xmm2 # xmm2 = column 1 (rows 0..3)
  1509. movlps %xmm1, 0 * SIZE(CO1)
  1510. movhps %xmm1, 2 * SIZE(CO1)
  1511. movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
  1512. movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
  1513. #else
  1514. movlps %xmm0, 0 * SIZE(CO1)
  1515. movhps %xmm0, 2 * SIZE(CO1)
  1516. movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
  1517. movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
  1518. #endif
  1519. #ifndef LN
  1520. addl $4 * SIZE, CO1
  1521. #endif
  1522. #if defined(LT) || defined(RN)
  1523. movl K, %eax
  1524. subl KK, %eax
  1525. leal (,%eax, SIZE), %eax
  1526. leal (AA, %eax, 4), AA # skip the remaining K - KK k-steps of the A panel
  1527. leal (BB, %eax, 2), BB # and of the B panel
  1528. #endif
  1529. #ifdef LN
  1530. subl $4, KK
  1531. #endif
  1532. #ifdef LT
  1533. addl $4, KK
  1534. #endif
  1535. #ifdef RT
  1536. movl K, %eax
  1537. sall $2 + BASE_SHIFT, %eax
  1538. addl %eax, AORIG # AORIG += K << (2 + BASE_SHIFT)
  1539. #endif
  1540. decl %ebx # i --
  1541. jg .L51
  1542. ALIGN_4
  /*
   * 2-row x 2-column tile, taken only when bit 1 of M is set.
   * Positions AA/BB for the variant, preloads the first A and B vectors,
   * clears %xmm3..%xmm5 and computes the number of 8x-unrolled passes.
   * %xmm6/%xmm7 are not cleared here; the solve stage only uses them as
   * filler lanes in its unpack sequence.
   */
  1543. .L60:
  1544. testl $2, M
  1545. je .L70
  1546. #ifdef LN
  1547. movl K, %eax
  1548. sall $1 + BASE_SHIFT, %eax
  1549. subl %eax, AORIG # AORIG -= K << (1 + BASE_SHIFT)
  1550. #endif
  1551. #if defined(LN) || defined(RT)
  1552. movl KK, %eax
  1553. movl AORIG, AA
  1554. leal (, %eax, SIZE), %eax
  1555. leal (AA, %eax, 2), AA # AA = AORIG + KK * 2 elements
  1556. #endif
  1557. movl B, BB
  1558. #if defined(LN) || defined(RT)
  1559. movl KK, %eax
  1560. sall $1 + BASE_SHIFT, %eax
  1561. addl %eax, BB # BB = B + (KK << (1 + BASE_SHIFT))
  1562. #endif
  1563. movaps -32 * SIZE(AA), %xmm0 # first four A values (two k-steps, two rows)
  1564. pxor %xmm3, %xmm3
  1565. movaps -32 * SIZE(BB), %xmm1 # first four B values (two k-steps, two columns)
  1566. pxor %xmm4, %xmm4
  1567. pxor %xmm5, %xmm5
  1568. #if defined(LT) || defined(RN)
  1569. movl KK, %eax
  1570. #else
  1571. movl K, %eax
  1572. subl KK, %eax
  1573. #endif
  1574. sarl $3, %eax # eax = k >> 3 unrolled passes
  1575. je .L65
  1576. ALIGN_4
  /*
   * 2x2 tile: main multiply-accumulate loop, unrolled over eight k-steps.
   * %xmm0 and %xmm1 each hold two k-steps (A: two rows, B: two columns).
   * For each k-step the product (a0*b0, a1*b0, a0*b1, a1*b1) is formed in
   * %xmm3 and added one step later, alternating between %xmm4 and %xmm5;
   * one product is still pending in %xmm3 when the loop exits.
   * Each pass consumes 16 elements of AA and 16 of BB.
   */
  1577. .L62:
  1578. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1579. pshufd $0x44, %xmm0, %xmm2 # xmm2 = (a0, a1, a0, a1): A pair of the even k-step
  1580. addps %xmm3, %xmm4 # fold in the product left pending by the previous step
  1581. pshufd $0x50, %xmm1, %xmm3 # xmm3 = (b0, b0, b1, b1): B pair of the even k-step
  1582. mulps %xmm2, %xmm3
  1583. pshufd $0xee, %xmm0, %xmm2 # xmm2 = (a2, a3, a2, a3): A pair of the odd k-step
  1584. movaps -28 * SIZE(AA), %xmm0
  1585. addps %xmm3, %xmm5
  1586. pshufd $0xfa, %xmm1, %xmm3 # xmm3 = (b2, b2, b3, b3): B pair of the odd k-step
  1587. movaps -28 * SIZE(BB), %xmm1
  1588. mulps %xmm2, %xmm3
  1589. pshufd $0x44, %xmm0, %xmm2
  1590. addps %xmm3, %xmm4
  1591. pshufd $0x50, %xmm1, %xmm3
  1592. mulps %xmm2, %xmm3
  1593. pshufd $0xee, %xmm0, %xmm2
  1594. movaps -24 * SIZE(AA), %xmm0
  1595. addps %xmm3, %xmm5
  1596. pshufd $0xfa, %xmm1, %xmm3
  1597. movaps -24 * SIZE(BB), %xmm1
  1598. mulps %xmm2, %xmm3
  1599. pshufd $0x44, %xmm0, %xmm2
  1600. addps %xmm3, %xmm4
  1601. pshufd $0x50, %xmm1, %xmm3
  1602. mulps %xmm2, %xmm3
  1603. pshufd $0xee, %xmm0, %xmm2
  1604. movaps -20 * SIZE(AA), %xmm0
  1605. addps %xmm3, %xmm5
  1606. pshufd $0xfa, %xmm1, %xmm3
  1607. movaps -20 * SIZE(BB), %xmm1
  1608. mulps %xmm2, %xmm3
  1609. pshufd $0x44, %xmm0, %xmm2
  1610. addps %xmm3, %xmm4
  1611. pshufd $0x50, %xmm1, %xmm3
  1612. mulps %xmm2, %xmm3
  1613. pshufd $0xee, %xmm0, %xmm2
  1614. movaps -16 * SIZE(AA), %xmm0 # preload A for the next pass
  1615. addps %xmm3, %xmm5
  1616. pshufd $0xfa, %xmm1, %xmm3
  1617. movaps -16 * SIZE(BB), %xmm1 # preload B for the next pass
  1618. mulps %xmm2, %xmm3
  1619. subl $-16 * SIZE, AA # AA += 16 elements (8 k-steps x 2 rows)
  1620. subl $-16 * SIZE, BB # BB += 16 elements (8 k-steps x 2 columns)
  1621. subl $1, %eax
  1622. jne .L62
  1623. ALIGN_4
  /*
   * 2x2 tile: remainder loop for the k-steps left over by the 8x unrolled
   * loop (low three bits of the k count), one k-step per iteration, with
   * the same deferred add into %xmm4 as the main loop.
   */
  1624. .L65:
  1625. #if defined(LT) || defined(RN)
  1626. movl KK, %eax
  1627. #else
  1628. movl K, %eax
  1629. subl KK, %eax
  1630. #endif
  1631. andl $7, %eax # eax = k & 7 (k-steps not covered by the 8x unrolled loop)
  1632. BRANCH
  1633. je .L68
  1634. ALIGN_4
  1635. .L66:
  1636. pshufd $0x44, %xmm0, %xmm2 # xmm2 = (a0, a1, a0, a1)
  1637. movsd -30 * SIZE(AA), %xmm0 # next two A values
  1638. addps %xmm3, %xmm4 # fold in the product left pending by the previous step
  1639. pshufd $0x50, %xmm1, %xmm3 # xmm3 = (b0, b0, b1, b1)
  1640. movsd -30 * SIZE(BB), %xmm1 # next two B values
  1641. mulps %xmm2, %xmm3
  1642. addl $2 * SIZE, AA
  1643. addl $2 * SIZE, BB
  1644. decl %eax
  1645. jg .L66
  1646. ALIGN_4
  /*
   * 2x2 tile: finish the accumulation, subtract it from the packed
   * right-hand side, solve against the 2x2 triangular block, write back
   * to the packed buffer and to the two C columns, then advance pointers
   * and KK.
   *
   * LN/LT: %xmm1 = row 0, %xmm3 = row 1 (two columns each, low half).
   * RN/RT: %xmm0 = column 0, %xmm1 = column 1 (two rows each, low half).
   *
   * NOTE(review): diagonal terms are multiplied, not divided, so they are
   * presumably stored pre-inverted; confirm against the packing code.
   */
  1647. .L68:
  1648. #if defined(LN) || defined(RT)
  1649. movl KK, %eax
  1650. #ifdef LN
  1651. subl $2, %eax
  1652. #else
  1653. subl $2, %eax # same adjustment in both branches: the tile is 2x2
  1654. #endif
  1655. movl AORIG, AA
  1656. leal (, %eax, SIZE), %eax
  1657. leal (AA, %eax, 2), AA # AA = AORIG + eax * 2 elements
  1658. leal (B, %eax, 2), BB # BB = B + eax * 2 elements
  1659. #endif
  1660. addps %xmm3, %xmm4 # fold in the product still pending from the loop
  1661. addps %xmm5, %xmm4 # combine even and odd k-step accumulators
  1662. movhlps %xmm4, %xmm5 # low half of xmm5 = column 1
  1663. #if defined(LN) || defined(LT)
  1664. unpcklps %xmm6, %xmm4 # transpose columns into rows; xmm6/xmm7 only supply
  1665. unpcklps %xmm7, %xmm5 # filler lanes that end up in the unused upper half
  1666. movaps %xmm4, %xmm6
  1667. unpcklps %xmm5, %xmm4 # low half = row 0
  1668. unpckhps %xmm5, %xmm6 # low half = row 1
  1669. movsd -32 * SIZE(BB), %xmm1
  1670. movsd -30 * SIZE(BB), %xmm3
  1671. subps %xmm4, %xmm1 # right-hand side minus accumulated product
  1672. subps %xmm6, %xmm3
  1673. #else
  1674. movsd -32 * SIZE(AA), %xmm0
  1675. movsd -30 * SIZE(AA), %xmm1
  1676. subps %xmm4, %xmm0 # right-hand side minus accumulated product
  1677. subps %xmm5, %xmm1
  1678. #endif
  1679. #ifdef LN
  1680. movaps -32 * SIZE(AA), %xmm4 # 2x2 block of A; lanes 3, 2 and 0 are used
  1681. pshufd $0xff, %xmm4, %xmm6
  1682. mulps %xmm6, %xmm3 # solve row 1
  1683. pshufd $0xaa, %xmm4, %xmm6
  1684. mulps %xmm3, %xmm6
  1685. subps %xmm6, %xmm1 # eliminate row 1 from row 0
  1686. pshufd $0x00, %xmm4, %xmm6
  1687. mulps %xmm6, %xmm1 # solve row 0
  1688. #endif
  1689. #ifdef LT
  1690. movaps -32 * SIZE(AA), %xmm4 # 2x2 block of A; lanes 0, 1 and 3 are used
  1691. pshufd $0x00, %xmm4, %xmm6
  1692. mulps %xmm6, %xmm1 # solve row 0
  1693. pshufd $0x55, %xmm4, %xmm6
  1694. mulps %xmm1, %xmm6
  1695. subps %xmm6, %xmm3 # eliminate row 0 from row 1
  1696. pshufd $0xff, %xmm4, %xmm6
  1697. mulps %xmm6, %xmm3 # solve row 1
  1698. #endif
  1699. #ifdef RN
  1700. movaps -32 * SIZE(BB), %xmm6 # 2x2 block of B; lanes 0, 1 and 3 are used
  1701. pshufd $0x00, %xmm6, %xmm7
  1702. mulps %xmm7, %xmm0 # solve column 0
  1703. pshufd $0x55, %xmm6, %xmm7
  1704. mulps %xmm0, %xmm7
  1705. subps %xmm7, %xmm1 # eliminate column 0 from column 1
  1706. pshufd $0xff, %xmm6, %xmm7
  1707. mulps %xmm7, %xmm1 # solve column 1
  1708. #endif
  1709. #ifdef RT
  1710. movaps -32 * SIZE(BB), %xmm6 # 2x2 block of B; lanes 3, 2 and 0 are used
  1711. pshufd $0xff, %xmm6, %xmm7
  1712. mulps %xmm7, %xmm1 # solve column 1
  1713. pshufd $0xaa, %xmm6, %xmm7
  1714. mulps %xmm1, %xmm7
  1715. subps %xmm7, %xmm0 # eliminate column 1 from column 0
  1716. pshufd $0x00, %xmm6, %xmm7
  1717. mulps %xmm7, %xmm0 # solve column 0
  1718. #endif
  1719. #if defined(LN) || defined(LT)
  1720. movlps %xmm1, -32 * SIZE(BB) # write solved rows back to the packed B buffer
  1721. movlps %xmm3, -30 * SIZE(BB)
  1722. #else
  1723. movlps %xmm0, -32 * SIZE(AA) # write solved columns back to the packed A buffer
  1724. movlps %xmm1, -30 * SIZE(AA)
  1725. #endif
  1726. #ifdef LN
  1727. subl $2 * SIZE, CO1 # LN steps CO1 back by 2 elements before the store
  1728. #endif
  1729. #if defined(LN) || defined(LT)
  1730. unpcklps %xmm3, %xmm1 # interleave rows: low half = column 0, high half = column 1
  1731. movlps %xmm1, 0 * SIZE(CO1)
  1732. movhps %xmm1, 0 * SIZE(CO1, LDC)
  1733. #else
  1734. movlps %xmm0, 0 * SIZE(CO1)
  1735. movlps %xmm1, 0 * SIZE(CO1, LDC)
  1736. #endif
  1737. #ifndef LN
  1738. addl $2 * SIZE, CO1
  1739. #endif
  1740. #if defined(LT) || defined(RN)
  1741. movl K, %eax
  1742. subl KK, %eax
  1743. leal (,%eax, SIZE), %eax
  1744. leal (AA, %eax, 2), AA # skip the remaining K - KK k-steps of the A panel
  1745. leal (BB, %eax, 2), BB # and of the B panel
  1746. #endif
  1747. #ifdef LN
  1748. subl $2, KK
  1749. #endif
  1750. #ifdef LT
  1751. addl $2, KK
  1752. #endif
  1753. #ifdef RT
  1754. movl K, %eax
  1755. sall $1 + BASE_SHIFT, %eax
  1756. addl %eax, AORIG # AORIG += K << (1 + BASE_SHIFT)
  1757. #endif
  1758. ALIGN_4
  /*
   * 1-row x 2-column tile, taken only when bit 0 of M is set.
   * Positions AA/BB for the variant, clears %xmm4/%xmm5, preloads the
   * first A and B values and computes the number of 8x-unrolled passes.
   */
  1759. .L70:
  1760. testl $1, M
  1761. je .L79
  1762. #ifdef LN
  1763. movl K, %eax
  1764. sall $BASE_SHIFT, %eax
  1765. subl %eax, AORIG # AORIG -= K << BASE_SHIFT
  1766. #endif
  1767. #if defined(LN) || defined(RT)
  1768. movl KK, %eax
  1769. movl AORIG, AA
  1770. leal (AA, %eax, SIZE), AA # AA = AORIG + KK elements
  1771. #endif
  1772. movl B, BB
  1773. #if defined(LN) || defined(RT)
  1774. movl KK, %eax
  1775. sall $1 + BASE_SHIFT, %eax
  1776. addl %eax, BB # BB = B + (KK << (1 + BASE_SHIFT))
  1777. #endif
  1778. pxor %xmm4, %xmm4
  1779. movsd -32 * SIZE(AA), %xmm0 # first two A values (two k-steps)
  1780. pxor %xmm5, %xmm5
  1781. movsd -32 * SIZE(BB), %xmm1 # first two B values (one k-step, two columns)
  1782. #if defined(LT) || defined(RN)
  1783. movl KK, %eax
  1784. #else
  1785. movl K, %eax
  1786. subl KK, %eax
  1787. #endif
  1788. sarl $3, %eax # eax = k >> 3 unrolled passes
  1789. je .L75
  1790. ALIGN_4
  /*
   * 1x2 tile: main multiply-accumulate loop, unrolled over eight k-steps.
   * %xmm0 holds two consecutive A values, broadcast in turn with
   * pshufd $0x00 / $0x55; %xmm1 holds the two B values of one k-step in
   * its low half.  Even k-steps accumulate into %xmm4, odd into %xmm5.
   * Each pass consumes 8 elements of AA and 16 of BB.
   */
  1791. .L72:
  1792. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1793. pshufd $0x00, %xmm0, %xmm2 # broadcast the A value of the even k-step
  1794. mulps %xmm2, %xmm1
  1795. addps %xmm1, %xmm4
  1796. movsd -30 * SIZE(BB), %xmm1
  1797. pshufd $0x55, %xmm0, %xmm2 # broadcast the A value of the odd k-step
  1798. movsd -30 * SIZE(AA), %xmm0 # next two A values
  1799. mulps %xmm2, %xmm1
  1800. addps %xmm1, %xmm5
  1801. movsd -28 * SIZE(BB), %xmm1
  1802. pshufd $0x00, %xmm0, %xmm2
  1803. mulps %xmm2, %xmm1
  1804. addps %xmm1, %xmm4
  1805. movsd -26 * SIZE(BB), %xmm1
  1806. pshufd $0x55, %xmm0, %xmm2
  1807. movsd -28 * SIZE(AA), %xmm0
  1808. mulps %xmm2, %xmm1
  1809. addps %xmm1, %xmm5
  1810. movsd -24 * SIZE(BB), %xmm1
  1811. pshufd $0x00, %xmm0, %xmm2
  1812. mulps %xmm2, %xmm1
  1813. addps %xmm1, %xmm4
  1814. movsd -22 * SIZE(BB), %xmm1
  1815. pshufd $0x55, %xmm0, %xmm2
  1816. movsd -26 * SIZE(AA), %xmm0
  1817. mulps %xmm2, %xmm1
  1818. addps %xmm1, %xmm5
  1819. movsd -20 * SIZE(BB), %xmm1
  1820. pshufd $0x00, %xmm0, %xmm2
  1821. mulps %xmm2, %xmm1
  1822. addps %xmm1, %xmm4
  1823. movsd -18 * SIZE(BB), %xmm1
  1824. pshufd $0x55, %xmm0, %xmm2
  1825. movsd -24 * SIZE(AA), %xmm0 # preload A for the next pass
  1826. mulps %xmm2, %xmm1
  1827. addps %xmm1, %xmm5
  1828. movsd -16 * SIZE(BB), %xmm1 # preload B for the next pass
  1829. subl $ -8 * SIZE, AA # AA += 8 elements
  1830. subl $-16 * SIZE, BB # BB += 16 elements
  1831. subl $1, %eax
  1832. jne .L72
  1833. ALIGN_4
  /*
   * 1x2 tile: remainder loop for the k-steps left over by the 8x unrolled
   * loop (low three bits of the k count), one k-step per iteration,
   * accumulating into %xmm4.
   */
  1834. .L75:
  1835. #if defined(LT) || defined(RN)
  1836. movl KK, %eax
  1837. #else
  1838. movl K, %eax
  1839. subl KK, %eax
  1840. #endif
  1841. andl $7, %eax # eax = k & 7 (k-steps not covered by the 8x unrolled loop)
  1842. BRANCH
  1843. je .L78
  1844. ALIGN_4
  1845. .L76:
  1846. pshufd $0x00, %xmm0, %xmm2 # broadcast the current A value
  1847. movss -31 * SIZE(AA), %xmm0 # next A value
  1848. mulps %xmm2, %xmm1
  1849. addps %xmm1, %xmm4
  1850. movsd -30 * SIZE(BB), %xmm1 # next two B values
  1851. addl $1 * SIZE, AA
  1852. addl $2 * SIZE, BB
  1853. decl %eax
  1854. jg .L76
  1855. ALIGN_4
  /*
   * 1x2 tile: combine the accumulators, subtract from the packed
   * right-hand side, solve, write back to the packed buffer and to the
   * two C columns, then advance pointers and KK.
   *
   * LN/LT: both columns stay packed in the low half of %xmm1 and are
   *        scaled by the single A element at AA.
   * RN/RT: %xmm0 = column 0, %xmm1 = column 1 (scalars), solved against
   *        the 2x2 B block.
   *
   * NOTE(review): diagonal terms are multiplied, not divided, so they are
   * presumably stored pre-inverted; confirm against the packing code.
   */
  1856. .L78:
  1857. #if defined(LN) || defined(RT)
  1858. movl KK, %eax
  1859. #ifdef LN
  1860. subl $1, %eax
  1861. #else
  1862. subl $2, %eax
  1863. #endif
  1864. movl AORIG, AA
  1865. leal (, %eax, SIZE), %eax
  1866. leal (AA, %eax, 1), AA # AA = AORIG + eax elements
  1867. leal (B, %eax, 2), BB # BB = B + eax * 2 elements
  1868. #endif
  1869. addps %xmm5, %xmm4 # combine even and odd k-step accumulators
  1870. pshufd $0x55, %xmm4, %xmm5 # xmm5 = column 1 sum, broadcast
  1871. pshufd $0x00, %xmm4, %xmm4 # xmm4 = column 0 sum, broadcast
  1872. #if defined(LN) || defined(LT)
  1873. unpcklps %xmm5, %xmm4 # low half = (column 0, column 1)
  1874. movsd -32 * SIZE(BB), %xmm1
  1875. subps %xmm4, %xmm1 # right-hand side minus accumulated product
  1876. #else
  1877. movss -32 * SIZE(AA), %xmm0
  1878. movss -31 * SIZE(AA), %xmm1
  1879. subss %xmm4, %xmm0 # right-hand side minus accumulated product
  1880. subss %xmm5, %xmm1
  1881. #endif
  1882. #if defined(LN) || defined(LT)
  1883. movss -32 * SIZE(AA), %xmm4
  1884. pshufd $0x00, %xmm4, %xmm6
  1885. mulps %xmm6, %xmm1 # scale both columns by the single A element
  1886. #endif
  1887. #ifdef RN
  1888. movaps -32 * SIZE(BB), %xmm6 # 2x2 block of B; lanes 0, 1 and 3 are used
  1889. pshufd $0x00, %xmm6, %xmm7
  1890. mulss %xmm7, %xmm0 # solve column 0
  1891. pshufd $0x55, %xmm6, %xmm7
  1892. mulss %xmm0, %xmm7
  1893. subss %xmm7, %xmm1 # eliminate column 0 from column 1
  1894. pshufd $0xff, %xmm6, %xmm7
  1895. mulss %xmm7, %xmm1 # solve column 1
  1896. #endif
  1897. #ifdef RT
  1898. movaps -32 * SIZE(BB), %xmm6 # 2x2 block of B; lanes 3, 2 and 0 are used
  1899. pshufd $0xff, %xmm6, %xmm7
  1900. mulss %xmm7, %xmm1 # solve column 1
  1901. pshufd $0xaa, %xmm6, %xmm7
  1902. mulss %xmm1, %xmm7
  1903. subss %xmm7, %xmm0 # eliminate column 1 from column 0
  1904. pshufd $0x00, %xmm6, %xmm7
  1905. mulss %xmm7, %xmm0 # solve column 0
  1906. #endif
  1907. #if defined(LN) || defined(LT)
  1908. movlps %xmm1, -32 * SIZE(BB) # write the solved pair back to the packed B buffer
  1909. #else
  1910. movss %xmm0, -32 * SIZE(AA) # write the solved values back to the packed A buffer
  1911. movss %xmm1, -31 * SIZE(AA)
  1912. #endif
  1913. #ifdef LN
  1914. subl $1 * SIZE, CO1 # LN steps CO1 back by 1 element before the store
  1915. #endif
  1916. #if defined(LN) || defined(LT)
  1917. pshufd $1, %xmm1, %xmm3 # move lane 1 (column 1) into lane 0 of xmm3
  1918. movss %xmm1, 0 * SIZE(CO1)
  1919. movss %xmm3, 0 * SIZE(CO1, LDC)
  1920. #else
  1921. movss %xmm0, 0 * SIZE(CO1)
  1922. movss %xmm1, 0 * SIZE(CO1, LDC)
  1923. #endif
  1924. #ifndef LN
  1925. addl $1 * SIZE, CO1
  1926. #endif
  1927. #if defined(LT) || defined(RN)
  1928. movl K, %eax
  1929. subl KK, %eax
  1930. leal (,%eax, SIZE), %eax
  1931. leal (AA, %eax, 1), AA # skip the remaining K - KK k-steps of the A panel
  1932. leal (BB, %eax, 2), BB # and of the B panel
  1933. #endif
  1934. #ifdef LN
  1935. subl $1, KK
  1936. #endif
  1937. #ifdef LT
  1938. addl $1, KK
  1939. #endif
  1940. #ifdef RT
  1941. movl K, %eax
  1942. sall $BASE_SHIFT, %eax
  1943. addl %eax, AORIG # AORIG += K << BASE_SHIFT
  1944. #endif
  1945. ALIGN_4
  1946. .L79:
  1947. #ifdef LN
  1948. movl K, %eax
  1949. leal (, %eax, SIZE), %eax
  1950. leal (B, %eax, 2), B
  1951. #endif
  1952. #if defined(LT) || defined(RN)
  1953. movl BB, B
  1954. #endif
  1955. #ifdef RN
  1956. addl $2, KK
  1957. #endif
  1958. #ifdef RT
  1959. subl $2, KK
  1960. #endif
  1961. ALIGN_4
  1962. .L80: # last single column: only runs when bit 0 of n is set
  1963. testl $1, N
  1964. je .L999 # n is even, nothing left to do
  1965. #if defined(LT) || defined(RN)
  1966. movl A, AA # these variants walk a forwards from its start
  1967. #else
  1968. movl A, %eax
  1969. movl %eax, AORIG # these variants recompute aa from aorig for every tile
  1970. #endif
  1971. #ifdef RT
  1972. movl K, %eax
  1973. sall $BASE_SHIFT, %eax # eax = k << base_shift
  1974. subl %eax, B # step b back by that amount
  1975. #endif
  1976. #ifdef RT
  1977. subl LDC, C # step c back by ldc before taking the column pointer
  1978. #endif
  1979. movl C, CO1 # co1 = output pointer for this column
  1980. #ifndef RT
  1981. addl LDC, C # all other variants advance c by ldc afterwards
  1982. #endif
  1983. #ifdef LN
  1984. movl OFFSET, %eax
  1985. addl M, %eax
  1986. movl %eax, KK # kk = offset + m
  1987. #endif
  1988. #ifdef LT
  1989. movl OFFSET, %eax
  1990. movl %eax, KK # kk = offset
  1991. #endif
  1992. movl M, %ebx
  1993. sarl $2, %ebx # i = (m >> 2)
  1994. jle .L100 # no group of four: go to the two-wide and one-wide tails
  1995. ALIGN_4
  1996. .L91: # loop over groups of four a values against one b value; ebx holds the group count
  1997. #ifdef LN
  1998. movl K, %eax
  1999. sall $2 + BASE_SHIFT, %eax # eax = k << (2 + base_shift)
  2000. subl %eax, AORIG
  2001. #endif
  2002. #if defined(LN) || defined(RT)
  2003. movl KK, %eax
  2004. movl AORIG, AA
  2005. leal (, %eax, SIZE), %eax
  2006. leal (AA, %eax, 4), AA # aa = aorig + 4 * kk * size
  2007. #endif
  2008. movl B, BB
  2009. #if defined(LN) || defined(RT)
  2010. movl KK, %eax
  2011. sall $BASE_SHIFT, %eax
  2012. addl %eax, BB # bb = b + (kk << base_shift)
  2013. #endif
  2014. movaps -32 * SIZE(AA), %xmm0 # preload the first four a values
  2015. pxor %xmm2, %xmm2 # xmm2 carries the pending product, starts at zero
  2016. movsd -32 * SIZE(BB), %xmm1 # preload the first two b values
  2017. pxor %xmm4, %xmm4 # accumulator 1
  2018. #ifdef LN
  2019. prefetcht0 -4 * SIZE(CO1)
  2020. #else
  2021. prefetcht0 3 * SIZE(CO1)
  2022. #endif
  2023. pxor %xmm5, %xmm5 # accumulator 2
  2024. #if defined(LT) || defined(RN)
  2025. movl KK, %eax
  2026. #else
  2027. movl K, %eax
  2028. subl KK, %eax
  2029. #endif
  2030. sarl $3, %eax # trip count of the 8x unrolled loop
  2031. je .L95
  2032. ALIGN_4
  2033. .L92: # eight k steps per pass; each product is added one step later, alternating xmm4 and xmm5
  2034. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  2035. addps %xmm2, %xmm4 # add the product left pending by the previous step
  2036. pshufd $0x00, %xmm1, %xmm2 # broadcast the first b value of the pair
  2037. mulps %xmm0, %xmm2
  2038. movaps -28 * SIZE(AA), %xmm0
  2039. addps %xmm2, %xmm5
  2040. pshufd $0x55, %xmm1, %xmm2 # broadcast the second b value of the pair
  2041. movsd -30 * SIZE(BB), %xmm1
  2042. mulps %xmm0, %xmm2
  2043. movaps -24 * SIZE(AA), %xmm0
  2044. addps %xmm2, %xmm4
  2045. pshufd $0x00, %xmm1, %xmm2
  2046. mulps %xmm0, %xmm2
  2047. movaps -20 * SIZE(AA), %xmm0
  2048. addps %xmm2, %xmm5
  2049. pshufd $0x55, %xmm1, %xmm2
  2050. movsd -28 * SIZE(BB), %xmm1
  2051. mulps %xmm0, %xmm2
  2052. movaps -16 * SIZE(AA), %xmm0
  2053. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  2054. addps %xmm2, %xmm4
  2055. pshufd $0x00, %xmm1, %xmm2
  2056. mulps %xmm0, %xmm2
  2057. movaps -12 * SIZE(AA), %xmm0
  2058. addps %xmm2, %xmm5
  2059. pshufd $0x55, %xmm1, %xmm2
  2060. movsd -26 * SIZE(BB), %xmm1
  2061. mulps %xmm0, %xmm2
  2062. movaps -8 * SIZE(AA), %xmm0
  2063. addps %xmm2, %xmm4
  2064. pshufd $0x00, %xmm1, %xmm2
  2065. mulps %xmm0, %xmm2
  2066. movaps -4 * SIZE(AA), %xmm0
  2067. addps %xmm2, %xmm5
  2068. pshufd $0x55, %xmm1, %xmm2
  2069. movsd -24 * SIZE(BB), %xmm1
  2070. mulps %xmm0, %xmm2 # this last product stays pending in xmm2
  2071. movaps 0 * SIZE(AA), %xmm0
  2072. subl $-32 * SIZE, AA # aa += 32 * size (eight steps of four values)
  2073. subl $ -8 * SIZE, BB # bb += 8 * size (eight steps of one value)
  2074. subl $1, %eax
  2075. jne .L92
  2076. ALIGN_4
  2077. .L95: # reload the count to handle the steps the unrolled loop did not cover
  2078. #if defined(LT) || defined(RN)
  2079. movl KK, %eax
  2080. #else
  2081. movl K, %eax
  2082. subl KK, %eax
  2083. #endif
  2084. andl $7, %eax # eax = count & 7: leftover steps
  2085. BRANCH
  2086. je .L98
  2087. ALIGN_4
  2088. .L96: # one k step per pass, same deferred add as the unrolled loop
  2089. addps %xmm2, %xmm4
  2090. pshufd $0x00, %xmm1, %xmm2
  2091. movss -31 * SIZE(BB), %xmm1
  2092. mulps %xmm0, %xmm2
  2093. movaps -28 * SIZE(AA), %xmm0
  2094. addl $4 * SIZE, AA
  2095. addl $1 * SIZE, BB
  2096. decl %eax
  2097. jg .L96
  2098. ALIGN_4
  2099. .L98: # finish the sum, apply the triangular solve, store, advance
  2100. #if defined(LN) || defined(RT)
  2101. movl KK, %eax
  2102. #ifdef LN
  2103. subl $4, %eax
  2104. #else
  2105. subl $1, %eax
  2106. #endif
  2107. movl AORIG, AA
  2108. leal (, %eax, SIZE), %eax # eax = (kk - 4 or kk - 1) * size
  2109. leal (AA, %eax, 4), AA # aa = aorig + 4 * eax
  2110. leal (B, %eax, 1), BB # bb = b + eax
  2111. #endif
  2112. addps %xmm2, %xmm4 # flush the product still pending in xmm2
  2113. addps %xmm5, %xmm4 # xmm4 = full four-lane sum
  2114. #if defined(LN) || defined(LT)
  2115. movaps %xmm4, %xmm0 # unpack sequence: puts sum lanes 0..3 into lane 0 of xmm4, xmm6, xmm0, xmm2
  2116. unpcklps %xmm6, %xmm4 # xmm6 and xmm7 are not set in this block; only lane 0 of each result is used below
  2117. unpckhps %xmm6, %xmm0
  2118. movaps %xmm5, %xmm1
  2119. unpcklps %xmm7, %xmm5
  2120. unpckhps %xmm7, %xmm1
  2121. movaps %xmm4, %xmm6
  2122. unpcklps %xmm5, %xmm4
  2123. unpckhps %xmm5, %xmm6
  2124. movaps %xmm0, %xmm2
  2125. unpcklps %xmm1, %xmm0
  2126. unpckhps %xmm1, %xmm2
  2127. movss -32 * SIZE(BB), %xmm1 # load the four stored values as scalars
  2128. movss -31 * SIZE(BB), %xmm3
  2129. movss -30 * SIZE(BB), %xmm5
  2130. movss -29 * SIZE(BB), %xmm7
  2131. subss %xmm4, %xmm1 # x0 = stored value 0 - sum lane 0
  2132. subss %xmm6, %xmm3 # x1 = stored value 1 - sum lane 1
  2133. subss %xmm0, %xmm5 # x2 = stored value 2 - sum lane 2
  2134. subss %xmm2, %xmm7 # x3 = stored value 3 - sum lane 3
  2135. #else
  2136. movaps -32 * SIZE(AA), %xmm0
  2137. subps %xmm4, %xmm0 # xmm0 = four stored values - sum
  2138. #endif
  2139. #ifdef LN
  2140. movaps -20 * SIZE(AA), %xmm4 # r3 = four values at offset -20; solve from x3 back to x0
  2141. pshufd $0xff, %xmm4, %xmm6
  2142. mulss %xmm6, %xmm7 # x3 *= r3[3]
  2143. pshufd $0xaa, %xmm4, %xmm6
  2144. mulss %xmm7, %xmm6
  2145. subss %xmm6, %xmm5 # x2 -= x3 * r3[2]
  2146. pshufd $0x55, %xmm4, %xmm6
  2147. mulss %xmm7, %xmm6
  2148. subss %xmm6, %xmm3 # x1 -= x3 * r3[1]
  2149. pshufd $0x00, %xmm4, %xmm6
  2150. mulss %xmm7, %xmm6
  2151. subss %xmm6, %xmm1 # x0 -= x3 * r3[0]
  2152. movaps -24 * SIZE(AA), %xmm4 # r2 = four values at offset -24
  2153. pshufd $0xaa, %xmm4, %xmm6
  2154. mulss %xmm6, %xmm5 # x2 *= r2[2]
  2155. pshufd $0x55, %xmm4, %xmm6
  2156. mulss %xmm5, %xmm6
  2157. subss %xmm6, %xmm3 # x1 -= x2 * r2[1]
  2158. pshufd $0x00, %xmm4, %xmm6
  2159. mulss %xmm5, %xmm6
  2160. subss %xmm6, %xmm1 # x0 -= x2 * r2[0]
  2161. movaps -28 * SIZE(AA), %xmm4 # r1 = four values at offset -28
  2162. pshufd $0x55, %xmm4, %xmm6
  2163. mulss %xmm6, %xmm3 # x1 *= r1[1]
  2164. pshufd $0x00, %xmm4, %xmm6
  2165. mulss %xmm3, %xmm6
  2166. subss %xmm6, %xmm1 # x0 -= x1 * r1[0]
  2167. movaps -32 * SIZE(AA), %xmm4 # r0 = four values at offset -32
  2168. pshufd $0x00, %xmm4, %xmm6
  2169. mulss %xmm6, %xmm1 # x0 *= r0[0]
  2170. #endif
  2171. #ifdef LT
  2172. movaps -32 * SIZE(AA), %xmm4 # r0 = four values at offset -32; solve from x0 forward to x3
  2173. pshufd $0x00, %xmm4, %xmm6
  2174. mulss %xmm6, %xmm1 # x0 *= r0[0]
  2175. pshufd $0x55, %xmm4, %xmm6
  2176. mulss %xmm1, %xmm6
  2177. subss %xmm6, %xmm3 # x1 -= x0 * r0[1]
  2178. pshufd $0xaa, %xmm4, %xmm6
  2179. mulss %xmm1, %xmm6
  2180. subss %xmm6, %xmm5 # x2 -= x0 * r0[2]
  2181. pshufd $0xff, %xmm4, %xmm6
  2182. mulss %xmm1, %xmm6
  2183. subss %xmm6, %xmm7 # x3 -= x0 * r0[3]
  2184. movaps -28 * SIZE(AA), %xmm4 # r1 = four values at offset -28
  2185. pshufd $0x55, %xmm4, %xmm6
  2186. mulss %xmm6, %xmm3 # x1 *= r1[1]
  2187. pshufd $0xaa, %xmm4, %xmm6
  2188. mulss %xmm3, %xmm6
  2189. subss %xmm6, %xmm5 # x2 -= x1 * r1[2]
  2190. pshufd $0xff, %xmm4, %xmm6
  2191. mulss %xmm3, %xmm6
  2192. subss %xmm6, %xmm7 # x3 -= x1 * r1[3]
  2193. movaps -24 * SIZE(AA), %xmm4 # r2 = four values at offset -24
  2194. pshufd $0xaa, %xmm4, %xmm6
  2195. mulss %xmm6, %xmm5 # x2 *= r2[2]
  2196. pshufd $0xff, %xmm4, %xmm6
  2197. mulss %xmm5, %xmm6
  2198. subss %xmm6, %xmm7 # x3 -= x2 * r2[3]
  2199. movaps -20 * SIZE(AA), %xmm4 # r3 = four values at offset -20
  2200. pshufd $0xff, %xmm4, %xmm6
  2201. mulss %xmm6, %xmm7 # x3 *= r3[3]
  2202. #endif
  2203. #if defined(RN) || defined(RT)
  2204. movss -32 * SIZE(BB), %xmm6 # single scalar from bb; presumably the pre-inverted diagonal, TODO confirm
  2205. pshufd $0x00, %xmm6, %xmm7
  2206. mulps %xmm7, %xmm0 # scale all four results by that scalar
  2207. #endif
  2208. #if defined(LN) || defined(LT)
  2209. movss %xmm1, -32 * SIZE(BB) # write the solved values back over the values read from bb
  2210. movss %xmm3, -31 * SIZE(BB)
  2211. movss %xmm5, -30 * SIZE(BB)
  2212. movss %xmm7, -29 * SIZE(BB)
  2213. #else
  2214. movaps %xmm0, -32 * SIZE(AA) # write the solved values back over the values read from aa
  2215. #endif
  2216. #ifdef LN
  2217. subl $4 * SIZE, CO1 # this variant steps the output pointer backwards before storing
  2218. #endif
  2219. #if defined(LN) || defined(LT)
  2220. unpcklps %xmm5, %xmm1 # low lanes: (x0, x2)
  2221. unpcklps %xmm7, %xmm3 # low lanes: (x1, x3)
  2222. unpcklps %xmm3, %xmm1 # xmm1 = (x0, x1, x2, x3)
  2223. movlps %xmm1, 0 * SIZE(CO1)
  2224. movhps %xmm1, 2 * SIZE(CO1)
  2225. #else
  2226. movlps %xmm0, 0 * SIZE(CO1)
  2227. movhps %xmm0, 2 * SIZE(CO1)
  2228. #endif
  2229. #ifndef LN
  2230. addl $4 * SIZE, CO1
  2231. #endif
  2232. #if defined(LT) || defined(RN)
  2233. movl K, %eax
  2234. subl KK, %eax
  2235. leal (,%eax, SIZE), %eax # eax = (k - kk) * size
  2236. leal (AA, %eax, 4), AA # skip the rest of this tile in aa (four values per k step)
  2237. leal (BB, %eax, 1), BB # skip the rest of this tile in bb (one value per k step)
  2238. #endif
  2239. #ifdef LN
  2240. subl $4, KK
  2241. #endif
  2242. #ifdef LT
  2243. addl $4, KK
  2244. #endif
  2245. #ifdef RT
  2246. movl K, %eax
  2247. sall $2 + BASE_SHIFT, %eax # eax = k << (2 + base_shift)
  2248. addl %eax, AORIG
  2249. #endif
  2250. decl %ebx # i --
  2251. jg .L91
  2252. ALIGN_4
  2253. .L100: # tile of two a values against one b value: only runs when bit 1 of m is set
  2254. testl $2, M
  2255. je .L110
  2256. #ifdef LN
  2257. movl K, %eax
  2258. sall $1 + BASE_SHIFT, %eax # eax = k << (1 + base_shift)
  2259. subl %eax, AORIG
  2260. #endif
  2261. #if defined(LN) || defined(RT)
  2262. movl KK, %eax
  2263. movl AORIG, AA
  2264. leal (, %eax, SIZE), %eax
  2265. leal (AA, %eax, 2), AA # aa = aorig + 2 * kk * size
  2266. #endif
  2267. movl B, BB
  2268. #if defined(LN) || defined(RT)
  2269. movl KK, %eax
  2270. sall $BASE_SHIFT, %eax
  2271. addl %eax, BB # bb = b + (kk << base_shift)
  2272. #endif
  2273. movsd -32 * SIZE(AA), %xmm0 # preload the first two a values
  2274. pxor %xmm3, %xmm3
  2275. movsd -32 * SIZE(BB), %xmm1 # preload the first two b values
  2276. pxor %xmm4, %xmm4 # accumulator 1
  2277. pxor %xmm5, %xmm5 # accumulator 2
  2278. #if defined(LT) || defined(RN)
  2279. movl KK, %eax
  2280. #else
  2281. movl K, %eax
  2282. subl KK, %eax
  2283. #endif
  2284. sarl $3, %eax # trip count of the 8x unrolled loop
  2285. je .L105
  2286. ALIGN_4
  2287. .L102: # eight k steps per pass, products alternate between xmm4 and xmm5
  2288. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  2289. pshufd $0x00, %xmm1, %xmm2 # broadcast the first b value of the pair
  2290. mulps %xmm0, %xmm2
  2291. movsd -30 * SIZE(AA), %xmm0
  2292. addps %xmm2, %xmm4
  2293. pshufd $0x55, %xmm1, %xmm2 # broadcast the second b value of the pair
  2294. movsd -30 * SIZE(BB), %xmm1
  2295. mulps %xmm0, %xmm2
  2296. movsd -28 * SIZE(AA), %xmm0
  2297. addps %xmm2, %xmm5
  2298. pshufd $0x00, %xmm1, %xmm2
  2299. mulps %xmm0, %xmm2
  2300. movsd -26 * SIZE(AA), %xmm0
  2301. addps %xmm2, %xmm4
  2302. pshufd $0x55, %xmm1, %xmm2
  2303. movsd -28 * SIZE(BB), %xmm1
  2304. mulps %xmm0, %xmm2
  2305. movsd -24 * SIZE(AA), %xmm0
  2306. addps %xmm2, %xmm5
  2307. pshufd $0x00, %xmm1, %xmm2
  2308. mulps %xmm0, %xmm2
  2309. movsd -22 * SIZE(AA), %xmm0
  2310. addps %xmm2, %xmm4
  2311. pshufd $0x55, %xmm1, %xmm2
  2312. movsd -26 * SIZE(BB), %xmm1
  2313. mulps %xmm0, %xmm2
  2314. movsd -20 * SIZE(AA), %xmm0
  2315. addps %xmm2, %xmm5
  2316. pshufd $0x00, %xmm1, %xmm2
  2317. mulps %xmm0, %xmm2
  2318. movsd -18 * SIZE(AA), %xmm0
  2319. addps %xmm2, %xmm4
  2320. pshufd $0x55, %xmm1, %xmm2
  2321. movsd -24 * SIZE(BB), %xmm1
  2322. mulps %xmm0, %xmm2
  2323. movsd -16 * SIZE(AA), %xmm0
  2324. addps %xmm2, %xmm5
  2325. subl $-16 * SIZE, AA # aa += 16 * size (eight steps of two values)
  2326. subl $ -8 * SIZE, BB # bb += 8 * size (eight steps of one value)
  2327. subl $1, %eax
  2328. jne .L102
  2329. ALIGN_4
  2330. .L105: # reload the count to handle the steps the unrolled loop did not cover
  2331. #if defined(LT) || defined(RN)
  2332. movl KK, %eax
  2333. #else
  2334. movl K, %eax
  2335. subl KK, %eax
  2336. #endif
  2337. andl $7, %eax # eax = count & 7: leftover steps
  2338. BRANCH
  2339. je .L108
  2340. ALIGN_4
  2341. .L106: # one k step per pass: xmm4 += (two a values) * (one b value)
  2342. pshufd $0x00, %xmm1, %xmm2
  2343. movss -31 * SIZE(BB), %xmm1
  2344. mulps %xmm0, %xmm2
  2345. movsd -30 * SIZE(AA), %xmm0
  2346. addps %xmm2, %xmm4
  2347. addl $2 * SIZE, AA
  2348. addl $1 * SIZE, BB
  2349. decl %eax
  2350. jg .L106
  2351. ALIGN_4
  2352. .L108: # combine the accumulators, apply the triangular solve, store, advance
  2353. #if defined(LN) || defined(RT)
  2354. movl KK, %eax
  2355. #ifdef LN
  2356. subl $2, %eax
  2357. #else
  2358. subl $1, %eax
  2359. #endif
  2360. movl AORIG, AA
  2361. leal (, %eax, SIZE), %eax # eax = (kk - 2 or kk - 1) * size
  2362. leal (AA, %eax, 2), AA # aa = aorig + 2 * eax
  2363. leal (B, %eax, 1), BB # bb = b + eax
  2364. #endif
  2365. addps %xmm5, %xmm4 # merge the two partial sums
  2366. #if defined(LN) || defined(LT)
  2367. pshufd $1, %xmm4, %xmm6 # lane 0 of xmm6 = lane 1 of the sum
  2368. movss -32 * SIZE(BB), %xmm1
  2369. movss -31 * SIZE(BB), %xmm3
  2370. subss %xmm4, %xmm1 # x0 = stored value 0 - sum lane 0
  2371. subss %xmm6, %xmm3 # x1 = stored value 1 - sum lane 1
  2372. #else
  2373. movsd -32 * SIZE(AA), %xmm0
  2374. subps %xmm4, %xmm0 # xmm0 = two stored values - sum
  2375. #endif
  2376. #ifdef LN
  2377. movsd -32 * SIZE(AA), %xmm4 # t = four values at aa, loaded as two halves
  2378. movhps -30 * SIZE(AA), %xmm4
  2379. pshufd $0xff, %xmm4, %xmm6
  2380. mulss %xmm6, %xmm3 # x1 *= t[3]
  2381. pshufd $0xaa, %xmm4, %xmm6
  2382. mulss %xmm3, %xmm6
  2383. subss %xmm6, %xmm1 # x0 -= x1 * t[2]
  2384. pshufd $0x00, %xmm4, %xmm6
  2385. mulss %xmm6, %xmm1 # x0 *= t[0]
  2386. #endif
  2387. #ifdef LT
  2388. movaps -32 * SIZE(AA), %xmm4 # t = four values at aa
  2389. pshufd $0x00, %xmm4, %xmm6
  2390. mulss %xmm6, %xmm1 # x0 *= t[0]
  2391. pshufd $0x55, %xmm4, %xmm6
  2392. mulss %xmm1, %xmm6
  2393. subss %xmm6, %xmm3 # x1 -= x0 * t[1]
  2394. pshufd $0xff, %xmm4, %xmm6
  2395. mulss %xmm6, %xmm3 # x1 *= t[3]
  2396. #endif
  2397. #if defined(RN) || defined(RT)
  2398. movss -32 * SIZE(BB), %xmm6 # single scalar from bb; presumably the pre-inverted diagonal, TODO confirm
  2399. pshufd $0x00, %xmm6, %xmm7
  2400. mulps %xmm7, %xmm0 # scale the results by that scalar
  2401. #endif
  2402. #if defined(LN) || defined(LT)
  2403. movss %xmm1, -32 * SIZE(BB) # write the solved values back over the values read from bb
  2404. movss %xmm3, -31 * SIZE(BB)
  2405. #else
  2406. movlps %xmm0, -32 * SIZE(AA) # write the solved values back over the values read from aa
  2407. #endif
  2408. #ifdef LN
  2409. subl $2 * SIZE, CO1 # this variant steps the output pointer backwards before storing
  2410. #endif
  2411. #if defined(LN) || defined(LT)
  2412. movss %xmm1, 0 * SIZE(CO1)
  2413. movss %xmm3, 1 * SIZE(CO1)
  2414. #else
  2415. movlps %xmm0, 0 * SIZE(CO1)
  2416. #endif
  2417. #ifndef LN
  2418. addl $2 * SIZE, CO1
  2419. #endif
  2420. #if defined(LT) || defined(RN)
  2421. movl K, %eax
  2422. subl KK, %eax
  2423. leal (,%eax, SIZE), %eax # eax = (k - kk) * size
  2424. leal (AA, %eax, 2), AA # skip the rest of this tile in aa (two values per k step)
  2425. leal (BB, %eax, 1), BB # skip the rest of this tile in bb (one value per k step)
  2426. #endif
  2427. #ifdef LN
  2428. subl $2, KK
  2429. #endif
  2430. #ifdef LT
  2431. addl $2, KK
  2432. #endif
  2433. #ifdef RT
  2434. movl K, %eax
  2435. sall $1 + BASE_SHIFT, %eax # eax = k << (1 + base_shift)
  2436. addl %eax, AORIG
  2437. #endif
  2438. ALIGN_4
  2439. .L110: # tile of one a value against one b value: only runs when bit 0 of m is set
  2440. testl $1, M
  2441. je .L119
  2442. #ifdef LN
  2443. movl K, %eax
  2444. sall $BASE_SHIFT, %eax # eax = k << base_shift
  2445. subl %eax, AORIG
  2446. #endif
  2447. #if defined(LN) || defined(RT)
  2448. movl KK, %eax
  2449. movl AORIG, AA
  2450. leal (AA, %eax, SIZE), AA # aa = aorig + kk * size
  2451. #endif
  2452. movl B, BB
  2453. #if defined(LN) || defined(RT)
  2454. movl KK, %eax
  2455. sall $BASE_SHIFT, %eax
  2456. addl %eax, BB # bb = b + (kk << base_shift)
  2457. #endif
  2458. pxor %xmm4, %xmm4 # accumulator
  2459. movsd -32 * SIZE(AA), %xmm0 # preload two consecutive a values
  2460. pxor %xmm5, %xmm5
  2461. movsd -32 * SIZE(BB), %xmm1 # preload two consecutive b values
  2462. #if defined(LT) || defined(RN)
  2463. movl KK, %eax
  2464. #else
  2465. movl K, %eax
  2466. subl KK, %eax
  2467. #endif
  2468. sarl $3, %eax # trip count of the 8x unrolled loop
  2469. je .L115
  2470. ALIGN_4
  2471. .L112: # eight k steps per pass: each mulps covers two steps, one per low lane
  2472. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  2473. mulps %xmm0, %xmm1
  2474. movsd -30 * SIZE(AA), %xmm0
  2475. addps %xmm1, %xmm4
  2476. movsd -30 * SIZE(BB), %xmm1
  2477. mulps %xmm0, %xmm1
  2478. movsd -28 * SIZE(AA), %xmm0
  2479. addps %xmm1, %xmm4
  2480. movsd -28 * SIZE(BB), %xmm1
  2481. mulps %xmm0, %xmm1
  2482. movsd -26 * SIZE(AA), %xmm0
  2483. addps %xmm1, %xmm4
  2484. movsd -26 * SIZE(BB), %xmm1
  2485. mulps %xmm0, %xmm1
  2486. movsd -24 * SIZE(AA), %xmm0
  2487. addps %xmm1, %xmm4
  2488. movsd -24 * SIZE(BB), %xmm1
  2489. subl $-8 * SIZE, AA # aa += 8 * size
  2490. subl $-8 * SIZE, BB # bb += 8 * size
  2491. subl $1, %eax
  2492. jne .L112
  2493. ALIGN_4
  2494. .L115: # reload the count to handle the steps the unrolled loop did not cover
  2495. #if defined(LT) || defined(RN)
  2496. movl KK, %eax
  2497. #else
  2498. movl K, %eax
  2499. subl KK, %eax
  2500. #endif
  2501. andl $7, %eax # eax = count & 7: leftover steps
  2502. BRANCH
  2503. je .L118
  2504. ALIGN_4
  2505. .L116: # one k step per pass, scalar multiply-add into lane 0 of xmm4
  2506. mulss %xmm0, %xmm1
  2507. movss -31 * SIZE(AA), %xmm0
  2508. addss %xmm1, %xmm4
  2509. movss -31 * SIZE(BB), %xmm1
  2510. addl $1 * SIZE, AA
  2511. addl $1 * SIZE, BB
  2512. decl %eax
  2513. jg .L116
  2514. ALIGN_4
  2515. .L118: # reduce the sum, apply the scalar solve, store, advance
  2516. #if defined(LN) || defined(RT)
  2517. movl KK, %eax
  2518. subl $1, %eax
  2519. movl AORIG, AA
  2520. leal (AA, %eax, SIZE), AA # aa = aorig + (kk - 1) * size
  2521. leal (B, %eax, SIZE), BB # bb = b + (kk - 1) * size
  2522. #endif
  2523. haddps %xmm4, %xmm4 # lane 0 = lane 0 + lane 1: joins the two per-lane partial sums
  2524. #if defined(LN) || defined(LT)
  2525. movss -32 * SIZE(BB), %xmm1
  2526. subss %xmm4, %xmm1 # x = stored value - sum
  2527. #else
  2528. movss -32 * SIZE(AA), %xmm0
  2529. subss %xmm4, %xmm0 # x = stored value - sum
  2530. #endif
  2531. #if defined(LN) || defined(LT)
  2532. mulss -32 * SIZE(AA), %xmm1 # scale by the scalar at aa; presumably the pre-inverted diagonal, TODO confirm
  2533. #endif
  2534. #if defined(RN) || defined(RT)
  2535. mulss -32 * SIZE(BB), %xmm0 # scale by the scalar at bb; presumably the pre-inverted diagonal, TODO confirm
  2536. #endif
  2537. #if defined(LN) || defined(LT)
  2538. movss %xmm1, -32 * SIZE(BB) # write the solved value back over the value read from bb
  2539. #else
  2540. movss %xmm0, -32 * SIZE(AA) # write the solved value back over the value read from aa
  2541. #endif
  2542. #ifdef LN
  2543. subl $1 * SIZE, CO1 # this variant steps the output pointer backwards before storing
  2544. #endif
  2545. #if defined(LN) || defined(LT)
  2546. movss %xmm1, 0 * SIZE(CO1)
  2547. #else
  2548. movss %xmm0, 0 * SIZE(CO1)
  2549. #endif
  2550. #ifndef LN
  2551. addl $1 * SIZE, CO1
  2552. #endif
  2553. #if defined(LT) || defined(RN)
  2554. movl K, %eax
  2555. subl KK, %eax
  2556. leal (AA, %eax, SIZE), AA # skip the rest of this tile in aa
  2557. leal (BB, %eax, SIZE), BB # skip the rest of this tile in bb
  2558. #endif
  2559. #ifdef LN
  2560. subl $1, KK
  2561. #endif
  2562. #ifdef LT
  2563. addl $1, KK
  2564. #endif
  2565. #ifdef RT
  2566. movl K, %eax
  2567. sall $BASE_SHIFT, %eax # eax = k << base_shift
  2568. addl %eax, AORIG
  2569. #endif
  2570. ALIGN_4
  2571. .L119: # end of the single-column pass: move b and kk past the column, then fall into the exit
  2572. #ifdef LN
  2573. movl K, %eax
  2574. leal (B, %eax, SIZE), B # b += k * size
  2575. #endif
  2576. #if defined(LT) || defined(RN)
  2577. movl BB, B # bb already points past the data consumed above
  2578. #endif
  2579. #ifdef RN
  2580. addl $1, KK
  2581. #endif
  2582. #ifdef RT
  2583. subl $1, KK
  2584. #endif
  2585. ALIGN_4
  2586. .L999: # common exit: also reached directly when bit 0 of n is clear
  2587. popl %ebx # restore saved registers; presumably mirrors the pushes in the prologue, which is not visible here
  2588. popl %esi
  2589. popl %edi
  2590. popl %ebp
  2591. addl $ARGS, %esp # release the local stack area of size args
  2592. ret
  2593. EPILOGUE