You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT_4x4_penryn.S 61 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including common.h; presumably it selects */
  /* the assembler-side macros used below (SIZE, BASE_SHIFT, PROLOGUE, ...). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Registers in which the first three integer arguments are expected. */
  /* AO, BO and KK further down alias these same three registers, which is */
  /* why the prologue copies them into M, N and K before anything else. */
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define OLD_K %rdx
  /* Working copies of the three sizes. */
  43. #define M %r13
  44. #define N %r14
  45. #define K %r15
  /* Matrix base pointers and the stride of C (scaled by SIZE in the prologue). */
  46. #define A %rcx
  47. #define B %r8
  48. #define C %r9
  49. #define LDC %r10
  /* I counts row blocks; AO/BO are the running pointers into A and B. */
  50. #define I %r11
  51. #define AO %rdi
  52. #define BO %rsi
  /* CO1/CO2 are output pointers into C (CO2 = CO1 + LDC where it is used). */
  53. #define CO1 %rbx
  54. #define CO2 %rbp
  /* KK is the running offset counter derived from OFFSET. */
  55. #define KK %rdx
  /* BB is a secondary pointer derived from B at .L40. */
  56. #define BB %r12
  57. #ifndef WINDOWS_ABI
  /* Non-Windows frame: 128 bytes. Slots 0..40 hold the registers saved by */
  /* the prologue; 48..72 are local spill slots. OLD_LDC and OLD_OFFSET are */
  /* read from above the frame. */
  58. #define STACKSIZE 128
  59. #define OLD_LDC 8 + STACKSIZE(%rsp)
  60. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  61. #define OFFSET 48(%rsp)
  62. #define J 56(%rsp)
  63. #define KKK 64(%rsp)
  64. #define AORIG 72(%rsp)
  65. #else
  /* Windows frame: 256 bytes. Slots 0..223 hold the saved integer registers, */
  /* %rdi, %rsi and %xmm6-%xmm15 (see the prologue); 224..248 are local spill */
  /* slots. A, B, C, LDC and OFFSET are all read from above the frame. */
  66. #define STACKSIZE 256
  67. #define OLD_A 40 + STACKSIZE(%rsp)
  68. #define OLD_B 48 + STACKSIZE(%rsp)
  69. #define OLD_C 56 + STACKSIZE(%rsp)
  70. #define OLD_LDC 64 + STACKSIZE(%rsp)
  71. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  72. #define OFFSET 224(%rsp)
  73. #define J 232(%rsp)
  74. #define KKK 240(%rsp)
  75. #define AORIG 248(%rsp)
  76. #endif
  /* Prefetch tuning. PREFETCHSIZE is an element offset applied to AO in the */
  /* unrolled loops; PREFETCH_R is not referenced in the visible part of the file. */
  77. #define PREFETCH_R (8 * 4 + 0)
  78. #define PREFETCHSIZE (8 * 21 + 6)
  79. #define PREFETCH prefetcht0
  /* Function entry. PROLOGUE and PROFCODE are macros defined outside this */
  /* file (presumably in common.h). */
  80. PROLOGUE
  81. PROFCODE
  /* Reserve the frame and save %rbx, %rbp and %r12-%r15, which this kernel */
  /* uses as CO1, CO2, BB, M, N and K. */
  82. subq $STACKSIZE, %rsp
  83. movq %rbx, 0(%rsp)
  84. movq %rbp, 8(%rsp)
  85. movq %r12, 16(%rsp)
  86. movq %r13, 24(%rsp)
  87. movq %r14, 32(%rsp)
  88. movq %r15, 40(%rsp)
  89. #ifdef WINDOWS_ABI
  /* Windows path: additionally save %rdi, %rsi and %xmm6-%xmm15, then move */
  /* the arguments into the registers the rest of the code expects. */
  90. movq %rdi, 48(%rsp)
  91. movq %rsi, 56(%rsp)
  92. movups %xmm6, 64(%rsp)
  93. movups %xmm7, 80(%rsp)
  94. movups %xmm8, 96(%rsp)
  95. movups %xmm9, 112(%rsp)
  96. movups %xmm10, 128(%rsp)
  97. movups %xmm11, 144(%rsp)
  98. movups %xmm12, 160(%rsp)
  99. movups %xmm13, 176(%rsp)
  100. movups %xmm14, 192(%rsp)
  101. movups %xmm15, 208(%rsp)
  102. movq ARG1, OLD_M
  103. movq ARG2, OLD_N
  104. movq ARG3, OLD_K
  105. movq OLD_A, A
  106. movq OLD_B, B
  107. movq OLD_C, C
  108. #endif
  /* Copy the sizes out of %rdi/%rsi/%rdx before those registers are reused */
  /* as AO/BO/KK, then fetch LDC and the offset from the stack. */
  109. movq OLD_M, M
  110. movq OLD_N, N
  111. movq OLD_K, K
  112. movq OLD_LDC, LDC
  113. movq OLD_OFFSET, KK
  /* Bias A and B forward by 16 elements; later loads compensate with */
  /* displacements that start at -16 * SIZE. */
  114. subq $-16 * SIZE, A
  115. subq $-16 * SIZE, B
  /* LDC = LDC * SIZE. */
  116. leaq (, LDC, SIZE), LDC
  /* Keep the incoming offset in the OFFSET slot and leave its negation in KK. */
  117. movq KK, OFFSET
  118. negq KK
  119. #ifdef LN
  /* LN: C += M * SIZE and A += M * K * SIZE. */
  120. leaq (, M, SIZE), %rax
  121. addq %rax, C
  122. imulq K, %rax
  123. addq %rax, A
  124. #endif
  125. #ifdef RT
  /* RT: B += N * K * SIZE and C += N * LDC (LDC is already scaled). */
  126. leaq (, N, SIZE), %rax
  127. imulq K, %rax
  128. addq %rax, B
  129. movq N, %rax
  130. imulq LDC, %rax
  131. addq %rax, C
  132. #endif
  133. #ifdef RT
  /* RT: KK = N - OFFSET. */
  134. movq N, %rax
  135. subq OFFSET, %rax
  136. movq %rax, KK
  137. #endif
  /* Section executed only when bit 0 of N is set. testq clears OF and the */
  /* masked result has SF clear, so jle is taken exactly when (N & 1) == 0. */
  /* BRANCH, NOBRANCH and ALIGN_4 are macros defined outside this file. */
  138. testq $1, N
  139. BRANCH
  140. jle .L40
  141. #if defined(LT) || defined(RN)
  142. movq A, AO
  143. #else
  /* LN/RT recompute AO from AORIG for every tile, so only the base is kept. */
  144. movq A, AORIG
  145. #endif
  146. #ifdef RT
  /* RT: step back first: B -= K << BASE_SHIFT, C -= LDC. */
  147. movq K, %rax
  148. salq $BASE_SHIFT, %rax
  149. subq %rax, B
  150. subq LDC, C
  151. #endif
  /* CO1 is the output pointer for this strip; the other variants advance C */
  /* by one LDC stride afterwards. */
  152. movq C, CO1
  153. #ifndef RT
  154. addq LDC, C
  155. #endif
  156. #ifdef LN
  /* LN: KK = OFFSET + M. */
  157. movq OFFSET, %rax
  158. addq M, %rax
  159. movq %rax, KK
  160. #endif
  161. #ifdef LT
  /* LT: KK = OFFSET. */
  162. movq OFFSET, %rax
  163. movq %rax, KK
  164. #endif
  /* I = number of 4-row tiles; go to the 2-row case when there are none. */
  165. movq M, I
  166. sarq $2, I # i = (m >> 2)
  167. NOBRANCH
  168. jle .L100
  169. ALIGN_4
  /* Start of one 4-row tile in the N & 1 strip: position AO/BO, preload the */
  /* first operands, clear the accumulators and compute the loop trip count. */
  170. .L91:
  171. #ifdef LN
  /* LN walks A backwards: AORIG -= K << (2 + BASE_SHIFT). */
  172. movq K, %rax
  173. salq $2 + BASE_SHIFT, %rax
  174. subq %rax, AORIG
  175. #endif
  176. #if defined(LN) || defined(RT)
  /* AO = AORIG + 4 * KK * SIZE, BO = B + KK * SIZE. */
  177. movq KK, %rax
  178. leaq (, %rax, SIZE), %rax
  179. movq AORIG, AO
  180. leaq (AO, %rax, 4), AO
  181. leaq (B, %rax, 1), BO
  182. #else
  183. movq B, BO
  184. #endif
  /* Preload four A values (two registers) and one B value for the first step. */
  185. movaps -16 * SIZE(AO), %xmm0
  186. movaps -14 * SIZE(AO), %xmm1
  187. movsd -16 * SIZE(BO), %xmm2
  188. #ifdef LN
  189. prefetcht0 -4 * SIZE(CO1)
  190. #else
  191. prefetcht0 3 * SIZE(CO1)
  192. #endif
  /* Clear the accumulators. Only xmm8 and xmm12 are accumulated into by the */
  /* loops at .L92/.L96; xmm9 and xmm13 are reused as temporaries in the solve. */
  193. pxor %xmm8, %xmm8
  194. pxor %xmm9, %xmm9
  195. pxor %xmm12, %xmm12
  196. pxor %xmm13, %xmm13
  /* rax = number of steps to accumulate: KK for LT/RN, K - KK otherwise. */
  197. #if defined(LT) || defined(RN)
  198. movq KK, %rax
  199. #else
  200. movq K, %rax
  201. subq KK, %rax
  202. #endif
  /* The main loop is unrolled four times; skip it when fewer than 4 steps remain. */
  203. sarq $2, %rax
  204. NOBRANCH
  205. jle .L95
  206. ALIGN_4
  /* 4-row by 1 accumulation loop, unrolled four times. Each step copies the */
  /* low double of xmm2 into both halves of xmm3 and xmm4 (pshufd $0x44), */
  /* multiplies by the four A values in xmm0/xmm1 and accumulates into */
  /* xmm8 (rows 0-1) and xmm12 (rows 2-3). The operands for the next step */
  /* are loaded in between the arithmetic instructions. */
  207. .L92:
  208. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  /* Step 1. */
  209. pshufd $0x44, %xmm2, %xmm3
  210. pshufd $0x44, %xmm2, %xmm4
  211. movsd -15 * SIZE(BO), %xmm2
  212. mulpd %xmm0, %xmm3
  213. movaps -12 * SIZE(AO), %xmm0
  214. mulpd %xmm1, %xmm4
  215. movaps -10 * SIZE(AO), %xmm1
  216. addpd %xmm3, %xmm8
  217. addpd %xmm4, %xmm12
  /* Step 2. */
  218. pshufd $0x44, %xmm2, %xmm3
  219. pshufd $0x44, %xmm2, %xmm4
  220. movsd -14 * SIZE(BO), %xmm2
  221. mulpd %xmm0, %xmm3
  222. movaps -8 * SIZE(AO), %xmm0
  223. mulpd %xmm1, %xmm4
  224. movaps -6 * SIZE(AO), %xmm1
  225. addpd %xmm3, %xmm8
  226. addpd %xmm4, %xmm12
  227. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  /* Step 3. */
  228. pshufd $0x44, %xmm2, %xmm3
  229. pshufd $0x44, %xmm2, %xmm4
  230. movsd -13 * SIZE(BO), %xmm2
  231. mulpd %xmm0, %xmm3
  232. movaps -4 * SIZE(AO), %xmm0
  233. mulpd %xmm1, %xmm4
  234. movaps -2 * SIZE(AO), %xmm1
  235. addpd %xmm3, %xmm8
  236. addpd %xmm4, %xmm12
  /* Step 4. The loads at 0, 2 (AO) and -12 (BO) become the -16, -14 and -16 */
  /* operands of the next iteration once the pointers are advanced below. */
  237. pshufd $0x44, %xmm2, %xmm3
  238. pshufd $0x44, %xmm2, %xmm4
  239. movsd -12 * SIZE(BO), %xmm2
  240. mulpd %xmm0, %xmm3
  241. movaps 0 * SIZE(AO), %xmm0
  242. mulpd %xmm1, %xmm4
  243. movaps 2 * SIZE(AO), %xmm1
  244. addpd %xmm3, %xmm8
  245. addpd %xmm4, %xmm12
  /* Advance AO by 16 elements and BO by 4 elements (subtracting a negative). */
  246. subq $-16 * SIZE, AO
  247. subq $ -4 * SIZE, BO
  248. subq $1, %rax
  249. BRANCH
  250. jg .L92
  251. ALIGN_4
  /* Leftover steps of the 4-row accumulation: recompute the step count and */
  /* run the single-step loop at .L96 for the remaining (count & 3) steps. */
  252. .L95:
  253. #if defined(LT) || defined(RN)
  254. movq KK, %rax
  255. #else
  256. movq K, %rax
  257. subq KK, %rax
  258. #endif
  259. andq $3, %rax # rax = k & 3 (leftover steps after the 4x unrolled loop)
  260. BRANCH
  261. je .L98
  262. ALIGN_4
  /* One step per iteration: same multiply-accumulate as one step of .L92, */
  /* then advance AO by 4 elements and BO by 1 element. */
  263. .L96:
  264. pshufd $0x44, %xmm2, %xmm3
  265. pshufd $0x44, %xmm2, %xmm4
  266. movsd -15 * SIZE(BO), %xmm2
  267. mulpd %xmm0, %xmm3
  268. movaps -12 * SIZE(AO), %xmm0
  269. mulpd %xmm1, %xmm4
  270. movaps -10 * SIZE(AO), %xmm1
  271. addpd %xmm3, %xmm8
  272. addpd %xmm4, %xmm12
  273. addq $4 * SIZE, AO
  274. addq $1 * SIZE, BO
  275. subq $1, %rax
  276. BRANCH
  277. jg .L96
  278. ALIGN_4
  /* Solve and store for one 4-row tile: subtract the accumulated products */
  /* from the right-hand side, apply the variant-specific solve, write the */
  /* result to C and back into the packed buffer, then update the pointers */
  /* and counters for the next tile. */
  279. .L98:
  280. #if defined(LN) || defined(RT)
  /* Reposition AO/BO on the block used by the solve: */
  /* rax = KK - 4 (LN) or KK - 1 (RT); AO = AORIG + 4 * rax * SIZE; */
  /* BO = B + rax * SIZE. */
  281. movq KK, %rax
  282. #ifdef LN
  283. subq $4, %rax
  284. #else
  285. subq $1, %rax
  286. #endif
  287. leaq (, %rax, SIZE), %rax
  288. movq AORIG, AO
  289. leaq (AO, %rax, 4), AO
  290. leaq (B, %rax, 1), BO
  291. #endif
  /* xmm10/xmm11 = right-hand side minus accumulated products. The right-hand */
  /* side is read from BO for LN/LT and from AO for RN/RT. */
  292. #if defined(LN) || defined(LT)
  293. movapd -16 * SIZE(BO), %xmm10
  294. movapd -14 * SIZE(BO), %xmm11
  295. subpd %xmm8, %xmm10
  296. subpd %xmm12, %xmm11
  297. #else
  298. movapd -16 * SIZE(AO), %xmm10
  299. movapd -14 * SIZE(AO), %xmm11
  300. subpd %xmm8, %xmm10
  301. subpd %xmm12, %xmm11
  302. #endif
  303. #ifdef LN
  /* LN: split into four scalars x0..x3 (low xmm10, xmm8, low xmm11, xmm9) */
  /* and substitute from x3 down to x0 using the 4x4 block at AO. */
  /* NOTE(review): the diagonal entries (-1, -6, -11, -16) are applied with */
  /* mulsd, so they are presumably stored pre-inverted - confirm against the */
  /* packing routine. */
  304. movapd %xmm10, %xmm8
  305. unpckhpd %xmm8, %xmm8
  306. movapd %xmm11, %xmm9
  307. unpckhpd %xmm9, %xmm9
  /* x3, then eliminate it from x2, x1 and x0. */
  308. movsd -1 * SIZE(AO), %xmm12
  309. mulsd %xmm12, %xmm9
  310. movsd -2 * SIZE(AO), %xmm13
  311. mulsd %xmm9, %xmm13
  312. subsd %xmm13, %xmm11
  313. movsd -3 * SIZE(AO), %xmm14
  314. mulsd %xmm9, %xmm14
  315. subsd %xmm14, %xmm8
  316. movsd -4 * SIZE(AO), %xmm15
  317. mulsd %xmm9, %xmm15
  318. subsd %xmm15, %xmm10
  /* x2, then eliminate it from x1 and x0. */
  319. movsd -6 * SIZE(AO), %xmm12
  320. mulsd %xmm12, %xmm11
  321. movsd -7 * SIZE(AO), %xmm13
  322. mulsd %xmm11, %xmm13
  323. subsd %xmm13, %xmm8
  324. movsd -8 * SIZE(AO), %xmm14
  325. mulsd %xmm11, %xmm14
  326. subsd %xmm14, %xmm10
  /* x1, then eliminate it from x0. */
  327. movsd -11 * SIZE(AO), %xmm12
  328. mulsd %xmm12, %xmm8
  329. movsd -12 * SIZE(AO), %xmm13
  330. mulsd %xmm8, %xmm13
  331. subsd %xmm13, %xmm10
  /* x0. */
  332. movsd -16 * SIZE(AO), %xmm12
  333. mulsd %xmm12, %xmm10
  /* Repack: xmm10 = {x0, x1}, xmm11 = {x2, x3}. */
  334. unpcklpd %xmm8, %xmm10
  335. unpcklpd %xmm9, %xmm11
  336. #endif
  337. #ifdef LT
  /* LT: same scalar split, substituting from x0 up to x3. */
  338. movapd %xmm10, %xmm8
  339. unpckhpd %xmm8, %xmm8
  340. movapd %xmm11, %xmm9
  341. unpckhpd %xmm9, %xmm9
  /* x0, then eliminate it from x1, x2 and x3. */
  342. movsd -16 * SIZE(AO), %xmm12
  343. mulsd %xmm12, %xmm10
  344. movsd -15 * SIZE(AO), %xmm13
  345. mulsd %xmm10, %xmm13
  346. subsd %xmm13, %xmm8
  347. movsd -14 * SIZE(AO), %xmm14
  348. mulsd %xmm10, %xmm14
  349. subsd %xmm14, %xmm11
  350. movsd -13 * SIZE(AO), %xmm15
  351. mulsd %xmm10, %xmm15
  352. subsd %xmm15, %xmm9
  /* x1, then eliminate it from x2 and x3. */
  353. movsd -11 * SIZE(AO), %xmm12
  354. mulsd %xmm12, %xmm8
  355. movsd -10 * SIZE(AO), %xmm13
  356. mulsd %xmm8, %xmm13
  357. subsd %xmm13, %xmm11
  358. movsd -9 * SIZE(AO), %xmm14
  359. mulsd %xmm8, %xmm14
  360. subsd %xmm14, %xmm9
  /* x2, then eliminate it from x3. */
  361. movsd -6 * SIZE(AO), %xmm12
  362. mulsd %xmm12, %xmm11
  363. movsd -5 * SIZE(AO), %xmm13
  364. mulsd %xmm11, %xmm13
  365. subsd %xmm13, %xmm9
  /* x3. */
  366. movsd -1 * SIZE(AO), %xmm12
  367. mulsd %xmm12, %xmm9
  /* Repack: xmm10 = {x0, x1}, xmm11 = {x2, x3}. */
  368. unpcklpd %xmm8, %xmm10
  369. unpcklpd %xmm9, %xmm11
  370. #endif
  /* RN and RT: scale all four values by the single B element at BO. */
  371. #ifdef RN
  372. movddup -16 * SIZE(BO), %xmm8
  373. mulpd %xmm8, %xmm10
  374. mulpd %xmm8, %xmm11
  375. #endif
  376. #ifdef RT
  377. movddup -16 * SIZE(BO), %xmm8
  378. mulpd %xmm8, %xmm10
  379. mulpd %xmm8, %xmm11
  380. #endif
  381. #ifdef LN
  /* LN stores below the current CO1, so move it down before the store. */
  382. subq $4 * SIZE, CO1
  383. #endif
  /* Store the four results to C (both preprocessor branches are identical). */
  384. #if defined(LN) || defined(LT)
  385. movsd %xmm10, 0 * SIZE(CO1)
  386. movhpd %xmm10, 1 * SIZE(CO1)
  387. movsd %xmm11, 2 * SIZE(CO1)
  388. movhpd %xmm11, 3 * SIZE(CO1)
  389. #else
  390. movsd %xmm10, 0 * SIZE(CO1)
  391. movhpd %xmm10, 1 * SIZE(CO1)
  392. movsd %xmm11, 2 * SIZE(CO1)
  393. movhpd %xmm11, 3 * SIZE(CO1)
  394. #endif
  /* Write the results back to the buffer the right-hand side was read from. */
  395. #if defined(LN) || defined(LT)
  396. movapd %xmm10, -16 * SIZE(BO)
  397. movapd %xmm11, -14 * SIZE(BO)
  398. #else
  399. movapd %xmm10, -16 * SIZE(AO)
  400. movapd %xmm11, -14 * SIZE(AO)
  401. #endif
  402. #ifndef LN
  403. addq $4 * SIZE, CO1
  404. #endif
  405. #if defined(LT) || defined(RN)
  /* Skip the K - KK steps that were not consumed: AO += 4 * (K - KK) * SIZE, */
  /* BO += (K - KK) * SIZE. */
  406. movq K, %rax
  407. subq KK, %rax
  408. leaq (,%rax, SIZE), %rax
  409. leaq (AO, %rax, 4), AO
  410. leaq (BO, %rax, 1), BO
  411. #endif
  /* KK moves by the tile height: down for LN, up for LT. */
  412. #ifdef LN
  413. subq $4, KK
  414. #endif
  415. #ifdef LT
  416. addq $4, KK
  417. #endif
  418. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT), i.e. past this 4-row panel. */
  419. movq K, %rax
  420. salq $2 + BASE_SHIFT, %rax
  421. addq %rax, AORIG
  422. #endif
  /* Next 4-row tile, if any remain. */
  423. decq I
  424. BRANCH
  425. jg .L91
  426. ALIGN_4
  /* 2-row tile of the N & 1 strip, executed only when bit 1 of M is set */
  /* (jle after testq is taken exactly when (M & 2) == 0). */
  427. .L100:
  428. testq $2, M
  429. BRANCH
  430. jle .L110
  431. ALIGN_4
  432. #ifdef LN
  /* LN: AORIG -= K << (1 + BASE_SHIFT). */
  433. movq K, %rax
  434. salq $1 + BASE_SHIFT, %rax
  435. subq %rax, AORIG
  436. #endif
  437. #if defined(LN) || defined(RT)
  /* AO = AORIG + 2 * KK * SIZE, BO = B + KK * SIZE. */
  438. movq KK, %rax
  439. leaq (, %rax, SIZE), %rax
  440. movq AORIG, AO
  441. leaq (AO, %rax, 2), AO
  442. leaq (B, %rax, 1), BO
  443. #else
  444. movq B, BO
  445. #endif
  /* Preload two A values and the first B values; clear both accumulators. */
  446. movaps -16 * SIZE(AO), %xmm0
  447. pxor %xmm8, %xmm8
  448. movsd -16 * SIZE(BO), %xmm2
  449. pxor %xmm9, %xmm9
  450. movhps -15 * SIZE(BO), %xmm2
  /* rax = number of steps to accumulate: KK for LT/RN, K - KK otherwise. */
  451. #if defined(LT) || defined(RN)
  452. movq KK, %rax
  453. #else
  454. movq K, %rax
  455. subq KK, %rax
  456. #endif
  /* Skip the 4x unrolled loop when fewer than 4 steps remain. */
  457. sarq $2, %rax
  458. NOBRANCH
  459. jle .L105
  460. ALIGN_4
  /* 2-row by 1 accumulation loop, unrolled four times. Each step copies */
  /* the low double of xmm2 into both halves of xmm3, multiplies by two A */
  /* values and accumulates alternately into xmm8 and xmm9 (summed at .L108). */
  461. .L102:
  462. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  463. pshufd $0x44, %xmm2, %xmm3
  464. movsd -15 * SIZE(BO), %xmm2
  465. mulpd %xmm0, %xmm3
  466. movaps -14 * SIZE(AO), %xmm0
  467. addpd %xmm3, %xmm8
  468. pshufd $0x44, %xmm2, %xmm3
  469. movsd -14 * SIZE(BO), %xmm2
  470. mulpd %xmm0, %xmm3
  471. movaps -12 * SIZE(AO), %xmm0
  472. addpd %xmm3, %xmm9
  473. pshufd $0x44, %xmm2, %xmm3
  474. movsd -13 * SIZE(BO), %xmm2
  475. mulpd %xmm0, %xmm3
  476. movaps -10 * SIZE(AO), %xmm0
  477. addpd %xmm3, %xmm8
  478. pshufd $0x44, %xmm2, %xmm3
  479. movsd -12 * SIZE(BO), %xmm2
  480. mulpd %xmm0, %xmm3
  481. movaps -8 * SIZE(AO), %xmm0
  482. addpd %xmm3, %xmm9
  /* Advance AO by 8 elements and BO by 4 elements. */
  483. subq $-8 * SIZE, AO
  484. subq $-4 * SIZE, BO
  485. subq $1, %rax
  486. BRANCH
  487. jg .L102
  488. ALIGN_4
  /* Leftover steps: recompute the count and keep its low two bits. */
  489. .L105:
  490. #if defined(LT) || defined(RN)
  491. movq KK, %rax
  492. #else
  493. movq K, %rax
  494. subq KK, %rax
  495. #endif
  496. andq $3, %rax # rax = k & 3 (leftover steps after the 4x unrolled loop)
  497. BRANCH
  498. je .L108
  499. ALIGN_4
  /* One step per iteration, accumulating into xmm8 only. */
  500. .L106:
  501. pshufd $0x44, %xmm2, %xmm3
  502. movsd -15 * SIZE(BO), %xmm2
  503. mulpd %xmm0, %xmm3
  504. movaps -14 * SIZE(AO), %xmm0
  505. addpd %xmm3, %xmm8
  506. addq $2 * SIZE, AO
  507. addq $1 * SIZE, BO
  508. subq $1, %rax
  509. BRANCH
  510. jg .L106
  511. ALIGN_4
  /* Solve and store for the 2-row tile; same structure as the 4-row case */
  /* at .L98, with a 2x2 block of A. */
  512. .L108:
  513. #if defined(LN) || defined(RT)
  /* rax = KK - 2 (LN) or KK - 1 (RT); AO = AORIG + 2 * rax * SIZE; */
  /* BO = B + rax * SIZE. */
  514. movq KK, %rax
  515. #ifdef LN
  516. subq $2, %rax
  517. #else
  518. subq $1, %rax
  519. #endif
  520. leaq (, %rax, SIZE), %rax
  521. movq AORIG, AO
  522. leaq (AO, %rax, 2), AO
  523. leaq (B, %rax, 1), BO
  524. #endif
  /* Combine the two partial accumulators used by the unrolled loop. */
  525. addpd %xmm9, %xmm8
  /* xmm10 = right-hand side minus accumulated products. */
  526. #if defined(LN) || defined(LT)
  527. movapd -16 * SIZE(BO), %xmm10
  528. subpd %xmm8, %xmm10
  529. #else
  530. movapd -16 * SIZE(AO), %xmm10
  531. subpd %xmm8, %xmm10
  532. #endif
  533. #ifdef LN
  /* LN: x1 (high half) first, then eliminate it from x0. */
  /* NOTE(review): diagonal entries are applied with mulsd, so they are */
  /* presumably stored pre-inverted - confirm against the packing routine. */
  534. movapd %xmm10, %xmm8
  535. unpckhpd %xmm8, %xmm8
  536. movsd -13 * SIZE(AO), %xmm12
  537. mulsd %xmm12, %xmm8
  538. movsd -14 * SIZE(AO), %xmm13
  539. mulsd %xmm8, %xmm13
  540. subsd %xmm13, %xmm10
  541. movsd -16 * SIZE(AO), %xmm12
  542. mulsd %xmm12, %xmm10
  543. unpcklpd %xmm8, %xmm10
  544. #endif
  545. #ifdef LT
  /* LT: x0 (low half) first, then eliminate it from x1. */
  546. movapd %xmm10, %xmm8
  547. unpckhpd %xmm8, %xmm8
  548. movsd -16 * SIZE(AO), %xmm12
  549. mulsd %xmm12, %xmm10
  550. movsd -15 * SIZE(AO), %xmm13
  551. mulsd %xmm10, %xmm13
  552. subsd %xmm13, %xmm8
  553. movsd -13 * SIZE(AO), %xmm12
  554. mulsd %xmm12, %xmm8
  555. unpcklpd %xmm8, %xmm10
  556. #endif
  /* RN and RT: scale both values by the single B element at BO. */
  557. #ifdef RN
  558. movddup -16 * SIZE(BO), %xmm8
  559. mulpd %xmm8, %xmm10
  560. #endif
  561. #ifdef RT
  562. movddup -16 * SIZE(BO), %xmm8
  563. mulpd %xmm8, %xmm10
  564. #endif
  565. #ifdef LN
  /* LN stores below the current CO1, so move it down before the store. */
  566. subq $2 * SIZE, CO1
  567. #endif
  /* Store both results to C (both preprocessor branches are identical). */
  568. #if defined(LN) || defined(LT)
  569. movsd %xmm10, 0 * SIZE(CO1)
  570. movhpd %xmm10, 1 * SIZE(CO1)
  571. #else
  572. movsd %xmm10, 0 * SIZE(CO1)
  573. movhpd %xmm10, 1 * SIZE(CO1)
  574. #endif
  /* Write the results back to the buffer the right-hand side was read from. */
  575. #if defined(LN) || defined(LT)
  576. movapd %xmm10, -16 * SIZE(BO)
  577. #else
  578. movapd %xmm10, -16 * SIZE(AO)
  579. #endif
  580. #ifndef LN
  581. addq $2 * SIZE, CO1
  582. #endif
  583. #if defined(LT) || defined(RN)
  /* AO += 2 * (K - KK) * SIZE, BO += (K - KK) * SIZE. */
  584. movq K, %rax
  585. subq KK, %rax
  586. leaq (,%rax, SIZE), %rax
  587. leaq (AO, %rax, 2), AO
  588. leaq (BO, %rax, 1), BO
  589. #endif
  /* KK moves by the tile height: down for LN, up for LT. */
  590. #ifdef LN
  591. subq $2, KK
  592. #endif
  593. #ifdef LT
  594. addq $2, KK
  595. #endif
  596. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  597. movq K, %rax
  598. salq $1 + BASE_SHIFT, %rax
  599. addq %rax, AORIG
  600. #endif
  601. ALIGN_4
  /* 1-row tile of the N & 1 strip, executed only when bit 0 of M is set */
  /* (jle after testq is taken exactly when (M & 1) == 0). */
  602. .L110:
  603. testq $1, M
  604. BRANCH
  605. jle .L119
  606. #ifdef LN
  /* LN: AORIG -= K << BASE_SHIFT. */
  607. movq K, %rax
  608. salq $BASE_SHIFT, %rax
  609. subq %rax, AORIG
  610. #endif
  611. #if defined(LN) || defined(RT)
  /* AO = AORIG + KK * SIZE, BO = B + KK * SIZE. */
  612. movq KK, %rax
  613. leaq (, %rax, SIZE), %rax
  614. movq AORIG, AO
  615. leaq (AO, %rax, 1), AO
  616. leaq (B, %rax, 1), BO
  617. #else
  618. movq B, BO
  619. #endif
  /* Preload the first A and B scalars and clear the accumulators. */
  620. movsd -16 * SIZE(AO), %xmm0
  621. movsd -16 * SIZE(BO), %xmm2
  622. pxor %xmm8, %xmm8
  623. pxor %xmm9, %xmm9
  /* rax = number of steps to accumulate: KK for LT/RN, K - KK otherwise. */
  624. #if defined(LT) || defined(RN)
  625. movq KK, %rax
  626. #else
  627. movq K, %rax
  628. subq KK, %rax
  629. #endif
  /* Skip the 4x unrolled loop when fewer than 4 steps remain. */
  630. sarq $2, %rax
  631. NOBRANCH
  632. jle .L115
  633. ALIGN_4
  /* Scalar dot-product loop for the 1-row tile, unrolled four times. Each */
  /* step multiplies the preloaded A and B scalars, adds the product to xmm8 */
  /* and loads the next pair. */
  634. .L112:
  635. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  636. mulsd %xmm0, %xmm2
  637. addsd %xmm2, %xmm8
  638. movsd -15 * SIZE(AO), %xmm0
  639. movsd -15 * SIZE(BO), %xmm2
  640. mulsd %xmm0, %xmm2
  641. addsd %xmm2, %xmm8
  642. movsd -14 * SIZE(AO), %xmm0
  643. movsd -14 * SIZE(BO), %xmm2
  644. mulsd %xmm0, %xmm2
  645. addsd %xmm2, %xmm8
  646. movsd -13 * SIZE(AO), %xmm0
  647. movsd -13 * SIZE(BO), %xmm2
  648. mulsd %xmm0, %xmm2
  649. addsd %xmm2, %xmm8
  /* Preload the pair for the next iteration, then advance both pointers by */
  /* 4 elements so that pair sits at -16 * SIZE again. */
  650. movsd -12 * SIZE(AO), %xmm0
  651. movsd -12 * SIZE(BO), %xmm2
  652. subq $-4 * SIZE, AO
  653. subq $-4 * SIZE, BO
  654. subq $1, %rax
  655. BRANCH
  656. jg .L112
  657. ALIGN_4
  /* Leftover steps: recompute the count and keep its low two bits. */
  658. .L115:
  659. #if defined(LT) || defined(RN)
  660. movq KK, %rax
  661. #else
  662. movq K, %rax
  663. subq KK, %rax
  664. #endif
  665. andq $3, %rax # rax = k & 3 (leftover steps after the 4x unrolled loop)
  666. BRANCH
  667. je .L118
  668. ALIGN_4
  /* One multiply-accumulate per iteration. */
  669. .L116:
  670. mulsd %xmm0, %xmm2
  671. addsd %xmm2, %xmm8
  672. movsd -15 * SIZE(AO), %xmm0
  673. movsd -15 * SIZE(BO), %xmm2
  674. addq $1 * SIZE, AO
  675. addq $1 * SIZE, BO
  676. subq $1, %rax
  677. BRANCH
  678. jg .L116
  679. ALIGN_4
  /* Solve and store for the 1-row tile: a single scalar. */
  680. .L118:
  681. #if defined(LN) || defined(RT)
  /* rax = KK - 1; AO = AORIG + rax * SIZE; BO = B + rax * SIZE. */
  682. movq KK, %rax
  683. subq $1, %rax
  684. leaq (, %rax, SIZE), %rax
  685. movq AORIG, AO
  686. leaq (AO, %rax, 1), AO
  687. leaq (B, %rax, 1), BO
  688. #endif
  /* xmm9 was cleared at .L110 and is not written by the 1-row loops above, */
  /* so this leaves the low (used) half of xmm8 unchanged. */
  689. addpd %xmm9, %xmm8
  /* xmm10 = right-hand side minus the accumulated dot product. */
  690. #if defined(LN) || defined(LT)
  691. movsd -16 * SIZE(BO), %xmm10
  692. subsd %xmm8, %xmm10
  693. #else
  694. movsd -16 * SIZE(AO), %xmm10
  695. subsd %xmm8, %xmm10
  696. #endif
  /* Scale by the single triangular-matrix element: read from AO for LN/LT */
  /* and from BO for RN/RT. */
  /* NOTE(review): applied with mulsd, so presumably stored pre-inverted - */
  /* confirm against the packing routine. */
  697. #ifdef LN
  698. movsd -16 * SIZE(AO), %xmm12
  699. mulsd %xmm12, %xmm10
  700. #endif
  701. #ifdef LT
  702. movsd -16 * SIZE(AO), %xmm12
  703. mulsd %xmm12, %xmm10
  704. #endif
  705. #ifdef RN
  706. movsd -16 * SIZE(BO), %xmm8
  707. mulsd %xmm8, %xmm10
  708. #endif
  709. #ifdef RT
  710. movsd -16 * SIZE(BO), %xmm8
  711. mulsd %xmm8, %xmm10
  712. #endif
  713. #ifdef LN
  /* LN stores below the current CO1, so move it down before the store. */
  714. subq $1 * SIZE, CO1
  715. #endif
  /* Store the result to C (both preprocessor branches are identical). */
  716. #if defined(LN) || defined(LT)
  717. movsd %xmm10, 0 * SIZE(CO1)
  718. #else
  719. movsd %xmm10, 0 * SIZE(CO1)
  720. #endif
  /* Write the result back to the buffer the right-hand side was read from. */
  721. #if defined(LN) || defined(LT)
  722. movsd %xmm10, -16 * SIZE(BO)
  723. #else
  724. movsd %xmm10, -16 * SIZE(AO)
  725. #endif
  726. #ifndef LN
  727. addq $1 * SIZE, CO1
  728. #endif
  729. #if defined(LT) || defined(RN)
  /* AO += (K - KK) * SIZE, BO += (K - KK) * SIZE. */
  730. movq K, %rax
  731. subq KK, %rax
  732. leaq (,%rax, SIZE), %rax
  733. leaq (AO, %rax, 1), AO
  734. leaq (BO, %rax, 1), BO
  735. #endif
  /* KK moves by the tile height: down for LN, up for LT. */
  736. #ifdef LN
  737. subq $1, KK
  738. #endif
  739. #ifdef LT
  740. addq $1, KK
  741. #endif
  742. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  743. movq K, %rax
  744. salq $BASE_SHIFT, %rax
  745. addq %rax, AORIG
  746. #endif
  747. ALIGN_4
  748. .L119: # Bookkeeping after the section above: update B and KK according to the build mode
  749. #ifdef LN
  750. leaq (B, K, SIZE), B # B += K * SIZE
  751. #endif
  752. #if defined(LT) || defined(RN)
  753. movq BO, B # continue from wherever BO ended up
  754. #endif
  755. #ifdef RN
  756. addq $1, KK
  757. #endif
  758. #ifdef RT
  759. subq $1, KK
  760. #endif
  761. ALIGN_4
  762. .L40: # Section executed only when bit 1 of N is set: sets up two output pointers (CO1, CO2) and the loop counter I
  763. testq $2, N
  764. BRANCH
  765. jle .L80 # testq clears SF and OF, so jle is taken exactly when (N & 2) == 0
  766. #if defined(LT) || defined(RN)
  767. movq A, AO
  768. #else
  769. movq A, AORIG
  770. #endif
  771. #ifdef RT
  772. movq K, %rax
  773. salq $1 + BASE_SHIFT, %rax
  774. subq %rax, B # B steps back by K shifted left by (1 + BASE_SHIFT)
  775. leaq (, LDC, 2), %rax
  776. subq %rax, C # C -= 2 * LDC
  777. #endif
  778. movq C, CO1
  779. leaq (C, LDC, 1), CO2 # CO2 = C + LDC
  780. #ifndef RT
  781. leaq (C, LDC, 2), C # C += 2 * LDC
  782. #endif
  783. #ifdef LN
  784. movq OFFSET, %rax
  785. addq M, %rax
  786. movq %rax, KK # KK = OFFSET + M
  787. #endif
  788. movq K, %rax
  789. salq $BASE_SHIFT + 1, %rax
  790. movq B, BB
  791. subq %rax, BB # BB = B minus K shifted left by (BASE_SHIFT + 1); used below as a prefetcht2 address
  792. #ifdef LT
  793. movq OFFSET, %rax
  794. movq %rax, KK # KK = OFFSET
  795. #endif
  796. movq M, I
  797. sarq $2, I # i = (m >> 2)
  798. NOBRANCH
  799. jle .L60 # no group of four in M: continue with the (M & 2) case
  800. ALIGN_4
  801. .L51: # Tile setup for the four-by-two case: position AO and BO, preload operands, clear accumulators xmm8, xmm9, xmm12, xmm13
  802. #ifdef LN
  803. movq K, %rax
  804. salq $2 + BASE_SHIFT, %rax
  805. subq %rax, AORIG # AORIG steps back by K shifted left by (2 + BASE_SHIFT)
  806. #endif
  807. #if defined(LN) || defined(RT)
  808. movq KK, %rax
  809. leaq (, %rax, SIZE), %rax # rax = KK * SIZE
  810. movq AORIG, AO
  811. leaq (AO, %rax, 4), AO # AO = AORIG + 4 * rax
  812. leaq (B, %rax, 2), BO # BO = B + 2 * rax
  813. #else
  814. movq B, BO
  815. #endif
  816. prefetcht2 -16 * SIZE(BB)
  817. subq $-4 * SIZE, BB # BB += 4 * SIZE
  818. movaps -16 * SIZE(AO), %xmm0 # first two values from AO
  819. movaps -14 * SIZE(AO), %xmm1 # next two values from AO
  820. movaps -16 * SIZE(BO), %xmm2 # first two values from BO
  821. #ifdef LN
  822. prefetcht0 -4 * SIZE(CO1) # LN stores below CO1 (it is decremented before the store), so prefetch behind the pointer
  823. pxor %xmm8, %xmm8
  824. pxor %xmm9, %xmm9
  825. prefetcht0 -4 * SIZE(CO2)
  826. pxor %xmm12, %xmm12
  827. pxor %xmm13, %xmm13
  828. #else
  829. prefetcht0 3 * SIZE(CO1)
  830. pxor %xmm8, %xmm8
  831. pxor %xmm9, %xmm9
  832. prefetcht0 3 * SIZE(CO2)
  833. pxor %xmm12, %xmm12
  834. pxor %xmm13, %xmm13
  835. #endif
  836. #if defined(LT) || defined(RN)
  837. movq KK, %rax
  838. #else
  839. movq K, %rax
  840. subq KK, %rax
  841. #endif
  842. sarq $2, %rax # rax = trip count / 4; the loop at .L52 handles four steps per pass
  843. NOBRANCH
  844. jle .L55
  845. ALIGN_4
  846. .L52: # Main loop of the four-by-two tile, unrolled 4x: each pass consumes 16 values from AO and 8 from BO
  847. movaps %xmm2, %xmm4
  848. pshufd $0x4e, %xmm2, %xmm7 # xmm7 = xmm2 with its two 64-bit halves swapped
  849. mulpd %xmm0, %xmm2
  850. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  851. mulpd %xmm1, %xmm4
  852. movaps %xmm7, %xmm6
  853. mulpd %xmm0, %xmm7
  854. movaps -12 * SIZE(AO), %xmm0
  855. mulpd %xmm1, %xmm6
  856. movaps -10 * SIZE(AO), %xmm1
  857. addpd %xmm2, %xmm9 # straight products accumulate in xmm9 and xmm13
  858. movaps -14 * SIZE(BO), %xmm2
  859. addpd %xmm4, %xmm13
  860. addpd %xmm7, %xmm8 # half-swapped products accumulate in xmm8 and xmm12
  861. addpd %xmm6, %xmm12
  862. movaps %xmm2, %xmm4
  863. pshufd $0x4e, %xmm2, %xmm7
  864. mulpd %xmm0, %xmm2
  865. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  866. mulpd %xmm1, %xmm4
  867. movaps %xmm7, %xmm6
  868. mulpd %xmm0, %xmm7
  869. movaps -8 * SIZE(AO), %xmm0
  870. mulpd %xmm1, %xmm6
  871. movaps -6 * SIZE(AO), %xmm1
  872. addpd %xmm2, %xmm9
  873. movaps -12 * SIZE(BO), %xmm2
  874. addpd %xmm4, %xmm13
  875. addpd %xmm7, %xmm8
  876. addpd %xmm6, %xmm12
  877. movaps %xmm2, %xmm4
  878. pshufd $0x4e, %xmm2, %xmm7
  879. mulpd %xmm0, %xmm2
  880. mulpd %xmm1, %xmm4
  881. movaps %xmm7, %xmm6
  882. mulpd %xmm0, %xmm7
  883. movaps -4 * SIZE(AO), %xmm0
  884. mulpd %xmm1, %xmm6
  885. movaps -2 * SIZE(AO), %xmm1
  886. addpd %xmm2, %xmm9
  887. movaps -10 * SIZE(BO), %xmm2
  888. addpd %xmm4, %xmm13
  889. addpd %xmm7, %xmm8
  890. addpd %xmm6, %xmm12
  891. movaps %xmm2, %xmm4
  892. pshufd $0x4e, %xmm2, %xmm7
  893. mulpd %xmm0, %xmm2
  894. mulpd %xmm1, %xmm4
  895. movaps %xmm7, %xmm6
  896. mulpd %xmm0, %xmm7
  897. movaps 0 * SIZE(AO), %xmm0 # operands for the next pass (or for the remainder loop)
  898. mulpd %xmm1, %xmm6
  899. movaps 2 * SIZE(AO), %xmm1
  900. addpd %xmm2, %xmm9
  901. movaps -8 * SIZE(BO), %xmm2
  902. addpd %xmm4, %xmm13
  903. addpd %xmm7, %xmm8
  904. addpd %xmm6, %xmm12
  905. subq $-16 * SIZE, AO # AO += 16 * SIZE
  906. subq $ -8 * SIZE, BO # BO += 8 * SIZE
  907. subq $1, %rax
  908. BRANCH
  909. jg .L52
  910. ALIGN_4
  911. .L55: # Remainder of the four-by-two product: runs the steps left over by the 4x-unrolled loop at .L52
  912. #if defined(LT) || defined(RN)
  913. movq KK, %rax
  914. #else
  915. movq K, %rax
  916. subq KK, %rax
  917. #endif
  918. andq $3, %rax # rax = trip count & 3 (steps left after 4x unrolling)
  919. BRANCH
  920. je .L58 # nothing left over: go to the solve step
  921. ALIGN_4
  922. .L56: # one step per pass, same instruction pattern as a single step of .L52
  923. movaps %xmm2, %xmm4
  924. pshufd $0x4e, %xmm2, %xmm7 # xmm7 = xmm2 with its two 64-bit halves swapped
  925. mulpd %xmm0, %xmm2
  926. mulpd %xmm1, %xmm4
  927. movaps %xmm7, %xmm6
  928. mulpd %xmm0, %xmm7
  929. movaps -12 * SIZE(AO), %xmm0
  930. mulpd %xmm1, %xmm6
  931. movaps -10 * SIZE(AO), %xmm1
  932. addpd %xmm2, %xmm9
  933. movaps -14 * SIZE(BO), %xmm2
  934. addpd %xmm4, %xmm13
  935. addpd %xmm7, %xmm8
  936. addpd %xmm6, %xmm12
  937. addq $4 * SIZE, AO
  938. addq $2 * SIZE, BO
  939. subq $1, %rax
  940. BRANCH
  941. jg .L56
  942. ALIGN_4
  943. .L58: # Solve step for the four-by-two tile: un-shuffle the accumulators, subtract, apply the triangular coefficients, store
  944. #if defined(LN) || defined(RT)
  945. movq KK, %rax
  946. #ifdef LN
  947. subq $4, %rax
  948. #else
  949. subq $2, %rax
  950. #endif
  951. leaq (, %rax, SIZE), %rax # rax = (KK - 4) * SIZE for LN, (KK - 2) * SIZE for RT
  952. movq AORIG, AO
  953. leaq (AO, %rax, 4), AO
  954. leaq (B, %rax, 2), BO
  955. #endif
  956. movapd %xmm8, %xmm0
  957. movsd %xmm9, %xmm8 # these three moves exchange the low halves of xmm8 and xmm9
  958. movsd %xmm0, %xmm9
  959. movapd %xmm12, %xmm0
  960. movsd %xmm13, %xmm12 # likewise exchange the low halves of xmm12 and xmm13
  961. movsd %xmm0, %xmm13
  962. #if defined(LN) || defined(LT)
  963. movapd %xmm8, %xmm0
  964. unpcklpd %xmm9, %xmm8 # xmm8 = low halves of (xmm8, xmm9)
  965. unpckhpd %xmm9, %xmm0 # xmm0 = high halves of (xmm8, xmm9)
  966. movapd %xmm12, %xmm4
  967. unpcklpd %xmm13, %xmm12
  968. unpckhpd %xmm13, %xmm4
  969. movapd -16 * SIZE(BO), %xmm9
  970. movapd -14 * SIZE(BO), %xmm13
  971. movapd -12 * SIZE(BO), %xmm1
  972. movapd -10 * SIZE(BO), %xmm5
  973. subpd %xmm8, %xmm9 # values read from BO minus the accumulated products
  974. subpd %xmm0, %xmm13
  975. subpd %xmm12, %xmm1
  976. subpd %xmm4, %xmm5
  977. #else
  978. movapd -16 * SIZE(AO), %xmm0
  979. movapd -14 * SIZE(AO), %xmm1
  980. movapd -12 * SIZE(AO), %xmm2
  981. movapd -10 * SIZE(AO), %xmm3
  982. subpd %xmm8, %xmm0 # values read from AO minus the accumulated products
  983. subpd %xmm12, %xmm1
  984. subpd %xmm9, %xmm2
  985. subpd %xmm13, %xmm3
  986. #endif
  987. #ifdef LN
  988. movddup -1 * SIZE(AO), %xmm8 # LN: substitution runs from the last row (xmm5) up to the first (xmm9)
  989. mulpd %xmm8, %xmm5 # multiply by the entry at -1 (presumably a pre-inverted diagonal - TODO confirm against the packing code)
  990. movddup -2 * SIZE(AO), %xmm10
  991. mulpd %xmm5, %xmm10
  992. subpd %xmm10, %xmm1
  993. movddup -3 * SIZE(AO), %xmm12
  994. mulpd %xmm5, %xmm12
  995. subpd %xmm12, %xmm13
  996. movddup -4 * SIZE(AO), %xmm14
  997. mulpd %xmm5, %xmm14
  998. subpd %xmm14, %xmm9
  999. movddup -6 * SIZE(AO), %xmm8
  1000. mulpd %xmm8, %xmm1
  1001. movddup -7 * SIZE(AO), %xmm10
  1002. mulpd %xmm1, %xmm10
  1003. subpd %xmm10, %xmm13
  1004. movddup -8 * SIZE(AO), %xmm12
  1005. mulpd %xmm1, %xmm12
  1006. subpd %xmm12, %xmm9
  1007. movddup -11 * SIZE(AO), %xmm8
  1008. mulpd %xmm8, %xmm13
  1009. movddup -12 * SIZE(AO), %xmm10
  1010. mulpd %xmm13, %xmm10
  1011. subpd %xmm10, %xmm9
  1012. movddup -16 * SIZE(AO), %xmm8
  1013. mulpd %xmm8, %xmm9
  1014. #endif
  1015. #ifdef LT
  1016. movddup -16 * SIZE(AO), %xmm8 # LT: substitution runs from the first row (xmm9) down to the last (xmm5)
  1017. mulpd %xmm8, %xmm9
  1018. movddup -15 * SIZE(AO), %xmm10
  1019. mulpd %xmm9, %xmm10
  1020. subpd %xmm10, %xmm13
  1021. movddup -14 * SIZE(AO), %xmm12
  1022. mulpd %xmm9, %xmm12
  1023. subpd %xmm12, %xmm1
  1024. movddup -13 * SIZE(AO), %xmm14
  1025. mulpd %xmm9, %xmm14
  1026. subpd %xmm14, %xmm5
  1027. movddup -11 * SIZE(AO), %xmm8
  1028. mulpd %xmm8, %xmm13
  1029. movddup -10 * SIZE(AO), %xmm10
  1030. mulpd %xmm13, %xmm10
  1031. subpd %xmm10, %xmm1
  1032. movddup -9 * SIZE(AO), %xmm12
  1033. mulpd %xmm13, %xmm12
  1034. subpd %xmm12, %xmm5
  1035. movddup -6 * SIZE(AO), %xmm8
  1036. mulpd %xmm8, %xmm1
  1037. movddup -5 * SIZE(AO), %xmm10
  1038. mulpd %xmm1, %xmm10
  1039. subpd %xmm10, %xmm5
  1040. movddup -1 * SIZE(AO), %xmm8
  1041. mulpd %xmm8, %xmm5
  1042. #endif
  1043. #ifdef RN
  1044. movddup -16 * SIZE(BO), %xmm8 # RN: first column pair (xmm0, xmm1) is scaled, then eliminated from the second (xmm2, xmm3)
  1045. mulpd %xmm8, %xmm0
  1046. mulpd %xmm8, %xmm1
  1047. movddup -15 * SIZE(BO), %xmm9
  1048. movapd %xmm9, %xmm10
  1049. mulpd %xmm0, %xmm9
  1050. subpd %xmm9, %xmm2
  1051. mulpd %xmm1, %xmm10
  1052. subpd %xmm10, %xmm3
  1053. movddup -13 * SIZE(BO), %xmm8
  1054. mulpd %xmm8, %xmm2
  1055. mulpd %xmm8, %xmm3
  1056. #endif
  1057. #ifdef RT
  1058. movddup -13 * SIZE(BO), %xmm8 # RT: second column pair (xmm2, xmm3) is scaled, then eliminated from the first (xmm0, xmm1)
  1059. mulpd %xmm8, %xmm2
  1060. mulpd %xmm8, %xmm3
  1061. movddup -14 * SIZE(BO), %xmm9
  1062. movapd %xmm9, %xmm10
  1063. mulpd %xmm2, %xmm9
  1064. subpd %xmm9, %xmm0
  1065. mulpd %xmm3, %xmm10
  1066. subpd %xmm10, %xmm1
  1067. movddup -16 * SIZE(BO), %xmm8
  1068. mulpd %xmm8, %xmm0
  1069. mulpd %xmm8, %xmm1
  1070. #endif
  1071. #ifdef LN
  1072. subq $4 * SIZE, CO1 # LN moves the output pointers back four elements before storing
  1073. subq $4 * SIZE, CO2
  1074. #endif
  1075. #if defined(LN) || defined(LT)
  1076. movsd %xmm9, 0 * SIZE(CO1) # low halves go to CO1, high halves to CO2
  1077. movsd %xmm13, 1 * SIZE(CO1)
  1078. movsd %xmm1, 2 * SIZE(CO1)
  1079. movsd %xmm5, 3 * SIZE(CO1)
  1080. movhpd %xmm9, 0 * SIZE(CO2)
  1081. movhpd %xmm13, 1 * SIZE(CO2)
  1082. movhpd %xmm1, 2 * SIZE(CO2)
  1083. movhpd %xmm5, 3 * SIZE(CO2)
  1084. #else
  1085. movsd %xmm0, 0 * SIZE(CO1) # xmm0 and xmm1 go to CO1, xmm2 and xmm3 to CO2
  1086. movhpd %xmm0, 1 * SIZE(CO1)
  1087. movsd %xmm1, 2 * SIZE(CO1)
  1088. movhpd %xmm1, 3 * SIZE(CO1)
  1089. movsd %xmm2, 0 * SIZE(CO2)
  1090. movhpd %xmm2, 1 * SIZE(CO2)
  1091. movsd %xmm3, 2 * SIZE(CO2)
  1092. movhpd %xmm3, 3 * SIZE(CO2)
  1093. #endif
  1094. #if defined(LN) || defined(LT)
  1095. movapd %xmm9, -16 * SIZE(BO) # write the solved values back over the inputs read from BO
  1096. movapd %xmm13, -14 * SIZE(BO)
  1097. movapd %xmm1, -12 * SIZE(BO)
  1098. movapd %xmm5, -10 * SIZE(BO)
  1099. #else
  1100. movapd %xmm0, -16 * SIZE(AO) # write the solved values back over the inputs read from AO
  1101. movapd %xmm1, -14 * SIZE(AO)
  1102. movapd %xmm2, -12 * SIZE(AO)
  1103. movapd %xmm3, -10 * SIZE(AO)
  1104. #endif
  1105. #ifndef LN
  1106. addq $4 * SIZE, CO1
  1107. addq $4 * SIZE, CO2
  1108. #endif
  1109. #if defined(LT) || defined(RN)
  1110. movq K, %rax
  1111. subq KK, %rax
  1112. leaq (,%rax, SIZE), %rax # rax = (K - KK) * SIZE
  1113. leaq (AO, %rax, 4), AO # skip the remaining K - KK steps in AO (4 values per step)
  1114. leaq (BO, %rax, 2), BO # skip the remaining K - KK steps in BO (2 values per step)
  1115. #endif
  1116. #ifdef LN
  1117. subq $4, KK
  1118. #endif
  1119. #ifdef LT
  1120. addq $4, KK
  1121. #endif
  1122. #ifdef RT
  1123. movq K, %rax
  1124. salq $2 + BASE_SHIFT, %rax
  1125. addq %rax, AORIG # AORIG advances by K shifted left by (2 + BASE_SHIFT)
  1126. #endif
  1127. decq I
  1128. BRANCH
  1129. jg .L51 # next four-by-two tile
  1130. ALIGN_4
  1131. .L60: # Two-by-two tile, executed only when bit 1 of M is set: position AO and BO, clear xmm8-xmm11
  1132. testq $2, M
  1133. BRANCH
  1134. jle .L70 # testq clears SF and OF, so jle is taken exactly when (M & 2) == 0
  1135. ALIGN_4
  1136. #ifdef LN
  1137. movq K, %rax
  1138. salq $1 + BASE_SHIFT, %rax
  1139. subq %rax, AORIG # AORIG steps back by K shifted left by (1 + BASE_SHIFT)
  1140. #endif
  1141. #if defined(LN) || defined(RT)
  1142. movq KK, %rax
  1143. leaq (, %rax, SIZE), %rax # rax = KK * SIZE
  1144. movq AORIG, AO
  1145. leaq (AO, %rax, 2), AO # AO = AORIG + 2 * rax
  1146. leaq (B, %rax, 2), BO # BO = B + 2 * rax
  1147. #else
  1148. movq B, BO
  1149. #endif
  1150. movaps -16 * SIZE(AO), %xmm0 # preload the first operand pair from AO
  1151. pxor %xmm8, %xmm8
  1152. pxor %xmm9, %xmm9
  1153. movaps -16 * SIZE(BO), %xmm2 # preload the first operand pair from BO
  1154. pxor %xmm10, %xmm10
  1155. pxor %xmm11, %xmm11
  1156. #if defined(LT) || defined(RN)
  1157. movq KK, %rax
  1158. #else
  1159. movq K, %rax
  1160. subq KK, %rax
  1161. #endif
  1162. sarq $2, %rax # rax = trip count / 4; the loop at .L62 handles four steps per pass
  1163. NOBRANCH
  1164. jle .L65
  1165. ALIGN_4
  1166. .L62: # Main loop of the two-by-two tile, unrolled 4x: each pass consumes 8 values from AO and 8 from BO
  1167. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1168. pshufd $0x4e, %xmm2, %xmm7 # xmm7 = xmm2 with its two 64-bit halves swapped
  1169. mulpd %xmm0, %xmm2
  1170. mulpd %xmm0, %xmm7
  1171. movaps -14 * SIZE(AO), %xmm0
  1172. addpd %xmm2, %xmm9 # steps one and three accumulate in xmm9 and xmm8
  1173. addpd %xmm7, %xmm8
  1174. movaps -14 * SIZE(BO), %xmm2
  1175. pshufd $0x4e, %xmm2, %xmm7
  1176. mulpd %xmm0, %xmm2
  1177. mulpd %xmm0, %xmm7
  1178. movaps -12 * SIZE(AO), %xmm0
  1179. addpd %xmm2, %xmm11 # steps two and four accumulate in xmm11 and xmm10; the pairs are summed at .L68
  1180. addpd %xmm7, %xmm10
  1181. movaps -12 * SIZE(BO), %xmm2
  1182. pshufd $0x4e, %xmm2, %xmm7
  1183. mulpd %xmm0, %xmm2
  1184. mulpd %xmm0, %xmm7
  1185. movaps -10 * SIZE(AO), %xmm0
  1186. addpd %xmm2, %xmm9
  1187. addpd %xmm7, %xmm8
  1188. movaps -10 * SIZE(BO), %xmm2
  1189. pshufd $0x4e, %xmm2, %xmm7
  1190. mulpd %xmm0, %xmm2
  1191. mulpd %xmm0, %xmm7
  1192. movaps -8 * SIZE(AO), %xmm0 # operands for the next pass (or for the remainder loop)
  1193. addpd %xmm2, %xmm11
  1194. addpd %xmm7, %xmm10
  1195. movaps -8 * SIZE(BO), %xmm2
  1196. subq $-8 * SIZE, AO # AO += 8 * SIZE
  1197. subq $-8 * SIZE, BO # BO += 8 * SIZE
  1198. subq $1, %rax
  1199. BRANCH
  1200. jg .L62
  1201. ALIGN_4
  1202. .L65: # Remainder of the two-by-two product: runs the steps left over by the 4x-unrolled loop at .L62
  1203. #if defined(LT) || defined(RN)
  1204. movq KK, %rax
  1205. #else
  1206. movq K, %rax
  1207. subq KK, %rax
  1208. #endif
  1209. andq $3, %rax # rax = trip count & 3 (steps left after 4x unrolling)
  1210. BRANCH
  1211. je .L68 # nothing left over: go to the solve step
  1212. ALIGN_4
  1213. .L66: # one step per pass
  1214. pshufd $0x4e, %xmm2, %xmm7 # xmm7 = xmm2 with its two 64-bit halves swapped
  1215. mulpd %xmm0, %xmm2
  1216. mulpd %xmm0, %xmm7
  1217. movaps -14 * SIZE(AO), %xmm0
  1218. addpd %xmm2, %xmm9
  1219. addpd %xmm7, %xmm8
  1220. movaps -14 * SIZE(BO), %xmm2
  1221. addq $2 * SIZE, AO
  1222. addq $2 * SIZE, BO
  1223. subq $1, %rax
  1224. BRANCH
  1225. jg .L66
  1226. ALIGN_4
  1227. .L68: # Solve step for the two-by-two tile: combine accumulators, subtract, apply the triangular coefficients, store
  1228. #if defined(LN) || defined(RT)
  1229. movq KK, %rax
  1230. #ifdef LN
  1231. subq $2, %rax
  1232. #else
  1233. subq $2, %rax
  1234. #endif
  1235. leaq (, %rax, SIZE), %rax # rax = (KK - 2) * SIZE; both preprocessor branches above subtract 2
  1236. movq AORIG, AO
  1237. leaq (AO, %rax, 2), AO
  1238. leaq (B, %rax, 2), BO
  1239. #endif
  1240. addpd %xmm10, %xmm8 # merge the two accumulator pairs used by the unrolled loop
  1241. addpd %xmm11, %xmm9
  1242. movapd %xmm8, %xmm0
  1243. movsd %xmm9, %xmm8 # these three moves exchange the low halves of xmm8 and xmm9
  1244. movsd %xmm0, %xmm9
  1245. #if defined(LN) || defined(LT)
  1246. movapd %xmm8, %xmm0
  1247. unpcklpd %xmm9, %xmm8 # xmm8 = low halves of (xmm8, xmm9)
  1248. unpckhpd %xmm9, %xmm0 # xmm0 = high halves of (xmm8, xmm9)
  1249. movapd -16 * SIZE(BO), %xmm9
  1250. movapd -14 * SIZE(BO), %xmm13
  1251. subpd %xmm8, %xmm9 # values read from BO minus the accumulated products
  1252. subpd %xmm0, %xmm13
  1253. #else
  1254. movapd -16 * SIZE(AO), %xmm0
  1255. movapd -14 * SIZE(AO), %xmm2
  1256. subpd %xmm8, %xmm0 # values read from AO minus the accumulated products
  1257. subpd %xmm9, %xmm2
  1258. #endif
  1259. #ifdef LN
  1260. movddup -13 * SIZE(AO), %xmm8 # LN: second row (xmm13) is scaled first, then eliminated from the first row (xmm9)
  1261. mulpd %xmm8, %xmm13 # multiply by the entry at -13 (presumably a pre-inverted diagonal - TODO confirm against the packing code)
  1262. movddup -14 * SIZE(AO), %xmm10
  1263. mulpd %xmm13, %xmm10
  1264. subpd %xmm10, %xmm9
  1265. movddup -16 * SIZE(AO), %xmm8
  1266. mulpd %xmm8, %xmm9
  1267. #endif
  1268. #ifdef LT
  1269. movddup -16 * SIZE(AO), %xmm8 # LT: first row (xmm9) is scaled first, then eliminated from the second row (xmm13)
  1270. mulpd %xmm8, %xmm9
  1271. movddup -15 * SIZE(AO), %xmm10
  1272. mulpd %xmm9, %xmm10
  1273. subpd %xmm10, %xmm13
  1274. movddup -13 * SIZE(AO), %xmm8
  1275. mulpd %xmm8, %xmm13
  1276. #endif
  1277. #ifdef RN
  1278. movddup -16 * SIZE(BO), %xmm8 # RN: xmm0 is scaled first, then eliminated from xmm2
  1279. mulpd %xmm8, %xmm0
  1280. movddup -15 * SIZE(BO), %xmm9
  1281. mulpd %xmm0, %xmm9
  1282. subpd %xmm9, %xmm2
  1283. movddup -13 * SIZE(BO), %xmm8
  1284. mulpd %xmm8, %xmm2
  1285. #endif
  1286. #ifdef RT
  1287. movddup -13 * SIZE(BO), %xmm8 # RT: xmm2 is scaled first, then eliminated from xmm0
  1288. mulpd %xmm8, %xmm2
  1289. movddup -14 * SIZE(BO), %xmm9
  1290. mulpd %xmm2, %xmm9
  1291. subpd %xmm9, %xmm0
  1292. movddup -16 * SIZE(BO), %xmm8
  1293. mulpd %xmm8, %xmm0
  1294. #endif
  1295. #ifdef LN
  1296. subq $2 * SIZE, CO1 # LN moves the output pointers back two elements before storing
  1297. subq $2 * SIZE, CO2
  1298. #endif
  1299. #if defined(LN) || defined(LT)
  1300. movsd %xmm9, 0 * SIZE(CO1) # low halves go to CO1, high halves to CO2
  1301. movsd %xmm13, 1 * SIZE(CO1)
  1302. movhpd %xmm9, 0 * SIZE(CO2)
  1303. movhpd %xmm13, 1 * SIZE(CO2)
  1304. #else
  1305. movsd %xmm0, 0 * SIZE(CO1) # xmm0 goes to CO1, xmm2 to CO2
  1306. movhpd %xmm0, 1 * SIZE(CO1)
  1307. movsd %xmm2, 0 * SIZE(CO2)
  1308. movhpd %xmm2, 1 * SIZE(CO2)
  1309. #endif
  1310. #if defined(LN) || defined(LT)
  1311. movapd %xmm9, -16 * SIZE(BO) # write the solved values back over the inputs read from BO
  1312. movapd %xmm13, -14 * SIZE(BO)
  1313. #else
  1314. movapd %xmm0, -16 * SIZE(AO) # write the solved values back over the inputs read from AO
  1315. movapd %xmm2, -14 * SIZE(AO)
  1316. #endif
  1317. #ifndef LN
  1318. addq $2 * SIZE, CO1
  1319. addq $2 * SIZE, CO2
  1320. #endif
  1321. #if defined(LT) || defined(RN)
  1322. movq K, %rax
  1323. subq KK, %rax
  1324. leaq (,%rax, SIZE), %rax # rax = (K - KK) * SIZE
  1325. leaq (AO, %rax, 2), AO # skip the remaining K - KK steps in AO (2 values per step)
  1326. leaq (BO, %rax, 2), BO # skip the remaining K - KK steps in BO (2 values per step)
  1327. #endif
  1328. #ifdef LN
  1329. subq $2, KK
  1330. #endif
  1331. #ifdef LT
  1332. addq $2, KK
  1333. #endif
  1334. #ifdef RT
  1335. movq K, %rax
  1336. salq $1 + BASE_SHIFT, %rax
  1337. addq %rax, AORIG # AORIG advances by K shifted left by (1 + BASE_SHIFT)
  1338. #endif
  1339. ALIGN_4
  1340. .L70: # One-by-two tile, executed only when bit 0 of M is set: position AO and BO, clear xmm8 and xmm9
  1341. testq $1, M
  1342. BRANCH
  1343. jle .L79 # testq clears SF and OF, so jle is taken exactly when (M & 1) == 0
  1344. ALIGN_4
  1345. #ifdef LN
  1346. movq K, %rax
  1347. salq $BASE_SHIFT, %rax
  1348. subq %rax, AORIG # AORIG steps back by K shifted left by BASE_SHIFT
  1349. #endif
  1350. #if defined(LN) || defined(RT)
  1351. movq KK, %rax
  1352. leaq (, %rax, SIZE), %rax # rax = KK * SIZE
  1353. movq AORIG, AO
  1354. leaq (AO, %rax, 1), AO # AO = AORIG + rax
  1355. leaq (B, %rax, 2), BO # BO = B + 2 * rax
  1356. #else
  1357. movq B, BO
  1358. #endif
  1359. movsd -16 * SIZE(AO), %xmm0 # preload one scalar from AO
  1360. movaps -16 * SIZE(BO), %xmm2 # preload one pair from BO
  1361. pxor %xmm8, %xmm8
  1362. pxor %xmm9, %xmm9
  1363. #if defined(LT) || defined(RN)
  1364. movq KK, %rax
  1365. #else
  1366. movq K, %rax
  1367. subq KK, %rax
  1368. #endif
  1369. sarq $2, %rax # rax = trip count / 4; the loop at .L72 handles four steps per pass
  1370. NOBRANCH
  1371. jle .L75
  1372. ALIGN_4
  1373. .L72: # Main loop of the one-by-two tile, unrolled 4x: each pass consumes 4 values from AO and 8 from BO
  1374. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1375. shufps $0x44, %xmm0, %xmm0 # copy the low 64 bits of xmm0 into its high 64 bits
  1376. mulpd %xmm0, %xmm2
  1377. movsd -15 * SIZE(AO), %xmm0
  1378. addpd %xmm2, %xmm8 # steps one and three accumulate in xmm8
  1379. movaps -14 * SIZE(BO), %xmm2
  1380. shufps $0x44, %xmm0, %xmm0
  1381. mulpd %xmm0, %xmm2
  1382. movsd -14 * SIZE(AO), %xmm0
  1383. addpd %xmm2, %xmm9 # steps two and four accumulate in xmm9; the two are summed at .L78
  1384. movaps -12 * SIZE(BO), %xmm2
  1385. shufps $0x44, %xmm0, %xmm0
  1386. mulpd %xmm0, %xmm2
  1387. movsd -13 * SIZE(AO), %xmm0
  1388. addpd %xmm2, %xmm8
  1389. movaps -10 * SIZE(BO), %xmm2
  1390. shufps $0x44, %xmm0, %xmm0
  1391. mulpd %xmm0, %xmm2
  1392. movsd -12 * SIZE(AO), %xmm0 # operands for the next pass (or for the remainder loop)
  1393. addpd %xmm2, %xmm9
  1394. movaps -8 * SIZE(BO), %xmm2
  1395. subq $-4 * SIZE, AO # AO += 4 * SIZE
  1396. subq $-8 * SIZE, BO # BO += 8 * SIZE
  1397. subq $1, %rax
  1398. BRANCH
  1399. jg .L72
  1400. ALIGN_4
  1401. .L75: # Remainder of the one-by-two product: runs the steps left over by the 4x-unrolled loop at .L72
  1402. #if defined(LT) || defined(RN)
  1403. movq KK, %rax
  1404. #else
  1405. movq K, %rax
  1406. subq KK, %rax
  1407. #endif
  1408. andq $3, %rax # rax = trip count & 3 (steps left after 4x unrolling)
  1409. BRANCH
  1410. je .L78 # nothing left over: go to the solve step
  1411. ALIGN_4
  1412. .L76: # one step per pass
  1413. shufps $0x44, %xmm0, %xmm0 # copy the low 64 bits of xmm0 into its high 64 bits
  1414. mulpd %xmm0, %xmm2
  1415. movsd -15 * SIZE(AO), %xmm0
  1416. addpd %xmm2, %xmm8
  1417. movaps -14 * SIZE(BO), %xmm2
  1418. addq $1 * SIZE, AO
  1419. addq $2 * SIZE, BO
  1420. subq $1, %rax
  1421. BRANCH
  1422. jg .L76
  1423. ALIGN_4
  1424. .L78: # Solve step for the one-by-two tile: combine accumulators, subtract, apply the coefficients, store one value to each of CO1 and CO2
  1425. #if defined(LN) || defined(RT)
  1426. movq KK, %rax
  1427. #ifdef LN
  1428. subq $1, %rax
  1429. #else
  1430. subq $2, %rax
  1431. #endif
  1432. leaq (, %rax, SIZE), %rax # rax = (KK - 1) * SIZE for LN, (KK - 2) * SIZE for RT
  1433. movq AORIG, AO
  1434. leaq (AO, %rax, 1), AO
  1435. leaq (B, %rax, 2), BO
  1436. #endif
  1437. addpd %xmm9, %xmm8 # merge the two accumulators used by the unrolled loop
  1438. movhlps %xmm8, %xmm9 # low half of xmm9 = high half of xmm8, so the two results sit in separate registers
  1439. #if defined(LN) || defined(LT)
  1440. movsd -16 * SIZE(BO), %xmm12
  1441. movsd -15 * SIZE(BO), %xmm13
  1442. #else
  1443. movsd -16 * SIZE(AO), %xmm12
  1444. movsd -15 * SIZE(AO), %xmm13
  1445. #endif
  1446. subsd %xmm8, %xmm12 # stored values minus the accumulated products
  1447. subsd %xmm9, %xmm13
  1448. #ifdef LN
  1449. movsd -16 * SIZE(AO), %xmm8
  1450. mulsd %xmm8, %xmm12 # both results are scaled by the single coefficient at AO (presumably a pre-inverted diagonal - TODO confirm)
  1451. mulsd %xmm8, %xmm13
  1452. #endif
  1453. #ifdef LT
  1454. movsd -16 * SIZE(AO), %xmm8
  1455. mulsd %xmm8, %xmm12 # same scaling as the LN case
  1456. mulsd %xmm8, %xmm13
  1457. #endif
  1458. #ifdef RN
  1459. mulsd -16 * SIZE(BO), %xmm12 # RN: xmm12 is scaled first, then eliminated from xmm13
  1460. movsd -15 * SIZE(BO), %xmm9
  1461. mulsd %xmm12, %xmm9
  1462. subsd %xmm9, %xmm13
  1463. mulsd -13 * SIZE(BO), %xmm13
  1464. #endif
  1465. #ifdef RT
  1466. mulsd -13 * SIZE(BO), %xmm13 # RT: xmm13 is scaled first, then eliminated from xmm12
  1467. movlpd -14 * SIZE(BO), %xmm9
  1468. mulsd %xmm13, %xmm9
  1469. subsd %xmm9, %xmm12
  1470. mulsd -16 * SIZE(BO), %xmm12
  1471. #endif
  1472. #ifdef LN
  1473. subq $1 * SIZE, CO1 # LN moves the output pointers back one element before storing
  1474. subq $1 * SIZE, CO2
  1475. #endif
  1476. movsd %xmm12, 0 * SIZE(CO1)
  1477. movsd %xmm13, 0 * SIZE(CO2)
  1478. #if defined(LN) || defined(LT)
  1479. movsd %xmm12, -16 * SIZE(BO) # write the solved values back over the inputs read from BO
  1480. movsd %xmm13, -15 * SIZE(BO)
  1481. #else
  1482. movsd %xmm12, -16 * SIZE(AO) # write the solved values back over the inputs read from AO
  1483. movsd %xmm13, -15 * SIZE(AO)
  1484. #endif
  1485. #ifndef LN
  1486. addq $1 * SIZE, CO1
  1487. addq $1 * SIZE, CO2
  1488. #endif
  1489. #if defined(LT) || defined(RN)
  1490. movq K, %rax
  1491. subq KK, %rax
  1492. leaq (,%rax, SIZE), %rax # rax = (K - KK) * SIZE
  1493. leaq (AO, %rax, 1), AO # skip the remaining K - KK steps in AO (1 value per step)
  1494. leaq (BO, %rax, 2), BO # skip the remaining K - KK steps in BO (2 values per step)
  1495. #endif
  1496. #ifdef LN
  1497. subq $1, KK
  1498. #endif
  1499. #ifdef LT
  1500. addq $1, KK
  1501. #endif
  1502. #ifdef RT
  1503. movq K, %rax
  1504. salq $BASE_SHIFT, %rax
  1505. addq %rax, AORIG # AORIG advances by K shifted left by BASE_SHIFT
  1506. #endif
  1507. ALIGN_4
  1508. .L79: # Bookkeeping after the two-wide section: update B and KK according to the build mode
  1509. #ifdef LN
  1510. leaq (, K, SIZE), %rax
  1511. leaq (B, %rax, 2), B # B += 2 * K * SIZE
  1512. #endif
  1513. #if defined(LT) || defined(RN)
  1514. movq BO, B # continue from wherever BO ended up
  1515. #endif
  1516. #ifdef RN
  1517. addq $2, KK
  1518. #endif
  1519. #ifdef RT
  1520. subq $2, KK
  1521. #endif
  1522. ALIGN_4
  1523. .L80: # Entry to the four-wide section: J counts groups of four taken from N
  1524. movq N, J
  1525. sarq $2, J # J = N >> 2
  1526. NOBRANCH
  1527. jle .L999 # no group of four: jump to .L999 (defined outside this part of the file)
  1528. ALIGN_4
  1529. .L01: # Setup for one group of four: initialise AO or AORIG, the output pointers CO1 and CO2, KK, BB and the counter I
  1530. #if defined(LT) || defined(RN)
  1531. movq A, AO
  1532. #else
  1533. movq A, AORIG
  1534. #endif
  1535. #ifdef RT
  1536. movq K, %rax
  1537. salq $2 + BASE_SHIFT, %rax
  1538. subq %rax, B # B steps back by K shifted left by (2 + BASE_SHIFT)
  1539. leaq (, LDC, 4), %rax
  1540. subq %rax, C # C -= 4 * LDC
  1541. #endif
  1542. movq C, CO1
  1543. leaq (C, LDC, 1), CO2 # CO2 = C + LDC; the other two outputs are addressed as (CO1, LDC, 2) and (CO2, LDC, 2) in .L11
  1544. #ifndef RT
  1545. leaq (C, LDC, 4), C # C += 4 * LDC
  1546. #endif
  1547. #ifdef LN
  1548. movq OFFSET, %rax
  1549. addq M, %rax
  1550. movq %rax, KK # KK = OFFSET + M
  1551. #endif
  1552. movq K, %rax
  1553. salq $BASE_SHIFT + 2, %rax
  1554. movq B, BB
  1555. subq %rax, BB # BB = B minus K shifted left by (BASE_SHIFT + 2); used below as a prefetcht2 address
  1556. #ifdef LT
  1557. movq OFFSET, %rax
  1558. movq %rax, KK # KK = OFFSET
  1559. #endif
  1560. movq M, I
  1561. sarq $2, I # i = (m >> 2)
  1562. NOBRANCH
  1563. jle .L20 # no group of four in M: jump to .L20 (defined outside this part of the file)
  1564. ALIGN_4
  1565. .L11: # Tile setup for the four-by-four case: position AO and BO, preload operands, clear xmm3-xmm6 and accumulators xmm8-xmm15
  1566. #ifdef LN
  1567. movq K, %rax
  1568. salq $2 + BASE_SHIFT, %rax
  1569. subq %rax, AORIG # AORIG steps back by K shifted left by (2 + BASE_SHIFT)
  1570. #endif
  1571. #if defined(LN) || defined(RT)
  1572. movq KK, %rax
  1573. leaq (, %rax, SIZE), %rax # rax = KK * SIZE
  1574. movq AORIG, AO
  1575. leaq (AO, %rax, 4), AO # AO = AORIG + 4 * rax
  1576. leaq (B, %rax, 4), BO # BO = B + 4 * rax
  1577. #else
  1578. movq B, BO
  1579. #endif
  1580. prefetcht2 -16 * SIZE(BB)
  1581. subq $-8 * SIZE, BB # BB += 8 * SIZE
  1582. movaps -16 * SIZE(AO), %xmm0
  1583. pxor %xmm3, %xmm3 # xmm3-xmm6 must be zero: the loops add them to the accumulators before computing fresh products
  1584. movaps -14 * SIZE(AO), %xmm1
  1585. pxor %xmm4, %xmm4
  1586. movaps -16 * SIZE(BO), %xmm2
  1587. pxor %xmm5, %xmm5
  1588. pxor %xmm6, %xmm6
  1589. #ifdef LN
  1590. prefetcht0 -4 * SIZE(CO1) # LN prefetches behind the output pointers
  1591. movapd %xmm4, %xmm8 # xmm4 is zero here, so each copy clears one accumulator
  1592. movapd %xmm4, %xmm9
  1593. prefetcht0 -4 * SIZE(CO2)
  1594. movapd %xmm4, %xmm10
  1595. movapd %xmm4, %xmm11
  1596. prefetcht0 -4 * SIZE(CO1, LDC, 2)
  1597. movapd %xmm4, %xmm12
  1598. movapd %xmm4, %xmm13
  1599. prefetcht0 -4 * SIZE(CO2, LDC, 2)
  1600. movapd %xmm4, %xmm14
  1601. movapd %xmm4, %xmm15
  1602. #else
  1603. prefetcht0 3 * SIZE(CO1)
  1604. movapd %xmm4, %xmm8 # xmm4 is zero here, so each copy clears one accumulator
  1605. movapd %xmm4, %xmm9
  1606. prefetcht0 3 * SIZE(CO2)
  1607. movapd %xmm4, %xmm10
  1608. movapd %xmm4, %xmm11
  1609. prefetcht0 3 * SIZE(CO1, LDC, 2)
  1610. movapd %xmm4, %xmm12
  1611. movapd %xmm4, %xmm13
  1612. prefetcht0 3 * SIZE(CO2, LDC, 2)
  1613. movapd %xmm4, %xmm14
  1614. movapd %xmm4, %xmm15
  1615. #endif
  1616. #if defined(LT) || defined(RN)
  1617. movq KK, %rax
  1618. #else
  1619. movq K, %rax
  1620. subq KK, %rax
  1621. #endif
  1622. sarq $3, %rax # rax = trip count / 8; the loop at .L12 handles eight steps per pass
  1623. NOBRANCH
  1624. jle .L15
  1625. ALIGN_3
  1626. .L12: # Main loop of the four-by-four tile, unrolled 8x: each pass consumes 32 values from AO and 32 from BO
  1627. addpd %xmm3, %xmm11 # xmm3-xmm6 hold products from the previous step (zero on first entry); they are added one step late
  1628. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1629. movaps -14 * SIZE(BO), %xmm3
  1630. addpd %xmm4, %xmm15
  1631. movaps %xmm2, %xmm4
  1632. pshufd $0x4e, %xmm2, %xmm7 # xmm7 = xmm2 with its two 64-bit halves swapped
  1633. mulpd %xmm0, %xmm2
  1634. mulpd %xmm1, %xmm4
  1635. addpd %xmm5, %xmm10
  1636. addpd %xmm6, %xmm14
  1637. movaps %xmm7, %xmm6
  1638. mulpd %xmm0, %xmm7
  1639. mulpd %xmm1, %xmm6
  1640. addpd %xmm2, %xmm9
  1641. movaps -12 * SIZE(BO), %xmm2
  1642. addpd %xmm4, %xmm13
  1643. movaps %xmm3, %xmm4
  1644. pshufd $0x4e, %xmm3, %xmm5
  1645. mulpd %xmm0, %xmm3
  1646. mulpd %xmm1, %xmm4
  1647. addpd %xmm7, %xmm8
  1648. addpd %xmm6, %xmm12
  1649. movaps %xmm5, %xmm6
  1650. mulpd %xmm0, %xmm5
  1651. movaps -12 * SIZE(AO), %xmm0
  1652. mulpd %xmm1, %xmm6
  1653. movaps -10 * SIZE(AO), %xmm1
  1654. addpd %xmm3, %xmm11 # start of step two
  1655. movaps -10 * SIZE(BO), %xmm3
  1656. addpd %xmm4, %xmm15
  1657. movaps %xmm2, %xmm4
  1658. pshufd $0x4e, %xmm2, %xmm7
  1659. mulpd %xmm0, %xmm2
  1660. mulpd %xmm1, %xmm4
  1661. addpd %xmm5, %xmm10
  1662. addpd %xmm6, %xmm14
  1663. movaps %xmm7, %xmm6
  1664. mulpd %xmm0, %xmm7
  1665. mulpd %xmm1, %xmm6
  1666. addpd %xmm2, %xmm9
  1667. movaps -8 * SIZE(BO), %xmm2
  1668. addpd %xmm4, %xmm13
  1669. movaps %xmm3, %xmm4
  1670. pshufd $0x4e, %xmm3, %xmm5
  1671. mulpd %xmm0, %xmm3
  1672. mulpd %xmm1, %xmm4
  1673. addpd %xmm7, %xmm8
  1674. addpd %xmm6, %xmm12
  1675. movaps %xmm5, %xmm6
  1676. mulpd %xmm0, %xmm5
  1677. movaps -8 * SIZE(AO), %xmm0
  1678. mulpd %xmm1, %xmm6
  1679. movaps -6 * SIZE(AO), %xmm1
  1680. addpd %xmm3, %xmm11 # start of step three
  1681. movaps -6 * SIZE(BO), %xmm3
  1682. addpd %xmm4, %xmm15
  1683. movaps %xmm2, %xmm4
  1684. pshufd $0x4e, %xmm2, %xmm7
  1685. mulpd %xmm0, %xmm2
  1686. mulpd %xmm1, %xmm4
  1687. addpd %xmm5, %xmm10
  1688. addpd %xmm6, %xmm14
  1689. movaps %xmm7, %xmm6
  1690. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  1691. mulpd %xmm0, %xmm7
  1692. mulpd %xmm1, %xmm6
  1693. addpd %xmm2, %xmm9
  1694. movaps -4 * SIZE(BO), %xmm2
  1695. addpd %xmm4, %xmm13
  1696. movaps %xmm3, %xmm4
  1697. pshufd $0x4e, %xmm3, %xmm5
  1698. mulpd %xmm0, %xmm3
  1699. mulpd %xmm1, %xmm4
  1700. addpd %xmm7, %xmm8
  1701. addpd %xmm6, %xmm12
  1702. movaps %xmm5, %xmm6
  1703. mulpd %xmm0, %xmm5
  1704. movaps -4 * SIZE(AO), %xmm0
  1705. mulpd %xmm1, %xmm6
  1706. movaps -2 * SIZE(AO), %xmm1
  1707. addpd %xmm3, %xmm11 # start of step four
  1708. movaps -2 * SIZE(BO), %xmm3
  1709. addpd %xmm4, %xmm15
  1710. movaps %xmm2, %xmm4
  1711. pshufd $0x4e, %xmm2, %xmm7
  1712. mulpd %xmm0, %xmm2
  1713. mulpd %xmm1, %xmm4
  1714. addpd %xmm5, %xmm10
  1715. addpd %xmm6, %xmm14
  1716. movaps %xmm7, %xmm6
  1717. mulpd %xmm0, %xmm7
  1718. mulpd %xmm1, %xmm6
  1719. addpd %xmm2, %xmm9
  1720. movaps 0 * SIZE(BO), %xmm2
  1721. addpd %xmm4, %xmm13
  1722. movaps %xmm3, %xmm4
  1723. pshufd $0x4e, %xmm3, %xmm5
  1724. mulpd %xmm0, %xmm3
  1725. mulpd %xmm1, %xmm4
  1726. addpd %xmm7, %xmm8
  1727. addpd %xmm6, %xmm12
  1728. movaps %xmm5, %xmm6
  1729. mulpd %xmm0, %xmm5
  1730. movaps 0 * SIZE(AO), %xmm0
  1731. mulpd %xmm1, %xmm6
  1732. movaps 2 * SIZE(AO), %xmm1
  1733. addpd %xmm3, %xmm11 # start of step five
  1734. movaps 2 * SIZE(BO), %xmm3
  1735. addpd %xmm4, %xmm15
  1736. movaps %xmm2, %xmm4
  1737. pshufd $0x4e, %xmm2, %xmm7
  1738. mulpd %xmm0, %xmm2
  1739. mulpd %xmm1, %xmm4
  1740. addpd %xmm5, %xmm10
  1741. addpd %xmm6, %xmm14
  1742. movaps %xmm7, %xmm6
  1743. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  1744. mulpd %xmm0, %xmm7
  1745. mulpd %xmm1, %xmm6
  1746. addpd %xmm2, %xmm9
  1747. movaps 4 * SIZE(BO), %xmm2
  1748. addpd %xmm4, %xmm13
  1749. movaps %xmm3, %xmm4
  1750. pshufd $0x4e, %xmm3, %xmm5
  1751. mulpd %xmm0, %xmm3
  1752. mulpd %xmm1, %xmm4
  1753. addpd %xmm7, %xmm8
  1754. addpd %xmm6, %xmm12
  1755. movaps %xmm5, %xmm6
  1756. mulpd %xmm0, %xmm5
  1757. movaps 4 * SIZE(AO), %xmm0
  1758. mulpd %xmm1, %xmm6
  1759. movaps 6 * SIZE(AO), %xmm1
  1760. addpd %xmm3, %xmm11 # start of step six
  1761. movaps 6 * SIZE(BO), %xmm3
  1762. addpd %xmm4, %xmm15
  1763. movaps %xmm2, %xmm4
  1764. pshufd $0x4e, %xmm2, %xmm7
  1765. mulpd %xmm0, %xmm2
  1766. mulpd %xmm1, %xmm4
  1767. addpd %xmm5, %xmm10
  1768. addpd %xmm6, %xmm14
  1769. movaps %xmm7, %xmm6
  1770. mulpd %xmm0, %xmm7
  1771. mulpd %xmm1, %xmm6
  1772. addpd %xmm2, %xmm9
  1773. movaps 8 * SIZE(BO), %xmm2
  1774. addpd %xmm4, %xmm13
  1775. movaps %xmm3, %xmm4
  1776. pshufd $0x4e, %xmm3, %xmm5
  1777. mulpd %xmm0, %xmm3
  1778. mulpd %xmm1, %xmm4
  1779. addpd %xmm7, %xmm8
  1780. addpd %xmm6, %xmm12
  1781. movaps %xmm5, %xmm6
  1782. mulpd %xmm0, %xmm5
  1783. movaps 8 * SIZE(AO), %xmm0
  1784. mulpd %xmm1, %xmm6
  1785. movaps 10 * SIZE(AO), %xmm1
  1786. addpd %xmm3, %xmm11 # start of step seven
  1787. movaps 10 * SIZE(BO), %xmm3
  1788. addpd %xmm4, %xmm15
  1789. movaps %xmm2, %xmm4
  1790. pshufd $0x4e, %xmm2, %xmm7
  1791. mulpd %xmm0, %xmm2
  1792. mulpd %xmm1, %xmm4
  1793. addpd %xmm5, %xmm10
  1794. addpd %xmm6, %xmm14
  1795. movaps %xmm7, %xmm6
  1796. PREFETCH (PREFETCHSIZE + 24) * SIZE(AO)
  1797. mulpd %xmm0, %xmm7
  1798. mulpd %xmm1, %xmm6
  1799. addpd %xmm2, %xmm9
  1800. movaps 12 * SIZE(BO), %xmm2
  1801. addpd %xmm4, %xmm13
  1802. movaps %xmm3, %xmm4
  1803. pshufd $0x4e, %xmm3, %xmm5
  1804. mulpd %xmm0, %xmm3
  1805. mulpd %xmm1, %xmm4
  1806. addpd %xmm7, %xmm8
  1807. addpd %xmm6, %xmm12
  1808. movaps %xmm5, %xmm6
  1809. mulpd %xmm0, %xmm5
  1810. movaps 12 * SIZE(AO), %xmm0
  1811. mulpd %xmm1, %xmm6
  1812. movaps 14 * SIZE(AO), %xmm1
  1813. addpd %xmm3, %xmm11 # start of step eight
  1814. movaps 14 * SIZE(BO), %xmm3
  1815. addpd %xmm4, %xmm15
  1816. movaps %xmm2, %xmm4
  1817. pshufd $0x4e, %xmm2, %xmm7
  1818. mulpd %xmm0, %xmm2
  1819. mulpd %xmm1, %xmm4
  1820. addpd %xmm5, %xmm10
  1821. addpd %xmm6, %xmm14
  1822. movaps %xmm7, %xmm6
  1823. mulpd %xmm0, %xmm7
  1824. mulpd %xmm1, %xmm6
  1825. addpd %xmm2, %xmm9
  1826. movaps 16 * SIZE(BO), %xmm2 # BO operand for the next pass; BO has not been advanced yet
  1827. addpd %xmm4, %xmm13
  1828. movaps %xmm3, %xmm4
  1829. pshufd $0x4e, %xmm3, %xmm5
  1830. mulpd %xmm0, %xmm3
  1831. mulpd %xmm1, %xmm4
  1832. addpd %xmm7, %xmm8
  1833. subq $-32 * SIZE, AO # AO += 32 * SIZE; the two AO loads below already use the advanced pointer
  1834. addpd %xmm6, %xmm12
  1835. movaps %xmm5, %xmm6
  1836. mulpd %xmm0, %xmm5
  1837. movaps -16 * SIZE(AO), %xmm0
  1838. mulpd %xmm1, %xmm6
  1839. movaps -14 * SIZE(AO), %xmm1
  1840. subq $-32 * SIZE, BO # BO += 32 * SIZE
  1841. subq $1, %rax
  1842. BRANCH
  1843. jg .L12
  1844. ALIGN_3
  1845. .L15: # Remainder of the four-by-four product: runs the steps left over by the 8x-unrolled loop at .L12
  1846. #if defined(LT) || defined(RN)
  1847. movq KK, %rax
  1848. #else
  1849. movq K, %rax
  1850. subq KK, %rax
  1851. #endif
  1852. andq $7, %rax # rax = trip count & 7 (steps left after 8x unrolling)
  1853. BRANCH
  1854. je .L18 # nothing left over: go to the solve step
  1855. ALIGN_3
  1856. .L16: # one step per pass, same instruction pattern as a single step of .L12
  1857. addpd %xmm3, %xmm11 # add the products left pending by the previous step
  1858. movaps -14 * SIZE(BO), %xmm3
  1859. addpd %xmm4, %xmm15
  1860. movaps %xmm2, %xmm4
  1861. pshufd $0x4e, %xmm2, %xmm7 # xmm7 = xmm2 with its two 64-bit halves swapped
  1862. mulpd %xmm0, %xmm2
  1863. mulpd %xmm1, %xmm4
  1864. addpd %xmm5, %xmm10
  1865. addpd %xmm6, %xmm14
  1866. movaps %xmm7, %xmm6
  1867. mulpd %xmm0, %xmm7
  1868. mulpd %xmm1, %xmm6
  1869. addpd %xmm2, %xmm9
  1870. movaps -12 * SIZE(BO), %xmm2
  1871. addpd %xmm4, %xmm13
  1872. movaps %xmm3, %xmm4
  1873. pshufd $0x4e, %xmm3, %xmm5
  1874. mulpd %xmm0, %xmm3
  1875. mulpd %xmm1, %xmm4
  1876. addpd %xmm7, %xmm8
  1877. addpd %xmm6, %xmm12
  1878. movaps %xmm5, %xmm6
  1879. mulpd %xmm0, %xmm5
  1880. movaps -12 * SIZE(AO), %xmm0
  1881. mulpd %xmm1, %xmm6
  1882. movaps -10 * SIZE(AO), %xmm1
  1883. addq $4 * SIZE, AO
  1884. addq $4 * SIZE, BO
  1885. subq $1, %rax
  1886. BRANCH
  1887. jg .L16 # xmm3-xmm6 still hold pending products on exit; .L18 adds them
  1888. ALIGN_3
  1889. .L18:
  /* Finish the 4-wide tile: complete the accumulation, subtract it from the */
  /* values held at BO (LN/LT) or AO (RN/RT), run the substitution selected by */
  /* LN/LT/RN/RT, store the 16 results to C and back to BO/AO, then update the */
  /* pointers and counters and loop to .L11 while I stays positive. */
  1890. #if defined(LN) || defined(RT)
  /* rax = (KK - 4) * SIZE; AO = AORIG + 4 * rax; BO = B + 4 * rax */
  1891. movq KK, %rax
  1892. subq $4, %rax
  1893. leaq (, %rax, SIZE), %rax
  1894. movq AORIG, AO
  1895. leaq (AO, %rax, 4), AO
  1896. leaq (B, %rax, 4), BO
  1897. #endif
  /* Add the products still pending in xmm3..xmm6 to their accumulators. */
  1898. addpd %xmm3, %xmm11
  1899. addpd %xmm4, %xmm15
  1900. addpd %xmm5, %xmm10
  1901. addpd %xmm6, %xmm14
  /* Exchange the low doubles of each accumulator pair (8/9, 10/11, 12/13, */
  /* 14/15); xmm0 is scratch. */
  1902. movapd %xmm8, %xmm0
  1903. movsd %xmm9, %xmm8
  1904. movsd %xmm0, %xmm9
  1905. movapd %xmm10, %xmm0
  1906. movsd %xmm11, %xmm10
  1907. movsd %xmm0, %xmm11
  1908. movapd %xmm12, %xmm0
  1909. movsd %xmm13, %xmm12
  1910. movsd %xmm0, %xmm13
  1911. movapd %xmm14, %xmm0
  1912. movsd %xmm15, %xmm14
  1913. movsd %xmm0, %xmm15
  1914. #if defined(LN) || defined(LT)
  /* Interleave the accumulator pairs, then form (eight vectors loaded from BO) */
  /* minus (accumulated products). Results: xmm9, 11, 13, 15, 1, 3, 5, 7. */
  1915. movapd %xmm8, %xmm0
  1916. unpcklpd %xmm9, %xmm8
  1917. unpckhpd %xmm9, %xmm0
  1918. movapd %xmm10, %xmm2
  1919. unpcklpd %xmm11, %xmm10
  1920. unpckhpd %xmm11, %xmm2
  1921. movapd %xmm12, %xmm4
  1922. unpcklpd %xmm13, %xmm12
  1923. unpckhpd %xmm13, %xmm4
  1924. movapd %xmm14, %xmm6
  1925. unpcklpd %xmm15, %xmm14
  1926. unpckhpd %xmm15, %xmm6
  1927. movapd -16 * SIZE(BO), %xmm9
  1928. movapd -14 * SIZE(BO), %xmm11
  1929. movapd -12 * SIZE(BO), %xmm13
  1930. movapd -10 * SIZE(BO), %xmm15
  1931. movapd -8 * SIZE(BO), %xmm1
  1932. movapd -6 * SIZE(BO), %xmm3
  1933. movapd -4 * SIZE(BO), %xmm5
  1934. movapd -2 * SIZE(BO), %xmm7
  1935. subpd %xmm8, %xmm9
  1936. subpd %xmm10, %xmm11
  1937. subpd %xmm0, %xmm13
  1938. subpd %xmm2, %xmm15
  1939. subpd %xmm12, %xmm1
  1940. subpd %xmm14, %xmm3
  1941. subpd %xmm4, %xmm5
  1942. subpd %xmm6, %xmm7
  1943. #else
  /* Form (eight vectors loaded from AO) minus (accumulated products) in */
  /* xmm0..xmm7. */
  1944. movapd -16 * SIZE(AO), %xmm0
  1945. movapd -14 * SIZE(AO), %xmm1
  1946. movapd -12 * SIZE(AO), %xmm2
  1947. movapd -10 * SIZE(AO), %xmm3
  1948. movapd -8 * SIZE(AO), %xmm4
  1949. movapd -6 * SIZE(AO), %xmm5
  1950. movapd -4 * SIZE(AO), %xmm6
  1951. movapd -2 * SIZE(AO), %xmm7
  1952. subpd %xmm8, %xmm0
  1953. subpd %xmm12, %xmm1
  1954. subpd %xmm9, %xmm2
  1955. subpd %xmm13, %xmm3
  1956. subpd %xmm10, %xmm4
  1957. subpd %xmm14, %xmm5
  1958. subpd %xmm11, %xmm6
  1959. subpd %xmm15, %xmm7
  1960. #endif
  1961. #ifdef LN
  /* LN: substitution with scalars broadcast (movddup) from the block at AO, */
  /* from the pair xmm5/xmm7 down to xmm9/xmm11. The entries at offsets -1, -6, */
  /* -11 and -16 are applied with mulpd, so they are presumably stored */
  /* pre-inverted -- TODO confirm against the packing code. */
  1962. movddup -1 * SIZE(AO), %xmm8
  1963. mulpd %xmm8, %xmm5
  1964. mulpd %xmm8, %xmm7
  1965. movddup -2 * SIZE(AO), %xmm10
  1966. movapd %xmm10, %xmm12
  1967. mulpd %xmm5, %xmm10
  1968. subpd %xmm10, %xmm1
  1969. mulpd %xmm7, %xmm12
  1970. subpd %xmm12, %xmm3
  1971. movddup -3 * SIZE(AO), %xmm10
  1972. movapd %xmm10, %xmm12
  1973. mulpd %xmm5, %xmm10
  1974. subpd %xmm10, %xmm13
  1975. mulpd %xmm7, %xmm12
  1976. subpd %xmm12, %xmm15
  1977. movddup -4 * SIZE(AO), %xmm10
  1978. movapd %xmm10, %xmm12
  1979. mulpd %xmm5, %xmm10
  1980. subpd %xmm10, %xmm9
  1981. mulpd %xmm7, %xmm12
  1982. subpd %xmm12, %xmm11
  1983. movddup -6 * SIZE(AO), %xmm8
  1984. mulpd %xmm8, %xmm1
  1985. mulpd %xmm8, %xmm3
  1986. movddup -7 * SIZE(AO), %xmm10
  1987. movapd %xmm10, %xmm12
  1988. mulpd %xmm1, %xmm10
  1989. subpd %xmm10, %xmm13
  1990. mulpd %xmm3, %xmm12
  1991. subpd %xmm12, %xmm15
  1992. movddup -8 * SIZE(AO), %xmm10
  1993. movapd %xmm10, %xmm12
  1994. mulpd %xmm1, %xmm10
  1995. subpd %xmm10, %xmm9
  1996. mulpd %xmm3, %xmm12
  1997. subpd %xmm12, %xmm11
  1998. movddup -11 * SIZE(AO), %xmm8
  1999. mulpd %xmm8, %xmm13
  2000. mulpd %xmm8, %xmm15
  2001. movddup -12 * SIZE(AO), %xmm10
  2002. movapd %xmm10, %xmm12
  2003. mulpd %xmm13, %xmm10
  2004. subpd %xmm10, %xmm9
  2005. mulpd %xmm15, %xmm12
  2006. subpd %xmm12, %xmm11
  2007. movddup -16 * SIZE(AO), %xmm8
  2008. mulpd %xmm8, %xmm9
  2009. mulpd %xmm8, %xmm11
  2010. #endif
  2011. #ifdef LT
  /* LT: same scheme in the opposite order, from the pair xmm9/xmm11 (AO offset */
  /* -16) up to xmm5/xmm7 (AO offset -1). */
  2012. movddup -16 * SIZE(AO), %xmm8
  2013. mulpd %xmm8, %xmm9
  2014. mulpd %xmm8, %xmm11
  2015. movddup -15 * SIZE(AO), %xmm10
  2016. movapd %xmm10, %xmm12
  2017. mulpd %xmm9, %xmm10
  2018. subpd %xmm10, %xmm13
  2019. mulpd %xmm11, %xmm12
  2020. subpd %xmm12, %xmm15
  2021. movddup -14 * SIZE(AO), %xmm10
  2022. movapd %xmm10, %xmm12
  2023. mulpd %xmm9, %xmm10
  2024. subpd %xmm10, %xmm1
  2025. mulpd %xmm11, %xmm12
  2026. subpd %xmm12, %xmm3
  2027. movddup -13 * SIZE(AO), %xmm10
  2028. movapd %xmm10, %xmm12
  2029. mulpd %xmm9, %xmm10
  2030. subpd %xmm10, %xmm5
  2031. mulpd %xmm11, %xmm12
  2032. subpd %xmm12, %xmm7
  2033. movddup -11 * SIZE(AO), %xmm8
  2034. mulpd %xmm8, %xmm13
  2035. mulpd %xmm8, %xmm15
  2036. movddup -10 * SIZE(AO), %xmm10
  2037. movapd %xmm10, %xmm12
  2038. mulpd %xmm13, %xmm10
  2039. subpd %xmm10, %xmm1
  2040. mulpd %xmm15, %xmm12
  2041. subpd %xmm12, %xmm3
  2042. movddup -9 * SIZE(AO), %xmm10
  2043. movapd %xmm10, %xmm12
  2044. mulpd %xmm13, %xmm10
  2045. subpd %xmm10, %xmm5
  2046. mulpd %xmm15, %xmm12
  2047. subpd %xmm12, %xmm7
  2048. movddup -6 * SIZE(AO), %xmm8
  2049. mulpd %xmm8, %xmm1
  2050. mulpd %xmm8, %xmm3
  2051. movddup -5 * SIZE(AO), %xmm10
  2052. movapd %xmm10, %xmm12
  2053. mulpd %xmm1, %xmm10
  2054. subpd %xmm10, %xmm5
  2055. mulpd %xmm3, %xmm12
  2056. subpd %xmm12, %xmm7
  2057. movddup -1 * SIZE(AO), %xmm8
  2058. mulpd %xmm8, %xmm5
  2059. mulpd %xmm8, %xmm7
  2060. #endif
  2061. #ifdef RN
  /* RN: substitution with scalars broadcast from the block at BO, from the */
  /* pair xmm0/xmm1 (BO offset -16) to xmm6/xmm7 (BO offset -1). */
  2062. movddup -16 * SIZE(BO), %xmm8
  2063. mulpd %xmm8, %xmm0
  2064. mulpd %xmm8, %xmm1
  2065. movddup -15 * SIZE(BO), %xmm10
  2066. movapd %xmm10, %xmm12
  2067. mulpd %xmm0, %xmm10
  2068. subpd %xmm10, %xmm2
  2069. mulpd %xmm1, %xmm12
  2070. subpd %xmm12, %xmm3
  2071. movddup -14 * SIZE(BO), %xmm10
  2072. movapd %xmm10, %xmm12
  2073. mulpd %xmm0, %xmm10
  2074. subpd %xmm10, %xmm4
  2075. mulpd %xmm1, %xmm12
  2076. subpd %xmm12, %xmm5
  2077. movddup -13 * SIZE(BO), %xmm10
  2078. movapd %xmm10, %xmm12
  2079. mulpd %xmm0, %xmm10
  2080. subpd %xmm10, %xmm6
  2081. mulpd %xmm1, %xmm12
  2082. subpd %xmm12, %xmm7
  2083. movddup -11 * SIZE(BO), %xmm8
  2084. mulpd %xmm8, %xmm2
  2085. mulpd %xmm8, %xmm3
  2086. movddup -10 * SIZE(BO), %xmm10
  2087. movapd %xmm10, %xmm12
  2088. mulpd %xmm2, %xmm10
  2089. subpd %xmm10, %xmm4
  2090. mulpd %xmm3, %xmm12
  2091. subpd %xmm12, %xmm5
  2092. movddup -9 * SIZE(BO), %xmm10
  2093. movapd %xmm10, %xmm12
  2094. mulpd %xmm2, %xmm10
  2095. subpd %xmm10, %xmm6
  2096. mulpd %xmm3, %xmm12
  2097. subpd %xmm12, %xmm7
  2098. movddup -6 * SIZE(BO), %xmm8
  2099. mulpd %xmm8, %xmm4
  2100. mulpd %xmm8, %xmm5
  2101. movddup -5 * SIZE(BO), %xmm10
  2102. movapd %xmm10, %xmm12
  2103. mulpd %xmm4, %xmm10
  2104. subpd %xmm10, %xmm6
  2105. mulpd %xmm5, %xmm12
  2106. subpd %xmm12, %xmm7
  2107. movddup -1 * SIZE(BO), %xmm8
  2108. mulpd %xmm8, %xmm6
  2109. mulpd %xmm8, %xmm7
  2110. #endif
  2111. #ifdef RT
  /* RT: the reverse order, from the pair xmm6/xmm7 (BO offset -1) back to */
  /* xmm0/xmm1 (BO offset -16). */
  2112. movddup -1 * SIZE(BO), %xmm8
  2113. mulpd %xmm8, %xmm6
  2114. mulpd %xmm8, %xmm7
  2115. movddup -2 * SIZE(BO), %xmm10
  2116. movapd %xmm10, %xmm12
  2117. mulpd %xmm6, %xmm10
  2118. subpd %xmm10, %xmm4
  2119. mulpd %xmm7, %xmm12
  2120. subpd %xmm12, %xmm5
  2121. movddup -3 * SIZE(BO), %xmm10
  2122. movapd %xmm10, %xmm12
  2123. mulpd %xmm6, %xmm10
  2124. subpd %xmm10, %xmm2
  2125. mulpd %xmm7, %xmm12
  2126. subpd %xmm12, %xmm3
  2127. movddup -4 * SIZE(BO), %xmm10
  2128. movapd %xmm10, %xmm12
  2129. mulpd %xmm6, %xmm10
  2130. subpd %xmm10, %xmm0
  2131. mulpd %xmm7, %xmm12
  2132. subpd %xmm12, %xmm1
  2133. movddup -6 * SIZE(BO), %xmm8
  2134. mulpd %xmm8, %xmm4
  2135. mulpd %xmm8, %xmm5
  2136. movddup -7 * SIZE(BO), %xmm10
  2137. movapd %xmm10, %xmm12
  2138. mulpd %xmm4, %xmm10
  2139. subpd %xmm10, %xmm2
  2140. mulpd %xmm5, %xmm12
  2141. subpd %xmm12, %xmm3
  2142. movddup -8 * SIZE(BO), %xmm10
  2143. movapd %xmm10, %xmm12
  2144. mulpd %xmm4, %xmm10
  2145. subpd %xmm10, %xmm0
  2146. mulpd %xmm5, %xmm12
  2147. subpd %xmm12, %xmm1
  2148. movddup -11 * SIZE(BO), %xmm8
  2149. mulpd %xmm8, %xmm2
  2150. mulpd %xmm8, %xmm3
  2151. movddup -12 * SIZE(BO), %xmm10
  2152. movapd %xmm10, %xmm12
  2153. mulpd %xmm2, %xmm10
  2154. subpd %xmm10, %xmm0
  2155. mulpd %xmm3, %xmm12
  2156. subpd %xmm12, %xmm1
  2157. movddup -16 * SIZE(BO), %xmm8
  2158. mulpd %xmm8, %xmm0
  2159. mulpd %xmm8, %xmm1
  2160. #endif
  2161. #ifdef LN
  /* LN steps CO1/CO2 back by four elements before storing. */
  2162. subq $4 * SIZE, CO1
  2163. subq $4 * SIZE, CO2
  2164. #endif
  2165. #if defined(LN) || defined(LT)
  /* Store to C. Each register holds one element for two columns: the low */
  /* double goes to CO1 (or CO1 + 2 * LDC), the high double to CO2 (or */
  /* CO2 + 2 * LDC). */
  2166. movsd %xmm9, 0 * SIZE(CO1)
  2167. movsd %xmm13, 1 * SIZE(CO1)
  2168. movsd %xmm1, 2 * SIZE(CO1)
  2169. movsd %xmm5, 3 * SIZE(CO1)
  2170. movhpd %xmm9, 0 * SIZE(CO2)
  2171. movhpd %xmm13, 1 * SIZE(CO2)
  2172. movhpd %xmm1, 2 * SIZE(CO2)
  2173. movhpd %xmm5, 3 * SIZE(CO2)
  2174. movsd %xmm11, 0 * SIZE(CO1, LDC, 2)
  2175. movsd %xmm15, 1 * SIZE(CO1, LDC, 2)
  2176. movsd %xmm3, 2 * SIZE(CO1, LDC, 2)
  2177. movsd %xmm7, 3 * SIZE(CO1, LDC, 2)
  2178. movhpd %xmm11, 0 * SIZE(CO2, LDC, 2)
  2179. movhpd %xmm15, 1 * SIZE(CO2, LDC, 2)
  2180. movhpd %xmm3, 2 * SIZE(CO2, LDC, 2)
  2181. movhpd %xmm7, 3 * SIZE(CO2, LDC, 2)
  2182. #else
  /* Store to C. Each register holds two consecutive elements of one column: */
  /* xmm0/1 go to CO1, xmm2/3 to CO2, xmm4/5 to CO1 + 2 * LDC and xmm6/7 to */
  /* CO2 + 2 * LDC. */
  2183. movsd %xmm0, 0 * SIZE(CO1)
  2184. movhpd %xmm0, 1 * SIZE(CO1)
  2185. movsd %xmm1, 2 * SIZE(CO1)
  2186. movhpd %xmm1, 3 * SIZE(CO1)
  2187. movsd %xmm2, 0 * SIZE(CO2)
  2188. movhpd %xmm2, 1 * SIZE(CO2)
  2189. movsd %xmm3, 2 * SIZE(CO2)
  2190. movhpd %xmm3, 3 * SIZE(CO2)
  2191. movsd %xmm4, 0 * SIZE(CO1, LDC, 2)
  2192. movhpd %xmm4, 1 * SIZE(CO1, LDC, 2)
  2193. movsd %xmm5, 2 * SIZE(CO1, LDC, 2)
  2194. movhpd %xmm5, 3 * SIZE(CO1, LDC, 2)
  2195. movsd %xmm6, 0 * SIZE(CO2, LDC, 2)
  2196. movhpd %xmm6, 1 * SIZE(CO2, LDC, 2)
  2197. movsd %xmm7, 2 * SIZE(CO2, LDC, 2)
  2198. movhpd %xmm7, 3 * SIZE(CO2, LDC, 2)
  2199. #endif
  /* Write the solved values back over the operands they were loaded from. */
  2200. #if defined(LN) || defined(LT)
  2201. movapd %xmm9, -16 * SIZE(BO)
  2202. movapd %xmm11, -14 * SIZE(BO)
  2203. movapd %xmm13, -12 * SIZE(BO)
  2204. movapd %xmm15, -10 * SIZE(BO)
  2205. movapd %xmm1, -8 * SIZE(BO)
  2206. movapd %xmm3, -6 * SIZE(BO)
  2207. movapd %xmm5, -4 * SIZE(BO)
  2208. movapd %xmm7, -2 * SIZE(BO)
  2209. #else
  2210. movapd %xmm0, -16 * SIZE(AO)
  2211. movapd %xmm1, -14 * SIZE(AO)
  2212. movapd %xmm2, -12 * SIZE(AO)
  2213. movapd %xmm3, -10 * SIZE(AO)
  2214. movapd %xmm4, -8 * SIZE(AO)
  2215. movapd %xmm5, -6 * SIZE(AO)
  2216. movapd %xmm6, -4 * SIZE(AO)
  2217. movapd %xmm7, -2 * SIZE(AO)
  2218. #endif
  2219. #ifndef LN
  2220. addq $4 * SIZE, CO1
  2221. addq $4 * SIZE, CO2
  2222. #endif
  2223. #if defined(LT) || defined(RN)
  /* Advance AO and BO by 4 * (K - KK) * SIZE each. */
  2224. movq K, %rax
  2225. subq KK, %rax
  2226. leaq (,%rax, SIZE), %rax
  2227. leaq (AO, %rax, 4), AO
  2228. leaq (BO, %rax, 4), BO
  2229. #endif
  2230. #ifdef LN
  2231. subq $4, KK
  2232. #endif
  2233. #ifdef LT
  2234. addq $4, KK
  2235. #endif
  2236. #ifdef RT
  /* AORIG += K << (2 + BASE_SHIFT) */
  2237. movq K, %rax
  2238. salq $2 + BASE_SHIFT, %rax
  2239. addq %rax, AORIG
  2240. #endif
  2241. decq I # i --
  2242. BRANCH
  2243. jg .L11
  2244. ALIGN_4
  2245. .L20:
  /* 2-wide tail: taken only when bit 1 of M is set, otherwise control goes to */
  /* .L30. Sets up AO/BO, clears the accumulators and computes the trip count */
  /* of the unrolled loop at .L22. */
  2246. testq $2, M
  2247. BRANCH
  2248. jle .L30
  2249. ALIGN_4
  2250. #ifdef LN
  /* AORIG -= K << (1 + BASE_SHIFT) */
  2251. movq K, %rax
  2252. salq $1 + BASE_SHIFT, %rax
  2253. subq %rax, AORIG
  2254. #endif
  2255. #if defined(LN) || defined(RT)
  /* rax = KK * SIZE; AO = AORIG + 2 * rax; BO = B + 4 * rax */
  2256. movq KK, %rax
  2257. leaq (, %rax, SIZE), %rax
  2258. movq AORIG, AO
  2259. leaq (AO, %rax, 2), AO
  2260. leaq (B, %rax, 4), BO
  2261. #else
  2262. movq B, BO
  2263. #endif
  /* Preload the first A and B vectors; zero the accumulators xmm8..xmm11 and */
  /* the product registers xmm3/xmm5, which the loop adds before refilling. */
  2264. movaps -16 * SIZE(AO), %xmm0
  2265. pxor %xmm3, %xmm3
  2266. movaps -16 * SIZE(BO), %xmm2
  2267. pxor %xmm5, %xmm5
  2268. pxor %xmm8, %xmm8
  2269. pxor %xmm9, %xmm9
  2270. pxor %xmm10, %xmm10
  2271. pxor %xmm11, %xmm11
  /* Trip count: (KK for LT/RN, K - KK otherwise) >> 2; skip the loop if <= 0. */
  2272. #if defined(LT) || defined(RN)
  2273. movq KK, %rax
  2274. #else
  2275. movq K, %rax
  2276. subq KK, %rax
  2277. #endif
  2278. sarq $2, %rax
  2279. NOBRANCH
  2280. jle .L25
  2281. ALIGN_4
  2282. .L22:
  /* 2-wide main loop, unrolled four times: each pass advances AO by 8 * SIZE */
  /* and BO by 16 * SIZE. pshufd $0x4e produces a copy of the B vector with its */
  /* two 64-bit halves swapped, so each step multiplies the A vector by both */
  /* orderings. Products in xmm3/xmm5 are added to xmm11/xmm10 at the start of */
  /* the following step. */
  2283. addpd %xmm3, %xmm11
  2284. movaps -14 * SIZE(BO), %xmm3
  2285. pshufd $0x4e, %xmm2, %xmm7
  2286. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2287. mulpd %xmm0, %xmm2
  2288. addpd %xmm5, %xmm10
  2289. mulpd %xmm0, %xmm7
  2290. addpd %xmm2, %xmm9
  2291. movaps -12 * SIZE(BO), %xmm2
  2292. pshufd $0x4e, %xmm3, %xmm5
  2293. mulpd %xmm0, %xmm3
  2294. addpd %xmm7, %xmm8
  2295. mulpd %xmm0, %xmm5
  2296. movaps -14 * SIZE(AO), %xmm0
  2297. addpd %xmm3, %xmm11
  2298. movaps -10 * SIZE(BO), %xmm3
  2299. pshufd $0x4e, %xmm2, %xmm7
  2300. mulpd %xmm0, %xmm2
  2301. addpd %xmm5, %xmm10
  2302. mulpd %xmm0, %xmm7
  2303. addpd %xmm2, %xmm9
  2304. movaps -8 * SIZE(BO), %xmm2
  2305. pshufd $0x4e, %xmm3, %xmm5
  2306. mulpd %xmm0, %xmm3
  2307. addpd %xmm7, %xmm8
  2308. mulpd %xmm0, %xmm5
  2309. movaps -12 * SIZE(AO), %xmm0
  2310. addpd %xmm3, %xmm11
  2311. movaps -6 * SIZE(BO), %xmm3
  2312. pshufd $0x4e, %xmm2, %xmm7
  2313. mulpd %xmm0, %xmm2
  2314. addpd %xmm5, %xmm10
  2315. mulpd %xmm0, %xmm7
  2316. addpd %xmm2, %xmm9
  2317. movaps -4 * SIZE(BO), %xmm2
  2318. pshufd $0x4e, %xmm3, %xmm5
  2319. mulpd %xmm0, %xmm3
  2320. addpd %xmm7, %xmm8
  2321. mulpd %xmm0, %xmm5
  2322. movaps -10 * SIZE(AO), %xmm0
  2323. addpd %xmm3, %xmm11
  2324. movaps -2 * SIZE(BO), %xmm3
  2325. pshufd $0x4e, %xmm2, %xmm7
  2326. mulpd %xmm0, %xmm2
  2327. addpd %xmm5, %xmm10
  2328. mulpd %xmm0, %xmm7
  2329. subq $ -8 * SIZE, AO
  2330. addpd %xmm2, %xmm9
  2331. movaps 0 * SIZE(BO), %xmm2
  2332. pshufd $0x4e, %xmm3, %xmm5
  2333. mulpd %xmm0, %xmm3
  2334. addpd %xmm7, %xmm8
  2335. mulpd %xmm0, %xmm5
  2336. movaps -16 * SIZE(AO), %xmm0
  2337. subq $-16 * SIZE, BO
  2338. subq $1, %rax
  2339. BRANCH
  2340. jg .L22
  2341. ALIGN_4
  2342. .L25:
  /* Count the steps left over after the 4x unrolled loop: the low two bits of */
  /* KK (LT/RN) or of K - KK (otherwise). Skip to .L28 when there are none. */
  2343. #if defined(LT) || defined(RN)
  2344. movq KK, %rax
  2345. #else
  2346. movq K, %rax
  2347. subq KK, %rax
  2348. #endif
  2349. andq $3, %rax # rax = k & 3 (leftover steps)
  2350. BRANCH
  2351. je .L28
  2352. ALIGN_4
  2353. .L26:
  /* 2-wide remainder loop: one step per pass (AO += 2 * SIZE, BO += 4 * SIZE), */
  /* using the same add-one-step-late scheme as the unrolled loop. */
  2354. addpd %xmm3, %xmm11
  2355. movaps -14 * SIZE(BO), %xmm3
  2356. pshufd $0x4e, %xmm2, %xmm7
  2357. mulpd %xmm0, %xmm2
  2358. addpd %xmm5, %xmm10
  2359. mulpd %xmm0, %xmm7
  2360. addpd %xmm2, %xmm9
  2361. movaps -12 * SIZE(BO), %xmm2
  2362. pshufd $0x4e, %xmm3, %xmm5
  2363. mulpd %xmm0, %xmm3
  2364. addpd %xmm7, %xmm8
  2365. mulpd %xmm0, %xmm5
  2366. movaps -14 * SIZE(AO), %xmm0
  2367. addq $2 * SIZE, AO
  2368. addq $4 * SIZE, BO
  2369. subq $1, %rax
  2370. BRANCH
  2371. jg .L26
  2372. ALIGN_4
  2373. .L28:
  /* Finish the 2-wide tile: complete the accumulation, subtract it from the */
  /* values held at BO (LN/LT) or AO (RN/RT), run the substitution selected by */
  /* LN/LT/RN/RT, store the 8 results to C and back to BO/AO, then update the */
  /* pointers and KK. */
  2374. #if defined(LN) || defined(RT)
  /* rax = (KK - 2) * SIZE for LN, (KK - 4) * SIZE for RT; */
  /* AO = AORIG + 2 * rax; BO = B + 4 * rax */
  2375. movq KK, %rax
  2376. #ifdef LN
  2377. subq $2, %rax
  2378. #else
  2379. subq $4, %rax
  2380. #endif
  2381. leaq (, %rax, SIZE), %rax
  2382. movq AORIG, AO
  2383. leaq (AO, %rax, 2), AO
  2384. leaq (B, %rax, 4), BO
  2385. #endif
  /* Add the products still pending in xmm3/xmm5, then exchange the low doubles */
  /* of the pairs xmm8/xmm9 and xmm10/xmm11 (xmm0 is scratch). */
  2386. addpd %xmm3, %xmm11
  2387. addpd %xmm5, %xmm10
  2388. movapd %xmm8, %xmm0
  2389. movsd %xmm9, %xmm8
  2390. movsd %xmm0, %xmm9
  2391. movapd %xmm10, %xmm0
  2392. movsd %xmm11, %xmm10
  2393. movsd %xmm0, %xmm11
  2394. #if defined(LN) || defined(LT)
  /* Interleave the pairs, then form (four vectors loaded from BO) minus */
  /* (accumulated products) in xmm9, xmm11, xmm13, xmm15. */
  2395. movapd %xmm8, %xmm0
  2396. unpcklpd %xmm9, %xmm8
  2397. unpckhpd %xmm9, %xmm0
  2398. movapd %xmm10, %xmm2
  2399. unpcklpd %xmm11, %xmm10
  2400. unpckhpd %xmm11, %xmm2
  2401. movapd -16 * SIZE(BO), %xmm9
  2402. movapd -14 * SIZE(BO), %xmm11
  2403. movapd -12 * SIZE(BO), %xmm13
  2404. movapd -10 * SIZE(BO), %xmm15
  2405. subpd %xmm8, %xmm9
  2406. subpd %xmm10, %xmm11
  2407. subpd %xmm0, %xmm13
  2408. subpd %xmm2, %xmm15
  2409. #else
  /* Form (four vectors loaded from AO) minus (accumulated products) in xmm0, */
  /* xmm2, xmm4, xmm6. */
  2410. movapd -16 * SIZE(AO), %xmm0
  2411. movapd -14 * SIZE(AO), %xmm2
  2412. movapd -12 * SIZE(AO), %xmm4
  2413. movapd -10 * SIZE(AO), %xmm6
  2414. subpd %xmm8, %xmm0
  2415. subpd %xmm9, %xmm2
  2416. subpd %xmm10, %xmm4
  2417. subpd %xmm11, %xmm6
  2418. #endif
  2419. #ifdef LN
  /* LN: substitution on the 2x2 block at AO, xmm13/xmm15 first, then */
  /* xmm9/xmm11. The entries at offsets -13 and -16 are applied with mulpd, so */
  /* they are presumably stored pre-inverted -- TODO confirm. */
  2420. movddup -13 * SIZE(AO), %xmm8
  2421. mulpd %xmm8, %xmm13
  2422. mulpd %xmm8, %xmm15
  2423. movddup -14 * SIZE(AO), %xmm10
  2424. movapd %xmm10, %xmm12
  2425. mulpd %xmm13, %xmm10
  2426. subpd %xmm10, %xmm9
  2427. mulpd %xmm15, %xmm12
  2428. subpd %xmm12, %xmm11
  2429. movddup -16 * SIZE(AO), %xmm8
  2430. mulpd %xmm8, %xmm9
  2431. mulpd %xmm8, %xmm11
  2432. #endif
  2433. #ifdef LT
  /* LT: the opposite order, xmm9/xmm11 first, then xmm13/xmm15. */
  2434. movddup -16 * SIZE(AO), %xmm8
  2435. mulpd %xmm8, %xmm9
  2436. mulpd %xmm8, %xmm11
  2437. movddup -15 * SIZE(AO), %xmm10
  2438. movapd %xmm10, %xmm12
  2439. mulpd %xmm9, %xmm10
  2440. subpd %xmm10, %xmm13
  2441. mulpd %xmm11, %xmm12
  2442. subpd %xmm12, %xmm15
  2443. movddup -13 * SIZE(AO), %xmm8
  2444. mulpd %xmm8, %xmm13
  2445. mulpd %xmm8, %xmm15
  2446. #endif
  2447. #ifdef RN
  /* RN: substitution on the 4x4 block at BO, from xmm0 (offset -16) to xmm6 */
  /* (offset -1). */
  2448. movddup -16 * SIZE(BO), %xmm8
  2449. mulpd %xmm8, %xmm0
  2450. movddup -15 * SIZE(BO), %xmm9
  2451. mulpd %xmm0, %xmm9
  2452. subpd %xmm9, %xmm2
  2453. movddup -14 * SIZE(BO), %xmm10
  2454. mulpd %xmm0, %xmm10
  2455. subpd %xmm10, %xmm4
  2456. movddup -13 * SIZE(BO), %xmm11
  2457. mulpd %xmm0, %xmm11
  2458. subpd %xmm11, %xmm6
  2459. movddup -11 * SIZE(BO), %xmm8
  2460. mulpd %xmm8, %xmm2
  2461. movddup -10 * SIZE(BO), %xmm9
  2462. mulpd %xmm2, %xmm9
  2463. subpd %xmm9, %xmm4
  2464. movddup -9 * SIZE(BO), %xmm10
  2465. mulpd %xmm2, %xmm10
  2466. subpd %xmm10, %xmm6
  2467. movddup -6 * SIZE(BO), %xmm8
  2468. mulpd %xmm8, %xmm4
  2469. movddup -5 * SIZE(BO), %xmm9
  2470. mulpd %xmm4, %xmm9
  2471. subpd %xmm9, %xmm6
  2472. movddup -1 * SIZE(BO), %xmm8
  2473. mulpd %xmm8, %xmm6
  2474. #endif
  2475. #ifdef RT
  /* RT: the reverse order, from xmm6 (offset -1) back to xmm0 (offset -16). */
  2476. movddup -1 * SIZE(BO), %xmm8
  2477. mulpd %xmm8, %xmm6
  2478. movddup -2 * SIZE(BO), %xmm9
  2479. mulpd %xmm6, %xmm9
  2480. subpd %xmm9, %xmm4
  2481. movddup -3 * SIZE(BO), %xmm10
  2482. mulpd %xmm6, %xmm10
  2483. subpd %xmm10, %xmm2
  2484. movddup -4 * SIZE(BO), %xmm11
  2485. mulpd %xmm6, %xmm11
  2486. subpd %xmm11, %xmm0
  2487. movddup -6 * SIZE(BO), %xmm8
  2488. mulpd %xmm8, %xmm4
  2489. movddup -7 * SIZE(BO), %xmm9
  2490. mulpd %xmm4, %xmm9
  2491. subpd %xmm9, %xmm2
  2492. movddup -8 * SIZE(BO), %xmm10
  2493. mulpd %xmm4, %xmm10
  2494. subpd %xmm10, %xmm0
  2495. movddup -11 * SIZE(BO), %xmm8
  2496. mulpd %xmm8, %xmm2
  2497. movddup -12 * SIZE(BO), %xmm9
  2498. mulpd %xmm2, %xmm9
  2499. subpd %xmm9, %xmm0
  2500. movddup -16 * SIZE(BO), %xmm8
  2501. mulpd %xmm8, %xmm0
  2502. #endif
  2503. #ifdef LN
  /* LN steps CO1/CO2 back by two elements before storing. */
  2504. subq $2 * SIZE, CO1
  2505. subq $2 * SIZE, CO2
  2506. #endif
  /* Store two elements into each of the four columns at CO1, CO2, */
  /* CO1 + 2 * LDC and CO2 + 2 * LDC. */
  2507. #if defined(LN) || defined(LT)
  2508. movsd %xmm9, 0 * SIZE(CO1)
  2509. movsd %xmm13, 1 * SIZE(CO1)
  2510. movhpd %xmm9, 0 * SIZE(CO2)
  2511. movhpd %xmm13, 1 * SIZE(CO2)
  2512. movsd %xmm11, 0 * SIZE(CO1, LDC, 2)
  2513. movsd %xmm15, 1 * SIZE(CO1, LDC, 2)
  2514. movhpd %xmm11, 0 * SIZE(CO2, LDC, 2)
  2515. movhpd %xmm15, 1 * SIZE(CO2, LDC, 2)
  2516. #else
  2517. movsd %xmm0, 0 * SIZE(CO1)
  2518. movhpd %xmm0, 1 * SIZE(CO1)
  2519. movsd %xmm2, 0 * SIZE(CO2)
  2520. movhpd %xmm2, 1 * SIZE(CO2)
  2521. movsd %xmm4, 0 * SIZE(CO1, LDC, 2)
  2522. movhpd %xmm4, 1 * SIZE(CO1, LDC, 2)
  2523. movsd %xmm6, 0 * SIZE(CO2, LDC, 2)
  2524. movhpd %xmm6, 1 * SIZE(CO2, LDC, 2)
  2525. #endif
  /* Write the solved values back over the operands they were loaded from. */
  2526. #if defined(LN) || defined(LT)
  2527. movapd %xmm9, -16 * SIZE(BO)
  2528. movapd %xmm11, -14 * SIZE(BO)
  2529. movapd %xmm13, -12 * SIZE(BO)
  2530. movapd %xmm15, -10 * SIZE(BO)
  2531. #else
  2532. movapd %xmm0, -16 * SIZE(AO)
  2533. movapd %xmm2, -14 * SIZE(AO)
  2534. movapd %xmm4, -12 * SIZE(AO)
  2535. movapd %xmm6, -10 * SIZE(AO)
  2536. #endif
  2537. #ifndef LN
  2538. addq $2 * SIZE, CO1
  2539. addq $2 * SIZE, CO2
  2540. #endif
  2541. #if defined(LT) || defined(RN)
  /* AO += 2 * (K - KK) * SIZE; BO += 4 * (K - KK) * SIZE */
  2542. movq K, %rax
  2543. subq KK, %rax
  2544. leaq (,%rax, SIZE), %rax
  2545. leaq (AO, %rax, 2), AO
  2546. leaq (BO, %rax, 4), BO
  2547. #endif
  2548. #ifdef LN
  2549. subq $2, KK
  2550. #endif
  2551. #ifdef LT
  2552. addq $2, KK
  2553. #endif
  2554. #ifdef RT
  /* AORIG += K << (1 + BASE_SHIFT) */
  2555. movq K, %rax
  2556. salq $1 + BASE_SHIFT, %rax
  2557. addq %rax, AORIG
  2558. #endif
  2559. ALIGN_4
  2560. .L30:
  /* 1-wide tail: taken only when bit 0 of M is set, otherwise control goes to */
  /* .L39. Sets up AO/BO, clears the accumulators and computes the trip count */
  /* of the unrolled loop at .L32. */
  2561. testq $1, M
  2562. BRANCH
  2563. jle .L39
  2564. ALIGN_4
  2565. #ifdef LN
  /* AORIG -= K << BASE_SHIFT */
  2566. movq K, %rax
  2567. salq $BASE_SHIFT, %rax
  2568. subq %rax, AORIG
  2569. #endif
  2570. #if defined(LN) || defined(RT)
  /* rax = KK * SIZE; AO = AORIG + rax; BO = B + 4 * rax */
  2571. movq KK, %rax
  2572. leaq (, %rax, SIZE), %rax
  2573. movq AORIG, AO
  2574. leaq (AO, %rax, 1), AO
  2575. leaq (B, %rax, 4), BO
  2576. #else
  2577. movq B, BO
  2578. #endif
  /* Preload one A scalar and two B vectors; zero the accumulators. */
  2579. movsd -16 * SIZE(AO), %xmm0
  2580. movaps -16 * SIZE(BO), %xmm2
  2581. movaps -14 * SIZE(BO), %xmm3
  2582. pxor %xmm8, %xmm8
  2583. pxor %xmm9, %xmm9
  2584. pxor %xmm10, %xmm10
  2585. pxor %xmm11, %xmm11
  /* Trip count: (KK for LT/RN, K - KK otherwise) >> 2; skip the loop if <= 0. */
  2586. #if defined(LT) || defined(RN)
  2587. movq KK, %rax
  2588. #else
  2589. movq K, %rax
  2590. subq KK, %rax
  2591. #endif
  2592. sarq $2, %rax
  2593. NOBRANCH
  2594. jle .L35
  2595. ALIGN_4
  2596. .L32:
  /* 1-wide main loop, unrolled four times: each pass advances AO by 4 * SIZE */
  /* and BO by 16 * SIZE. shufps $0x44 copies the low 64 bits of xmm0 into both */
  /* halves, so the A scalar multiplies both lanes of the two B vectors. */
  /* Successive steps alternate between the accumulator pairs xmm8/xmm9 and */
  /* xmm10/xmm11. */
  2597. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2598. shufps $0x44, %xmm0, %xmm0
  2599. mulpd %xmm0, %xmm2
  2600. mulpd %xmm0, %xmm3
  2601. movsd -15 * SIZE(AO), %xmm0
  2602. addpd %xmm2, %xmm8
  2603. movaps -12 * SIZE(BO), %xmm2
  2604. addpd %xmm3, %xmm9
  2605. movaps -10 * SIZE(BO), %xmm3
  2606. shufps $0x44, %xmm0, %xmm0
  2607. mulpd %xmm0, %xmm2
  2608. mulpd %xmm0, %xmm3
  2609. movsd -14 * SIZE(AO), %xmm0
  2610. addpd %xmm2, %xmm10
  2611. movaps -8 * SIZE(BO), %xmm2
  2612. addpd %xmm3, %xmm11
  2613. movaps -6 * SIZE(BO), %xmm3
  2614. shufps $0x44, %xmm0, %xmm0
  2615. mulpd %xmm0, %xmm2
  2616. mulpd %xmm0, %xmm3
  2617. movsd -13 * SIZE(AO), %xmm0
  2618. addpd %xmm2, %xmm8
  2619. movaps -4 * SIZE(BO), %xmm2
  2620. addpd %xmm3, %xmm9
  2621. movaps -2 * SIZE(BO), %xmm3
  2622. shufps $0x44, %xmm0, %xmm0
  2623. mulpd %xmm0, %xmm2
  2624. mulpd %xmm0, %xmm3
  2625. movsd -12 * SIZE(AO), %xmm0
  2626. addpd %xmm2, %xmm10
  2627. movaps 0 * SIZE(BO), %xmm2
  2628. addpd %xmm3, %xmm11
  2629. movaps 2 * SIZE(BO), %xmm3
  2630. subq $ -4 * SIZE, AO
  2631. subq $-16 * SIZE, BO
  2632. subq $1, %rax
  2633. BRANCH
  2634. jg .L32
  2635. ALIGN_4
  2636. .L35:
  /* Count the steps left over after the 4x unrolled loop: the low two bits of */
  /* KK (LT/RN) or of K - KK (otherwise). Skip to .L38 when there are none. */
  2637. #if defined(LT) || defined(RN)
  2638. movq KK, %rax
  2639. #else
  2640. movq K, %rax
  2641. subq KK, %rax
  2642. #endif
  2643. andq $3, %rax # rax = k & 3 (leftover steps)
  2644. BRANCH
  2645. je .L38
  2646. ALIGN_4
  2647. .L36:
  /* 1-wide remainder loop: one step per pass (AO += 1 * SIZE, BO += 4 * SIZE), */
  /* accumulating into xmm8/xmm9 only. */
  2648. shufps $0x44, %xmm0, %xmm0
  2649. mulpd %xmm0, %xmm2
  2650. mulpd %xmm0, %xmm3
  2651. movsd -15 * SIZE(AO), %xmm0
  2652. addpd %xmm2, %xmm8
  2653. movaps -12 * SIZE(BO), %xmm2
  2654. addpd %xmm3, %xmm9
  2655. movaps -10 * SIZE(BO), %xmm3
  2656. addq $1 * SIZE, AO
  2657. addq $4 * SIZE, BO
  2658. subq $1, %rax
  2659. BRANCH
  2660. jg .L36
  2661. ALIGN_4
  2662. .L38:
  /* Finish the 1-wide tile: merge the accumulators, subtract them from the */
  /* values held at BO (LN/LT) or AO (RN/RT), solve, store the 4 results to C */
  /* and back to BO/AO, then update the pointers and KK. */
  2663. #if defined(LN) || defined(RT)
  /* rax = (KK - 1) * SIZE for LN, (KK - 4) * SIZE for RT; */
  /* AO = AORIG + rax; BO = B + 4 * rax */
  2664. movq KK, %rax
  2665. #ifdef LN
  2666. subq $1, %rax
  2667. #else
  2668. subq $4, %rax
  2669. #endif
  2670. leaq (, %rax, SIZE), %rax
  2671. movq AORIG, AO
  2672. leaq (AO, %rax, 1), AO
  2673. leaq (B, %rax, 4), BO
  2674. #endif
  /* Merge the two accumulator pairs, then form (loaded values) minus */
  /* (accumulated products) in xmm12/xmm13. */
  2675. addpd %xmm10, %xmm8
  2676. addpd %xmm11, %xmm9
  2677. #if defined(LN) || defined(LT)
  2678. movaps -16 * SIZE(BO), %xmm12
  2679. movaps -14 * SIZE(BO), %xmm13
  2680. #else
  2681. movaps -16 * SIZE(AO), %xmm12
  2682. movaps -14 * SIZE(AO), %xmm13
  2683. #endif
  2684. subpd %xmm8, %xmm12
  2685. subpd %xmm9, %xmm13
  2686. #if defined(RN) || defined(RT)
  /* Spread the four values into the low doubles of xmm12..xmm15 for the */
  /* scalar substitution below; the last movsd copies xmm12 onto itself. */
  2687. movhlps %xmm13, %xmm15
  2688. movsd %xmm13, %xmm14
  2689. movhlps %xmm12, %xmm13
  2690. movsd %xmm12, %xmm12
  2691. #endif
  2692. #if defined(LN) || defined(LT)
  /* Single A entry: scale all four values by the scalar at AO offset -16. */
  2693. movddup -16 * SIZE(AO), %xmm8
  2694. mulpd %xmm8, %xmm12
  2695. mulpd %xmm8, %xmm13
  2696. #endif
  2697. #ifdef RN
  /* RN: scalar substitution on the 4x4 block at BO, from xmm12 (offset -16) to */
  /* xmm15 (offset -1). */
  2698. mulsd -16 * SIZE(BO), %xmm12
  2699. movlpd -15 * SIZE(BO), %xmm9
  2700. mulsd %xmm12, %xmm9
  2701. subsd %xmm9, %xmm13
  2702. movlpd -14 * SIZE(BO), %xmm10
  2703. mulsd %xmm12, %xmm10
  2704. subsd %xmm10, %xmm14
  2705. movlpd -13 * SIZE(BO), %xmm11
  2706. mulsd %xmm12, %xmm11
  2707. subsd %xmm11, %xmm15
  2708. mulsd -11 * SIZE(BO), %xmm13
  2709. movlpd -10 * SIZE(BO), %xmm9
  2710. mulsd %xmm13, %xmm9
  2711. subsd %xmm9, %xmm14
  2712. movlpd -9 * SIZE(BO), %xmm10
  2713. mulsd %xmm13, %xmm10
  2714. subsd %xmm10, %xmm15
  2715. mulsd -6 * SIZE(BO), %xmm14
  2716. movlpd -5 * SIZE(BO), %xmm9
  2717. mulsd %xmm14, %xmm9
  2718. subsd %xmm9, %xmm15
  2719. mulsd -1 * SIZE(BO), %xmm15
  2720. #endif
  2721. #ifdef RT
  /* RT: the reverse order, from xmm15 (offset -1) back to xmm12 (offset -16). */
  2722. mulsd -1 * SIZE(BO), %xmm15
  2723. movlpd -2 * SIZE(BO), %xmm9
  2724. mulsd %xmm15, %xmm9
  2725. subsd %xmm9, %xmm14
  2726. movlpd -3 * SIZE(BO), %xmm10
  2727. mulsd %xmm15, %xmm10
  2728. subsd %xmm10, %xmm13
  2729. movlpd -4 * SIZE(BO), %xmm11
  2730. mulsd %xmm15, %xmm11
  2731. subsd %xmm11, %xmm12
  2732. mulsd -6 * SIZE(BO), %xmm14
  2733. movlpd -7 * SIZE(BO), %xmm9
  2734. mulsd %xmm14, %xmm9
  2735. subsd %xmm9, %xmm13
  2736. movlpd -8 * SIZE(BO), %xmm10
  2737. mulsd %xmm14, %xmm10
  2738. subsd %xmm10, %xmm12
  2739. mulsd -11 * SIZE(BO), %xmm13
  2740. movlpd -12 * SIZE(BO), %xmm9
  2741. mulsd %xmm13, %xmm9
  2742. subsd %xmm9, %xmm12
  2743. mulsd -16 * SIZE(BO), %xmm12
  2744. #endif
  2745. #ifdef LN
  /* LN steps CO1/CO2 back by one element before storing. */
  2746. subq $1 * SIZE, CO1
  2747. subq $1 * SIZE, CO2
  2748. #endif
  /* Store one element into each of the four columns, and write the solved */
  /* values back to BO (LN/LT) or AO (RN/RT). */
  2749. #if defined(LN) || defined(LT)
  2750. movsd %xmm12, 0 * SIZE(CO1)
  2751. movhps %xmm12, 0 * SIZE(CO2)
  2752. movsd %xmm13, 0 * SIZE(CO1, LDC, 2)
  2753. movhps %xmm13, 0 * SIZE(CO2, LDC, 2)
  2754. movaps %xmm12, -16 * SIZE(BO)
  2755. movaps %xmm13, -14 * SIZE(BO)
  2756. #else
  2757. movsd %xmm12, 0 * SIZE(CO1)
  2758. movsd %xmm13, 0 * SIZE(CO2)
  2759. movsd %xmm14, 0 * SIZE(CO1, LDC, 2)
  2760. movsd %xmm15, 0 * SIZE(CO2, LDC, 2)
  2761. movsd %xmm12, -16 * SIZE(AO)
  2762. movsd %xmm13, -15 * SIZE(AO)
  2763. movsd %xmm14, -14 * SIZE(AO)
  2764. movsd %xmm15, -13 * SIZE(AO)
  2765. #endif
  2766. #ifndef LN
  2767. addq $1 * SIZE, CO1
  2768. addq $1 * SIZE, CO2
  2769. #endif
  2770. #if defined(LT) || defined(RN)
  /* AO += (K - KK) * SIZE; BO += 4 * (K - KK) * SIZE */
  2771. movq K, %rax
  2772. subq KK, %rax
  2773. leaq (,%rax, SIZE), %rax
  2774. leaq (AO, %rax, 1), AO
  2775. leaq (BO, %rax, 4), BO
  2776. #endif
  2777. #ifdef LN
  2778. subq $1, KK
  2779. #endif
  2780. #ifdef LT
  2781. addq $1, KK
  2782. #endif
  2783. #ifdef RT
  /* AORIG += K << BASE_SHIFT */
  2784. movq K, %rax
  2785. salq $BASE_SHIFT, %rax
  2786. addq %rax, AORIG
  2787. #endif
  2788. ALIGN_4
  2789. .L39:
  /* End of one pass of the J loop: update B and KK for the active variant, */
  /* then decrement J and branch back to .L01 while it stays positive. */
  2790. #ifdef LN
  /* B += 4 * K * SIZE */
  2791. leaq (, K, SIZE), %rax
  2792. leaq (B, %rax, 4), B
  2793. #endif
  2794. #if defined(LT) || defined(RN)
  /* B continues from wherever BO ended up. */
  2795. movq BO, B
  2796. #endif
  2797. #ifdef RN
  2798. addq $4, KK
  2799. #endif
  2800. #ifdef RT
  2801. subq $4, KK
  2802. #endif
  2803. subq $1, J
  2804. BRANCH
  2805. jg .L01
  2806. ALIGN_4
  2807. .L999:
  /* Common exit: reload the saved general-purpose registers from the stack */
  /* frame, release STACKSIZE bytes and return. */
  2808. movq 0(%rsp), %rbx
  2809. movq 8(%rsp), %rbp
  2810. movq 16(%rsp), %r12
  2811. movq 24(%rsp), %r13
  2812. movq 32(%rsp), %r14
  2813. movq 40(%rsp), %r15
  2814. #ifdef WINDOWS_ABI
  /* The Windows x64 ABI also treats rdi, rsi and xmm6..xmm15 as callee-saved. */
  /* movups is used, so the save slots need not be 16-byte aligned. */
  2815. movq 48(%rsp), %rdi
  2816. movq 56(%rsp), %rsi
  2817. movups 64(%rsp), %xmm6
  2818. movups 80(%rsp), %xmm7
  2819. movups 96(%rsp), %xmm8
  2820. movups 112(%rsp), %xmm9
  2821. movups 128(%rsp), %xmm10
  2822. movups 144(%rsp), %xmm11
  2823. movups 160(%rsp), %xmm12
  2824. movups 176(%rsp), %xmm13
  2825. movups 192(%rsp), %xmm14
  2826. movups 208(%rsp), %xmm15
  2827. #endif
  2828. addq $STACKSIZE, %rsp
  2829. ret
  2830. EPILOGUE