You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm3m_kernel_4x4_opteron.S 54 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define OLD_M 4 + STACK(%esi)
  42. #define OLD_N 8 + STACK(%esi)
  43. #define OLD_K 12 + STACK(%esi)
  44. #define OLD_ALPHA_R 16 + STACK(%esi)
  45. #define OLD_ALPHA_I 20 + STACK(%esi)
  46. #define OLD_A 24 + STACK(%esi)
  47. #define OLD_B 28 + STACK(%esi)
  48. #define OLD_C 32 + STACK(%esi)
  49. #define OLD_LDC 36 + STACK(%esi)
  50. #define ALPHA 0(%esp)
  51. #define K 16(%esp)
  52. #define N 20(%esp)
  53. #define M 24(%esp)
  54. #define A 28(%esp)
  55. #define C 32(%esp)
  56. #define J 36(%esp)
  57. #define BX 40(%esp)
  58. #define OLD_STACK 44(%esp)
  59. #define OFFSET 48(%esp)
  60. #define KK 52(%esp)
  61. #define KKK 56(%esp)
  62. #define BUFFER 128(%esp)
  63. #ifdef ATHLON
  64. #define PREFETCH prefetch
  65. #define PREFETCHSIZE 64
  66. #endif
  67. #if defined(OPTERON) || defined(BARCELONA)
  68. #define PREFETCH prefetch
  69. #define PREFETCHSIZE (16 * 10 + 8)
  70. #endif
  71. #ifdef PENTIUM4
  72. #define PREFETCH prefetcht0
  73. #define PREFETCHSIZE 96
  74. #endif
  75. #define AA %edx
  76. #define BB %ecx
  77. #define LDC %ebp
  78. #if defined(OPTERON) || defined(BARCELONA)
  79. #define movsd movlps
  80. #endif
  81. #if defined(OPTERON) || defined(BARCELONA)
  82. #define KERNEL1(address) \
  83. mulps %xmm0, %xmm2; \
  84. addps %xmm2, %xmm4; \
  85. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  86. movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  87. mulps %xmm0, %xmm2; \
  88. addps %xmm2, %xmm5; \
  89. movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  90. mulps %xmm0, %xmm2; \
  91. mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  92. addps %xmm2, %xmm6; \
  93. movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  94. addps %xmm0, %xmm7; \
  95. movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  96. #define KERNEL2(address) \
  97. mulps %xmm0, %xmm3; \
  98. addps %xmm3, %xmm4; \
  99. movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  100. mulps %xmm0, %xmm3; \
  101. addps %xmm3, %xmm5; \
  102. movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  103. mulps %xmm0, %xmm3; \
  104. mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  105. addps %xmm3, %xmm6; \
  106. movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  107. addps %xmm0, %xmm7; \
  108. movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  109. #define KERNEL3(address) \
  110. mulps %xmm0, %xmm2; \
  111. addps %xmm2, %xmm4; \
  112. movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  113. mulps %xmm0, %xmm2; \
  114. addps %xmm2, %xmm5; \
  115. movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  116. mulps %xmm0, %xmm2; \
  117. mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  118. addps %xmm2, %xmm6; \
  119. movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  120. addps %xmm0, %xmm7; \
  121. movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  122. #define KERNEL4(address) \
  123. mulps %xmm0, %xmm3; \
  124. addps %xmm3, %xmm4; \
  125. movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  126. mulps %xmm0, %xmm3; \
  127. addps %xmm3, %xmm5; \
  128. movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  129. mulps %xmm0, %xmm3; \
  130. mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  131. addps %xmm3, %xmm6; \
  132. movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  133. addps %xmm0, %xmm7; \
  134. movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  135. #define KERNEL5(address) \
  136. PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \
  137. mulps %xmm1, %xmm2; \
  138. addps %xmm2, %xmm4; \
  139. movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  140. mulps %xmm1, %xmm2; \
  141. addps %xmm2, %xmm5; \
  142. movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  143. mulps %xmm1, %xmm2; \
  144. mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  145. addps %xmm2, %xmm6; \
  146. movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  147. addps %xmm1, %xmm7; \
  148. movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  149. #define KERNEL6(address) \
  150. mulps %xmm1, %xmm3; \
  151. addps %xmm3, %xmm4; \
  152. movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  153. mulps %xmm1, %xmm3; \
  154. addps %xmm3, %xmm5; \
  155. movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  156. mulps %xmm1, %xmm3; \
  157. mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  158. addps %xmm3, %xmm6; \
  159. movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  160. addps %xmm1, %xmm7; \
  161. movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  162. #define KERNEL7(address) \
  163. mulps %xmm1, %xmm2; \
  164. addps %xmm2, %xmm4; \
  165. movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  166. mulps %xmm1, %xmm2; \
  167. addps %xmm2, %xmm5; \
  168. movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  169. mulps %xmm1, %xmm2; \
  170. mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  171. addps %xmm2, %xmm6; \
  172. movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  173. addps %xmm1, %xmm7; \
  174. movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  175. #define KERNEL8(address) \
  176. mulps %xmm1, %xmm3; \
  177. addps %xmm3, %xmm4; \
  178. movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  179. mulps %xmm1, %xmm3; \
  180. addps %xmm3, %xmm5; \
  181. movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  182. mulps %xmm1, %xmm3; \
  183. mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  184. addps %xmm3, %xmm6; \
  185. movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  186. addps %xmm1, %xmm7; \
  187. movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
  188. #endif
  189. #ifdef PENTIUM4
  190. #define KERNEL1(address) \
  191. mulps %xmm0, %xmm2; \
  192. addps %xmm2, %xmm4; \
  193. movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  194. mulps %xmm0, %xmm2; \
  195. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  196. addps %xmm2, %xmm5; \
  197. movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  198. mulps %xmm0, %xmm2; \
  199. mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  200. addps %xmm2, %xmm6; \
  201. movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  202. addps %xmm0, %xmm7; \
  203. movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  204. #define KERNEL2(address) \
  205. mulps %xmm0, %xmm3; \
  206. addps %xmm3, %xmm4; \
  207. movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  208. mulps %xmm0, %xmm3; \
  209. addps %xmm3, %xmm5; \
  210. movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  211. mulps %xmm0, %xmm3; \
  212. mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  213. addps %xmm3, %xmm6; \
  214. movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  215. addps %xmm0, %xmm7; \
  216. movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  217. #define KERNEL3(address) \
  218. mulps %xmm0, %xmm2; \
  219. addps %xmm2, %xmm4; \
  220. movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  221. mulps %xmm0, %xmm2; \
  222. addps %xmm2, %xmm5; \
  223. movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  224. mulps %xmm0, %xmm2; \
  225. mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  226. addps %xmm2, %xmm6; \
  227. movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  228. addps %xmm0, %xmm7; \
  229. movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  230. #define KERNEL4(address) \
  231. mulps %xmm0, %xmm3; \
  232. addps %xmm3, %xmm4; \
  233. movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  234. mulps %xmm0, %xmm3; \
  235. addps %xmm3, %xmm5; \
  236. movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  237. mulps %xmm0, %xmm3; \
  238. mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  239. addps %xmm3, %xmm6; \
  240. movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  241. addps %xmm0, %xmm7; \
  242. movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  243. #define KERNEL5(address) \
  244. mulps %xmm1, %xmm2; \
  245. addps %xmm2, %xmm4; \
  246. movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  247. mulps %xmm1, %xmm2; \
  248. addps %xmm2, %xmm5; \
  249. movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  250. mulps %xmm1, %xmm2; \
  251. mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  252. addps %xmm2, %xmm6; \
  253. movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  254. addps %xmm1, %xmm7; \
  255. movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  256. #define KERNEL6(address) \
  257. mulps %xmm1, %xmm3; \
  258. addps %xmm3, %xmm4; \
  259. movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  260. mulps %xmm1, %xmm3; \
  261. addps %xmm3, %xmm5; \
  262. movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  263. mulps %xmm1, %xmm3; \
  264. mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  265. addps %xmm3, %xmm6; \
  266. movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  267. addps %xmm1, %xmm7; \
  268. movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  269. #define KERNEL7(address) \
  270. mulps %xmm1, %xmm2; \
  271. addps %xmm2, %xmm4; \
  272. movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  273. mulps %xmm1, %xmm2; \
  274. addps %xmm2, %xmm5; \
  275. movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  276. mulps %xmm1, %xmm2; \
  277. mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  278. addps %xmm2, %xmm6; \
  279. movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  280. addps %xmm1, %xmm7; \
  281. movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  282. #define KERNEL8(address) \
  283. mulps %xmm1, %xmm3; \
  284. addps %xmm3, %xmm4; \
  285. movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  286. mulps %xmm1, %xmm3; \
  287. addps %xmm3, %xmm5; \
  288. movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  289. mulps %xmm1, %xmm3; \
  290. mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  291. addps %xmm3, %xmm6; \
  292. movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  293. addps %xmm1, %xmm7; \
  294. movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  295. #endif
  296. PROLOGUE
  297. pushl %ebp
  298. pushl %edi
  299. pushl %esi
  300. pushl %ebx
  301. PROFCODE
  302. EMMS
  303. movl %esp, %esi # save old stack
  304. subl $128 + LOCAL_BUFFER_SIZE, %esp
  305. movl OLD_M, %ebx
  306. andl $-1024, %esp # align stack
  307. STACK_TOUCHING
  308. movl OLD_N, %eax
  309. movl OLD_K, %ecx
  310. movl OLD_A, %edx
  311. movss OLD_ALPHA_R, %xmm0
  312. movss OLD_ALPHA_I, %xmm1
  313. movl %ebx, M
  314. movl %eax, N
  315. movl %ecx, K
  316. movl %edx, A
  317. movl %esi, OLD_STACK
  318. movl OLD_B, %edi
  319. movl OLD_C, %ebx
  320. unpcklps %xmm1, %xmm0
  321. movlhps %xmm0, %xmm0
  322. movaps %xmm0, ALPHA
  323. movl %ebx, C
  324. movl OLD_LDC, LDC
  325. sall $ZBASE_SHIFT, LDC
  326. sarl $2, %eax
  327. movl %eax, J
  328. jle .L40
  329. .L01:
  330. #if defined(TRMMKERNEL) && defined(LEFT)
  331. movl OFFSET, %eax
  332. movl %eax, KK
  333. #endif
  334. /* Copying to Sub Buffer */
  335. leal BUFFER, %ecx
  336. movl K, %eax
  337. sarl $1, %eax
  338. jle .L05
  339. ALIGN_4
  340. .L02:
  341. #ifdef HAVE_SSE2
  342. movss 0 * SIZE(%edi), %xmm0
  343. movss 1 * SIZE(%edi), %xmm1
  344. movss 2 * SIZE(%edi), %xmm2
  345. movss 3 * SIZE(%edi), %xmm3
  346. movss 4 * SIZE(%edi), %xmm4
  347. movss 5 * SIZE(%edi), %xmm5
  348. movss 6 * SIZE(%edi), %xmm6
  349. movss 7 * SIZE(%edi), %xmm7
  350. shufps $0, %xmm0, %xmm0
  351. shufps $0, %xmm1, %xmm1
  352. shufps $0, %xmm2, %xmm2
  353. shufps $0, %xmm3, %xmm3
  354. shufps $0, %xmm4, %xmm4
  355. shufps $0, %xmm5, %xmm5
  356. shufps $0, %xmm6, %xmm6
  357. shufps $0, %xmm7, %xmm7
  358. movaps %xmm0, 0 * SIZE(%ecx)
  359. movaps %xmm1, 4 * SIZE(%ecx)
  360. movaps %xmm2, 8 * SIZE(%ecx)
  361. movaps %xmm3, 12 * SIZE(%ecx)
  362. movaps %xmm4, 16 * SIZE(%ecx)
  363. movaps %xmm5, 20 * SIZE(%ecx)
  364. movaps %xmm6, 24 * SIZE(%ecx)
  365. movaps %xmm7, 28 * SIZE(%ecx)
  366. #else
  367. movd 0 * SIZE(%edi), %mm0
  368. movd 1 * SIZE(%edi), %mm1
  369. movd 2 * SIZE(%edi), %mm2
  370. movd 3 * SIZE(%edi), %mm3
  371. movd 4 * SIZE(%edi), %mm4
  372. movd 5 * SIZE(%edi), %mm5
  373. movd 6 * SIZE(%edi), %mm6
  374. movd 7 * SIZE(%edi), %mm7
  375. movd %mm0, 0 * SIZE(%ecx)
  376. movd %mm0, 1 * SIZE(%ecx)
  377. movd %mm0, 2 * SIZE(%ecx)
  378. movd %mm0, 3 * SIZE(%ecx)
  379. movd %mm1, 4 * SIZE(%ecx)
  380. movd %mm1, 5 * SIZE(%ecx)
  381. movd %mm1, 6 * SIZE(%ecx)
  382. movd %mm1, 7 * SIZE(%ecx)
  383. movd %mm2, 8 * SIZE(%ecx)
  384. movd %mm2, 9 * SIZE(%ecx)
  385. movd %mm2, 10 * SIZE(%ecx)
  386. movd %mm2, 11 * SIZE(%ecx)
  387. movd %mm3, 12 * SIZE(%ecx)
  388. movd %mm3, 13 * SIZE(%ecx)
  389. movd %mm3, 14 * SIZE(%ecx)
  390. movd %mm3, 15 * SIZE(%ecx)
  391. movd %mm4, 16 * SIZE(%ecx)
  392. movd %mm4, 17 * SIZE(%ecx)
  393. movd %mm4, 18 * SIZE(%ecx)
  394. movd %mm4, 19 * SIZE(%ecx)
  395. movd %mm5, 20 * SIZE(%ecx)
  396. movd %mm5, 21 * SIZE(%ecx)
  397. movd %mm5, 22 * SIZE(%ecx)
  398. movd %mm5, 23 * SIZE(%ecx)
  399. movd %mm6, 24 * SIZE(%ecx)
  400. movd %mm6, 25 * SIZE(%ecx)
  401. movd %mm6, 26 * SIZE(%ecx)
  402. movd %mm6, 27 * SIZE(%ecx)
  403. movd %mm7, 28 * SIZE(%ecx)
  404. movd %mm7, 29 * SIZE(%ecx)
  405. movd %mm7, 30 * SIZE(%ecx)
  406. movd %mm7, 31 * SIZE(%ecx)
  407. #endif
  408. #ifdef PENTIUM4
  409. prefetcht2 112 * SIZE(%ecx)
  410. #endif
  411. #if defined(OPTERON) || defined(BARCELONA)
  412. prefetchnta 80 * SIZE(%edi)
  413. prefetchw 112 * SIZE(%ecx)
  414. prefetchw 120 * SIZE(%ecx)
  415. #endif
  416. addl $ 8 * SIZE, %edi
  417. addl $32 * SIZE, %ecx
  418. decl %eax
  419. jne .L02
  420. ALIGN_2
  421. .L05:
  422. movl K, %eax
  423. andl $1, %eax
  424. BRANCH
  425. jle .L10
  426. #ifdef HAVE_SSE2
  427. movss 0 * SIZE(%edi), %xmm0
  428. movss 1 * SIZE(%edi), %xmm1
  429. movss 2 * SIZE(%edi), %xmm2
  430. movss 3 * SIZE(%edi), %xmm3
  431. shufps $0, %xmm0, %xmm0
  432. shufps $0, %xmm1, %xmm1
  433. shufps $0, %xmm2, %xmm2
  434. shufps $0, %xmm3, %xmm3
  435. movaps %xmm0, 0 * SIZE(%ecx)
  436. movaps %xmm1, 4 * SIZE(%ecx)
  437. movaps %xmm2, 8 * SIZE(%ecx)
  438. movaps %xmm3, 12 * SIZE(%ecx)
  439. #else
  440. movd 0 * SIZE(%edi), %mm0
  441. movd 1 * SIZE(%edi), %mm1
  442. movd 2 * SIZE(%edi), %mm2
  443. movd 3 * SIZE(%edi), %mm3
  444. movd %mm0, 0 * SIZE(%ecx)
  445. movd %mm0, 1 * SIZE(%ecx)
  446. movd %mm0, 2 * SIZE(%ecx)
  447. movd %mm0, 3 * SIZE(%ecx)
  448. movd %mm1, 4 * SIZE(%ecx)
  449. movd %mm1, 5 * SIZE(%ecx)
  450. movd %mm1, 6 * SIZE(%ecx)
  451. movd %mm1, 7 * SIZE(%ecx)
  452. movd %mm2, 8 * SIZE(%ecx)
  453. movd %mm2, 9 * SIZE(%ecx)
  454. movd %mm2, 10 * SIZE(%ecx)
  455. movd %mm2, 11 * SIZE(%ecx)
  456. movd %mm3, 12 * SIZE(%ecx)
  457. movd %mm3, 13 * SIZE(%ecx)
  458. movd %mm3, 14 * SIZE(%ecx)
  459. movd %mm3, 15 * SIZE(%ecx)
  460. #endif
  461. addl $4 * SIZE, %edi
  462. ALIGN_4
  463. .L10:
  464. movl %edi, BX
  465. movl C, %esi # coffset = c
  466. movl A, %edx # aoffset = a
  467. movl M, %ebx
  468. sarl $2, %ebx # i = (m >> 2)
  469. jle .L20
  470. ALIGN_4
  471. .L11:
  472. #if !defined(TRMMKERNEL) || \
  473. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  474. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  475. leal BUFFER, BB # boffset1 = boffset
  476. #else
  477. leal BUFFER, BB # boffset1 = boffset
  478. movl KK, %eax
  479. leal (, %eax, 8), %eax
  480. leal (AA, %eax, 2), AA
  481. leal (BB, %eax, 8), BB
  482. #endif
  483. movl BX, %eax
  484. #ifdef HAVE_SSE
  485. prefetcht2 0 * SIZE(%eax)
  486. prefetcht2 4 * SIZE(%eax)
  487. #if L2_SIZE > 262144
  488. subl $-8 * SIZE, BX
  489. #elif L2_SIZE > 131072
  490. prefetcht2 8 * SIZE(%eax)
  491. prefetcht2 12 * SIZE(%eax)
  492. subl $-16 * SIZE, BX
  493. #else
  494. prefetcht2 16 * SIZE(%eax)
  495. prefetcht2 20 * SIZE(%eax)
  496. prefetcht2 24 * SIZE(%eax)
  497. prefetcht2 28 * SIZE(%eax)
  498. subl $-32 * SIZE, BX
  499. #endif
  500. #endif
  501. movaps 0 * SIZE(AA), %xmm0
  502. pxor %xmm4, %xmm4
  503. movaps 16 * SIZE(AA), %xmm1
  504. pxor %xmm5, %xmm5
  505. movaps 0 * SIZE(BB), %xmm2
  506. pxor %xmm6, %xmm6
  507. movaps 16 * SIZE(BB), %xmm3
  508. pxor %xmm7, %xmm7
  509. leal (LDC, LDC, 2), %eax
  510. #if defined(OPTERON) || defined(BARCELONA)
  511. prefetchw 4 * SIZE(%esi)
  512. prefetchw 4 * SIZE(%esi, LDC)
  513. prefetchw 4 * SIZE(%esi, LDC, 2)
  514. prefetchw 4 * SIZE(%esi, %eax)
  515. #endif
  516. #ifdef PENTIUM4
  517. prefetchnta 4 * SIZE(%esi)
  518. prefetchnta 4 * SIZE(%esi, LDC)
  519. prefetchnta 4 * SIZE(%esi, LDC, 2)
  520. prefetchnta 4 * SIZE(%esi, %eax)
  521. #endif
  522. #ifndef TRMMKERNEL
  523. movl K, %eax
  524. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  525. movl K, %eax
  526. subl KK, %eax
  527. movl %eax, KKK
  528. #else
  529. movl KK, %eax
  530. #ifdef LEFT
  531. addl $4, %eax
  532. #else
  533. addl $4, %eax
  534. #endif
  535. movl %eax, KKK
  536. #endif
  537. #if 1
  538. andl $-8, %eax
  539. sall $4, %eax
  540. je .L15
  541. .L1X:
  542. KERNEL1(32 * 0)
  543. KERNEL2(32 * 0)
  544. KERNEL3(32 * 0)
  545. KERNEL4(32 * 0)
  546. KERNEL5(32 * 0)
  547. KERNEL6(32 * 0)
  548. KERNEL7(32 * 0)
  549. KERNEL8(32 * 0)
  550. cmpl $128 * 1, %eax
  551. jle .L12
  552. KERNEL1(32 * 1)
  553. KERNEL2(32 * 1)
  554. KERNEL3(32 * 1)
  555. KERNEL4(32 * 1)
  556. KERNEL5(32 * 1)
  557. KERNEL6(32 * 1)
  558. KERNEL7(32 * 1)
  559. KERNEL8(32 * 1)
  560. cmpl $128 * 2, %eax
  561. jle .L12
  562. KERNEL1(32 * 2)
  563. KERNEL2(32 * 2)
  564. KERNEL3(32 * 2)
  565. KERNEL4(32 * 2)
  566. KERNEL5(32 * 2)
  567. KERNEL6(32 * 2)
  568. KERNEL7(32 * 2)
  569. KERNEL8(32 * 2)
  570. cmpl $128 * 3, %eax
  571. jle .L12
  572. KERNEL1(32 * 3)
  573. KERNEL2(32 * 3)
  574. KERNEL3(32 * 3)
  575. KERNEL4(32 * 3)
  576. KERNEL5(32 * 3)
  577. KERNEL6(32 * 3)
  578. KERNEL7(32 * 3)
  579. KERNEL8(32 * 3)
  580. cmpl $128 * 4, %eax
  581. jle .L12
  582. KERNEL1(32 * 4)
  583. KERNEL2(32 * 4)
  584. KERNEL3(32 * 4)
  585. KERNEL4(32 * 4)
  586. KERNEL5(32 * 4)
  587. KERNEL6(32 * 4)
  588. KERNEL7(32 * 4)
  589. KERNEL8(32 * 4)
  590. cmpl $128 * 5, %eax
  591. jle .L12
  592. KERNEL1(32 * 5)
  593. KERNEL2(32 * 5)
  594. KERNEL3(32 * 5)
  595. KERNEL4(32 * 5)
  596. KERNEL5(32 * 5)
  597. KERNEL6(32 * 5)
  598. KERNEL7(32 * 5)
  599. KERNEL8(32 * 5)
  600. cmpl $128 * 6, %eax
  601. jle .L12
  602. KERNEL1(32 * 6)
  603. KERNEL2(32 * 6)
  604. KERNEL3(32 * 6)
  605. KERNEL4(32 * 6)
  606. KERNEL5(32 * 6)
  607. KERNEL6(32 * 6)
  608. KERNEL7(32 * 6)
  609. KERNEL8(32 * 6)
  610. cmpl $128 * 7, %eax
  611. jle .L12
  612. KERNEL1(32 * 7)
  613. KERNEL2(32 * 7)
  614. KERNEL3(32 * 7)
  615. KERNEL4(32 * 7)
  616. KERNEL5(32 * 7)
  617. KERNEL6(32 * 7)
  618. KERNEL7(32 * 7)
  619. KERNEL8(32 * 7)
  620. addl $128 * 8 * SIZE, BB
  621. addl $128 * 2 * SIZE, AA
  622. subl $128 * 8, %eax
  623. jg .L1X
  624. jmp .L15
  625. .L12:
  626. leal (AA, %eax, 1), AA
  627. leal (BB, %eax, 4), BB
  628. ALIGN_4
  629. #else
  630. sarl $3, %eax
  631. je .L15
  632. ALIGN_4
  633. .L12:
  634. KERNEL1(32 * 0)
  635. KERNEL2(32 * 0)
  636. KERNEL3(32 * 0)
  637. KERNEL4(32 * 0)
  638. KERNEL5(32 * 0)
  639. KERNEL6(32 * 0)
  640. KERNEL7(32 * 0)
  641. KERNEL8(32 * 0)
  642. addl $128 * SIZE, BB
  643. addl $32 * SIZE, AA
  644. decl %eax
  645. jne .L12
  646. ALIGN_4
  647. #endif
  648. .L15:
  649. #ifndef TRMMKERNEL
  650. movl K, %eax
  651. #else
  652. movl KKK, %eax
  653. #endif
  654. movaps ALPHA, %xmm3
  655. andl $7, %eax # if (k & 1)
  656. BRANCH
  657. je .L18
  658. ALIGN_4
  659. .L16:
  660. mulps %xmm0, %xmm2
  661. addps %xmm2, %xmm4
  662. movaps 4 * SIZE(BB), %xmm2
  663. mulps %xmm0, %xmm2
  664. addps %xmm2, %xmm5
  665. movaps 8 * SIZE(BB), %xmm2
  666. mulps %xmm0, %xmm2
  667. mulps 12 * SIZE(BB), %xmm0
  668. addps %xmm2, %xmm6
  669. movaps 16 * SIZE(BB), %xmm2
  670. addps %xmm0, %xmm7
  671. movaps 4 * SIZE(AA), %xmm0
  672. addl $ 4 * SIZE, AA
  673. addl $16 * SIZE, BB
  674. decl %eax
  675. jg .L16
  676. ALIGN_4
  677. .L18:
  678. leal (LDC, LDC, 2), %eax
  679. movsd 0 * SIZE(%esi), %xmm0
  680. movhps 2 * SIZE(%esi), %xmm0
  681. movsd 4 * SIZE(%esi), %xmm1
  682. movhps 6 * SIZE(%esi), %xmm1
  683. pshufd $0x50, %xmm4, %xmm2
  684. pshufd $0xfa, %xmm4, %xmm4
  685. mulps %xmm3, %xmm2
  686. mulps %xmm3, %xmm4
  687. addps %xmm2, %xmm0
  688. addps %xmm4, %xmm1
  689. movlps %xmm0, 0 * SIZE(%esi)
  690. movhps %xmm0, 2 * SIZE(%esi)
  691. movlps %xmm1, 4 * SIZE(%esi)
  692. movhps %xmm1, 6 * SIZE(%esi)
  693. movsd 0 * SIZE(%esi, LDC), %xmm0
  694. movhps 2 * SIZE(%esi, LDC), %xmm0
  695. movsd 4 * SIZE(%esi, LDC), %xmm1
  696. movhps 6 * SIZE(%esi, LDC), %xmm1
  697. pshufd $0x50, %xmm5, %xmm2
  698. pshufd $0xfa, %xmm5, %xmm5
  699. mulps %xmm3, %xmm2
  700. mulps %xmm3, %xmm5
  701. addps %xmm2, %xmm0
  702. addps %xmm5, %xmm1
  703. movlps %xmm0, 0 * SIZE(%esi, LDC)
  704. movhps %xmm0, 2 * SIZE(%esi, LDC)
  705. movlps %xmm1, 4 * SIZE(%esi, LDC)
  706. movhps %xmm1, 6 * SIZE(%esi, LDC)
  707. movsd 0 * SIZE(%esi, LDC, 2), %xmm0
  708. movhps 2 * SIZE(%esi, LDC, 2), %xmm0
  709. movsd 4 * SIZE(%esi, LDC, 2), %xmm1
  710. movhps 6 * SIZE(%esi, LDC, 2), %xmm1
  711. pshufd $0x50, %xmm6, %xmm2
  712. pshufd $0xfa, %xmm6, %xmm6
  713. mulps %xmm3, %xmm2
  714. mulps %xmm3, %xmm6
  715. addps %xmm2, %xmm0
  716. addps %xmm6, %xmm1
  717. movlps %xmm0, 0 * SIZE(%esi, LDC, 2)
  718. movhps %xmm0, 2 * SIZE(%esi, LDC, 2)
  719. movlps %xmm1, 4 * SIZE(%esi, LDC, 2)
  720. movhps %xmm1, 6 * SIZE(%esi, LDC, 2)
  721. movsd 0 * SIZE(%esi, %eax), %xmm0
  722. movhps 2 * SIZE(%esi, %eax), %xmm0
  723. movsd 4 * SIZE(%esi, %eax), %xmm1
  724. movhps 6 * SIZE(%esi, %eax), %xmm1
  725. pshufd $0x50, %xmm7, %xmm2
  726. pshufd $0xfa, %xmm7, %xmm7
  727. mulps %xmm3, %xmm2
  728. mulps %xmm3, %xmm7
  729. addps %xmm2, %xmm0
  730. addps %xmm7, %xmm1
  731. movlps %xmm0, 0 * SIZE(%esi, %eax)
  732. movhps %xmm0, 2 * SIZE(%esi, %eax)
  733. movlps %xmm1, 4 * SIZE(%esi, %eax)
  734. movhps %xmm1, 6 * SIZE(%esi, %eax)
  735. addl $8 * SIZE, %esi # coffset += 2
  736. decl %ebx # i --
  737. jg .L11
  738. ALIGN_4
  739. .L20:
  740. testl $2, M
  741. je .L30
  742. #if !defined(TRMMKERNEL) || \
  743. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  744. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  745. leal BUFFER, BB # boffset1 = boffset
  746. #else
  747. leal BUFFER, BB # boffset1 = boffset
  748. movl KK, %eax
  749. leal (, %eax, 8), %eax
  750. leal (AA, %eax, 1), AA
  751. leal (BB, %eax, 8), BB
  752. #endif
  753. movsd 0 * SIZE(AA), %xmm0
  754. pxor %xmm4, %xmm4
  755. movsd 8 * SIZE(AA), %xmm1
  756. pxor %xmm5, %xmm5
  757. movsd 0 * SIZE(BB), %xmm2
  758. pxor %xmm6, %xmm6
  759. movsd 16 * SIZE(BB), %xmm3
  760. pxor %xmm7, %xmm7
  761. #ifndef TRMMKERNEL
  762. movl K, %eax
  763. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  764. movl K, %eax
  765. subl KK, %eax
  766. movl %eax, KKK
  767. #else
  768. movl KK, %eax
  769. #ifdef LEFT
  770. addl $2, %eax
  771. #else
  772. addl $4, %eax
  773. #endif
  774. movl %eax, KKK
  775. #endif
  776. sarl $3, %eax
  777. je .L25
  778. ALIGN_4
  779. .L22:
  780. mulps %xmm0, %xmm2
  781. addps %xmm2, %xmm4
  782. #if defined(OPTERON) || defined(BARCELONA)
  783. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  784. #endif
  785. movsd 4 * SIZE(BB), %xmm2
  786. mulps %xmm0, %xmm2
  787. addps %xmm2, %xmm5
  788. movsd 8 * SIZE(BB), %xmm2
  789. mulps %xmm0, %xmm2
  790. addps %xmm2, %xmm6
  791. movsd 12 * SIZE(BB), %xmm2
  792. mulps %xmm0, %xmm2
  793. movsd 2 * SIZE(AA), %xmm0
  794. addps %xmm2, %xmm7
  795. movsd 32 * SIZE(BB), %xmm2
  796. mulps %xmm0, %xmm3
  797. addps %xmm3, %xmm4
  798. movsd 20 * SIZE(BB), %xmm3
  799. mulps %xmm0, %xmm3
  800. addps %xmm3, %xmm5
  801. movsd 24 * SIZE(BB), %xmm3
  802. mulps %xmm0, %xmm3
  803. addps %xmm3, %xmm6
  804. movsd 28 * SIZE(BB), %xmm3
  805. mulps %xmm0, %xmm3
  806. movsd 4 * SIZE(AA), %xmm0
  807. addps %xmm3, %xmm7
  808. movsd 48 * SIZE(BB), %xmm3
  809. mulps %xmm0, %xmm2
  810. addps %xmm2, %xmm4
  811. movsd 36 * SIZE(BB), %xmm2
  812. mulps %xmm0, %xmm2
  813. addps %xmm2, %xmm5
  814. movsd 40 * SIZE(BB), %xmm2
  815. mulps %xmm0, %xmm2
  816. addps %xmm2, %xmm6
  817. movsd 44 * SIZE(BB), %xmm2
  818. mulps %xmm0, %xmm2
  819. movsd 6 * SIZE(AA), %xmm0
  820. addps %xmm2, %xmm7
  821. movsd 64 * SIZE(BB), %xmm2
  822. mulps %xmm0, %xmm3
  823. addps %xmm3, %xmm4
  824. movsd 52 * SIZE(BB), %xmm3
  825. mulps %xmm0, %xmm3
  826. addps %xmm3, %xmm5
  827. movsd 56 * SIZE(BB), %xmm3
  828. mulps %xmm0, %xmm3
  829. addps %xmm3, %xmm6
  830. movsd 60 * SIZE(BB), %xmm3
  831. mulps %xmm0, %xmm3
  832. movsd 16 * SIZE(AA), %xmm0
  833. addps %xmm3, %xmm7
  834. movsd 80 * SIZE(BB), %xmm3
  835. mulps %xmm1, %xmm2
  836. addps %xmm2, %xmm4
  837. movsd 68 * SIZE(BB), %xmm2
  838. mulps %xmm1, %xmm2
  839. addps %xmm2, %xmm5
  840. movsd 72 * SIZE(BB), %xmm2
  841. mulps %xmm1, %xmm2
  842. addps %xmm2, %xmm6
  843. movsd 76 * SIZE(BB), %xmm2
  844. mulps %xmm1, %xmm2
  845. movsd 10 * SIZE(AA), %xmm1
  846. addps %xmm2, %xmm7
  847. movsd 96 * SIZE(BB), %xmm2
  848. mulps %xmm1, %xmm3
  849. addps %xmm3, %xmm4
  850. movsd 84 * SIZE(BB), %xmm3
  851. mulps %xmm1, %xmm3
  852. addps %xmm3, %xmm5
  853. movsd 88 * SIZE(BB), %xmm3
  854. mulps %xmm1, %xmm3
  855. addps %xmm3, %xmm6
  856. movsd 92 * SIZE(BB), %xmm3
  857. mulps %xmm1, %xmm3
  858. movsd 12 * SIZE(AA), %xmm1
  859. addps %xmm3, %xmm7
  860. movsd 112 * SIZE(BB), %xmm3
  861. mulps %xmm1, %xmm2
  862. addps %xmm2, %xmm4
  863. movsd 100 * SIZE(BB), %xmm2
  864. mulps %xmm1, %xmm2
  865. addps %xmm2, %xmm5
  866. movsd 104 * SIZE(BB), %xmm2
  867. mulps %xmm1, %xmm2
  868. addps %xmm2, %xmm6
  869. movsd 108 * SIZE(BB), %xmm2
  870. mulps %xmm1, %xmm2
  871. movsd 14 * SIZE(AA), %xmm1
  872. addps %xmm2, %xmm7
  873. movsd 128 * SIZE(BB), %xmm2
  874. mulps %xmm1, %xmm3
  875. addps %xmm3, %xmm4
  876. movsd 116 * SIZE(BB), %xmm3
  877. mulps %xmm1, %xmm3
  878. addps %xmm3, %xmm5
  879. movsd 120 * SIZE(BB), %xmm3
  880. mulps %xmm1, %xmm3
  881. addps %xmm3, %xmm6
  882. movsd 124 * SIZE(BB), %xmm3
  883. mulps %xmm1, %xmm3
  884. movsd 24 * SIZE(AA), %xmm1
  885. addps %xmm3, %xmm7
  886. movsd 144 * SIZE(BB), %xmm3
  887. addl $ 16 * SIZE, AA
  888. addl $128 * SIZE, BB
  889. decl %eax
  890. jne .L22
  891. ALIGN_4
  892. .L25:
  893. #ifndef TRMMKERNEL
  894. movl K, %eax
  895. #else
  896. movl KKK, %eax
  897. #endif
  898. movaps ALPHA, %xmm3
  899. andl $7, %eax # if (k & 1)
  900. BRANCH
  901. je .L28
  902. ALIGN_4
  903. .L26:
  904. mulps %xmm0, %xmm2
  905. addps %xmm2, %xmm4
  906. movsd 4 * SIZE(BB), %xmm2
  907. mulps %xmm0, %xmm2
  908. addps %xmm2, %xmm5
  909. movsd 8 * SIZE(BB), %xmm2
  910. mulps %xmm0, %xmm2
  911. addps %xmm2, %xmm6
  912. movsd 12 * SIZE(BB), %xmm2
  913. mulps %xmm0, %xmm2
  914. movsd 2 * SIZE(AA), %xmm0
  915. addps %xmm2, %xmm7
  916. movsd 16 * SIZE(BB), %xmm2
  917. addl $ 2 * SIZE, AA
  918. addl $16 * SIZE, BB
  919. decl %eax
  920. jg .L26
  921. ALIGN_4
  922. .L28:
  923. leal (LDC, LDC, 2), %eax
  924. movsd 0 * SIZE(%esi), %xmm0
  925. movhps 2 * SIZE(%esi), %xmm0
  926. movsd 0 * SIZE(%esi, LDC), %xmm1
  927. movhps 2 * SIZE(%esi, LDC), %xmm1
  928. shufps $0x50, %xmm4, %xmm4
  929. shufps $0x50, %xmm5, %xmm5
  930. mulps %xmm3, %xmm4
  931. mulps %xmm3, %xmm5
  932. addps %xmm4, %xmm0
  933. addps %xmm5, %xmm1
  934. movlps %xmm0, 0 * SIZE(%esi)
  935. movhps %xmm0, 2 * SIZE(%esi)
  936. movlps %xmm1, 0 * SIZE(%esi, LDC)
  937. movhps %xmm1, 2 * SIZE(%esi, LDC)
  938. movsd 0 * SIZE(%esi, LDC, 2), %xmm0
  939. movhps 2 * SIZE(%esi, LDC, 2), %xmm0
  940. movsd 0 * SIZE(%esi, %eax), %xmm1
  941. movhps 2 * SIZE(%esi, %eax), %xmm1
  942. shufps $0x50, %xmm6, %xmm6
  943. shufps $0x50, %xmm7, %xmm7
  944. mulps %xmm3, %xmm6
  945. mulps %xmm3, %xmm7
  946. addps %xmm6, %xmm0
  947. addps %xmm7, %xmm1
  948. movlps %xmm0, 0 * SIZE(%esi, LDC, 2)
  949. movhps %xmm0, 2 * SIZE(%esi, LDC, 2)
  950. movlps %xmm1, 0 * SIZE(%esi, %eax)
  951. movhps %xmm1, 2 * SIZE(%esi, %eax)
  952. addl $4 * SIZE, %esi # coffset += 2
  953. ALIGN_4
  954. .L30:
  955. testl $1, M
  956. je .L39
  957. #if !defined(TRMMKERNEL) || \
  958. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  959. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  960. leal BUFFER, BB # boffset1 = boffset
  961. #else
  962. leal BUFFER, BB # boffset1 = boffset
  963. movl KK, %eax
  964. leal (, %eax, 4), %eax
  965. leal (AA, %eax, 1), AA
  966. leal (BB, %eax, 8), BB
  967. leal (BB, %eax, 8), BB
  968. #endif
  969. movss 0 * SIZE(AA), %xmm0
  970. pxor %xmm4, %xmm4
  971. movss 4 * SIZE(AA), %xmm1
  972. pxor %xmm5, %xmm5
  973. movss 0 * SIZE(BB), %xmm2
  974. pxor %xmm6, %xmm6
  975. movss 16 * SIZE(BB), %xmm3
  976. pxor %xmm7, %xmm7
  977. #ifndef TRMMKERNEL
  978. movl K, %eax
  979. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  980. movl K, %eax
  981. subl KK, %eax
  982. movl %eax, KKK
  983. #else
  984. movl KK, %eax
  985. #ifdef LEFT
  986. addl $1, %eax
  987. #else
  988. addl $4, %eax
  989. #endif
  990. movl %eax, KKK
  991. #endif
  992. sarl $3, %eax
  993. je .L35
  994. ALIGN_4
  995. .L32:
  996. mulss %xmm0, %xmm2
  997. addss %xmm2, %xmm4
  998. #if defined(OPTERON) || defined(BARCELONA)
  999. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1000. #endif
  1001. movss 4 * SIZE(BB), %xmm2
  1002. mulss %xmm0, %xmm2
  1003. addss %xmm2, %xmm5
  1004. movss 8 * SIZE(BB), %xmm2
  1005. mulss %xmm0, %xmm2
  1006. mulss 12 * SIZE(BB), %xmm0
  1007. addss %xmm2, %xmm6
  1008. movss 32 * SIZE(BB), %xmm2
  1009. addss %xmm0, %xmm7
  1010. movss 1 * SIZE(AA), %xmm0
  1011. mulss %xmm0, %xmm3
  1012. addss %xmm3, %xmm4
  1013. movss 20 * SIZE(BB), %xmm3
  1014. mulss %xmm0, %xmm3
  1015. addss %xmm3, %xmm5
  1016. movss 24 * SIZE(BB), %xmm3
  1017. mulss %xmm0, %xmm3
  1018. mulss 28 * SIZE(BB), %xmm0
  1019. addss %xmm3, %xmm6
  1020. movss 48 * SIZE(BB), %xmm3
  1021. addss %xmm0, %xmm7
  1022. movss 2 * SIZE(AA), %xmm0
  1023. mulss %xmm0, %xmm2
  1024. addss %xmm2, %xmm4
  1025. movss 36 * SIZE(BB), %xmm2
  1026. mulss %xmm0, %xmm2
  1027. addss %xmm2, %xmm5
  1028. movss 40 * SIZE(BB), %xmm2
  1029. mulss %xmm0, %xmm2
  1030. mulss 44 * SIZE(BB), %xmm0
  1031. addss %xmm2, %xmm6
  1032. movss 64 * SIZE(BB), %xmm2
  1033. addss %xmm0, %xmm7
  1034. movss 3 * SIZE(AA), %xmm0
  1035. mulss %xmm0, %xmm3
  1036. addss %xmm3, %xmm4
  1037. movss 52 * SIZE(BB), %xmm3
  1038. mulss %xmm0, %xmm3
  1039. addss %xmm3, %xmm5
  1040. movss 56 * SIZE(BB), %xmm3
  1041. mulss %xmm0, %xmm3
  1042. mulss 60 * SIZE(BB), %xmm0
  1043. addss %xmm3, %xmm6
  1044. movss 80 * SIZE(BB), %xmm3
  1045. addss %xmm0, %xmm7
  1046. movss 8 * SIZE(AA), %xmm0
  1047. mulss %xmm1, %xmm2
  1048. addss %xmm2, %xmm4
  1049. movss 68 * SIZE(BB), %xmm2
  1050. mulss %xmm1, %xmm2
  1051. addss %xmm2, %xmm5
  1052. movss 72 * SIZE(BB), %xmm2
  1053. mulss %xmm1, %xmm2
  1054. mulss 76 * SIZE(BB), %xmm1
  1055. addss %xmm2, %xmm6
  1056. movss 96 * SIZE(BB), %xmm2
  1057. addss %xmm1, %xmm7
  1058. movss 5 * SIZE(AA), %xmm1
  1059. mulss %xmm1, %xmm3
  1060. addss %xmm3, %xmm4
  1061. movss 84 * SIZE(BB), %xmm3
  1062. mulss %xmm1, %xmm3
  1063. addss %xmm3, %xmm5
  1064. movss 88 * SIZE(BB), %xmm3
  1065. mulss %xmm1, %xmm3
  1066. mulss 92 * SIZE(BB), %xmm1
  1067. addss %xmm3, %xmm6
  1068. movss 112 * SIZE(BB), %xmm3
  1069. addss %xmm1, %xmm7
  1070. movss 6 * SIZE(AA), %xmm1
  1071. mulss %xmm1, %xmm2
  1072. addss %xmm2, %xmm4
  1073. movss 100 * SIZE(BB), %xmm2
  1074. mulss %xmm1, %xmm2
  1075. addss %xmm2, %xmm5
  1076. movss 104 * SIZE(BB), %xmm2
  1077. mulss %xmm1, %xmm2
  1078. mulss 108 * SIZE(BB), %xmm1
  1079. addss %xmm2, %xmm6
  1080. movss 128 * SIZE(BB), %xmm2
  1081. addss %xmm1, %xmm7
  1082. movss 7 * SIZE(AA), %xmm1
  1083. mulss %xmm1, %xmm3
  1084. addss %xmm3, %xmm4
  1085. movss 116 * SIZE(BB), %xmm3
  1086. mulss %xmm1, %xmm3
  1087. addss %xmm3, %xmm5
  1088. movss 120 * SIZE(BB), %xmm3
  1089. mulss %xmm1, %xmm3
  1090. mulss 124 * SIZE(BB), %xmm1
  1091. addss %xmm3, %xmm6
  1092. movss 144 * SIZE(BB), %xmm3
  1093. addss %xmm1, %xmm7
  1094. movss 12 * SIZE(AA), %xmm1
  1095. addl $ 8 * SIZE, AA
  1096. addl $128 * SIZE, BB
  1097. decl %eax
  1098. jne .L32
  1099. ALIGN_4
  1100. .L35:
  1101. #ifndef TRMMKERNEL
  1102. movl K, %eax
  1103. #else
  1104. movl KKK, %eax
  1105. #endif
  1106. movaps ALPHA, %xmm3
  1107. andl $7, %eax # if (k & 1)
  1108. BRANCH
  1109. je .L38
  1110. ALIGN_4
  1111. .L36:
  1112. mulss %xmm0, %xmm2
  1113. addss %xmm2, %xmm4
  1114. movss 4 * SIZE(BB), %xmm2
  1115. mulss %xmm0, %xmm2
  1116. addss %xmm2, %xmm5
  1117. movss 8 * SIZE(BB), %xmm2
  1118. mulss %xmm0, %xmm2
  1119. mulss 12 * SIZE(BB), %xmm0
  1120. addss %xmm2, %xmm6
  1121. movss 16 * SIZE(BB), %xmm2
  1122. addss %xmm0, %xmm7
  1123. movss 1 * SIZE(AA), %xmm0
  1124. addl $ 1 * SIZE, AA
  1125. addl $16 * SIZE, BB
  1126. decl %eax
  1127. jg .L36
  1128. ALIGN_4
  1129. .L38:
  1130. leal (LDC, LDC, 2), %eax
  1131. movsd (%esi), %xmm0
  1132. movhps (%esi, LDC), %xmm0
  1133. shufps $0, %xmm5, %xmm4
  1134. mulps %xmm3, %xmm4
  1135. addps %xmm4, %xmm0
  1136. movlps %xmm0, (%esi)
  1137. movhps %xmm0, (%esi, LDC)
  1138. movsd (%esi, LDC, 2), %xmm0
  1139. movhps (%esi, %eax), %xmm0
  1140. shufps $0, %xmm7, %xmm6
  1141. mulps %xmm3, %xmm6
  1142. addps %xmm6, %xmm0
  1143. movlps %xmm0, (%esi, LDC, 2)
  1144. movhps %xmm0, (%esi, %eax)
  1145. ALIGN_4
  1146. .L39:
  1147. #if defined(TRMMKERNEL) && !defined(LEFT)
  1148. addl $4, KK
  1149. #endif
  1150. leal (, LDC, 4), %eax
  1151. addl %eax, C # c += 4 * ldc
  1152. decl J # j --
  1153. jg .L01
  1154. ALIGN_4
  1155. .L40:
  1156. testl $2, N
  1157. je .L80
  1158. #if defined(TRMMKERNEL) && defined(LEFT)
  1159. movl OFFSET, %eax
  1160. movl %eax, KK
  1161. #endif
  1162. movl K, %eax
  1163. leal BUFFER, %ecx
  1164. sarl $2, %eax
  1165. jle .L45
  1166. ALIGN_4
  1167. .L42:
  1168. prefetchnta 80 * SIZE(%edi)
  1169. #if defined(OPTERON) || defined(BARCELONA)
  1170. prefetchw 112 * SIZE(%ecx)
  1171. prefetchw 120 * SIZE(%ecx)
  1172. #endif
  1173. #ifdef PENTIUM4
  1174. prefetcht1 112 * SIZE(%ecx)
  1175. #endif
  1176. #ifdef HAVE_SSE2
  1177. movss 0 * SIZE(%edi), %xmm0
  1178. movss 1 * SIZE(%edi), %xmm1
  1179. movss 2 * SIZE(%edi), %xmm2
  1180. movss 3 * SIZE(%edi), %xmm3
  1181. movss 4 * SIZE(%edi), %xmm4
  1182. movss 5 * SIZE(%edi), %xmm5
  1183. movss 6 * SIZE(%edi), %xmm6
  1184. movss 7 * SIZE(%edi), %xmm7
  1185. shufps $0, %xmm0, %xmm0
  1186. shufps $0, %xmm1, %xmm1
  1187. shufps $0, %xmm2, %xmm2
  1188. shufps $0, %xmm3, %xmm3
  1189. shufps $0, %xmm4, %xmm4
  1190. shufps $0, %xmm5, %xmm5
  1191. shufps $0, %xmm6, %xmm6
  1192. shufps $0, %xmm7, %xmm7
  1193. movaps %xmm0, 0 * SIZE(%ecx)
  1194. movaps %xmm1, 4 * SIZE(%ecx)
  1195. movaps %xmm2, 8 * SIZE(%ecx)
  1196. movaps %xmm3, 12 * SIZE(%ecx)
  1197. movaps %xmm4, 16 * SIZE(%ecx)
  1198. movaps %xmm5, 20 * SIZE(%ecx)
  1199. movaps %xmm6, 24 * SIZE(%ecx)
  1200. movaps %xmm7, 28 * SIZE(%ecx)
  1201. #else
  1202. movd 0 * SIZE(%edi), %mm0
  1203. movd 1 * SIZE(%edi), %mm1
  1204. movd 2 * SIZE(%edi), %mm2
  1205. movd 3 * SIZE(%edi), %mm3
  1206. movd 4 * SIZE(%edi), %mm4
  1207. movd 5 * SIZE(%edi), %mm5
  1208. movd 6 * SIZE(%edi), %mm6
  1209. movd 7 * SIZE(%edi), %mm7
  1210. movd %mm0, 0 * SIZE(%ecx)
  1211. movd %mm0, 1 * SIZE(%ecx)
  1212. movd %mm0, 2 * SIZE(%ecx)
  1213. movd %mm0, 3 * SIZE(%ecx)
  1214. movd %mm1, 4 * SIZE(%ecx)
  1215. movd %mm1, 5 * SIZE(%ecx)
  1216. movd %mm1, 6 * SIZE(%ecx)
  1217. movd %mm1, 7 * SIZE(%ecx)
  1218. movd %mm2, 8 * SIZE(%ecx)
  1219. movd %mm2, 9 * SIZE(%ecx)
  1220. movd %mm2, 10 * SIZE(%ecx)
  1221. movd %mm2, 11 * SIZE(%ecx)
  1222. movd %mm3, 12 * SIZE(%ecx)
  1223. movd %mm3, 13 * SIZE(%ecx)
  1224. movd %mm3, 14 * SIZE(%ecx)
  1225. movd %mm3, 15 * SIZE(%ecx)
  1226. movd %mm4, 16 * SIZE(%ecx)
  1227. movd %mm4, 17 * SIZE(%ecx)
  1228. movd %mm4, 18 * SIZE(%ecx)
  1229. movd %mm4, 19 * SIZE(%ecx)
  1230. movd %mm5, 20 * SIZE(%ecx)
  1231. movd %mm5, 21 * SIZE(%ecx)
  1232. movd %mm5, 22 * SIZE(%ecx)
  1233. movd %mm5, 23 * SIZE(%ecx)
  1234. movd %mm6, 24 * SIZE(%ecx)
  1235. movd %mm6, 25 * SIZE(%ecx)
  1236. movd %mm6, 26 * SIZE(%ecx)
  1237. movd %mm6, 27 * SIZE(%ecx)
  1238. movd %mm7, 28 * SIZE(%ecx)
  1239. movd %mm7, 29 * SIZE(%ecx)
  1240. movd %mm7, 30 * SIZE(%ecx)
  1241. movd %mm7, 31 * SIZE(%ecx)
  1242. #endif
  1243. addl $ 8 * SIZE, %edi
  1244. addl $32 * SIZE, %ecx
  1245. decl %eax
  1246. jne .L42
  1247. ALIGN_4
  1248. .L45:
  1249. movl K, %eax
  1250. andl $3, %eax
  1251. BRANCH
  1252. jle .L50
  1253. ALIGN_4
  1254. .L46:
  1255. #ifdef HAVE_SSE2
  1256. movss 0 * SIZE(%edi), %xmm0
  1257. movss 1 * SIZE(%edi), %xmm1
  1258. shufps $0, %xmm0, %xmm0
  1259. shufps $0, %xmm1, %xmm1
  1260. movaps %xmm0, 0 * SIZE(%ecx)
  1261. movaps %xmm1, 4 * SIZE(%ecx)
  1262. #else
  1263. movd 0 * SIZE(%edi), %mm0
  1264. movd 1 * SIZE(%edi), %mm1
  1265. movd %mm0, 0 * SIZE(%ecx)
  1266. movd %mm0, 1 * SIZE(%ecx)
  1267. movd %mm0, 2 * SIZE(%ecx)
  1268. movd %mm0, 3 * SIZE(%ecx)
  1269. movd %mm1, 4 * SIZE(%ecx)
  1270. movd %mm1, 5 * SIZE(%ecx)
  1271. movd %mm1, 6 * SIZE(%ecx)
  1272. movd %mm1, 7 * SIZE(%ecx)
  1273. #endif
  1274. addl $2 * SIZE, %edi
  1275. addl $8 * SIZE, %ecx
  1276. decl %eax
  1277. jne .L46
  1278. ALIGN_4
  1279. .L50:
  1280. movl C, %esi # coffset = c
  1281. movl A, %edx # aoffset = a
  1282. movl M, %ebx
  1283. sarl $2, %ebx # i = (m >> 2)
  1284. jle .L60
  1285. ALIGN_4
  1286. .L51:
  1287. #if !defined(TRMMKERNEL) || \
  1288. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1289. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1290. leal BUFFER, BB # boffset1 = boffset
  1291. #else
  1292. leal BUFFER, BB # boffset1 = boffset
  1293. movl KK, %eax
  1294. leal (, %eax, 8), %eax
  1295. leal (AA, %eax, 2), AA
  1296. leal (BB, %eax, 4), BB
  1297. #endif
  1298. pxor %xmm4, %xmm4
  1299. pxor %xmm5, %xmm5
  1300. pxor %xmm6, %xmm6
  1301. pxor %xmm7, %xmm7
  1302. movaps 0 * SIZE(AA), %xmm0
  1303. movaps 16 * SIZE(AA), %xmm1
  1304. movaps 0 * SIZE(BB), %xmm2
  1305. movaps 16 * SIZE(BB), %xmm3
  1306. #ifdef HAVE_3DNOW
  1307. prefetchw 4 * SIZE(%esi)
  1308. prefetchw 4 * SIZE(%esi, LDC)
  1309. #elif defined(HAVE_SSE) || defined(HAVE_SSE2)
  1310. prefetcht2 4 * SIZE(%esi)
  1311. prefetcht2 4 * SIZE(%esi, LDC)
  1312. #endif
  1313. #ifndef TRMMKERNEL
  1314. movl K, %eax
  1315. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1316. movl K, %eax
  1317. subl KK, %eax
  1318. movl %eax, KKK
  1319. #else
  1320. movl KK, %eax
  1321. #ifdef LEFT
  1322. addl $4, %eax
  1323. #else
  1324. addl $2, %eax
  1325. #endif
  1326. movl %eax, KKK
  1327. #endif
  1328. sarl $3, %eax
  1329. je .L55
  1330. ALIGN_4
  1331. .L52:
  1332. mulps %xmm0, %xmm2
  1333. #if defined(OPTERON) || defined(BARCELONA)
  1334. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1335. #endif
  1336. mulps 4 * SIZE(BB), %xmm0
  1337. addps %xmm2, %xmm4
  1338. movaps 8 * SIZE(BB), %xmm2
  1339. addps %xmm0, %xmm5
  1340. movaps 4 * SIZE(AA), %xmm0
  1341. mulps %xmm0, %xmm2
  1342. mulps 12 * SIZE(BB), %xmm0
  1343. addps %xmm2, %xmm4
  1344. movaps 32 * SIZE(BB), %xmm2
  1345. addps %xmm0, %xmm5
  1346. movaps 8 * SIZE(AA), %xmm0
  1347. mulps %xmm0, %xmm3
  1348. mulps 20 * SIZE(BB), %xmm0
  1349. addps %xmm3, %xmm4
  1350. movaps 24 * SIZE(BB), %xmm3
  1351. addps %xmm0, %xmm5
  1352. movaps 12 * SIZE(AA), %xmm0
  1353. mulps %xmm0, %xmm3
  1354. mulps 28 * SIZE(BB), %xmm0
  1355. addps %xmm3, %xmm4
  1356. movaps 48 * SIZE(BB), %xmm3
  1357. addps %xmm0, %xmm5
  1358. movaps 32 * SIZE(AA), %xmm0
  1359. #if defined(OPTERON) || defined(BARCELONA)
  1360. prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
  1361. #endif
  1362. mulps %xmm1, %xmm2
  1363. mulps 36 * SIZE(BB), %xmm1
  1364. addps %xmm2, %xmm4
  1365. movaps 40 * SIZE(BB), %xmm2
  1366. addps %xmm1, %xmm5
  1367. movaps 20 * SIZE(AA), %xmm1
  1368. mulps %xmm1, %xmm2
  1369. mulps 44 * SIZE(BB), %xmm1
  1370. addps %xmm2, %xmm4
  1371. movaps 64 * SIZE(BB), %xmm2
  1372. addps %xmm1, %xmm5
  1373. movaps 24 * SIZE(AA), %xmm1
  1374. mulps %xmm1, %xmm3
  1375. mulps 52 * SIZE(BB), %xmm1
  1376. addps %xmm3, %xmm4
  1377. movaps 56 * SIZE(BB), %xmm3
  1378. addps %xmm1, %xmm5
  1379. movaps 28 * SIZE(AA), %xmm1
  1380. mulps %xmm1, %xmm3
  1381. mulps 60 * SIZE(BB), %xmm1
  1382. addps %xmm3, %xmm4
  1383. movaps 80 * SIZE(BB), %xmm3
  1384. addps %xmm1, %xmm5
  1385. movaps 48 * SIZE(AA), %xmm1
  1386. addl $32 * SIZE, AA
  1387. addl $64 * SIZE, BB
  1388. decl %eax
  1389. jne .L52
  1390. ALIGN_4
  1391. .L55:
  1392. #ifndef TRMMKERNEL
  1393. movl K, %eax
  1394. #else
  1395. movl KKK, %eax
  1396. #endif
  1397. movaps ALPHA, %xmm3
  1398. andl $7, %eax # if (k & 1)
  1399. BRANCH
  1400. je .L58
  1401. ALIGN_4
  1402. .L56:
  1403. mulps %xmm0, %xmm2
  1404. mulps 4 * SIZE(BB), %xmm0
  1405. addps %xmm2, %xmm4
  1406. movaps 8 * SIZE(BB), %xmm2
  1407. addps %xmm0, %xmm5
  1408. movaps 4 * SIZE(AA), %xmm0
  1409. addl $4 * SIZE, AA
  1410. addl $8 * SIZE, BB
  1411. decl %eax
  1412. jg .L56
  1413. ALIGN_4
  1414. .L58:
  1415. movsd 0 * SIZE(%esi), %xmm0
  1416. movhps 2 * SIZE(%esi), %xmm0
  1417. movsd 4 * SIZE(%esi), %xmm1
  1418. movhps 6 * SIZE(%esi), %xmm1
  1419. pshufd $0x50, %xmm4, %xmm2
  1420. pshufd $0xfa, %xmm4, %xmm4
  1421. mulps %xmm3, %xmm2
  1422. mulps %xmm3, %xmm4
  1423. addps %xmm2, %xmm0
  1424. addps %xmm4, %xmm1
  1425. movlps %xmm0, 0 * SIZE(%esi)
  1426. movhps %xmm0, 2 * SIZE(%esi)
  1427. movlps %xmm1, 4 * SIZE(%esi)
  1428. movhps %xmm1, 6 * SIZE(%esi)
  1429. movsd 0 * SIZE(%esi, LDC), %xmm0
  1430. movhps 2 * SIZE(%esi, LDC), %xmm0
  1431. movsd 4 * SIZE(%esi, LDC), %xmm1
  1432. movhps 6 * SIZE(%esi, LDC), %xmm1
  1433. pshufd $0x50, %xmm5, %xmm2
  1434. pshufd $0xfa, %xmm5, %xmm5
  1435. mulps %xmm3, %xmm2
  1436. mulps %xmm3, %xmm5
  1437. addps %xmm2, %xmm0
  1438. addps %xmm5, %xmm1
  1439. movlps %xmm0, 0 * SIZE(%esi, LDC)
  1440. movhps %xmm0, 2 * SIZE(%esi, LDC)
  1441. movlps %xmm1, 4 * SIZE(%esi, LDC)
  1442. movhps %xmm1, 6 * SIZE(%esi, LDC)
  1443. addl $8 * SIZE, %esi # coffset += 2
  1444. decl %ebx # i --
  1445. jg .L51
  1446. ALIGN_4
  1447. .L60:
  1448. testl $2, M
  1449. je .L70
  1450. #if !defined(TRMMKERNEL) || \
  1451. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1452. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1453. leal BUFFER, BB # boffset1 = boffset
  1454. #else
  1455. leal BUFFER, BB # boffset1 = boffset
  1456. movl KK, %eax
  1457. leal (, %eax, 8), %eax
  1458. leal (AA, %eax, 1), AA
  1459. leal (BB, %eax, 4), BB
  1460. #endif
  1461. pxor %xmm4, %xmm4
  1462. pxor %xmm5, %xmm5
  1463. pxor %xmm6, %xmm6
  1464. pxor %xmm7, %xmm7
  1465. movsd 0 * SIZE(AA), %xmm0
  1466. movsd 8 * SIZE(AA), %xmm1
  1467. movsd 0 * SIZE(BB), %xmm2
  1468. movsd 16 * SIZE(BB), %xmm3
  1469. leal (LDC, LDC, 2), %eax
  1470. #ifndef TRMMKERNEL
  1471. movl K, %eax
  1472. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1473. movl K, %eax
  1474. subl KK, %eax
  1475. movl %eax, KKK
  1476. #else
  1477. movl KK, %eax
  1478. #ifdef LEFT
  1479. addl $2, %eax
  1480. #else
  1481. addl $2, %eax
  1482. #endif
  1483. movl %eax, KKK
  1484. #endif
  1485. sarl $3, %eax
  1486. je .L65
  1487. ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movsd 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 32 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movsd 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm3, %xmm5
	movsd 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movsd 28 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movsd 48 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movsd 36 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm2, %xmm5
	movsd 40 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movsd 44 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movsd 64 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movsd 52 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movsd 56 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movsd 60 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movsd 80 * SIZE(BB), %xmm3
	addl $16 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L62
	ALIGN_4
.L65:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax		# if (k & 7)
	BRANCH
	je .L68
	ALIGN_4
.L66:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	addl $2 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L66
	ALIGN_4
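/* Fold the accumulators together, duplicate them with shufps, scale by ALPHA and add the result into C at %esi and %esi + LDC. */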
.L68:
	addps %xmm6, %xmm4
	addps %xmm7, %xmm5
	movsd 0 * SIZE(%esi), %xmm0
	movhps 2 * SIZE(%esi), %xmm0
	movsd 0 * SIZE(%esi, LDC), %xmm1
	movhps 2 * SIZE(%esi, LDC), %xmm1
	shufps $0x50, %xmm4, %xmm4
	shufps $0x50, %xmm5, %xmm5
	mulps %xmm3, %xmm4
	mulps %xmm3, %xmm5
	addps %xmm4, %xmm0
	addps %xmm5, %xmm1
	movlps %xmm0, 0 * SIZE(%esi)
	movhps %xmm0, 2 * SIZE(%esi)
	movlps %xmm1, 0 * SIZE(%esi, LDC)
	movhps %xmm1, 2 * SIZE(%esi, LDC)
	addl $4 * SIZE, %esi
	ALIGN_4
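/* M & 1: handle the last leftover row against the current pair of B columns. */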
.L70:
	testl $1, M
	je .L79
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 8), BB
#endif
	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7
	movss 0 * SIZE(AA), %xmm0
	movss 4 * SIZE(AA), %xmm1
	movss 0 * SIZE(BB), %xmm2
	movss 16 * SIZE(BB), %xmm3
	leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L75
	ALIGN_4
.L72:
	mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm2, %xmm4
	movss 8 * SIZE(BB), %xmm2
	addss %xmm0, %xmm5
	movss 1 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm2
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm2, %xmm6
	movss 32 * SIZE(BB), %xmm2
	addss %xmm0, %xmm7
	movss 2 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	mulss 20 * SIZE(BB), %xmm0
	addss %xmm3, %xmm4
	movss 24 * SIZE(BB), %xmm3
	addss %xmm0, %xmm5
	movss 3 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	mulss 28 * SIZE(BB), %xmm0
	addss %xmm3, %xmm6
	movss 48 * SIZE(BB), %xmm3
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm2
	mulss 36 * SIZE(BB), %xmm1
	addss %xmm2, %xmm4
	movss 40 * SIZE(BB), %xmm2
	addss %xmm1, %xmm5
	movss 5 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm2
	mulss 44 * SIZE(BB), %xmm1
	addss %xmm2, %xmm6
	movss 64 * SIZE(BB), %xmm2
	addss %xmm1, %xmm7
	movss 6 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	mulss 52 * SIZE(BB), %xmm1
	addss %xmm3, %xmm4
	movss 56 * SIZE(BB), %xmm3
	addss %xmm1, %xmm5
	movss 7 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	mulss 60 * SIZE(BB), %xmm1
	addss %xmm3, %xmm6
	movss 80 * SIZE(BB), %xmm3
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L72
	ALIGN_4
.L75:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax		# if (k & 7)
	BRANCH
	je .L78
	ALIGN_4
.L76:
	mulss %xmm0, %xmm2
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm2, %xmm4
	movss 8 * SIZE(BB), %xmm2
	addss %xmm0, %xmm5
	movss 1 * SIZE(AA), %xmm0
	addl $ 1 * SIZE, AA
	addl $ 8 * SIZE, BB
	decl %eax
	jg .L76
	ALIGN_4
.L78:
	addss %xmm6, %xmm4
	addss %xmm7, %xmm5
	movsd (%esi), %xmm0
	movhps (%esi, LDC), %xmm0
	shufps $0, %xmm5, %xmm4
	mulps %xmm3, %xmm4
	addps %xmm4, %xmm0
	movlps %xmm0, (%esi)
	movhps %xmm0, (%esi, LDC)
	ALIGN_4
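/* Advance C by two columns (2 * LDC); for the TRMM right-side case also bump KK by 2. */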
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $2, KK
#endif
	leal (, LDC, 2), %eax
	addl %eax, C
	ALIGN_4
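/* N & 1: one column of B remains.  Pack it into BUFFER first, broadcasting each element into four consecutive slots. */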
.L80:
	testl $1, N
	je .L999
#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif
	movl K, %eax
	leal BUFFER, %ecx
	sarl $3, %eax
	jle .L85
	ALIGN_4
.L82:
	prefetchnta 80 * SIZE(%edi)
#if defined(OPTERON) || defined(BARCELONA)
	prefetchw 112 * SIZE(%ecx)
	prefetchw 120 * SIZE(%ecx)
#endif
#ifdef PENTIUM4
	prefetcht1 112 * SIZE(%ecx)
#endif
#ifdef HAVE_SSE2
	movss 0 * SIZE(%edi), %xmm0
	movss 1 * SIZE(%edi), %xmm1
	movss 2 * SIZE(%edi), %xmm2
	movss 3 * SIZE(%edi), %xmm3
	movss 4 * SIZE(%edi), %xmm4
	movss 5 * SIZE(%edi), %xmm5
	movss 6 * SIZE(%edi), %xmm6
	movss 7 * SIZE(%edi), %xmm7
	shufps $0, %xmm0, %xmm0
	shufps $0, %xmm1, %xmm1
	shufps $0, %xmm2, %xmm2
	shufps $0, %xmm3, %xmm3
	shufps $0, %xmm4, %xmm4
	shufps $0, %xmm5, %xmm5
	shufps $0, %xmm6, %xmm6
	shufps $0, %xmm7, %xmm7
	movaps %xmm0, 0 * SIZE(%ecx)
	movaps %xmm1, 4 * SIZE(%ecx)
	movaps %xmm2, 8 * SIZE(%ecx)
	movaps %xmm3, 12 * SIZE(%ecx)
	movaps %xmm4, 16 * SIZE(%ecx)
	movaps %xmm5, 20 * SIZE(%ecx)
	movaps %xmm6, 24 * SIZE(%ecx)
	movaps %xmm7, 28 * SIZE(%ecx)
#else
	movd 0 * SIZE(%edi), %mm0
	movd 1 * SIZE(%edi), %mm1
	movd 2 * SIZE(%edi), %mm2
	movd 3 * SIZE(%edi), %mm3
	movd 4 * SIZE(%edi), %mm4
	movd 5 * SIZE(%edi), %mm5
	movd 6 * SIZE(%edi), %mm6
	movd 7 * SIZE(%edi), %mm7
	movd %mm0, 0 * SIZE(%ecx)
	movd %mm0, 1 * SIZE(%ecx)
	movd %mm0, 2 * SIZE(%ecx)
	movd %mm0, 3 * SIZE(%ecx)
	movd %mm1, 4 * SIZE(%ecx)
	movd %mm1, 5 * SIZE(%ecx)
	movd %mm1, 6 * SIZE(%ecx)
	movd %mm1, 7 * SIZE(%ecx)
	movd %mm2, 8 * SIZE(%ecx)
	movd %mm2, 9 * SIZE(%ecx)
	movd %mm2, 10 * SIZE(%ecx)
	movd %mm2, 11 * SIZE(%ecx)
	movd %mm3, 12 * SIZE(%ecx)
	movd %mm3, 13 * SIZE(%ecx)
	movd %mm3, 14 * SIZE(%ecx)
	movd %mm3, 15 * SIZE(%ecx)
	movd %mm4, 16 * SIZE(%ecx)
	movd %mm4, 17 * SIZE(%ecx)
	movd %mm4, 18 * SIZE(%ecx)
	movd %mm4, 19 * SIZE(%ecx)
	movd %mm5, 20 * SIZE(%ecx)
	movd %mm5, 21 * SIZE(%ecx)
	movd %mm5, 22 * SIZE(%ecx)
	movd %mm5, 23 * SIZE(%ecx)
	movd %mm6, 24 * SIZE(%ecx)
	movd %mm6, 25 * SIZE(%ecx)
	movd %mm6, 26 * SIZE(%ecx)
	movd %mm6, 27 * SIZE(%ecx)
	movd %mm7, 28 * SIZE(%ecx)
	movd %mm7, 29 * SIZE(%ecx)
	movd %mm7, 30 * SIZE(%ecx)
	movd %mm7, 31 * SIZE(%ecx)
#endif
	addl $ 8 * SIZE, %edi
	addl $32 * SIZE, %ecx
	decl %eax
	jne .L82
	ALIGN_4
.L85:
	movl K, %eax
	andl $7, %eax
	BRANCH
	jle .L90
	ALIGN_4
.L86:
#ifdef HAVE_SSE2
	movss 0 * SIZE(%edi), %xmm0
	shufps $0, %xmm0, %xmm0
	movaps %xmm0, 0 * SIZE(%ecx)
#else
	movd 0 * SIZE(%edi), %mm0
	movd %mm0, 0 * SIZE(%ecx)
	movd %mm0, 1 * SIZE(%ecx)
	movd %mm0, 2 * SIZE(%ecx)
	movd %mm0, 3 * SIZE(%ecx)
#endif
	addl $1 * SIZE, %edi
	addl $4 * SIZE, %ecx
	decl %eax
	jne .L86
	ALIGN_4
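/* Compute the last column of C, four rows of A at a time. */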
.L90:
	movl C, %esi	# coffset = c
	movl A, %edx	# aoffset = a
	movl M, %ebx
	sarl $2, %ebx	# i = (m >> 2)
	jle .L100
	ALIGN_4
.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif
	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7
	movaps 0 * SIZE(AA), %xmm0
	movaps 16 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
#ifdef HAVE_3DNOW
	prefetchw 4 * SIZE(%esi)
#elif defined(HAVE_SSE) || defined(HAVE_SSE2)
	prefetcht2 4 * SIZE(%esi)
#endif
#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $4, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L95
	ALIGN_4
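/* Main k loop for the 4 x 1 block, unrolled eight times. */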
.L92:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movaps 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm4
	movaps 32 * SIZE(BB), %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm0, %xmm5
	movaps 8 * SIZE(AA), %xmm0
	mulps 8 * SIZE(BB), %xmm0
	addps %xmm0, %xmm6
	movaps 12 * SIZE(AA), %xmm0
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm0, %xmm7
	movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	mulps %xmm1, %xmm3
	movaps 20 * SIZE(AA), %xmm1
	addps %xmm3, %xmm4
	movaps 48 * SIZE(BB), %xmm3
	mulps 20 * SIZE(BB), %xmm1
	addps %xmm1, %xmm5
	movaps 24 * SIZE(AA), %xmm1
	mulps 24 * SIZE(BB), %xmm1
	addps %xmm1, %xmm6
	movaps 28 * SIZE(AA), %xmm1
	mulps 28 * SIZE(BB), %xmm1
	addps %xmm1, %xmm7
	movaps 48 * SIZE(AA), %xmm1
	addl $32 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L92
	ALIGN_4
.L95:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax		# if (k & 7)
	BRANCH
	je .L98
	ALIGN_4
.L96:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(AA), %xmm0
	movaps 4 * SIZE(BB), %xmm2
	addl $4 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L96
	ALIGN_4
.L98:
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	addps %xmm6, %xmm4
	movsd 0 * SIZE(%esi), %xmm0
	movhps 2 * SIZE(%esi), %xmm0
	movsd 4 * SIZE(%esi), %xmm1
	movhps 6 * SIZE(%esi), %xmm1
	pshufd $0x50, %xmm4, %xmm2
	pshufd $0xfa, %xmm4, %xmm4
	mulps %xmm3, %xmm2
	mulps %xmm3, %xmm4
	addps %xmm2, %xmm0
	addps %xmm4, %xmm1
	movlps %xmm0, 0 * SIZE(%esi)
	movhps %xmm0, 2 * SIZE(%esi)
	movlps %xmm1, 4 * SIZE(%esi)
	movhps %xmm1, 6 * SIZE(%esi)
	addl $8 * SIZE, %esi	# coffset += 2
	decl %ebx		# i --
	jg .L91
	ALIGN_4
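/* M & 2: two leftover rows for the last column. */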
.L100:
	testl $2, M
	je .L110
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 8), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 2), BB
#endif
	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7
	movsd 0 * SIZE(AA), %xmm0
	movsd 8 * SIZE(AA), %xmm1
	movsd 0 * SIZE(BB), %xmm2
	movsd 16 * SIZE(BB), %xmm3
	leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L105
	ALIGN_4
.L102:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movsd 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm2, %xmm6
	movsd 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movsd 32 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm3, %xmm4
	movsd 20 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movsd 24 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm6
	movsd 28 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movsd 48 * SIZE(BB), %xmm3
	addl $16 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L102
	ALIGN_4
.L105:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax		# if (k & 7)
	BRANCH
	je .L108
	ALIGN_4
.L106:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	movsd 4 * SIZE(BB), %xmm2
	addl $2 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L106
	ALIGN_4
.L108:
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	addps %xmm6, %xmm4
	movsd 0 * SIZE(%esi), %xmm0
	movhps 2 * SIZE(%esi), %xmm0
	shufps $0x50, %xmm4, %xmm4
	mulps %xmm3, %xmm4
	addps %xmm4, %xmm0
	movlps %xmm0, 0 * SIZE(%esi)
	movhps %xmm0, 2 * SIZE(%esi)
	addl $4 * SIZE, %esi
	ALIGN_4
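/* M & 1: final row for the last column. */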
.L110:
	testl $1, M
	je .L999
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal BUFFER, BB	# boffset1 = boffset
#else
	leal BUFFER, BB	# boffset1 = boffset
	movl KK, %eax
	leal (, %eax, 4), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB
#endif
	pxor %xmm4, %xmm4
	pxor %xmm5, %xmm5
	pxor %xmm6, %xmm6
	pxor %xmm7, %xmm7
	movss 0 * SIZE(AA), %xmm0
	movss 4 * SIZE(AA), %xmm1
	movss 0 * SIZE(BB), %xmm2
	movss 16 * SIZE(BB), %xmm3
	leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $3, %eax
	je .L115
	ALIGN_4
.L112:
	mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movss 1 * SIZE(AA), %xmm0
	addss %xmm2, %xmm4
	movss 32 * SIZE(BB), %xmm2
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm0, %xmm5
	movss 2 * SIZE(AA), %xmm0
	mulss 8 * SIZE(BB), %xmm0
	addss %xmm0, %xmm6
	movss 3 * SIZE(AA), %xmm0
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm3
	movss 5 * SIZE(AA), %xmm1
	addss %xmm3, %xmm4
	movss 48 * SIZE(BB), %xmm3
	mulss 20 * SIZE(BB), %xmm1
	addss %xmm1, %xmm5
	movss 6 * SIZE(AA), %xmm1
	mulss 24 * SIZE(BB), %xmm1
	addss %xmm1, %xmm6
	movss 7 * SIZE(AA), %xmm1
	mulss 28 * SIZE(BB), %xmm1
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L112
	ALIGN_4
.L115:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	movaps ALPHA, %xmm3
	andl $7, %eax		# if (k & 7)
	BRANCH
	je .L118
	ALIGN_4
.L116:
	mulss %xmm0, %xmm2
	movss 1 * SIZE(AA), %xmm0
	addss %xmm2, %xmm4
	movss 4 * SIZE(BB), %xmm2
	addl $ 1 * SIZE, AA
	addl $ 4 * SIZE, BB
	decl %eax
	jg .L116
	ALIGN_4
.L118:
	addss %xmm5, %xmm4
	addss %xmm7, %xmm6
	addss %xmm6, %xmm4
	movsd (%esi), %xmm0
	shufps $0, %xmm5, %xmm4
	mulps %xmm3, %xmm4
	addps %xmm4, %xmm0
	movlps %xmm0, (%esi)
	ALIGN_4
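/* Epilogue: clear MMX state, restore the original stack pointer and callee-saved registers, and return. */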
.L999:
	EMMS
	movl OLD_STACK, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
	EPILOGUE