You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT_4x4_penryn.S 55 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is pulled in; presumably it */
  /* selects the assembler-facing parts of that header -- TODO confirm. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Stack frame layout, all offsets relative to %esp after the prologue. */
  /* STACK: bytes taken by the four registers the prologue pushes */
  /* (%ebp, %edi, %esi, %ebx -> 4 * 4 = 16). */
  /* ARGS: bytes of scratch space the prologue reserves with */
  /* "subl $ARGS, %esp" before those pushes. */
  40. #define STACK 16
  41. #define ARGS 16
  /* Incoming stack arguments. They sit above the saved registers (STACK) */
  /* and the scratch area (ARGS); the leading 4 presumably skips the */
  /* return address -- confirm against the calling convention in use. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 20 + STACK + ARGS(%esp)
  47. #define ARG_B 24 + STACK + ARGS(%esp)
  48. #define C 28 + STACK + ARGS(%esp)
  49. #define ARG_LDC 32 + STACK + ARGS(%esp)
  50. #define OFFSET 36 + STACK + ARGS(%esp)
  /* Local variables: four 4-byte slots that fill the ARGS scratch area */
  /* (offsets 0..12 just above the saved registers). KK and AORIG are */
  /* read and written throughout the kernel; J and KKK are not referenced */
  /* in the part of the file reviewed here. */
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  /* Per-target prefetch tuning. PREFETCH is the instruction used inside */
  /* the unrolled loops and PREFETCHSIZE is the look-ahead distance; it is */
  /* counted in elements, since every use multiplies it by SIZE */
  /* ("PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)"). */
  /* NOTE(review): if none of the targets below is defined, this file */
  /* leaves both macros undefined and the PREFETCH lines will not expand */
  /* to an instruction -- confirm whether common.h supplies a fallback. */
  /* The first two groups currently define identical values. */
  55. #if defined(PENRYN) || defined(DUNNINGTON)
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHSIZE (8 * 21 + 4)
  58. #endif
  59. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
  60. #define PREFETCH prefetcht0
  61. #define PREFETCHSIZE (8 * 21 + 4)
  62. #endif
  63. #ifdef ATOM
  64. #define PREFETCH prefetcht0
  65. #define PREFETCHSIZE (8 * 8 + 4)
  66. #endif
  67. #ifdef NANO
  68. #define PREFETCH prefetcht0
  69. #define PREFETCHSIZE (16 * 2)
  70. #endif
  /* Register roles used by the kernel body. */
  /* B: base pointer of the B operand, loaded from ARG_B. */
  /* AA: running pointer into A (set from A or AORIG per tile). */
  /* BB: running pointer into B (set from B per tile). */
  /* LDC: ARG_LDC scaled by SIZE, i.e. the C column stride in bytes. */
  /* CO1: current output pointer into C. */
  /* %edi, %ebp and %esi are among the registers the prologue pushes. */
  71. #define B %edi
  72. #define AA %edx
  73. #define BB %ecx
  74. #define LDC %ebp
  75. #define CO1 %esi
  76. PROLOGUE
  77. subl $ARGS, %esp
  78. pushl %ebp
  79. pushl %edi
  80. pushl %esi
  81. pushl %ebx
  82. PROFCODE
  83. movl ARG_B, B
  84. movl ARG_LDC, LDC
  85. movl OFFSET, %eax
  86. #ifdef RN
  87. negl %eax
  88. #endif
  89. movl %eax, KK
  90. leal (, LDC, SIZE), LDC
  91. subl $-32 * SIZE, A
  92. subl $-32 * SIZE, B
  93. #ifdef LN
  94. movl M, %eax
  95. leal (, %eax, SIZE), %eax
  96. addl %eax, C
  97. imull K, %eax
  98. addl %eax, A
  99. #endif
  100. #ifdef RT
  101. movl N, %eax
  102. leal (, %eax, SIZE), %eax
  103. imull K, %eax
  104. addl %eax, B
  105. movl N, %eax
  106. imull LDC, %eax
  107. addl %eax, C
  108. #endif
  109. #ifdef RT
  110. movl N, %eax
  111. subl OFFSET, %eax
  112. movl %eax, KK
  113. #endif
  114. testl $1, N
  115. je .L40
  116. #if defined(LT) || defined(RN)
  117. movl A, AA
  118. #else
  119. movl A, %eax
  120. movl %eax, AORIG
  121. #endif
  122. #ifdef RT
  123. movl K, %eax
  124. sall $BASE_SHIFT, %eax
  125. subl %eax, B
  126. #endif
  127. #ifdef RT
  128. subl LDC, C
  129. #endif
  130. movl C, CO1
  131. #ifndef RT
  132. addl LDC, C
  133. #endif
  134. #ifdef LN
  135. movl OFFSET, %eax
  136. addl M, %eax
  137. movl %eax, KK
  138. #endif
  139. #ifdef LT
  140. movl OFFSET, %eax
  141. movl %eax, KK
  142. #endif
  143. movl M, %ebx
  144. sarl $2, %ebx # i = (m >> 2)
  145. jle .L100
  146. ALIGN_4
  147. .L91:
  148. #ifdef LN
  149. movl K, %eax
  150. sall $2 + BASE_SHIFT, %eax
  151. subl %eax, AORIG
  152. #endif
  153. #if defined(LN) || defined(RT)
  154. movl KK, %eax
  155. movl AORIG, AA
  156. leal (, %eax, SIZE), %eax
  157. leal (AA, %eax, 4), AA
  158. #endif
  159. movl B, BB
  160. #if defined(LN) || defined(RT)
  161. movl KK, %eax
  162. sall $BASE_SHIFT, %eax
  163. addl %eax, BB
  164. #endif
  165. movaps -32 * SIZE(AA), %xmm0
  166. pxor %xmm2, %xmm2
  167. movsd -32 * SIZE(BB), %xmm1
  168. pxor %xmm4, %xmm4
  169. #ifdef LN
  170. prefetcht0 -4 * SIZE(CO1)
  171. #else
  172. prefetcht0 3 * SIZE(CO1)
  173. #endif
  174. pxor %xmm5, %xmm5
  175. #if defined(LT) || defined(RN)
  176. movl KK, %eax
  177. #else
  178. movl K, %eax
  179. subl KK, %eax
  180. #endif
  181. sarl $3, %eax
  182. je .L95
  183. ALIGN_4
  184. .L92:
  185. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  186. addps %xmm2, %xmm4
  187. pshufd $0x00, %xmm1, %xmm2
  188. mulps %xmm0, %xmm2
  189. movaps -28 * SIZE(AA), %xmm0
  190. addps %xmm2, %xmm5
  191. pshufd $0x55, %xmm1, %xmm2
  192. movsd -30 * SIZE(BB), %xmm1
  193. mulps %xmm0, %xmm2
  194. movaps -24 * SIZE(AA), %xmm0
  195. addps %xmm2, %xmm4
  196. pshufd $0x00, %xmm1, %xmm2
  197. mulps %xmm0, %xmm2
  198. movaps -20 * SIZE(AA), %xmm0
  199. addps %xmm2, %xmm5
  200. pshufd $0x55, %xmm1, %xmm2
  201. movsd -28 * SIZE(BB), %xmm1
  202. mulps %xmm0, %xmm2
  203. movaps -16 * SIZE(AA), %xmm0
  204. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  205. addps %xmm2, %xmm4
  206. pshufd $0x00, %xmm1, %xmm2
  207. mulps %xmm0, %xmm2
  208. movaps -12 * SIZE(AA), %xmm0
  209. addps %xmm2, %xmm5
  210. pshufd $0x55, %xmm1, %xmm2
  211. movsd -26 * SIZE(BB), %xmm1
  212. mulps %xmm0, %xmm2
  213. movaps -8 * SIZE(AA), %xmm0
  214. addps %xmm2, %xmm4
  215. pshufd $0x00, %xmm1, %xmm2
  216. mulps %xmm0, %xmm2
  217. movaps -4 * SIZE(AA), %xmm0
  218. addps %xmm2, %xmm5
  219. pshufd $0x55, %xmm1, %xmm2
  220. movsd -24 * SIZE(BB), %xmm1
  221. mulps %xmm0, %xmm2
  222. movaps 0 * SIZE(AA), %xmm0
  223. subl $-32 * SIZE, AA
  224. subl $ -8 * SIZE, BB
  225. subl $1, %eax
  226. jne .L92
  227. ALIGN_4
  228. .L95:
  229. #if defined(LT) || defined(RN)
  230. movl KK, %eax
  231. #else
  232. movl K, %eax
  233. subl KK, %eax
  234. #endif
  235. andl $7, %eax # k & 7: iterations left over after the 8-way unrolled loop
  236. BRANCH
  237. je .L98
  238. ALIGN_4
  239. .L96:
  240. addps %xmm2, %xmm4
  241. pshufd $0x00, %xmm1, %xmm2
  242. movss -31 * SIZE(BB), %xmm1
  243. mulps %xmm0, %xmm2
  244. movaps -28 * SIZE(AA), %xmm0
  245. addl $4 * SIZE, AA
  246. addl $1 * SIZE, BB
  247. decl %eax
  248. jg .L96
  249. ALIGN_4
  250. .L98:
  251. #if defined(LN) || defined(RT)
  252. movl KK, %eax
  253. #ifdef LN
  254. subl $4, %eax
  255. #else
  256. subl $1, %eax
  257. #endif
  258. movl AORIG, AA
  259. leal (, %eax, SIZE), %eax
  260. leal (AA, %eax, 4), AA
  261. leal (B, %eax, 1), BB
  262. #endif
  263. addps %xmm2, %xmm4
  264. addps %xmm5, %xmm4
  265. #if defined(LN) || defined(LT)
  266. movaps %xmm4, %xmm0
  267. unpcklps %xmm6, %xmm4
  268. unpckhps %xmm6, %xmm0
  269. movaps %xmm5, %xmm1
  270. unpcklps %xmm7, %xmm5
  271. unpckhps %xmm7, %xmm1
  272. movaps %xmm4, %xmm6
  273. unpcklps %xmm5, %xmm4
  274. unpckhps %xmm5, %xmm6
  275. movaps %xmm0, %xmm2
  276. unpcklps %xmm1, %xmm0
  277. unpckhps %xmm1, %xmm2
  278. movss -32 * SIZE(BB), %xmm1
  279. movss -31 * SIZE(BB), %xmm3
  280. movss -30 * SIZE(BB), %xmm5
  281. movss -29 * SIZE(BB), %xmm7
  282. subss %xmm4, %xmm1
  283. subss %xmm6, %xmm3
  284. subss %xmm0, %xmm5
  285. subss %xmm2, %xmm7
  286. #else
  287. movaps -32 * SIZE(AA), %xmm0
  288. subps %xmm4, %xmm0
  289. #endif
  290. #ifdef LN
  291. movaps -20 * SIZE(AA), %xmm4
  292. pshufd $0xff, %xmm4, %xmm6
  293. mulss %xmm6, %xmm7
  294. pshufd $0xaa, %xmm4, %xmm6
  295. mulss %xmm7, %xmm6
  296. subss %xmm6, %xmm5
  297. pshufd $0x55, %xmm4, %xmm6
  298. mulss %xmm7, %xmm6
  299. subss %xmm6, %xmm3
  300. pshufd $0x00, %xmm4, %xmm6
  301. mulss %xmm7, %xmm6
  302. subss %xmm6, %xmm1
  303. movaps -24 * SIZE(AA), %xmm4
  304. pshufd $0xaa, %xmm4, %xmm6
  305. mulss %xmm6, %xmm5
  306. pshufd $0x55, %xmm4, %xmm6
  307. mulss %xmm5, %xmm6
  308. subss %xmm6, %xmm3
  309. pshufd $0x00, %xmm4, %xmm6
  310. mulss %xmm5, %xmm6
  311. subss %xmm6, %xmm1
  312. movaps -28 * SIZE(AA), %xmm4
  313. pshufd $0x55, %xmm4, %xmm6
  314. mulss %xmm6, %xmm3
  315. pshufd $0x00, %xmm4, %xmm6
  316. mulss %xmm3, %xmm6
  317. subss %xmm6, %xmm1
  318. movaps -32 * SIZE(AA), %xmm4
  319. pshufd $0x00, %xmm4, %xmm6
  320. mulss %xmm6, %xmm1
  321. #endif
  322. #ifdef LT
  323. movaps -32 * SIZE(AA), %xmm4
  324. pshufd $0x00, %xmm4, %xmm6
  325. mulss %xmm6, %xmm1
  326. pshufd $0x55, %xmm4, %xmm6
  327. mulss %xmm1, %xmm6
  328. subss %xmm6, %xmm3
  329. pshufd $0xaa, %xmm4, %xmm6
  330. mulss %xmm1, %xmm6
  331. subss %xmm6, %xmm5
  332. pshufd $0xff, %xmm4, %xmm6
  333. mulss %xmm1, %xmm6
  334. subss %xmm6, %xmm7
  335. movaps -28 * SIZE(AA), %xmm4
  336. pshufd $0x55, %xmm4, %xmm6
  337. mulss %xmm6, %xmm3
  338. pshufd $0xaa, %xmm4, %xmm6
  339. mulss %xmm3, %xmm6
  340. subss %xmm6, %xmm5
  341. pshufd $0xff, %xmm4, %xmm6
  342. mulss %xmm3, %xmm6
  343. subss %xmm6, %xmm7
  344. movaps -24 * SIZE(AA), %xmm4
  345. pshufd $0xaa, %xmm4, %xmm6
  346. mulss %xmm6, %xmm5
  347. pshufd $0xff, %xmm4, %xmm6
  348. mulss %xmm5, %xmm6
  349. subss %xmm6, %xmm7
  350. movaps -20 * SIZE(AA), %xmm4
  351. pshufd $0xff, %xmm4, %xmm6
  352. mulss %xmm6, %xmm7
  353. #endif
  354. #if defined(RN) || defined(RT)
  355. movss -32 * SIZE(BB), %xmm6
  356. pshufd $0x00, %xmm6, %xmm7
  357. mulps %xmm7, %xmm0
  358. #endif
  359. #if defined(LN) || defined(LT)
  360. movss %xmm1, -32 * SIZE(BB)
  361. movss %xmm3, -31 * SIZE(BB)
  362. movss %xmm5, -30 * SIZE(BB)
  363. movss %xmm7, -29 * SIZE(BB)
  364. #else
  365. movaps %xmm0, -32 * SIZE(AA)
  366. #endif
  367. #ifdef LN
  368. subl $4 * SIZE, CO1
  369. #endif
  370. #if defined(LN) || defined(LT)
  371. unpcklps %xmm5, %xmm1
  372. unpcklps %xmm7, %xmm3
  373. unpcklps %xmm3, %xmm1
  374. movlps %xmm1, 0 * SIZE(CO1)
  375. movhps %xmm1, 2 * SIZE(CO1)
  376. #else
  377. movlps %xmm0, 0 * SIZE(CO1)
  378. movhps %xmm0, 2 * SIZE(CO1)
  379. #endif
  380. #ifndef LN
  381. addl $4 * SIZE, CO1
  382. #endif
  383. #if defined(LT) || defined(RN)
  384. movl K, %eax
  385. subl KK, %eax
  386. leal (,%eax, SIZE), %eax
  387. leal (AA, %eax, 4), AA
  388. leal (BB, %eax, 1), BB
  389. #endif
  390. #ifdef LN
  391. subl $4, KK
  392. #endif
  393. #ifdef LT
  394. addl $4, KK
  395. #endif
  396. #ifdef RT
  397. movl K, %eax
  398. sall $2 + BASE_SHIFT, %eax
  399. addl %eax, AORIG
  400. #endif
  401. decl %ebx # i --
  402. jg .L91
  403. ALIGN_4
  404. .L100:
  405. testl $2, M
  406. je .L110
  407. #ifdef LN
  408. movl K, %eax
  409. sall $1 + BASE_SHIFT, %eax
  410. subl %eax, AORIG
  411. #endif
  412. #if defined(LN) || defined(RT)
  413. movl KK, %eax
  414. movl AORIG, AA
  415. leal (, %eax, SIZE), %eax
  416. leal (AA, %eax, 2), AA
  417. #endif
  418. movl B, BB
  419. #if defined(LN) || defined(RT)
  420. movl KK, %eax
  421. sall $BASE_SHIFT, %eax
  422. addl %eax, BB
  423. #endif
  424. movsd -32 * SIZE(AA), %xmm0
  425. pxor %xmm3, %xmm3
  426. movsd -32 * SIZE(BB), %xmm1
  427. pxor %xmm4, %xmm4
  428. pxor %xmm5, %xmm5
  429. #if defined(LT) || defined(RN)
  430. movl KK, %eax
  431. #else
  432. movl K, %eax
  433. subl KK, %eax
  434. #endif
  435. sarl $3, %eax
  436. je .L105
  437. ALIGN_4
  438. .L102:
  439. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  440. pshufd $0x00, %xmm1, %xmm2
  441. mulps %xmm0, %xmm2
  442. movsd -30 * SIZE(AA), %xmm0
  443. addps %xmm2, %xmm4
  444. pshufd $0x55, %xmm1, %xmm2
  445. movsd -30 * SIZE(BB), %xmm1
  446. mulps %xmm0, %xmm2
  447. movsd -28 * SIZE(AA), %xmm0
  448. addps %xmm2, %xmm5
  449. pshufd $0x00, %xmm1, %xmm2
  450. mulps %xmm0, %xmm2
  451. movsd -26 * SIZE(AA), %xmm0
  452. addps %xmm2, %xmm4
  453. pshufd $0x55, %xmm1, %xmm2
  454. movsd -28 * SIZE(BB), %xmm1
  455. mulps %xmm0, %xmm2
  456. movsd -24 * SIZE(AA), %xmm0
  457. addps %xmm2, %xmm5
  458. pshufd $0x00, %xmm1, %xmm2
  459. mulps %xmm0, %xmm2
  460. movsd -22 * SIZE(AA), %xmm0
  461. addps %xmm2, %xmm4
  462. pshufd $0x55, %xmm1, %xmm2
  463. movsd -26 * SIZE(BB), %xmm1
  464. mulps %xmm0, %xmm2
  465. movsd -20 * SIZE(AA), %xmm0
  466. addps %xmm2, %xmm5
  467. pshufd $0x00, %xmm1, %xmm2
  468. mulps %xmm0, %xmm2
  469. movsd -18 * SIZE(AA), %xmm0
  470. addps %xmm2, %xmm4
  471. pshufd $0x55, %xmm1, %xmm2
  472. movsd -24 * SIZE(BB), %xmm1
  473. mulps %xmm0, %xmm2
  474. movsd -16 * SIZE(AA), %xmm0
  475. addps %xmm2, %xmm5
  476. subl $-16 * SIZE, AA
  477. subl $ -8 * SIZE, BB
  478. subl $1, %eax
  479. jne .L102
  480. ALIGN_4
  481. .L105:
  482. #if defined(LT) || defined(RN)
  483. movl KK, %eax
  484. #else
  485. movl K, %eax
  486. subl KK, %eax
  487. #endif
  488. andl $7, %eax # k & 7: iterations left over after the 8-way unrolled loop
  489. BRANCH
  490. je .L108
  491. ALIGN_4
  492. .L106:
  493. pshufd $0x00, %xmm1, %xmm2
  494. movss -31 * SIZE(BB), %xmm1
  495. mulps %xmm0, %xmm2
  496. movsd -30 * SIZE(AA), %xmm0
  497. addps %xmm2, %xmm4
  498. addl $2 * SIZE, AA
  499. addl $1 * SIZE, BB
  500. decl %eax
  501. jg .L106
  502. ALIGN_4
  503. .L108:
  504. #if defined(LN) || defined(RT)
  505. movl KK, %eax
  506. #ifdef LN
  507. subl $2, %eax
  508. #else
  509. subl $1, %eax
  510. #endif
  511. movl AORIG, AA
  512. leal (, %eax, SIZE), %eax
  513. leal (AA, %eax, 2), AA
  514. leal (B, %eax, 1), BB
  515. #endif
  516. addps %xmm5, %xmm4
  517. #if defined(LN) || defined(LT)
  518. pshufd $1, %xmm4, %xmm6
  519. movss -32 * SIZE(BB), %xmm1
  520. movss -31 * SIZE(BB), %xmm3
  521. subss %xmm4, %xmm1
  522. subss %xmm6, %xmm3
  523. #else
  524. movsd -32 * SIZE(AA), %xmm0
  525. subps %xmm4, %xmm0
  526. #endif
  527. #ifdef LN
  528. movsd -32 * SIZE(AA), %xmm4
  529. movhps -30 * SIZE(AA), %xmm4
  530. pshufd $0xff, %xmm4, %xmm6
  531. mulss %xmm6, %xmm3
  532. pshufd $0xaa, %xmm4, %xmm6
  533. mulss %xmm3, %xmm6
  534. subss %xmm6, %xmm1
  535. pshufd $0x00, %xmm4, %xmm6
  536. mulss %xmm6, %xmm1
  537. #endif
  538. #ifdef LT
  539. movaps -32 * SIZE(AA), %xmm4
  540. pshufd $0x00, %xmm4, %xmm6
  541. mulss %xmm6, %xmm1
  542. pshufd $0x55, %xmm4, %xmm6
  543. mulss %xmm1, %xmm6
  544. subss %xmm6, %xmm3
  545. pshufd $0xff, %xmm4, %xmm6
  546. mulss %xmm6, %xmm3
  547. #endif
  548. #if defined(RN) || defined(RT)
  549. movss -32 * SIZE(BB), %xmm6
  550. pshufd $0x00, %xmm6, %xmm7
  551. mulps %xmm7, %xmm0
  552. #endif
  553. #if defined(LN) || defined(LT)
  554. movss %xmm1, -32 * SIZE(BB)
  555. movss %xmm3, -31 * SIZE(BB)
  556. #else
  557. movlps %xmm0, -32 * SIZE(AA)
  558. #endif
  559. #ifdef LN
  560. subl $2 * SIZE, CO1
  561. #endif
  562. #if defined(LN) || defined(LT)
  563. movss %xmm1, 0 * SIZE(CO1)
  564. movss %xmm3, 1 * SIZE(CO1)
  565. #else
  566. movlps %xmm0, 0 * SIZE(CO1)
  567. #endif
  568. #ifndef LN
  569. addl $2 * SIZE, CO1
  570. #endif
  571. #if defined(LT) || defined(RN)
  572. movl K, %eax
  573. subl KK, %eax
  574. leal (,%eax, SIZE), %eax
  575. leal (AA, %eax, 2), AA
  576. leal (BB, %eax, 1), BB
  577. #endif
  578. #ifdef LN
  579. subl $2, KK
  580. #endif
  581. #ifdef LT
  582. addl $2, KK
  583. #endif
  584. #ifdef RT
  585. movl K, %eax
  586. sall $1 + BASE_SHIFT, %eax
  587. addl %eax, AORIG
  588. #endif
  589. ALIGN_4
  590. .L110:
  591. testl $1, M
  592. je .L119
  593. #ifdef LN
  594. movl K, %eax
  595. sall $BASE_SHIFT, %eax
  596. subl %eax, AORIG
  597. #endif
  598. #if defined(LN) || defined(RT)
  599. movl KK, %eax
  600. movl AORIG, AA
  601. leal (AA, %eax, SIZE), AA
  602. #endif
  603. movl B, BB
  604. #if defined(LN) || defined(RT)
  605. movl KK, %eax
  606. sall $BASE_SHIFT, %eax
  607. addl %eax, BB
  608. #endif
  609. pxor %xmm4, %xmm4
  610. movsd -32 * SIZE(AA), %xmm0
  611. pxor %xmm5, %xmm5
  612. movsd -32 * SIZE(BB), %xmm1
  613. #if defined(LT) || defined(RN)
  614. movl KK, %eax
  615. #else
  616. movl K, %eax
  617. subl KK, %eax
  618. #endif
  619. sarl $3, %eax
  620. je .L115
  621. ALIGN_4
  622. .L112:
  623. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  624. mulps %xmm0, %xmm1
  625. movsd -30 * SIZE(AA), %xmm0
  626. addps %xmm1, %xmm4
  627. movsd -30 * SIZE(BB), %xmm1
  628. mulps %xmm0, %xmm1
  629. movsd -28 * SIZE(AA), %xmm0
  630. addps %xmm1, %xmm4
  631. movsd -28 * SIZE(BB), %xmm1
  632. mulps %xmm0, %xmm1
  633. movsd -26 * SIZE(AA), %xmm0
  634. addps %xmm1, %xmm4
  635. movsd -26 * SIZE(BB), %xmm1
  636. mulps %xmm0, %xmm1
  637. movsd -24 * SIZE(AA), %xmm0
  638. addps %xmm1, %xmm4
  639. movsd -24 * SIZE(BB), %xmm1
  640. subl $-8 * SIZE, AA
  641. subl $-8 * SIZE, BB
  642. subl $1, %eax
  643. jne .L112
  644. ALIGN_4
  645. .L115:
  646. #if defined(LT) || defined(RN)
  647. movl KK, %eax
  648. #else
  649. movl K, %eax
  650. subl KK, %eax
  651. #endif
  652. andl $7, %eax # k & 7: iterations left over after the 8-way unrolled loop
  653. BRANCH
  654. je .L118
  655. ALIGN_4
  656. .L116:
  657. mulss %xmm0, %xmm1
  658. movss -31 * SIZE(AA), %xmm0
  659. addss %xmm1, %xmm4
  660. movss -31 * SIZE(BB), %xmm1
  661. addl $1 * SIZE, AA
  662. addl $1 * SIZE, BB
  663. decl %eax
  664. jg .L116
  665. ALIGN_4
  666. .L118:
  667. #if defined(LN) || defined(RT)
  668. movl KK, %eax
  669. subl $1, %eax
  670. movl AORIG, AA
  671. leal (AA, %eax, SIZE), AA
  672. leal (B, %eax, SIZE), BB
  673. #endif
  674. haddps %xmm4, %xmm4
  675. #if defined(LN) || defined(LT)
  676. movss -32 * SIZE(BB), %xmm1
  677. subss %xmm4, %xmm1
  678. #else
  679. movss -32 * SIZE(AA), %xmm0
  680. subss %xmm4, %xmm0
  681. #endif
  682. #if defined(LN) || defined(LT)
  683. mulss -32 * SIZE(AA), %xmm1
  684. #endif
  685. #if defined(RN) || defined(RT)
  686. mulss -32 * SIZE(BB), %xmm0
  687. #endif
  688. #if defined(LN) || defined(LT)
  689. movss %xmm1, -32 * SIZE(BB)
  690. #else
  691. movss %xmm0, -32 * SIZE(AA)
  692. #endif
  693. #ifdef LN
  694. subl $1 * SIZE, CO1
  695. #endif
  696. #if defined(LN) || defined(LT)
  697. movss %xmm1, 0 * SIZE(CO1)
  698. #else
  699. movss %xmm0, 0 * SIZE(CO1)
  700. #endif
  701. #ifndef LN
  702. addl $1 * SIZE, CO1
  703. #endif
  704. #if defined(LT) || defined(RN)
  705. movl K, %eax
  706. subl KK, %eax
  707. leal (AA, %eax, SIZE), AA
  708. leal (BB, %eax, SIZE), BB
  709. #endif
  710. #ifdef LN
  711. subl $1, KK
  712. #endif
  713. #ifdef LT
  714. addl $1, KK
  715. #endif
  716. #ifdef RT
  717. movl K, %eax
  718. sall $BASE_SHIFT, %eax
  719. addl %eax, AORIG
  720. #endif
  721. ALIGN_4
  722. .L119:
  723. #ifdef LN
  724. movl K, %eax
  725. leal (B, %eax, SIZE), B
  726. #endif
  727. #if defined(LT) || defined(RN)
  728. movl BB, B
  729. #endif
  730. #ifdef RN
  731. addl $1, KK
  732. #endif
  733. #ifdef RT
  734. subl $1, KK
  735. #endif
  736. ALIGN_4
  737. .L40: # two-column pass: runs only when bit 1 of N is set
  738. testl $2, N
  739. je .L80 # bit clear: go to the N >> 2 pass
  740. #if defined(LT) || defined(RN)
  741. movl A, AA # AA restarts at A
  742. #else
  743. movl A, %eax
  744. movl %eax, AORIG # AORIG = A (each tile derives AA from AORIG)
  745. #endif
  746. #ifdef RT
  747. movl K, %eax
  748. sall $1 + BASE_SHIFT, %eax
  749. subl %eax, B # B -= K << (1 + BASE_SHIFT)
  750. #endif
  751. leal (, LDC, 2), %eax # eax = 2 * LDC
  752. #ifdef RT
  753. subl %eax, C # RT: step C back by 2 * LDC before using it
  754. #endif
  755. movl C, CO1 # CO1 = output pointer used by the tiles of this pass
  756. #ifndef RT
  757. addl %eax, C # not RT: step C forward by 2 * LDC after taking CO1
  758. #endif
  759. #ifdef LN
  760. movl OFFSET, %eax
  761. addl M, %eax
  762. movl %eax, KK # KK = OFFSET + M
  763. #endif
  764. #ifdef LT
  765. movl OFFSET, %eax
  766. movl %eax, KK # KK = OFFSET
  767. #endif
  768. movl M, %ebx
  769. sarl $2, %ebx # i = (m >> 2)
  770. jle .L60 # no full group of four rows: go to the M & 2 case
  771. ALIGN_4
  772. .L51: # top of the loop over groups of four rows (ebx counts them down)
  773. #ifdef LN
  774. movl K, %eax
  775. sall $2 + BASE_SHIFT, %eax
  776. subl %eax, AORIG # AORIG -= K << (2 + BASE_SHIFT)
  777. #endif
  778. #if defined(LN) || defined(RT)
  779. movl KK, %eax
  780. movl AORIG, AA
  781. leal (, %eax, SIZE), %eax
  782. leal (AA, %eax, 4), AA # AA = AORIG + 4 * KK * SIZE
  783. #endif
  784. movl B, BB
  785. #if defined(LN) || defined(RT)
  786. movl KK, %eax
  787. sall $1 + BASE_SHIFT, %eax
  788. addl %eax, BB # BB = B + (KK << (1 + BASE_SHIFT))
  789. #endif
  790. movaps -32 * SIZE(AA), %xmm0 # preload the first 16 bytes read through AA
  791. pxor %xmm2, %xmm2 # xmm2 and xmm3 carry the pending products, start at zero
  792. movaps -32 * SIZE(BB), %xmm1 # preload the first 16 bytes read through BB
  793. pxor %xmm3, %xmm3
  794. #ifdef LN
  795. pxor %xmm4, %xmm4 # xmm4 to xmm7 are the running sums
  796. prefetcht0 -4 * SIZE(CO1) # LN steps CO1 back by 4 * SIZE before storing
  797. pxor %xmm5, %xmm5
  798. prefetcht0 -4 * SIZE(CO1, LDC)
  799. #else
  800. pxor %xmm4, %xmm4 # xmm4 to xmm7 are the running sums
  801. prefetcht0 3 * SIZE(CO1)
  802. pxor %xmm5, %xmm5
  803. prefetcht0 3 * SIZE(CO1, LDC)
  804. #endif
  805. pxor %xmm6, %xmm6
  806. pxor %xmm7, %xmm7
  807. #if defined(LT) || defined(RN)
  808. movl KK, %eax # step count = KK
  809. #else
  810. movl K, %eax
  811. subl KK, %eax # step count = K - KK
  812. #endif
  813. sarl $3, %eax # eax = number of eight-step passes
  814. je .L55 # none: only the leftover steps remain
  815. ALIGN_4
  816. .L52: # main loop, eight steps per pass (eax counts passes)
  817. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  818. addps %xmm2, %xmm4 # add the product left pending by the previous step
  819. pshufd $0x00, %xmm1, %xmm2 # broadcast lane 0 of xmm1
  820. mulps %xmm0, %xmm2
  821. addps %xmm3, %xmm5
  822. pshufd $0x55, %xmm1, %xmm3 # broadcast lane 1 of xmm1
  823. mulps %xmm0, %xmm3
  824. movaps -28 * SIZE(AA), %xmm0
  825. addps %xmm2, %xmm6
  826. pshufd $0xaa, %xmm1, %xmm2 # broadcast lane 2 of xmm1
  827. mulps %xmm0, %xmm2
  828. addps %xmm3, %xmm7
  829. pshufd $0xff, %xmm1, %xmm3 # broadcast lane 3 of xmm1
  830. movaps -28 * SIZE(BB), %xmm1
  831. mulps %xmm0, %xmm3
  832. movaps -24 * SIZE(AA), %xmm0
  833. addps %xmm2, %xmm4
  834. pshufd $0x00, %xmm1, %xmm2
  835. mulps %xmm0, %xmm2
  836. addps %xmm3, %xmm5
  837. pshufd $0x55, %xmm1, %xmm3
  838. mulps %xmm0, %xmm3
  839. movaps -20 * SIZE(AA), %xmm0
  840. addps %xmm2, %xmm6
  841. pshufd $0xaa, %xmm1, %xmm2
  842. mulps %xmm0, %xmm2
  843. addps %xmm3, %xmm7
  844. pshufd $0xff, %xmm1, %xmm3
  845. movaps -24 * SIZE(BB), %xmm1
  846. mulps %xmm0, %xmm3
  847. movaps -16 * SIZE(AA), %xmm0
  848. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  849. addps %xmm2, %xmm4
  850. pshufd $0x00, %xmm1, %xmm2
  851. mulps %xmm0, %xmm2
  852. addps %xmm3, %xmm5
  853. pshufd $0x55, %xmm1, %xmm3
  854. mulps %xmm0, %xmm3
  855. movaps -12 * SIZE(AA), %xmm0
  856. addps %xmm2, %xmm6
  857. pshufd $0xaa, %xmm1, %xmm2
  858. mulps %xmm0, %xmm2
  859. addps %xmm3, %xmm7
  860. pshufd $0xff, %xmm1, %xmm3
  861. movaps -20 * SIZE(BB), %xmm1
  862. mulps %xmm0, %xmm3
  863. movaps -8 * SIZE(AA), %xmm0
  864. addps %xmm2, %xmm4
  865. pshufd $0x00, %xmm1, %xmm2
  866. mulps %xmm0, %xmm2
  867. addps %xmm3, %xmm5
  868. pshufd $0x55, %xmm1, %xmm3
  869. mulps %xmm0, %xmm3
  870. movaps -4 * SIZE(AA), %xmm0
  871. addps %xmm2, %xmm6
  872. pshufd $0xaa, %xmm1, %xmm2
  873. mulps %xmm0, %xmm2
  874. addps %xmm3, %xmm7
  875. pshufd $0xff, %xmm1, %xmm3
  876. movaps -16 * SIZE(BB), %xmm1 # preload for the next pass (or the leftover loop)
  877. mulps %xmm0, %xmm3
  878. movaps 0 * SIZE(AA), %xmm0 # preload for the next pass (or the leftover loop)
  879. subl $-32 * SIZE, AA # AA += 32 * SIZE
  880. subl $-16 * SIZE, BB # BB += 16 * SIZE
  881. subl $1, %eax
  882. jne .L52 # the last products stay pending in xmm2 and xmm3
  883. ALIGN_4
  884. .L55: # work out how many steps the unrolled loop left over
  885. #if defined(LT) || defined(RN)
  886. movl KK, %eax
  887. #else
  888. movl K, %eax
  889. subl KK, %eax
  890. #endif
  891. andl $7, %eax # eax = step count & 7 (the loop above handles eight per pass)
  892. BRANCH
  893. je .L58 # nothing left over
  894. ALIGN_4
  895. .L56: # leftover loop: one step per pass, eax passes
  896. addps %xmm2, %xmm4 # add the product left pending by the previous step
  897. pshufd $0x00, %xmm1, %xmm2 # broadcast lane 0 of xmm1
  898. mulps %xmm0, %xmm2
  899. addps %xmm3, %xmm5
  900. pshufd $0x55, %xmm1, %xmm3 # broadcast lane 1 of xmm1
  901. movsd -30 * SIZE(BB), %xmm1 # next 8 bytes from BB
  902. mulps %xmm0, %xmm3
  903. movaps -28 * SIZE(AA), %xmm0 # next 16 bytes from AA
  904. addl $4 * SIZE, AA
  905. addl $2 * SIZE, BB
  906. decl %eax
  907. jg .L56 # products of the final step stay pending in xmm2 and xmm3
  908. ALIGN_4
  909. .L58:
  910. #if defined(LN) || defined(RT)
  911. movl KK, %eax
  912. #ifdef LN
  913. subl $4, %eax
  914. #else
  915. subl $2, %eax
  916. #endif
  917. movl AORIG, AA
  918. leal (, %eax, SIZE), %eax
  919. leal (AA, %eax, 4), AA
  920. leal (B, %eax, 2), BB
  921. #endif
  922. addps %xmm6, %xmm4
  923. addps %xmm7, %xmm5
  924. addps %xmm2, %xmm4
  925. addps %xmm3, %xmm5
  926. #if defined(LN) || defined(LT)
  927. movaps %xmm4, %xmm0
  928. unpcklps %xmm6, %xmm4
  929. unpckhps %xmm6, %xmm0
  930. movaps %xmm5, %xmm1
  931. unpcklps %xmm7, %xmm5
  932. unpckhps %xmm7, %xmm1
  933. movaps %xmm4, %xmm6
  934. unpcklps %xmm5, %xmm4
  935. unpckhps %xmm5, %xmm6
  936. movaps %xmm0, %xmm2
  937. unpcklps %xmm1, %xmm0
  938. unpckhps %xmm1, %xmm2
  939. movsd -32 * SIZE(BB), %xmm1
  940. movsd -30 * SIZE(BB), %xmm3
  941. movsd -28 * SIZE(BB), %xmm5
  942. movsd -26 * SIZE(BB), %xmm7
  943. subps %xmm4, %xmm1
  944. subps %xmm6, %xmm3
  945. subps %xmm0, %xmm5
  946. subps %xmm2, %xmm7
  947. #else
  948. movaps -32 * SIZE(AA), %xmm0
  949. movaps -28 * SIZE(AA), %xmm1
  950. subps %xmm4, %xmm0
  951. subps %xmm5, %xmm1
  952. #endif
  953. #ifdef LN
  954. movaps -20 * SIZE(AA), %xmm4
  955. pshufd $0xff, %xmm4, %xmm6
  956. mulps %xmm6, %xmm7
  957. pshufd $0xaa, %xmm4, %xmm6
  958. mulps %xmm7, %xmm6
  959. subps %xmm6, %xmm5
  960. pshufd $0x55, %xmm4, %xmm6
  961. mulps %xmm7, %xmm6
  962. subps %xmm6, %xmm3
  963. pshufd $0x00, %xmm4, %xmm6
  964. mulps %xmm7, %xmm6
  965. subps %xmm6, %xmm1
  966. movaps -24 * SIZE(AA), %xmm4
  967. pshufd $0xaa, %xmm4, %xmm6
  968. mulps %xmm6, %xmm5
  969. pshufd $0x55, %xmm4, %xmm6
  970. mulps %xmm5, %xmm6
  971. subps %xmm6, %xmm3
  972. pshufd $0x00, %xmm4, %xmm6
  973. mulps %xmm5, %xmm6
  974. subps %xmm6, %xmm1
  975. movaps -28 * SIZE(AA), %xmm4
  976. pshufd $0x55, %xmm4, %xmm6
  977. mulps %xmm6, %xmm3
  978. pshufd $0x00, %xmm4, %xmm6
  979. mulps %xmm3, %xmm6
  980. subps %xmm6, %xmm1
  981. movaps -32 * SIZE(AA), %xmm4
  982. pshufd $0x00, %xmm4, %xmm6
  983. mulps %xmm6, %xmm1
  984. #endif
  985. #ifdef LT
  986. movaps -32 * SIZE(AA), %xmm4
  987. pshufd $0x00, %xmm4, %xmm6
  988. mulps %xmm6, %xmm1
  989. pshufd $0x55, %xmm4, %xmm6
  990. mulps %xmm1, %xmm6
  991. subps %xmm6, %xmm3
  992. pshufd $0xaa, %xmm4, %xmm6
  993. mulps %xmm1, %xmm6
  994. subps %xmm6, %xmm5
  995. pshufd $0xff, %xmm4, %xmm6
  996. mulps %xmm1, %xmm6
  997. subps %xmm6, %xmm7
  998. movaps -28 * SIZE(AA), %xmm4
  999. pshufd $0x55, %xmm4, %xmm6
  1000. mulps %xmm6, %xmm3
  1001. pshufd $0xaa, %xmm4, %xmm6
  1002. mulps %xmm3, %xmm6
  1003. subps %xmm6, %xmm5
  1004. pshufd $0xff, %xmm4, %xmm6
  1005. mulps %xmm3, %xmm6
  1006. subps %xmm6, %xmm7
  1007. movaps -24 * SIZE(AA), %xmm4
  1008. pshufd $0xaa, %xmm4, %xmm6
  1009. mulps %xmm6, %xmm5
  1010. pshufd $0xff, %xmm4, %xmm6
  1011. mulps %xmm5, %xmm6
  1012. subps %xmm6, %xmm7
  1013. movaps -20 * SIZE(AA), %xmm4
  1014. pshufd $0xff, %xmm4, %xmm6
  1015. mulps %xmm6, %xmm7
  1016. #endif
  1017. #ifdef RN
  1018. movaps -32 * SIZE(BB), %xmm6
  1019. pshufd $0x00, %xmm6, %xmm7
  1020. mulps %xmm7, %xmm0
  1021. pshufd $0x55, %xmm6, %xmm7
  1022. mulps %xmm0, %xmm7
  1023. subps %xmm7, %xmm1
  1024. pshufd $0xff, %xmm6, %xmm7
  1025. mulps %xmm7, %xmm1
  1026. #endif
  1027. #ifdef RT
  1028. movaps -32 * SIZE(BB), %xmm6
  1029. pshufd $0xff, %xmm6, %xmm7
  1030. mulps %xmm7, %xmm1
  1031. pshufd $0xaa, %xmm6, %xmm7
  1032. mulps %xmm1, %xmm7
  1033. subps %xmm7, %xmm0
  1034. pshufd $0x00, %xmm6, %xmm7
  1035. mulps %xmm7, %xmm0
  1036. #endif
  1037. #if defined(LN) || defined(LT)
  1038. movlps %xmm1, -32 * SIZE(BB)
  1039. movlps %xmm3, -30 * SIZE(BB)
  1040. movlps %xmm5, -28 * SIZE(BB)
  1041. movlps %xmm7, -26 * SIZE(BB)
  1042. #else
  1043. movaps %xmm0, -32 * SIZE(AA)
  1044. movaps %xmm1, -28 * SIZE(AA)
  1045. #endif
  1046. #ifdef LN
  1047. subl $4 * SIZE, CO1
  1048. #endif
  1049. #if defined(LN) || defined(LT)
  1050. unpcklps %xmm5, %xmm1
  1051. unpcklps %xmm7, %xmm3
  1052. movaps %xmm1, %xmm2
  1053. unpcklps %xmm3, %xmm1
  1054. unpckhps %xmm3, %xmm2
  1055. movlps %xmm1, 0 * SIZE(CO1)
  1056. movhps %xmm1, 2 * SIZE(CO1)
  1057. movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
  1058. movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
  1059. #else
  1060. movlps %xmm0, 0 * SIZE(CO1)
  1061. movhps %xmm0, 2 * SIZE(CO1)
  1062. movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
  1063. movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
  1064. #endif
  1065. #ifndef LN
  1066. addl $4 * SIZE, CO1
  1067. #endif
  1068. #if defined(LT) || defined(RN)
  1069. movl K, %eax
  1070. subl KK, %eax
  1071. leal (,%eax, SIZE), %eax
  1072. leal (AA, %eax, 4), AA
  1073. leal (BB, %eax, 2), BB
  1074. #endif
  1075. #ifdef LN
  1076. subl $4, KK
  1077. #endif
  1078. #ifdef LT
  1079. addl $4, KK
  1080. #endif
  1081. #ifdef RT
  1082. movl K, %eax
  1083. sall $2 + BASE_SHIFT, %eax
  1084. addl %eax, AORIG
  1085. #endif
  1086. decl %ebx # i --
  1087. jg .L51
  1088. ALIGN_4
  1089. .L60: # two-row tile of the two-column pass: runs only when bit 1 of M is set
  1090. testl $2, M
  1091. je .L70 # bit clear: go to the M & 1 case
  1092. #ifdef LN
  1093. movl K, %eax
  1094. sall $1 + BASE_SHIFT, %eax
  1095. subl %eax, AORIG # AORIG -= K << (1 + BASE_SHIFT)
  1096. #endif
  1097. #if defined(LN) || defined(RT)
  1098. movl KK, %eax
  1099. movl AORIG, AA
  1100. leal (, %eax, SIZE), %eax
  1101. leal (AA, %eax, 2), AA # AA = AORIG + 2 * KK * SIZE
  1102. #endif
  1103. movl B, BB
  1104. #if defined(LN) || defined(RT)
  1105. movl KK, %eax
  1106. sall $1 + BASE_SHIFT, %eax
  1107. addl %eax, BB # BB = B + (KK << (1 + BASE_SHIFT))
  1108. #endif
  1109. movaps -32 * SIZE(AA), %xmm0 # preload the first 16 bytes read through AA
  1110. pxor %xmm3, %xmm3 # xmm3 carries the pending product, starts at zero
  1111. movaps -32 * SIZE(BB), %xmm1 # preload the first 16 bytes read through BB
  1112. pxor %xmm4, %xmm4 # xmm4 and xmm5 are the running sums
  1113. pxor %xmm5, %xmm5
  1114. #if defined(LT) || defined(RN)
  1115. movl KK, %eax # step count = KK
  1116. #else
  1117. movl K, %eax
  1118. subl KK, %eax # step count = K - KK
  1119. #endif
  1120. sarl $3, %eax # eax = number of eight-step passes
  1121. je .L65 # none: only the leftover steps remain
  1122. ALIGN_4
  1123. .L62: # main loop for the two-row tile (eax counts passes)
  1124. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1125. pshufd $0x44, %xmm0, %xmm2 # xmm2 = lanes 0,1,0,1 of xmm0
  1126. addps %xmm3, %xmm4 # add the product left pending by the previous step
  1127. pshufd $0x50, %xmm1, %xmm3 # xmm3 = lanes 0,0,1,1 of xmm1
  1128. mulps %xmm2, %xmm3
  1129. pshufd $0xee, %xmm0, %xmm2 # xmm2 = lanes 2,3,2,3 of xmm0
  1130. movaps -28 * SIZE(AA), %xmm0
  1131. addps %xmm3, %xmm5
  1132. pshufd $0xfa, %xmm1, %xmm3 # xmm3 = lanes 2,2,3,3 of xmm1
  1133. movaps -28 * SIZE(BB), %xmm1
  1134. mulps %xmm2, %xmm3
  1135. pshufd $0x44, %xmm0, %xmm2
  1136. addps %xmm3, %xmm4
  1137. pshufd $0x50, %xmm1, %xmm3
  1138. mulps %xmm2, %xmm3
  1139. pshufd $0xee, %xmm0, %xmm2
  1140. movaps -24 * SIZE(AA), %xmm0
  1141. addps %xmm3, %xmm5
  1142. pshufd $0xfa, %xmm1, %xmm3
  1143. movaps -24 * SIZE(BB), %xmm1
  1144. mulps %xmm2, %xmm3
  1145. pshufd $0x44, %xmm0, %xmm2
  1146. addps %xmm3, %xmm4
  1147. pshufd $0x50, %xmm1, %xmm3
  1148. mulps %xmm2, %xmm3
  1149. pshufd $0xee, %xmm0, %xmm2
  1150. movaps -20 * SIZE(AA), %xmm0
  1151. addps %xmm3, %xmm5
  1152. pshufd $0xfa, %xmm1, %xmm3
  1153. movaps -20 * SIZE(BB), %xmm1
  1154. mulps %xmm2, %xmm3
  1155. pshufd $0x44, %xmm0, %xmm2
  1156. addps %xmm3, %xmm4
  1157. pshufd $0x50, %xmm1, %xmm3
  1158. mulps %xmm2, %xmm3
  1159. pshufd $0xee, %xmm0, %xmm2
  1160. movaps -16 * SIZE(AA), %xmm0 # preload for the next pass (or the leftover loop)
  1161. addps %xmm3, %xmm5
  1162. pshufd $0xfa, %xmm1, %xmm3
  1163. movaps -16 * SIZE(BB), %xmm1 # preload for the next pass (or the leftover loop)
  1164. mulps %xmm2, %xmm3
  1165. subl $-16 * SIZE, AA # AA += 16 * SIZE
  1166. subl $-16 * SIZE, BB # BB += 16 * SIZE
  1167. subl $1, %eax
  1168. jne .L62 # the last product stays pending in xmm3
  1169. ALIGN_4
  1170. .L65: # work out how many steps the unrolled loop left over
  1171. #if defined(LT) || defined(RN)
  1172. movl KK, %eax
  1173. #else
  1174. movl K, %eax
  1175. subl KK, %eax
  1176. #endif
  1177. andl $7, %eax # eax = step count & 7 (the loop above handles eight per pass)
  1178. BRANCH
  1179. je .L68 # nothing left over
  1180. ALIGN_4
  1181. .L66: # leftover loop: one step per pass, eax passes
  1182. pshufd $0x44, %xmm0, %xmm2 # xmm2 = lanes 0,1,0,1 of xmm0
  1183. movsd -30 * SIZE(AA), %xmm0 # next 8 bytes from AA
  1184. addps %xmm3, %xmm4 # add the product left pending by the previous step
  1185. pshufd $0x50, %xmm1, %xmm3 # xmm3 = lanes 0,0,1,1 of xmm1
  1186. movsd -30 * SIZE(BB), %xmm1 # next 8 bytes from BB
  1187. mulps %xmm2, %xmm3
  1188. addl $2 * SIZE, AA
  1189. addl $2 * SIZE, BB
  1190. decl %eax
  1191. jg .L66 # the final product stays pending in xmm3
  1192. ALIGN_4
  1193. .L68:
  1194. #if defined(LN) || defined(RT)
  1195. movl KK, %eax
  1196. #ifdef LN
  1197. subl $2, %eax
  1198. #else
  1199. subl $2, %eax
  1200. #endif
  1201. movl AORIG, AA
  1202. leal (, %eax, SIZE), %eax
  1203. leal (AA, %eax, 2), AA
  1204. leal (B, %eax, 2), BB
  1205. #endif
  1206. addps %xmm3, %xmm4
  1207. addps %xmm5, %xmm4
  1208. movhlps %xmm4, %xmm5
  1209. #if defined(LN) || defined(LT)
  1210. unpcklps %xmm6, %xmm4
  1211. unpcklps %xmm7, %xmm5
  1212. movaps %xmm4, %xmm6
  1213. unpcklps %xmm5, %xmm4
  1214. unpckhps %xmm5, %xmm6
  1215. movsd -32 * SIZE(BB), %xmm1
  1216. movsd -30 * SIZE(BB), %xmm3
  1217. subps %xmm4, %xmm1
  1218. subps %xmm6, %xmm3
  1219. #else
  1220. movsd -32 * SIZE(AA), %xmm0
  1221. movsd -30 * SIZE(AA), %xmm1
  1222. subps %xmm4, %xmm0
  1223. subps %xmm5, %xmm1
  1224. #endif
  1225. #ifdef LN
  1226. movaps -32 * SIZE(AA), %xmm4
  1227. pshufd $0xff, %xmm4, %xmm6
  1228. mulps %xmm6, %xmm3
  1229. pshufd $0xaa, %xmm4, %xmm6
  1230. mulps %xmm3, %xmm6
  1231. subps %xmm6, %xmm1
  1232. pshufd $0x00, %xmm4, %xmm6
  1233. mulps %xmm6, %xmm1
  1234. #endif
  1235. #ifdef LT
  1236. movaps -32 * SIZE(AA), %xmm4
  1237. pshufd $0x00, %xmm4, %xmm6
  1238. mulps %xmm6, %xmm1
  1239. pshufd $0x55, %xmm4, %xmm6
  1240. mulps %xmm1, %xmm6
  1241. subps %xmm6, %xmm3
  1242. pshufd $0xff, %xmm4, %xmm6
  1243. mulps %xmm6, %xmm3
  1244. #endif
  1245. #ifdef RN
  1246. movaps -32 * SIZE(BB), %xmm6
  1247. pshufd $0x00, %xmm6, %xmm7
  1248. mulps %xmm7, %xmm0
  1249. pshufd $0x55, %xmm6, %xmm7
  1250. mulps %xmm0, %xmm7
  1251. subps %xmm7, %xmm1
  1252. pshufd $0xff, %xmm6, %xmm7
  1253. mulps %xmm7, %xmm1
  1254. #endif
  1255. #ifdef RT
  1256. movaps -32 * SIZE(BB), %xmm6
  1257. pshufd $0xff, %xmm6, %xmm7
  1258. mulps %xmm7, %xmm1
  1259. pshufd $0xaa, %xmm6, %xmm7
  1260. mulps %xmm1, %xmm7
  1261. subps %xmm7, %xmm0
  1262. pshufd $0x00, %xmm6, %xmm7
  1263. mulps %xmm7, %xmm0
  1264. #endif
  1265. #if defined(LN) || defined(LT)
  1266. movlps %xmm1, -32 * SIZE(BB)
  1267. movlps %xmm3, -30 * SIZE(BB)
  1268. #else
  1269. movlps %xmm0, -32 * SIZE(AA)
  1270. movlps %xmm1, -30 * SIZE(AA)
  1271. #endif
  1272. #ifdef LN
  1273. subl $2 * SIZE, CO1
  1274. #endif
  1275. #if defined(LN) || defined(LT)
  1276. unpcklps %xmm3, %xmm1
  1277. movlps %xmm1, 0 * SIZE(CO1)
  1278. movhps %xmm1, 0 * SIZE(CO1, LDC)
  1279. #else
  1280. movlps %xmm0, 0 * SIZE(CO1)
  1281. movlps %xmm1, 0 * SIZE(CO1, LDC)
  1282. #endif
  1283. #ifndef LN
  1284. addl $2 * SIZE, CO1
  1285. #endif
  1286. #if defined(LT) || defined(RN)
  1287. movl K, %eax
  1288. subl KK, %eax
  1289. leal (,%eax, SIZE), %eax
  1290. leal (AA, %eax, 2), AA
  1291. leal (BB, %eax, 2), BB
  1292. #endif
  1293. #ifdef LN
  1294. subl $2, KK
  1295. #endif
  1296. #ifdef LT
  1297. addl $2, KK
  1298. #endif
  1299. #ifdef RT
  1300. movl K, %eax
  1301. sall $1 + BASE_SHIFT, %eax
  1302. addl %eax, AORIG
  1303. #endif
  1304. ALIGN_4
  1305. .L70: # one-row tile of the two-column pass: runs only when bit 0 of M is set
  1306. testl $1, M
  1307. je .L79 # bit clear: finish the two-column pass
  1308. #ifdef LN
  1309. movl K, %eax
  1310. sall $BASE_SHIFT, %eax
  1311. subl %eax, AORIG # AORIG -= K << BASE_SHIFT
  1312. #endif
  1313. #if defined(LN) || defined(RT)
  1314. movl KK, %eax
  1315. movl AORIG, AA
  1316. leal (AA, %eax, SIZE), AA # AA = AORIG + KK * SIZE
  1317. #endif
  1318. movl B, BB
  1319. #if defined(LN) || defined(RT)
  1320. movl KK, %eax
  1321. sall $1 + BASE_SHIFT, %eax
  1322. addl %eax, BB # BB = B + (KK << (1 + BASE_SHIFT))
  1323. #endif
  1324. pxor %xmm4, %xmm4 # xmm4 and xmm5 are the running sums
  1325. movsd -32 * SIZE(AA), %xmm0 # preload the first 8 bytes read through AA
  1326. pxor %xmm5, %xmm5
  1327. movsd -32 * SIZE(BB), %xmm1 # preload the first 8 bytes read through BB
  1328. #if defined(LT) || defined(RN)
  1329. movl KK, %eax # step count = KK
  1330. #else
  1331. movl K, %eax
  1332. subl KK, %eax # step count = K - KK
  1333. #endif
  1334. sarl $3, %eax # eax = number of eight-step passes
  1335. je .L75 # none: only the leftover steps remain
  1336. ALIGN_4
  1337. .L72: # main loop for the one-row tile (eax counts passes)
  1338. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1339. pshufd $0x00, %xmm0, %xmm2 # broadcast lane 0 of xmm0
  1340. mulps %xmm2, %xmm1
  1341. addps %xmm1, %xmm4 # steps alternate between the sums xmm4 and xmm5
  1342. movsd -30 * SIZE(BB), %xmm1
  1343. pshufd $0x55, %xmm0, %xmm2 # broadcast lane 1 of xmm0
  1344. movsd -30 * SIZE(AA), %xmm0
  1345. mulps %xmm2, %xmm1
  1346. addps %xmm1, %xmm5
  1347. movsd -28 * SIZE(BB), %xmm1
  1348. pshufd $0x00, %xmm0, %xmm2
  1349. mulps %xmm2, %xmm1
  1350. addps %xmm1, %xmm4
  1351. movsd -26 * SIZE(BB), %xmm1
  1352. pshufd $0x55, %xmm0, %xmm2
  1353. movsd -28 * SIZE(AA), %xmm0
  1354. mulps %xmm2, %xmm1
  1355. addps %xmm1, %xmm5
  1356. movsd -24 * SIZE(BB), %xmm1
  1357. pshufd $0x00, %xmm0, %xmm2
  1358. mulps %xmm2, %xmm1
  1359. addps %xmm1, %xmm4
  1360. movsd -22 * SIZE(BB), %xmm1
  1361. pshufd $0x55, %xmm0, %xmm2
  1362. movsd -26 * SIZE(AA), %xmm0
  1363. mulps %xmm2, %xmm1
  1364. addps %xmm1, %xmm5
  1365. movsd -20 * SIZE(BB), %xmm1
  1366. pshufd $0x00, %xmm0, %xmm2
  1367. mulps %xmm2, %xmm1
  1368. addps %xmm1, %xmm4
  1369. movsd -18 * SIZE(BB), %xmm1
  1370. pshufd $0x55, %xmm0, %xmm2
  1371. movsd -24 * SIZE(AA), %xmm0 # preload for the next pass (or the leftover loop)
  1372. mulps %xmm2, %xmm1
  1373. addps %xmm1, %xmm5
  1374. movsd -16 * SIZE(BB), %xmm1 # preload for the next pass (or the leftover loop)
  1375. subl $ -8 * SIZE, AA # AA += 8 * SIZE
  1376. subl $-16 * SIZE, BB # BB += 16 * SIZE
  1377. subl $1, %eax
  1378. jne .L72
  1379. ALIGN_4
  1380. .L75: # work out how many steps the unrolled loop left over
  1381. #if defined(LT) || defined(RN)
  1382. movl KK, %eax
  1383. #else
  1384. movl K, %eax
  1385. subl KK, %eax
  1386. #endif
  1387. andl $7, %eax # eax = step count & 7 (the loop above handles eight per pass)
  1388. BRANCH
  1389. je .L78 # nothing left over
  1390. ALIGN_4
  1391. .L76: # leftover loop: one step per pass, eax passes
  1392. pshufd $0x00, %xmm0, %xmm2 # broadcast lane 0 of xmm0
  1393. movss -31 * SIZE(AA), %xmm0 # next single value from AA
  1394. mulps %xmm2, %xmm1
  1395. addps %xmm1, %xmm4
  1396. movsd -30 * SIZE(BB), %xmm1 # next 8 bytes from BB
  1397. addl $1 * SIZE, AA
  1398. addl $2 * SIZE, BB
  1399. decl %eax
  1400. jg .L76
  1401. ALIGN_4
  1402. .L78:
  1403. #if defined(LN) || defined(RT)
  1404. movl KK, %eax
  1405. #ifdef LN
  1406. subl $1, %eax
  1407. #else
  1408. subl $2, %eax
  1409. #endif
  1410. movl AORIG, AA
  1411. leal (, %eax, SIZE), %eax
  1412. leal (AA, %eax, 1), AA
  1413. leal (B, %eax, 2), BB
  1414. #endif
  1415. addps %xmm5, %xmm4
  1416. pshufd $0x55, %xmm4, %xmm5
  1417. pshufd $0x00, %xmm4, %xmm4
  1418. #if defined(LN) || defined(LT)
  1419. unpcklps %xmm5, %xmm4
  1420. movsd -32 * SIZE(BB), %xmm1
  1421. subps %xmm4, %xmm1
  1422. #else
  1423. movss -32 * SIZE(AA), %xmm0
  1424. movss -31 * SIZE(AA), %xmm1
  1425. subss %xmm4, %xmm0
  1426. subss %xmm5, %xmm1
  1427. #endif
  1428. #if defined(LN) || defined(LT)
  1429. movss -32 * SIZE(AA), %xmm4
  1430. pshufd $0x00, %xmm4, %xmm6
  1431. mulps %xmm6, %xmm1
  1432. #endif
  1433. #ifdef RN
  1434. movaps -32 * SIZE(BB), %xmm6
  1435. pshufd $0x00, %xmm6, %xmm7
  1436. mulss %xmm7, %xmm0
  1437. pshufd $0x55, %xmm6, %xmm7
  1438. mulss %xmm0, %xmm7
  1439. subss %xmm7, %xmm1
  1440. pshufd $0xff, %xmm6, %xmm7
  1441. mulss %xmm7, %xmm1
  1442. #endif
  1443. #ifdef RT
  1444. movaps -32 * SIZE(BB), %xmm6
  1445. pshufd $0xff, %xmm6, %xmm7
  1446. mulss %xmm7, %xmm1
  1447. pshufd $0xaa, %xmm6, %xmm7
  1448. mulss %xmm1, %xmm7
  1449. subss %xmm7, %xmm0
  1450. pshufd $0x00, %xmm6, %xmm7
  1451. mulss %xmm7, %xmm0
  1452. #endif
  1453. #if defined(LN) || defined(LT)
  1454. movlps %xmm1, -32 * SIZE(BB)
  1455. #else
  1456. movss %xmm0, -32 * SIZE(AA)
  1457. movss %xmm1, -31 * SIZE(AA)
  1458. #endif
  1459. #ifdef LN
  1460. subl $1 * SIZE, CO1
  1461. #endif
  1462. #if defined(LN) || defined(LT)
  1463. pshufd $1, %xmm1, %xmm3
  1464. movss %xmm1, 0 * SIZE(CO1)
  1465. movss %xmm3, 0 * SIZE(CO1, LDC)
  1466. #else
  1467. movss %xmm0, 0 * SIZE(CO1)
  1468. movss %xmm1, 0 * SIZE(CO1, LDC)
  1469. #endif
  1470. #ifndef LN
  1471. addl $1 * SIZE, CO1
  1472. #endif
  1473. #if defined(LT) || defined(RN)
  1474. movl K, %eax
  1475. subl KK, %eax
  1476. leal (,%eax, SIZE), %eax
  1477. leal (AA, %eax, 1), AA
  1478. leal (BB, %eax, 2), BB
  1479. #endif
  1480. #ifdef LN
  1481. subl $1, KK
  1482. #endif
  1483. #ifdef LT
  1484. addl $1, KK
  1485. #endif
  1486. #ifdef RT
  1487. movl K, %eax
  1488. sall $BASE_SHIFT, %eax
  1489. addl %eax, AORIG
  1490. #endif
  1491. ALIGN_4
  1492. .L79: # wrap-up of the two-column pass: step B and KK
  1493. #ifdef LN
  1494. movl K, %eax
  1495. leal (, %eax, SIZE), %eax
  1496. leal (B, %eax, 2), B # B += 2 * K * SIZE
  1497. #endif
  1498. #if defined(LT) || defined(RN)
  1499. movl BB, B # carry on from the current BB position
  1500. #endif
  1501. #ifdef RN
  1502. addl $2, KK # KK += 2
  1503. #endif
  1504. #ifdef RT
  1505. subl $2, KK # KK -= 2
  1506. #endif
  1507. ALIGN_4
  1508. .L80: # four-column pass: J = N >> 2
  1509. movl N, %eax
  1510. sarl $2, %eax
  1511. movl %eax, J
  1512. jle .L999 # no group of four columns; .L999 is outside this view (presumably the exit path, confirm)
  1513. .L10: # setup for one group of four columns (the branch back here is outside this view)
  1514. #if defined(LT) || defined(RN)
  1515. movl A, AA # AA restarts at A
  1516. #else
  1517. movl A, %eax
  1518. movl %eax, AORIG # AORIG = A (each tile derives AA from AORIG)
  1519. #endif
  1520. #ifdef RT
  1521. movl K, %eax
  1522. sall $2 + BASE_SHIFT, %eax
  1523. subl %eax, B # B -= K << (2 + BASE_SHIFT)
  1524. #endif
  1525. leal (, LDC, 4), %eax # eax = 4 * LDC
  1526. #ifdef RT
  1527. subl %eax, C # RT: step C back by 4 * LDC before using it
  1528. #endif
  1529. movl C, CO1 # CO1 = output pointer used by the tiles of this pass
  1530. #ifndef RT
  1531. addl %eax, C # not RT: step C forward by 4 * LDC after taking CO1
  1532. #endif
  1533. #ifdef LN
  1534. movl OFFSET, %eax
  1535. addl M, %eax
  1536. movl %eax, KK # KK = OFFSET + M
  1537. #endif
  1538. #ifdef LT
  1539. movl OFFSET, %eax
  1540. movl %eax, KK # KK = OFFSET
  1541. #endif
  1542. movl M, %ebx
  1543. sarl $2, %ebx # i = (m >> 2)
  1544. jle .L20 # no full group of four rows: go to the M & 2 case
  1545. ALIGN_4
  1546. .L11: # top of the loop over groups of four rows (ebx counts them down)
  1547. #ifdef LN
  1548. movl K, %eax
  1549. sall $2 + BASE_SHIFT, %eax
  1550. subl %eax, AORIG # AORIG -= K << (2 + BASE_SHIFT)
  1551. #endif
  1552. #if defined(LN) || defined(RT)
  1553. movl KK, %eax
  1554. movl AORIG, AA
  1555. leal (, %eax, SIZE), %eax
  1556. leal (AA, %eax, 4), AA # AA = AORIG + 4 * KK * SIZE
  1557. #endif
  1558. movl B, BB
  1559. #if defined(LN) || defined(RT)
  1560. movl KK, %eax
  1561. sall $2 + BASE_SHIFT, %eax
  1562. addl %eax, BB # BB = B + (KK << (2 + BASE_SHIFT))
  1563. #endif
  1564. leal (CO1, LDC, 2), %eax # eax = CO1 + 2 * LDC, used only for the prefetches below
  1565. movaps -32 * SIZE(AA), %xmm0 # preload the first 16 bytes read through AA
  1566. pxor %xmm2, %xmm2 # xmm2 and xmm3 carry the pending products, start at zero
  1567. movaps -32 * SIZE(BB), %xmm1 # preload the first 16 bytes read through BB
  1568. pxor %xmm3, %xmm3
  1569. #ifdef LN
  1570. pxor %xmm4, %xmm4 # xmm4 to xmm7 are the running sums
  1571. prefetcht0 -4 * SIZE(CO1) # LN steps CO1 back by 4 * SIZE before storing
  1572. pxor %xmm5, %xmm5
  1573. prefetcht0 -4 * SIZE(CO1, LDC)
  1574. pxor %xmm6, %xmm6
  1575. prefetcht0 -4 * SIZE(%eax)
  1576. pxor %xmm7, %xmm7
  1577. prefetcht0 -4 * SIZE(%eax, LDC)
  1578. #else
  1579. pxor %xmm4, %xmm4 # xmm4 to xmm7 are the running sums
  1580. prefetcht0 3 * SIZE(CO1)
  1581. pxor %xmm5, %xmm5
  1582. prefetcht0 3 * SIZE(CO1, LDC)
  1583. pxor %xmm6, %xmm6
  1584. prefetcht0 3 * SIZE(%eax)
  1585. pxor %xmm7, %xmm7
  1586. prefetcht0 3 * SIZE(%eax, LDC)
  1587. #endif
  1588. #if defined(LT) || defined(RN)
  1589. movl KK, %eax # step count = KK
  1590. #else
  1591. movl K, %eax
  1592. subl KK, %eax # step count = K - KK
  1593. #endif
  1594. sarl $3, %eax # eax = number of eight-step passes
  1595. je .L15 # none: only the leftover steps remain
  1596. ALIGN_4
  1597. .L12:
  1598. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1599. addps %xmm2, %xmm7
  1600. pshufd $0x93, %xmm1, %xmm2
  1601. mulps %xmm0, %xmm1
  1602. addps %xmm3, %xmm6
  1603. pshufd $0x93, %xmm2, %xmm3
  1604. mulps %xmm0, %xmm2
  1605. addps %xmm2, %xmm5
  1606. pshufd $0x93, %xmm3, %xmm2
  1607. mulps %xmm0, %xmm3
  1608. addps %xmm1, %xmm4
  1609. movaps -28 * SIZE(BB), %xmm1
  1610. mulps %xmm0, %xmm2
  1611. movaps -28 * SIZE(AA), %xmm0
  1612. addps %xmm2, %xmm7
  1613. pshufd $0x93, %xmm1, %xmm2
  1614. mulps %xmm0, %xmm1
  1615. addps %xmm3, %xmm6
  1616. pshufd $0x93, %xmm2, %xmm3
  1617. mulps %xmm0, %xmm2
  1618. addps %xmm2, %xmm5
  1619. pshufd $0x93, %xmm3, %xmm2
  1620. mulps %xmm0, %xmm3
  1621. addps %xmm1, %xmm4
  1622. movaps -24 * SIZE(BB), %xmm1
  1623. mulps %xmm0, %xmm2
  1624. movaps -24 * SIZE(AA), %xmm0
  1625. addps %xmm2, %xmm7
  1626. pshufd $0x93, %xmm1, %xmm2
  1627. mulps %xmm0, %xmm1
  1628. addps %xmm3, %xmm6
  1629. pshufd $0x93, %xmm2, %xmm3
  1630. mulps %xmm0, %xmm2
  1631. addps %xmm2, %xmm5
  1632. pshufd $0x93, %xmm3, %xmm2
  1633. mulps %xmm0, %xmm3
  1634. addps %xmm1, %xmm4
  1635. movaps -20 * SIZE(BB), %xmm1
  1636. mulps %xmm0, %xmm2
  1637. movaps -20 * SIZE(AA), %xmm0
  1638. addps %xmm2, %xmm7
  1639. pshufd $0x93, %xmm1, %xmm2
  1640. mulps %xmm0, %xmm1
  1641. addps %xmm3, %xmm6
  1642. pshufd $0x93, %xmm2, %xmm3
  1643. mulps %xmm0, %xmm2
  1644. addps %xmm2, %xmm5
  1645. pshufd $0x93, %xmm3, %xmm2
  1646. mulps %xmm0, %xmm3
  1647. addps %xmm1, %xmm4
  1648. movaps -16 * SIZE(BB), %xmm1
  1649. mulps %xmm0, %xmm2
  1650. movaps -16 * SIZE(AA), %xmm0
  1651. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  1652. addps %xmm2, %xmm7
  1653. pshufd $0x93, %xmm1, %xmm2
  1654. mulps %xmm0, %xmm1
  1655. addps %xmm3, %xmm6
  1656. pshufd $0x93, %xmm2, %xmm3
  1657. mulps %xmm0, %xmm2
  1658. addps %xmm2, %xmm5
  1659. pshufd $0x93, %xmm3, %xmm2
  1660. mulps %xmm0, %xmm3
  1661. addps %xmm1, %xmm4
  1662. movaps -12 * SIZE(BB), %xmm1
  1663. mulps %xmm0, %xmm2
  1664. movaps -12 * SIZE(AA), %xmm0
  1665. addps %xmm2, %xmm7
  1666. pshufd $0x93, %xmm1, %xmm2
  1667. mulps %xmm0, %xmm1
  1668. addps %xmm3, %xmm6
  1669. pshufd $0x93, %xmm2, %xmm3
  1670. mulps %xmm0, %xmm2
  1671. addps %xmm2, %xmm5
  1672. pshufd $0x93, %xmm3, %xmm2
  1673. mulps %xmm0, %xmm3
  1674. addps %xmm1, %xmm4
  1675. movaps -8 * SIZE(BB), %xmm1
  1676. mulps %xmm0, %xmm2
  1677. movaps -8 * SIZE(AA), %xmm0
  1678. addps %xmm2, %xmm7
  1679. pshufd $0x93, %xmm1, %xmm2
  1680. mulps %xmm0, %xmm1
  1681. addps %xmm3, %xmm6
  1682. pshufd $0x93, %xmm2, %xmm3
  1683. mulps %xmm0, %xmm2
  1684. addps %xmm2, %xmm5
  1685. pshufd $0x93, %xmm3, %xmm2
  1686. mulps %xmm0, %xmm3
  1687. addps %xmm1, %xmm4
  1688. movaps -4 * SIZE(BB), %xmm1
  1689. mulps %xmm0, %xmm2
  1690. movaps -4 * SIZE(AA), %xmm0
  1691. addps %xmm2, %xmm7
  1692. subl $-32 * SIZE, BB
  1693. pshufd $0x93, %xmm1, %xmm2
  1694. mulps %xmm0, %xmm1
  1695. addps %xmm3, %xmm6
  1696. pshufd $0x93, %xmm2, %xmm3
  1697. mulps %xmm0, %xmm2
  1698. addps %xmm2, %xmm5
  1699. subl $-32 * SIZE, AA
  1700. pshufd $0x93, %xmm3, %xmm2
  1701. mulps %xmm0, %xmm3
  1702. addps %xmm1, %xmm4
  1703. movaps -32 * SIZE(BB), %xmm1
  1704. mulps %xmm0, %xmm2
  1705. movaps -32 * SIZE(AA), %xmm0
  1706. subl $1, %eax
  1707. jne .L12
  1708. ALIGN_4
  1709. .L15: # work out how many steps the unrolled loop left over
  1710. #if defined(LT) || defined(RN)
  1711. movl KK, %eax
  1712. #else
  1713. movl K, %eax
  1714. subl KK, %eax
  1715. #endif
  1716. andl $7, %eax # eax = step count & 7 (the loop above handles eight per pass)
  1717. BRANCH
  1718. je .L18 # nothing left over
  1719. ALIGN_4
  1720. .L16: # leftover loop: one step per pass, eax passes
  1721. addps %xmm2, %xmm7 # add the product left pending by the previous step
  1722. pshufd $0x93, %xmm1, %xmm2 # 0x93 rotates the four lanes by one (lane 3 moves to lane 0)
  1723. mulps %xmm0, %xmm1
  1724. addps %xmm3, %xmm6 # second pending product from the previous step
  1725. pshufd $0x93, %xmm2, %xmm3 # rotated twice
  1726. mulps %xmm0, %xmm2
  1727. addps %xmm2, %xmm5
  1728. pshufd $0x93, %xmm3, %xmm2 # rotated three times
  1729. mulps %xmm0, %xmm3
  1730. addps %xmm1, %xmm4
  1731. movaps -28 * SIZE(BB), %xmm1 # next 16 bytes from BB
  1732. mulps %xmm0, %xmm2
  1733. movaps -28 * SIZE(AA), %xmm0 # next 16 bytes from AA
  1734. addl $4 * SIZE, AA
  1735. addl $4 * SIZE, BB
  1736. decl %eax
  1737. jg .L16 # products of the final step stay pending in xmm2 and xmm3
  1738. ALIGN_4
  1739. .L18:
  1740. #if defined(LN) || defined(RT)
  1741. movl KK, %eax
  1742. #ifdef LN
  1743. subl $4, %eax
  1744. #else
  1745. subl $4, %eax
  1746. #endif
  1747. movl AORIG, AA
  1748. leal (, %eax, SIZE), %eax
  1749. leal (AA, %eax, 4), AA
  1750. leal (B, %eax, 4), BB
  1751. #endif
  1752. addps %xmm3, %xmm6
  1753. addps %xmm2, %xmm7
  1754. #if defined(LN) || defined(LT)
  1755. movaps %xmm4, %xmm0
  1756. unpcklps %xmm7, %xmm0
  1757. unpckhps %xmm7, %xmm4
  1758. movaps %xmm6, %xmm2
  1759. unpcklps %xmm5, %xmm2
  1760. unpckhps %xmm5, %xmm6
  1761. movaps %xmm0, %xmm1
  1762. movlhps %xmm2, %xmm0
  1763. movhlps %xmm2, %xmm1
  1764. movaps %xmm6, %xmm7
  1765. movlhps %xmm4, %xmm6
  1766. movhlps %xmm4, %xmm7
  1767. pshufd $0x39, %xmm1, %xmm2
  1768. pshufd $0x39, %xmm7, %xmm4
  1769. movaps -32 * SIZE(BB), %xmm1
  1770. movaps -28 * SIZE(BB), %xmm3
  1771. movaps -24 * SIZE(BB), %xmm5
  1772. movaps -20 * SIZE(BB), %xmm7
  1773. subps %xmm0, %xmm1
  1774. subps %xmm2, %xmm3
  1775. subps %xmm6, %xmm5
  1776. subps %xmm4, %xmm7
  1777. #else
  1778. pshufd $0x39, %xmm5, %xmm2
  1779. pshufd $0x4e, %xmm6, %xmm0
  1780. pshufd $0x93, %xmm7, %xmm7
  1781. movaps %xmm4, %xmm6
  1782. unpcklps %xmm0, %xmm4
  1783. unpckhps %xmm0, %xmm6
  1784. movaps %xmm2, %xmm1
  1785. unpcklps %xmm7, %xmm2
  1786. unpckhps %xmm7, %xmm1
  1787. movaps %xmm4, %xmm5
  1788. unpcklps %xmm2, %xmm4
  1789. unpckhps %xmm2, %xmm5
  1790. movaps %xmm6, %xmm7
  1791. unpcklps %xmm1, %xmm6
  1792. unpckhps %xmm1, %xmm7
  1793. pshufd $0x93, %xmm5, %xmm5
  1794. pshufd $0x4e, %xmm6, %xmm6
  1795. pshufd $0x39, %xmm7, %xmm7
  1796. movaps -32 * SIZE(AA), %xmm0
  1797. movaps -28 * SIZE(AA), %xmm1
  1798. movaps -24 * SIZE(AA), %xmm2
  1799. movaps -20 * SIZE(AA), %xmm3
  1800. subps %xmm4, %xmm0
  1801. subps %xmm5, %xmm1
  1802. subps %xmm6, %xmm2
  1803. subps %xmm7, %xmm3
  1804. #endif
  1805. #ifdef LN
  1806. movaps -20 * SIZE(AA), %xmm4
  1807. pshufd $0xff, %xmm4, %xmm6
  1808. mulps %xmm6, %xmm7
  1809. pshufd $0xaa, %xmm4, %xmm6
  1810. mulps %xmm7, %xmm6
  1811. subps %xmm6, %xmm5
  1812. pshufd $0x55, %xmm4, %xmm6
  1813. mulps %xmm7, %xmm6
  1814. subps %xmm6, %xmm3
  1815. pshufd $0x00, %xmm4, %xmm6
  1816. mulps %xmm7, %xmm6
  1817. subps %xmm6, %xmm1
  1818. movaps -24 * SIZE(AA), %xmm4
  1819. pshufd $0xaa, %xmm4, %xmm6
  1820. mulps %xmm6, %xmm5
  1821. pshufd $0x55, %xmm4, %xmm6
  1822. mulps %xmm5, %xmm6
  1823. subps %xmm6, %xmm3
  1824. pshufd $0x00, %xmm4, %xmm6
  1825. mulps %xmm5, %xmm6
  1826. subps %xmm6, %xmm1
  1827. movaps -28 * SIZE(AA), %xmm4
  1828. pshufd $0x55, %xmm4, %xmm6
  1829. mulps %xmm6, %xmm3
  1830. pshufd $0x00, %xmm4, %xmm6
  1831. mulps %xmm3, %xmm6
  1832. subps %xmm6, %xmm1
  1833. movaps -32 * SIZE(AA), %xmm4
  1834. pshufd $0x00, %xmm4, %xmm6
  1835. mulps %xmm6, %xmm1
  1836. #endif
  1837. #ifdef LT
  1838. movaps -32 * SIZE(AA), %xmm4
  1839. pshufd $0x00, %xmm4, %xmm6
  1840. mulps %xmm6, %xmm1
  1841. pshufd $0x55, %xmm4, %xmm6
  1842. mulps %xmm1, %xmm6
  1843. subps %xmm6, %xmm3
  1844. pshufd $0xaa, %xmm4, %xmm6
  1845. mulps %xmm1, %xmm6
  1846. subps %xmm6, %xmm5
  1847. pshufd $0xff, %xmm4, %xmm6
  1848. mulps %xmm1, %xmm6
  1849. subps %xmm6, %xmm7
  1850. movaps -28 * SIZE(AA), %xmm4
  1851. pshufd $0x55, %xmm4, %xmm6
  1852. mulps %xmm6, %xmm3
  1853. pshufd $0xaa, %xmm4, %xmm6
  1854. mulps %xmm3, %xmm6
  1855. subps %xmm6, %xmm5
  1856. pshufd $0xff, %xmm4, %xmm6
  1857. mulps %xmm3, %xmm6
  1858. subps %xmm6, %xmm7
  1859. movaps -24 * SIZE(AA), %xmm4
  1860. pshufd $0xaa, %xmm4, %xmm6
  1861. mulps %xmm6, %xmm5
  1862. pshufd $0xff, %xmm4, %xmm6
  1863. mulps %xmm5, %xmm6
  1864. subps %xmm6, %xmm7
  1865. movaps -20 * SIZE(AA), %xmm4
  1866. pshufd $0xff, %xmm4, %xmm6
  1867. mulps %xmm6, %xmm7
  1868. #endif
  1869. #ifdef RN
  1870. movaps -32 * SIZE(BB), %xmm6
  1871. pshufd $0x00, %xmm6, %xmm7
  1872. mulps %xmm7, %xmm0
  1873. pshufd $0x55, %xmm6, %xmm7
  1874. mulps %xmm0, %xmm7
  1875. subps %xmm7, %xmm1
  1876. pshufd $0xaa, %xmm6, %xmm7
  1877. mulps %xmm0, %xmm7
  1878. subps %xmm7, %xmm2
  1879. pshufd $0xff, %xmm6, %xmm7
  1880. mulps %xmm0, %xmm7
  1881. subps %xmm7, %xmm3
  1882. movaps -28 * SIZE(BB), %xmm6
  1883. pshufd $0x55, %xmm6, %xmm7
  1884. mulps %xmm7, %xmm1
  1885. pshufd $0xaa, %xmm6, %xmm7
  1886. mulps %xmm1, %xmm7
  1887. subps %xmm7, %xmm2
  1888. pshufd $0xff, %xmm6, %xmm7
  1889. mulps %xmm1, %xmm7
  1890. subps %xmm7, %xmm3
  1891. movaps -24 * SIZE(BB), %xmm6
  1892. pshufd $0xaa, %xmm6, %xmm7
  1893. mulps %xmm7, %xmm2
  1894. pshufd $0xff, %xmm6, %xmm7
  1895. mulps %xmm2, %xmm7
  1896. subps %xmm7, %xmm3
  1897. movaps -20 * SIZE(BB), %xmm6
  1898. pshufd $0xff, %xmm6, %xmm7
  1899. mulps %xmm7, %xmm3
  1900. #endif
  1901. #ifdef RT
  1902. movaps -20 * SIZE(BB), %xmm6
  1903. pshufd $0xff, %xmm6, %xmm7
  1904. mulps %xmm7, %xmm3
  1905. pshufd $0xaa, %xmm6, %xmm7
  1906. mulps %xmm3, %xmm7
  1907. subps %xmm7, %xmm2
  1908. pshufd $0x55, %xmm6, %xmm7
  1909. mulps %xmm3, %xmm7
  1910. subps %xmm7, %xmm1
  1911. pshufd $0x00, %xmm6, %xmm7
  1912. mulps %xmm3, %xmm7
  1913. subps %xmm7, %xmm0
  1914. movaps -24 * SIZE(BB), %xmm6
  1915. pshufd $0xaa, %xmm6, %xmm7
  1916. mulps %xmm7, %xmm2
  1917. pshufd $0x55, %xmm6, %xmm7
  1918. mulps %xmm2, %xmm7
  1919. subps %xmm7, %xmm1
  1920. pshufd $0x00, %xmm6, %xmm7
  1921. mulps %xmm2, %xmm7
  1922. subps %xmm7, %xmm0
  1923. movaps -28 * SIZE(BB), %xmm6
  1924. pshufd $0x55, %xmm6, %xmm7
  1925. mulps %xmm7, %xmm1
  1926. pshufd $0x00, %xmm6, %xmm7
  1927. mulps %xmm1, %xmm7
  1928. subps %xmm7, %xmm0
  1929. movaps -32 * SIZE(BB), %xmm6
  1930. pshufd $0x00, %xmm6, %xmm7
  1931. mulps %xmm7, %xmm0
  1932. #endif
  1933. #if defined(LN) || defined(LT)
  1934. movaps %xmm1, -32 * SIZE(BB)
  1935. movaps %xmm3, -28 * SIZE(BB)
  1936. movaps %xmm5, -24 * SIZE(BB)
  1937. movaps %xmm7, -20 * SIZE(BB)
  1938. #else
  1939. movaps %xmm0, -32 * SIZE(AA)
  1940. movaps %xmm1, -28 * SIZE(AA)
  1941. movaps %xmm2, -24 * SIZE(AA)
  1942. movaps %xmm3, -20 * SIZE(AA)
  1943. #endif
  1944. #ifdef LN
  1945. subl $4 * SIZE, CO1
  1946. #endif
  1947. leal (LDC, LDC, 2), %eax
  1948. #if defined(LN) || defined(LT)
  1949. movaps %xmm1, %xmm0
  1950. unpcklps %xmm5, %xmm1
  1951. unpckhps %xmm5, %xmm0
  1952. movaps %xmm3, %xmm4
  1953. unpcklps %xmm7, %xmm3
  1954. unpckhps %xmm7, %xmm4
  1955. movaps %xmm1, %xmm2
  1956. unpcklps %xmm3, %xmm1
  1957. unpckhps %xmm3, %xmm2
  1958. movaps %xmm0, %xmm6
  1959. unpcklps %xmm4, %xmm0
  1960. unpckhps %xmm4, %xmm6
  1961. movlps %xmm1, 0 * SIZE(CO1)
  1962. movhps %xmm1, 2 * SIZE(CO1)
  1963. movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
  1964. movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
  1965. movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
  1966. movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
  1967. movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
  1968. movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
  1969. #else
  1970. movlps %xmm0, 0 * SIZE(CO1)
  1971. movhps %xmm0, 2 * SIZE(CO1)
  1972. movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
  1973. movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
  1974. movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
  1975. movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
  1976. movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
  1977. movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
  1978. #endif
  1979. #ifndef LN
  1980. addl $4 * SIZE, CO1
  1981. #endif
  1982. #if defined(LT) || defined(RN)
  1983. movl K, %eax
  1984. subl KK, %eax
  1985. leal (,%eax, SIZE), %eax
  1986. leal (AA, %eax, 4), AA
  1987. leal (BB, %eax, 4), BB
  1988. #endif
  1989. #ifdef LN
  1990. subl $4, KK
  1991. #endif
  1992. #ifdef LT
  1993. addl $4, KK
  1994. #endif
  1995. #ifdef RT
  1996. movl K, %eax
  1997. sall $2 + BASE_SHIFT, %eax
  1998. addl %eax, AORIG
  1999. #endif
  2000. decl %ebx # i --
  2001. jg .L11
  2002. ALIGN_4
  /* .L20: entry of the section that runs only when bit 1 of M is set
     (testl $2, M); otherwise control skips to .L30.
     It positions AA and BB, clears the accumulators xmm4-xmm7, preloads
     xmm0/xmm1, and derives the trip count of the 8x unrolled loop .L22.
     Presumably this is the 2-wide M remainder of the kernel -- confirm
     against the part of the file above this chunk. */
  2003. .L20:
  2004. testl $2, M
  2005. je .L30
  2006. #ifdef LN
  /* LN only: move AORIG back by K << (1 + BASE_SHIFT). */
  2007. movl K, %eax
  2008. sall $1 + BASE_SHIFT, %eax
  2009. subl %eax, AORIG
  2010. #endif
  2011. #if defined(LN) || defined(RT)
  /* LN/RT: AA = AORIG + KK * SIZE * 2. */
  2012. movl KK, %eax
  2013. movl AORIG, AA
  2014. leal (, %eax, SIZE), %eax
  2015. leal (AA, %eax, 2), AA
  2016. #endif
  2017. movl B, BB
  2018. #if defined(LN) || defined(RT)
  /* LN/RT: BB = B + (KK << (2 + BASE_SHIFT)). */
  2019. movl KK, %eax
  2020. sall $2 + BASE_SHIFT, %eax
  2021. addl %eax, BB
  2022. #endif
  /* Zero the four accumulators and preload the first vectors from AA and BB.
     All accesses use a -32 * SIZE displacement; presumably AA/BB were biased
     by +32 elements earlier in the file -- TODO confirm. */
  2023. pxor %xmm4, %xmm4
  2024. movaps -32 * SIZE(AA), %xmm0
  2025. pxor %xmm5, %xmm5
  2026. movaps -32 * SIZE(BB), %xmm1
  2027. pxor %xmm6, %xmm6
  2028. pxor %xmm7, %xmm7
  /* eax = number of k steps: KK for LT/RN, K - KK for the other variants. */
  2029. #if defined(LT) || defined(RN)
  2030. movl KK, %eax
  2031. #else
  2032. movl K, %eax
  2033. subl KK, %eax
  2034. #endif
  /* eax >>= 3: passes of the 8x unrolled loop; none -> go to the remainder. */
  2035. sarl $3, %eax
  2036. je .L25
  2037. ALIGN_4
  /* .L22: accumulation loop, unrolled over 8 k steps per pass.
     Each pass consumes 16 elements of AA and 32 elements of BB (see the
     pointer updates at the bottom) and decrements eax once.
     Per k step:
       pshufd $0x44 / $0xee  -> xmm2 = low / high pair of xmm0, duplicated
       pshufd $0x50          -> xmm3 = {b0, b0, b1, b1} of xmm1
       pshufd $0xfa          -> xmm3 = {b2, b2, b3, b3} of xmm1
     Steps using the low pair add into xmm4 and xmm6; steps using the high
     pair add into xmm5 and xmm7.  Loads for the following step are
     interleaved between the multiplies. */
  2038. .L22:
  /* PREFETCH and PREFETCHSIZE are macros defined outside this chunk;
     presumably a prefetch hint ahead of the AA stream. */
  2039. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  /* k step 0 (low pair of xmm0) */
  2040. pshufd $0x44, %xmm0, %xmm2
  2041. pshufd $0x50, %xmm1, %xmm3
  2042. mulps %xmm2, %xmm3
  2043. addps %xmm3, %xmm4
  2044. pshufd $0xfa, %xmm1, %xmm3
  2045. movaps -28 * SIZE(BB), %xmm1
  2046. mulps %xmm2, %xmm3
  2047. addps %xmm3, %xmm6
  /* k step 1 (high pair of xmm0) */
  2048. pshufd $0xee, %xmm0, %xmm2
  2049. movaps -28 * SIZE(AA), %xmm0
  2050. pshufd $0x50, %xmm1, %xmm3
  2051. mulps %xmm2, %xmm3
  2052. addps %xmm3, %xmm5
  2053. pshufd $0xfa, %xmm1, %xmm3
  2054. movaps -24 * SIZE(BB), %xmm1
  2055. mulps %xmm2, %xmm3
  2056. addps %xmm3, %xmm7
  /* k step 2 */
  2057. pshufd $0x44, %xmm0, %xmm2
  2058. pshufd $0x50, %xmm1, %xmm3
  2059. mulps %xmm2, %xmm3
  2060. addps %xmm3, %xmm4
  2061. pshufd $0xfa, %xmm1, %xmm3
  2062. movaps -20 * SIZE(BB), %xmm1
  2063. mulps %xmm2, %xmm3
  2064. addps %xmm3, %xmm6
  /* k step 3 */
  2065. pshufd $0xee, %xmm0, %xmm2
  2066. movaps -24 * SIZE(AA), %xmm0
  2067. pshufd $0x50, %xmm1, %xmm3
  2068. mulps %xmm2, %xmm3
  2069. addps %xmm3, %xmm5
  2070. pshufd $0xfa, %xmm1, %xmm3
  2071. movaps -16 * SIZE(BB), %xmm1
  2072. mulps %xmm2, %xmm3
  2073. addps %xmm3, %xmm7
  /* k step 4 */
  2074. pshufd $0x44, %xmm0, %xmm2
  2075. pshufd $0x50, %xmm1, %xmm3
  2076. mulps %xmm2, %xmm3
  2077. addps %xmm3, %xmm4
  2078. pshufd $0xfa, %xmm1, %xmm3
  2079. movaps -12 * SIZE(BB), %xmm1
  2080. mulps %xmm2, %xmm3
  2081. addps %xmm3, %xmm6
  /* k step 5 */
  2082. pshufd $0xee, %xmm0, %xmm2
  2083. movaps -20 * SIZE(AA), %xmm0
  2084. pshufd $0x50, %xmm1, %xmm3
  2085. mulps %xmm2, %xmm3
  2086. addps %xmm3, %xmm5
  2087. pshufd $0xfa, %xmm1, %xmm3
  2088. movaps -8 * SIZE(BB), %xmm1
  2089. mulps %xmm2, %xmm3
  2090. addps %xmm3, %xmm7
  /* k step 6 */
  2091. pshufd $0x44, %xmm0, %xmm2
  2092. pshufd $0x50, %xmm1, %xmm3
  2093. mulps %xmm2, %xmm3
  2094. addps %xmm3, %xmm4
  2095. pshufd $0xfa, %xmm1, %xmm3
  2096. movaps -4 * SIZE(BB), %xmm1
  2097. mulps %xmm2, %xmm3
  2098. addps %xmm3, %xmm6
  /* k step 7; the loads here fetch the operands for the next pass */
  2099. pshufd $0xee, %xmm0, %xmm2
  2100. movaps -16 * SIZE(AA), %xmm0
  2101. pshufd $0x50, %xmm1, %xmm3
  2102. mulps %xmm2, %xmm3
  2103. addps %xmm3, %xmm5
  2104. pshufd $0xfa, %xmm1, %xmm3
  2105. movaps 0 * SIZE(BB), %xmm1
  2106. mulps %xmm2, %xmm3
  2107. addps %xmm3, %xmm7
  /* Subtracting a negative constant advances the pointers:
     AA += 16 * SIZE, BB += 32 * SIZE. */
  2108. subl $-16 * SIZE, AA
  2109. subl $-32 * SIZE, BB
  2110. subl $1, %eax
  2111. jne .L22
  2112. ALIGN_4
  /* .L25/.L26: remainder of the accumulation.
     The step count is recomputed (KK for LT/RN, K - KK otherwise) and
     masked with 7, i.e. the k steps not covered by the 8x unrolled loop.
     .L26 then performs one k step per iteration. */
  2113. .L25:
  2114. #if defined(LT) || defined(RN)
  2115. movl KK, %eax
  2116. #else
  2117. movl K, %eax
  2118. subl KK, %eax
  2119. #endif
  2120. andl $7, %eax # k & 7: steps left over from the 8x unrolled loop
  /* BRANCH is a macro defined outside this chunk. */
  2121. BRANCH
  2122. je .L28
  2123. ALIGN_4
  /* One k step: uses the low pair of xmm0 and the four values in xmm1,
     accumulating into xmm4 and xmm6 only; then AA advances by 2 elements
     and BB by 4 elements.  The next operands are loaded before the
     pointers move, hence the -30 / -28 displacements. */
  2124. .L26:
  2125. pshufd $0x44, %xmm0, %xmm2
  2126. movsd -30 * SIZE(AA), %xmm0
  2127. pshufd $0x50, %xmm1, %xmm3
  2128. mulps %xmm2, %xmm3
  2129. addps %xmm3, %xmm4
  2130. pshufd $0xfa, %xmm1, %xmm3
  2131. movaps -28 * SIZE(BB), %xmm1
  2132. mulps %xmm2, %xmm3
  2133. addps %xmm3, %xmm6
  2134. addl $2 * SIZE, AA
  2135. addl $4 * SIZE, BB
  2136. decl %eax
  2137. jg .L26
  2138. ALIGN_4
  /* .L28: finish the section started at .L20.
     Steps: (1) for LN/RT re-derive AA and BB, (2) fold the partial sums,
     (3) subtract them from the values loaded from BB (LN/LT) or AA (RN/RT),
     (4) run the variant-specific triangular solve, (5) write the result
     back to the packed buffer and to the four destinations CO1,
     CO1 + LDC, CO1 + 2*LDC and CO1 + 3*LDC, (6) advance pointers and KK. */
  2139. .L28:
  2140. #if defined(LN) || defined(RT)
  /* eax = KK - 2 (LN) or KK - 4 (RT);
     AA = AORIG + eax * SIZE * 2, BB = B + eax * SIZE * 4. */
  2141. movl KK, %eax
  2142. #ifdef LN
  2143. subl $2, %eax
  2144. #else
  2145. subl $4, %eax
  2146. #endif
  2147. movl AORIG, AA
  2148. leal (, %eax, SIZE), %eax
  2149. leal (AA, %eax, 2), AA
  2150. leal (B, %eax, 4), BB
  2151. #endif
  /* Fold the two accumulator pairs (xmm5 into xmm4, xmm7 into xmm6), then
     copy the high halves of xmm4/xmm6 into the low halves of xmm5/xmm7.
     After this the low pairs of xmm4, xmm5, xmm6, xmm7 hold the sums for
     the b0, b1, b2, b3 lanes respectively. */
  2152. addps %xmm5, %xmm4
  2153. addps %xmm7, %xmm6
  2154. movhlps %xmm4, %xmm5
  2155. movhlps %xmm6, %xmm7
  2156. #if defined(LN) || defined(LT)
  /* Interleave so that xmm4 holds the four sums belonging to the first AA
     element and xmm6 the four sums belonging to the second AA element,
     then subtract them from the two vectors loaded from BB. */
  2157. unpcklps %xmm6, %xmm4
  2158. unpcklps %xmm7, %xmm5
  2159. movaps %xmm4, %xmm6
  2160. unpcklps %xmm5, %xmm4
  2161. unpckhps %xmm5, %xmm6
  2162. movaps -32 * SIZE(BB), %xmm1
  2163. movaps -28 * SIZE(BB), %xmm3
  2164. subps %xmm4, %xmm1
  2165. subps %xmm6, %xmm3
  2166. #else
  /* RN/RT: load four 2-float values from AA (movsd zeroes the upper half)
     and subtract the matching accumulator from each. */
  2167. movsd -32 * SIZE(AA), %xmm0
  2168. movsd -30 * SIZE(AA), %xmm1
  2169. movsd -28 * SIZE(AA), %xmm2
  2170. movsd -26 * SIZE(AA), %xmm3
  2171. subps %xmm4, %xmm0
  2172. subps %xmm5, %xmm1
  2173. subps %xmm6, %xmm2
  2174. subps %xmm7, %xmm3
  2175. #endif
  /* The solves below multiply by the "diagonal" lanes instead of dividing;
     presumably the packed triangle stores reciprocal diagonals -- confirm
     against the packing routine. */
  2176. #ifdef LN
  /* LN: 2x2 solve, last unknown first, using the 4 floats at -32*SIZE(AA):
     xmm3 *= a[3]; xmm1 -= xmm3 * a[2]; xmm1 *= a[0]. */
  2177. movaps -32 * SIZE(AA), %xmm4
  2178. pshufd $0xff, %xmm4, %xmm6
  2179. mulps %xmm6, %xmm3
  2180. pshufd $0xaa, %xmm4, %xmm6
  2181. mulps %xmm3, %xmm6
  2182. subps %xmm6, %xmm1
  2183. pshufd $0x00, %xmm4, %xmm6
  2184. mulps %xmm6, %xmm1
  2185. #endif
  2186. #ifdef LT
  /* LT: 2x2 solve, first unknown first:
     xmm1 *= a[0]; xmm3 -= xmm1 * a[1]; xmm3 *= a[3]. */
  2187. movaps -32 * SIZE(AA), %xmm4
  2188. pshufd $0x00, %xmm4, %xmm6
  2189. mulps %xmm6, %xmm1
  2190. pshufd $0x55, %xmm4, %xmm6
  2191. mulps %xmm1, %xmm6
  2192. subps %xmm6, %xmm3
  2193. pshufd $0xff, %xmm4, %xmm6
  2194. mulps %xmm6, %xmm3
  2195. #endif
  2196. #ifdef RN
  /* RN: 4x4 solve in order xmm0, xmm1, xmm2, xmm3.  Each 4-float group of
     BB (-32, -28, -24, -20) supplies the scale factor for one register and
     the coefficients used to eliminate it from the registers after it. */
  2197. movaps -32 * SIZE(BB), %xmm6
  2198. pshufd $0x00, %xmm6, %xmm7
  2199. mulps %xmm7, %xmm0
  2200. pshufd $0x55, %xmm6, %xmm7
  2201. mulps %xmm0, %xmm7
  2202. subps %xmm7, %xmm1
  2203. pshufd $0xaa, %xmm6, %xmm7
  2204. mulps %xmm0, %xmm7
  2205. subps %xmm7, %xmm2
  2206. pshufd $0xff, %xmm6, %xmm7
  2207. mulps %xmm0, %xmm7
  2208. subps %xmm7, %xmm3
  2209. movaps -28 * SIZE(BB), %xmm6
  2210. pshufd $0x55, %xmm6, %xmm7
  2211. mulps %xmm7, %xmm1
  2212. pshufd $0xaa, %xmm6, %xmm7
  2213. mulps %xmm1, %xmm7
  2214. subps %xmm7, %xmm2
  2215. pshufd $0xff, %xmm6, %xmm7
  2216. mulps %xmm1, %xmm7
  2217. subps %xmm7, %xmm3
  2218. movaps -24 * SIZE(BB), %xmm6
  2219. pshufd $0xaa, %xmm6, %xmm7
  2220. mulps %xmm7, %xmm2
  2221. pshufd $0xff, %xmm6, %xmm7
  2222. mulps %xmm2, %xmm7
  2223. subps %xmm7, %xmm3
  2224. movaps -20 * SIZE(BB), %xmm6
  2225. pshufd $0xff, %xmm6, %xmm7
  2226. mulps %xmm7, %xmm3
  2227. #endif
  2228. #ifdef RT
  /* RT: the same 4x4 solve in the opposite order (xmm3 first, xmm0 last),
     reading the BB groups from -20 down to -32. */
  2229. movaps -20 * SIZE(BB), %xmm6
  2230. pshufd $0xff, %xmm6, %xmm7
  2231. mulps %xmm7, %xmm3
  2232. pshufd $0xaa, %xmm6, %xmm7
  2233. mulps %xmm3, %xmm7
  2234. subps %xmm7, %xmm2
  2235. pshufd $0x55, %xmm6, %xmm7
  2236. mulps %xmm3, %xmm7
  2237. subps %xmm7, %xmm1
  2238. pshufd $0x00, %xmm6, %xmm7
  2239. mulps %xmm3, %xmm7
  2240. subps %xmm7, %xmm0
  2241. movaps -24 * SIZE(BB), %xmm6
  2242. pshufd $0xaa, %xmm6, %xmm7
  2243. mulps %xmm7, %xmm2
  2244. pshufd $0x55, %xmm6, %xmm7
  2245. mulps %xmm2, %xmm7
  2246. subps %xmm7, %xmm1
  2247. pshufd $0x00, %xmm6, %xmm7
  2248. mulps %xmm2, %xmm7
  2249. subps %xmm7, %xmm0
  2250. movaps -28 * SIZE(BB), %xmm6
  2251. pshufd $0x55, %xmm6, %xmm7
  2252. mulps %xmm7, %xmm1
  2253. pshufd $0x00, %xmm6, %xmm7
  2254. mulps %xmm1, %xmm7
  2255. subps %xmm7, %xmm0
  2256. movaps -32 * SIZE(BB), %xmm6
  2257. pshufd $0x00, %xmm6, %xmm7
  2258. mulps %xmm7, %xmm0
  2259. #endif
  /* Write the solved values back to the packed buffer they were loaded
     from: BB for LN/LT, AA for RN/RT. */
  2260. #if defined(LN) || defined(LT)
  2261. movaps %xmm1, -32 * SIZE(BB)
  2262. movaps %xmm3, -28 * SIZE(BB)
  2263. #else
  2264. movlps %xmm0, -32 * SIZE(AA)
  2265. movlps %xmm1, -30 * SIZE(AA)
  2266. movlps %xmm2, -28 * SIZE(AA)
  2267. movlps %xmm3, -26 * SIZE(AA)
  2268. #endif
  2269. #ifdef LN
  /* LN: step CO1 back by 2 elements before storing. */
  2270. subl $2 * SIZE, CO1
  2271. #endif
  /* eax = 3 * LDC, the offset of the fourth destination. */
  2272. leal (LDC, LDC, 2), %eax
  2273. #if defined(LN) || defined(LT)
  /* Transpose xmm1/xmm3 so each destination receives one value from xmm1
     and one from xmm3 in its low pair.  xmm5 and xmm7 serve only as
     interleave partners: their contents end up in the upper halves,
     which the movlps stores below do not write. */
  2274. movaps %xmm1, %xmm0
  2275. unpcklps %xmm5, %xmm1
  2276. unpckhps %xmm5, %xmm0
  2277. movaps %xmm3, %xmm4
  2278. unpcklps %xmm7, %xmm3
  2279. unpckhps %xmm7, %xmm4
  2280. movaps %xmm1, %xmm2
  2281. unpcklps %xmm3, %xmm1
  2282. unpckhps %xmm3, %xmm2
  2283. movaps %xmm0, %xmm6
  2284. unpcklps %xmm4, %xmm0
  2285. unpckhps %xmm4, %xmm6
  2286. movlps %xmm1, 0 * SIZE(CO1)
  2287. movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
  2288. movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
  2289. movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
  2290. #else
  /* RN/RT: xmm0..xmm3 already hold one destination each in their low pair. */
  2291. movlps %xmm0, 0 * SIZE(CO1)
  2292. movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
  2293. movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
  2294. movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
  2295. #endif
  2296. #ifndef LN
  /* Every variant except LN advances CO1 by 2 elements after the store. */
  2297. addl $2 * SIZE, CO1
  2298. #endif
  2299. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps:
     AA += (K - KK) * SIZE * 2, BB += (K - KK) * SIZE * 4. */
  2300. movl K, %eax
  2301. subl KK, %eax
  2302. leal (,%eax, SIZE), %eax
  2303. leal (AA, %eax, 2), AA
  2304. leal (BB, %eax, 4), BB
  2305. #endif
  2306. #ifdef LN
  2307. subl $2, KK
  2308. #endif
  2309. #ifdef LT
  2310. addl $2, KK
  2311. #endif
  2312. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2313. movl K, %eax
  2314. sall $1 + BASE_SHIFT, %eax
  2315. addl %eax, AORIG
  2316. #endif
  2317. ALIGN_4
  /* .L30: entry of the section that runs only when bit 0 of M is set
     (testl $1, M); otherwise control skips to .L39.
     It positions AA and BB, clears xmm4-xmm7, preloads xmm0/xmm1, and
     derives the trip count of the 8x unrolled loop .L32.
     Presumably this is the 1-wide M remainder of the kernel -- confirm
     against the part of the file above this chunk. */
  2318. .L30:
  2319. testl $1, M
  2320. je .L39
  2321. #ifdef LN
  /* LN only: move AORIG back by K << BASE_SHIFT. */
  2322. movl K, %eax
  2323. sall $BASE_SHIFT, %eax
  2324. subl %eax, AORIG
  2325. #endif
  2326. #if defined(LN) || defined(RT)
  /* LN/RT: AA = AORIG + KK * SIZE. */
  2327. movl KK, %eax
  2328. movl AORIG, AA
  2329. leal (AA, %eax, SIZE), AA
  2330. #endif
  2331. movl B, BB
  2332. #if defined(LN) || defined(RT)
  /* LN/RT: BB = B + (KK << (2 + BASE_SHIFT)). */
  2333. movl KK, %eax
  2334. sall $2 + BASE_SHIFT, %eax
  2335. addl %eax, BB
  2336. #endif
  /* Clear the accumulators and preload two floats from AA and four from BB.
     Only xmm4 is accumulated into by the loops of this section. */
  2337. pxor %xmm4, %xmm4
  2338. movsd -32 * SIZE(AA), %xmm0
  2339. pxor %xmm5, %xmm5
  2340. movaps -32 * SIZE(BB), %xmm1
  2341. pxor %xmm6, %xmm6
  2342. pxor %xmm7, %xmm7
  /* eax = number of k steps: KK for LT/RN, K - KK for the other variants. */
  2343. #if defined(LT) || defined(RN)
  2344. movl KK, %eax
  2345. #else
  2346. movl K, %eax
  2347. subl KK, %eax
  2348. #endif
  /* eax >>= 3: passes of the 8x unrolled loop; none -> go to the remainder. */
  2349. sarl $3, %eax
  2350. je .L35
  2351. ALIGN_4
  /* .L32: accumulation loop, unrolled over 8 k steps per pass.
     Each movsd brings in two AA floats; pshufd $0x00 broadcasts the first
     and pshufd $0x55 the second.  Each broadcast is multiplied into the
     current 4-float BB vector in xmm1 (which is overwritten by the product
     and then reloaded) and the product is added to xmm4.
     Per pass AA advances by 8 elements and BB by 32 elements. */
  2352. .L32:
  /* PREFETCH and PREFETCHSIZE are macros defined outside this chunk;
     presumably a prefetch hint ahead of the AA stream. */
  2353. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  /* k steps 0 and 1 */
  2354. pshufd $0x00, %xmm0, %xmm2
  2355. mulps %xmm2, %xmm1
  2356. addps %xmm1, %xmm4
  2357. movaps -28 * SIZE(BB), %xmm1
  2358. pshufd $0x55, %xmm0, %xmm2
  2359. movsd -30 * SIZE(AA), %xmm0
  2360. mulps %xmm2, %xmm1
  2361. addps %xmm1, %xmm4
  2362. movaps -24 * SIZE(BB), %xmm1
  /* k steps 2 and 3 */
  2363. pshufd $0x00, %xmm0, %xmm2
  2364. mulps %xmm2, %xmm1
  2365. addps %xmm1, %xmm4
  2366. movaps -20 * SIZE(BB), %xmm1
  2367. pshufd $0x55, %xmm0, %xmm2
  2368. movsd -28 * SIZE(AA), %xmm0
  2369. mulps %xmm2, %xmm1
  2370. addps %xmm1, %xmm4
  2371. movaps -16 * SIZE(BB), %xmm1
  /* k steps 4 and 5 */
  2372. pshufd $0x00, %xmm0, %xmm2
  2373. mulps %xmm2, %xmm1
  2374. addps %xmm1, %xmm4
  2375. movaps -12 * SIZE(BB), %xmm1
  2376. pshufd $0x55, %xmm0, %xmm2
  2377. movsd -26 * SIZE(AA), %xmm0
  2378. mulps %xmm2, %xmm1
  2379. addps %xmm1, %xmm4
  2380. movaps -8 * SIZE(BB), %xmm1
  /* k steps 6 and 7; the loads here fetch the operands for the next pass */
  2381. pshufd $0x00, %xmm0, %xmm2
  2382. mulps %xmm2, %xmm1
  2383. addps %xmm1, %xmm4
  2384. movaps -4 * SIZE(BB), %xmm1
  2385. pshufd $0x55, %xmm0, %xmm2
  2386. movsd -24 * SIZE(AA), %xmm0
  2387. mulps %xmm2, %xmm1
  2388. addps %xmm1, %xmm4
  2389. movaps 0 * SIZE(BB), %xmm1
  /* Subtracting a negative constant advances the pointers:
     AA += 8 * SIZE, BB += 32 * SIZE. */
  2390. subl $ -8 * SIZE, AA
  2391. subl $-32 * SIZE, BB
  2392. subl $1, %eax
  2393. jne .L32
  2394. ALIGN_4
  /* .L35/.L36: remainder of the accumulation.
     The step count is recomputed (KK for LT/RN, K - KK otherwise) and
     masked with 7, i.e. the k steps not covered by the 8x unrolled loop.
     .L36 then performs one k step per iteration. */
  2395. .L35:
  2396. #if defined(LT) || defined(RN)
  2397. movl KK, %eax
  2398. #else
  2399. movl K, %eax
  2400. subl KK, %eax
  2401. #endif
  2402. andl $7, %eax # k & 7: steps left over from the 8x unrolled loop
  /* BRANCH is a macro defined outside this chunk. */
  2403. BRANCH
  2404. je .L38
  2405. ALIGN_4
  /* One k step: broadcast the low float of xmm0, multiply it into the
     4-float BB vector in xmm1 and add the product to xmm4; then load the
     next AA float and BB vector and advance AA by 1 element, BB by 4. */
  2406. .L36:
  2407. pshufd $0x00, %xmm0, %xmm2
  2408. movss -31 * SIZE(AA), %xmm0
  2409. mulps %xmm2, %xmm1
  2410. addps %xmm1, %xmm4
  2411. movaps -28 * SIZE(BB), %xmm1
  2412. addl $1 * SIZE, AA
  2413. addl $4 * SIZE, BB
  2414. decl %eax
  2415. jg .L36
  2416. ALIGN_4
  /* .L38: finish the section started at .L30.
     Steps: (1) for LN/RT re-derive AA and BB, (2) subtract the accumulated
     sums in xmm4 from the values loaded from BB (LN/LT) or AA (RN/RT),
     (3) run the variant-specific solve, (4) write the result back to the
     packed buffer and to the four destinations CO1, CO1 + LDC,
     CO1 + 2*LDC and CO1 + 3*LDC, (5) advance pointers and KK. */
  2417. .L38:
  2418. #if defined(LN) || defined(RT)
  /* eax = KK - 1 (LN) or KK - 4 (RT);
     AA = AORIG + eax * SIZE, BB = B + eax * SIZE * 4. */
  2419. movl KK, %eax
  2420. #ifdef LN
  2421. subl $1, %eax
  2422. #else
  2423. subl $4, %eax
  2424. #endif
  2425. movl AORIG, AA
  2426. leal (, %eax, SIZE), %eax
  2427. leal (AA, %eax, 1), AA
  2428. leal (B, %eax, 4), BB
  2429. #endif
  2430. #if defined(LN) || defined(LT)
  /* LN/LT: xmm1 = 4 floats from BB minus the accumulated sums. */
  2431. movaps -32 * SIZE(BB), %xmm1
  2432. subps %xmm4, %xmm1
  2433. #else
  /* RN/RT: gather 4 floats from AA with two 64-bit loads, subtract the
     sums, then broadcast lane 0..3 into xmm0..xmm3 so the scalar solve
     below can work on lane 0 of each register.  xmm0 is shuffled last
     because it is the source of the other three. */
  2434. movsd -32 * SIZE(AA), %xmm0
  2435. movhps -30 * SIZE(AA), %xmm0
  2436. subps %xmm4, %xmm0
  2437. pshufd $0xff, %xmm0, %xmm3
  2438. pshufd $0xaa, %xmm0, %xmm2
  2439. pshufd $0x55, %xmm0, %xmm1
  2440. pshufd $0x00, %xmm0, %xmm0
  2441. #endif
  /* The solves below multiply by the "diagonal" values instead of dividing;
     presumably the packed triangle stores reciprocal diagonals -- confirm
     against the packing routine. */
  2442. #if defined(LN) || defined(LT)
  /* LN/LT: scale all four lanes of xmm1 by the single float at
     -32 * SIZE(AA). */
  2443. movss -32 * SIZE(AA), %xmm4
  2444. pshufd $0x00, %xmm4, %xmm6
  2445. mulps %xmm6, %xmm1
  2446. #endif
  2447. #ifdef RN
  /* RN: 4x4 solve in order xmm0, xmm1, xmm2, xmm3 using scalar (lane 0)
     multiplies and subtracts.  Each 4-float group of BB (-32, -28, -24,
     -20) supplies the scale factor for one register and the coefficients
     used to eliminate it from the registers after it. */
  2448. movaps -32 * SIZE(BB), %xmm6
  2449. pshufd $0x00, %xmm6, %xmm7
  2450. mulss %xmm7, %xmm0
  2451. pshufd $0x55, %xmm6, %xmm7
  2452. mulss %xmm0, %xmm7
  2453. subss %xmm7, %xmm1
  2454. pshufd $0xaa, %xmm6, %xmm7
  2455. mulss %xmm0, %xmm7
  2456. subss %xmm7, %xmm2
  2457. pshufd $0xff, %xmm6, %xmm7
  2458. mulss %xmm0, %xmm7
  2459. subss %xmm7, %xmm3
  2460. movaps -28 * SIZE(BB), %xmm6
  2461. pshufd $0x55, %xmm6, %xmm7
  2462. mulss %xmm7, %xmm1
  2463. pshufd $0xaa, %xmm6, %xmm7
  2464. mulss %xmm1, %xmm7
  2465. subss %xmm7, %xmm2
  2466. pshufd $0xff, %xmm6, %xmm7
  2467. mulss %xmm1, %xmm7
  2468. subss %xmm7, %xmm3
  2469. movaps -24 * SIZE(BB), %xmm6
  2470. pshufd $0xaa, %xmm6, %xmm7
  2471. mulss %xmm7, %xmm2
  2472. pshufd $0xff, %xmm6, %xmm7
  2473. mulss %xmm2, %xmm7
  2474. subss %xmm7, %xmm3
  2475. movaps -20 * SIZE(BB), %xmm6
  2476. pshufd $0xff, %xmm6, %xmm7
  2477. mulss %xmm7, %xmm3
  2478. #endif
  2479. #ifdef RT
  /* RT: the same 4x4 scalar solve in the opposite order (xmm3 first,
     xmm0 last), reading the BB groups from -20 down to -32. */
  2480. movaps -20 * SIZE(BB), %xmm6
  2481. pshufd $0xff, %xmm6, %xmm7
  2482. mulss %xmm7, %xmm3
  2483. pshufd $0xaa, %xmm6, %xmm7
  2484. mulss %xmm3, %xmm7
  2485. subss %xmm7, %xmm2
  2486. pshufd $0x55, %xmm6, %xmm7
  2487. mulss %xmm3, %xmm7
  2488. subss %xmm7, %xmm1
  2489. pshufd $0x00, %xmm6, %xmm7
  2490. mulss %xmm3, %xmm7
  2491. subss %xmm7, %xmm0
  2492. movaps -24 * SIZE(BB), %xmm6
  2493. pshufd $0xaa, %xmm6, %xmm7
  2494. mulss %xmm7, %xmm2
  2495. pshufd $0x55, %xmm6, %xmm7
  2496. mulss %xmm2, %xmm7
  2497. subss %xmm7, %xmm1
  2498. pshufd $0x00, %xmm6, %xmm7
  2499. mulss %xmm2, %xmm7
  2500. subss %xmm7, %xmm0
  2501. movaps -28 * SIZE(BB), %xmm6
  2502. pshufd $0x55, %xmm6, %xmm7
  2503. mulss %xmm7, %xmm1
  2504. pshufd $0x00, %xmm6, %xmm7
  2505. mulss %xmm1, %xmm7
  2506. subss %xmm7, %xmm0
  2507. movaps -32 * SIZE(BB), %xmm6
  2508. pshufd $0x00, %xmm6, %xmm7
  2509. mulss %xmm7, %xmm0
  2510. #endif
  /* Write the solved values back to the packed buffer they were loaded
     from: BB for LN/LT, AA (four consecutive floats) for RN/RT. */
  2511. #if defined(LN) || defined(LT)
  2512. movaps %xmm1, -32 * SIZE(BB)
  2513. #else
  2514. movss %xmm0, -32 * SIZE(AA)
  2515. movss %xmm1, -31 * SIZE(AA)
  2516. movss %xmm2, -30 * SIZE(AA)
  2517. movss %xmm3, -29 * SIZE(AA)
  2518. #endif
  2519. #ifdef LN
  /* LN: step CO1 back by 1 element before storing. */
  2520. subl $1 * SIZE, CO1
  2521. #endif
  /* eax = 3 * LDC, the offset of the fourth destination. */
  2522. leal (LDC, LDC, 2), %eax
  2523. #if defined(LN) || defined(LT)
  /* Spread the four lanes of xmm1 into lane 0 of xmm1, xmm2, xmm0 and
     xmm6.  xmm3, xmm5 and xmm7 serve only as interleave partners: their
     contents never reach lane 0, which is the only lane the movss stores
     below write. */
  2524. movaps %xmm1, %xmm0
  2525. unpcklps %xmm5, %xmm1
  2526. unpckhps %xmm5, %xmm0
  2527. movaps %xmm3, %xmm4
  2528. unpcklps %xmm7, %xmm3
  2529. unpckhps %xmm7, %xmm4
  2530. movaps %xmm1, %xmm2
  2531. unpcklps %xmm3, %xmm1
  2532. unpckhps %xmm3, %xmm2
  2533. movaps %xmm0, %xmm6
  2534. unpcklps %xmm4, %xmm0
  2535. unpckhps %xmm4, %xmm6
  2536. movss %xmm1, 0 * SIZE(CO1)
  2537. movss %xmm2, 0 * SIZE(CO1, LDC, 1)
  2538. movss %xmm0, 0 * SIZE(CO1, LDC, 2)
  2539. movss %xmm6, 0 * SIZE(CO1, %eax, 1)
  2540. #else
  /* RN/RT: lane 0 of xmm0..xmm3 holds one destination value each. */
  2541. movss %xmm0, 0 * SIZE(CO1)
  2542. movss %xmm1, 0 * SIZE(CO1, LDC, 1)
  2543. movss %xmm2, 0 * SIZE(CO1, LDC, 2)
  2544. movss %xmm3, 0 * SIZE(CO1, %eax, 1)
  2545. #endif
  2546. #ifndef LN
  /* Every variant except LN advances CO1 by 1 element after the store. */
  2547. addl $1 * SIZE, CO1
  2548. #endif
  2549. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps:
     AA += (K - KK) * SIZE, BB += (K - KK) * SIZE * 4. */
  2550. movl K, %eax
  2551. subl KK, %eax
  2552. leal (,%eax, SIZE), %eax
  2553. leal (AA, %eax, 1), AA
  2554. leal (BB, %eax, 4), BB
  2555. #endif
  2556. #ifdef LN
  2557. subl $1, KK
  2558. #endif
  2559. #ifdef LT
  2560. addl $1, KK
  2561. #endif
  2562. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  2563. movl K, %eax
  2564. sall $BASE_SHIFT, %eax
  2565. addl %eax, AORIG
  2566. #endif
  2567. ALIGN_4
  /* .L39: end of one iteration of the J loop.
     Updates B and KK for the active variant, then decrements J and
     branches back to .L10 (defined above this chunk) while J > 0. */
  2568. .L39:
  2569. #ifdef LN
  /* LN: B += K * SIZE * 4. */
  2570. movl K, %eax
  2571. leal (, %eax, SIZE), %eax
  2572. leal (B, %eax, 4), B
  2573. #endif
  2574. #if defined(LT) || defined(RN)
  /* LT/RN: BB has already been advanced past the data just used, so it
     becomes the new B. */
  2575. movl BB, B
  2576. #endif
  2577. #ifdef RN
  2578. addl $4, KK
  2579. #endif
  2580. #ifdef RT
  2581. subl $4, KK
  2582. #endif
  2583. decl J # j --
  2584. jg .L10
  2585. ALIGN_4
  /* .L999: function exit.
     Pops ebx, esi, edi and ebp (presumably the reverse of the pushes in
     the prologue above this chunk -- confirm), releases ARGS bytes of
     stack, and returns.  EPILOGUE is a macro defined outside this chunk. */
  2586. .L999:
  2587. popl %ebx
  2588. popl %esi
  2589. popl %edi
  2590. popl %ebp
  2591. addl $ARGS, %esp
  2592. ret
  2593. EPILOGUE