
trsm_kernel_LT_2x4_penryn.S 36 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 24 + STACK + ARGS(%esp)
#define ARG_B 28 + STACK + ARGS(%esp)
#define C 32 + STACK + ARGS(%esp)
#define ARG_LDC 36 + STACK + ARGS(%esp)
#define OFFSET 40 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

#ifdef NANO
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 2)
#endif

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define CO1 %esi
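
/* Double-precision TRSM kernel with a 2x4 register tile for Penryn and   */
/* later Intel SSE cores (see the PREFETCH variants above). N is walked   */
/* in panels of 4, then 2, then 1 columns; M in blocks of 2 rows plus a   */
/* 1-row remainder. The LN/LT/RN/RT macros pick the triangular side and   */
/* transpose variant, KK tracks the diagonal offset, and SIZE/BASE_SHIFT  */
/* come from common.h. The packed diagonal appears to be stored           */
/* pre-inverted, so the back-substitution below multiplies rather than    */
/* divides.                                                               */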

        PROLOGUE

        subl $ARGS, %esp
        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl ARG_B, B
        movl ARG_LDC, LDC
        movl OFFSET, %eax
#ifdef RN
        negl %eax
#endif
        movl %eax, KK

        leal (, LDC, SIZE), LDC

        subl $-16 * SIZE, A
        subl $-16 * SIZE, B

#ifdef LN
        movl M, %eax
        leal (, %eax, SIZE), %eax
        addl %eax, C
        imull K, %eax
        addl %eax, A
#endif

#ifdef RT
        movl N, %eax
        leal (, %eax, SIZE), %eax
        imull K, %eax
        addl %eax, B
        movl N, %eax
        imull LDC, %eax
        addl %eax, C
#endif

#ifdef RT
        movl N, %eax
        subl OFFSET, %eax
        movl %eax, KK
#endif

        movl N, %eax
        sarl $2, %eax
        movl %eax, J
        jle .L30
        ALIGN_4
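
/* .L10: outer j-loop; one iteration handles a 4-column panel of B and C. */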
.L10:
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

#ifdef RT
        movl K, %eax
        sall $2 + BASE_SHIFT, %eax
        subl %eax, B
#endif

        leal (, LDC, 4), %eax
#ifdef RT
        subl %eax, C
#endif
        movl C, CO1
#ifndef RT
        addl %eax, C
#endif

#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl M, %ebx
        sarl $1, %ebx # i = (m >> 1)
        jle .L20
        ALIGN_4

.L11:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
#endif

        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        leal (CO1, LDC, 2), %eax

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm2, %xmm2
        movaps -16 * SIZE(BB), %xmm1
        pxor %xmm3, %xmm3

#ifdef LN
        pxor %xmm4, %xmm4
        prefetcht0 -2 * SIZE(CO1)
        pxor %xmm5, %xmm5
        prefetcht0 -2 * SIZE(CO1, LDC)
        pxor %xmm6, %xmm6
        prefetcht0 -2 * SIZE(%eax)
        pxor %xmm7, %xmm7
        prefetcht0 -2 * SIZE(%eax, LDC)
#else
        pxor %xmm4, %xmm4
        prefetcht0 1 * SIZE(CO1)
        pxor %xmm5, %xmm5
        prefetcht0 1 * SIZE(CO1, LDC)
        pxor %xmm6, %xmm6
        prefetcht0 1 * SIZE(%eax)
        pxor %xmm7, %xmm7
        prefetcht0 1 * SIZE(%eax, LDC)
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L15
        ALIGN_4
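
/* .L12: main k-loop, unrolled 8x. Each step multiplies a 2-element       */
/* column of A (xmm0) by a 4-wide row of B; pshufd $0x4e supplies the     */
/* swapped B pair so xmm4..xmm7 accumulate all eight 2x4 products.        */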
.L12:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        addpd %xmm3, %xmm7
        movaps -14 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps -10 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -8 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps -6 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -4 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps -2 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 0 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
        addpd %xmm3, %xmm7
        movaps 2 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 4 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -6 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps 6 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 8 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps 10 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -2 * SIZE(AA), %xmm0
        addpd %xmm3, %xmm7
        movaps 14 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps 16 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        subl $-32 * SIZE, BB
        mulpd %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0
        subl $-16 * SIZE, AA
        subl $1, %eax
        jne .L12
        ALIGN_4

.L15:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # if (k & 7)
        BRANCH
        je .L18
        ALIGN_3

.L16:
        addpd %xmm3, %xmm7
        movaps -14 * SIZE(BB), %xmm3
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        addpd %xmm1, %xmm5
        movaps -12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm3, %xmm2
        mulpd %xmm0, %xmm3
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L16
        ALIGN_4
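
/* .L18: end of the multiply phase. Rewind AA/BB to the diagonal block    */
/* (LN/RT), fold the accumulators into column order, then back-substitute */
/* through the packed triangle; movddup broadcasts one factor entry at a  */
/* time, with diagonal entries acting as reciprocals.                     */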
.L18:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $4, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 4), BB
#endif
        addpd %xmm2, %xmm6
        addpd %xmm3, %xmm7
        movaps %xmm4, %xmm0
        movsd %xmm5, %xmm4
        movsd %xmm0, %xmm5
        movaps %xmm6, %xmm0
        movsd %xmm7, %xmm6
        movsd %xmm0, %xmm7
#if defined(LN) || defined(LT)
        movapd %xmm4, %xmm0
        unpcklpd %xmm5, %xmm4
        unpckhpd %xmm5, %xmm0
        movapd %xmm6, %xmm1
        unpcklpd %xmm7, %xmm6
        unpckhpd %xmm7, %xmm1
        movapd -16 * SIZE(BB), %xmm2
        movapd -14 * SIZE(BB), %xmm5
        movapd -12 * SIZE(BB), %xmm3
        movapd -10 * SIZE(BB), %xmm7
        subpd %xmm4, %xmm2
        subpd %xmm6, %xmm5
        subpd %xmm0, %xmm3
        subpd %xmm1, %xmm7
#else
        movapd -16 * SIZE(AA), %xmm0
        movapd -14 * SIZE(AA), %xmm1
        movapd -12 * SIZE(AA), %xmm2
        movapd -10 * SIZE(AA), %xmm3
        subpd %xmm4, %xmm0
        subpd %xmm5, %xmm1
        subpd %xmm6, %xmm2
        subpd %xmm7, %xmm3
#endif
#ifdef LN
        movddup -13 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm3
        mulpd %xmm4, %xmm7
        movddup -14 * SIZE(AA), %xmm4
        movapd %xmm4, %xmm6
        mulpd %xmm3, %xmm4
        subpd %xmm4, %xmm2
        mulpd %xmm7, %xmm6
        subpd %xmm6, %xmm5
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm2
        mulpd %xmm4, %xmm5
#endif
#ifdef LT
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm2
        mulpd %xmm4, %xmm5
        movddup -15 * SIZE(AA), %xmm4
        movapd %xmm4, %xmm6
        mulpd %xmm2, %xmm4
        subpd %xmm4, %xmm3
        mulpd %xmm5, %xmm6
        subpd %xmm6, %xmm7
        movddup -13 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm3
        mulpd %xmm4, %xmm7
#endif
#ifdef RN
        movddup -16 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm0
        movddup -15 * SIZE(BB), %xmm4
        mulpd %xmm0, %xmm4
        subpd %xmm4, %xmm1
        movddup -14 * SIZE(BB), %xmm4
        mulpd %xmm0, %xmm4
        subpd %xmm4, %xmm2
        movddup -13 * SIZE(BB), %xmm4
        mulpd %xmm0, %xmm4
        subpd %xmm4, %xmm3
        movddup -11 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm1
        movddup -10 * SIZE(BB), %xmm4
        mulpd %xmm1, %xmm4
        subpd %xmm4, %xmm2
        movddup -9 * SIZE(BB), %xmm4
        mulpd %xmm1, %xmm4
        subpd %xmm4, %xmm3
        movddup -6 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm2
        movddup -5 * SIZE(BB), %xmm4
        mulpd %xmm2, %xmm4
        subpd %xmm4, %xmm3
        movddup -1 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm3
#endif
#ifdef RT
        movddup -1 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm3
        movddup -2 * SIZE(BB), %xmm4
        mulpd %xmm3, %xmm4
        subpd %xmm4, %xmm2
        movddup -3 * SIZE(BB), %xmm4
        mulpd %xmm3, %xmm4
        subpd %xmm4, %xmm1
        movddup -4 * SIZE(BB), %xmm4
        mulpd %xmm3, %xmm4
        subpd %xmm4, %xmm0
        movddup -6 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm2
        movddup -7 * SIZE(BB), %xmm4
        mulpd %xmm2, %xmm4
        subpd %xmm4, %xmm1
        movddup -8 * SIZE(BB), %xmm4
        mulpd %xmm2, %xmm4
        subpd %xmm4, %xmm0
        movddup -11 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm1
        movddup -12 * SIZE(BB), %xmm4
        mulpd %xmm1, %xmm4
        subpd %xmm4, %xmm0
        movddup -16 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
        movapd %xmm2, -16 * SIZE(BB)
        movapd %xmm5, -14 * SIZE(BB)
        movapd %xmm3, -12 * SIZE(BB)
        movapd %xmm7, -10 * SIZE(BB)
#else
        movapd %xmm0, -16 * SIZE(AA)
        movapd %xmm1, -14 * SIZE(AA)
        movapd %xmm2, -12 * SIZE(AA)
        movapd %xmm3, -10 * SIZE(AA)
#endif
#ifdef LN
        subl $2 * SIZE, CO1
#endif
        leal (LDC, LDC, 2), %eax
#if defined(LN) || defined(LT)
        movsd %xmm2, 0 * SIZE(CO1)
        movsd %xmm3, 1 * SIZE(CO1)
        movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
        movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
        movsd %xmm5, 0 * SIZE(CO1, LDC, 2)
        movsd %xmm7, 1 * SIZE(CO1, LDC, 2)
        movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
        movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
#else
        movsd %xmm0, 0 * SIZE(CO1)
        movhpd %xmm0, 1 * SIZE(CO1)
        movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
        movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
        movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
        movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
        movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
        movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
#endif
#ifndef LN
        addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif
#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %ebx # i --
        jg .L11
        ALIGN_4
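
/* .L20: M-remainder (m & 1): one row against the same 4-column panel.    */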
.L20:
        movl M, %ebx
        testl $1, %ebx # if (m & 1)
        jle .L29

#ifdef LN
        movl K, %eax
        sall $BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (AA, %eax, SIZE), AA
#endif
        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $2 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm2
        pxor %xmm5, %xmm5
        movaps -14 * SIZE(BB), %xmm3
        pxor %xmm6, %xmm6
        pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L25
        ALIGN_4

.L22:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps -12 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps -10 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -14 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps -8 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps -6 * SIZE(BB), %xmm3
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps -4 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps -2 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -12 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps 0 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps 2 * SIZE(BB), %xmm3
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps 6 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -10 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps 8 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps 10 * SIZE(BB), %xmm3
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps 12 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps 14 * SIZE(BB), %xmm3
        pshufd $0xee, %xmm0, %xmm1
        movaps -8 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm6
        movaps 16 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm7
        movaps 18 * SIZE(BB), %xmm3
        subl $ -8 * SIZE, AA
        subl $-32 * SIZE, BB
        subl $1, %eax
        jne .L22
        ALIGN_4

.L25:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L28

.L26:
        pshufd $0x44, %xmm0, %xmm1
        movsd -15 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm2, %xmm4
        movaps -12 * SIZE(BB), %xmm2
        addpd %xmm3, %xmm5
        movaps -10 * SIZE(BB), %xmm3
        addl $1 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L26
        ALIGN_4

.L28:
        addpd %xmm6, %xmm4
        addpd %xmm7, %xmm5
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $4, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (B, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
        movapd -16 * SIZE(BB), %xmm0
        movapd -14 * SIZE(BB), %xmm1
        subpd %xmm4, %xmm0
        subpd %xmm5, %xmm1
#else
        movapd -16 * SIZE(AA), %xmm1
        movapd -14 * SIZE(AA), %xmm3
        subpd %xmm4, %xmm1
        subpd %xmm5, %xmm3
        movapd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm1
        movapd %xmm3, %xmm2
        unpckhpd %xmm3, %xmm3
#endif
#ifdef LN
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm0
        mulpd %xmm4, %xmm1
#endif
#ifdef LT
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm0
        mulpd %xmm4, %xmm1
#endif
#ifdef RN
        movsd -16 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm0
        movsd -15 * SIZE(BB), %xmm4
        mulsd %xmm0, %xmm4
        subsd %xmm4, %xmm1
        movsd -14 * SIZE(BB), %xmm4
        mulsd %xmm0, %xmm4
        subsd %xmm4, %xmm2
        movsd -13 * SIZE(BB), %xmm4
        mulsd %xmm0, %xmm4
        subsd %xmm4, %xmm3
        movsd -11 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm1
        movsd -10 * SIZE(BB), %xmm4
        mulsd %xmm1, %xmm4
        subsd %xmm4, %xmm2
        movsd -9 * SIZE(BB), %xmm4
        mulsd %xmm1, %xmm4
        subsd %xmm4, %xmm3
        movsd -6 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm2
        movsd -5 * SIZE(BB), %xmm4
        mulsd %xmm2, %xmm4
        subsd %xmm4, %xmm3
        movsd -1 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm3
#endif
#ifdef RT
        movsd -1 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm3
        movsd -2 * SIZE(BB), %xmm4
        mulsd %xmm3, %xmm4
        subsd %xmm4, %xmm2
        movsd -3 * SIZE(BB), %xmm4
        mulsd %xmm3, %xmm4
        subsd %xmm4, %xmm1
        movsd -4 * SIZE(BB), %xmm4
        mulsd %xmm3, %xmm4
        subsd %xmm4, %xmm0
        movsd -6 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm2
        movsd -7 * SIZE(BB), %xmm4
        mulsd %xmm2, %xmm4
        subsd %xmm4, %xmm1
        movsd -8 * SIZE(BB), %xmm4
        mulsd %xmm2, %xmm4
        subsd %xmm4, %xmm0
        movsd -11 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm1
        movsd -12 * SIZE(BB), %xmm4
        mulsd %xmm1, %xmm4
        subsd %xmm4, %xmm0
        movsd -16 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
        movapd %xmm0, -16 * SIZE(BB)
        movapd %xmm1, -14 * SIZE(BB)
#else
        movsd %xmm0, -16 * SIZE(AA)
        movsd %xmm1, -15 * SIZE(AA)
        movsd %xmm2, -14 * SIZE(AA)
        movsd %xmm3, -13 * SIZE(AA)
#endif
#ifdef LN
        subl $1 * SIZE, CO1
#endif
        leal (LDC, LDC, 2), %eax
#if defined(LN) || defined(LT)
        movsd %xmm0, 0 * SIZE(CO1)
        movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
        movsd %xmm1, 0 * SIZE(CO1, LDC, 2)
        movhpd %xmm1, 0 * SIZE(CO1, %eax, 1)
#else
        movsd %xmm0, 0 * SIZE(CO1)
        movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
        movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
        movsd %xmm3, 0 * SIZE(CO1, %eax, 1)
#endif
#ifndef LN
        addl $1 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 4), BB
#endif
#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4

.L29:
#ifdef LN
        movl K, %eax
        leal (, %eax, SIZE), %eax
        leal (B, %eax, 4), B
#endif
#if defined(LT) || defined(RN)
        movl BB, B
#endif
#ifdef RN
        addl $4, KK
#endif
#ifdef RT
        subl $4, KK
#endif
        decl J # j --
        jg .L10
        ALIGN_4
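
/* .L30: N-remainder pass for a 2-column panel (n & 2); same structure as */
/* the 4-column case, with 2x2 and 1x2 tiles.                             */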
.L30:
        testl $2, N
        je .L60

#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, B
#endif
        leal (, LDC, 2), %eax
#ifdef RT
        subl %eax, C
#endif
        movl C, CO1
#ifndef RT
        addl %eax, C
#endif
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl M, %ebx
        sarl $1, %ebx # i = (m >> 1)
        jle .L50
        ALIGN_4

.L41:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
#endif
        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm1
        pxor %xmm5, %xmm5
#ifdef LN
        prefetcht0 -2 * SIZE(CO1)
        pxor %xmm6, %xmm6
        prefetcht0 -2 * SIZE(CO1, LDC)
        pxor %xmm7, %xmm7
#else
        prefetcht0 1 * SIZE(CO1)
        pxor %xmm6, %xmm6
        prefetcht0 1 * SIZE(CO1, LDC)
        pxor %xmm7, %xmm7
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L45
        ALIGN_4
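
/* .L42: 8x-unrolled k-loop for the 2x2 tile, again using the pshufd      */
/* $0x4e swap to cover both B columns.                                    */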
.L42:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -14 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps -12 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -10 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps -8 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -6 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -6 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps -4 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -2 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -2 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm7
        movaps 0 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm6
        subl $-16 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L42
        ALIGN_4

.L45:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # if (k & 7)
        BRANCH
        je .L48
        ALIGN_3

.L46:
        pshufd $0x4e, %xmm1, %xmm2
        mulpd %xmm0, %xmm1
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm1, %xmm5
        movaps -14 * SIZE(BB), %xmm1
        addpd %xmm2, %xmm4
        addl $2 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L46
        ALIGN_4

.L48:
        addpd %xmm6, %xmm4
        addpd %xmm7, %xmm5
        movaps %xmm4, %xmm0
        movsd %xmm5, %xmm4
        movsd %xmm0, %xmm5
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 2), BB
#endif
#if defined(LN) || defined(LT)
        movapd %xmm4, %xmm0
        unpcklpd %xmm5, %xmm4
        unpckhpd %xmm5, %xmm0
        movapd -16 * SIZE(BB), %xmm2
        movapd -14 * SIZE(BB), %xmm3
        subpd %xmm4, %xmm2
        subpd %xmm0, %xmm3
#else
        movapd -16 * SIZE(AA), %xmm0
        movapd -14 * SIZE(AA), %xmm1
        subpd %xmm4, %xmm0
        subpd %xmm5, %xmm1
#endif
#ifdef LN
        movddup -13 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm3
        movddup -14 * SIZE(AA), %xmm4
        mulpd %xmm3, %xmm4
        subpd %xmm4, %xmm2
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm2
#endif
#ifdef LT
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm2
        movddup -15 * SIZE(AA), %xmm4
        mulpd %xmm2, %xmm4
        subpd %xmm4, %xmm3
        movddup -13 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm3
#endif
#ifdef RN
        movddup -16 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm0
        movddup -15 * SIZE(BB), %xmm4
        mulpd %xmm0, %xmm4
        subpd %xmm4, %xmm1
        movddup -13 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm1
#endif
#ifdef RT
        movddup -13 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm1
        movddup -14 * SIZE(BB), %xmm4
        mulpd %xmm1, %xmm4
        subpd %xmm4, %xmm0
        movddup -16 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
        movapd %xmm2, -16 * SIZE(BB)
        movapd %xmm3, -14 * SIZE(BB)
#else
        movapd %xmm0, -16 * SIZE(AA)
        movapd %xmm1, -14 * SIZE(AA)
#endif
#ifdef LN
        subl $2 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
        movsd %xmm2, 0 * SIZE(CO1)
        movsd %xmm3, 1 * SIZE(CO1)
        movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
        movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
#else
        movsd %xmm0, 0 * SIZE(CO1)
        movhpd %xmm0, 1 * SIZE(CO1)
        movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
        movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
#endif
#ifndef LN
        addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 2), BB
#endif
#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %ebx # i --
        jg .L41
        ALIGN_4

.L50:
        movl M, %ebx
        testl $1, %ebx # if (m & 1)
        jle .L59

#ifdef LN
        movl K, %eax
        sall $BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (AA, %eax, SIZE), AA
#endif
        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm2
        pxor %xmm5, %xmm5

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L55
        ALIGN_4

.L52:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -14 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -14 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps -12 * SIZE(BB), %xmm2
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -10 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -12 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps -8 * SIZE(BB), %xmm2
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -6 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -10 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps -4 * SIZE(BB), %xmm2
        pshufd $0x44, %xmm0, %xmm1
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -2 * SIZE(BB), %xmm2
        pshufd $0xee, %xmm0, %xmm1
        movaps -8 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm5
        movaps 0 * SIZE(BB), %xmm2
        subl $ -8 * SIZE, AA
        subl $-16 * SIZE, BB
        subl $1, %eax
        jne .L52
        ALIGN_4

.L55:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L58

.L56:
        pshufd $0x44, %xmm0, %xmm1
        movsd -15 * SIZE(AA), %xmm0
        mulpd %xmm1, %xmm2
        addpd %xmm2, %xmm4
        movaps -14 * SIZE(BB), %xmm2
        addl $1 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L56
        ALIGN_4

.L58:
        addpd %xmm5, %xmm4
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        leal (B, %eax, 2), BB
#endif
#if defined(LN) || defined(LT)
        movapd -16 * SIZE(BB), %xmm0
        subpd %xmm4, %xmm0
#else
        movapd -16 * SIZE(AA), %xmm1
        subpd %xmm4, %xmm1
        movapd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm1
#endif
#ifdef LN
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm0
#endif
#ifdef LT
        movddup -16 * SIZE(AA), %xmm4
        mulpd %xmm4, %xmm0
#endif
#ifdef RN
        movsd -16 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm0
        movsd -15 * SIZE(BB), %xmm4
        mulsd %xmm0, %xmm4
        subsd %xmm4, %xmm1
        movsd -13 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm1
#endif
#ifdef RT
        movsd -13 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm1
        movsd -14 * SIZE(BB), %xmm4
        mulsd %xmm1, %xmm4
        subsd %xmm4, %xmm0
        movsd -16 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
        movapd %xmm0, -16 * SIZE(BB)
#else
        movsd %xmm0, -16 * SIZE(AA)
        movsd %xmm1, -15 * SIZE(AA)
#endif
#ifdef LN
        subl $1 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
        movsd %xmm0, 0 * SIZE(CO1)
        movhpd %xmm0, 0 * SIZE(CO1, LDC, 1)
#else
        movsd %xmm0, 0 * SIZE(CO1)
        movsd %xmm1, 0 * SIZE(CO1, LDC, 1)
#endif
#ifndef LN
        addl $1 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 2), BB
#endif
#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4

.L59:
#ifdef LN
        movl K, %eax
        leal (, %eax, SIZE), %eax
        leal (B, %eax, 2), B
#endif
#if defined(LT) || defined(RN)
        movl BB, B
#endif
#ifdef RN
        addl $2, KK
#endif
#ifdef RT
        subl $2, KK
#endif
        ALIGN_4
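
/* .L60: final N-remainder for a single column (n & 1): 2x1 and 1x1 tiles. */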
.L60:
        testl $1, N
        je .L999

#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif
#ifdef RT
        movl K, %eax
        sall $BASE_SHIFT, %eax
        subl %eax, B
#endif
#ifdef RT
        subl LDC, C
#endif
        movl C, CO1
#ifndef RT
        addl LDC, C
#endif
#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif
#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl M, %ebx
        sarl $1, %ebx # i = (m >> 1)
        jle .L80
        ALIGN_4

.L71:
#ifdef LN
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
#endif
        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $BASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm1
        pxor %xmm5, %xmm5
#ifdef LN
        prefetcht0 -2 * SIZE(CO1)
#else
        prefetcht0 1 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L75
        ALIGN_4
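
/* .L72: 8x-unrolled k-loop for the 2x1 tile; pshufd $0x44/$0xee          */
/* broadcast the low/high B element across both lanes.                    */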
.L72:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -14 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -12 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -6 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -10 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -4 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        pshufd $0x44, %xmm1, %xmm2
        mulpd %xmm0, %xmm2
        movaps -2 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        pshufd $0xee, %xmm1, %xmm2
        movaps -8 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps 0 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        subl $-16 * SIZE, AA
        subl $ -8 * SIZE, BB
        subl $1, %eax
        jne .L72
        ALIGN_4

.L75:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax # if (k & 7)
        BRANCH
        je .L78
        ALIGN_3

.L76:
        pshufd $0x44, %xmm1, %xmm2
        movsd -15 * SIZE(BB), %xmm1
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        addl $2 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L76
        ALIGN_4

.L78:
        addpd %xmm5, %xmm4
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $2, %eax
#else
        subl $1, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (B, %eax, 1), BB
#endif
#if defined(LN) || defined(LT)
        movapd -16 * SIZE(BB), %xmm1
        subpd %xmm4, %xmm1
        movapd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm1
#else
        movapd -16 * SIZE(AA), %xmm0
        subpd %xmm4, %xmm0
#endif
#ifdef LN
        movsd -13 * SIZE(AA), %xmm4
        mulsd %xmm4, %xmm1
        movsd -14 * SIZE(AA), %xmm4
        mulsd %xmm1, %xmm4
        subsd %xmm4, %xmm0
        movsd -16 * SIZE(AA), %xmm4
        mulsd %xmm4, %xmm0
#endif
#ifdef LT
        movsd -16 * SIZE(AA), %xmm4
        mulsd %xmm4, %xmm0
        movsd -15 * SIZE(AA), %xmm4
        mulsd %xmm0, %xmm4
        subsd %xmm4, %xmm1
        movsd -13 * SIZE(AA), %xmm4
        mulsd %xmm4, %xmm1
#endif
#ifdef RN
        movddup -16 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm0
#endif
#ifdef RT
        movddup -16 * SIZE(BB), %xmm4
        mulpd %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
        movsd %xmm0, -16 * SIZE(BB)
        movsd %xmm1, -15 * SIZE(BB)
#else
        movapd %xmm0, -16 * SIZE(AA)
#endif
#ifdef LN
        subl $2 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
        movsd %xmm0, 0 * SIZE(CO1)
        movsd %xmm1, 1 * SIZE(CO1)
#else
        movsd %xmm0, 0 * SIZE(CO1)
        movhpd %xmm0, 1 * SIZE(CO1)
#endif
#ifndef LN
        addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        addl %eax, BB
#endif
#ifdef LN
        subl $2, KK
#endif
#ifdef LT
        addl $2, KK
#endif
#ifdef RT
        movl K, %eax
        sall $1 + BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        decl %ebx # i --
        jg .L71
        ALIGN_4

.L80:
        movl M, %ebx
        testl $1, %ebx # if (m & 1)
        jle .L89

#ifdef LN
        movl K, %eax
        sall $BASE_SHIFT, %eax
        subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        leal (AA, %eax, SIZE), AA
#endif
        movl B, BB
#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $BASE_SHIFT, %eax
        addl %eax, BB
#endif

        movaps -16 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movaps -16 * SIZE(BB), %xmm2
        pxor %xmm5, %xmm5

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L85
        ALIGN_4

.L82:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        mulpd %xmm0, %xmm2
        movaps -14 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        movaps -14 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movaps -12 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        movaps -12 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movaps -10 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm4
        movaps -10 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movaps -8 * SIZE(AA), %xmm0
        addpd %xmm2, %xmm5
        movaps -8 * SIZE(BB), %xmm2
        subl $-8 * SIZE, AA
        subl $-8 * SIZE, BB
        decl %eax
        jne .L82
        ALIGN_4

.L85:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax
        BRANCH
        je .L88

.L86:
        mulsd %xmm0, %xmm2
        movsd -15 * SIZE(AA), %xmm0
        addsd %xmm2, %xmm4
        movsd -15 * SIZE(BB), %xmm2
        addl $1 * SIZE, AA
        addl $1 * SIZE, BB
        decl %eax
        jg .L86
        ALIGN_4

.L88:
        addpd %xmm5, %xmm4
        haddpd %xmm4, %xmm4
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $1, %eax
#endif
        movl AORIG, AA
        leal (, %eax, SIZE), %eax
        addl %eax, AA
        leal (B, %eax, 1), BB
#endif
#if defined(LN) || defined(LT)
        movsd -16 * SIZE(BB), %xmm0
        subsd %xmm4, %xmm0
#else
        movsd -16 * SIZE(AA), %xmm0
        subsd %xmm4, %xmm0
#endif
#ifdef LN
        movsd -16 * SIZE(AA), %xmm4
        mulsd %xmm4, %xmm0
#endif
#ifdef LT
        movsd -16 * SIZE(AA), %xmm4
        mulsd %xmm4, %xmm0
#endif
#ifdef RN
        movsd -16 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm0
#endif
#ifdef RT
        movsd -16 * SIZE(BB), %xmm4
        mulsd %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
        movsd %xmm0, -16 * SIZE(BB)
#else
        movsd %xmm0, -16 * SIZE(AA)
#endif
#ifdef LN
        subl $1 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
        movsd %xmm0, 0 * SIZE(CO1)
#else
        movsd %xmm0, 0 * SIZE(CO1)
#endif
#ifndef LN
        addl $1 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        leal (,%eax, SIZE), %eax
        addl %eax, AA
        addl %eax, BB
#endif
#ifdef LN
        subl $1, KK
#endif
#ifdef LT
        addl $1, KK
#endif
#ifdef RT
        movl K, %eax
        sall $BASE_SHIFT, %eax
        addl %eax, AORIG
#endif
        ALIGN_4

.L89:
#ifdef LN
        movl K, %eax
        leal (B, %eax, SIZE), B
#endif
#if defined(LT) || defined(RN)
        movl BB, B
#endif
#ifdef RN
        addl $1, KK
#endif
#ifdef RT
        subl $1, KK
#endif
        ALIGN_4
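
/* .L999: common exit; restore callee-saved registers and return.         */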
.L999:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        addl $ARGS, %esp
        ret
        EPILOGUE