You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LT_2x2_penryn.S 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it makes */
  /* that header expose its assembler-side definitions -- TODO confirm in */
  /* common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Stack frame bookkeeping. The prologue executes `subl $ARGS, %esp` and */
  /* then pushes four 32-bit registers (%ebp, %edi, %esi, %ebx), so ARGS is */
  /* the size of the local scratch area and STACK equals the 16 bytes */
  /* occupied by those pushes. */
  40. #define STACK 16
  41. #define ARGS 16
  /* Incoming arguments, addressed from %esp past the saved registers */
  /* (STACK) and the local area (ARGS). Offsets 16..23 are not named in */
  /* this file -- presumably arguments this kernel does not read; TODO */
  /* confirm against the caller's prototype. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define A 24 + STACK + ARGS(%esp)
  46. #define ARG_B 28 + STACK + ARGS(%esp)
  47. #define C 32 + STACK + ARGS(%esp)
  48. #define ARG_LDC 36 + STACK + ARGS(%esp)
  49. #define OFFSET 40 + STACK + ARGS(%esp)
  /* Local variables, stored in the ARGS-byte area reserved by the */
  /* prologue (immediately above the four saved registers): */
  /*   J     - counter for the outer loop, initialised from N and shifted */
  /*           right by one before the loop starts. */
  /*   KK    - running offset, initialised from OFFSET and adjusted per */
  /*           tile according to the LN/LT/RN/RT variant. */
  /*   KKK   - additional scratch slot. */
  /*   AORIG - saved copy of A, written in the non-LT/non-RN variants. */
  50. #define J 0 + STACK(%esp)
  51. #define KK 4 + STACK(%esp)
  52. #define KKK 8 + STACK(%esp)
  53. #define AORIG 12 + STACK(%esp)
  /* Per-target prefetch tuning. PREFETCH is the prefetch instruction and */
  /* PREFETCHSIZE the look-ahead distance; the inner loops use them as */
  /* `PREFETCH (PREFETCHSIZE + n) * SIZE(AA)`. No fallback is defined in */
  /* this block for targets not listed below. */
  54. #if defined(PENRYN) || defined(DUNNINGTON)
  55. #define PREFETCH prefetcht1
  56. #define PREFETCHSIZE 84
  57. #endif
  58. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
  59. #define PREFETCH prefetcht1
  60. #define PREFETCHSIZE 84
  61. #endif
  62. #ifdef ATOM
  63. #define PREFETCH prefetcht0
  64. #define PREFETCHSIZE 84
  65. #endif
  66. #ifdef NANO
  67. #define PREFETCH prefetcht0
  68. #define PREFETCHSIZE (16 * 2)
  69. #endif
  /* Register aliases used throughout the kernel: */
  /*   B   (%edi) - loaded from ARG_B in the prologue. */
  /*   LDC (%ebp) - loaded from ARG_LDC, then shifted left by ZBASE_SHIFT. */
  /*   AA  (%edx) - working pointer into A for the current tile. */
  /*   BB  (%ecx) - working pointer into B for the current tile. */
  /*   CO1 (%esi) - working pointer into C (copied from C per column panel). */
  70. #define B %edi
  71. #define LDC %ebp
  72. #define AA %edx
  73. #define BB %ecx
  74. #define CO1 %esi
  /* Accumulation mnemonics for the inner loops. Both expand to addps here; */
  /* the loops alternate ADD1 and ADD2 between accumulator registers. */
  75. #define ADD1 addps
  76. #define ADD2 addps
  77. PROLOGUE
  78. subl $ARGS, %esp
  79. pushl %ebp
  80. pushl %edi
  81. pushl %esi
  82. pushl %ebx
  83. PROFCODE
  84. movl ARG_B, B
  85. movl ARG_LDC, LDC
  86. movl OFFSET, %eax
  87. #ifdef RN
  88. negl %eax
  89. #endif
  90. movl %eax, KK
  91. movl M, %ebx
  92. testl %ebx, %ebx
  93. jle .L999
  94. subl $-32 * SIZE, A
  95. subl $-32 * SIZE, B
  96. sall $ZBASE_SHIFT, LDC
  97. #ifdef LN
  98. movl M, %eax
  99. sall $ZBASE_SHIFT, %eax
  100. addl %eax, C
  101. imull K, %eax
  102. addl %eax, A
  103. #endif
  104. #ifdef RT
  105. movl N, %eax
  106. sall $ZBASE_SHIFT, %eax
  107. imull K, %eax
  108. addl %eax, B
  109. movl N, %eax
  110. imull LDC, %eax
  111. addl %eax, C
  112. #endif
  113. #ifdef RN
  114. negl KK
  115. #endif
  116. #ifdef RT
  117. movl N, %eax
  118. subl OFFSET, %eax
  119. movl %eax, KK
  120. #endif
  121. movl N, %eax
  122. movl %eax, J
  123. sarl $1, J
  124. jle .L100
  125. ALIGN_4
  126. .L01:
  127. #if defined(LT) || defined(RN)
  128. movl A, %eax
  129. movl %eax, AA
  130. #else
  131. movl A, %eax
  132. movl %eax, AORIG
  133. #endif
  134. #ifdef RT
  135. movl K, %eax
  136. sall $1 + ZBASE_SHIFT, %eax
  137. subl %eax, B
  138. #endif
  139. leal (, LDC, 2), %eax
  140. #ifdef RT
  141. subl %eax, C
  142. #endif
  143. movl C, CO1
  144. #ifndef RT
  145. addl %eax, C
  146. #endif
  147. #ifdef LN
  148. movl OFFSET, %eax
  149. addl M, %eax
  150. movl %eax, KK
  151. #endif
  152. #ifdef LT
  153. movl OFFSET, %eax
  154. movl %eax, KK
  155. #endif
  156. movl M, %ebx
  157. sarl $1, %ebx
  158. jle .L30
  159. ALIGN_4
  160. .L10:
  161. #ifdef LN
  162. movl K, %eax
  163. sall $1 + ZBASE_SHIFT, %eax
  164. subl %eax, AORIG
  165. #endif
  166. #if defined(LN) || defined(RT)
  167. movl KK, %eax
  168. movl AORIG, AA
  169. sall $1 + ZBASE_SHIFT, %eax
  170. addl %eax, AA
  171. #endif
  172. movl B, BB
  173. #if defined(LN) || defined(RT)
  174. movl KK, %eax
  175. sall $1 + ZBASE_SHIFT, %eax
  176. addl %eax, BB
  177. #endif
  178. movaps -32 * SIZE(AA), %xmm0
  179. pxor %xmm2, %xmm2
  180. movaps -32 * SIZE(BB), %xmm1
  181. pxor %xmm3, %xmm3
  182. #ifdef LN
  183. pxor %xmm4, %xmm4
  184. prefetcht0 -4 * SIZE(CO1)
  185. pxor %xmm5, %xmm5
  186. prefetcht0 -4 * SIZE(CO1, LDC)
  187. pxor %xmm6, %xmm6
  188. pxor %xmm7, %xmm7
  189. #else
  190. pxor %xmm4, %xmm4
  191. prefetcht0 3 * SIZE(CO1)
  192. pxor %xmm5, %xmm5
  193. prefetcht0 3 * SIZE(CO1, LDC)
  194. pxor %xmm6, %xmm6
  195. pxor %xmm7, %xmm7
  196. #endif
  197. #if defined(LT) || defined(RN)
  198. movl KK, %eax
  199. #else
  200. movl K, %eax
  201. subl KK, %eax
  202. #endif
  203. sarl $3, %eax
  204. je .L15
  205. ALIGN_4
  206. .L11:
  207. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  208. ADD2 %xmm2, %xmm7
  209. pshufd $0xb1, %xmm1, %xmm2
  210. mulps %xmm0, %xmm1
  211. ADD1 %xmm3, %xmm6
  212. pshufd $0x1b, %xmm2, %xmm3
  213. mulps %xmm0, %xmm2
  214. ADD2 %xmm2, %xmm5
  215. pshufd $0xb1, %xmm3, %xmm2
  216. mulps %xmm0, %xmm3
  217. ADD1 %xmm1, %xmm4
  218. movaps -28 * SIZE(BB), %xmm1
  219. mulps %xmm0, %xmm2
  220. movaps -28 * SIZE(AA), %xmm0
  221. ADD2 %xmm2, %xmm7
  222. pshufd $0xb1, %xmm1, %xmm2
  223. mulps %xmm0, %xmm1
  224. ADD1 %xmm3, %xmm6
  225. pshufd $0x1b, %xmm2, %xmm3
  226. mulps %xmm0, %xmm2
  227. ADD2 %xmm2, %xmm5
  228. pshufd $0xb1, %xmm3, %xmm2
  229. mulps %xmm0, %xmm3
  230. ADD1 %xmm1, %xmm4
  231. movaps -24 * SIZE(BB), %xmm1
  232. mulps %xmm0, %xmm2
  233. movaps -24 * SIZE(AA), %xmm0
  234. ADD2 %xmm2, %xmm7
  235. pshufd $0xb1, %xmm1, %xmm2
  236. mulps %xmm0, %xmm1
  237. ADD1 %xmm3, %xmm6
  238. pshufd $0x1b, %xmm2, %xmm3
  239. mulps %xmm0, %xmm2
  240. ADD2 %xmm2, %xmm5
  241. pshufd $0xb1, %xmm3, %xmm2
  242. mulps %xmm0, %xmm3
  243. ADD1 %xmm1, %xmm4
  244. movaps -20 * SIZE(BB), %xmm1
  245. mulps %xmm0, %xmm2
  246. movaps -20 * SIZE(AA), %xmm0
  247. ADD2 %xmm2, %xmm7
  248. pshufd $0xb1, %xmm1, %xmm2
  249. mulps %xmm0, %xmm1
  250. ADD1 %xmm3, %xmm6
  251. pshufd $0x1b, %xmm2, %xmm3
  252. mulps %xmm0, %xmm2
  253. ADD2 %xmm2, %xmm5
  254. pshufd $0xb1, %xmm3, %xmm2
  255. mulps %xmm0, %xmm3
  256. ADD1 %xmm1, %xmm4
  257. movaps -16 * SIZE(BB), %xmm1
  258. mulps %xmm0, %xmm2
  259. movaps -16 * SIZE(AA), %xmm0
  260. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  261. ADD2 %xmm2, %xmm7
  262. pshufd $0xb1, %xmm1, %xmm2
  263. mulps %xmm0, %xmm1
  264. ADD1 %xmm3, %xmm6
  265. pshufd $0x1b, %xmm2, %xmm3
  266. mulps %xmm0, %xmm2
  267. ADD2 %xmm2, %xmm5
  268. pshufd $0xb1, %xmm3, %xmm2
  269. mulps %xmm0, %xmm3
  270. ADD1 %xmm1, %xmm4
  271. movaps -12 * SIZE(BB), %xmm1
  272. mulps %xmm0, %xmm2
  273. movaps -12 * SIZE(AA), %xmm0
  274. ADD2 %xmm2, %xmm7
  275. pshufd $0xb1, %xmm1, %xmm2
  276. mulps %xmm0, %xmm1
  277. ADD1 %xmm3, %xmm6
  278. pshufd $0x1b, %xmm2, %xmm3
  279. mulps %xmm0, %xmm2
  280. ADD2 %xmm2, %xmm5
  281. pshufd $0xb1, %xmm3, %xmm2
  282. mulps %xmm0, %xmm3
  283. ADD1 %xmm1, %xmm4
  284. movaps -8 * SIZE(BB), %xmm1
  285. mulps %xmm0, %xmm2
  286. movaps -8 * SIZE(AA), %xmm0
  287. ADD2 %xmm2, %xmm7
  288. pshufd $0xb1, %xmm1, %xmm2
  289. mulps %xmm0, %xmm1
  290. ADD1 %xmm3, %xmm6
  291. pshufd $0x1b, %xmm2, %xmm3
  292. mulps %xmm0, %xmm2
  293. ADD2 %xmm2, %xmm5
  294. pshufd $0xb1, %xmm3, %xmm2
  295. mulps %xmm0, %xmm3
  296. ADD1 %xmm1, %xmm4
  297. movaps -4 * SIZE(BB), %xmm1
  298. mulps %xmm0, %xmm2
  299. movaps -4 * SIZE(AA), %xmm0
  300. ADD2 %xmm2, %xmm7
  301. subl $-32 * SIZE, BB
  302. pshufd $0xb1, %xmm1, %xmm2
  303. mulps %xmm0, %xmm1
  304. ADD1 %xmm3, %xmm6
  305. pshufd $0x1b, %xmm2, %xmm3
  306. mulps %xmm0, %xmm2
  307. ADD2 %xmm2, %xmm5
  308. subl $-32 * SIZE, AA
  309. pshufd $0xb1, %xmm3, %xmm2
  310. mulps %xmm0, %xmm3
  311. ADD1 %xmm1, %xmm4
  312. movaps -32 * SIZE(BB), %xmm1
  313. mulps %xmm0, %xmm2
  314. movaps -32 * SIZE(AA), %xmm0
  315. decl %eax
  316. jne .L11
  317. ALIGN_4
  318. .L15:
  319. #if defined(LT) || defined(RN)
  320. movl KK, %eax
  321. #else
  322. movl K, %eax
  323. subl KK, %eax
  324. #endif
  325. andl $7, %eax # if (k & 1)
  326. BRANCH
  327. je .L14
  328. ALIGN_4
  329. .L13:
  330. ADD2 %xmm2, %xmm7
  331. pshufd $0xb1, %xmm1, %xmm2
  332. mulps %xmm0, %xmm1
  333. ADD1 %xmm3, %xmm6
  334. pshufd $0x1b, %xmm2, %xmm3
  335. mulps %xmm0, %xmm2
  336. ADD2 %xmm2, %xmm5
  337. pshufd $0xb1, %xmm3, %xmm2
  338. mulps %xmm0, %xmm3
  339. ADD1 %xmm1, %xmm4
  340. movaps -28 * SIZE(BB), %xmm1
  341. mulps %xmm0, %xmm2
  342. movaps -28 * SIZE(AA), %xmm0
  343. addl $4 * SIZE, AA
  344. addl $4 * SIZE, BB
  345. decl %eax
  346. jg .L13
  347. ALIGN_4
  348. .L14:
  349. #if defined(LN) || defined(RT)
  350. movl KK, %eax
  351. #ifdef LN
  352. subl $2, %eax
  353. #else
  354. subl $2, %eax
  355. #endif
  356. movl AORIG, AA
  357. sall $ZBASE_SHIFT, %eax
  358. leal (AA, %eax, 2), AA
  359. leal (B, %eax, 2), BB
  360. #endif
  361. ADD2 %xmm2, %xmm7
  362. pcmpeqb %xmm0, %xmm0
  363. ADD1 %xmm3, %xmm6
  364. psllq $63, %xmm0
  365. #ifndef CONJ
  366. pxor %xmm0, %xmm4
  367. pxor %xmm0, %xmm6
  368. shufps $0xb1, %xmm0, %xmm0
  369. #else
  370. #if defined(LN) || defined(LT)
  371. pxor %xmm0, %xmm5
  372. pxor %xmm0, %xmm7
  373. #else
  374. pshufd $0xb1, %xmm0, %xmm1
  375. pxor %xmm1, %xmm5
  376. pxor %xmm1, %xmm7
  377. #endif
  378. #endif
  379. haddps %xmm5, %xmm4
  380. haddps %xmm7, %xmm6
  381. shufps $0xd8, %xmm4, %xmm4
  382. shufps $0xd8, %xmm6, %xmm6
  383. movaps %xmm4, %xmm5
  384. shufps $0xe4, %xmm6, %xmm4
  385. shufps $0xe4, %xmm5, %xmm6
  386. #if defined(LN) || defined(LT)
  387. movaps %xmm4, %xmm5
  388. unpcklpd %xmm6, %xmm4
  389. unpckhpd %xmm6, %xmm5
  390. movaps -32 * SIZE(BB), %xmm2
  391. movaps -28 * SIZE(BB), %xmm3
  392. subps %xmm4, %xmm2
  393. subps %xmm5, %xmm3
  394. #else
  395. movaps -32 * SIZE(AA), %xmm1
  396. movaps -28 * SIZE(AA), %xmm5
  397. subps %xmm4, %xmm1
  398. subps %xmm6, %xmm5
  399. #endif
  400. #ifdef LN
  401. movaps -28 * SIZE(AA), %xmm5
  402. pshufd $0xee, %xmm5, %xmm6
  403. pshufd $0xbb, %xmm5, %xmm7
  404. pshufd $0xa0, %xmm3, %xmm4
  405. pshufd $0xf5, %xmm3, %xmm3
  406. #ifndef CONJ
  407. xorps %xmm0, %xmm3
  408. #else
  409. xorps %xmm0, %xmm4
  410. #endif
  411. mulps %xmm6, %xmm4
  412. mulps %xmm7, %xmm3
  413. addps %xmm4, %xmm3
  414. pshufd $0x44, %xmm5, %xmm6
  415. pshufd $0x11, %xmm5, %xmm7
  416. pshufd $0xa0, %xmm3, %xmm4
  417. pshufd $0xf5, %xmm3, %xmm1
  418. #ifndef CONJ
  419. xorps %xmm0, %xmm1
  420. #else
  421. xorps %xmm0, %xmm4
  422. #endif
  423. mulps %xmm6, %xmm4
  424. mulps %xmm7, %xmm1
  425. subps %xmm4, %xmm2
  426. subps %xmm1, %xmm2
  427. movaps -32 * SIZE(AA), %xmm5
  428. pshufd $0x44, %xmm5, %xmm6
  429. pshufd $0x11, %xmm5, %xmm7
  430. pshufd $0xa0, %xmm2, %xmm4
  431. pshufd $0xf5, %xmm2, %xmm2
  432. #ifndef CONJ
  433. xorps %xmm0, %xmm2
  434. #else
  435. xorps %xmm0, %xmm4
  436. #endif
  437. mulps %xmm6, %xmm4
  438. mulps %xmm7, %xmm2
  439. addps %xmm4, %xmm2
  440. #endif
  441. #ifdef LT
  442. movaps -32 * SIZE(AA), %xmm5
  443. pshufd $0x44, %xmm5, %xmm6
  444. pshufd $0x11, %xmm5, %xmm7
  445. pshufd $0xa0, %xmm2, %xmm4
  446. pshufd $0xf5, %xmm2, %xmm2
  447. #ifndef CONJ
  448. xorps %xmm0, %xmm2
  449. #else
  450. xorps %xmm0, %xmm4
  451. #endif
  452. mulps %xmm6, %xmm4
  453. mulps %xmm7, %xmm2
  454. addps %xmm4, %xmm2
  455. pshufd $0xee, %xmm5, %xmm6
  456. pshufd $0xbb, %xmm5, %xmm7
  457. pshufd $0xa0, %xmm2, %xmm4
  458. pshufd $0xf5, %xmm2, %xmm1
  459. #ifndef CONJ
  460. xorps %xmm0, %xmm1
  461. #else
  462. xorps %xmm0, %xmm4
  463. #endif
  464. mulps %xmm6, %xmm4
  465. mulps %xmm7, %xmm1
  466. subps %xmm4, %xmm3
  467. subps %xmm1, %xmm3
  468. movaps -28 * SIZE(AA), %xmm5
  469. pshufd $0xee, %xmm5, %xmm6
  470. pshufd $0xbb, %xmm5, %xmm7
  471. pshufd $0xa0, %xmm3, %xmm4
  472. pshufd $0xf5, %xmm3, %xmm3
  473. #ifndef CONJ
  474. xorps %xmm0, %xmm3
  475. #else
  476. xorps %xmm0, %xmm4
  477. #endif
  478. mulps %xmm6, %xmm4
  479. mulps %xmm7, %xmm3
  480. addps %xmm4, %xmm3
  481. #endif
  482. #ifdef RN
  483. movaps -32 * SIZE(BB), %xmm4
  484. pshufd $0x44, %xmm4, %xmm6
  485. pshufd $0x11, %xmm4, %xmm7
  486. pshufd $0xa0, %xmm1, %xmm3
  487. pshufd $0xf5, %xmm1, %xmm1
  488. #ifndef CONJ
  489. xorps %xmm0, %xmm1
  490. #else
  491. xorps %xmm0, %xmm3
  492. #endif
  493. mulps %xmm6, %xmm3
  494. mulps %xmm7, %xmm1
  495. addps %xmm3, %xmm1
  496. pshufd $0xee, %xmm4, %xmm6
  497. pshufd $0xbb, %xmm4, %xmm7
  498. pshufd $0xa0, %xmm1, %xmm3
  499. pshufd $0xf5, %xmm1, %xmm2
  500. #ifndef CONJ
  501. xorps %xmm0, %xmm2
  502. #else
  503. xorps %xmm0, %xmm3
  504. #endif
  505. mulps %xmm6, %xmm3
  506. mulps %xmm7, %xmm2
  507. subps %xmm3, %xmm5
  508. subps %xmm2, %xmm5
  509. movaps -28 * SIZE(BB), %xmm4
  510. pshufd $0xee, %xmm4, %xmm6
  511. pshufd $0xbb, %xmm4, %xmm7
  512. pshufd $0xa0, %xmm5, %xmm3
  513. pshufd $0xf5, %xmm5, %xmm5
  514. #ifndef CONJ
  515. xorps %xmm0, %xmm5
  516. #else
  517. xorps %xmm0, %xmm3
  518. #endif
  519. mulps %xmm6, %xmm3
  520. mulps %xmm7, %xmm5
  521. addps %xmm3, %xmm5
  522. #endif
  523. #ifdef RT
  524. movaps -28 * SIZE(BB), %xmm4
  525. pshufd $0xee, %xmm4, %xmm6
  526. pshufd $0xbb, %xmm4, %xmm7
  527. pshufd $0xa0, %xmm5, %xmm3
  528. pshufd $0xf5, %xmm5, %xmm5
  529. #ifndef CONJ
  530. xorps %xmm0, %xmm5
  531. #else
  532. xorps %xmm0, %xmm3
  533. #endif
  534. mulps %xmm6, %xmm3
  535. mulps %xmm7, %xmm5
  536. addps %xmm3, %xmm5
  537. pshufd $0x44, %xmm4, %xmm6
  538. pshufd $0x11, %xmm4, %xmm7
  539. pshufd $0xa0, %xmm5, %xmm3
  540. pshufd $0xf5, %xmm5, %xmm2
  541. #ifndef CONJ
  542. xorps %xmm0, %xmm2
  543. #else
  544. xorps %xmm0, %xmm3
  545. #endif
  546. mulps %xmm6, %xmm3
  547. mulps %xmm7, %xmm2
  548. subps %xmm3, %xmm1
  549. subps %xmm2, %xmm1
  550. movaps -32 * SIZE(BB), %xmm4
  551. pshufd $0x44, %xmm4, %xmm6
  552. pshufd $0x11, %xmm4, %xmm7
  553. pshufd $0xa0, %xmm1, %xmm3
  554. pshufd $0xf5, %xmm1, %xmm1
  555. #ifndef CONJ
  556. xorps %xmm0, %xmm1
  557. #else
  558. xorps %xmm0, %xmm3
  559. #endif
  560. mulps %xmm6, %xmm3
  561. mulps %xmm7, %xmm1
  562. addps %xmm3, %xmm1
  563. #endif
  564. #ifdef LN
  565. subl $4 * SIZE, CO1
  566. #endif
  567. #if defined(LN) || defined(LT)
  568. movaps %xmm2, -32 * SIZE(BB)
  569. movaps %xmm3, -28 * SIZE(BB)
  570. movlps %xmm2, 0 * SIZE(CO1)
  571. movlps %xmm3, 2 * SIZE(CO1)
  572. movhps %xmm2, 0 * SIZE(CO1, LDC)
  573. movhps %xmm3, 2 * SIZE(CO1, LDC)
  574. #else
  575. movaps %xmm1, -32 * SIZE(AA)
  576. movaps %xmm5, -28 * SIZE(AA)
  577. movlps %xmm1, 0 * SIZE(CO1)
  578. movhps %xmm1, 2 * SIZE(CO1)
  579. movlps %xmm5, 0 * SIZE(CO1, LDC)
  580. movhps %xmm5, 2 * SIZE(CO1, LDC)
  581. #endif
  582. #ifndef LN
  583. addl $4 * SIZE, CO1
  584. #endif
  585. #if defined(LT) || defined(RN)
  586. movl K, %eax
  587. subl KK, %eax
  588. sall $ZBASE_SHIFT, %eax
  589. leal (AA, %eax, 2), AA
  590. leal (BB, %eax, 2), BB
  591. #endif
  592. #ifdef LN
  593. subl $2, KK
  594. #endif
  595. #ifdef LT
  596. addl $2, KK
  597. #endif
  598. #ifdef RT
  599. movl K, %eax
  600. sall $1 + ZBASE_SHIFT, %eax
  601. addl %eax, AORIG
  602. #endif
  603. decl %ebx
  604. jg .L10
  605. ALIGN_4
  606. .L30:
  607. movl M, %ebx
  608. andl $1, %ebx
  609. jle .L99
  610. #ifdef LN
  611. movl K, %eax
  612. sall $ZBASE_SHIFT, %eax
  613. subl %eax, AORIG
  614. #endif
  615. #if defined(LN) || defined(RT)
  616. movl KK, %eax
  617. movl AORIG, AA
  618. sall $ZBASE_SHIFT, %eax
  619. addl %eax, AA
  620. #endif
  621. movl B, BB
  622. #if defined(LN) || defined(RT)
  623. movl KK, %eax
  624. sall $1 + ZBASE_SHIFT, %eax
  625. addl %eax, BB
  626. #endif
  627. movsd -32 * SIZE(AA), %xmm0
  628. pxor %xmm2, %xmm2
  629. movaps -32 * SIZE(BB), %xmm1
  630. pxor %xmm3, %xmm3
  631. pxor %xmm4, %xmm4
  632. pxor %xmm5, %xmm5
  633. pxor %xmm6, %xmm6
  634. pxor %xmm7, %xmm7
  635. #if defined(LT) || defined(RN)
  636. movl KK, %eax
  637. #else
  638. movl K, %eax
  639. subl KK, %eax
  640. #endif
  641. sarl $3, %eax
  642. je .L42
  643. ALIGN_4
  644. .L41:
  645. addps %xmm2, %xmm6
  646. pshufd $0x00, %xmm1, %xmm2
  647. mulps %xmm0, %xmm2
  648. addps %xmm3, %xmm7
  649. pshufd $0x55, %xmm1, %xmm3
  650. mulps %xmm0, %xmm3
  651. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  652. addps %xmm2, %xmm4
  653. pshufd $0xaa, %xmm1, %xmm2
  654. mulps %xmm0, %xmm2
  655. addps %xmm3, %xmm5
  656. pshufd $0xff, %xmm1, %xmm3
  657. movaps -28 * SIZE(BB), %xmm1
  658. mulps %xmm0, %xmm3
  659. movsd -30 * SIZE(AA), %xmm0
  660. addps %xmm2, %xmm6
  661. pshufd $0x00, %xmm1, %xmm2
  662. mulps %xmm0, %xmm2
  663. addps %xmm3, %xmm7
  664. pshufd $0x55, %xmm1, %xmm3
  665. mulps %xmm0, %xmm3
  666. addps %xmm2, %xmm4
  667. pshufd $0xaa, %xmm1, %xmm2
  668. mulps %xmm0, %xmm2
  669. addps %xmm3, %xmm5
  670. pshufd $0xff, %xmm1, %xmm3
  671. movaps -24 * SIZE(BB), %xmm1
  672. mulps %xmm0, %xmm3
  673. movsd -28 * SIZE(AA), %xmm0
  674. addps %xmm2, %xmm6
  675. pshufd $0x00, %xmm1, %xmm2
  676. mulps %xmm0, %xmm2
  677. addps %xmm3, %xmm7
  678. pshufd $0x55, %xmm1, %xmm3
  679. mulps %xmm0, %xmm3
  680. addps %xmm2, %xmm4
  681. pshufd $0xaa, %xmm1, %xmm2
  682. mulps %xmm0, %xmm2
  683. addps %xmm3, %xmm5
  684. pshufd $0xff, %xmm1, %xmm3
  685. movaps -20 * SIZE(BB), %xmm1
  686. mulps %xmm0, %xmm3
  687. movsd -26 * SIZE(AA), %xmm0
  688. addps %xmm2, %xmm6
  689. pshufd $0x00, %xmm1, %xmm2
  690. mulps %xmm0, %xmm2
  691. addps %xmm3, %xmm7
  692. pshufd $0x55, %xmm1, %xmm3
  693. mulps %xmm0, %xmm3
  694. addps %xmm2, %xmm4
  695. pshufd $0xaa, %xmm1, %xmm2
  696. mulps %xmm0, %xmm2
  697. addps %xmm3, %xmm5
  698. pshufd $0xff, %xmm1, %xmm3
  699. movaps -16 * SIZE(BB), %xmm1
  700. mulps %xmm0, %xmm3
  701. movsd -24 * SIZE(AA), %xmm0
  702. addps %xmm2, %xmm6
  703. pshufd $0x00, %xmm1, %xmm2
  704. mulps %xmm0, %xmm2
  705. addps %xmm3, %xmm7
  706. pshufd $0x55, %xmm1, %xmm3
  707. mulps %xmm0, %xmm3
  708. addps %xmm2, %xmm4
  709. pshufd $0xaa, %xmm1, %xmm2
  710. mulps %xmm0, %xmm2
  711. addps %xmm3, %xmm5
  712. pshufd $0xff, %xmm1, %xmm3
  713. movaps -12 * SIZE(BB), %xmm1
  714. mulps %xmm0, %xmm3
  715. movsd -22 * SIZE(AA), %xmm0
  716. addps %xmm2, %xmm6
  717. pshufd $0x00, %xmm1, %xmm2
  718. mulps %xmm0, %xmm2
  719. addps %xmm3, %xmm7
  720. pshufd $0x55, %xmm1, %xmm3
  721. mulps %xmm0, %xmm3
  722. addps %xmm2, %xmm4
  723. pshufd $0xaa, %xmm1, %xmm2
  724. mulps %xmm0, %xmm2
  725. addps %xmm3, %xmm5
  726. pshufd $0xff, %xmm1, %xmm3
  727. movaps -8 * SIZE(BB), %xmm1
  728. mulps %xmm0, %xmm3
  729. movsd -20 * SIZE(AA), %xmm0
  730. addps %xmm2, %xmm6
  731. pshufd $0x00, %xmm1, %xmm2
  732. mulps %xmm0, %xmm2
  733. addps %xmm3, %xmm7
  734. pshufd $0x55, %xmm1, %xmm3
  735. mulps %xmm0, %xmm3
  736. addps %xmm2, %xmm4
  737. pshufd $0xaa, %xmm1, %xmm2
  738. mulps %xmm0, %xmm2
  739. addps %xmm3, %xmm5
  740. pshufd $0xff, %xmm1, %xmm3
  741. movaps -4 * SIZE(BB), %xmm1
  742. mulps %xmm0, %xmm3
  743. movsd -18 * SIZE(AA), %xmm0
  744. addps %xmm2, %xmm6
  745. pshufd $0x00, %xmm1, %xmm2
  746. mulps %xmm0, %xmm2
  747. addps %xmm3, %xmm7
  748. pshufd $0x55, %xmm1, %xmm3
  749. mulps %xmm0, %xmm3
  750. addps %xmm2, %xmm4
  751. pshufd $0xaa, %xmm1, %xmm2
  752. mulps %xmm0, %xmm2
  753. addps %xmm3, %xmm5
  754. pshufd $0xff, %xmm1, %xmm3
  755. movaps 0 * SIZE(BB), %xmm1
  756. mulps %xmm0, %xmm3
  757. movsd -16 * SIZE(AA), %xmm0
  758. subl $-16 * SIZE, AA
  759. subl $-32 * SIZE, BB
  760. decl %eax
  761. jne .L41
  762. ALIGN_4
  763. .L42:
  764. #if defined(LT) || defined(RN)
  765. movl KK, %eax
  766. #else
  767. movl K, %eax
  768. subl KK, %eax
  769. #endif
  770. andl $7, %eax # if (k & 1)
  771. BRANCH
  772. je .L44
  773. ALIGN_4
  774. .L43:
  775. addps %xmm2, %xmm6
  776. pshufd $0x00, %xmm1, %xmm2
  777. mulps %xmm0, %xmm2
  778. addps %xmm3, %xmm7
  779. pshufd $0x55, %xmm1, %xmm3
  780. mulps %xmm0, %xmm3
  781. addps %xmm2, %xmm4
  782. pshufd $0xaa, %xmm1, %xmm2
  783. mulps %xmm0, %xmm2
  784. addps %xmm3, %xmm5
  785. pshufd $0xff, %xmm1, %xmm3
  786. movaps -28 * SIZE(BB), %xmm1
  787. mulps %xmm0, %xmm3
  788. movsd -30 * SIZE(AA), %xmm0
  789. addl $2 * SIZE, AA
  790. addl $4 * SIZE, BB
  791. decl %eax
  792. jg .L43
  793. ALIGN_4
  794. .L44:
  795. #if defined(LN) || defined(RT)
  796. movl KK, %eax
  797. #ifdef LN
  798. subl $1, %eax
  799. #else
  800. subl $2, %eax
  801. #endif
  802. movl AORIG, AA
  803. sall $ZBASE_SHIFT, %eax
  804. leal (AA, %eax, 1), AA
  805. leal (B, %eax, 2), BB
  806. #endif
  807. addps %xmm2, %xmm6
  808. addps %xmm3, %xmm7
  809. pshufd $0xb1, %xmm5, %xmm5
  810. pcmpeqb %xmm0, %xmm0
  811. pshufd $0xb1, %xmm7, %xmm7
  812. psllq $63, %xmm0
  813. #ifndef CONJ
  814. shufps $0xb1, %xmm0, %xmm0
  815. pxor %xmm0, %xmm5
  816. pxor %xmm0, %xmm7
  817. #else
  818. #if defined(LN) || defined(LT)
  819. pxor %xmm0, %xmm4
  820. pxor %xmm0, %xmm6
  821. #else
  822. pxor %xmm0, %xmm5
  823. pxor %xmm0, %xmm7
  824. #endif
  825. #endif
  826. addps %xmm5, %xmm4
  827. addps %xmm7, %xmm6
  828. #if defined(LN) || defined(LT)
  829. unpcklpd %xmm6, %xmm4
  830. movaps -32 * SIZE(BB), %xmm2
  831. subps %xmm4, %xmm2
  832. #else
  833. movsd -32 * SIZE(AA), %xmm1
  834. movsd -30 * SIZE(AA), %xmm5
  835. subps %xmm4, %xmm1
  836. subps %xmm6, %xmm5
  837. #endif
  838. #if defined(LN) || defined(LT)
  839. movaps -32 * SIZE(AA), %xmm5
  840. pshufd $0x44, %xmm5, %xmm6
  841. pshufd $0x11, %xmm5, %xmm7
  842. pshufd $0xa0, %xmm2, %xmm4
  843. pshufd $0xf5, %xmm2, %xmm2
  844. #ifndef CONJ
  845. xorps %xmm0, %xmm2
  846. #else
  847. xorps %xmm0, %xmm4
  848. #endif
  849. mulps %xmm6, %xmm4
  850. mulps %xmm7, %xmm2
  851. addps %xmm4, %xmm2
  852. #endif
  853. #ifdef RN
  854. movaps -32 * SIZE(BB), %xmm4
  855. pshufd $0x44, %xmm4, %xmm6
  856. pshufd $0x11, %xmm4, %xmm7
  857. pshufd $0xa0, %xmm1, %xmm3
  858. pshufd $0xf5, %xmm1, %xmm1
  859. #ifndef CONJ
  860. xorps %xmm0, %xmm1
  861. #else
  862. xorps %xmm0, %xmm3
  863. #endif
  864. mulps %xmm6, %xmm3
  865. mulps %xmm7, %xmm1
  866. addps %xmm3, %xmm1
  867. pshufd $0xee, %xmm4, %xmm6
  868. pshufd $0xbb, %xmm4, %xmm7
  869. pshufd $0xa0, %xmm1, %xmm3
  870. pshufd $0xf5, %xmm1, %xmm2
  871. #ifndef CONJ
  872. xorps %xmm0, %xmm2
  873. #else
  874. xorps %xmm0, %xmm3
  875. #endif
  876. mulps %xmm6, %xmm3
  877. mulps %xmm7, %xmm2
  878. subps %xmm3, %xmm5
  879. subps %xmm2, %xmm5
  880. movaps -28 * SIZE(BB), %xmm4
  881. pshufd $0xee, %xmm4, %xmm6
  882. pshufd $0xbb, %xmm4, %xmm7
  883. pshufd $0xa0, %xmm5, %xmm3
  884. pshufd $0xf5, %xmm5, %xmm5
  885. #ifndef CONJ
  886. xorps %xmm0, %xmm5
  887. #else
  888. xorps %xmm0, %xmm3
  889. #endif
  890. mulps %xmm6, %xmm3
  891. mulps %xmm7, %xmm5
  892. addps %xmm3, %xmm5
  893. #endif
  894. #ifdef RT
  895. movaps -28 * SIZE(BB), %xmm4
  896. pshufd $0xee, %xmm4, %xmm6
  897. pshufd $0xbb, %xmm4, %xmm7
  898. pshufd $0xa0, %xmm5, %xmm3
  899. pshufd $0xf5, %xmm5, %xmm5
  900. #ifndef CONJ
  901. xorps %xmm0, %xmm5
  902. #else
  903. xorps %xmm0, %xmm3
  904. #endif
  905. mulps %xmm6, %xmm3
  906. mulps %xmm7, %xmm5
  907. addps %xmm3, %xmm5
  908. pshufd $0x44, %xmm4, %xmm6
  909. pshufd $0x11, %xmm4, %xmm7
  910. pshufd $0xa0, %xmm5, %xmm3
  911. pshufd $0xf5, %xmm5, %xmm2
  912. #ifndef CONJ
  913. xorps %xmm0, %xmm2
  914. #else
  915. xorps %xmm0, %xmm3
  916. #endif
  917. mulps %xmm6, %xmm3
  918. mulps %xmm7, %xmm2
  919. subps %xmm3, %xmm1
  920. subps %xmm2, %xmm1
  921. movaps -32 * SIZE(BB), %xmm4
  922. pshufd $0x44, %xmm4, %xmm6
  923. pshufd $0x11, %xmm4, %xmm7
  924. pshufd $0xa0, %xmm1, %xmm3
  925. pshufd $0xf5, %xmm1, %xmm1
  926. #ifndef CONJ
  927. xorps %xmm0, %xmm1
  928. #else
  929. xorps %xmm0, %xmm3
  930. #endif
  931. mulps %xmm6, %xmm3
  932. mulps %xmm7, %xmm1
  933. addps %xmm3, %xmm1
  934. #endif
  935. #ifdef LN
  936. subl $2 * SIZE, CO1
  937. #endif
  938. #if defined(LN) || defined(LT)
  939. movaps %xmm2, -32 * SIZE(BB)
  940. movlps %xmm2, 0 * SIZE(CO1)
  941. movhps %xmm2, 0 * SIZE(CO1, LDC)
  942. #else
  943. movlps %xmm1, -32 * SIZE(AA)
  944. movlps %xmm5, -30 * SIZE(AA)
  945. movlps %xmm1, 0 * SIZE(CO1)
  946. movlps %xmm5, 0 * SIZE(CO1, LDC)
  947. #endif
  948. #ifndef LN
  949. addl $2 * SIZE, CO1
  950. #endif
  951. #if defined(LT) || defined(RN)
  952. movl K, %eax
  953. subl KK, %eax
  954. sall $ZBASE_SHIFT, %eax
  955. leal (AA, %eax, 1), AA
  956. leal (BB, %eax, 2), BB
  957. #endif
  958. #ifdef LN
  959. subl $1, KK
  960. #endif
  961. #ifdef LT
  962. addl $1, KK
  963. #endif
  964. #ifdef RT
  965. movl K, %eax
  966. sall $ZBASE_SHIFT, %eax
  967. addl %eax, AORIG
  968. #endif
  969. ALIGN_4
  970. .L99: # End of one pass of the J loop: adjust B and KK for the variant being built, then loop back to .L01.
  971. #ifdef LN
  972. movl K, %eax
  973. sall $1 + ZBASE_SHIFT, %eax # eax = K shifted left by (1 + ZBASE_SHIFT)
  974. addl %eax, B # LN: B += eax
  975. #endif
  976. #if defined(LT) || defined(RN)
  977. movl BB, B # LT/RN: B takes the current value of BB
  978. #endif
  979. #ifdef RN
  980. addl $2, KK # RN: KK += 2
  981. #endif
  982. #ifdef RT
  983. subl $2, KK # RT: KK -= 2
  984. #endif
  985. decl J # j --
  986. jg .L01 # loop again while the decremented J is still positive
  987. ALIGN_4
  988. .L100: # Runs only when N is odd: sets up pointers and KK, then enters the .L110 loop (presumably the one leftover column - confirm).
  989. movl N, %eax
  990. andl $1, %eax
  991. jle .L999 # (N and 1) is zero: nothing to do, go to the exit code
  992. #if defined(LT) || defined(RN)
  993. movl A, %eax
  994. movl %eax, AA # LT/RN: AA = A
  995. #else
  996. movl A, %eax
  997. movl %eax, AORIG # otherwise: AORIG = A
  998. #endif
  999. #ifdef RT
  1000. movl K, %eax
  1001. sall $ZBASE_SHIFT, %eax
  1002. subl %eax, B # RT: B -= K shifted left by ZBASE_SHIFT
  1003. #endif
  1004. #ifdef RT
  1005. subl LDC, C # RT: C -= LDC before CO1 is taken
  1006. #endif
  1007. movl C, CO1 # CO1 = C
  1008. #ifndef RT
  1009. addl LDC, C # non-RT: C += LDC after CO1 is taken
  1010. #endif
  1011. #ifdef LN
  1012. movl OFFSET, %eax
  1013. addl M, %eax
  1014. movl %eax, KK # LN: KK = OFFSET + M
  1015. #endif
  1016. #ifdef LT
  1017. movl OFFSET, %eax
  1018. movl %eax, KK # LT: KK = OFFSET
  1019. #endif
  1020. movl M, %ebx
  1021. sarl $1, %ebx # ebx = M / 2, the trip count of the .L110 loop
  1022. jle .L130 # zero (or negative) trip count: skip the .L110 loop
  1023. ALIGN_4
  1024. .L110: # Top of one .L110 iteration (counted in ebx): position AA and BB, clear xmm2..xmm7, preload operands, compute the unrolled trip count.
  1025. #ifdef LN
  1026. movl K, %eax
  1027. sall $1 + ZBASE_SHIFT, %eax
  1028. subl %eax, AORIG # LN: AORIG -= K shifted left by (1 + ZBASE_SHIFT)
  1029. #endif
  1030. #if defined(LN) || defined(RT)
  1031. movl KK, %eax
  1032. movl AORIG, AA
  1033. sall $1 + ZBASE_SHIFT, %eax
  1034. addl %eax, AA # LN/RT: AA = AORIG + (KK shifted left by (1 + ZBASE_SHIFT))
  1035. #endif
  1036. movl B, BB # BB = B
  1037. #if defined(LN) || defined(RT)
  1038. movl KK, %eax
  1039. sall $ZBASE_SHIFT, %eax
  1040. addl %eax, BB # LN/RT: BB = B + (KK shifted left by ZBASE_SHIFT)
  1041. #endif
  1042. movaps -32 * SIZE(AA), %xmm0 # xmm0 = 16 bytes at -32*SIZE(AA); movaps needs a 16-byte aligned address
  1043. pxor %xmm2, %xmm2
  1044. movsd -32 * SIZE(BB), %xmm1 # low 64 bits of xmm1 from -32*SIZE(BB)
  1045. pxor %xmm3, %xmm3
  1046. movhps -30 * SIZE(BB), %xmm1 # high 64 bits of xmm1 from -30*SIZE(BB)
  1047. pxor %xmm4, %xmm4
  1048. #ifdef LN
  1049. prefetcht0 -4 * SIZE(CO1)
  1050. #else
  1051. prefetcht0 3 * SIZE(CO1)
  1052. #endif
  1053. pxor %xmm5, %xmm5
  1054. pxor %xmm6, %xmm6
  1055. pxor %xmm7, %xmm7 # xmm2..xmm7 are all zero from here
  1056. #if defined(LT) || defined(RN)
  1057. movl KK, %eax # LT/RN: count = KK
  1058. #else
  1059. movl K, %eax
  1060. subl KK, %eax # otherwise: count = K - KK
  1061. #endif
  1062. sarl $3, %eax # eax = count / 8, number of passes through the 8x unrolled loop .L111
  1063. je .L112 # no full pass: go straight to the remainder handling
  1064. ALIGN_4
  1065. .L111: # 8x unrolled multiply-accumulate loop; eax counts passes, each pass advances AA by 32*SIZE and BB by 16*SIZE.
  1066. addps %xmm2, %xmm4 # add the product left pending by the previous step into xmm4
  1067. pshufd $0x00, %xmm1, %xmm2 # broadcast lane 0 of xmm1
  1068. mulps %xmm0, %xmm2
  1069. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1070. addps %xmm3, %xmm5 # add the other pending product into xmm5
  1071. pshufd $0x55, %xmm1, %xmm3 # broadcast lane 1 of xmm1
  1072. mulps %xmm0, %xmm3
  1073. movaps -28 * SIZE(AA), %xmm0 # next 16 bytes of AA
  1074. addps %xmm2, %xmm4
  1075. pshufd $0xaa, %xmm1, %xmm2 # broadcast lane 2 of xmm1
  1076. mulps %xmm0, %xmm2
  1077. addps %xmm3, %xmm5
  1078. pshufd $0xff, %xmm1, %xmm3 # broadcast lane 3 of xmm1
  1079. movaps -28 * SIZE(BB), %xmm1 # all four lanes consumed: load the next 16 bytes of BB
  1080. mulps %xmm0, %xmm3
  1081. movaps -24 * SIZE(AA), %xmm0
  1082. addps %xmm2, %xmm4
  1083. pshufd $0x00, %xmm1, %xmm2
  1084. mulps %xmm0, %xmm2
  1085. addps %xmm3, %xmm5
  1086. pshufd $0x55, %xmm1, %xmm3
  1087. mulps %xmm0, %xmm3
  1088. movaps -20 * SIZE(AA), %xmm0
  1089. addps %xmm2, %xmm4
  1090. pshufd $0xaa, %xmm1, %xmm2
  1091. mulps %xmm0, %xmm2
  1092. addps %xmm3, %xmm5
  1093. pshufd $0xff, %xmm1, %xmm3
  1094. movaps -24 * SIZE(BB), %xmm1
  1095. mulps %xmm0, %xmm3
  1096. movaps -16 * SIZE(AA), %xmm0
  1097. PREFETCH (PREFETCHSIZE + 16) * SIZE(AA)
  1098. addps %xmm2, %xmm4
  1099. pshufd $0x00, %xmm1, %xmm2
  1100. mulps %xmm0, %xmm2
  1101. addps %xmm3, %xmm5
  1102. pshufd $0x55, %xmm1, %xmm3
  1103. mulps %xmm0, %xmm3
  1104. movaps -12 * SIZE(AA), %xmm0
  1105. addps %xmm2, %xmm4
  1106. pshufd $0xaa, %xmm1, %xmm2
  1107. mulps %xmm0, %xmm2
  1108. addps %xmm3, %xmm5
  1109. pshufd $0xff, %xmm1, %xmm3
  1110. movaps -20 * SIZE(BB), %xmm1
  1111. mulps %xmm0, %xmm3
  1112. movaps -8 * SIZE(AA), %xmm0
  1113. addps %xmm2, %xmm4
  1114. pshufd $0x00, %xmm1, %xmm2
  1115. mulps %xmm0, %xmm2
  1116. addps %xmm3, %xmm5
  1117. pshufd $0x55, %xmm1, %xmm3
  1118. mulps %xmm0, %xmm3
  1119. movaps -4 * SIZE(AA), %xmm0
  1120. addps %xmm2, %xmm4
  1121. pshufd $0xaa, %xmm1, %xmm2
  1122. mulps %xmm0, %xmm2
  1123. addps %xmm3, %xmm5
  1124. pshufd $0xff, %xmm1, %xmm3
  1125. movaps -16 * SIZE(BB), %xmm1 # preload BB for the next pass (or for the remainder loop)
  1126. mulps %xmm0, %xmm3
  1127. movaps 0 * SIZE(AA), %xmm0 # preload AA for the next pass (or for the remainder loop)
  1128. subl $-32 * SIZE, AA # AA += 32*SIZE
  1129. subl $-16 * SIZE, BB # BB += 16*SIZE
  1130. decl %eax
  1131. jne .L111 # the last two products stay pending in xmm2/xmm3 on exit
  1132. ALIGN_4
  1133. .L112: # Recompute the count and branch on the iterations left over after the 8x unrolled loop.
  1134. #if defined(LT) || defined(RN)
  1135. movl KK, %eax # LT/RN: count = KK
  1136. #else
  1137. movl K, %eax
  1138. subl KK, %eax # otherwise: count = K - KK
  1139. #endif
  1140. andl $7, %eax # eax = count modulo 8 (mask is 7, not 1)
  1141. BRANCH
  1142. je .L114 # no leftover iterations
  1143. ALIGN_4
  1144. .L113: # Remainder loop: one multiply-accumulate step per iteration, eax iterations (1 to 7).
  1145. addps %xmm2, %xmm4 # add the pending product into xmm4
  1146. pshufd $0x00, %xmm1, %xmm2 # broadcast lane 0 of xmm1
  1147. mulps %xmm0, %xmm2
  1148. addps %xmm3, %xmm5 # add the other pending product into xmm5
  1149. pshufd $0x55, %xmm1, %xmm3 # broadcast lane 1 of xmm1
  1150. movsd -30 * SIZE(BB), %xmm1 # next 64 bits of BB
  1151. mulps %xmm0, %xmm3
  1152. movaps -28 * SIZE(AA), %xmm0 # next 16 bytes of AA
  1153. addl $4 * SIZE, AA # AA += 4*SIZE
  1154. addl $2 * SIZE, BB # BB += 2*SIZE
  1155. decl %eax
  1156. jg .L113
  1157. ALIGN_4
  1158. .L114: # Accumulation done: combine the partial sums, subtract them from the stored values, run the variant-specific multiply and substitute steps, store the results, advance pointers and KK, and loop on ebx.
  1159. #if defined(LN) || defined(RT)
  1160. movl KK, %eax
  1161. #ifdef LN
  1162. subl $2, %eax # LN: eax = KK - 2
  1163. #else
  1164. subl $1, %eax # otherwise: eax = KK - 1
  1165. #endif
  1166. movl AORIG, AA
  1167. sall $ZBASE_SHIFT, %eax
  1168. leal (AA, %eax, 2), AA # AA = AORIG + 2 * eax
  1169. leal (B, %eax, 1), BB # BB = B + eax
  1170. #endif
  1171. addps %xmm2, %xmm4 # fold in the last pending products
  1172. addps %xmm3, %xmm5
  1173. pshufd $0xb1, %xmm5, %xmm5 # swap the two 32-bit lanes inside each 64-bit half of xmm5
  1174. pcmpeqb %xmm0, %xmm0 # xmm0 = all ones
  1175. psllq $63, %xmm0 # keep only bit 63 of each 64-bit half: sign-bit mask for lanes 1 and 3
  1176. #ifndef CONJ
  1177. shufps $0xb1, %xmm0, %xmm0 # non-CONJ: move the sign-bit mask to lanes 0 and 2
  1178. pxor %xmm0, %xmm5 # flip the sign of lanes 0 and 2 of xmm5
  1179. #else
  1180. #if defined(LN) || defined(LT)
  1181. pxor %xmm0, %xmm4 # CONJ with LN/LT: flip the sign of lanes 1 and 3 of xmm4
  1182. #else
  1183. pxor %xmm0, %xmm5 # CONJ otherwise: flip the sign of lanes 1 and 3 of xmm5
  1184. #endif
  1185. #endif
  1186. addps %xmm5, %xmm4 # xmm4 = combined sum; xmm0 keeps the sign mask for the xorps uses below
  1187. #if defined(LN) || defined(LT)
  1188. movaps %xmm4, %xmm5
  1189. unpcklpd %xmm6, %xmm4 # xmm4 = low 64 bits of the sum; xmm6 is still zero (cleared at .L110, not written by .L111 or .L113)
  1190. unpckhpd %xmm6, %xmm5 # xmm5 = high 64 bits of the sum moved into the low half
  1191. movsd -32 * SIZE(BB), %xmm2
  1192. movsd -30 * SIZE(BB), %xmm3
  1193. subps %xmm4, %xmm2 # xmm2 = value at -32*SIZE(BB) minus the low half of the sum
  1194. subps %xmm5, %xmm3 # xmm3 = value at -30*SIZE(BB) minus the high half of the sum
  1195. #else
  1196. movaps -32 * SIZE(AA), %xmm1
  1197. subps %xmm4, %xmm1 # xmm1 = 16 bytes at -32*SIZE(AA) minus the sum
  1198. #endif
  1199. #ifdef LN
  1200. movaps -28 * SIZE(AA), %xmm5 # LN step 1: coefficients from -28*SIZE(AA)
  1201. pshufd $0xee, %xmm5, %xmm6 # xmm6 = high 64 bits of xmm5 copied to both halves
  1202. pshufd $0xbb, %xmm5, %xmm7 # xmm7 = the same pair with its two lanes swapped
  1203. pshufd $0xa0, %xmm3, %xmm4 # xmm4 = even lanes of xmm3, each duplicated
  1204. pshufd $0xf5, %xmm3, %xmm3 # xmm3 = odd lanes of xmm3, each duplicated
  1205. #ifndef CONJ
  1206. xorps %xmm0, %xmm3
  1207. #else
  1208. xorps %xmm0, %xmm4
  1209. #endif
  1210. mulps %xmm6, %xmm4
  1211. mulps %xmm7, %xmm3
  1212. addps %xmm4, %xmm3 # xmm3 = old xmm3 times the high pair, a complex product if lanes are read as (re, im); presumably a pre-inverted diagonal entry - TODO confirm
  1213. pshufd $0x44, %xmm5, %xmm6 # LN step 2: low 64 bits of the same load, copied to both halves
  1214. pshufd $0x11, %xmm5, %xmm7 # and the lane-swapped form
  1215. pshufd $0xa0, %xmm3, %xmm4
  1216. pshufd $0xf5, %xmm3, %xmm1
  1217. #ifndef CONJ
  1218. xorps %xmm0, %xmm1
  1219. #else
  1220. xorps %xmm0, %xmm4
  1221. #endif
  1222. mulps %xmm6, %xmm4
  1223. mulps %xmm7, %xmm1
  1224. subps %xmm4, %xmm2
  1225. subps %xmm1, %xmm2 # xmm2 -= (updated xmm3) times the low pair
  1226. movaps -32 * SIZE(AA), %xmm5 # LN step 3: coefficients from -32*SIZE(AA)
  1227. pshufd $0x44, %xmm5, %xmm6
  1228. pshufd $0x11, %xmm5, %xmm7
  1229. pshufd $0xa0, %xmm2, %xmm4
  1230. pshufd $0xf5, %xmm2, %xmm2
  1231. #ifndef CONJ
  1232. xorps %xmm0, %xmm2
  1233. #else
  1234. xorps %xmm0, %xmm4
  1235. #endif
  1236. mulps %xmm6, %xmm4
  1237. mulps %xmm7, %xmm2
  1238. addps %xmm4, %xmm2 # xmm2 = old xmm2 times the low pair of this load
  1239. #endif
  1240. #ifdef LT
  1241. movaps -32 * SIZE(AA), %xmm5 # LT step 1: coefficients from -32*SIZE(AA)
  1242. pshufd $0x44, %xmm5, %xmm6 # low 64 bits copied to both halves
  1243. pshufd $0x11, %xmm5, %xmm7 # same pair, lanes swapped
  1244. pshufd $0xa0, %xmm2, %xmm4
  1245. pshufd $0xf5, %xmm2, %xmm2
  1246. #ifndef CONJ
  1247. xorps %xmm0, %xmm2
  1248. #else
  1249. xorps %xmm0, %xmm4
  1250. #endif
  1251. mulps %xmm6, %xmm4
  1252. mulps %xmm7, %xmm2
  1253. addps %xmm4, %xmm2 # xmm2 = old xmm2 times the low pair
  1254. pshufd $0xee, %xmm5, %xmm6 # LT step 2: high 64 bits of the same load
  1255. pshufd $0xbb, %xmm5, %xmm7
  1256. pshufd $0xa0, %xmm2, %xmm4
  1257. pshufd $0xf5, %xmm2, %xmm1
  1258. #ifndef CONJ
  1259. xorps %xmm0, %xmm1
  1260. #else
  1261. xorps %xmm0, %xmm4
  1262. #endif
  1263. mulps %xmm6, %xmm4
  1264. mulps %xmm7, %xmm1
  1265. subps %xmm4, %xmm3
  1266. subps %xmm1, %xmm3 # xmm3 -= (updated xmm2) times the high pair
  1267. movaps -28 * SIZE(AA), %xmm5 # LT step 3: coefficients from -28*SIZE(AA)
  1268. pshufd $0xee, %xmm5, %xmm6
  1269. pshufd $0xbb, %xmm5, %xmm7
  1270. pshufd $0xa0, %xmm3, %xmm4
  1271. pshufd $0xf5, %xmm3, %xmm3
  1272. #ifndef CONJ
  1273. xorps %xmm0, %xmm3
  1274. #else
  1275. xorps %xmm0, %xmm4
  1276. #endif
  1277. mulps %xmm6, %xmm4
  1278. mulps %xmm7, %xmm3
  1279. addps %xmm4, %xmm3 # xmm3 = old xmm3 times the high pair of this load
  1280. #endif
  1281. #if defined(RN) || defined(RT)
  1282. movaps -32 * SIZE(BB), %xmm4 # RN/RT: 16 bytes read, only the low 64 bits are used by the shuffles below
  1283. pshufd $0x44, %xmm4, %xmm6
  1284. pshufd $0x11, %xmm4, %xmm7
  1285. pshufd $0xa0, %xmm1, %xmm3
  1286. pshufd $0xf5, %xmm1, %xmm1
  1287. #ifndef CONJ
  1288. xorps %xmm0, %xmm1
  1289. #else
  1290. xorps %xmm0, %xmm3
  1291. #endif
  1292. mulps %xmm6, %xmm3
  1293. mulps %xmm7, %xmm1
  1294. addps %xmm3, %xmm1 # both 64-bit halves of xmm1 are multiplied by the same coefficient pair
  1295. #endif
  1296. #ifdef LN
  1297. subl $4 * SIZE, CO1 # LN: step CO1 back by 4*SIZE before storing
  1298. #endif
  1299. #if defined(LN) || defined(LT)
  1300. movlps %xmm2, -32 * SIZE(BB) # write both results back over the values read from BB
  1301. movlps %xmm3, -30 * SIZE(BB)
  1302. movlps %xmm2, 0 * SIZE(CO1) # and to the output at CO1
  1303. movlps %xmm3, 2 * SIZE(CO1)
  1304. #else
  1305. movaps %xmm1, -32 * SIZE(AA) # write the result back over the values read from AA
  1306. movlps %xmm1, 0 * SIZE(CO1) # and to the output at CO1
  1307. movhps %xmm1, 2 * SIZE(CO1)
  1308. #endif
  1309. #ifndef LN
  1310. addl $4 * SIZE, CO1 # non-LN: advance CO1 by 4*SIZE after storing
  1311. #endif
  1312. #if defined(LT) || defined(RN)
  1313. movl K, %eax
  1314. subl KK, %eax
  1315. sall $ZBASE_SHIFT, %eax # eax = (K - KK) shifted left by ZBASE_SHIFT
  1316. leal (AA, %eax, 2), AA # AA += 2 * eax
  1317. leal (BB, %eax, 1), BB # BB += eax
  1318. #endif
  1319. #ifdef LN
  1320. subl $2, KK # LN: KK -= 2
  1321. #endif
  1322. #ifdef LT
  1323. addl $2, KK # LT: KK += 2
  1324. #endif
  1325. #ifdef RT
  1326. movl K, %eax
  1327. sall $1 + ZBASE_SHIFT, %eax
  1328. addl %eax, AORIG # RT: AORIG += K shifted left by (1 + ZBASE_SHIFT)
  1329. #endif
  1330. decl %ebx # i --
  1331. jg .L110 # next iteration while ebx is still positive
  1332. ALIGN_4
  1333. .L130: # Runs only when M is odd: same setup as .L110 but with 64-bit operand loads and no extra factor of two in the AA offsets.
  1334. movl M, %ebx
  1335. andl $1, %ebx
  1336. jle .L149 # (M and 1) is zero: skip to .L149
  1337. #ifdef LN
  1338. movl K, %eax
  1339. sall $ZBASE_SHIFT, %eax
  1340. subl %eax, AORIG # LN: AORIG -= K shifted left by ZBASE_SHIFT
  1341. #endif
  1342. #if defined(LN) || defined(RT)
  1343. movl KK, %eax
  1344. movl AORIG, AA
  1345. sall $ZBASE_SHIFT, %eax
  1346. addl %eax, AA # LN/RT: AA = AORIG + (KK shifted left by ZBASE_SHIFT)
  1347. #endif
  1348. movl B, BB # BB = B
  1349. #if defined(LN) || defined(RT)
  1350. movl KK, %eax
  1351. sall $ZBASE_SHIFT, %eax
  1352. addl %eax, BB # LN/RT: BB = B + (KK shifted left by ZBASE_SHIFT)
  1353. #endif
  1354. movsd -32 * SIZE(AA), %xmm0 # xmm0 = 64 bits at -32*SIZE(AA), upper half zero
  1355. pxor %xmm2, %xmm2
  1356. movsd -32 * SIZE(BB), %xmm1 # xmm1 = 64 bits at -32*SIZE(BB), upper half zero
  1357. pxor %xmm3, %xmm3
  1358. pxor %xmm4, %xmm4
  1359. pxor %xmm5, %xmm5
  1360. pxor %xmm6, %xmm6
  1361. pxor %xmm7, %xmm7 # xmm2..xmm7 are all zero from here
  1362. #if defined(LT) || defined(RN)
  1363. movl KK, %eax # LT/RN: count = KK
  1364. #else
  1365. movl K, %eax
  1366. subl KK, %eax # otherwise: count = K - KK
  1367. #endif
  1368. sarl $3, %eax # eax = count / 8, number of passes through the 8x unrolled loop .L141
  1369. je .L142 # no full pass: go straight to the remainder handling
  1370. ALIGN_4
  1371. .L141: # 8x unrolled multiply-accumulate loop on 64-bit operands; eax counts passes, each pass advances AA and BB by 16*SIZE.
  1372. addps %xmm2, %xmm4 # add the product left pending by the previous step into xmm4
  1373. pshufd $0x00, %xmm1, %xmm2 # broadcast lane 0 of xmm1
  1374. mulps %xmm0, %xmm2
  1375. addps %xmm3, %xmm5 # add the other pending product into xmm5
  1376. pshufd $0x55, %xmm1, %xmm3 # broadcast lane 1 of xmm1
  1377. movsd -30 * SIZE(BB), %xmm1 # next 64 bits of BB
  1378. mulps %xmm0, %xmm3
  1379. movsd -30 * SIZE(AA), %xmm0 # next 64 bits of AA
  1380. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1381. addps %xmm2, %xmm4
  1382. pshufd $0x00, %xmm1, %xmm2
  1383. mulps %xmm0, %xmm2
  1384. addps %xmm3, %xmm5
  1385. pshufd $0x55, %xmm1, %xmm3
  1386. movsd -28 * SIZE(BB), %xmm1
  1387. mulps %xmm0, %xmm3
  1388. movsd -28 * SIZE(AA), %xmm0
  1389. addps %xmm2, %xmm4
  1390. pshufd $0x00, %xmm1, %xmm2
  1391. mulps %xmm0, %xmm2
  1392. addps %xmm3, %xmm5
  1393. pshufd $0x55, %xmm1, %xmm3
  1394. movsd -26 * SIZE(BB), %xmm1
  1395. mulps %xmm0, %xmm3
  1396. movsd -26 * SIZE(AA), %xmm0
  1397. addps %xmm2, %xmm4
  1398. pshufd $0x00, %xmm1, %xmm2
  1399. mulps %xmm0, %xmm2
  1400. addps %xmm3, %xmm5
  1401. pshufd $0x55, %xmm1, %xmm3
  1402. movsd -24 * SIZE(BB), %xmm1
  1403. mulps %xmm0, %xmm3
  1404. movsd -24 * SIZE(AA), %xmm0
  1405. addps %xmm2, %xmm4
  1406. pshufd $0x00, %xmm1, %xmm2
  1407. mulps %xmm0, %xmm2
  1408. addps %xmm3, %xmm5
  1409. pshufd $0x55, %xmm1, %xmm3
  1410. movsd -22 * SIZE(BB), %xmm1
  1411. mulps %xmm0, %xmm3
  1412. movsd -22 * SIZE(AA), %xmm0
  1413. addps %xmm2, %xmm4
  1414. pshufd $0x00, %xmm1, %xmm2
  1415. mulps %xmm0, %xmm2
  1416. addps %xmm3, %xmm5
  1417. pshufd $0x55, %xmm1, %xmm3
  1418. movsd -20 * SIZE(BB), %xmm1
  1419. mulps %xmm0, %xmm3
  1420. movsd -20 * SIZE(AA), %xmm0
  1421. addps %xmm2, %xmm4
  1422. pshufd $0x00, %xmm1, %xmm2
  1423. mulps %xmm0, %xmm2
  1424. addps %xmm3, %xmm5
  1425. pshufd $0x55, %xmm1, %xmm3
  1426. movsd -18 * SIZE(BB), %xmm1
  1427. mulps %xmm0, %xmm3
  1428. movsd -18 * SIZE(AA), %xmm0
  1429. addps %xmm2, %xmm4
  1430. pshufd $0x00, %xmm1, %xmm2
  1431. mulps %xmm0, %xmm2
  1432. addps %xmm3, %xmm5
  1433. pshufd $0x55, %xmm1, %xmm3
  1434. movsd -16 * SIZE(BB), %xmm1 # preload BB for the next pass (or for the remainder loop)
  1435. mulps %xmm0, %xmm3
  1436. movsd -16 * SIZE(AA), %xmm0 # preload AA for the next pass (or for the remainder loop)
  1437. subl $-16 * SIZE, AA # AA += 16*SIZE
  1438. subl $-16 * SIZE, BB # BB += 16*SIZE
  1439. decl %eax
  1440. jne .L141 # the last two products stay pending in xmm2/xmm3 on exit
  1441. ALIGN_4
  1442. .L142: # Recompute the count and branch on the iterations left over after the 8x unrolled loop.
  1443. #if defined(LT) || defined(RN)
  1444. movl KK, %eax # LT/RN: count = KK
  1445. #else
  1446. movl K, %eax
  1447. subl KK, %eax # otherwise: count = K - KK
  1448. #endif
  1449. andl $7, %eax # eax = count modulo 8 (mask is 7, not 1)
  1450. BRANCH
  1451. je .L144 # no leftover iterations
  1452. ALIGN_4
  1453. .L143: # Remainder loop: one multiply-accumulate step per iteration, eax iterations (1 to 7).
  1454. addps %xmm2, %xmm4 # add the pending product into xmm4
  1455. pshufd $0x00, %xmm1, %xmm2 # broadcast lane 0 of xmm1
  1456. mulps %xmm0, %xmm2
  1457. addps %xmm3, %xmm5 # add the other pending product into xmm5
  1458. pshufd $0x55, %xmm1, %xmm3 # broadcast lane 1 of xmm1
  1459. movsd -30 * SIZE(BB), %xmm1 # next 64 bits of BB
  1460. mulps %xmm0, %xmm3
  1461. movsd -30 * SIZE(AA), %xmm0 # next 64 bits of AA
  1462. addl $2 * SIZE, AA # AA += 2*SIZE
  1463. addl $2 * SIZE, BB # BB += 2*SIZE
  1464. decl %eax
  1465. jg .L143
  1466. ALIGN_4
  1467. .L144: # Accumulation done for the odd-M case: combine the partial sums, subtract from the stored value, multiply by one coefficient pair, store, then advance pointers and KK.
  1468. #if defined(LN) || defined(RT)
  1469. movl KK, %eax
  1470. subl $1, %eax
  1471. movl AORIG, AA
  1472. sall $ZBASE_SHIFT, %eax # eax = (KK - 1) shifted left by ZBASE_SHIFT
  1473. leal (AA, %eax, 1), AA # AA = AORIG + eax
  1474. leal (B, %eax, 1), BB # BB = B + eax
  1475. #endif
  1476. addps %xmm2, %xmm4 # fold in the last pending products
  1477. addps %xmm3, %xmm5
  1478. pshufd $0xb1, %xmm5, %xmm5 # swap the two 32-bit lanes inside each 64-bit half of xmm5
  1479. pcmpeqb %xmm0, %xmm0 # xmm0 = all ones
  1480. psllq $63, %xmm0 # keep only bit 63 of each 64-bit half: sign-bit mask for lanes 1 and 3
  1481. #ifndef CONJ
  1482. shufps $0xb1, %xmm0, %xmm0 # non-CONJ: move the sign-bit mask to lanes 0 and 2
  1483. pxor %xmm0, %xmm5 # flip the sign of lanes 0 and 2 of xmm5
  1484. #else
  1485. #if defined(LN) || defined(LT)
  1486. pxor %xmm0, %xmm4 # CONJ with LN/LT: flip the sign of lanes 1 and 3 of xmm4
  1487. #else
  1488. pxor %xmm0, %xmm5 # CONJ otherwise: flip the sign of lanes 1 and 3 of xmm5
  1489. #endif
  1490. #endif
  1491. addps %xmm5, %xmm4 # xmm4 = combined sum; xmm0 keeps the sign mask for the xorps uses below
  1492. #if defined(LN) || defined(LT)
  1493. movsd -32 * SIZE(BB), %xmm2
  1494. subps %xmm4, %xmm2 # xmm2 = value at -32*SIZE(BB) minus the sum
  1495. #else
  1496. movsd -32 * SIZE(AA), %xmm1
  1497. subps %xmm4, %xmm1 # xmm1 = value at -32*SIZE(AA) minus the sum
  1498. #endif
  1499. #if defined(LN) || defined(LT)
  1500. movaps -32 * SIZE(AA), %xmm5 # NOTE(review): reads 16 bytes and needs 16-byte alignment, but only the low 64 bits are used below - confirm AA is aligned here
  1501. pshufd $0x44, %xmm5, %xmm6 # xmm6 = low 64 bits of xmm5 copied to both halves
  1502. pshufd $0x11, %xmm5, %xmm7 # xmm7 = the same pair with its two lanes swapped
  1503. pshufd $0xa0, %xmm2, %xmm4 # xmm4 = even lanes of xmm2, each duplicated
  1504. pshufd $0xf5, %xmm2, %xmm2 # xmm2 = odd lanes of xmm2, each duplicated
  1505. #ifndef CONJ
  1506. xorps %xmm0, %xmm2
  1507. #else
  1508. xorps %xmm0, %xmm4
  1509. #endif
  1510. mulps %xmm6, %xmm4
  1511. mulps %xmm7, %xmm2
  1512. addps %xmm4, %xmm2 # xmm2 = old xmm2 times the coefficient pair, a complex product if lanes are read as (re, im); presumably a pre-inverted diagonal entry - TODO confirm
  1513. #endif
  1514. #if defined(RN) || defined(RT)
  1515. movaps -32 * SIZE(BB), %xmm4 # NOTE(review): same 16-byte aligned read with only the low 64 bits used - confirm BB is aligned here
  1516. pshufd $0x44, %xmm4, %xmm6
  1517. pshufd $0x11, %xmm4, %xmm7
  1518. pshufd $0xa0, %xmm1, %xmm3
  1519. pshufd $0xf5, %xmm1, %xmm1
  1520. #ifndef CONJ
  1521. xorps %xmm0, %xmm1
  1522. #else
  1523. xorps %xmm0, %xmm3
  1524. #endif
  1525. mulps %xmm6, %xmm3
  1526. mulps %xmm7, %xmm1
  1527. addps %xmm3, %xmm1 # xmm1 = old xmm1 times the coefficient pair
  1528. #endif
  1529. #ifdef LN
  1530. subl $2 * SIZE, CO1 # LN: step CO1 back by 2*SIZE before storing
  1531. #endif
  1532. #if defined(LN) || defined(LT)
  1533. movlps %xmm2, -32 * SIZE(BB) # write the result back over the value read from BB
  1534. movlps %xmm2, 0 * SIZE(CO1) # and to the output at CO1
  1535. #else
  1536. movlps %xmm1, -32 * SIZE(AA) # write the result back over the value read from AA
  1537. movlps %xmm1, 0 * SIZE(CO1) # and to the output at CO1
  1538. #endif
  1539. #ifndef LN
  1540. addl $2 * SIZE, CO1 # non-LN: advance CO1 by 2*SIZE after storing
  1541. #endif
  1542. #if defined(LT) || defined(RN)
  1543. movl K, %eax
  1544. subl KK, %eax
  1545. sall $ZBASE_SHIFT, %eax # eax = (K - KK) shifted left by ZBASE_SHIFT
  1546. leal (AA, %eax, 1), AA # AA += eax
  1547. leal (BB, %eax, 1), BB # BB += eax
  1548. #endif
  1549. #ifdef LN
  1550. subl $1, KK # LN: KK -= 1
  1551. #endif
  1552. #ifdef LT
  1553. addl $1, KK # LT: KK += 1
  1554. #endif
  1555. #ifdef RT
  1556. movl K, %eax
  1557. sall $ZBASE_SHIFT, %eax
  1558. addl %eax, AORIG # RT: AORIG += K shifted left by ZBASE_SHIFT
  1559. #endif
  1560. ALIGN_4
  1561. .L149: # End of the odd-N section: final adjustment of B and KK before falling through to the exit code.
  1562. #ifdef LN
  1563. movl K, %eax
  1564. sall $ZBASE_SHIFT, %eax
  1565. addl %eax, B # LN: B += K shifted left by ZBASE_SHIFT
  1566. #endif
  1567. #if defined(LT) || defined(RN)
  1568. movl BB, B # LT/RN: B takes the current value of BB
  1569. #endif
  1570. #ifdef RN
  1571. addl $1, KK # RN: KK += 1
  1572. #endif
  1573. #ifdef RT
  1574. subl $1, KK # RT: KK -= 1
  1575. #endif
  1576. ALIGN_4
  1577. .L999: # Common exit: pop four registers, release ARGS bytes of stack, and return.
  1578. popl %ebx # presumably mirrors pushes in the prologue, which is outside this view - confirm the order
  1579. popl %esi
  1580. popl %edi
  1581. popl %ebp
  1582. addl $ARGS, %esp # drop the ARGS-byte local area
  1583. ret
  1584. EPILOGUE