trsm_kernel_LT_4x2_atom.S
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
/* Assembled through the OpenBLAS cpp macro framework (AT&T syntax, x86-64);
   common.h supplies PROLOGUE/EPILOGUE, SIZE, BASE_SHIFT, ARGn, ALIGN_n, BRANCH. */
#define ASSEMBLER
#include "common.h"
/* Integer-register roles for this TRSM micro-kernel.
   Names below follow the System V argument registers; under WINDOWS_ABI the
   real arguments arrive in rcx/rdx/r8/r9 and are remapped in the prologue. */
#define M %rdi           /* rows of A / C (first dimension)               */
#define N %rsi           /* columns of B / C (second dimension)           */
#define K %rdx           /* shared inner dimension                        */
#define A %rcx           /* base of packed A panel                        */
#define B %r8            /* base of packed B panel                        */
#define C %r9            /* base of output matrix C                       */
#define LDC %r10         /* leading dimension of C (scaled to bytes later)*/
#define I %r11           /* loop counter over M                           */
#define AO %r13          /* current position in A                         */
#define BO %r14          /* current position in B                         */
#define CO1 %r15         /* pointer to C, first column of the 2-wide tile */
#define CO2 %rbx         /* pointer to C, second column of the tile       */
#define KK %rbp          /* running diagonal offset for the TRSM variants */
#define BB %r12          /* look-ahead pointer into B used for prefetch   */
#ifndef WINDOWS_ABI
#define STACKSIZE 128
/* Stack arguments: located above our frame and the return address.        */
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
/* Scratch slots inside our own frame (0..40 hold saved callee-saved regs):*/
#define OFFSET 48(%rsp)  /* saved copy of the TRSM offset argument        */
#define J 56(%rsp)       /* outer loop counter over N                     */
#define KKK 64(%rsp)
#define AORIG 72(%rsp)   /* saved start-of-panel pointer for A            */
#else
#define STACKSIZE 256    /* larger frame: xmm6-xmm15 are callee-saved here*/
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#define OFFSET 224(%rsp)
#define J 232(%rsp)
#define KKK 240(%rsp)
#define AORIG 248(%rsp)
#endif
#define PREFETCH prefetcht0
/* Prefetch distance in elements; presumably tuned for Intel Atom
   (per the filename) -- NOTE(review): confirm before reusing elsewhere.  */
#define PREFETCHSIZE (8 * 8 + 3)
  76. PROLOGUE
  77. PROFCODE
  78. subq $STACKSIZE, %rsp
  79. movq %rbx, 0(%rsp)
  80. movq %rbp, 8(%rsp)
  81. movq %r12, 16(%rsp)
  82. movq %r13, 24(%rsp)
  83. movq %r14, 32(%rsp)
  84. movq %r15, 40(%rsp)
  85. #ifdef WINDOWS_ABI
  86. movq %rdi, 48(%rsp)
  87. movq %rsi, 56(%rsp)
  88. movups %xmm6, 64(%rsp)
  89. movups %xmm7, 80(%rsp)
  90. movups %xmm8, 96(%rsp)
  91. movups %xmm9, 112(%rsp)
  92. movups %xmm10, 128(%rsp)
  93. movups %xmm11, 144(%rsp)
  94. movups %xmm12, 160(%rsp)
  95. movups %xmm13, 176(%rsp)
  96. movups %xmm14, 192(%rsp)
  97. movups %xmm15, 208(%rsp)
  98. movq ARG1, M
  99. movq ARG2, N
  100. movq ARG3, K
  101. movq OLD_A, A
  102. movq OLD_B, B
  103. movq OLD_C, C
  104. #endif
  105. movq OLD_LDC, LDC
  106. movq OLD_OFFSET, KK
  107. movq KK, OFFSET
  108. leaq (, LDC, SIZE), LDC
  109. #ifdef LN
  110. leaq (, M, SIZE), %rax
  111. addq %rax, C
  112. imulq K, %rax
  113. addq %rax, A
  114. #endif
  115. #ifdef RT
  116. leaq (, N, SIZE), %rax
  117. imulq K, %rax
  118. addq %rax, B
  119. movq N, %rax
  120. imulq LDC, %rax
  121. addq %rax, C
  122. #endif
  123. #ifdef RN
  124. negq KK
  125. #endif
  126. #ifdef RT
  127. movq N, %rax
  128. subq OFFSET, %rax
  129. movq %rax, KK
  130. #endif
  131. movq N, J
  132. sarq $1, J
  133. jle .L40
  134. ALIGN_4
  135. .L10:
  136. #if defined(LT) || defined(RN)
  137. movq A, AO
  138. #else
  139. movq A, AORIG
  140. #endif
  141. #ifdef RT
  142. movq K, %rax
  143. salq $1 + BASE_SHIFT, %rax
  144. subq %rax, B
  145. leaq (, LDC, 2), %rax
  146. subq %rax, C
  147. #endif
  148. movq C, CO1
  149. leaq (C, LDC, 1), CO2
  150. #ifndef RT
  151. leaq (C, LDC, 2), C
  152. #endif
  153. #ifdef LN
  154. movq OFFSET, %rax
  155. addq M, %rax
  156. movq %rax, KK
  157. #endif
  158. movq K, %rax
  159. salq $BASE_SHIFT + 1, %rax
  160. leaq (B, %rax), BB
  161. #ifdef LT
  162. movq OFFSET, %rax
  163. movq %rax, KK
  164. #endif
  165. movq M, I
  166. sarq $2, I
  167. jle .L20
  168. ALIGN_4
  169. .L11:
  170. #ifdef LN
  171. movq K, %rax
  172. salq $2 + BASE_SHIFT, %rax
  173. subq %rax, AORIG
  174. #endif
  175. #if defined(LN) || defined(RT)
  176. movq KK, %rax
  177. leaq (, %rax, SIZE), %rax
  178. movq AORIG, AO
  179. leaq (AO, %rax, 4), AO
  180. leaq (B, %rax, 2), BO
  181. #else
  182. movq B, BO
  183. #endif
  184. prefetcht0 0 * SIZE(BB)
  185. subq $-8 * SIZE, BB
  186. movsd 0 * SIZE(AO), %xmm0
  187. xorps %xmm2, %xmm2
  188. movsd 1 * SIZE(AO), %xmm4
  189. xorps %xmm5, %xmm5
  190. movsd 2 * SIZE(AO), %xmm5
  191. xorps %xmm6, %xmm6
  192. xorps %xmm7, %xmm7
  193. movsd 0 * SIZE(BO), %xmm1
  194. xorps %xmm8, %xmm8
  195. xorps %xmm9, %xmm9
  196. movsd 1 * SIZE(BO), %xmm3
  197. xorps %xmm10, %xmm10
  198. xorps %xmm11, %xmm11
  199. prefetcht0 3 * SIZE(CO1)
  200. xorps %xmm12, %xmm12
  201. xorps %xmm13, %xmm13
  202. prefetcht0 3 * SIZE(CO2)
  203. xorps %xmm14, %xmm14
  204. xorps %xmm15, %xmm15
  205. #if defined(LT) || defined(RN)
  206. movq KK, %rax
  207. #else
  208. movq K, %rax
  209. subq KK, %rax
  210. #endif
  211. sarq $2, %rax
  212. je .L15
  213. ALIGN_4
  214. .L12:
  215. addsd %xmm2, %xmm13
  216. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  217. movaps %xmm0, %xmm2
  218. mulsd %xmm1, %xmm0
  219. addsd %xmm7, %xmm14
  220. movsd 3 * SIZE(AO), %xmm7
  221. mulsd %xmm3, %xmm2
  222. addsd %xmm6, %xmm15
  223. PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO)
  224. movaps %xmm4, %xmm6
  225. mulsd %xmm1, %xmm4
  226. addsd %xmm0, %xmm8
  227. movsd 4 * SIZE(AO), %xmm0
  228. mulsd %xmm3, %xmm6
  229. addsd %xmm2, %xmm9
  230. movaps %xmm5, %xmm2
  231. mulsd %xmm1, %xmm5
  232. addsd %xmm4, %xmm10
  233. movsd 5 * SIZE(AO), %xmm4
  234. mulsd %xmm3, %xmm2
  235. addsd %xmm6, %xmm11
  236. movaps %xmm7, %xmm6
  237. mulsd %xmm1, %xmm7
  238. movsd 2 * SIZE(BO), %xmm1
  239. addsd %xmm5, %xmm12
  240. movsd 6 * SIZE(AO), %xmm5
  241. mulsd %xmm3, %xmm6
  242. movsd 3 * SIZE(BO), %xmm3
  243. addsd %xmm2, %xmm13
  244. movaps %xmm0, %xmm2
  245. mulsd %xmm1, %xmm0
  246. addsd %xmm7, %xmm14
  247. movsd 7 * SIZE(AO), %xmm7
  248. mulsd %xmm3, %xmm2
  249. addsd %xmm6, %xmm15
  250. movaps %xmm4, %xmm6
  251. mulsd %xmm1, %xmm4
  252. addsd %xmm0, %xmm8
  253. movsd 8 * SIZE(AO), %xmm0
  254. mulsd %xmm3, %xmm6
  255. addsd %xmm2, %xmm9
  256. movaps %xmm5, %xmm2
  257. mulsd %xmm1, %xmm5
  258. addsd %xmm4, %xmm10
  259. movsd 9 * SIZE(AO), %xmm4
  260. mulsd %xmm3, %xmm2
  261. addsd %xmm6, %xmm11
  262. movaps %xmm7, %xmm6
  263. mulsd %xmm1, %xmm7
  264. movsd 4 * SIZE(BO), %xmm1
  265. addsd %xmm5, %xmm12
  266. movsd 10 * SIZE(AO), %xmm5
  267. mulsd %xmm3, %xmm6
  268. movsd 5 * SIZE(BO), %xmm3
  269. addsd %xmm2, %xmm13
  270. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  271. movaps %xmm0, %xmm2
  272. mulsd %xmm1, %xmm0
  273. addsd %xmm7, %xmm14
  274. movsd 11 * SIZE(AO), %xmm7
  275. mulsd %xmm3, %xmm2
  276. addsd %xmm6, %xmm15
  277. movaps %xmm4, %xmm6
  278. mulsd %xmm1, %xmm4
  279. addsd %xmm0, %xmm8
  280. movsd 12 * SIZE(AO), %xmm0
  281. mulsd %xmm3, %xmm6
  282. addsd %xmm2, %xmm9
  283. movaps %xmm5, %xmm2
  284. mulsd %xmm1, %xmm5
  285. addsd %xmm4, %xmm10
  286. movsd 13 * SIZE(AO), %xmm4
  287. mulsd %xmm3, %xmm2
  288. addsd %xmm6, %xmm11
  289. movaps %xmm7, %xmm6
  290. mulsd %xmm1, %xmm7
  291. movsd 6 * SIZE(BO), %xmm1
  292. addsd %xmm5, %xmm12
  293. movsd 14 * SIZE(AO), %xmm5
  294. mulsd %xmm3, %xmm6
  295. movsd 7 * SIZE(BO), %xmm3
  296. addsd %xmm2, %xmm13
  297. movaps %xmm0, %xmm2
  298. mulsd %xmm1, %xmm0
  299. addsd %xmm7, %xmm14
  300. movsd 15 * SIZE(AO), %xmm7
  301. mulsd %xmm3, %xmm2
  302. subq $-16 * SIZE, AO
  303. addsd %xmm6, %xmm15
  304. movaps %xmm4, %xmm6
  305. mulsd %xmm1, %xmm4
  306. addsd %xmm0, %xmm8
  307. movsd 0 * SIZE(AO), %xmm0
  308. mulsd %xmm3, %xmm6
  309. addsd %xmm2, %xmm9
  310. movaps %xmm5, %xmm2
  311. mulsd %xmm1, %xmm5
  312. addq $ 8 * SIZE, BO
  313. addsd %xmm4, %xmm10
  314. movsd 1 * SIZE(AO), %xmm4
  315. mulsd %xmm3, %xmm2
  316. decq %rax
  317. addsd %xmm6, %xmm11
  318. movaps %xmm7, %xmm6
  319. mulsd %xmm1, %xmm7
  320. movsd 0 * SIZE(BO), %xmm1
  321. addsd %xmm5, %xmm12
  322. movsd 2 * SIZE(AO), %xmm5
  323. mulsd %xmm3, %xmm6
  324. movsd 1 * SIZE(BO), %xmm3
  325. jne .L12
  326. ALIGN_4
  327. .L15:
  328. #if defined(LT) || defined(RN)
  329. movq KK, %rax
  330. #else
  331. movq K, %rax
  332. subq KK, %rax
  333. #endif
  334. andq $3, %rax
  335. BRANCH
  336. je .L19
  337. ALIGN_4
  338. .L16:
  339. addsd %xmm2, %xmm13
  340. movaps %xmm0, %xmm2
  341. mulsd %xmm1, %xmm0
  342. addsd %xmm7, %xmm14
  343. movsd 3 * SIZE(AO), %xmm7
  344. mulsd %xmm3, %xmm2
  345. addsd %xmm6, %xmm15
  346. movaps %xmm4, %xmm6
  347. mulsd %xmm1, %xmm4
  348. addsd %xmm0, %xmm8
  349. movsd 4 * SIZE(AO), %xmm0
  350. mulsd %xmm3, %xmm6
  351. addsd %xmm2, %xmm9
  352. movaps %xmm5, %xmm2
  353. mulsd %xmm1, %xmm5
  354. addsd %xmm4, %xmm10
  355. movsd 5 * SIZE(AO), %xmm4
  356. mulsd %xmm3, %xmm2
  357. addsd %xmm6, %xmm11
  358. movaps %xmm7, %xmm6
  359. mulsd %xmm1, %xmm7
  360. movsd 2 * SIZE(BO), %xmm1
  361. addsd %xmm5, %xmm12
  362. movsd 6 * SIZE(AO), %xmm5
  363. mulsd %xmm3, %xmm6
  364. movsd 3 * SIZE(BO), %xmm3
  365. addq $4 * SIZE, AO
  366. addq $2 * SIZE, BO
  367. decq %rax
  368. BRANCH
  369. jg .L16
  370. ALIGN_4
  371. .L19:
  372. addsd %xmm2, %xmm13
  373. addsd %xmm7, %xmm14
  374. addsd %xmm6, %xmm15
  375. #if defined(LN) || defined(RT)
  376. movq KK, %rax
  377. #ifdef LN
  378. subq $4, %rax
  379. #else
  380. subq $2, %rax
  381. #endif
  382. leaq (, %rax, SIZE), %rax
  383. movq AORIG, AO
  384. leaq (AO, %rax, 4), AO
  385. leaq (B, %rax, 2), BO
  386. #endif
  387. #if defined(LN) || defined(LT)
  388. movsd 0 * SIZE(BO), %xmm0
  389. movsd 1 * SIZE(BO), %xmm1
  390. movsd 2 * SIZE(BO), %xmm2
  391. movsd 3 * SIZE(BO), %xmm3
  392. movsd 4 * SIZE(BO), %xmm4
  393. movsd 5 * SIZE(BO), %xmm5
  394. movsd 6 * SIZE(BO), %xmm6
  395. movsd 7 * SIZE(BO), %xmm7
  396. subsd %xmm8, %xmm0
  397. subsd %xmm9, %xmm1
  398. subsd %xmm10, %xmm2
  399. subsd %xmm11, %xmm3
  400. subsd %xmm12, %xmm4
  401. subsd %xmm13, %xmm5
  402. subsd %xmm14, %xmm6
  403. subsd %xmm15, %xmm7
  404. #else
  405. movsd 0 * SIZE(AO), %xmm0
  406. movsd 1 * SIZE(AO), %xmm2
  407. movsd 2 * SIZE(AO), %xmm4
  408. movsd 3 * SIZE(AO), %xmm6
  409. movsd 4 * SIZE(AO), %xmm1
  410. movsd 5 * SIZE(AO), %xmm3
  411. movsd 6 * SIZE(AO), %xmm5
  412. movsd 7 * SIZE(AO), %xmm7
  413. subsd %xmm8, %xmm0
  414. subsd %xmm10, %xmm2
  415. subsd %xmm12, %xmm4
  416. subsd %xmm14, %xmm6
  417. subsd %xmm9, %xmm1
  418. subsd %xmm11, %xmm3
  419. subsd %xmm13, %xmm5
  420. subsd %xmm15, %xmm7
  421. #endif
  422. #ifdef LN
  423. movsd 15 * SIZE(AO), %xmm8
  424. mulsd %xmm8, %xmm6
  425. movsd 14 * SIZE(AO), %xmm9
  426. mulsd %xmm8, %xmm7
  427. movsd 13 * SIZE(AO), %xmm11
  428. movaps %xmm9, %xmm10
  429. movsd 12 * SIZE(AO), %xmm13
  430. mulsd %xmm6, %xmm9
  431. movsd 10 * SIZE(AO), %xmm8
  432. mulsd %xmm7, %xmm10
  433. subsd %xmm9, %xmm4
  434. movsd 9 * SIZE(AO), %xmm9
  435. subsd %xmm10, %xmm5
  436. movaps %xmm11, %xmm12
  437. mulsd %xmm6, %xmm11
  438. mulsd %xmm7, %xmm12
  439. subsd %xmm11, %xmm2
  440. movsd 8 * SIZE(AO), %xmm11
  441. subsd %xmm12, %xmm3
  442. movaps %xmm13, %xmm14
  443. mulsd %xmm6, %xmm13
  444. mulsd %xmm7, %xmm14
  445. subsd %xmm13, %xmm0
  446. subsd %xmm14, %xmm1
  447. mulsd %xmm8, %xmm4
  448. mulsd %xmm8, %xmm5
  449. movsd 5 * SIZE(AO), %xmm8
  450. movaps %xmm9, %xmm10
  451. mulsd %xmm4, %xmm9
  452. mulsd %xmm5, %xmm10
  453. subsd %xmm9, %xmm2
  454. movsd 4 * SIZE(AO), %xmm9
  455. subsd %xmm10, %xmm3
  456. movaps %xmm11, %xmm12
  457. mulsd %xmm4, %xmm11
  458. mulsd %xmm5, %xmm12
  459. subsd %xmm11, %xmm0
  460. movsd 0 * SIZE(AO), %xmm11
  461. subsd %xmm12, %xmm1
  462. mulsd %xmm8, %xmm2
  463. mulsd %xmm8, %xmm3
  464. movaps %xmm9, %xmm10
  465. mulsd %xmm2, %xmm9
  466. mulsd %xmm3, %xmm10
  467. subsd %xmm9, %xmm0
  468. subsd %xmm10, %xmm1
  469. mulsd %xmm11, %xmm0
  470. mulsd %xmm11, %xmm1
  471. #endif
  472. #ifdef LT
  473. movsd 0 * SIZE(AO), %xmm8
  474. mulsd %xmm8, %xmm0
  475. movsd 1 * SIZE(AO), %xmm9
  476. mulsd %xmm8, %xmm1
  477. movsd 2 * SIZE(AO), %xmm11
  478. movaps %xmm9, %xmm10
  479. movsd 3 * SIZE(AO), %xmm13
  480. mulsd %xmm0, %xmm9
  481. movsd 5 * SIZE(AO), %xmm8
  482. mulsd %xmm1, %xmm10
  483. subsd %xmm9, %xmm2
  484. movsd 6 * SIZE(AO), %xmm9
  485. subsd %xmm10, %xmm3
  486. movaps %xmm11, %xmm12
  487. mulsd %xmm0, %xmm11
  488. mulsd %xmm1, %xmm12
  489. subsd %xmm11, %xmm4
  490. movsd 7 * SIZE(AO), %xmm11
  491. subsd %xmm12, %xmm5
  492. movaps %xmm13, %xmm14
  493. mulsd %xmm0, %xmm13
  494. mulsd %xmm1, %xmm14
  495. subsd %xmm13, %xmm6
  496. subsd %xmm14, %xmm7
  497. mulsd %xmm8, %xmm2
  498. mulsd %xmm8, %xmm3
  499. movsd 10 * SIZE(AO), %xmm8
  500. movaps %xmm9, %xmm10
  501. mulsd %xmm2, %xmm9
  502. mulsd %xmm3, %xmm10
  503. subsd %xmm9, %xmm4
  504. movsd 11 * SIZE(AO), %xmm9
  505. subsd %xmm10, %xmm5
  506. movaps %xmm11, %xmm12
  507. mulsd %xmm2, %xmm11
  508. mulsd %xmm3, %xmm12
  509. subsd %xmm11, %xmm6
  510. subsd %xmm12, %xmm7
  511. mulsd %xmm8, %xmm4
  512. mulsd %xmm8, %xmm5
  513. movsd 15 * SIZE(AO), %xmm8
  514. movaps %xmm9, %xmm10
  515. mulsd %xmm4, %xmm9
  516. mulsd %xmm5, %xmm10
  517. subsd %xmm9, %xmm6
  518. subsd %xmm10, %xmm7
  519. mulsd %xmm8, %xmm6
  520. mulsd %xmm8, %xmm7
  521. #endif
  522. #ifdef RN
  523. movsd 0 * SIZE(BO), %xmm8
  524. mulsd %xmm8, %xmm0
  525. movsd 1 * SIZE(BO), %xmm9
  526. mulsd %xmm8, %xmm2
  527. movsd 3 * SIZE(BO), %xmm13
  528. mulsd %xmm8, %xmm4
  529. mulsd %xmm8, %xmm6
  530. movaps %xmm9, %xmm10
  531. movaps %xmm9, %xmm11
  532. movaps %xmm9, %xmm12
  533. mulsd %xmm0, %xmm9
  534. mulsd %xmm2, %xmm10
  535. mulsd %xmm4, %xmm11
  536. mulsd %xmm6, %xmm12
  537. subsd %xmm9, %xmm1
  538. subsd %xmm10, %xmm3
  539. subsd %xmm11, %xmm5
  540. subsd %xmm12, %xmm7
  541. mulsd %xmm13, %xmm1
  542. mulsd %xmm13, %xmm3
  543. mulsd %xmm13, %xmm5
  544. mulsd %xmm13, %xmm7
  545. #endif
  546. #ifdef RT
  547. movsd 3 * SIZE(BO), %xmm8
  548. mulsd %xmm8, %xmm1
  549. movsd 2 * SIZE(BO), %xmm9
  550. mulsd %xmm8, %xmm3
  551. movsd 0 * SIZE(BO), %xmm13
  552. mulsd %xmm8, %xmm5
  553. mulsd %xmm8, %xmm7
  554. movaps %xmm9, %xmm10
  555. movaps %xmm9, %xmm11
  556. movaps %xmm9, %xmm12
  557. mulsd %xmm1, %xmm9
  558. mulsd %xmm3, %xmm10
  559. mulsd %xmm5, %xmm11
  560. mulsd %xmm7, %xmm12
  561. subsd %xmm9, %xmm0
  562. subsd %xmm10, %xmm2
  563. subsd %xmm11, %xmm4
  564. subsd %xmm12, %xmm6
  565. mulsd %xmm13, %xmm0
  566. mulsd %xmm13, %xmm2
  567. mulsd %xmm13, %xmm4
  568. mulsd %xmm13, %xmm6
  569. #endif
  570. #ifdef LN
  571. subq $4 * SIZE, CO1
  572. subq $4 * SIZE, CO2
  573. #endif
  574. movsd %xmm0, 0 * SIZE(CO1)
  575. movsd %xmm2, 1 * SIZE(CO1)
  576. movsd %xmm4, 2 * SIZE(CO1)
  577. movsd %xmm6, 3 * SIZE(CO1)
  578. movsd %xmm1, 0 * SIZE(CO2)
  579. movsd %xmm3, 1 * SIZE(CO2)
  580. movsd %xmm5, 2 * SIZE(CO2)
  581. movsd %xmm7, 3 * SIZE(CO2)
  582. #if defined(LN) || defined(LT)
  583. movsd %xmm0, 0 * SIZE(BO)
  584. movsd %xmm1, 1 * SIZE(BO)
  585. movsd %xmm2, 2 * SIZE(BO)
  586. movsd %xmm3, 3 * SIZE(BO)
  587. movsd %xmm4, 4 * SIZE(BO)
  588. movsd %xmm5, 5 * SIZE(BO)
  589. movsd %xmm6, 6 * SIZE(BO)
  590. movsd %xmm7, 7 * SIZE(BO)
  591. #else
  592. movsd %xmm0, 0 * SIZE(AO)
  593. movsd %xmm2, 1 * SIZE(AO)
  594. movsd %xmm4, 2 * SIZE(AO)
  595. movsd %xmm6, 3 * SIZE(AO)
  596. movsd %xmm1, 4 * SIZE(AO)
  597. movsd %xmm3, 5 * SIZE(AO)
  598. movsd %xmm5, 6 * SIZE(AO)
  599. movsd %xmm7, 7 * SIZE(AO)
  600. #endif
  601. #ifndef LN
  602. addq $4 * SIZE, CO1
  603. addq $4 * SIZE, CO2
  604. #endif
  605. #if defined(LT) || defined(RN)
  606. movq K, %rax
  607. subq KK, %rax
  608. leaq (,%rax, SIZE), %rax
  609. leaq (AO, %rax, 4), AO
  610. leaq (BO, %rax, 2), BO
  611. #endif
  612. #ifdef LN
  613. subq $4, KK
  614. #endif
  615. #ifdef LT
  616. addq $4, KK
  617. #endif
  618. #ifdef RT
  619. movq K, %rax
  620. salq $2 + BASE_SHIFT, %rax
  621. addq %rax, AORIG
  622. #endif
  623. decq I # i --
  624. jg .L11
  625. ALIGN_4
  626. .L20:
  627. testq $2, M
  628. BRANCH
  629. je .L30
  630. #ifdef LN
  631. movq K, %rax
  632. salq $1 + BASE_SHIFT, %rax
  633. subq %rax, AORIG
  634. #endif
  635. #if defined(LN) || defined(RT)
  636. movq KK, %rax
  637. leaq (, %rax, SIZE), %rax
  638. movq AORIG, AO
  639. leaq (AO, %rax, 2), AO
  640. leaq (B, %rax, 2), BO
  641. #else
  642. movq B, BO
  643. #endif
  644. movsd 0 * SIZE(AO), %xmm0
  645. xorps %xmm2, %xmm2
  646. movsd 1 * SIZE(AO), %xmm4
  647. xorps %xmm5, %xmm5
  648. movsd 2 * SIZE(AO), %xmm5
  649. xorps %xmm6, %xmm6
  650. movsd 3 * SIZE(AO), %xmm7
  651. movsd 0 * SIZE(BO), %xmm1
  652. xorps %xmm8, %xmm8
  653. xorps %xmm9, %xmm9
  654. movsd 1 * SIZE(BO), %xmm3
  655. xorps %xmm10, %xmm10
  656. xorps %xmm11, %xmm11
  657. #if defined(LT) || defined(RN)
  658. movq KK, %rax
  659. #else
  660. movq K, %rax
  661. subq KK, %rax
  662. #endif
  663. sarq $2, %rax
  664. je .L25
  665. ALIGN_4
  666. .L22:
  667. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  668. addsd %xmm2, %xmm9
  669. movaps %xmm0, %xmm2
  670. mulsd %xmm1, %xmm0
  671. addsd %xmm6, %xmm11
  672. movaps %xmm4, %xmm6
  673. mulsd %xmm1, %xmm4
  674. movsd 2 * SIZE(BO), %xmm1
  675. addsd %xmm0, %xmm8
  676. movsd 4 * SIZE(AO), %xmm0
  677. mulsd %xmm3, %xmm2
  678. addsd %xmm4, %xmm10
  679. movsd 5 * SIZE(AO), %xmm4
  680. mulsd %xmm3, %xmm6
  681. movsd 3 * SIZE(BO), %xmm3
  682. addsd %xmm2, %xmm9
  683. movaps %xmm5, %xmm2
  684. mulsd %xmm1, %xmm5
  685. addsd %xmm6, %xmm11
  686. movaps %xmm7, %xmm6
  687. mulsd %xmm1, %xmm7
  688. movsd 4 * SIZE(BO), %xmm1
  689. addsd %xmm5, %xmm8
  690. movsd 6 * SIZE(AO), %xmm5
  691. mulsd %xmm3, %xmm2
  692. addsd %xmm7, %xmm10
  693. movsd 7 * SIZE(AO), %xmm7
  694. mulsd %xmm3, %xmm6
  695. movsd 5 * SIZE(BO), %xmm3
  696. addsd %xmm2, %xmm9
  697. movaps %xmm0, %xmm2
  698. mulsd %xmm1, %xmm0
  699. addsd %xmm6, %xmm11
  700. movaps %xmm4, %xmm6
  701. mulsd %xmm1, %xmm4
  702. movsd 6 * SIZE(BO), %xmm1
  703. addsd %xmm0, %xmm8
  704. movsd 8 * SIZE(AO), %xmm0
  705. mulsd %xmm3, %xmm2
  706. addsd %xmm4, %xmm10
  707. movsd 9 * SIZE(AO), %xmm4
  708. mulsd %xmm3, %xmm6
  709. movsd 7 * SIZE(BO), %xmm3
  710. addsd %xmm2, %xmm9
  711. movaps %xmm5, %xmm2
  712. mulsd %xmm1, %xmm5
  713. addsd %xmm6, %xmm11
  714. movaps %xmm7, %xmm6
  715. mulsd %xmm1, %xmm7
  716. movsd 8 * SIZE(BO), %xmm1
  717. addsd %xmm5, %xmm8
  718. movsd 10 * SIZE(AO), %xmm5
  719. mulsd %xmm3, %xmm2
  720. addsd %xmm7, %xmm10
  721. movsd 11 * SIZE(AO), %xmm7
  722. mulsd %xmm3, %xmm6
  723. movsd 9 * SIZE(BO), %xmm3
  724. addq $8 * SIZE, AO
  725. addq $8 * SIZE, BO
  726. decq %rax
  727. jne .L22
  728. ALIGN_4
  729. .L25:
  730. #if defined(LT) || defined(RN)
  731. movq KK, %rax
  732. #else
  733. movq K, %rax
  734. subq KK, %rax
  735. #endif
  736. andq $3, %rax
  737. BRANCH
  738. je .L29
  739. ALIGN_4
  740. .L26:
  741. addsd %xmm2, %xmm9
  742. movaps %xmm0, %xmm2
  743. mulsd %xmm1, %xmm0
  744. addsd %xmm6, %xmm11
  745. movaps %xmm4, %xmm6
  746. mulsd %xmm1, %xmm4
  747. movsd 2 * SIZE(BO), %xmm1
  748. mulsd %xmm3, %xmm2
  749. addsd %xmm0, %xmm8
  750. movsd 2 * SIZE(AO), %xmm0
  751. mulsd %xmm3, %xmm6
  752. movsd 3 * SIZE(BO), %xmm3
  753. addsd %xmm4, %xmm10
  754. movsd 3 * SIZE(AO), %xmm4
  755. addq $2 * SIZE, AO
  756. addq $2 * SIZE, BO
  757. decq %rax
  758. BRANCH
  759. jg .L26
  760. ALIGN_4
  761. .L29:
  762. addsd %xmm2, %xmm9
  763. addsd %xmm6, %xmm11
  764. #if defined(LN) || defined(RT)
  765. movq KK, %rax
  766. #ifdef LN
  767. subq $2, %rax
  768. #else
  769. subq $2, %rax
  770. #endif
  771. leaq (, %rax, SIZE), %rax
  772. movq AORIG, AO
  773. leaq (AO, %rax, 2), AO
  774. leaq (B, %rax, 2), BO
  775. #endif
  776. #if defined(LN) || defined(LT)
  777. movsd 0 * SIZE(BO), %xmm0
  778. movsd 1 * SIZE(BO), %xmm1
  779. movsd 2 * SIZE(BO), %xmm2
  780. movsd 3 * SIZE(BO), %xmm3
  781. subsd %xmm8, %xmm0
  782. subsd %xmm9, %xmm1
  783. subsd %xmm10, %xmm2
  784. subsd %xmm11, %xmm3
  785. #else
  786. movsd 0 * SIZE(AO), %xmm0
  787. movsd 1 * SIZE(AO), %xmm2
  788. movsd 2 * SIZE(AO), %xmm1
  789. movsd 3 * SIZE(AO), %xmm3
  790. subsd %xmm8, %xmm0
  791. subsd %xmm10, %xmm2
  792. subsd %xmm9, %xmm1
  793. subsd %xmm11, %xmm3
  794. #endif
  795. #ifdef LN
  796. movsd 3 * SIZE(AO), %xmm8
  797. mulsd %xmm8, %xmm2
  798. movsd 2 * SIZE(AO), %xmm9
  799. mulsd %xmm8, %xmm3
  800. movsd 0 * SIZE(AO), %xmm13
  801. movaps %xmm9, %xmm10
  802. mulsd %xmm2, %xmm9
  803. mulsd %xmm3, %xmm10
  804. subsd %xmm9, %xmm0
  805. subsd %xmm10, %xmm1
  806. mulsd %xmm13, %xmm0
  807. mulsd %xmm13, %xmm1
  808. #endif
  809. #ifdef LT
  810. movsd 0 * SIZE(AO), %xmm8
  811. mulsd %xmm8, %xmm0
  812. movsd 1 * SIZE(AO), %xmm9
  813. mulsd %xmm8, %xmm1
  814. movsd 3 * SIZE(AO), %xmm13
  815. movaps %xmm9, %xmm10
  816. mulsd %xmm0, %xmm9
  817. mulsd %xmm1, %xmm10
  818. subsd %xmm9, %xmm2
  819. subsd %xmm10, %xmm3
  820. mulsd %xmm13, %xmm2
  821. mulsd %xmm13, %xmm3
  822. #endif
  823. #ifdef RN
  824. movsd 0 * SIZE(BO), %xmm8
  825. mulsd %xmm8, %xmm0
  826. movsd 1 * SIZE(BO), %xmm9
  827. mulsd %xmm8, %xmm2
  828. movsd 3 * SIZE(BO), %xmm13
  829. movaps %xmm9, %xmm10
  830. mulsd %xmm0, %xmm9
  831. mulsd %xmm2, %xmm10
  832. subsd %xmm9, %xmm1
  833. subsd %xmm10, %xmm3
  834. mulsd %xmm13, %xmm1
  835. mulsd %xmm13, %xmm3
  836. #endif
  837. #ifdef RT
  838. movsd 3 * SIZE(BO), %xmm8
  839. mulsd %xmm8, %xmm1
  840. movsd 2 * SIZE(BO), %xmm9
  841. mulsd %xmm8, %xmm3
  842. movsd 0 * SIZE(BO), %xmm13
  843. movaps %xmm9, %xmm10
  844. mulsd %xmm1, %xmm9
  845. mulsd %xmm3, %xmm10
  846. subsd %xmm9, %xmm0
  847. subsd %xmm10, %xmm2
  848. mulsd %xmm13, %xmm0
  849. mulsd %xmm13, %xmm2
  850. #endif
  851. #ifdef LN
  852. subq $2 * SIZE, CO1
  853. subq $2 * SIZE, CO2
  854. #endif
  855. movsd %xmm0, 0 * SIZE(CO1)
  856. movsd %xmm2, 1 * SIZE(CO1)
  857. movsd %xmm1, 0 * SIZE(CO2)
  858. movsd %xmm3, 1 * SIZE(CO2)
  859. #if defined(LN) || defined(LT)
  860. movsd %xmm0, 0 * SIZE(BO)
  861. movsd %xmm1, 1 * SIZE(BO)
  862. movsd %xmm2, 2 * SIZE(BO)
  863. movsd %xmm3, 3 * SIZE(BO)
  864. #else
  865. movsd %xmm0, 0 * SIZE(AO)
  866. movsd %xmm2, 1 * SIZE(AO)
  867. movsd %xmm1, 2 * SIZE(AO)
  868. movsd %xmm3, 3 * SIZE(AO)
  869. #endif
  870. #ifndef LN
  871. addq $2 * SIZE, CO1
  872. addq $2 * SIZE, CO2
  873. #endif
  874. #if defined(LT) || defined(RN)
  875. movq K, %rax
  876. subq KK, %rax
  877. leaq (,%rax, SIZE), %rax
  878. leaq (AO, %rax, 2), AO
  879. leaq (BO, %rax, 2), BO
  880. #endif
  881. #ifdef LN
  882. subq $2, KK
  883. #endif
  884. #ifdef LT
  885. addq $2, KK
  886. #endif
  887. #ifdef RT
  888. movq K, %rax
  889. salq $1 + BASE_SHIFT, %rax
  890. addq %rax, AORIG
  891. #endif
  892. ALIGN_4
/* .L30: 1x2 edge tile — handles the last row when M is odd, still in
   the two-column (N-loop) pass.  Accumulates the two dot products
   row(A) . cols(B) into xmm8/xmm9 with a 4x-unrolled k-loop, then does
   the scalar triangular solve and store. */
893. .L30:
894. testq $1, M
895. je .L39
/* LN: rewind AORIG by this row's K doubles (1 << BASE_SHIFT each). */
896. #ifdef LN
897. movq K, %rax
898. salq $0 + BASE_SHIFT, %rax
899. subq %rax, AORIG
900. #endif
/* Position AO/BO: for LN/RT skip the first KK elements of the packed
   panels (1 per step in A, 2 per step in B); otherwise start at B. */
901. #if defined(LN) || defined(RT)
902. movq KK, %rax
903. leaq (, %rax, SIZE), %rax
904. movq AORIG, AO
905. leaq (AO, %rax, 1), AO
906. leaq (B, %rax, 2), BO
907. #else
908. movq B, BO
909. #endif
/* Preload first operands; xor-zero the two accumulator pairs
   (xmm8/xmm5 and xmm9/xmm7 are software-pipelined partial sums). */
910. movsd 0 * SIZE(AO), %xmm0
911. xorps %xmm7, %xmm7
912. movsd 1 * SIZE(AO), %xmm2
913. xorps %xmm5, %xmm5
914. movsd 0 * SIZE(BO), %xmm1
915. xorps %xmm8, %xmm8
916. xorps %xmm9, %xmm9
917. movsd 1 * SIZE(BO), %xmm3
/* Trip count: KK iterations (LT/RN) or K-KK (LN/RT). */
918. #if defined(LT) || defined(RN)
919. movq KK, %rax
920. #else
921. movq K, %rax
922. subq KK, %rax
923. #endif
924. sarq $2, %rax
925. je .L35
926. ALIGN_4
/* Main k-loop, unrolled 4x: each iteration consumes 4 A elements and
   8 B elements, interleaving loads/muls/adds to hide scalar latency. */
927. .L32:
928. addsd %xmm5, %xmm8
929. movsd 2 * SIZE(BO), %xmm5
930. mulsd %xmm0, %xmm1
931. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
932. addsd %xmm7, %xmm9
933. movsd 3 * SIZE(BO), %xmm7
934. mulsd %xmm0, %xmm3
935. movsd 2 * SIZE(AO), %xmm0
936. addsd %xmm1, %xmm8
937. movsd 4 * SIZE(BO), %xmm1
938. mulsd %xmm2, %xmm5
939. addsd %xmm3, %xmm9
940. movsd 5 * SIZE(BO), %xmm3
941. mulsd %xmm2, %xmm7
942. movsd 3 * SIZE(AO), %xmm2
943. addsd %xmm5, %xmm8
944. movsd 6 * SIZE(BO), %xmm5
945. mulsd %xmm0, %xmm1
946. addsd %xmm7, %xmm9
947. movsd 7 * SIZE(BO), %xmm7
948. mulsd %xmm0, %xmm3
949. movsd 4 * SIZE(AO), %xmm0
950. addsd %xmm1, %xmm8
951. movsd 8 * SIZE(BO), %xmm1
952. mulsd %xmm2, %xmm5
953. addsd %xmm3, %xmm9
954. movsd 9 * SIZE(BO), %xmm3
955. mulsd %xmm2, %xmm7
956. movsd 5 * SIZE(AO), %xmm2
957. addq $4 * SIZE, AO
958. addq $8 * SIZE, BO
959. decq %rax
960. jne .L32
961. ALIGN_4
/* Fold the pipelined partial sums, then handle K%4 leftover steps. */
962. .L35:
963. #if defined(LT) || defined(RN)
964. movq KK, %rax
965. #else
966. movq K, %rax
967. subq KK, %rax
968. #endif
969. addsd %xmm5, %xmm8
970. addsd %xmm7, %xmm9
971. andq $3, %rax
972. BRANCH
973. BRANCH
974. je .L38
975. ALIGN_4
976. .L36:
977. mulsd %xmm0, %xmm1
978. addq $2 * SIZE, BO
979. mulsd %xmm0, %xmm3
980. movsd 1 * SIZE(AO), %xmm0
981. addsd %xmm1, %xmm8
982. movsd 0 * SIZE(BO), %xmm1
983. addsd %xmm3, %xmm9
984. movsd 1 * SIZE(BO), %xmm3
985. addq $1 * SIZE, AO
986. decq %rax
987. BRANCH
988. jg .L36
989. ALIGN_4
/* Solve phase: re-point AO/BO at this tile's packed data (LN/RT),
   load the packed right-hand sides and subtract the accumulators. */
990. .L38:
991. #if defined(LN) || defined(RT)
992. movq KK, %rax
993. #ifdef LN
994. subq $1, %rax
995. #else
996. subq $2, %rax
997. #endif
998. leaq (, %rax, SIZE), %rax
999. movq AORIG, AO
1000. leaq (AO, %rax, 1), AO
1001. leaq (B, %rax, 2), BO
1002. #endif
1003. #if defined(LN) || defined(LT)
1004. movsd 0 * SIZE(BO), %xmm0
1005. movsd 1 * SIZE(BO), %xmm1
1006. subsd %xmm8, %xmm0
1007. subsd %xmm9, %xmm1
1008. #else
1009. movsd 0 * SIZE(AO), %xmm0
1010. movsd 1 * SIZE(AO), %xmm1
1011. subsd %xmm8, %xmm0
1012. subsd %xmm9, %xmm1
1013. #endif
/* 1x1 left solve: just scale by the (pre-inverted — see note above)
   diagonal of A. */
1014. #if defined(LN) || defined(LT)
1015. movsd 0 * SIZE(AO), %xmm8
1016. mulsd %xmm8, %xmm0
1017. mulsd %xmm8, %xmm1
1018. #endif
/* 2x2 right solve, forward (RN) or backward (RT) substitution. */
1019. #ifdef RN
1020. movsd 0 * SIZE(BO), %xmm8
1021. mulsd %xmm8, %xmm0
1022. movsd 1 * SIZE(BO), %xmm9
1023. mulsd %xmm0, %xmm9
1024. movsd 3 * SIZE(BO), %xmm13
1025. subsd %xmm9, %xmm1
1026. mulsd %xmm13, %xmm1
1027. #endif
1028. #ifdef RT
1029. movsd 3 * SIZE(BO), %xmm8
1030. mulsd %xmm8, %xmm1
1031. movsd 2 * SIZE(BO), %xmm9
1032. mulsd %xmm1, %xmm9
1033. movsd 0 * SIZE(BO), %xmm13
1034. subsd %xmm9, %xmm0
1035. mulsd %xmm13, %xmm0
1036. #endif
/* Store one element per C column, write back to the packed buffer,
   advance pointers and the KK offset exactly as in the 2x2 tail. */
1037. #ifdef LN
1038. subq $1 * SIZE, CO1
1039. subq $1 * SIZE, CO2
1040. #endif
1041. movsd %xmm0, 0 * SIZE(CO1)
1042. movsd %xmm1, 0 * SIZE(CO2)
1043. #if defined(LN) || defined(LT)
1044. movsd %xmm0, 0 * SIZE(BO)
1045. movsd %xmm1, 1 * SIZE(BO)
1046. #else
1047. movsd %xmm0, 0 * SIZE(AO)
1048. movsd %xmm1, 1 * SIZE(AO)
1049. #endif
1050. #ifndef LN
1051. addq $1 * SIZE, CO1
1052. addq $1 * SIZE, CO2
1053. #endif
1054. #if defined(LT) || defined(RN)
1055. movq K, %rax
1056. subq KK, %rax
1057. leaq (,%rax, SIZE), %rax
1058. leaq (AO, %rax, 1), AO
1059. leaq (BO, %rax, 2), BO
1060. #endif
1061. #ifdef LN
1062. subq $1, KK
1063. #endif
1064. #ifdef LT
1065. addq $1, KK
1066. #endif
1067. #ifdef RT
1068. movq K, %rax
1069. salq $0 + BASE_SHIFT, %rax
1070. addq %rax, AORIG
1071. #endif
1072. ALIGN_4
/* .L39: end of one iteration of the two-column N-loop.  Advance the
   packed-B pointer (2*K doubles for LN, or simply the final BO for
   LT/RN), adjust the column offset KK for the right-side variants,
   and loop back to .L10 while columns remain (J > 0). */
1073. .L39:
1074. #ifdef LN
1075. leaq (, K, SIZE), %rax
1076. leaq (B, %rax, 2), B
1077. #endif
1078. #if defined(LT) || defined(RN)
1079. movq BO, B
1080. #endif
1081. #ifdef RN
1082. addq $2, KK
1083. #endif
1084. #ifdef RT
1085. subq $2, KK
1086. #endif
1087. decq J # j --
1088. jg .L10
1089. ALIGN_4
/* .L40: entry of the single-column remainder pass (runs when N is
   odd).  Mirrors the two-column setup: reset the A-panel pointer,
   position B and C (RT walks C backwards by LDC), re-seed the KK
   offset from OFFSET (+M for LN), and start the M/4 tile loop. */
1090. .L40:
1091. testq $1, N
1092. je .L999
1093. ALIGN_4
1094. #if defined(LT) || defined(RN)
1095. movq A, AO
1096. #else
1097. movq A, AORIG
1098. #endif
1099. #ifdef RT
1100. movq K, %rax
1101. salq $0 + BASE_SHIFT, %rax
1102. subq %rax, B
1103. subq LDC, C
1104. #endif
1105. movq C, CO1
1106. #ifndef RT
1107. addq LDC, C
1108. #endif
1109. #ifdef LN
1110. movq OFFSET, %rax
1111. addq M, %rax
1112. movq %rax, KK
1113. #endif
1114. #ifdef LT
1115. movq OFFSET, %rax
1116. movq %rax, KK
1117. #endif
1118. movq M, I
1119. sarq $2, I
1120. jle .L50
1121. ALIGN_4
/* .L41: 4x1 tile of the single-column pass.  Accumulates four dot
   products (4 rows of A x 1 column of B) into xmm8/xmm10/xmm12/xmm14,
   with xmm9/xmm11/xmm13/xmm15 as the software-pipelined partners,
   then performs the 4x4 triangular solve (LN/LT) or a simple diagonal
   scale (RN/RT) and stores 4 doubles to CO1. */
1122. .L41:
/* LN: rewind AORIG by this tile's 4*K doubles. */
1123. #ifdef LN
1124. movq K, %rax
1125. salq $2 + BASE_SHIFT, %rax
1126. subq %rax, AORIG
1127. #endif
/* Skip the first KK panel steps (4 doubles/step in A, 1 in B). */
1128. #if defined(LN) || defined(RT)
1129. movq KK, %rax
1130. leaq (, %rax, SIZE), %rax
1131. movq AORIG, AO
1132. leaq (AO, %rax, 4), AO
1133. leaq (B, %rax, 1), BO
1134. #else
1135. movq B, BO
1136. #endif
/* Preload the first column of A and first B element; zero all eight
   accumulator registers; prefetch the C destination. */
1137. movsd 0 * SIZE(AO), %xmm0
1138. xorps %xmm9, %xmm9
1139. movsd 1 * SIZE(AO), %xmm1
1140. xorps %xmm11, %xmm11
1141. movsd 2 * SIZE(AO), %xmm2
1142. xorps %xmm13, %xmm13
1143. movsd 3 * SIZE(AO), %xmm3
1144. xorps %xmm15, %xmm15
1145. movsd 0 * SIZE(BO), %xmm4
1146. xorps %xmm8, %xmm8
1147. movsd 1 * SIZE(BO), %xmm5
1148. xorps %xmm10, %xmm10
1149. prefetcht0 3 * SIZE(CO1)
1150. xorps %xmm12, %xmm12
1151. xorps %xmm14, %xmm14
1152. #if defined(LT) || defined(RN)
1153. movq KK, %rax
1154. #else
1155. movq K, %rax
1156. subq KK, %rax
1157. #endif
1158. sarq $2, %rax
1159. je .L45
1160. ALIGN_4
/* Main k-loop, unrolled 4x (16 A elements, 4 B elements per pass);
   alternates between the two accumulator banks every k-step. */
1161. .L42:
1162. addsd %xmm9, %xmm8
1163. movsd 4 * SIZE(AO), %xmm9
1164. mulsd %xmm4, %xmm0
1165. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
1166. addsd %xmm11, %xmm10
1167. movsd 5 * SIZE(AO), %xmm11
1168. mulsd %xmm4, %xmm1
1169. addsd %xmm13, %xmm12
1170. movsd 6 * SIZE(AO), %xmm13
1171. mulsd %xmm4, %xmm2
1172. addsd %xmm15, %xmm14
1173. movsd 7 * SIZE(AO), %xmm15
1174. mulsd %xmm4, %xmm3
1175. movsd 2 * SIZE(BO), %xmm4
1176. addsd %xmm0, %xmm8
1177. movsd 8 * SIZE(AO), %xmm0
1178. mulsd %xmm5, %xmm9
1179. addsd %xmm1, %xmm10
1180. movsd 9 * SIZE(AO), %xmm1
1181. mulsd %xmm5, %xmm11
1182. addsd %xmm2, %xmm12
1183. movsd 10 * SIZE(AO), %xmm2
1184. mulsd %xmm5, %xmm13
1185. addsd %xmm3, %xmm14
1186. movsd 11 * SIZE(AO), %xmm3
1187. mulsd %xmm5, %xmm15
1188. movsd 3 * SIZE(BO), %xmm5
1189. addsd %xmm9, %xmm8
1190. movsd 12 * SIZE(AO), %xmm9
1191. mulsd %xmm4, %xmm0
1192. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
1193. addsd %xmm11, %xmm10
1194. movsd 13 * SIZE(AO), %xmm11
1195. mulsd %xmm4, %xmm1
1196. addsd %xmm13, %xmm12
1197. movsd 14 * SIZE(AO), %xmm13
1198. mulsd %xmm4, %xmm2
1199. addsd %xmm15, %xmm14
1200. movsd 15 * SIZE(AO), %xmm15
1201. mulsd %xmm4, %xmm3
1202. movsd 4 * SIZE(BO), %xmm4
1203. subq $-16 * SIZE, AO
1204. addsd %xmm0, %xmm8
1205. movsd 0 * SIZE(AO), %xmm0
1206. mulsd %xmm5, %xmm9
1207. addsd %xmm1, %xmm10
1208. movsd 1 * SIZE(AO), %xmm1
1209. mulsd %xmm5, %xmm11
1210. addq $ 4 * SIZE, BO
1211. addsd %xmm2, %xmm12
1212. movsd 2 * SIZE(AO), %xmm2
1213. mulsd %xmm5, %xmm13
1214. decq %rax
1215. addsd %xmm3, %xmm14
1216. movsd 3 * SIZE(AO), %xmm3
1217. mulsd %xmm5, %xmm15
1218. movsd 1 * SIZE(BO), %xmm5
1219. jne .L42
1220. ALIGN_4
/* Fold the pipelined partials; run the K%4 remainder loop. */
1221. .L45:
1222. #if defined(LT) || defined(RN)
1223. movq KK, %rax
1224. #else
1225. movq K, %rax
1226. subq KK, %rax
1227. #endif
1228. addsd %xmm9, %xmm8
1229. addsd %xmm11, %xmm10
1230. addsd %xmm13, %xmm12
1231. addsd %xmm15, %xmm14
1232. andq $3, %rax
1233. BRANCH
1234. BRANCH
1235. je .L49
1236. ALIGN_4
1237. .L46:
1238. mulsd %xmm4, %xmm0
1239. mulsd %xmm4, %xmm1
1240. mulsd %xmm4, %xmm2
1241. mulsd %xmm4, %xmm3
1242. movsd 1 * SIZE(BO), %xmm4
1243. addsd %xmm0, %xmm8
1244. movsd 4 * SIZE(AO), %xmm0
1245. addsd %xmm1, %xmm10
1246. movsd 5 * SIZE(AO), %xmm1
1247. addsd %xmm2, %xmm12
1248. movsd 6 * SIZE(AO), %xmm2
1249. addsd %xmm3, %xmm14
1250. movsd 7 * SIZE(AO), %xmm3
1251. addq $4 * SIZE, AO
1252. addq $1 * SIZE, BO
1253. decq %rax
1254. BRANCH
1255. jg .L46
1256. ALIGN_4
/* Solve phase: re-point AO/BO at the tile's packed data (LN/RT) and
   form rhs = packed_value - accumulated_sum in xmm0/xmm2/xmm4/xmm6. */
1257. .L49:
1258. #if defined(LN) || defined(RT)
1259. movq KK, %rax
1260. #ifdef LN
1261. subq $4, %rax
1262. #else
1263. subq $1, %rax
1264. #endif
1265. leaq (, %rax, SIZE), %rax
1266. movq AORIG, AO
1267. leaq (AO, %rax, 4), AO
1268. leaq (B, %rax, 1), BO
1269. #endif
1270. #if defined(LN) || defined(LT)
1271. movsd 0 * SIZE(BO), %xmm0
1272. movsd 1 * SIZE(BO), %xmm2
1273. movsd 2 * SIZE(BO), %xmm4
1274. movsd 3 * SIZE(BO), %xmm6
1275. subsd %xmm8, %xmm0
1276. subsd %xmm10, %xmm2
1277. subsd %xmm12, %xmm4
1278. subsd %xmm14, %xmm6
1279. #else
1280. movsd 0 * SIZE(AO), %xmm0
1281. movsd 1 * SIZE(AO), %xmm2
1282. movsd 2 * SIZE(AO), %xmm4
1283. movsd 3 * SIZE(AO), %xmm6
1284. subsd %xmm8, %xmm0
1285. subsd %xmm10, %xmm2
1286. subsd %xmm12, %xmm4
1287. subsd %xmm14, %xmm6
1288. #endif
/* LN: backward substitution through the packed 4x4 lower-triangular
   block (row 3 first, offsets 15..0).  Diagonal entries are multiplied,
   not divided — NOTE(review): presumed pre-inverted by the packer. */
1289. #ifdef LN
1290. movsd 15 * SIZE(AO), %xmm8
1291. mulsd %xmm8, %xmm6
1292. movsd 14 * SIZE(AO), %xmm9
1293. mulsd %xmm6, %xmm9
1294. movsd 13 * SIZE(AO), %xmm11
1295. subsd %xmm9, %xmm4
1296. movsd 12 * SIZE(AO), %xmm13
1297. mulsd %xmm6, %xmm11
1298. movsd 10 * SIZE(AO), %xmm8
1299. subsd %xmm11, %xmm2
1300. movsd 9 * SIZE(AO), %xmm9
1301. mulsd %xmm6, %xmm13
1302. movsd 8 * SIZE(AO), %xmm11
1303. subsd %xmm13, %xmm0
1304. mulsd %xmm8, %xmm4
1305. movsd 5 * SIZE(AO), %xmm8
1306. mulsd %xmm4, %xmm9
1307. subsd %xmm9, %xmm2
1308. movsd 4 * SIZE(AO), %xmm9
1309. mulsd %xmm4, %xmm11
1310. subsd %xmm11, %xmm0
1311. movsd 0 * SIZE(AO), %xmm11
1312. mulsd %xmm8, %xmm2
1313. mulsd %xmm2, %xmm9
1314. subsd %xmm9, %xmm0
1315. mulsd %xmm11, %xmm0
1316. #endif
/* LT: forward substitution (row 0 first, offsets 0..15). */
1317. #ifdef LT
1318. movsd 0 * SIZE(AO), %xmm8
1319. mulsd %xmm8, %xmm0
1320. movsd 1 * SIZE(AO), %xmm9
1321. mulsd %xmm0, %xmm9
1322. movsd 2 * SIZE(AO), %xmm11
1323. subsd %xmm9, %xmm2
1324. movsd 3 * SIZE(AO), %xmm13
1325. mulsd %xmm0, %xmm11
1326. movsd 5 * SIZE(AO), %xmm8
1327. subsd %xmm11, %xmm4
1328. movsd 6 * SIZE(AO), %xmm9
1329. mulsd %xmm0, %xmm13
1330. movsd 7 * SIZE(AO), %xmm11
1331. subsd %xmm13, %xmm6
1332. mulsd %xmm8, %xmm2
1333. movsd 10 * SIZE(AO), %xmm8
1334. mulsd %xmm2, %xmm9
1335. subsd %xmm9, %xmm4
1336. movsd 11 * SIZE(AO), %xmm9
1337. mulsd %xmm2, %xmm11
1338. subsd %xmm11, %xmm6
1339. mulsd %xmm8, %xmm4
1340. movsd 15 * SIZE(AO), %xmm8
1341. mulsd %xmm4, %xmm9
1342. subsd %xmm9, %xmm6
1343. mulsd %xmm8, %xmm6
1344. #endif
/* Right-side with a single column: B is 1x1, so just scale all four. */
1345. #if defined(RN) || defined(RT)
1346. movsd 0 * SIZE(BO), %xmm8
1347. mulsd %xmm8, %xmm0
1348. mulsd %xmm8, %xmm2
1349. mulsd %xmm8, %xmm4
1350. mulsd %xmm8, %xmm6
1351. #endif
/* Store 4 doubles to C (LN walks backwards) and write the solved
   values back into the packed buffer; then advance pointers/KK. */
1352. #ifdef LN
1353. subq $4 * SIZE, CO1
1354. #endif
1355. movsd %xmm0, 0 * SIZE(CO1)
1356. movsd %xmm2, 1 * SIZE(CO1)
1357. movsd %xmm4, 2 * SIZE(CO1)
1358. movsd %xmm6, 3 * SIZE(CO1)
1359. #if defined(LN) || defined(LT)
1360. movsd %xmm0, 0 * SIZE(BO)
1361. movsd %xmm2, 1 * SIZE(BO)
1362. movsd %xmm4, 2 * SIZE(BO)
1363. movsd %xmm6, 3 * SIZE(BO)
1364. #else
1365. movsd %xmm0, 0 * SIZE(AO)
1366. movsd %xmm2, 1 * SIZE(AO)
1367. movsd %xmm4, 2 * SIZE(AO)
1368. movsd %xmm6, 3 * SIZE(AO)
1369. #endif
1370. #ifndef LN
1371. addq $4 * SIZE, CO1
1372. #endif
1373. #if defined(LT) || defined(RN)
1374. movq K, %rax
1375. subq KK, %rax
1376. leaq (,%rax, SIZE), %rax
1377. leaq (AO, %rax, 4), AO
1378. leaq (BO, %rax, 1), BO
1379. #endif
1380. #ifdef LN
1381. subq $4, KK
1382. #endif
1383. #ifdef LT
1384. addq $4, KK
1385. #endif
1386. #ifdef RT
1387. movq K, %rax
1388. salq $2 + BASE_SHIFT, %rax
1389. addq %rax, AORIG
1390. #endif
1391. decq I # i --
1392. jg .L41
1393. ALIGN_4
/* .L50: 2x1 edge tile of the single-column pass (M%4 has bit 1 set).
   Two dot products into xmm8/xmm10 (pipelined partners xmm2/xmm3),
   then a 2x2 left solve or 1x1 right scale, store to CO1. */
1394. .L50:
1395. testq $2, M
1396. je .L60
1397. #ifdef LN
1398. movq K, %rax
1399. salq $1 + BASE_SHIFT, %rax
1400. subq %rax, AORIG
1401. #endif
/* Skip the first KK panel steps (2 doubles/step in A, 1 in B). */
1402. #if defined(LN) || defined(RT)
1403. movq KK, %rax
1404. leaq (, %rax, SIZE), %rax
1405. movq AORIG, AO
1406. leaq (AO, %rax, 2), AO
1407. leaq (B, %rax, 1), BO
1408. #else
1409. movq B, BO
1410. #endif
1411. movsd 0 * SIZE(AO), %xmm0
1412. xorps %xmm2, %xmm2
1413. movsd 1 * SIZE(AO), %xmm1
1414. xorps %xmm3, %xmm3
1415. movsd 0 * SIZE(BO), %xmm4
1416. xorps %xmm8, %xmm8
1417. movsd 1 * SIZE(BO), %xmm5
1418. xorps %xmm10, %xmm10
1419. #if defined(LT) || defined(RN)
1420. movq KK, %rax
1421. #else
1422. movq K, %rax
1423. subq KK, %rax
1424. #endif
1425. sarq $2, %rax
1426. je .L55
1427. ALIGN_4
/* Main k-loop, unrolled 4x (8 A elements, 4 B elements per pass). */
1428. .L52:
1429. addsd %xmm2, %xmm8
1430. movsd 2 * SIZE(AO), %xmm2
1431. mulsd %xmm4, %xmm0
1432. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
1433. addsd %xmm3, %xmm10
1434. movsd 3 * SIZE(AO), %xmm3
1435. mulsd %xmm4, %xmm1
1436. movsd 2 * SIZE(BO), %xmm4
1437. addsd %xmm0, %xmm8
1438. movsd 4 * SIZE(AO), %xmm0
1439. mulsd %xmm5, %xmm2
1440. addq $8 * SIZE, AO
1441. addsd %xmm1, %xmm10
1442. movsd -3 * SIZE(AO), %xmm1
1443. mulsd %xmm5, %xmm3
1444. movsd 3 * SIZE(BO), %xmm5
1445. addsd %xmm2, %xmm8
1446. movsd -2 * SIZE(AO), %xmm2
1447. mulsd %xmm4, %xmm0
1448. addq $4 * SIZE, BO
1449. addsd %xmm3, %xmm10
1450. movsd -1 * SIZE(AO), %xmm3
1451. mulsd %xmm4, %xmm1
1452. movsd 0 * SIZE(BO), %xmm4
1453. addsd %xmm0, %xmm8
1454. movsd 0 * SIZE(AO), %xmm0
1455. mulsd %xmm5, %xmm2
1456. decq %rax
1457. addsd %xmm1, %xmm10
1458. movsd 1 * SIZE(AO), %xmm1
1459. mulsd %xmm5, %xmm3
1460. movsd 1 * SIZE(BO), %xmm5
1461. jne .L52
1462. ALIGN_4
/* Fold partials; K%4 remainder loop. */
1463. .L55:
1464. #if defined(LT) || defined(RN)
1465. movq KK, %rax
1466. #else
1467. movq K, %rax
1468. subq KK, %rax
1469. #endif
1470. addsd %xmm2, %xmm8
1471. addsd %xmm3, %xmm10
1472. andq $3, %rax
1473. BRANCH
1474. je .L59
1475. ALIGN_4
1476. .L56:
1477. mulsd %xmm4, %xmm0
1478. mulsd %xmm4, %xmm1
1479. movsd 1 * SIZE(BO), %xmm4
1480. addsd %xmm0, %xmm8
1481. movsd 2 * SIZE(AO), %xmm0
1482. addsd %xmm1, %xmm10
1483. movsd 3 * SIZE(AO), %xmm1
1484. addq $2 * SIZE, AO
1485. addq $1 * SIZE, BO
1486. decq %rax
1487. BRANCH
1488. jg .L56
1489. ALIGN_4
/* Solve phase: rhs = packed - sum, then 2x2 back/forward substitution
   (LN/LT) or 1x1 diagonal scale (RN/RT). */
1490. .L59:
1491. #if defined(LN) || defined(RT)
1492. movq KK, %rax
1493. #ifdef LN
1494. subq $2, %rax
1495. #else
1496. subq $1, %rax
1497. #endif
1498. leaq (, %rax, SIZE), %rax
1499. movq AORIG, AO
1500. leaq (AO, %rax, 2), AO
1501. leaq (B, %rax, 1), BO
1502. #endif
1503. #if defined(LN) || defined(LT)
1504. movsd 0 * SIZE(BO), %xmm0
1505. movsd 1 * SIZE(BO), %xmm2
1506. subsd %xmm8, %xmm0
1507. subsd %xmm10, %xmm2
1508. #else
1509. movsd 0 * SIZE(AO), %xmm0
1510. movsd 1 * SIZE(AO), %xmm2
1511. subsd %xmm8, %xmm0
1512. subsd %xmm10, %xmm2
1513. #endif
1514. #ifdef LN
1515. movsd 3 * SIZE(AO), %xmm8
1516. movsd 2 * SIZE(AO), %xmm9
1517. movsd 0 * SIZE(AO), %xmm11
1518. mulsd %xmm8, %xmm2
1519. mulsd %xmm2, %xmm9
1520. subsd %xmm9, %xmm0
1521. mulsd %xmm11,%xmm0
1522. #endif
1523. #ifdef LT
1524. movsd 0 * SIZE(AO), %xmm8
1525. movsd 1 * SIZE(AO), %xmm9
1526. movsd 3 * SIZE(AO), %xmm11
1527. mulsd %xmm8, %xmm0
1528. mulsd %xmm0, %xmm9
1529. subsd %xmm9, %xmm2
1530. mulsd %xmm11,%xmm2
1531. #endif
1532. #if defined(RN) || defined(RT)
1533. movsd 0 * SIZE(BO), %xmm8
1534. mulsd %xmm8, %xmm0
1535. mulsd %xmm8, %xmm2
1536. #endif
/* Store to C, write back to the packed buffer, advance pointers/KK. */
1537. #ifdef LN
1538. subq $2 * SIZE, CO1
1539. #endif
1540. movsd %xmm0, 0 * SIZE(CO1)
1541. movsd %xmm2, 1 * SIZE(CO1)
1542. #if defined(LN) || defined(LT)
1543. movsd %xmm0, 0 * SIZE(BO)
1544. movsd %xmm2, 1 * SIZE(BO)
1545. #else
1546. movsd %xmm0, 0 * SIZE(AO)
1547. movsd %xmm2, 1 * SIZE(AO)
1548. #endif
1549. #ifndef LN
1550. addq $2 * SIZE, CO1
1551. #endif
1552. #if defined(LT) || defined(RN)
1553. movq K, %rax
1554. subq KK, %rax
1555. leaq (,%rax, SIZE), %rax
1556. leaq (AO, %rax, 2), AO
1557. leaq (BO, %rax, 1), BO
1558. #endif
1559. #ifdef LN
1560. subq $2, KK
1561. #endif
1562. #ifdef LT
1563. addq $2, KK
1564. #endif
1565. #ifdef RT
1566. movq K, %rax
1567. salq $1 + BASE_SHIFT, %rax
1568. addq %rax, AORIG
1569. #endif
1570. ALIGN_4
/* .L60: 1x1 edge tile (M odd, single-column pass).  A single dot
   product accumulated across four pipelined lanes (xmm8/xmm9 plus
   in-flight xmm5/xmm7), then one multiply by the (presumed
   pre-inverted) diagonal and a scalar store. */
1571. .L60:
1572. testq $1, M
1573. je .L69
1574. #ifdef LN
1575. movq K, %rax
1576. salq $0 + BASE_SHIFT, %rax
1577. subq %rax, AORIG
1578. #endif
/* Skip the first KK panel steps (1 double/step in both A and B). */
1579. #if defined(LN) || defined(RT)
1580. movq KK, %rax
1581. leaq (, %rax, SIZE), %rax
1582. movq AORIG, AO
1583. leaq (AO, %rax, 1), AO
1584. leaq (B, %rax, 1), BO
1585. #else
1586. movq B, BO
1587. #endif
1588. movsd 0 * SIZE(AO), %xmm0
1589. xorps %xmm5, %xmm5
1590. movsd 1 * SIZE(AO), %xmm2
1591. xorps %xmm7, %xmm7
1592. movsd 0 * SIZE(BO), %xmm1
1593. xorps %xmm8, %xmm8
1594. movsd 1 * SIZE(BO), %xmm3
1595. xorps %xmm9, %xmm9
1596. movsd 2 * SIZE(AO), %xmm4
1597. movsd 3 * SIZE(AO), %xmm6
1598. #if defined(LT) || defined(RN)
1599. movq KK, %rax
1600. #else
1601. movq K, %rax
1602. subq KK, %rax
1603. #endif
1604. sarq $2, %rax
1605. je .L65
1606. ALIGN_4
/* Main k-loop, unrolled 4x; partial sums folded right after exit. */
1607. .L62:
1608. addsd %xmm5, %xmm8
1609. movsd 2 * SIZE(BO), %xmm5
1610. mulsd %xmm0, %xmm1
1611. movsd 4 * SIZE(AO), %xmm0
1612. addsd %xmm7, %xmm9
1613. movsd 3 * SIZE(BO), %xmm7
1614. mulsd %xmm2, %xmm3
1615. movsd 5 * SIZE(AO), %xmm2
1616. addsd %xmm1, %xmm8
1617. movsd 4 * SIZE(BO), %xmm1
1618. mulsd %xmm4, %xmm5
1619. movsd 6 * SIZE(AO), %xmm4
1620. addsd %xmm3, %xmm9
1621. movsd 5 * SIZE(BO), %xmm3
1622. mulsd %xmm6, %xmm7
1623. movsd 7 * SIZE(AO), %xmm6
1624. addq $4 * SIZE, AO
1625. addq $4 * SIZE, BO
1626. decq %rax
1627. jne .L62
1628. addsd %xmm5, %xmm8
1629. addsd %xmm7, %xmm9
1630. ALIGN_4
/* K%4 remainder: plain load/mul/add, one element per step. */
1631. .L65:
1632. #if defined(LT) || defined(RN)
1633. movq KK, %rax
1634. #else
1635. movq K, %rax
1636. subq KK, %rax
1637. #endif
1638. andq $3, %rax
1639. BRANCH
1640. je .L68
1641. ALIGN_4
1642. .L66:
1643. movsd 0 * SIZE(AO), %xmm0
1644. movsd 0 * SIZE(BO), %xmm1
1645. mulsd %xmm0, %xmm1
1646. addsd %xmm1, %xmm8
1647. addq $1 * SIZE, AO
1648. addq $1 * SIZE, BO
1649. decq %rax
1650. BRANCH
1651. jg .L66
1652. ALIGN_4
/* Solve phase: combine accumulators, form rhs, scale by the single
   diagonal element of A (LN/LT) or B (RN/RT), store and write back. */
1653. .L68:
1654. addsd %xmm9, %xmm8
1655. #if defined(LN) || defined(RT)
1656. movq KK, %rax
1657. #ifdef LN
1658. subq $1, %rax
1659. #else
1660. subq $1, %rax
1661. #endif
1662. leaq (, %rax, SIZE), %rax
1663. movq AORIG, AO
1664. leaq (AO, %rax, 1), AO
1665. leaq (B, %rax, 1), BO
1666. #endif
1667. #if defined(LN) || defined(LT)
1668. movsd 0 * SIZE(BO), %xmm0
1669. subsd %xmm8, %xmm0
1670. #else
1671. movsd 0 * SIZE(AO), %xmm0
1672. subsd %xmm8, %xmm0
1673. #endif
1674. #if defined(LN) || defined(LT)
1675. movsd 0 * SIZE(AO), %xmm8
1676. mulsd %xmm8, %xmm0
1677. #endif
1678. #if defined(RN) || defined(RT)
1679. movsd 0 * SIZE(BO), %xmm8
1680. mulsd %xmm8, %xmm0
1681. #endif
1682. #ifdef LN
1683. subq $1 * SIZE, CO1
1684. #endif
1685. movsd %xmm0, 0 * SIZE(CO1)
1686. #if defined(LN) || defined(LT)
1687. movsd %xmm0, 0 * SIZE(BO)
1688. #else
1689. movsd %xmm0, 0 * SIZE(AO)
1690. #endif
1691. #ifndef LN
1692. addq $1 * SIZE, CO1
1693. #endif
1694. #if defined(LT) || defined(RN)
1695. movq K, %rax
1696. subq KK, %rax
1697. leaq (,%rax, SIZE), %rax
1698. leaq (AO, %rax, 1), AO
1699. leaq (BO, %rax, 1), BO
1700. #endif
1701. #ifdef LN
1702. subq $1, KK
1703. #endif
1704. #ifdef LT
1705. addq $1, KK
1706. #endif
1707. #ifdef RT
1708. movq K, %rax
1709. salq $0 + BASE_SHIFT, %rax
1710. addq %rax, AORIG
1711. #endif
1712. ALIGN_4
/* .L69: end of the single-column remainder pass.  Advance the packed-B
   pointer past this column's K doubles (LN) or adopt the final BO
   (LT/RN), and adjust the column offset KK for the right-side
   variants before falling through to the function exit. */
1713. .L69:
1714. #ifdef LN
1715. leaq (, K, SIZE), %rax
1716. leaq (B, %rax, 1), B
1717. #endif
1718. #if defined(LT) || defined(RN)
1719. movq BO, B
1720. #endif
1721. #ifdef RN
1722. addq $1, KK
1723. #endif
1724. #ifdef RT
1725. subq $1, KK
1726. #endif
1727. ALIGN_2
/* .L999: function exit.  Restore the callee-saved GPRs spilled by the
   prologue (rbx, rbp, r12-r15 on both ABIs); under WINDOWS_ABI also
   restore rdi/rsi and xmm6-xmm15, which the Microsoft x64 calling
   convention makes callee-saved, then pop the frame and return. */
1728. .L999:
1729. movq 0(%rsp), %rbx
1730. movq 8(%rsp), %rbp
1731. movq 16(%rsp), %r12
1732. movq 24(%rsp), %r13
1733. movq 32(%rsp), %r14
1734. movq 40(%rsp), %r15
1735. #ifdef WINDOWS_ABI
1736. movq 48(%rsp), %rdi
1737. movq 56(%rsp), %rsi
1738. movups 64(%rsp), %xmm6
1739. movups 80(%rsp), %xmm7
1740. movups 96(%rsp), %xmm8
1741. movups 112(%rsp), %xmm9
1742. movups 128(%rsp), %xmm10
1743. movups 144(%rsp), %xmm11
1744. movups 160(%rsp), %xmm12
1745. movups 176(%rsp), %xmm13
1746. movups 192(%rsp), %xmm14
1747. movups 208(%rsp), %xmm15
1748. #endif
1749. addq $STACKSIZE, %rsp
1750. ret
1751. EPILOGUE