
gemm_kernel_2x8_nehalem.S (34 kB)
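This is the double-precision GEMM/TRMM micro-kernel for x86-64, tuned for Intel Nehalem: a 2x8 register-blocked inner kernel written with SSE2/SSE3. The outer loops peel N into panels of 8, 4, 2, and 1 columns and M into row pairs plus a single-row remainder.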

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define OLD_K	%rdx

#define M	%r13
#define N	%r14
#define K	%r15

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10
#define I	%r11

#define AO	%rdi
#define BO	%rsi
#define CO1	%rbx
#define CO2	%rbp
#define BB	%r12

#define INC32	%rdx
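/* M, N, K hold the matrix dimensions; A and B point to the packed
   input panels and C to the output matrix. LDC is the leading
   dimension of C (scaled from elements to bytes below). AO/BO walk the
   current A/B panels, CO1/CO2 track the current columns of C, and BB
   is used to prefetch ahead into the next B panel. */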
#ifndef WINDOWS_ABI

#define STACKSIZE	128

#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#define ALPHA	48(%rsp)
#define J	56(%rsp)
#define OFFSET	64(%rsp)
#define KK	72(%rsp)
#define KKK	80(%rsp)

#else

#define STACKSIZE	512

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#define ALPHA	224(%rsp)
#define J	232(%rsp)
#define OFFSET	240(%rsp)
#define KK	248(%rsp)
#define KKK	256(%rsp)

#endif

#define PREFETCHSIZE	4
#define PREFETCH	prefetcht0
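/* PREFETCH pulls A data PREFETCHSIZE elements ahead of the current AO
   position into all cache levels. The short distance is presumably
   what worked best on Nehalem, whose hardware prefetchers already
   track the streaming accesses through the packed panels. */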
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1, OLD_M
	movq	ARG2, OLD_N
	movq	ARG3, OLD_K
	movq	OLD_A, A
	movq	OLD_B, B
	movq	OLD_C, C
	movq	OLD_LDC, LDC
#ifdef TRMMKERNEL
	movq	OLD_OFFSET, %r11
#endif
	movaps	%xmm3, %xmm0
#else
	movq	OLD_LDC, LDC
#ifdef TRMMKERNEL
	movq	OLD_OFFSET, %r11
#endif
#endif

	movlps	%xmm0, ALPHA

	subq	$-16 * SIZE, A
	subq	$-16 * SIZE, B

	movq	OLD_M, M
	movq	OLD_N, N
	movq	OLD_K, K

	leaq	(, LDC, SIZE), LDC

#ifdef TRMMKERNEL
	movq	%r11, OFFSET
#ifndef LEFT
	negq	%r11
#endif
	movq	%r11, KK
#endif

	movq	N, J
	sarq	$3, J
	NOBRANCH
	jle	.L30
	ALIGN_4
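/* .L01: outer loop over J = N/8 panels of eight B columns. Each pass
   computes an M x 8 slab of C, with CO1 addressing columns 0-3 and
   CO2 columns 4-7 of the current slab. */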
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	C, CO1
	leaq	(C, LDC, 4), CO2
	movq	A, AO

	movq	K, %rax
	salq	$BASE_SHIFT + 3, %rax
	leaq	(B, %rax), BB

	movq	M, I
	sarq	$1, I
	NOBRANCH
	jle	.L20
	ALIGN_4
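/* .L11: loop over I = M/2 row pairs. xmm8-xmm15 accumulate the 2x8
   block of C; the prefetcht0 instructions below touch the eight
   output locations so the write-back at .L18 hits warm lines. */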
.L11:
	prefetcht2	-16 * SIZE(BB)
	subq	$-8 * SIZE, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3
	PADDING
	xorps	%xmm4, %xmm4

	leaq	(LDC, LDC, 2), %rax

	PADDING
	xorps	%xmm8, %xmm8
	prefetcht0	1 * SIZE(CO1)
	xorps	%xmm9, %xmm9
	prefetcht0	3 * SIZE(CO1, LDC, 1)
	PADDING
	xorps	%xmm10, %xmm10
	prefetcht0	1 * SIZE(CO1, LDC, 2)
	PADDING
	xorps	%xmm11, %xmm11
	prefetcht0	3 * SIZE(CO1, %rax, 1)

	movaps	-16 * SIZE(AO), %xmm0

	PADDING
	xorps	%xmm12, %xmm12
	prefetcht0	1 * SIZE(CO2)
	xorps	%xmm13, %xmm13
	prefetcht0	3 * SIZE(CO2, LDC, 1)
	xorps	%xmm14, %xmm14
	prefetcht0	1 * SIZE(CO2, LDC, 2)
	xorps	%xmm15, %xmm15
	prefetcht0	3 * SIZE(CO2, %rax, 1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$8, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L15
	ALIGN_3
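/* .L12: main loop, unrolled four k-steps deep. For each k, one aligned
   pair of A values (xmm0/xmm5) multiplies the eight B values of the
   row; pshufd $0x4e swaps the two halves of each B vector so the cross
   products land in the partner accumulator, and the interleave is
   undone once at .L18. The adds at the top of each group retire
   products from the previous group (software pipelining), which is why
   the tails below must drain the last set. */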
.L12:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	addpd	%xmm1, %xmm12
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm14
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	movaps	-14 * SIZE(AO), %xmm5
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm1, %xmm12
	movaps	-8 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm14
	movaps	-6 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	addpd	%xmm1, %xmm8
	movaps	-4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-2 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	movaps	-12 * SIZE(AO), %xmm0
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	addpd	%xmm1, %xmm12
	movaps	0 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm14
	movaps	2 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm1, %xmm8
	movaps	4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	6 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	movaps	-10 * SIZE(AO), %xmm5
	mulpd	%xmm0, %xmm4

	addpd	%xmm1, %xmm12
	movaps	8 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm14
	movaps	10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	PADDING;
	mulpd	%xmm5, %xmm4

	addpd	%xmm1, %xmm8
	movaps	12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	PADDING;
	mulpd	%xmm5, %xmm2

	addpd	%xmm3, %xmm10
	movaps	14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	movaps	-8 * SIZE(AO), %xmm0
	mulpd	%xmm5, %xmm4

	subq	$-32 * SIZE, BO
	subq	$-8 * SIZE, AO
	subq	$1, %rax
	BRANCH
	jg	.L12
	ALIGN_3
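/* .L15 reloads alpha, then .L16 handles the K & 3 remainder, one
   k-step per pass over the same multiply/accumulate pattern. */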
.L15:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L18
	ALIGN_3

.L16:
	addpd	%xmm1, %xmm12
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm14
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L16
	ALIGN_4
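/* .L18: drain the last pending products, undo the pshufd interleave
   (each accumulator pair holds one in-place and one swapped half, so
   shufpd $2 recombines them into plain columns), scale by alpha, and
   add the existing C values unless this is a TRMM kernel. When CO1
   and LDC are both 16-byte aligned the stores use movaps; otherwise
   control branches to the movups path at .L18x. */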
.L18:
	addpd	%xmm1, %xmm12
	movaps	%xmm8, %xmm0
	shufpd	$2, %xmm9, %xmm8
	mulpd	%xmm7, %xmm8
	shufpd	$2, %xmm0, %xmm9
	mulpd	%xmm7, %xmm9

	addpd	%xmm2, %xmm13
	movaps	%xmm10, %xmm0
	shufpd	$2, %xmm11, %xmm10
	mulpd	%xmm7, %xmm10
	shufpd	$2, %xmm0, %xmm11
	mulpd	%xmm7, %xmm11

	addpd	%xmm3, %xmm14
	movaps	%xmm12, %xmm0
	shufpd	$2, %xmm13, %xmm12
	mulpd	%xmm7, %xmm12
	shufpd	$2, %xmm0, %xmm13
	mulpd	%xmm7, %xmm13

	addpd	%xmm4, %xmm15
	movaps	%xmm14, %xmm0
	shufpd	$2, %xmm15, %xmm14
	mulpd	%xmm7, %xmm14
	shufpd	$2, %xmm0, %xmm15
	mulpd	%xmm7, %xmm15

	movq	CO1, %rax
	orq	LDC, %rax
	testq	$15, %rax
	NOBRANCH
	jne	.L18x

	leaq	(LDC, LDC, 2), %rax

#ifndef TRMMKERNEL
	movups	(CO1), %xmm0
	movups	(CO1, LDC, 1), %xmm1
	movups	(CO1, LDC, 2), %xmm2
	movups	(CO1, %rax, 1), %xmm3
	movups	(CO2), %xmm4
	movups	(CO2, LDC, 1), %xmm5
	movups	(CO2, LDC, 2), %xmm6
	movups	(CO2, %rax, 1), %xmm7

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
	addpd	%xmm2, %xmm10
	addpd	%xmm3, %xmm11
	addpd	%xmm4, %xmm12
	addpd	%xmm5, %xmm13
	addpd	%xmm6, %xmm14
	addpd	%xmm7, %xmm15
#endif

	movaps	%xmm8, (CO1)
	movaps	%xmm9, (CO1, LDC, 1)
	movaps	%xmm10, (CO1, LDC, 2)
	movaps	%xmm11, (CO1, %rax, 1)
	movaps	%xmm12, (CO2)
	movaps	%xmm13, (CO2, LDC, 1)
	movaps	%xmm14, (CO2, LDC, 2)
	movaps	%xmm15, (CO2, %rax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L11
	jmp	.L20
	ALIGN_4
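/* .L18x: same write-back as .L18, but with unaligned movups stores for
   the case where C or LDC is not 16-byte aligned. */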
.L18x:
	leaq	(LDC, LDC, 2), %rax

#ifndef TRMMKERNEL
	movups	(CO1), %xmm0
	movups	(CO1, LDC, 1), %xmm1
	movups	(CO1, LDC, 2), %xmm2
	movups	(CO1, %rax, 1), %xmm3
	movups	(CO2), %xmm4
	movups	(CO2, LDC, 1), %xmm5
	movups	(CO2, LDC, 2), %xmm6
	movups	(CO2, %rax, 1), %xmm7

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
	addpd	%xmm2, %xmm10
	addpd	%xmm3, %xmm11
	addpd	%xmm4, %xmm12
	addpd	%xmm5, %xmm13
	addpd	%xmm6, %xmm14
	addpd	%xmm7, %xmm15
#endif

	movups	%xmm8, (CO1)
	movups	%xmm9, (CO1, LDC, 1)
	movups	%xmm10, (CO1, LDC, 2)
	movups	%xmm11, (CO1, %rax, 1)
	movups	%xmm12, (CO2)
	movups	%xmm13, (CO2, LDC, 1)
	movups	%xmm14, (CO2, LDC, 2)
	movups	%xmm15, (CO2, %rax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L11
	ALIGN_4

.L20:
	testq	$1, M
	BRANCH
	jle	.L29
	ALIGN_4
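/* Remainder row for odd M: one row of A against the full eight-column
   B panel. movddup broadcasts a single A element, and xmm8-xmm11 each
   accumulate two adjacent columns of C. */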
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	xorps	%xmm8, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9, %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$8, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L25
	ALIGN_3

.L22:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	-8 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-6 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	-4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	0 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	6 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-13 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	8 * SIZE(BO), %xmm1

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	16 * SIZE(BO), %xmm1

	subq	$-4 * SIZE, AO
	subq	$-32 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L22
	ALIGN_3

.L25:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L28
	ALIGN_3

.L26:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	-8 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$8 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L26
	ALIGN_4

.L28:
	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm10
	mulpd	%xmm7, %xmm11

	leaq	(LDC, LDC, 2), %rax

#ifndef TRMMKERNEL
	movsd	(CO1), %xmm0
	movhps	(CO1, LDC, 1), %xmm0
	movsd	(CO1, LDC, 2), %xmm1
	movhps	(CO1, %rax, 1), %xmm1
	movsd	(CO2), %xmm2
	movhps	(CO2, LDC, 1), %xmm2
	movsd	(CO2, LDC, 2), %xmm3
	movhps	(CO2, %rax, 1), %xmm3

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
	addpd	%xmm2, %xmm10
	addpd	%xmm3, %xmm11
#endif

	movsd	%xmm8, (CO1)
	movhps	%xmm8, (CO1, LDC, 1)
	movsd	%xmm9, (CO1, LDC, 2)
	movhps	%xmm9, (CO1, %rax, 1)
	movsd	%xmm10, (CO2)
	movhps	%xmm10, (CO2, LDC, 1)
	movsd	%xmm11, (CO2, LDC, 2)
	movhps	%xmm11, (CO2, %rax, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$8, KK
#endif

	movq	BO, B
	leaq	(C, LDC, 8), C
	subq	$1, J
	BRANCH
	jg	.L01
	ALIGN_4
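/* .L30: same structure for the remaining four columns when N & 4 is
   set: 2x4 blocks in xmm8-xmm11, with CO1/CO2 two columns apart. */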
.L30:
	testq	$4, N
	jle	.L50
	ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	C, CO1
	leaq	(C, LDC, 2), CO2
	movq	A, AO

	movq	M, I
	sarq	$1, I
	NOBRANCH
	jle	.L40
	ALIGN_4

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3
	xorps	%xmm4, %xmm4

	xorps	%xmm8, %xmm8
	prefetcht0	2 * SIZE(CO1)
	xorps	%xmm9, %xmm9
	prefetcht0	2 * SIZE(CO1, LDC, 1)
	xorps	%xmm10, %xmm10
	prefetcht0	2 * SIZE(CO2)
	xorps	%xmm11, %xmm11
	prefetcht0	2 * SIZE(CO2, LDC, 1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L35
	ALIGN_3

.L32:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	movaps	-14 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	movaps	-12 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	-8 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-6 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	movaps	-10 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	-4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-2 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	movaps	-8 * SIZE(AO), %xmm0

	subq	$-8 * SIZE, AO
	subq	$-16 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L32
	ALIGN_3

.L35:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L38
	ALIGN_3

.L36:
	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	addpd	%xmm3, %xmm10
	movaps	-14 * SIZE(BO), %xmm3
	addpd	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$4 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L36
	ALIGN_4

.L38:
	addpd	%xmm1, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm3, %xmm10
	addpd	%xmm4, %xmm11

	movaps	%xmm8, %xmm0
	shufpd	$2, %xmm9, %xmm8
	mulpd	%xmm7, %xmm8
	shufpd	$2, %xmm0, %xmm9
	mulpd	%xmm7, %xmm9

	movaps	%xmm10, %xmm0
	shufpd	$2, %xmm11, %xmm10
	mulpd	%xmm7, %xmm10
	shufpd	$2, %xmm0, %xmm11
	mulpd	%xmm7, %xmm11

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO1, LDC, 1), %xmm1
	movhps	1 * SIZE(CO1, LDC, 1), %xmm1
	movsd	0 * SIZE(CO2), %xmm2
	movhps	1 * SIZE(CO2), %xmm2
	movsd	0 * SIZE(CO2, LDC, 1), %xmm3
	movhps	1 * SIZE(CO2, LDC, 1), %xmm3

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
	addpd	%xmm2, %xmm10
	addpd	%xmm3, %xmm11
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm9, 1 * SIZE(CO1, LDC, 1)
	movsd	%xmm10, 0 * SIZE(CO2)
	movhps	%xmm10, 1 * SIZE(CO2)
	movsd	%xmm11, 0 * SIZE(CO2, LDC, 1)
	movhps	%xmm11, 1 * SIZE(CO2, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L31
	ALIGN_4

.L40:
	testq	$1, M
	BRANCH
	jle	.L49
	ALIGN_4

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	xorps	%xmm8, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9, %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L45
	ALIGN_3

.L42:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	-8 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-6 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-13 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movaps	-2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm11
	movaps	0 * SIZE(BO), %xmm1

	subq	$-4 * SIZE, AO
	subq	$-16 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L42
	ALIGN_3

.L45:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L48
	ALIGN_3

.L46:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$4 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L46
	ALIGN_4

.L48:
	addpd	%xmm10, %xmm8
	mulpd	%xmm7, %xmm8
	addpd	%xmm11, %xmm9
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	movsd	(CO1), %xmm0
	movhps	(CO1, LDC, 1), %xmm0
	movsd	(CO2), %xmm1
	movhps	(CO2, LDC, 1), %xmm1

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
#endif

	movsd	%xmm8, (CO1)
	movhps	%xmm8, (CO1, LDC, 1)
	movsd	%xmm9, (CO2)
	movhps	%xmm9, (CO2, LDC, 1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$4, KK
#endif

	movq	BO, B
	leaq	(C, LDC, 4), C
	ALIGN_4
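/* .L50: two-column tail (N & 2). The 2x2 block needs only two
   accumulators, so xmm10/xmm11 act as a second pair that is folded
   into xmm8/xmm9 once the unrolled loop finishes. */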
.L50:
	testq	$2, N
	jle	.L70
	ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	C, CO1
	leaq	(C, LDC), CO2
	movq	A, AO

	movq	M, I
	sarq	$1, I
	NOBRANCH
	jle	.L60
	ALIGN_4

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2

	xorps	%xmm8, %xmm8
	prefetcht0	2 * SIZE(CO1)
	xorps	%xmm9, %xmm9
	prefetcht0	2 * SIZE(CO2)
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L55
	ALIGN_3

.L52:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm10
	movaps	-14 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-8 * SIZE(AO), %xmm0

	subq	$-8 * SIZE, AO
	subq	$-8 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L52

	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9
	ALIGN_3

.L55:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L58
	ALIGN_3

.L56:
	addpd	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L56
	ALIGN_4

.L58:
	addpd	%xmm1, %xmm8
	addpd	%xmm2, %xmm9

	movaps	%xmm8, %xmm0
	shufpd	$2, %xmm9, %xmm8
	mulpd	%xmm7, %xmm8
	shufpd	$2, %xmm0, %xmm9
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm1
	movhps	1 * SIZE(CO2), %xmm1

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhps	%xmm9, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L51
	ALIGN_4

.L60:
	testq	$1, M
	BRANCH
	jle	.L69
	ALIGN_4

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	xorps	%xmm8, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9, %xmm9

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L65
	ALIGN_3

.L62:
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-14 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-13 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm8
	movaps	-10 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movddup	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movaps	-8 * SIZE(BO), %xmm1

	subq	$-4 * SIZE, AO
	subq	$-8 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L62
	ALIGN_3

.L65:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L68
	ALIGN_3

.L66:
	mulpd	%xmm0, %xmm1
	movddup	-15 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm8
	movaps	-14 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$2 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L66
	ALIGN_4

.L68:
	addpd	%xmm9, %xmm8
	mulpd	%xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd	(CO1), %xmm0
	movhps	(CO2), %xmm0
	addpd	%xmm0, %xmm8
#endif

	movsd	%xmm8, (CO1)
	movhps	%xmm8, (CO2)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif

	movq	BO, B
	leaq	(C, LDC, 2), C
	ALIGN_4
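/* .L70: last column (N & 1). Each 2x1 block is a pair of dot products;
   movddup broadcasts the single B value per k-step. */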
.L70:
	testq	$1, N
	jle	.L999
	ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	C, CO1
	movq	A, AO

	movq	M, I
	sarq	$1, I
	NOBRANCH
	jle	.L80
	ALIGN_4

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2

	xorps	%xmm8, %xmm8
	prefetcht0	2 * SIZE(CO1)
	xorps	%xmm9, %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L75
	ALIGN_3

.L72:
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)

	addpd	%xmm1, %xmm8
	movddup	-16 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-14 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm9
	movddup	-15 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-12 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm8
	movddup	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-10 * SIZE(AO), %xmm0

	addpd	%xmm1, %xmm9
	movddup	-13 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-8 * SIZE(AO), %xmm0

	subq	$-8 * SIZE, AO
	subq	$-4 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L72

	addpd	%xmm9, %xmm8
	ALIGN_3

.L75:
	movddup	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L78
	ALIGN_3

.L76:
	addpd	%xmm1, %xmm8
	movddup	-16 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$1 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L76
	ALIGN_4

.L78:
	addpd	%xmm1, %xmm8
	mulpd	%xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhps	1 * SIZE(CO1), %xmm0
	addpd	%xmm0, %xmm8
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhps	%xmm8, 1 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1
	decq	I
	BRANCH
	jg	.L71
	ALIGN_4
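/* .L80: final 1x1 element, a plain dot product. The TRMM variant uses
   movsd/movhps pair loads because the KK-adjusted panel offsets are no
   longer guaranteed to be 16-byte aligned; haddpd folds the two
   partial sums before the alpha scale. */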
.L80:
	testq	$1, M
	BRANCH
	jle	.L999
	ALIGN_4

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	B, BO
#else
	movq	B, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifndef TRMMKERNEL
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm8, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	xorps	%xmm9, %xmm9
#else
	movsd	-16 * SIZE(AO), %xmm0
	movhps	-15 * SIZE(AO), %xmm0
	xorps	%xmm8, %xmm8
	movsd	-16 * SIZE(BO), %xmm1
	movhps	-15 * SIZE(BO), %xmm1
	xorps	%xmm9, %xmm9
#endif

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	NOBRANCH
	jle	.L85
	ALIGN_3

.L82:
	mulpd	%xmm0, %xmm1
#ifndef TRMMKERNEL
	movaps	-14 * SIZE(AO), %xmm0
#else
	movsd	-14 * SIZE(AO), %xmm0
	movhps	-13 * SIZE(AO), %xmm0
#endif
	addpd	%xmm1, %xmm8
#ifndef TRMMKERNEL
	movaps	-14 * SIZE(BO), %xmm1
#else
	movsd	-14 * SIZE(BO), %xmm1
	movhps	-13 * SIZE(BO), %xmm1
#endif
	mulpd	%xmm0, %xmm1
#ifndef TRMMKERNEL
	movaps	-12 * SIZE(AO), %xmm0
#else
	movsd	-12 * SIZE(AO), %xmm0
	movhps	-11 * SIZE(AO), %xmm0
#endif
	addpd	%xmm1, %xmm9
#ifndef TRMMKERNEL
	movaps	-12 * SIZE(BO), %xmm1
#else
	movsd	-12 * SIZE(BO), %xmm1
	movhps	-11 * SIZE(BO), %xmm1
#endif

	subq	$-4 * SIZE, AO
	subq	$-4 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L82

	addpd	%xmm9, %xmm8
	ALIGN_3

.L85:
	movsd	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# k remainder (k & 3)
	BRANCH
	je	.L88
	ALIGN_3

.L86:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	-15 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO
	addq	$1 * SIZE, BO
	subq	$1, %rax
	BRANCH
	jg	.L86
	ALIGN_4

.L88:
	haddpd	%xmm8, %xmm8
	mulsd	%xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd	(CO1), %xmm0
	addsd	%xmm0, %xmm8
#endif

	movsd	%xmm8, (CO1)
	ALIGN_4
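/* .L999: epilogue. Restore the callee-saved general registers (plus
   rdi/rsi and xmm6-xmm15 under the Windows ABI) and release the
   stack frame. */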
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE