/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
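/* dgemm_ncopy_8.S: packs the untransposed input matrix A (column-major,
   leading dimension LDA) into the contiguous buffer B for the DGEMM
   kernel. Columns are consumed in groups of 8, 4, 2 and 1; within a
   group the entries of one row are stored next to each other, so the
   kernel can stream B linearly.

   The C sketch below is only a model of that access pattern; the name
   ncopy_8_model is ours, and the alignment special cases handled by
   the assembly are omitted:

       static void ncopy_8_model(long m, long n,
                                 const double *a, long lda, double *b) {
           long j = 0;
           for (; j + 8 <= n; j += 8)           // groups of 8 columns
               for (long i = 0; i < m; i++)     // one row at a time
                   for (long k = 0; k < 8; k++)
                       *b++ = a[i + (j + k) * lda];
           // the n % 8 leftover columns are packed the same way in
           // groups of 4, 2 and 1 by the later sections of this file.
       }
*/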
#define ASSEMBLER
#include "common.h"

/* Prefetch distance, in elements, used by the copy loops below. */
#ifdef NEHALEM
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef SANDYBRIDGE
#define PREFETCHSIZE 12
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef MOVAPS
#define MOVAPS movaps
#endif

/* Argument and scratch registers. */
#ifndef WINDOWS_ABI
#define M	ARG1	/* rdi */
#define N	ARG2	/* rsi */
#define A	ARG3	/* rdx */
#define LDA	ARG4	/* rcx */
#define B	ARG5	/* r8 */
#define AO1	%r9
#define AO2	%r10
#define LDA3	%r11
#define J	%r12
#define MM	%r13
#else
#define STACKSIZE 128
#define M	ARG1	/* rcx */
#define N	ARG2	/* rdx */
#define A	ARG3	/* r8 */
#define LDA	ARG4	/* r9 */
#define OLD_B	40 + 32 + STACKSIZE(%rsp)	/* fifth argument, passed on the stack */
#define B	%r15
#define AO1	%r10
#define AO2	%r11
#define LDA3	%r12
#define J	%r13
#define MM	%r14
#endif

#define I	%rax
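/* C-level signature (the usual OpenBLAS copy-kernel interface; shown
   for orientation only, it is assumed rather than declared here):

       int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b);
*/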
	PROLOGUE
	PROFCODE

/* Save the callee-saved registers used below; under the Windows ABI,
   xmm6-xmm12 are callee-saved too, and B arrives on the stack. */
#ifdef WINDOWS_ABI
	pushq	%r15
	pushq	%r14
#endif
	pushq	%r13
	pushq	%r12

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	movups	%xmm6, 0(%rsp)
	movups	%xmm7, 16(%rsp)
	movups	%xmm8, 32(%rsp)
	movups	%xmm9, 48(%rsp)
	movups	%xmm10, 64(%rsp)
	movups	%xmm11, 80(%rsp)
	movups	%xmm12, 96(%rsp)

	movq	OLD_B, B
#endif

	leaq	(,LDA, SIZE), LDA	/* LDA in bytes */
	leaq	(LDA, LDA, 2), LDA3	/* LDA3 = 3 * LDA */

	subq	$-16 * SIZE, B		/* bias B so stores start at -16 * SIZE(B) */

	/* If A is not 16-byte aligned, one row is peeled off in each
	   column group; MM holds the row count for the vector loops. */
	movq	M, MM
	leaq	-1(M), %rax
	testq	$SIZE, A
	cmovne	%rax, MM

	/* An odd LDA (in elements) makes the columns alternate their
	   16-byte alignment; use the shufpd-based path at .L50. */
	testq	$SIZE, LDA
	jne	.L50

	movq	N, J		/* pack columns in groups of eight */
	sarq	$3, J
	jle	.L20
	ALIGN_4
.L11:
	/* AO1 and AO2 walk columns j and j + 4 of the current group. */
	movq	A, AO1
	leaq	(A, LDA, 4), AO2
	leaq	(A, LDA, 8), A

	/* A (and hence AO1, since LDA is even here) misaligned: copy one
	   row first so the MOVAPS loads below are 16-byte aligned. */
	testq	$SIZE, A
	je	.L12

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3), %xmm3
	movsd	0 * SIZE(AO2), %xmm4
	movsd	0 * SIZE(AO2, LDA), %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3), %xmm7

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_3

.L12:
	movq	MM, I		/* eight rows per iteration */
	sarq	$3, I
	jle	.L14
	ALIGN_4

.L13:
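/* Main 8x8 block: each pair of rows from two columns forms a 2x2 tile
   that is transposed with the classic unpcklpd/unpckhpd pair. In C
   intrinsics the idiom looks like this (illustration only):

       __m128d c0 = _mm_load_pd(&col0[i]);      // {col0[i], col0[i+1]}
       __m128d c1 = _mm_load_pd(&col1[i]);      // {col1[i], col1[i+1]}
       __m128d lo = _mm_unpacklo_pd(c0, c1);    // {col0[i],   col1[i]}   -> row i
       __m128d hi = _mm_unpackhi_pd(c0, c1);    // {col0[i+1], col1[i+1]} -> row i+1
*/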
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	0 * SIZE(AO1, LDA3), %xmm3

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1, LDA)
#endif
	MOVAPS	0 * SIZE(AO2), %xmm4
	MOVAPS	0 * SIZE(AO2, LDA), %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	0 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif
	movaps	%xmm8, -8 * SIZE(B)
	movaps	%xmm9, -6 * SIZE(B)
	movaps	%xmm10, -4 * SIZE(B)
	movaps	%xmm11, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2)
#endif
	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1, LDA), %xmm1
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	2 * SIZE(AO1, LDA3), %xmm3

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3)
#endif
	MOVAPS	2 * SIZE(AO2), %xmm4
	MOVAPS	2 * SIZE(AO2, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	2 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif
	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm2, 2 * SIZE(B)
	movaps	%xmm4, 4 * SIZE(B)
	movaps	%xmm6, 6 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif
	movaps	%xmm8, 8 * SIZE(B)
	movaps	%xmm9, 10 * SIZE(B)
	movaps	%xmm10, 12 * SIZE(B)
	movaps	%xmm11, 14 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2)
#endif
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	4 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	4 * SIZE(AO1, LDA3), %xmm3

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2, LDA)
#endif
	MOVAPS	4 * SIZE(AO2), %xmm4
	MOVAPS	4 * SIZE(AO2, LDA), %xmm5
	MOVAPS	4 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	4 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B)
#endif
	movaps	%xmm0, 16 * SIZE(B)
	movaps	%xmm2, 18 * SIZE(B)
	movaps	%xmm4, 20 * SIZE(B)
	movaps	%xmm6, 22 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B)
#endif
	movaps	%xmm8, 24 * SIZE(B)
	movaps	%xmm9, 26 * SIZE(B)
	movaps	%xmm10, 28 * SIZE(B)
	movaps	%xmm11, 30 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2)
#endif
	MOVAPS	6 * SIZE(AO1), %xmm0
	MOVAPS	6 * SIZE(AO1, LDA), %xmm1
	MOVAPS	6 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	6 * SIZE(AO1, LDA3), %xmm3

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3)
#endif
	MOVAPS	6 * SIZE(AO2), %xmm4
	MOVAPS	6 * SIZE(AO2, LDA), %xmm5
	MOVAPS	6 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	6 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B)
#endif
	movaps	%xmm0, 32 * SIZE(B)
	movaps	%xmm2, 34 * SIZE(B)
	movaps	%xmm4, 36 * SIZE(B)
	movaps	%xmm6, 38 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 56) * SIZE(B)
#endif
	movaps	%xmm8, 40 * SIZE(B)
	movaps	%xmm9, 42 * SIZE(B)
	movaps	%xmm10, 44 * SIZE(B)
	movaps	%xmm11, 46 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-64 * SIZE, B

	decq	I
	jg	.L13
	ALIGN_4
.L14:
	testq	$4, MM		/* four remaining rows */
	jle	.L16

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	0 * SIZE(AO1, LDA3), %xmm3
	MOVAPS	0 * SIZE(AO2), %xmm4
	MOVAPS	0 * SIZE(AO2, LDA), %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	0 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

	movaps	%xmm8, -8 * SIZE(B)
	movaps	%xmm9, -6 * SIZE(B)
	movaps	%xmm10, -4 * SIZE(B)
	movaps	%xmm11, -2 * SIZE(B)

	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1, LDA), %xmm1
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	2 * SIZE(AO1, LDA3), %xmm3
	MOVAPS	2 * SIZE(AO2), %xmm4
	MOVAPS	2 * SIZE(AO2, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	2 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm2, 2 * SIZE(B)
	movaps	%xmm4, 4 * SIZE(B)
	movaps	%xmm6, 6 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

	movaps	%xmm8, 8 * SIZE(B)
	movaps	%xmm9, 10 * SIZE(B)
	movaps	%xmm10, 12 * SIZE(B)
	movaps	%xmm11, 14 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-32 * SIZE, B
	ALIGN_4

.L16:
	testq	$2, MM		/* two remaining rows */
	jle	.L18

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	0 * SIZE(AO1, LDA3), %xmm3
	MOVAPS	0 * SIZE(AO2), %xmm4
	MOVAPS	0 * SIZE(AO2, LDA), %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	0 * SIZE(AO2, LDA3), %xmm7

	movaps	%xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm9
	unpcklpd %xmm3, %xmm2
	movaps	%xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	%xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

	movaps	%xmm8, -8 * SIZE(B)
	movaps	%xmm9, -6 * SIZE(B)
	movaps	%xmm10, -4 * SIZE(B)
	movaps	%xmm11, -2 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L18:
	testq	$1, MM		/* last row */
	jle	.L19

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3), %xmm3
	movsd	0 * SIZE(AO2), %xmm4
	movsd	0 * SIZE(AO2, LDA), %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3), %xmm7

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	subq	$-8 * SIZE, B
	ALIGN_4

.L19:
	decq	J
	jg	.L11
	ALIGN_4
.L20:
	testq	$4, N		/* four remaining columns */
	jle	.L30

	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	testq	$SIZE, A
	je	.L22

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L22:
	movq	MM, I
	sarq	$3, I
	jle	.L24
	ALIGN_4

.L23:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	0 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA)
#endif
	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1, LDA), %xmm1
	MOVAPS	2 * SIZE(AO2), %xmm2
	MOVAPS	2 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif
	movaps	%xmm0, -8 * SIZE(B)
	movaps	%xmm2, -6 * SIZE(B)
	movaps	%xmm4, -4 * SIZE(B)
	movaps	%xmm6, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
#endif
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	4 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO2), %xmm2
	MOVAPS	4 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif
	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm2, 2 * SIZE(B)
	movaps	%xmm4, 4 * SIZE(B)
	movaps	%xmm6, 6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA)
#endif
	MOVAPS	6 * SIZE(AO1), %xmm0
	MOVAPS	6 * SIZE(AO1, LDA), %xmm1
	MOVAPS	6 * SIZE(AO2), %xmm2
	MOVAPS	6 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif
	movaps	%xmm0, 8 * SIZE(B)
	movaps	%xmm2, 10 * SIZE(B)
	movaps	%xmm4, 12 * SIZE(B)
	movaps	%xmm6, 14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L23
	ALIGN_4

.L24:
	testq	$4, MM
	jle	.L26

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	0 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1, LDA), %xmm1
	MOVAPS	2 * SIZE(AO2), %xmm2
	MOVAPS	2 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0, -8 * SIZE(B)
	movaps	%xmm2, -6 * SIZE(B)
	movaps	%xmm4, -4 * SIZE(B)
	movaps	%xmm6, -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L26:
	testq	$2, MM
	jle	.L28

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	0 * SIZE(AO2, LDA), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L28:
	testq	$1, MM
	jle	.L30

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	subq	$-4 * SIZE, B
	ALIGN_4
.L30:
	testq	$2, N		/* two remaining columns */
	jle	.L40

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	testq	$SIZE, A
	je	.L32

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L32:
	movq	MM, I
	sarq	$3, I
	jle	.L34
	ALIGN_4

.L33:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	2 * SIZE(AO2), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm4, -14 * SIZE(B)
	movaps	%xmm2, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 4 * SIZE(AO2)
#endif
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	4 * SIZE(AO2), %xmm1
	MOVAPS	6 * SIZE(AO1), %xmm2
	MOVAPS	6 * SIZE(AO2), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif
	movaps	%xmm0, -8 * SIZE(B)
	movaps	%xmm4, -6 * SIZE(B)
	movaps	%xmm2, -4 * SIZE(B)
	movaps	%xmm6, -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L33
	ALIGN_4

.L34:
	testq	$4, MM
	jle	.L36

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	2 * SIZE(AO2), %xmm3

	movaps	%xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4
	movaps	%xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm4, -14 * SIZE(B)
	movaps	%xmm2, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L36:
	testq	$2, MM
	jle	.L38

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	0 * SIZE(AO2), %xmm1

	movaps	%xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L38:
	testq	$1, MM
	jle	.L40

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	subq	$-2 * SIZE, B
	ALIGN_4
.L40:
	testq	$1, N		/* last column */
	jle	.L999

	movq	A, AO1

	testq	$SIZE, A
	jne	.L45		/* misaligned column: realign with shufpd */

	movq	MM, I
	sarq	$3, I
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 8 * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1), %xmm1
	MOVAPS	4 * SIZE(AO1), %xmm2
	MOVAPS	6 * SIZE(AO1), %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)
	movaps	%xmm2, -12 * SIZE(B)
	movaps	%xmm3, -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L41
	ALIGN_4

.L42:
	testq	$4, MM
	jle	.L43

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1), %xmm1

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L43:
	testq	$2, MM
	jle	.L44

	MOVAPS	0 * SIZE(AO1), %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L44:
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1), %xmm0
	movlpd	%xmm0, -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

.L45:
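/* Single misaligned column: keep a sliding window in %xmm0 so every
   load stays 16-byte aligned, and splice adjacent vectors with shufpd.
   In C intrinsics the idiom looks like this (illustration only):

       __m128d next = _mm_load_pd(&a[i + 1]);          // aligned {a[i+1], a[i+2]}
       __m128d out  = _mm_shuffle_pd(prev, next, 1);   // {a[i], a[i+1]}
       prev = next;                                    // carry for the next pair
*/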
	MOVAPS	-1 * SIZE(AO1), %xmm0	/* window preload: {a[-1], a[0]} */

	movq	M, I
	sarq	$3, I
	jle	.L47
	ALIGN_4
.L46:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 8 * SIZE(AO1)
#endif
	MOVAPS	1 * SIZE(AO1), %xmm1
	MOVAPS	3 * SIZE(AO1), %xmm2
	MOVAPS	5 * SIZE(AO1), %xmm3
	MOVAPS	7 * SIZE(AO1), %xmm4

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)
	movaps	%xmm2, -12 * SIZE(B)
	movaps	%xmm3, -10 * SIZE(B)

	movaps	%xmm4, %xmm0

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L46
	ALIGN_4

.L47:
	testq	$4, M
	jle	.L48

	MOVAPS	1 * SIZE(AO1), %xmm1
	MOVAPS	3 * SIZE(AO1), %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)

	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L48:
	testq	$2, M
	jle	.L49

	MOVAPS	1 * SIZE(AO1), %xmm1

	shufpd	$1, %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	movaps	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L49:
	testq	$1, M
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0, -16 * SIZE(B)
	jmp	.L999
	ALIGN_4
.L50:
	/* Odd LDA: every other column of a group is 16-byte misaligned,
	   so pairs are stitched with movsd/shufpd instead of unpckl/hpd. */
	movq	N, J
	sarq	$3, J
	jle	.L60
	ALIGN_4

.L51:
	movq	A, AO1
	leaq	(A, LDA, 4), AO2
	leaq	(A, LDA, 8), A

	testq	$SIZE, A
	je	.L52

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3), %xmm3
	movsd	0 * SIZE(AO2), %xmm4
	movsd	0 * SIZE(AO2, LDA), %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3), %xmm7

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_3

.L52:
	/* Preload one aligned vector per misaligned column; only its
	   high half ({col[0]}) is meaningful, the low half is replaced
	   by movsd in the loop below. */
	MOVAPS	-1 * SIZE(AO1, LDA), %xmm9
	MOVAPS	-1 * SIZE(AO1, LDA3), %xmm10
	MOVAPS	-1 * SIZE(AO2, LDA), %xmm11
	MOVAPS	-1 * SIZE(AO2, LDA3), %xmm12

	movq	MM, I
	sarq	$3, I
	jle	.L54
	ALIGN_4

.L53:
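/* Odd-LDA 8-column block: even columns of the group are 16-byte
   aligned, odd columns are off by one element, so their aligned pairs
   are loaded at i-1/i+1 and stitched to the even column with
   movsd/shufpd. In C intrinsics (illustration only):

       __m128d even = _mm_load_pd(&col0[i]);           // {col0[i], col0[i+1]}
       __m128d odd  = _mm_load_pd(&col1[i + 1]);       // {col1[i+1], col1[i+2]}
       // carried from the previous step: prev = {col1[i-1], col1[i]}
       __m128d r0 = _mm_move_sd(prev, even);           // {col0[i],   col1[i]}
       __m128d r1 = _mm_shuffle_pd(even, odd, 1);      // {col0[i+1], col1[i+1]}
*/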
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	1 * SIZE(AO1, LDA3), %xmm3

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1, LDA)
#endif
	MOVAPS	0 * SIZE(AO2), %xmm4
	MOVAPS	1 * SIZE(AO2, LDA), %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	1 * SIZE(AO2, LDA3), %xmm7

	movsd	%xmm0, %xmm9
	movsd	%xmm2, %xmm10
	movsd	%xmm4, %xmm11
	movsd	%xmm6, %xmm12

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm9, -16 * SIZE(B)
	movaps	%xmm10, -14 * SIZE(B)
	movaps	%xmm11, -12 * SIZE(B)
	movaps	%xmm12, -10 * SIZE(B)

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm5, %xmm4
	shufpd	$1, %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif
	movaps	%xmm0, -8 * SIZE(B)
	movaps	%xmm2, -6 * SIZE(B)
	movaps	%xmm4, -4 * SIZE(B)
	movaps	%xmm6, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2)
#endif
	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm9
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	3 * SIZE(AO1, LDA3), %xmm10

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3)
#endif
	MOVAPS	2 * SIZE(AO2), %xmm4
	MOVAPS	3 * SIZE(AO2, LDA), %xmm11
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	3 * SIZE(AO2, LDA3), %xmm12

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	movsd	%xmm4, %xmm5
	movsd	%xmm6, %xmm7

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif
	movaps	%xmm1, 0 * SIZE(B)
	movaps	%xmm3, 2 * SIZE(B)
	movaps	%xmm5, 4 * SIZE(B)
	movaps	%xmm7, 6 * SIZE(B)

	shufpd	$1, %xmm9, %xmm0
	shufpd	$1, %xmm10, %xmm2
	shufpd	$1, %xmm11, %xmm4
	shufpd	$1, %xmm12, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif
	movaps	%xmm0, 8 * SIZE(B)
	movaps	%xmm2, 10 * SIZE(B)
	movaps	%xmm4, 12 * SIZE(B)
	movaps	%xmm6, 14 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2)
#endif
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	5 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	5 * SIZE(AO1, LDA3), %xmm3

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2, LDA)
#endif
	MOVAPS	4 * SIZE(AO2), %xmm4
	MOVAPS	5 * SIZE(AO2, LDA), %xmm5
	MOVAPS	4 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	5 * SIZE(AO2, LDA3), %xmm7

	movsd	%xmm0, %xmm9
	movsd	%xmm2, %xmm10
	movsd	%xmm4, %xmm11
	movsd	%xmm6, %xmm12

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B)
#endif
	movaps	%xmm9, 16 * SIZE(B)
	movaps	%xmm10, 18 * SIZE(B)
	movaps	%xmm11, 20 * SIZE(B)
	movaps	%xmm12, 22 * SIZE(B)

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm5, %xmm4
	shufpd	$1, %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif
	movaps	%xmm0, 24 * SIZE(B)
	movaps	%xmm2, 26 * SIZE(B)
	movaps	%xmm4, 28 * SIZE(B)
	movaps	%xmm6, 30 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2)
#endif
	MOVAPS	6 * SIZE(AO1), %xmm0
	MOVAPS	7 * SIZE(AO1, LDA), %xmm9
	MOVAPS	6 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	7 * SIZE(AO1, LDA3), %xmm10

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3)
#endif
	MOVAPS	6 * SIZE(AO2), %xmm4
	MOVAPS	7 * SIZE(AO2, LDA), %xmm11
	MOVAPS	6 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	7 * SIZE(AO2, LDA3), %xmm12

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	movsd	%xmm4, %xmm5
	movsd	%xmm6, %xmm7

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B)
#endif
	movaps	%xmm1, 32 * SIZE(B)
	movaps	%xmm3, 34 * SIZE(B)
	movaps	%xmm5, 36 * SIZE(B)
	movaps	%xmm7, 38 * SIZE(B)

	shufpd	$1, %xmm9, %xmm0
	shufpd	$1, %xmm10, %xmm2
	shufpd	$1, %xmm11, %xmm4
	shufpd	$1, %xmm12, %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B)
#endif
	movaps	%xmm0, 40 * SIZE(B)
	movaps	%xmm2, 42 * SIZE(B)
	movaps	%xmm4, 44 * SIZE(B)
	movaps	%xmm6, 46 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-64 * SIZE, B

	decq	I
	jg	.L53
	ALIGN_4

.L54:
	testq	$4, MM
	jle	.L56

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	1 * SIZE(AO1, LDA3), %xmm3
	MOVAPS	0 * SIZE(AO2), %xmm4
	MOVAPS	1 * SIZE(AO2, LDA), %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	1 * SIZE(AO2, LDA3), %xmm7

	movsd	%xmm0, %xmm9
	movsd	%xmm2, %xmm10
	movsd	%xmm4, %xmm11
	movsd	%xmm6, %xmm12

	movaps	%xmm9, -16 * SIZE(B)
	movaps	%xmm10, -14 * SIZE(B)
	movaps	%xmm11, -12 * SIZE(B)
	movaps	%xmm12, -10 * SIZE(B)

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm5, %xmm4
	shufpd	$1, %xmm7, %xmm6

	movaps	%xmm0, -8 * SIZE(B)
	movaps	%xmm2, -6 * SIZE(B)
	movaps	%xmm4, -4 * SIZE(B)
	movaps	%xmm6, -2 * SIZE(B)

	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm9
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	3 * SIZE(AO1, LDA3), %xmm10
	MOVAPS	2 * SIZE(AO2), %xmm4
	MOVAPS	3 * SIZE(AO2, LDA), %xmm11
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	3 * SIZE(AO2, LDA3), %xmm12

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	movsd	%xmm4, %xmm5
	movsd	%xmm6, %xmm7

	movaps	%xmm1, 0 * SIZE(B)
	movaps	%xmm3, 2 * SIZE(B)
	movaps	%xmm5, 4 * SIZE(B)
	movaps	%xmm7, 6 * SIZE(B)

	shufpd	$1, %xmm9, %xmm0
	shufpd	$1, %xmm10, %xmm2
	shufpd	$1, %xmm11, %xmm4
	shufpd	$1, %xmm12, %xmm6

	movaps	%xmm0, 8 * SIZE(B)
	movaps	%xmm2, 10 * SIZE(B)
	movaps	%xmm4, 12 * SIZE(B)
	movaps	%xmm6, 14 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-32 * SIZE, B
	ALIGN_4

.L56:
	testq	$2, MM
	jle	.L58

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	1 * SIZE(AO1, LDA3), %xmm3
	MOVAPS	0 * SIZE(AO2), %xmm4
	MOVAPS	1 * SIZE(AO2, LDA), %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	1 * SIZE(AO2, LDA3), %xmm7

	movsd	%xmm0, %xmm9
	movsd	%xmm2, %xmm10
	movsd	%xmm4, %xmm11
	movsd	%xmm6, %xmm12

	movaps	%xmm9, -16 * SIZE(B)
	movaps	%xmm10, -14 * SIZE(B)
	movaps	%xmm11, -12 * SIZE(B)
	movaps	%xmm12, -10 * SIZE(B)

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm5, %xmm4
	shufpd	$1, %xmm7, %xmm6

	movaps	%xmm0, -8 * SIZE(B)
	movaps	%xmm2, -6 * SIZE(B)
	movaps	%xmm4, -4 * SIZE(B)
	movaps	%xmm6, -2 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L58:
	testq	$1, MM
	jle	.L59

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3), %xmm3
	movsd	0 * SIZE(AO2), %xmm4
	movsd	0 * SIZE(AO2, LDA), %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3), %xmm7

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	subq	$-8 * SIZE, B
	ALIGN_4

.L59:
	decq	J
	jg	.L51
	ALIGN_4
.L60:
	testq	$4, N		/* four remaining columns, odd LDA */
	jle	.L70

	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	testq	$SIZE, A
	je	.L62

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L62:
	/* carried vectors for the two misaligned columns */
	movaps	-1 * SIZE(AO1, LDA), %xmm5
	movaps	-1 * SIZE(AO2, LDA), %xmm7

	movq	MM, I
	sarq	$3, I
	jle	.L64
	ALIGN_4

.L63:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm7, -14 * SIZE(B)
	movaps	%xmm0, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA)
#endif
	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2), %xmm2
	MOVAPS	3 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm5, %xmm0
	shufpd	$1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif
	movaps	%xmm1, -8 * SIZE(B)
	movaps	%xmm3, -6 * SIZE(B)
	movaps	%xmm0, -4 * SIZE(B)
	movaps	%xmm2, -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
#endif
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	5 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO2), %xmm2
	MOVAPS	5 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif
	movaps	%xmm5, 0 * SIZE(B)
	movaps	%xmm7, 2 * SIZE(B)
	movaps	%xmm0, 4 * SIZE(B)
	movaps	%xmm2, 6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA)
#endif
	MOVAPS	6 * SIZE(AO1), %xmm0
	MOVAPS	7 * SIZE(AO1, LDA), %xmm5
	MOVAPS	6 * SIZE(AO2), %xmm2
	MOVAPS	7 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm5, %xmm0
	shufpd	$1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif
	movaps	%xmm1, 8 * SIZE(B)
	movaps	%xmm3, 10 * SIZE(B)
	movaps	%xmm0, 12 * SIZE(B)
	movaps	%xmm2, 14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L63
	ALIGN_4

.L64:
	testq	$4, MM
	jle	.L66

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm3, %xmm2

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm7, -14 * SIZE(B)
	movaps	%xmm0, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

	MOVAPS	2 * SIZE(AO1), %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2), %xmm2
	MOVAPS	3 * SIZE(AO2, LDA), %xmm7

	movsd	%xmm0, %xmm1
	shufpd	$1, %xmm5, %xmm0
	movsd	%xmm2, %xmm3
	shufpd	$1, %xmm7, %xmm2

	movaps	%xmm1, -8 * SIZE(B)
	movaps	%xmm3, -6 * SIZE(B)
	movaps	%xmm0, -4 * SIZE(B)
	movaps	%xmm2, -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L66:
	testq	$2, MM
	jle	.L68

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2), %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	movsd	%xmm0, %xmm5
	movsd	%xmm2, %xmm7
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm3, %xmm2

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm7, -14 * SIZE(B)
	movaps	%xmm0, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L68:
	testq	$1, MM
	jle	.L70

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2), %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	subq	$-4 * SIZE, B
	ALIGN_4
.L70:
	testq	$2, N		/* two remaining columns, odd LDA */
	jle	.L80

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	testq	$SIZE, A
	je	.L72

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L72:
	MOVAPS	-1 * SIZE(AO2), %xmm5	/* carried vector for the misaligned column */

	movq	MM, I
	sarq	$3, I
	jle	.L74
	ALIGN_4
.L73:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	3 * SIZE(AO2), %xmm3

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm0, -14 * SIZE(B)
	movaps	%xmm1, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 4 * SIZE(AO2)
#endif
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	5 * SIZE(AO2), %xmm1
	MOVAPS	6 * SIZE(AO1), %xmm2
	MOVAPS	7 * SIZE(AO2), %xmm5

	movsd	%xmm0, %xmm3
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm1
	shufpd	$1, %xmm5, %xmm2

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B)
#endif
	movaps	%xmm3, -8 * SIZE(B)
	movaps	%xmm0, -6 * SIZE(B)
	movaps	%xmm1, -4 * SIZE(B)
	movaps	%xmm2, -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L73
	ALIGN_4

.L74:
	testq	$4, MM
	jle	.L76

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	3 * SIZE(AO2), %xmm3

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0
	movsd	%xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm0, -14 * SIZE(B)
	movaps	%xmm1, -12 * SIZE(B)
	movaps	%xmm2, -10 * SIZE(B)

	movaps	%xmm3, %xmm5

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L76:
	testq	$2, MM
	jle	.L78

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1

	movsd	%xmm0, %xmm5
	shufpd	$1, %xmm1, %xmm0

	movaps	%xmm5, -16 * SIZE(B)
	movaps	%xmm0, -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L78:
	testq	$1, MM
	jle	.L80

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	subq	$-2 * SIZE, B
	ALIGN_4
.L80:
	testq	$1, N		/* last column */
	jle	.L999

	movq	A, AO1

	testq	$SIZE, A
	jne	.L85		/* misaligned: same sliding window as .L45 */

	movq	MM, I
	sarq	$3, I
	jle	.L82
	ALIGN_4
.L81:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 8 * SIZE(AO1)
#endif
	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	4 * SIZE(AO1), %xmm4
	MOVAPS	6 * SIZE(AO1), %xmm6

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)
	movaps	%xmm4, -12 * SIZE(B)
	movaps	%xmm6, -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L81
	ALIGN_4

.L82:
	testq	$4, MM
	jle	.L83

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	2 * SIZE(AO1), %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L83:
	testq	$2, MM
	jle	.L84

	MOVAPS	0 * SIZE(AO1), %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L84:
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1), %xmm0
	movlpd	%xmm0, -16 * SIZE(B)
	jmp	.L999
	ALIGN_4
.L85:
	/* Misaligned last column: same sliding-window realignment as .L45. */
	MOVAPS	-1 * SIZE(AO1), %xmm0	/* window preload: {a[-1], a[0]} */

	movq	M, I
	sarq	$3, I
	jle	.L87
	ALIGN_4
.L86:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * 8 * SIZE(AO1)
#endif
	MOVAPS	1 * SIZE(AO1), %xmm1
	MOVAPS	3 * SIZE(AO1), %xmm2
	MOVAPS	5 * SIZE(AO1), %xmm3
	MOVAPS	7 * SIZE(AO1), %xmm4

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B)
#endif
	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)
	movaps	%xmm2, -12 * SIZE(B)
	movaps	%xmm3, -10 * SIZE(B)

	movaps	%xmm4, %xmm0

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L86
	ALIGN_4

.L87:
	testq	$4, M
	jle	.L88

	MOVAPS	1 * SIZE(AO1), %xmm1
	MOVAPS	3 * SIZE(AO1), %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)

	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L88:
	testq	$2, M
	jle	.L89

	MOVAPS	1 * SIZE(AO1), %xmm1

	shufpd	$1, %xmm1, %xmm0

	movaps	%xmm0, -16 * SIZE(B)

	movaps	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L89:
	testq	$1, M
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0, -16 * SIZE(B)
	ALIGN_4
.L999:
#ifdef WINDOWS_ABI
	movups	0(%rsp), %xmm6
	movups	16(%rsp), %xmm7
	movups	32(%rsp), %xmm8
	movups	48(%rsp), %xmm9
	movups	64(%rsp), %xmm10
	movups	80(%rsp), %xmm11
	movups	96(%rsp), %xmm12

	addq	$STACKSIZE, %rsp
#endif

	popq	%r12
	popq	%r13

#ifdef WINDOWS_ABI
	popq	%r14
	popq	%r15
#endif

	ret

	EPILOGUE