
dgemm_ncopy_4.S 22 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
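
/* DGEMM "ncopy" packing kernel: copies a panel of the column-major
   matrix A (leading dimension LDA) into the contiguous buffer B,
   four columns at a time, interleaving the columns row by row so the
   compute kernel can stream the packed data linearly.  Separate code
   paths handle 16-byte-aligned and misaligned column starts. */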
#define ASSEMBLER
#include "common.h"

#if defined(PENTIUM4) || defined(GENERIC)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef ATOM
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef NANO
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifdef OPTERON
#define PREFETCHSIZE 16
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#endif

#ifdef GENERIC
#define PREFETCHSIZE 16
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#endif

#ifndef WINDOWS_ABI
#define M ARG1   /* rdi */
#define N ARG2   /* rsi */
#define A ARG3   /* rdx */
#define LDA ARG4 /* rcx */
#define B ARG5   /* r8  */
#define I %r9
#else
#define STACKSIZE 256
#define M ARG1   /* rcx */
#define N ARG2   /* rdx */
#define A ARG3   /* r8  */
#define LDA ARG4 /* r9  */
#define OLD_B 40 + 32 + STACKSIZE(%rsp)
#define B %r14
#define I %r15
#endif

#define J %r10
#define AO1 %r11
#define AO2 %r12
#define MM %r13

        PROLOGUE
        PROFCODE

#ifdef WINDOWS_ABI
        pushq %r15
        pushq %r14
#endif
        pushq %r13
        pushq %r12

#ifdef WINDOWS_ABI
        subq $STACKSIZE, %rsp

        movups %xmm6, 0(%rsp)
        movups %xmm7, 16(%rsp)

        movq OLD_B, B
#endif
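
/* Setup: scale LDA from elements to bytes (SIZE is the element size,
   8 for double) and bias B forward by 16 elements so the packing
   stores below can use small negative displacements.  If A is
   misaligned, one leading row is peeled off (MM = M - 1); if LDA is
   an odd number of elements, adjacent columns can never share 16-byte
   alignment, so branch to the movsd/shufpd merge path at .L50. */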
        leaq (,LDA, SIZE), LDA
        subq $-16 * SIZE, B

        movq M, MM
        leaq -1(M), %rax
        testq $SIZE, A
        cmovne %rax, MM

        testq $SIZE, LDA
        jne .L50

        movq N, J
        sarq $2, J
        jle .L20
        ALIGN_4
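
/* .L11: aligned path, outer loop over groups of four columns
   (J = N >> 2).  AO1/AO2 track two column pairs; a misaligned
   leading row is peeled first so the movapd loads stay aligned. */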
.L11:
        movq A, AO1
        leaq (A, LDA, 2), AO2
        leaq (A, LDA, 4), A

        testq $SIZE, A
        je .L12

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO1, LDA), %xmm1
        movsd 0 * SIZE(AO2), %xmm2
        movsd 0 * SIZE(AO2, LDA), %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)

        addq $1 * SIZE, AO1
        addq $1 * SIZE, AO2
        subq $-4 * SIZE, B
        ALIGN_3

.L12:
        movq MM, I
        sarq $3, I
        jle .L14
        ALIGN_4
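
/* .L13: main loop, eight rows of four columns per iteration.  Each
   2x2 block of doubles is transposed with unpcklpd/unpckhpd so B
   receives the panel rows contiguously. */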
.L13:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO1)
#endif
        movapd 0 * SIZE(AO1), %xmm0
        movapd 0 * SIZE(AO1, LDA), %xmm1
        movapd 0 * SIZE(AO2), %xmm2
        movapd 0 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)
        movapd %xmm4, -12 * SIZE(B)
        movapd %xmm6, -10 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO1, LDA)
#endif
        movapd 2 * SIZE(AO1), %xmm0
        movapd 2 * SIZE(AO1, LDA), %xmm1
        movapd 2 * SIZE(AO2), %xmm2
        movapd 2 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif
        movapd %xmm0, -8 * SIZE(B)
        movapd %xmm2, -6 * SIZE(B)
        movapd %xmm4, -4 * SIZE(B)
        movapd %xmm6, -2 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO2)
#endif
        movapd 4 * SIZE(AO1), %xmm0
        movapd 4 * SIZE(AO1, LDA), %xmm1
        movapd 4 * SIZE(AO2), %xmm2
        movapd 4 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B)
#endif
        movapd %xmm0, 0 * SIZE(B)
        movapd %xmm2, 2 * SIZE(B)
        movapd %xmm4, 4 * SIZE(B)
        movapd %xmm6, 6 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO2, LDA)
#endif
        movapd 6 * SIZE(AO1), %xmm0
        movapd 6 * SIZE(AO1, LDA), %xmm1
        movapd 6 * SIZE(AO2), %xmm2
        movapd 6 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B)
#endif
        movapd %xmm0, 8 * SIZE(B)
        movapd %xmm2, 10 * SIZE(B)
        movapd %xmm4, 12 * SIZE(B)
        movapd %xmm6, 14 * SIZE(B)

        addq $8 * SIZE, AO1
        addq $8 * SIZE, AO2
        subq $-32 * SIZE, B

        decq I
        jg .L13
        ALIGN_4
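
/* Row-count tails: handle 4, then 2, then 1 remaining rows. */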
.L14:
        testq $4, MM
        jle .L16

        movapd 0 * SIZE(AO1), %xmm0
        movapd 0 * SIZE(AO1, LDA), %xmm1
        movapd 0 * SIZE(AO2), %xmm2
        movapd 0 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)
        movapd %xmm4, -12 * SIZE(B)
        movapd %xmm6, -10 * SIZE(B)

        movapd 2 * SIZE(AO1), %xmm0
        movapd 2 * SIZE(AO1, LDA), %xmm1
        movapd 2 * SIZE(AO2), %xmm2
        movapd 2 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

        movapd %xmm0, -8 * SIZE(B)
        movapd %xmm2, -6 * SIZE(B)
        movapd %xmm4, -4 * SIZE(B)
        movapd %xmm6, -2 * SIZE(B)

        addq $4 * SIZE, AO1
        addq $4 * SIZE, AO2
        subq $-16 * SIZE, B
        ALIGN_4

.L16:
        testq $2, MM
        jle .L18

        movapd 0 * SIZE(AO1), %xmm0
        movapd 0 * SIZE(AO1, LDA), %xmm1
        movapd 0 * SIZE(AO2), %xmm2
        movapd 0 * SIZE(AO2, LDA), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)
        movapd %xmm4, -12 * SIZE(B)
        movapd %xmm6, -10 * SIZE(B)

        addq $2 * SIZE, AO1
        addq $2 * SIZE, AO2
        subq $-8 * SIZE, B
        ALIGN_4

.L18:
        testq $1, MM
        jle .L19

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO1, LDA), %xmm1
        movsd 0 * SIZE(AO2), %xmm2
        movsd 0 * SIZE(AO2, LDA), %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)
        subq $-4 * SIZE, B
        ALIGN_4

.L19:
        decq J
        jg .L11
        ALIGN_4
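
/* .L20: copy a remaining pair of columns (N & 2), same transpose
   scheme with two source pointers. */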
.L20:
        testq $2, N
        jle .L30

        movq A, AO1
        leaq (A, LDA), AO2
        leaq (A, LDA, 2), A

        testq $SIZE, A
        je .L22

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO2), %xmm1

        unpcklpd %xmm1, %xmm0

        movapd %xmm0, -16 * SIZE(B)

        addq $1 * SIZE, AO1
        addq $1 * SIZE, AO2
        subq $-2 * SIZE, B
        ALIGN_3

.L22:
        movq MM, I
        sarq $3, I
        jle .L24
        ALIGN_4

.L23:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
#endif
        movapd 0 * SIZE(AO1), %xmm0
        movapd 0 * SIZE(AO2), %xmm1
        movapd 2 * SIZE(AO1), %xmm2
        movapd 2 * SIZE(AO2), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm4, -14 * SIZE(B)
        movapd %xmm2, -12 * SIZE(B)
        movapd %xmm6, -10 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
#endif
        movapd 4 * SIZE(AO1), %xmm0
        movapd 4 * SIZE(AO2), %xmm1
        movapd 6 * SIZE(AO1), %xmm2
        movapd 6 * SIZE(AO2), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm1, %xmm4
        unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif
        movapd %xmm0, -8 * SIZE(B)
        movapd %xmm4, -6 * SIZE(B)
        movapd %xmm2, -4 * SIZE(B)
        movapd %xmm6, -2 * SIZE(B)

        addq $8 * SIZE, AO1
        addq $8 * SIZE, AO2
        subq $-16 * SIZE, B

        decq I
        jg .L23
        ALIGN_4

.L24:
        testq $4, MM
        jle .L26

        movapd 0 * SIZE(AO1), %xmm0
        movapd 0 * SIZE(AO2), %xmm1
        movapd 2 * SIZE(AO1), %xmm2
        movapd 2 * SIZE(AO2), %xmm3

        movapd %xmm0, %xmm4
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm4
        movapd %xmm2, %xmm6
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm3, %xmm6

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm4, -14 * SIZE(B)
        movapd %xmm2, -12 * SIZE(B)
        movapd %xmm6, -10 * SIZE(B)

        addq $4 * SIZE, AO1
        addq $4 * SIZE, AO2
        subq $-8 * SIZE, B
        ALIGN_4

.L26:
        testq $2, MM
        jle .L28

        movapd 0 * SIZE(AO1), %xmm0
        movapd 0 * SIZE(AO2), %xmm1

        movapd %xmm0, %xmm2
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm2

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)

        addq $2 * SIZE, AO1
        addq $2 * SIZE, AO2
        subq $-4 * SIZE, B
        ALIGN_4

.L28:
        testq $1, MM
        jle .L30

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO2), %xmm1

        unpcklpd %xmm1, %xmm0

        movapd %xmm0, -16 * SIZE(B)
        subq $-2 * SIZE, B
        ALIGN_4
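
/* .L30: copy the last single column (N & 1): a plain block copy when
   AO1 is aligned, otherwise the shufpd realignment path at .L35. */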
.L30:
        testq $1, N
        jle .L999

        movq A, AO1

        testq $SIZE, A
        jne .L35

        movq MM, I
        sarq $3, I
        jle .L32
        ALIGN_4

.L31:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
#endif
        movapd 0 * SIZE(AO1), %xmm0
        movapd 2 * SIZE(AO1), %xmm1
        movapd 4 * SIZE(AO1), %xmm2
        movapd 6 * SIZE(AO1), %xmm3

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm1, -14 * SIZE(B)
        movapd %xmm2, -12 * SIZE(B)
        movapd %xmm3, -10 * SIZE(B)

        addq $8 * SIZE, AO1
        subq $-8 * SIZE, B

        decq I
        jg .L31
        ALIGN_4

.L32:
        testq $4, MM
        jle .L33

        movapd 0 * SIZE(AO1), %xmm0
        movapd 2 * SIZE(AO1), %xmm1

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm1, -14 * SIZE(B)

        addq $4 * SIZE, AO1
        subq $-4 * SIZE, B
        ALIGN_4

.L33:
        testq $2, MM
        jle .L34

        movapd 0 * SIZE(AO1), %xmm0

        movapd %xmm0, -16 * SIZE(B)

        addq $2 * SIZE, AO1
        subq $-2 * SIZE, B
        ALIGN_4

.L34:
        testq $1, MM
        jle .L999

        movsd 0 * SIZE(AO1), %xmm0

        movlpd %xmm0, -16 * SIZE(B)
        jmp .L999
        ALIGN_4
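
/* .L35: single-column copy with AO1 misaligned.  The previous vector
   is carried in %xmm0 and combined via shufpd with the next aligned
   load to form each aligned pair of doubles for B. */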
.L35:
        movapd -1 * SIZE(AO1), %xmm0

        movq MM, I
        sarq $3, I
        jle .L37
        ALIGN_4

.L36:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
#endif
        movapd 1 * SIZE(AO1), %xmm1
        movapd 3 * SIZE(AO1), %xmm2
        movapd 5 * SIZE(AO1), %xmm3
        movapd 7 * SIZE(AO1), %xmm4

        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm2, %xmm1
        shufpd $1, %xmm3, %xmm2
        shufpd $1, %xmm4, %xmm3

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm1, -14 * SIZE(B)
        movapd %xmm2, -12 * SIZE(B)
        movapd %xmm3, -10 * SIZE(B)

        movapd %xmm4, %xmm0

        addq $8 * SIZE, AO1
        subq $-8 * SIZE, B

        decq I
        jg .L36
        ALIGN_4

.L37:
        testq $4, MM
        jle .L38

        movapd 1 * SIZE(AO1), %xmm1
        movapd 3 * SIZE(AO1), %xmm2

        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm2, %xmm1

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm1, -14 * SIZE(B)

        movapd %xmm2, %xmm0

        addq $4 * SIZE, AO1
        addq $4 * SIZE, B
        ALIGN_4

.L38:
        testq $2, MM
        jle .L39

        movapd 1 * SIZE(AO1), %xmm1

        shufpd $1, %xmm1, %xmm0

        movapd %xmm0, -16 * SIZE(B)

        movapd %xmm1, %xmm0

        addq $2 * SIZE, AO1
        subq $-2 * SIZE, B
        ALIGN_4

.L39:
        testq $1, MM
        jle .L999

        shufpd $1, %xmm0, %xmm0

        movlpd %xmm0, -16 * SIZE(B)
        jmp .L999
        ALIGN_4
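
/* .L50: odd-LDA path for four-column groups.  Columns 0/2 (AO1, AO2)
   and columns 1/3 (AO1 + LDA, AO2 + LDA) differ in 16-byte alignment,
   so the odd columns are read one element ahead and merged with
   movsd/shufpd. */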
.L50:
        movq N, J
        sarq $2, J
        jle .L60
        ALIGN_4

.L51:
        movq A, AO1
        leaq (A, LDA, 2), AO2
        leaq (A, LDA, 4), A

        testq $SIZE, A
        je .L52

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO1, LDA), %xmm1
        movsd 0 * SIZE(AO2), %xmm2
        movsd 0 * SIZE(AO2, LDA), %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)

        addq $1 * SIZE, AO1
        addq $1 * SIZE, AO2
        subq $-4 * SIZE, B
        ALIGN_3

.L52:
        movapd -1 * SIZE(AO1, LDA), %xmm5
        movapd -1 * SIZE(AO2, LDA), %xmm7

        movq MM, I
        sarq $3, I
        jle .L54
        ALIGN_4

.L53:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO1)
#endif
        movapd 0 * SIZE(AO1), %xmm0
        movapd 1 * SIZE(AO1, LDA), %xmm1
        movapd 0 * SIZE(AO2), %xmm2
        movapd 1 * SIZE(AO2, LDA), %xmm3

        movsd %xmm0, %xmm5
        movsd %xmm2, %xmm7
        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm3, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm5, -16 * SIZE(B)
        movapd %xmm7, -14 * SIZE(B)
        movapd %xmm0, -12 * SIZE(B)
        movapd %xmm2, -10 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO1, LDA)
#endif
        movapd 2 * SIZE(AO1), %xmm0
        movapd 3 * SIZE(AO1, LDA), %xmm5
        movapd 2 * SIZE(AO2), %xmm2
        movapd 3 * SIZE(AO2, LDA), %xmm7

        movsd %xmm0, %xmm1
        movsd %xmm2, %xmm3
        shufpd $1, %xmm5, %xmm0
        shufpd $1, %xmm7, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
#endif
        movapd %xmm1, -8 * SIZE(B)
        movapd %xmm3, -6 * SIZE(B)
        movapd %xmm0, -4 * SIZE(B)
        movapd %xmm2, -2 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO2)
#endif
        movapd 4 * SIZE(AO1), %xmm0
        movapd 5 * SIZE(AO1, LDA), %xmm1
        movapd 4 * SIZE(AO2), %xmm2
        movapd 5 * SIZE(AO2, LDA), %xmm3

        movsd %xmm0, %xmm5
        movsd %xmm2, %xmm7
        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm3, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B)
#endif
        movapd %xmm5, 0 * SIZE(B)
        movapd %xmm7, 2 * SIZE(B)
        movapd %xmm0, 4 * SIZE(B)
        movapd %xmm2, 6 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * SIZE(AO2, LDA)
#endif
        movapd 6 * SIZE(AO1), %xmm0
        movapd 7 * SIZE(AO1, LDA), %xmm5
        movapd 6 * SIZE(AO2), %xmm2
        movapd 7 * SIZE(AO2, LDA), %xmm7

        movsd %xmm0, %xmm1
        movsd %xmm2, %xmm3
        shufpd $1, %xmm5, %xmm0
        shufpd $1, %xmm7, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B)
#endif
        movapd %xmm1, 8 * SIZE(B)
        movapd %xmm3, 10 * SIZE(B)
        movapd %xmm0, 12 * SIZE(B)
        movapd %xmm2, 14 * SIZE(B)

        addq $8 * SIZE, AO1
        addq $8 * SIZE, AO2
        subq $-32 * SIZE, B

        decq I
        jg .L53
        ALIGN_4

.L54:
        testq $4, MM
        jle .L56

        movapd 0 * SIZE(AO1), %xmm0
        movapd 1 * SIZE(AO1, LDA), %xmm1
        movapd 0 * SIZE(AO2), %xmm2
        movapd 1 * SIZE(AO2, LDA), %xmm3

        movsd %xmm0, %xmm5
        shufpd $1, %xmm1, %xmm0
        movsd %xmm2, %xmm7
        shufpd $1, %xmm3, %xmm2

        movapd %xmm5, -16 * SIZE(B)
        movapd %xmm7, -14 * SIZE(B)
        movapd %xmm0, -12 * SIZE(B)
        movapd %xmm2, -10 * SIZE(B)

        movapd 2 * SIZE(AO1), %xmm0
        movapd 3 * SIZE(AO1, LDA), %xmm5
        movapd 2 * SIZE(AO2), %xmm2
        movapd 3 * SIZE(AO2, LDA), %xmm7

        movsd %xmm0, %xmm1
        shufpd $1, %xmm5, %xmm0
        movsd %xmm2, %xmm3
        shufpd $1, %xmm7, %xmm2

        movapd %xmm1, -8 * SIZE(B)
        movapd %xmm3, -6 * SIZE(B)
        movapd %xmm0, -4 * SIZE(B)
        movapd %xmm2, -2 * SIZE(B)

        addq $4 * SIZE, AO1
        addq $4 * SIZE, AO2
        subq $-16 * SIZE, B
        ALIGN_4

.L56:
        testq $2, MM
        jle .L58

        movapd 0 * SIZE(AO1), %xmm0
        movapd 1 * SIZE(AO1, LDA), %xmm1
        movapd 0 * SIZE(AO2), %xmm2
        movapd 1 * SIZE(AO2, LDA), %xmm3

        movsd %xmm0, %xmm5
        movsd %xmm2, %xmm7
        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm3, %xmm2

        movapd %xmm5, -16 * SIZE(B)
        movapd %xmm7, -14 * SIZE(B)
        movapd %xmm0, -12 * SIZE(B)
        movapd %xmm2, -10 * SIZE(B)

        addq $2 * SIZE, AO1
        addq $2 * SIZE, AO2
        subq $-8 * SIZE, B
        ALIGN_4

.L58:
        testq $1, MM
        jle .L59

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO1, LDA), %xmm1
        movsd 0 * SIZE(AO2), %xmm2
        movsd 0 * SIZE(AO2, LDA), %xmm3

        unpcklpd %xmm1, %xmm0
        unpcklpd %xmm3, %xmm2

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)
        subq $-4 * SIZE, B
        ALIGN_4

.L59:
        decq J
        jg .L51
        ALIGN_4
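
/* .L60: two-column tail on the odd-LDA path; AO2 = AO1 + LDA is
   misaligned and is handled with the same movsd/shufpd merge. */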
.L60:
        testq $2, N
        jle .L70

        movq A, AO1
        leaq (A, LDA), AO2
        leaq (A, LDA, 2), A

        testq $SIZE, A
        je .L62

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO2), %xmm1

        unpcklpd %xmm1, %xmm0

        movapd %xmm0, -16 * SIZE(B)

        addq $1 * SIZE, AO1
        addq $1 * SIZE, AO2
        subq $-2 * SIZE, B
        ALIGN_3

.L62:
        movapd -1 * SIZE(AO2), %xmm5

        movq MM, I
        sarq $3, I
        jle .L64
        ALIGN_4

.L63:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
#endif
        movapd 0 * SIZE(AO1), %xmm0
        movapd 1 * SIZE(AO2), %xmm1
        movapd 2 * SIZE(AO1), %xmm2
        movapd 3 * SIZE(AO2), %xmm3

        movsd %xmm0, %xmm5
        shufpd $1, %xmm1, %xmm0
        movsd %xmm2, %xmm1
        shufpd $1, %xmm3, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm5, -16 * SIZE(B)
        movapd %xmm0, -14 * SIZE(B)
        movapd %xmm1, -12 * SIZE(B)
        movapd %xmm2, -10 * SIZE(B)

#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
#endif
        movapd 4 * SIZE(AO1), %xmm0
        movapd 5 * SIZE(AO2), %xmm1
        movapd 6 * SIZE(AO1), %xmm2
        movapd 7 * SIZE(AO2), %xmm5

        movsd %xmm0, %xmm3
        shufpd $1, %xmm1, %xmm0
        movsd %xmm2, %xmm1
        shufpd $1, %xmm5, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm3, -8 * SIZE(B)
        movapd %xmm0, -6 * SIZE(B)
        movapd %xmm1, -4 * SIZE(B)
        movapd %xmm2, -2 * SIZE(B)

        addq $8 * SIZE, AO1
        addq $8 * SIZE, AO2
        subq $-16 * SIZE, B

        decq I
        jg .L63
        ALIGN_4

.L64:
        testq $4, MM
        jle .L66

        movapd 0 * SIZE(AO1), %xmm0
        movapd 1 * SIZE(AO2), %xmm1
        movapd 2 * SIZE(AO1), %xmm2
        movapd 3 * SIZE(AO2), %xmm3

        movsd %xmm0, %xmm5
        shufpd $1, %xmm1, %xmm0
        movsd %xmm2, %xmm1
        shufpd $1, %xmm3, %xmm2

        movapd %xmm5, -16 * SIZE(B)
        movapd %xmm0, -14 * SIZE(B)
        movapd %xmm1, -12 * SIZE(B)
        movapd %xmm2, -10 * SIZE(B)

        movaps %xmm3, %xmm5

        addq $4 * SIZE, AO1
        addq $4 * SIZE, AO2
        subq $-8 * SIZE, B
        ALIGN_4

.L66:
        testq $2, MM
        jle .L68

        movapd 0 * SIZE(AO1), %xmm0
        movapd 1 * SIZE(AO2), %xmm1

        movsd %xmm0, %xmm5
        shufpd $1, %xmm1, %xmm0

        movapd %xmm5, -16 * SIZE(B)
        movapd %xmm0, -14 * SIZE(B)

        addq $2 * SIZE, AO1
        addq $2 * SIZE, AO2
        subq $-4 * SIZE, B
        ALIGN_4

.L68:
        testq $1, MM
        jle .L70

        movsd 0 * SIZE(AO1), %xmm0
        movsd 0 * SIZE(AO2), %xmm1

        unpcklpd %xmm1, %xmm0

        movapd %xmm0, -16 * SIZE(B)
        subq $-2 * SIZE, B
        ALIGN_4

.L70:
        testq $1, N
        jle .L999

        movq A, AO1

        testq $SIZE, A
        jne .L75

        movq MM, I
        sarq $3, I
        jle .L72
        ALIGN_4

.L71:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
#endif
        movapd 0 * SIZE(AO1), %xmm0
        movapd 2 * SIZE(AO1), %xmm2
        movapd 4 * SIZE(AO1), %xmm4
        movapd 6 * SIZE(AO1), %xmm6

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)
        movapd %xmm4, -12 * SIZE(B)
        movapd %xmm6, -10 * SIZE(B)

        addq $8 * SIZE, AO1
        subq $-8 * SIZE, B

        decq I
        jg .L71
        ALIGN_4

.L72:
        testq $4, MM
        jle .L73

        movapd 0 * SIZE(AO1), %xmm0
        movapd 2 * SIZE(AO1), %xmm2

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm2, -14 * SIZE(B)

        addq $4 * SIZE, AO1
        subq $-4 * SIZE, B
        ALIGN_4

.L73:
        testq $2, MM
        jle .L74

        movapd 0 * SIZE(AO1), %xmm0

        movapd %xmm0, -16 * SIZE(B)

        addq $2 * SIZE, AO1
        subq $-2 * SIZE, B
        ALIGN_4

.L74:
        testq $1, MM
        jle .L999

        movsd 0 * SIZE(AO1), %xmm0

        movlpd %xmm0, -16 * SIZE(B)
        jmp .L999
        ALIGN_4
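
/* .L75: single-column tail with a misaligned start, using the same
   shufpd realignment as .L35. */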
.L75:
        movapd -1 * SIZE(AO1), %xmm0

        movq MM, I
        sarq $3, I
        jle .L77
        ALIGN_4

.L76:
#ifdef PREFETCH
        PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
#endif
        movapd 1 * SIZE(AO1), %xmm1
        movapd 3 * SIZE(AO1), %xmm2
        movapd 5 * SIZE(AO1), %xmm3
        movapd 7 * SIZE(AO1), %xmm4

        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm2, %xmm1
        shufpd $1, %xmm3, %xmm2
        shufpd $1, %xmm4, %xmm3

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
#endif
        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm1, -14 * SIZE(B)
        movapd %xmm2, -12 * SIZE(B)
        movapd %xmm3, -10 * SIZE(B)

        movapd %xmm4, %xmm0

        addq $8 * SIZE, AO1
        subq $-8 * SIZE, B

        decq I
        jg .L76
        ALIGN_4

.L77:
        testq $4, MM
        jle .L78

        movapd 1 * SIZE(AO1), %xmm1
        movapd 3 * SIZE(AO1), %xmm2

        shufpd $1, %xmm1, %xmm0
        shufpd $1, %xmm2, %xmm1

        movapd %xmm0, -16 * SIZE(B)
        movapd %xmm1, -14 * SIZE(B)

        movapd %xmm2, %xmm0

        addq $4 * SIZE, AO1
        addq $4 * SIZE, B
        ALIGN_4

.L78:
        testq $2, MM
        jle .L79

        movapd 1 * SIZE(AO1), %xmm1

        shufpd $1, %xmm1, %xmm0

        movapd %xmm0, -16 * SIZE(B)

        movapd %xmm1, %xmm0

        addq $2 * SIZE, AO1
        subq $-2 * SIZE, B
        ALIGN_4

.L79:
        testq $1, MM
        jle .L999

        shufpd $1, %xmm0, %xmm0

        movlpd %xmm0, -16 * SIZE(B)
        ALIGN_4
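
/* .L999: restore callee-saved registers and return. */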
.L999:
#ifdef WINDOWS_ABI
        movups 0(%rsp), %xmm6
        movups 16(%rsp), %xmm7

        addq $STACKSIZE, %rsp
#endif
        popq %r12
        popq %r13

#ifdef WINDOWS_ABI
        popq %r14
        popq %r15
#endif
        ret
        EPILOGUE