You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_ncopy_2.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time configuration and register/argument naming for the copy
     routine in this file. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* NEHALEM and SANDYBRIDGE builds enable software prefetch; both map PREFETCH
     and PREFETCHW to prefetcht0. When neither is defined here, the
     #ifdef PREFETCH / #ifdef PREFETCHW sections in the code depend on whether
     common.h (not visible here) defines those macros. */
  40. #ifdef NEHALEM
  41. #define PREFETCHSIZE 16
  42. #define PREFETCH prefetcht0
  43. #define PREFETCHW prefetcht0
  44. #endif
  45. #ifdef SANDYBRIDGE
  46. #define PREFETCHSIZE 16
  47. #define PREFETCH prefetcht0
  48. #define PREFETCHW prefetcht0
  49. #endif
  /* MOVAPS is the instruction used for the 16-byte loads from A; it defaults
     to movaps unless something earlier already defined it. */
  50. #ifndef MOVAPS
  51. #define MOVAPS movaps
  52. #endif
  /* Argument mapping when WINDOWS_ABI is not defined: all five arguments are
     used directly from ARG1..ARG5, and I (inner loop counter) lives in r9. */
  53. #ifndef WINDOWS_ABI
  54. #define M ARG1 /* rdi */
  55. #define N ARG2 /* rsi */
  56. #define A ARG3 /* rdx */
  57. #define LDA ARG4 /* rcx */
  58. #define B ARG5 /* r8 */
  59. #define I %r9
  60. #else
  /* WINDOWS_ABI: the fifth argument is read from the stack slot OLD_B into
     r14 by the prologue. Presumably 40 = return address + 32-byte shadow
     space and 32 = the four pushq instructions of the prologue, with
     STACKSIZE added for the local area - confirm against the prologue. */
  61. #define STACKSIZE 256
  62. #define M ARG1 /* rcx */
  63. #define N ARG2 /* rdx */
  64. #define A ARG3 /* r8 */
  65. #define LDA ARG4 /* r9 */
  66. #define OLD_B 40 + 32 + STACKSIZE(%rsp)
  67. #define B %r14
  68. #define I %r15
  69. #endif
  /* Scratch registers common to both ABIs:
     J   - column-pair counter (loaded from N >> 1)
     AO1 - read pointer into the first column being copied
     AO2 - read pointer into the second column (AO1's column + LDA)
     MM  - row count used by the two-column paths (M, or M - 1) */
  70. #define J %r10
  71. #define AO1 %r11
  72. #define AO2 %r12
  73. #define MM %r13
  /* Entry point (name supplied by the PROLOGUE macro from common.h).
     Arguments: M, N, A, LDA, B as mapped by the #defines above.
     The code below copies data from A (two columns at a time, LDA apart)
     into B with the two columns interleaved element by element, then copies
     a remaining single column when N is odd. */
  74. PROLOGUE
  75. PROFCODE
  /* Save the registers this routine uses as B/I (Windows only) and AO2/MM. */
  76. #ifdef WINDOWS_ABI
  77. pushq %r15
  78. pushq %r14
  79. #endif
  80. pushq %r13
  81. pushq %r12
  82. #ifdef WINDOWS_ABI
  /* Reserve STACKSIZE bytes, save xmm6 (used as scratch by the two-column
     code) at the bottom of that area, and fetch B from its stack slot. */
  83. subq $STACKSIZE, %rsp
  84. movups %xmm6, 0(%rsp)
  85. movq OLD_B, B
  86. #endif
  /* LDA = LDA * SIZE (SIZE is used as the scale factor, so LDA becomes a
     byte stride assuming SIZE is the element size - defined in common.h). */
  87. leaq (,LDA, SIZE), LDA
  /* B += 16 * SIZE: every store below addresses B with a displacement that
     starts at -16 * SIZE. */
  88. subq $-16 * SIZE, B
  /* MM = M, or M - 1 when the SIZE bit of A is set. In that case the
     two-column paths copy one leading row separately before their main loop.
     NOTE(review): with M == 0 and the SIZE bit of A set, MM becomes -1 and the
     later bit tests on MM would all succeed - presumably callers guarantee
     M >= 1; confirm. */
  89. movq M, MM
  90. leaq -1(M), %rax
  91. testq $SIZE, A
  92. cmovne %rax, MM
  /* If the SIZE bit of the scaled LDA is set, use the .L50 path, which loads
     the second column at shifted offsets. */
  93. testq $SIZE, LDA
  94. jne .L50
  /* J = N / 2 column pairs; with none, go to the single-column code. */
  95. movq N, J
  96. sarq $1, J
  97. jle .L30
  98. ALIGN_4
  /* .L21: start of one pass over a pair of columns (path taken when the SIZE
     bit of the scaled LDA is clear). AO1 = current A, AO2 = A + LDA, and A is
     advanced by 2 * LDA for the next pass. */
  99. .L21:
  100. movq A, AO1
  101. leaq (A, LDA), AO2
  102. leaq (A, LDA, 2), A
  /* If the SIZE bit of the (already advanced) A is set, copy one row with
     scalar loads first and step both column pointers by one element.
     Presumably this is what makes the MOVAPS loads below 16-byte aligned;
     MM was already reduced by one for this case in the setup code. */
  103. testq $SIZE, A
  104. je .L22
  105. movsd 0 * SIZE(AO1), %xmm0
  106. movsd 0 * SIZE(AO2), %xmm1
  /* xmm0 = (AO1[0], AO2[0]) */
  107. unpcklpd %xmm1, %xmm0
  108. movaps %xmm0, -16 * SIZE(B)
  109. addq $1 * SIZE, AO1
  110. addq $1 * SIZE, AO2
  /* B += 2 elements */
  111. subq $-2 * SIZE, B
  112. ALIGN_3
  /* .L22: I = MM / 8 iterations of the 8-row loop; skip it when there are
     none. */
  113. .L22:
  114. movq MM, I
  115. sarq $3, I
  116. jle .L24
  117. ALIGN_4
  /* .L23: main loop. Each iteration reads 8 elements from each of AO1 and
     AO2 and writes 16 elements to B as interleaved pairs
     (AO1[i], AO2[i]), handled as two groups of 4 rows. */
  118. .L23:
  119. #ifdef PREFETCH
  120. PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
  121. #endif
  /* Rows 0..3: load two 2-element vectors from each column. */
  122. MOVAPS 0 * SIZE(AO1), %xmm0
  123. MOVAPS 0 * SIZE(AO2), %xmm1
  124. MOVAPS 2 * SIZE(AO1), %xmm2
  125. MOVAPS 2 * SIZE(AO2), %xmm3
  /* Keep copies in xmm4/xmm6 so both the low and the high elements can be
     paired: unpcklpd gives (AO1[k], AO2[k]), unpckhpd gives
     (AO1[k+1], AO2[k+1]). */
  126. movaps %xmm0, %xmm4
  127. unpcklpd %xmm1, %xmm0
  128. movaps %xmm2, %xmm6
  129. unpcklpd %xmm3, %xmm2
  130. unpckhpd %xmm1, %xmm4
  131. unpckhpd %xmm3, %xmm6
  132. #ifdef PREFETCHW
  133. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  134. #endif
  /* Store rows 0, 1, 2, 3 in order. */
  135. movaps %xmm0, -16 * SIZE(B)
  136. movaps %xmm4, -14 * SIZE(B)
  137. movaps %xmm2, -12 * SIZE(B)
  138. movaps %xmm6, -10 * SIZE(B)
  139. #ifdef PREFETCH
  140. PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
  141. #endif
  /* Rows 4..7: same load / interleave / store pattern. */
  142. MOVAPS 4 * SIZE(AO1), %xmm0
  143. MOVAPS 4 * SIZE(AO2), %xmm1
  144. MOVAPS 6 * SIZE(AO1), %xmm2
  145. MOVAPS 6 * SIZE(AO2), %xmm3
  146. movaps %xmm0, %xmm4
  147. unpcklpd %xmm1, %xmm0
  148. unpckhpd %xmm1, %xmm4
  149. movaps %xmm2, %xmm6
  150. unpcklpd %xmm3, %xmm2
  151. unpckhpd %xmm3, %xmm6
  152. #ifdef PREFETCHW
  153. PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B)
  154. #endif
  155. movaps %xmm0, -8 * SIZE(B)
  156. movaps %xmm4, -6 * SIZE(B)
  157. movaps %xmm2, -4 * SIZE(B)
  158. movaps %xmm6, -2 * SIZE(B)
  /* Advance: 8 rows consumed from each column, 16 elements written. */
  159. addq $8 * SIZE, AO1
  160. addq $8 * SIZE, AO2
  161. subq $-16 * SIZE, B
  162. decq I
  163. jg .L23
  164. ALIGN_4
  /* .L24: tail of the two-column copy - 4 more rows if bit 2 of MM is set.
     (After testq the result is either 0 or the tested bit, so jle here is
     taken exactly when the bit is clear.) */
  165. .L24:
  166. testq $4, MM
  167. jle .L26
  168. MOVAPS 0 * SIZE(AO1), %xmm0
  169. MOVAPS 0 * SIZE(AO2), %xmm1
  170. MOVAPS 2 * SIZE(AO1), %xmm2
  171. MOVAPS 2 * SIZE(AO2), %xmm3
  /* Interleave into (AO1[k], AO2[k]) pairs for k = 0..3. */
  172. movaps %xmm0, %xmm4
  173. unpcklpd %xmm1, %xmm0
  174. unpckhpd %xmm1, %xmm4
  175. movaps %xmm2, %xmm6
  176. unpcklpd %xmm3, %xmm2
  177. unpckhpd %xmm3, %xmm6
  178. movaps %xmm0, -16 * SIZE(B)
  179. movaps %xmm4, -14 * SIZE(B)
  180. movaps %xmm2, -12 * SIZE(B)
  181. movaps %xmm6, -10 * SIZE(B)
  182. addq $4 * SIZE, AO1
  183. addq $4 * SIZE, AO2
  184. subq $-8 * SIZE, B
  185. ALIGN_4
  /* .L26: 2 more rows if bit 1 of MM is set. */
  186. .L26:
  187. testq $2, MM
  188. jle .L28
  189. MOVAPS 0 * SIZE(AO1), %xmm0
  190. MOVAPS 0 * SIZE(AO2), %xmm1
  191. movaps %xmm0, %xmm2
  192. unpcklpd %xmm1, %xmm0
  193. unpckhpd %xmm1, %xmm2
  194. movaps %xmm0, -16 * SIZE(B)
  195. movaps %xmm2, -14 * SIZE(B)
  196. addq $2 * SIZE, AO1
  197. addq $2 * SIZE, AO2
  198. subq $-4 * SIZE, B
  199. ALIGN_4
  /* .L28: one last row if bit 0 of MM is set, using scalar loads. AO1/AO2
     are not advanced because they are re-derived from A at the top of the
     next column pair. */
  200. .L28:
  201. testq $1, MM
  202. jle .L29
  203. movsd 0 * SIZE(AO1), %xmm0
  204. movsd 0 * SIZE(AO2), %xmm1
  205. unpcklpd %xmm1, %xmm0
  206. movaps %xmm0, -16 * SIZE(B)
  207. subq $-2 * SIZE, B
  208. ALIGN_4
  /* .L29: next column pair, if any remain. */
  209. .L29:
  210. decq J
  211. jg .L21
  212. ALIGN_4
  /* .L30: remainder handling - if N is odd, one column is still to be copied;
     otherwise the routine is finished. */
  213. .L30:
  214. testq $1, N
  215. jle .L999
  /* .L30x: single-column copy. AO1 = A. If the SIZE bit of A is set, the
     shifted-load variant at .L35 is used instead of the straight copy below.
     This path uses M (not MM) as the row count. */
  216. .L30x:
  217. movq A, AO1
  218. testq $SIZE, A
  219. jne .L35
  /* I = M / 8 iterations of the 8-element loop; skip it when there are
     none. */
  220. movq M, I
  221. sarq $3, I
  222. jle .L32
  223. ALIGN_4
  /* .L31: straight copy of 8 elements per iteration from AO1 to B. */
  224. .L31:
  225. #ifdef PREFETCH
  226. PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
  227. #endif
  228. MOVAPS 0 * SIZE(AO1), %xmm0
  229. MOVAPS 2 * SIZE(AO1), %xmm1
  230. MOVAPS 4 * SIZE(AO1), %xmm2
  231. MOVAPS 6 * SIZE(AO1), %xmm3
  232. #ifdef PREFETCHW
  233. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  234. #endif
  235. movaps %xmm0, -16 * SIZE(B)
  236. movaps %xmm1, -14 * SIZE(B)
  237. movaps %xmm2, -12 * SIZE(B)
  238. movaps %xmm3, -10 * SIZE(B)
  239. addq $8 * SIZE, AO1
  240. addq $8 * SIZE, B
  241. decq I
  242. jg .L31
  243. ALIGN_4
  /* .L32: 4 more elements if bit 2 of M is set. */
  244. .L32:
  245. testq $4, M
  246. jle .L33
  247. MOVAPS 0 * SIZE(AO1), %xmm0
  248. MOVAPS 2 * SIZE(AO1), %xmm1
  249. movaps %xmm0, -16 * SIZE(B)
  250. movaps %xmm1, -14 * SIZE(B)
  251. addq $4 * SIZE, AO1
  252. subq $-4 * SIZE, B
  253. ALIGN_4
  /* .L33: 2 more elements if bit 1 of M is set. */
  254. .L33:
  255. testq $2, M
  256. jle .L34
  257. MOVAPS 0 * SIZE(AO1), %xmm0
  258. movaps %xmm0, -16 * SIZE(B)
  259. addq $2 * SIZE, AO1
  260. subq $-2 * SIZE, B
  261. ALIGN_4
  /* .L34: final single element if bit 0 of M is set; movlpd writes only the
     low 8 bytes of xmm0. */
  262. .L34:
  263. testq $1, M
  264. jle .L999
  265. movsd 0 * SIZE(AO1), %xmm0
  266. movlpd %xmm0, -16 * SIZE(B)
  267. jmp .L999
  268. ALIGN_4
  /* .L35: single-column copy for the case where the SIZE bit of A is set.
     All 16-byte loads are taken at odd element offsets from AO1
     (-1, 1, 3, ...), so every loaded pair straddles two output pairs.
     xmm0 carries the previously loaded pair; its high element is the next
     element to be written to B. */
  269. .L35:
  270. movaps -1 * SIZE(AO1), %xmm0
  /* I = M / 8 iterations of the 8-element loop. */
  271. movq M, I
  272. sarq $3, I
  /* Fewer than 8 rows: skip the unrolled loop and go straight to the
     4/2/1-element tail code at .L37. (Branching to .L36 here would fall
     into the loop body and copy 8 elements even though I <= 0, reading past
     the column and advancing AO1/B by 8 before the tails run.) */
  273. jle .L37
  274. ALIGN_4
  /* .L36: main loop, 8 elements per iteration. */
  275. .L36:
  276. #ifdef PREFETCH
  277. PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
  278. #endif
  /* xmm1..xmm4 = (AO1[1],AO1[2]), (AO1[3],AO1[4]), (AO1[5],AO1[6]),
     (AO1[7],AO1[8]). */
  279. MOVAPS 1 * SIZE(AO1), %xmm1
  280. MOVAPS 3 * SIZE(AO1), %xmm2
  281. MOVAPS 5 * SIZE(AO1), %xmm3
  282. MOVAPS 7 * SIZE(AO1), %xmm4
  /* shufpd $1, src, dst yields dst = (dst.high, src.low), which stitches
     neighbouring loads into (AO1[0],AO1[1]), (AO1[2],AO1[3]), ... */
  283. shufpd $1, %xmm1, %xmm0
  284. shufpd $1, %xmm2, %xmm1
  285. shufpd $1, %xmm3, %xmm2
  286. shufpd $1, %xmm4, %xmm3
  287. #ifdef PREFETCHW
  288. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  289. #endif
  290. movaps %xmm0, -16 * SIZE(B)
  291. movaps %xmm1, -14 * SIZE(B)
  292. movaps %xmm2, -12 * SIZE(B)
  293. movaps %xmm3, -10 * SIZE(B)
  /* The last loaded pair becomes the carry for the next iteration / tail. */
  294. movaps %xmm4, %xmm0
  295. addq $8 * SIZE, AO1
  296. subq $-8 * SIZE, B
  297. decq I
  298. jg .L36
  299. ALIGN_4
  /* .L37: tails of the shifted single-column copy. On entry xmm0 holds the
     most recently loaded pair, whose high element is AO1[0]. Each step uses
     shufpd $1, src, dst (dst = (dst.high, src.low)) to combine that carry
     with the next load. 4 more elements if bit 2 of M is set. */
  300. .L37:
  301. testq $4, M
  302. jle .L38
  303. MOVAPS 1 * SIZE(AO1), %xmm1
  304. MOVAPS 3 * SIZE(AO1), %xmm2
  /* xmm0 = (AO1[0], AO1[1]), xmm1 = (AO1[2], AO1[3]) */
  305. shufpd $1, %xmm1, %xmm0
  306. shufpd $1, %xmm2, %xmm1
  307. movaps %xmm0, -16 * SIZE(B)
  308. movaps %xmm1, -14 * SIZE(B)
  /* Keep the last load as the carry. */
  309. movaps %xmm2, %xmm0
  310. addq $4 * SIZE, AO1
  311. addq $4 * SIZE, B
  312. ALIGN_4
  /* .L38: 2 more elements if bit 1 of M is set. */
  313. .L38:
  314. testq $2, M
  315. jle .L39
  316. MOVAPS 1 * SIZE(AO1), %xmm1
  317. shufpd $1, %xmm1, %xmm0
  318. movaps %xmm0, -16 * SIZE(B)
  319. movaps %xmm1, %xmm0
  320. addq $2 * SIZE, AO1
  321. subq $-2 * SIZE, B
  322. ALIGN_4
  /* .L39: final element if bit 0 of M is set. It is already in the high half
     of the carry register, so movhpd stores it without another load. */
  323. .L39:
  324. testq $1, M
  325. jle .L999
  326. movhpd %xmm0, -16 * SIZE(B)
  327. jmp .L999
  328. ALIGN_4
  /* .L50: two-column copy used when the SIZE bit of the scaled LDA is set.
     The second column is then loaded at odd element offsets (-1, 1, 3, ...)
     from AO2 and realigned in registers.
     J = N / 2 column pairs; with none, go to the single-column code. */
  329. .L50:
  330. movq N, J
  331. sarq $1, J
  332. jle .L30
  333. ALIGN_4
  /* .L61: set up one column pair: AO1 = A, AO2 = A + LDA, A += 2 * LDA. */
  334. .L61:
  335. movq A, AO1
  336. leaq (A, LDA), AO2
  337. leaq (A, LDA, 2), A
  /* If the SIZE bit of the (already advanced) A is set, copy one row with
     scalar loads and step both pointers by one element; MM was already
     reduced by one for this case in the setup code. */
  338. testq $SIZE, A
  339. je .L62
  340. movsd 0 * SIZE(AO1), %xmm0
  341. movsd 0 * SIZE(AO2), %xmm1
  342. unpcklpd %xmm1, %xmm0
  343. movaps %xmm0, -16 * SIZE(B)
  344. addq $1 * SIZE, AO1
  345. addq $1 * SIZE, AO2
  346. subq $-2 * SIZE, B
  347. ALIGN_3
  /* .L62: prime the carry register: xmm5 = (AO2[-1], AO2[0]). Its high
     element is the first second-column value still to be written.
     I = MM / 8 iterations of the 8-row loop. */
  348. .L62:
  349. MOVAPS -1 * SIZE(AO2), %xmm5
  350. movq MM, I
  351. sarq $3, I
  352. jle .L64
  353. ALIGN_4
  /* .L63: main loop, 8 rows per iteration, written to B as interleaved
     (AO1[i], AO2[i]) pairs. */
  354. .L63:
  355. #ifdef PREFETCH
  356. PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
  357. #endif
  /* xmm0 = (AO1[0],AO1[1]), xmm1 = (AO2[1],AO2[2]),
     xmm2 = (AO1[2],AO1[3]), xmm3 = (AO2[3],AO2[4]). */
  358. MOVAPS 0 * SIZE(AO1), %xmm0
  359. MOVAPS 1 * SIZE(AO2), %xmm1
  360. MOVAPS 2 * SIZE(AO1), %xmm2
  361. MOVAPS 3 * SIZE(AO2), %xmm3
  /* Register-to-register movsd replaces only the low element:
     xmm5 = (AO1[0], AO2[0]). */
  362. movsd %xmm0, %xmm5
  /* shufpd $1, src, dst gives dst = (dst.high, src.low):
     xmm0 = (AO1[1], AO2[1]). */
  363. shufpd $1, %xmm1, %xmm0
  /* xmm1 = (AO1[2], AO2[2]), xmm2 = (AO1[3], AO2[3]). */
  364. movsd %xmm2, %xmm1
  365. shufpd $1, %xmm3, %xmm2
  366. #ifdef PREFETCHW
  367. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  368. #endif
  369. movaps %xmm5, -16 * SIZE(B)
  370. movaps %xmm0, -14 * SIZE(B)
  371. movaps %xmm1, -12 * SIZE(B)
  372. movaps %xmm2, -10 * SIZE(B)
  373. #ifdef PREFETCH
  374. PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
  375. #endif
  /* Rows 4..7. xmm3 still holds (AO2[3], AO2[4]) and acts as the carry for
     this half; the load into xmm5 = (AO2[7], AO2[8]) becomes the carry for
     the next iteration once AO2 has advanced by 8. */
  376. MOVAPS 4 * SIZE(AO1), %xmm0
  377. MOVAPS 5 * SIZE(AO2), %xmm1
  378. MOVAPS 6 * SIZE(AO1), %xmm2
  379. MOVAPS 7 * SIZE(AO2), %xmm5
  380. movsd %xmm0, %xmm3
  381. shufpd $1, %xmm1, %xmm0
  382. movsd %xmm2, %xmm1
  383. shufpd $1, %xmm5, %xmm2
  /* NOTE(review): this second PREFETCHW uses the same "+ 0" offset as the
     first one in this loop, whereas the .L23 loop uses "+ 8" for its second
     prefetch - confirm whether that is intended. */
  384. #ifdef PREFETCHW
  385. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  386. #endif
  387. movaps %xmm3, -8 * SIZE(B)
  388. movaps %xmm0, -6 * SIZE(B)
  389. movaps %xmm1, -4 * SIZE(B)
  390. movaps %xmm2, -2 * SIZE(B)
  391. addq $8 * SIZE, AO1
  392. addq $8 * SIZE, AO2
  393. subq $-16 * SIZE, B
  394. decq I
  395. jg .L63
  396. ALIGN_4
  /* .L64: tails of the shifted two-column copy. On entry xmm5 holds
     (AO2[-1], AO2[0]). 4 more rows if bit 2 of MM is set. */
  397. .L64:
  398. testq $4, MM
  399. jle .L66
  400. MOVAPS 0 * SIZE(AO1), %xmm0
  401. MOVAPS 1 * SIZE(AO2), %xmm1
  402. MOVAPS 2 * SIZE(AO1), %xmm2
  403. MOVAPS 3 * SIZE(AO2), %xmm3
  /* Build (AO1[k], AO2[k]) for k = 0..3 in xmm5, xmm0, xmm1, xmm2:
     movsd replaces the low element, shufpd $1 gives (dst.high, src.low). */
  404. movsd %xmm0, %xmm5
  405. shufpd $1, %xmm1, %xmm0
  406. movsd %xmm2, %xmm1
  407. shufpd $1, %xmm3, %xmm2
  408. movaps %xmm5, -16 * SIZE(B)
  409. movaps %xmm0, -14 * SIZE(B)
  410. movaps %xmm1, -12 * SIZE(B)
  411. movaps %xmm2, -10 * SIZE(B)
  /* xmm3 = (AO2[3], AO2[4]) becomes the carry for the following tail. */
  412. movaps %xmm3, %xmm5
  413. addq $4 * SIZE, AO1
  414. addq $4 * SIZE, AO2
  415. subq $-8 * SIZE, B
  416. ALIGN_4
  /* .L66: 2 more rows if bit 1 of MM is set. The carry is not refreshed
     afterwards; the final one-row tail below uses scalar loads and does not
     read xmm5. */
  417. .L66:
  418. testq $2, MM
  419. jle .L68
  420. MOVAPS 0 * SIZE(AO1), %xmm0
  421. MOVAPS 1 * SIZE(AO2), %xmm1
  422. movsd %xmm0, %xmm5
  423. shufpd $1, %xmm1, %xmm0
  424. movaps %xmm5, -16 * SIZE(B)
  425. movaps %xmm0, -14 * SIZE(B)
  426. addq $2 * SIZE, AO1
  427. addq $2 * SIZE, AO2
  428. subq $-4 * SIZE, B
  429. ALIGN_4
  /* .L68: one last row if bit 0 of MM is set, using scalar loads. */
  430. .L68:
  431. testq $1, MM
  432. jle .L69
  433. movsd 0 * SIZE(AO1), %xmm0
  434. movsd 0 * SIZE(AO2), %xmm1
  435. unpcklpd %xmm1, %xmm0
  436. movaps %xmm0, -16 * SIZE(B)
  437. subq $-2 * SIZE, B
  438. ALIGN_4
  /* .L69: next column pair; once all pairs are done, an odd N sends control
     to .L30 for the remaining single column, otherwise fall through to the
     exit. */
  439. .L69:
  440. decq J
  441. jg .L61
  442. testq $1, N
  443. jne .L30
  444. ALIGN_4
  /* .L999: common exit. Undo the prologue in reverse order. */
  445. .L999:
  446. #ifdef WINDOWS_ABI
  /* Reload xmm6 from the bottom of the local area and release the
     STACKSIZE bytes reserved in the prologue. */
  447. movups 0(%rsp), %xmm6
  448. addq $STACKSIZE, %rsp
  449. #endif
  /* Pops mirror the pushes: r12, r13, then (Windows only) r14, r15. */
  450. popq %r12
  451. popq %r13
  452. #ifdef WINDOWS_ABI
  453. popq %r14
  454. popq %r15
  455. #endif
  456. ret
  457. EPILOGUE