You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_tcopy_2.S 7.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration for the copy routine that follows. ASSEMBLER is */
  /* defined before common.h is included (presumably to select its */
  /* assembler-side definitions; confirm in common.h). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Per-target prefetch settings. PREFETCHSIZE is used by the routine as a */
  /* multiplier (in elements) inside prefetch displacements; PREFETCH is */
  /* applied to the source pointers AO1/AO2 and PREFETCHW to the destination */
  /* pointer B. The routine guards each prefetch with #ifdef, so a target */
  /* that leaves these undefined simply assembles without prefetches. */
  40. #if defined(PENTIUM4) || defined(GENERIC)
  41. #define PREFETCHSIZE 16
  42. #define PREFETCH prefetcht0
  43. #define PREFETCHW prefetcht0
  44. #endif
  /* NEHALEM and SANDYBRIDGE also define MOVUPS_A, which switches the */
  /* MOVUPS_A1/MOVUPS_A2 macros below to a single movups per 16-byte load. */
  45. #ifdef NEHALEM
  46. #define PREFETCHSIZE 12
  47. #define PREFETCH prefetcht0
  48. #define PREFETCHW prefetcht0
  49. #define MOVUPS_A movups
  50. #endif
  51. #ifdef SANDYBRIDGE
  52. #define PREFETCHSIZE 12
  53. #define PREFETCH prefetcht0
  54. #define PREFETCHW prefetcht0
  55. #define MOVUPS_A movups
  56. #endif
  57. #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON)
  58. #define PREFETCHSIZE 16
  59. #define PREFETCH prefetcht0
  60. #define PREFETCHW prefetcht0
  61. #endif
  /* OPTERON is the only target here that uses prefetch/prefetchw instead of */
  /* prefetcht0. */
  62. #ifdef OPTERON
  63. #define PREFETCHSIZE 16
  64. #define PREFETCH prefetch
  65. #define PREFETCHW prefetchw
  66. #endif
  /* 16-byte load helpers. */
  /*   MOVUPS_A1(OFF, ADDR, REGS): load from OFF(ADDR) into REGS. */
  /*   MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS): same, with an indexed */
  /*   address OFF(ADDR, BASE, SCALE). */
  /* With MOVUPS_A defined, each expands to one MOVUPS_A instruction. */
  /* Otherwise the load is split into two 8-byte halves: movsd fills the low */
  /* half from OFF and movhps fills the high half from OFF + 8. */
  /* MOVUPS_A2 is not referenced by the routine visible in this file. */
  67. #ifdef MOVUPS_A
  68. #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS
  69. #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS
  70. #else
  71. #define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS
  72. #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS
  73. #endif
  /* Register aliases, selected per calling convention. */
  /*   N    number of LDA-separated source lines still to copy (the routine */
  /*        consumes two per outer iteration) */
  /*   M    number of elements copied from each line */
  /*   A    source pointer, LDA = distance between lines (elements on */
  /*        entry; the routine scales it to bytes) */
  /*   B    destination pointer */
  /*   AO1, AO2  read pointers for the current pair of lines */
  /*   LDA3 3 * LDA, M8 = N * SIZE (both computed in the routine) */
  /* ARG1..ARG5 come from common.h and are not visible here. */
  /* NOTE(review): the register names in the comments on N and M look */
  /* swapped relative to the usual argument order (first argument in rdi on */
  /* System V, rcx on Windows); confirm against the ARGn definitions. */
  74. #ifndef WINDOWS_ABI
  75. #define N ARG1 /* rsi */
  76. #define M ARG2 /* rdi */
  77. #define A ARG3 /* rdx */
  78. #define LDA ARG4 /* rcx */
  79. #define B ARG5 /* r8 */
  80. #define AO1 %r9
  81. #define AO2 %r10
  82. #define LDA3 %r11
  83. #define M8 %r12
  84. #else
  85. #define N ARG1 /* rdx */
  86. #define M ARG2 /* rcx */
  87. #define A ARG3 /* r8 */
  88. #define LDA ARG4 /* r9 */
  /* Fifth argument lives on the stack under the Windows ABI. The routine */
  /* reads it after five 8-byte pushes (rdi, rsi, r12, r13, rbp = 40 bytes); */
  /* the other 40 bytes are presumably the return address plus the 32-byte */
  /* shadow space. */
  89. #define OLD_B 40 + 40(%rsp)
  90. #define B %r12
  91. #define AO1 %rsi
  92. #define AO2 %rdi
  93. #define LDA3 %r10
  94. #define M8 %r11
  95. #endif
  /* Aliases shared by both ABIs: I is the unrolled-loop counter, B0 the */
  /* running store pointer inside one pass, B3 the store pointer used for */
  /* the odd trailing element of each line. */
  96. #define I %rax
  97. #define B0 %rbp
  98. #define B3 %r13
  /*
   * Copy routine: repacks N source lines of M elements each from A into B,
   * working on two lines at a time.
   *
   * Inputs (register aliases are defined above this routine):
   *   N    number of source lines; consecutive lines are LDA elements apart
   *   M    number of contiguous elements taken from each line
   *   A    source pointer, LDA = line stride in elements (scaled to bytes here)
   *   B    destination pointer
   *
   * Destination layout produced by the stores below:
   *   - For each pair of lines, every 2-element pair from line 0 is stored
   *     next to the matching pair from line 1 (4 contiguous elements).
   *     Successive such groups are 2 * M8 bytes (2 * N elements) apart, and
   *     each new pair of lines starts 4 elements further into B.
   *   - When M is odd, the last element of each line goes to a separate
   *     area starting at B + (M rounded down to even) * N elements (B3).
   *
   * Element size is SIZE from common.h; the 16-byte loads at 2 * SIZE steps
   * suggest SIZE is 8 (TODO confirm). Stores use movaps, which requires
   * 16-byte-aligned addresses; the routine does not check B's alignment.
   * PROLOGUE, PROFCODE, ALIGN_4 and EPILOGUE are macros defined outside this
   * file (presumably common.h).
   */
  99. PROLOGUE
  100. PROFCODE
  /* Windows only: %rdi and %rsi are used as AO2/AO1 there, so save them. */
  101. #ifdef WINDOWS_ABI
  102. pushq %rdi
  103. pushq %rsi
  104. #endif
  /* Save %r12, %r13 and %rbp; they are restored in reverse order at .L999. */
  105. pushq %r12
  106. pushq %r13
  107. pushq %rbp
  /* Windows only: load the fifth argument (B) from the stack. */
  108. #ifdef WINDOWS_ABI
  109. movq OLD_B, B
  110. #endif
  /* Bias B by +16 elements; every store through B0/B3 below uses a */
  /* -16 * SIZE (or -14 * SIZE) displacement that undoes the bias. */
  111. subq $-16 * SIZE, B
  /* B3 = B + (M & ~1) * N elements: where odd trailing elements are stored. */
  112. movq M, B3
  113. andq $-2, B3
  114. imulq N, B3
  115. leaq (B, B3, SIZE), B3
  /* LDA: elements -> bytes. */
  116. leaq (,LDA, SIZE), LDA
  /* LDA3 = 3 * LDA. NOTE(review): not referenced again in the visible code. */
  117. leaq (LDA, LDA, 2), LDA3
  /* M8 = N * SIZE bytes, taken from N before the loop starts decrementing it. */
  118. leaq (, N, SIZE), M8
  /* Fewer than two lines: go straight to the single-line tail. */
  119. cmpq $2, N
  120. jl .L40
  121. ALIGN_4
  /* .L31: outer loop, one iteration per pair of lines. */
  122. .L31:
  123. subq $2, N
  /* AO1 = first line, AO2 = second line; A then moves on by two lines. */
  124. movq A, AO1
  125. leaq (A, LDA), AO2
  126. leaq (A, LDA, 2), A
  /* B0 walks the destination for this pair; B advances 4 elements per pair. */
  127. movq B, B0
  128. addq $4 * SIZE, B
  /* I = M / 8 unrolled iterations; sarq sets the flags tested by jle. */
  129. movq M, I
  130. sarq $3, I
  131. jle .L34
  132. ALIGN_4
  /* .L33: 8 elements from each line per iteration, as two halves of 4. */
  133. .L33:
  134. #ifdef PREFETCH
  135. PREFETCH PREFETCHSIZE * 2 * SIZE(AO1)
  136. #endif
  /* First half: xmm0/xmm1 = AO1 elements 0-1 / 2-3, xmm2/xmm3 = same of AO2. */
  137. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  138. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  139. MOVUPS_A1(0 * SIZE, AO2, %xmm2)
  140. MOVUPS_A1(2 * SIZE, AO2, %xmm3)
  /* NOTE(review): this prefetch address is based on B, which is not modified */
  /* inside this loop, so every iteration issues the same address; confirm */
  /* whether B0 was intended. */
  141. #ifdef PREFETCHW
  142. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  143. #endif
  /* Elements 0-1 of both lines go to B0; elements 2-3 go 2 * M8 bytes later. */
  144. movaps %xmm0, -16 * SIZE(B0)
  145. movaps %xmm2, -14 * SIZE(B0)
  146. movaps %xmm1, -16 * SIZE(B0, M8, 2)
  147. movaps %xmm3, -14 * SIZE(B0, M8, 2)
  /* Two groups written: advance B0 by 4 * M8 bytes. */
  148. leaq (B0, M8, 4), B0
  149. #ifdef PREFETCH
  150. PREFETCH PREFETCHSIZE * 2 * SIZE(AO2)
  151. #endif
  /* Second half: same pattern for elements 4-7 of both lines. */
  152. MOVUPS_A1(4 * SIZE, AO1, %xmm0)
  153. MOVUPS_A1(6 * SIZE, AO1, %xmm1)
  154. MOVUPS_A1(4 * SIZE, AO2, %xmm2)
  155. MOVUPS_A1(6 * SIZE, AO2, %xmm3)
  156. movaps %xmm0, -16 * SIZE(B0)
  157. movaps %xmm2, -14 * SIZE(B0)
  158. movaps %xmm1, -16 * SIZE(B0, M8, 2)
  159. movaps %xmm3, -14 * SIZE(B0, M8, 2)
  160. leaq (B0, M8, 4), B0
  161. addq $8 * SIZE, AO1
  162. addq $8 * SIZE, AO2
  163. decq I
  164. jg .L33
  165. ALIGN_4
  /* .L34: remainder of 4 elements per line (bit 2 of M). */
  /* testq clears OF and the masked result never has its sign bit set, so */
  /* jle is taken exactly when the tested bit is clear. The same idiom is */
  /* used for the other remainder tests below. */
  166. .L34:
  167. testq $4, M
  168. jle .L36
  169. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  170. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  171. MOVUPS_A1(0 * SIZE, AO2, %xmm2)
  172. MOVUPS_A1(2 * SIZE, AO2, %xmm3)
  173. movaps %xmm0, -16 * SIZE(B0)
  174. movaps %xmm2, -14 * SIZE(B0)
  175. movaps %xmm1, -16 * SIZE(B0, M8, 2)
  176. movaps %xmm3, -14 * SIZE(B0, M8, 2)
  177. addq $4 * SIZE, AO1
  178. addq $4 * SIZE, AO2
  179. leaq (B0, M8, 4), B0
  180. ALIGN_4
  /* .L36: remainder of 2 elements per line (bit 1 of M): one group of 4. */
  181. .L36:
  182. testq $2, M
  183. jle .L38
  184. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  185. MOVUPS_A1(0 * SIZE, AO2, %xmm1)
  186. movaps %xmm0, -16 * SIZE(B0)
  187. movaps %xmm1, -14 * SIZE(B0)
  188. addq $2 * SIZE, AO1
  189. addq $2 * SIZE, AO2
  190. leaq (B0, M8, 2), B0
  191. ALIGN_4
  /* .L38: odd trailing element (bit 0 of M). The element from AO1 goes in */
  /* the low half and the one from AO2 in the high half of xmm0; the pair */
  /* is stored at B3, which then advances by 2 elements. */
  192. .L38:
  193. testq $1, M
  194. jle .L39
  195. movsd 0 * SIZE(AO1), %xmm0
  196. movhpd 0 * SIZE(AO2), %xmm0
  197. movaps %xmm0, -16 * SIZE(B3)
  198. subq $-2 * SIZE, B3
  199. ALIGN_4
  /* .L39: repeat while at least two lines remain. */
  200. .L39:
  201. cmpq $2, N
  202. jge .L31
  203. ALIGN_4
  /* .L40: single-line tail, entered with N < 2; nothing to do if N < 1. */
  204. .L40:
  205. cmpq $1, N
  206. jl .L999
  207. movq A, AO1
  208. movq B, B0
  209. movq M, I
  210. sarq $3, I
  211. jle .L44
  212. ALIGN_4
  /* .L43: 8 elements of the single line per iteration. Each 2-element pair */
  /* is stored 2 * M8 bytes after the previous one. */
  213. .L43:
  214. #ifdef PREFETCH
  215. PREFETCH PREFETCHSIZE * 4 * SIZE(AO1)
  216. #endif
  217. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  218. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  219. MOVUPS_A1(4 * SIZE, AO1, %xmm2)
  220. MOVUPS_A1(6 * SIZE, AO1, %xmm3)
  /* NOTE(review): as in .L33, the address uses B, which is loop-invariant here. */
  221. #ifdef PREFETCHW
  222. PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B)
  223. #endif
  224. addq $8 * SIZE, AO1
  225. movaps %xmm0, -16 * SIZE(B0)
  226. movaps %xmm1, -16 * SIZE(B0, M8, 2)
  227. leaq (B0, M8, 4), B0
  228. movaps %xmm2, -16 * SIZE(B0)
  229. movaps %xmm3, -16 * SIZE(B0, M8, 2)
  230. leaq (B0, M8, 4), B0
  231. decq I
  232. jg .L43
  233. ALIGN_4
  /* .L44: remainder of 4 elements (bit 2 of M): two pairs. */
  234. .L44:
  235. testq $4, M
  236. jle .L45
  237. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  238. MOVUPS_A1(2 * SIZE, AO1, %xmm1)
  239. addq $4 * SIZE, AO1
  240. movaps %xmm0, -16 * SIZE(B0)
  241. movaps %xmm1, -16 * SIZE(B0, M8, 2)
  242. leaq (B0, M8, 4), B0
  243. ALIGN_4
  /* .L45: remainder of 2 elements (bit 1 of M): one pair at B0. B0 is not */
  /* advanced afterwards; the only remaining store goes through B3. */
  244. .L45:
  245. testq $2, M
  246. jle .L46
  247. MOVUPS_A1(0 * SIZE, AO1, %xmm0)
  248. movaps %xmm0, -16 * SIZE(B0)
  249. addq $2 * SIZE, AO1
  250. ALIGN_4
  /* .L46: odd trailing element (bit 0 of M): only the low 8 bytes of xmm0 */
  /* are written to B3 (movlpd). */
  251. .L46:
  252. testq $1, M
  253. jle .L999
  254. movsd 0 * SIZE(AO1), %xmm0
  255. movlpd %xmm0, -16 * SIZE(B3)
  256. ALIGN_4
  /* .L999: common exit; pops mirror the pushes at entry. */
  257. .L999:
  258. popq %rbp
  259. popq %r13
  260. popq %r12
  261. #ifdef WINDOWS_ABI
  262. popq %rsi
  263. popq %rdi
  264. #endif
  265. ret
  266. EPILOGUE