Browse Source

Bugfix for the Bulldozer cgemm, zgemm and zgemv kernels

tags/v0.2.10.rc1^2
wernsaar 11 years ago
parent
commit
d8ba46efdb
3 changed files with 51 additions and 45 deletions
  1. +1
    -1
      kernel/x86_64/KERNEL.BULLDOZER
  2. +25
    -22
      kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
  3. +25
    -22
      kernel/x86_64/zgemm_kernel_2x2_bulldozer.S

+ 1
- 1
kernel/x86_64/KERNEL.BULLDOZER View File

@@ -2,7 +2,7 @@ SGEMVNKERNEL = sgemv_n.S
 SGEMVTKERNEL = sgemv_t.S

 ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t_dup.S
+ZGEMVTKERNEL = zgemv_t.S

 DGEMVNKERNEL = dgemv_n_bulldozer.S
 DGEMVTKERNEL = dgemv_t_bulldozer.S


+ 25
- 22
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S View File

@@ -522,16 +522,16 @@
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -541,14 +541,15 @@
 movq OLD_C, C
 movq OLD_LDC, LDC
 #ifdef TRMMKERNEL
-movsd OLD_OFFSET, %xmm12
+vmovsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
 #ifdef TRMMKERNEL
-movsd STACKSIZE + 16(%rsp), %xmm12
+vmovsd STACKSIZE + 16(%rsp), %xmm12
 #endif
 #endif
@@ -1865,6 +1866,8 @@
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1876,16 +1879,16 @@
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $STACKSIZE, %rsp


+ 25
- 22
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S View File

@@ -412,16 +412,16 @@
 #ifdef WINDOWS_ABI
 movq %rdi, 48(%rsp)
 movq %rsi, 56(%rsp)
-movups %xmm6, 64(%rsp)
-movups %xmm7, 80(%rsp)
-movups %xmm8, 96(%rsp)
-movups %xmm9, 112(%rsp)
-movups %xmm10, 128(%rsp)
-movups %xmm11, 144(%rsp)
-movups %xmm12, 160(%rsp)
-movups %xmm13, 176(%rsp)
-movups %xmm14, 192(%rsp)
-movups %xmm15, 208(%rsp)
+vmovups %xmm6, 64(%rsp)
+vmovups %xmm7, 80(%rsp)
+vmovups %xmm8, 96(%rsp)
+vmovups %xmm9, 112(%rsp)
+vmovups %xmm10, 128(%rsp)
+vmovups %xmm11, 144(%rsp)
+vmovups %xmm12, 160(%rsp)
+vmovups %xmm13, 176(%rsp)
+vmovups %xmm14, 192(%rsp)
+vmovups %xmm15, 208(%rsp)
 movq ARG1, OLD_M
 movq ARG2, OLD_N
@@ -431,14 +431,15 @@
 movq OLD_C, C
 movq OLD_LDC, LDC
 #ifdef TRMMKERNEL
-movsd OLD_OFFSET, %xmm12
+vmovsd OLD_OFFSET, %xmm12
 #endif
 vmovaps %xmm3, %xmm0
+vmovsd OLD_ALPHA_I, %xmm1
 #else
 movq STACKSIZE + 8(%rsp), LDC
 #ifdef TRMMKERNEL
-movsd STACKSIZE + 16(%rsp), %xmm12
+vmovsd STACKSIZE + 16(%rsp), %xmm12
 #endif
 #endif
@@ -1372,6 +1373,8 @@
 .L999:
+vzeroupper
 movq SP, %rsp
 movq (%rsp), %rbx
 movq 8(%rsp), %rbp
@@ -1383,16 +1386,16 @@
 #ifdef WINDOWS_ABI
 movq 48(%rsp), %rdi
 movq 56(%rsp), %rsi
-movups 64(%rsp), %xmm6
-movups 80(%rsp), %xmm7
-movups 96(%rsp), %xmm8
-movups 112(%rsp), %xmm9
-movups 128(%rsp), %xmm10
-movups 144(%rsp), %xmm11
-movups 160(%rsp), %xmm12
-movups 176(%rsp), %xmm13
-movups 192(%rsp), %xmm14
-movups 208(%rsp), %xmm15
+vmovups 64(%rsp), %xmm6
+vmovups 80(%rsp), %xmm7
+vmovups 96(%rsp), %xmm8
+vmovups 112(%rsp), %xmm9
+vmovups 128(%rsp), %xmm10
+vmovups 144(%rsp), %xmm11
+vmovups 160(%rsp), %xmm12
+vmovups 176(%rsp), %xmm13
+vmovups 192(%rsp), %xmm14
+vmovups 208(%rsp), %xmm15
 #endif
 addq $STACKSIZE, %rsp


Loading…
Cancel
Save