Browse Source

x86_64: clobber all xmm registers after vzeroupper

As observed with GCC 10 using -march=native -ftree-vectorize
on Knights Landing, the compiler is now smart enough to find clobbers
inside non-inlined static functions.

In particular, sgemv counted on a kernel to preserve the whole
%ymm2 register (since it was not in the clobber list), but the top
part was destroyed by vzeroupper. This caused many tests to fail.

This patch makes sure all xmm (and, by extension, ymm/zmm) registers
are listed as clobbered to prevent this from happening, as most kernels
in fact already did correctly.
tags/v0.3.12
Bart Oldeman 5 years ago
parent
commit
b073d759d0
22 changed files with 63 additions and 44 deletions
  1. +3
    -2
      kernel/x86_64/caxpy_microk_bulldozer-2.c
  2. +1
    -1
      kernel/x86_64/caxpy_microk_haswell-2.c
  3. +1
    -1
      kernel/x86_64/caxpy_microk_sandy-2.c
  4. +3
    -2
      kernel/x86_64/caxpy_microk_steamroller-2.c
  5. +3
    -2
      kernel/x86_64/daxpy_microk_haswell-2.c
  6. +3
    -2
      kernel/x86_64/ddot_microk_haswell-2.c
  7. +2
    -0
      kernel/x86_64/ddot_microk_piledriver-2.c
  8. +3
    -2
      kernel/x86_64/ddot_microk_sandy-2.c
  9. +1
    -0
      kernel/x86_64/ddot_microk_steamroller-2.c
  10. +6
    -8
      kernel/x86_64/dgemv_n_microk_haswell-4.c
  11. +4
    -2
      kernel/x86_64/dgemv_n_microk_piledriver-4.c
  12. +2
    -0
      kernel/x86_64/dgemv_t_microk_haswell-4.c
  13. +2
    -1
      kernel/x86_64/saxpy_microk_haswell-2.c
  14. +4
    -2
      kernel/x86_64/saxpy_microk_piledriver-2.c
  15. +3
    -2
      kernel/x86_64/sdot_microk_haswell-2.c
  16. +3
    -2
      kernel/x86_64/sdot_microk_sandy-2.c
  17. +6
    -8
      kernel/x86_64/sgemv_n_microk_haswell-4.c
  18. +2
    -0
      kernel/x86_64/sgemv_t_microk_haswell-4.c
  19. +3
    -2
      kernel/x86_64/zaxpy_microk_bulldozer-2.c
  20. +1
    -1
      kernel/x86_64/zaxpy_microk_haswell-2.c
  21. +4
    -2
      kernel/x86_64/zaxpy_microk_sandy-2.c
  22. +3
    -2
      kernel/x86_64/zaxpy_microk_steamroller-2.c

+ 3
- 2
kernel/x86_64/caxpy_microk_bulldozer-2.c View File

@@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );




+ 1
- 1
kernel/x86_64/caxpy_microk_haswell-2.c View File

@@ -120,7 +120,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",


+ 1
- 1
kernel/x86_64/caxpy_microk_sandy-2.c View File

@@ -104,7 +104,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",


+ 3
- 2
kernel/x86_64/caxpy_microk_steamroller-2.c View File

@@ -122,7 +122,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );




+ 3
- 2
kernel/x86_64/daxpy_microk_haswell-2.c View File

@@ -67,8 +67,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 3
- 2
kernel/x86_64/ddot_microk_haswell-2.c View File

@@ -84,8 +84,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 2
- 0
kernel/x86_64/ddot_microk_piledriver-2.c View File

@@ -91,6 +91,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@@ -155,6 +156,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 3
- 2
kernel/x86_64/ddot_microk_sandy-2.c View File

@@ -89,8 +89,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 1
- 0
kernel/x86_64/ddot_microk_steamroller-2.c View File

@@ -88,6 +88,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 6
- 8
kernel/x86_64/dgemv_n_microk_haswell-4.c View File

@@ -105,9 +105,8 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@@ -182,11 +181,10 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[1]), // 5 "r" (ap[1]), // 5
"r" (alpha) // 6 "r" (alpha) // 6
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm4", "%xmm5",
"%xmm6",
"%xmm8",
"%xmm12", "%xmm13",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
} }

+ 4
- 2
kernel/x86_64/dgemv_n_microk_piledriver-4.c View File

@@ -140,7 +140,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm2", "%xmm3", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5",
"%xmm6", "%xmm7", "%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@@ -235,9 +235,11 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7 "r" (ap[3]), // 7
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm4", "%xmm5",
"%xmm6", "%xmm7", "%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 2
- 0
kernel/x86_64/dgemv_t_microk_haswell-4.c View File

@@ -117,7 +117,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6 "r" (ap[2]), // 6
"r" (ap[3]) // 7 "r" (ap[3]) // 7
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 2
- 1
kernel/x86_64/saxpy_microk_haswell-2.c View File

@@ -67,7 +67,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"


+ 4
- 2
kernel/x86_64/saxpy_microk_piledriver-2.c View File

@@ -86,7 +86,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
@@ -147,7 +148,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (y), // 3 "r" (y), // 3
"r" (alpha) // 4 "r" (alpha) // 4
: "cc", : "cc",
"%xmm0",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"


+ 3
- 2
kernel/x86_64/sdot_microk_haswell-2.c View File

@@ -87,8 +87,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 3
- 2
kernel/x86_64/sdot_microk_sandy-2.c View File

@@ -90,8 +90,9 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"r" (y), // 3 "r" (y), // 3
"r" (dot) // 4 "r" (dot) // 4
: "cc", : "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 6
- 8
kernel/x86_64/sgemv_n_microk_haswell-4.c View File

@@ -164,11 +164,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"r" (ap[3]), // 8 "r" (ap[3]), // 8
"r" (alpha) // 9 "r" (alpha) // 9
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
@@ -286,9 +284,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7 "r" (ap[3]), // 7
"r" (alpha) // 8 "r" (alpha) // 8
: "cc", : "cc",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 2
- 0
kernel/x86_64/sgemv_t_microk_haswell-4.c View File

@@ -138,7 +138,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"r" (ap[2]), // 6 "r" (ap[2]), // 6
"r" (ap[3]) // 7 "r" (ap[3]) // 7
: "cc", : "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );


+ 3
- 2
kernel/x86_64/zaxpy_microk_bulldozer-2.c View File

@@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );




+ 1
- 1
kernel/x86_64/zaxpy_microk_haswell-2.c View File

@@ -120,7 +120,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",


+ 4
- 2
kernel/x86_64/zaxpy_microk_sandy-2.c View File

@@ -108,9 +108,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );
return; return;
@@ -185,9 +186,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );




+ 3
- 2
kernel/x86_64/zaxpy_microk_steamroller-2.c View File

@@ -122,7 +122,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15", "%xmm12", "%xmm13", "%xmm14", "%xmm15",
@@ -189,9 +189,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"r" (alpha), // 4 "r" (alpha), // 4
"r" (mvec) // 5 "r" (mvec) // 5
: "cc", : "cc",
"%xmm0", "%xmm1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory" "memory"
); );




Loading…
Cancel
Save