Browse Source

Use the skylake sgemm beta code also for haswell

With a few small changes it is possible to use the Skylake sgemm beta code
for Haswell as well. This gives a modest gain (in the 10% range) for smallish
matrices, but does wonders for very skinny matrices.
tags/v0.3.5
Arjan van de Ven 7 years ago
parent
commit
00dc09ad19
2 changed files with 12 additions and 4 deletions
  1. +1
    -0
      kernel/x86_64/KERNEL.HASWELL
  2. +11
    -4
      kernel/x86_64/sgemm_beta_skylakex.c

+ 1
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -33,6 +33,7 @@ ZAXPYKERNEL = zaxpy.c


STRMMKERNEL = sgemm_kernel_16x4_haswell.S STRMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMM_BETA = sgemm_beta_skylakex.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c


+ 11
- 4
kernel/x86_64/sgemm_beta_skylakex.c View File

@@ -61,11 +61,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset = c; c_offset = c;


if (beta == ZERO){ if (beta == ZERO){
__m512 z_zero;
__m256 y_zero;
#ifdef __AVX512CD__
__m512 z_zero = _mm512_setzero_ps();
#endif
__m256 y_zero = _mm256_setzero_ps();


z_zero = _mm512_setzero_ps();
y_zero = _mm256_setzero_ps();
j = n; j = n;
do { do {
c_offset1 = c_offset; c_offset1 = c_offset;
@@ -74,8 +74,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
i = m; i = m;


while (i >= 32) { while (i >= 32) {
#ifdef __AVX512CD__
_mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1, z_zero);
_mm512_storeu_ps(c_offset1 + 16, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero);
#else
_mm256_storeu_ps(c_offset1, y_zero);
_mm256_storeu_ps(c_offset1 + 8, y_zero);
_mm256_storeu_ps(c_offset1 + 16, y_zero);
_mm256_storeu_ps(c_offset1 + 24, y_zero);
#endif
c_offset1 += 32; c_offset1 += 32;
i -= 32; i -= 32;
} }


Loading…
Cancel
Save