Browse Source

Merge pull request #1921 from fenrus75/haswelldgemm

Replicate some of the SKYLAKEX dgemm improvements also to HASWELL
tags/v0.3.5
Martin Kroeker GitHub 7 years ago
parent
commit
e23366e860
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 15 additions and 5 deletions
  1. +2
    -1
      kernel/x86_64/KERNEL.HASWELL
  2. +12
    -4
      kernel/x86_64/dgemm_beta_skylakex.c
  3. +1
    -0
      param.h

+ 2
- 1
kernel/x86_64/KERNEL.HASWELL View File

@@ -45,9 +45,10 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

 DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c
 DGEMMKERNEL = dgemm_kernel_4x8_haswell.S
+DGEMM_BETA = dgemm_beta_skylakex.c
 DGEMMINCOPY = ../generic/gemm_ncopy_4.c
 DGEMMITCOPY = ../generic/gemm_tcopy_4.c
-DGEMMONCOPY = ../generic/gemm_ncopy_8.c
+DGEMMONCOPY = dgemm_ncopy_8_skylakex.c
 DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
 DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
 DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)


+ 12
- 4
kernel/x86_64/dgemm_beta_skylakex.c View File

@@ -61,17 +61,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset = c;

if (beta == ZERO){
__m512d z_zero;

z_zero = _mm512_setzero_pd();
j = n;
do {
c_offset1 = c_offset;
c_offset += ldc;

i = m;

#ifdef __AVX2__
#ifdef __AVX512CD__
while (i >= 32) {
__m512d z_zero = _mm512_setzero_pd();
_mm512_storeu_pd(c_offset1, z_zero);
_mm512_storeu_pd(c_offset1 + 8, z_zero);
_mm512_storeu_pd(c_offset1 + 16, z_zero);
@@ -79,12 +79,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
c_offset1 += 32;
i -= 32;
}
#endif
while (i >= 8) {
#ifdef __AVX512CD__
__m512d z_zero = _mm512_setzero_pd();
_mm512_storeu_pd(c_offset1, z_zero);
#else
__m256d y_zero = _mm256_setzero_pd();
_mm256_storeu_pd(c_offset1, y_zero);
_mm256_storeu_pd(c_offset1 + 4, y_zero);
#endif
c_offset1 += 8;
i -= 8;
}

#endif
while (i > 0) {
*c_offset1 = ZERO;
c_offset1 ++;


+ 1
- 0
param.h View File

@@ -1508,6 +1508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 8

#define SWITCH_RATIO 32
#define GEMM_PREFERED_SIZE 16

#ifdef ARCH_X86



Loading…
Cancel
Save