Browse Source

s390x/Z14: Change register blocking for SGEMM to 16x4

Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4
by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy
implementations. To that end, make KERNEL.Z14 select the copy routines
based on SGEMM_UNROLL_M and SGEMM_UNROLL_N, so that the change in
param.h suffices. As a result, performance for SGEMM improves by around
30% on z15.

On z14, FP SIMD instructions can operate on float-sized scalars in
vector registers, while z13 could do that for double-sized scalars only.
Thus, we can double the number of elements of C that are held in
registers in an SGEMM kernel.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
tags/v0.3.10^2
Marius Hillenbrand 6 years ago
parent
commit
1b0b4349a1
3 changed files with 22 additions and 5 deletions
  1. +6
    -4
      kernel/zarch/KERNEL.Z14
  2. +15
    -0
      kernel/zarch/gemm_vec.c
  3. +1
    -1
      param.h

+ 6
- 4
kernel/zarch/KERNEL.Z14 View File

@@ -92,12 +92,14 @@ CTRMMKERNEL = ctrmm4x4V.S
ZTRMMKERNEL = ztrmm4x4V.S

SGEMMKERNEL = gemm_vec.c
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)



+ 15
- 0
kernel/zarch/gemm_vec.c View File

@@ -220,6 +220,15 @@ typedef FLOAT vector_float __attribute__ ((vector_size (16)));
}


#if UNROLL_M == 16
VECTOR_BLOCK(16, 4)
VECTOR_BLOCK(16, 2)
VECTOR_BLOCK(16, 1)
#endif
#if UNROLL_N == 8
VECTOR_BLOCK(8, 8)
VECTOR_BLOCK(4, 8)
#endif
VECTOR_BLOCK(8, 4)
VECTOR_BLOCK(8, 2)
VECTOR_BLOCK(8, 1)
@@ -284,6 +293,12 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n,
return; \
}

#if UNROLL_M == 16
BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
#endif
#if UNROLL_N == 8
BLOCK(8, 8); BLOCK(4, 8);
#endif
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);



+ 1
- 1
param.h View File

@@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4

#define DGEMM_DEFAULT_UNROLL_M 8


Loading…
Cancel
Save