Browse Source

New dgemm kernel for BULLDOZER: dgemm_kernel_8x2_bulldozer.S

tags/v0.2.7
wernsaar 13 years ago
parent
commit
25491e42f9
6 changed files with 3923 additions and 1997 deletions
  1. +14
    -1
      driver/level3/level3.c
  2. +14
    -2
      driver/level3/level3_thread.c
  3. +27
    -25
      kernel/x86_64/KERNEL.BULLDOZER
  4. +0
    -1959
      kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
  5. +3854
    -0
      kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
  6. +14
    -10
      param.h

+ 14
- 1
driver/level3/level3.c View File

@@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#if defined(BULLDOZER) && defined(ARCH_X86_64) && defined(DOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

START_RPCC();


+ 14
- 2
driver/level3/level3_thread.c View File

@@ -360,8 +360,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;

#if defined(BULLDOZER) && defined(ARCH_X86_64) && defined(DOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else

if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif

START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
@@ -634,7 +646,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_cpu_n ++;
}
for (j = 0; j < num_cpu_m; j++) {
for (i = 0; i < num_cpu_m; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {


+ 27
- 25
kernel/x86_64/KERNEL.BULLDOZER View File

@@ -10,13 +10,13 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
@@ -38,25 +38,27 @@ ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S

CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

+ 0
- 1959
kernel/x86_64/dgemm_kernel_4x4_bulldozer.S
File diff suppressed because it is too large
View File


+ 3854
- 0
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
File diff suppressed because it is too large
View File


+ 14
- 10
param.h View File

@@ -153,7 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_ALIGN 0x0fffUL

#define SGEMM_DEFAULT_UNROLL_N 4
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(COMPLEX)
#define DGEMM_DEFAULT_UNROLL_N 2
#else
#define DGEMM_DEFAULT_UNROLL_N 4
#endif
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
@@ -161,14 +165,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_M 4
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#else
#define SGEMM_DEFAULT_UNROLL_M 8
#if defined(BULLDOZER) && !defined(COMPLEX)
#define DGEMM_DEFAULT_UNROLL_M 8
#else
#define DGEMM_DEFAULT_UNROLL_M 4
#endif
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 2
@@ -193,26 +201,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else

#define SGEMM_DEFAULT_P 448

#if defined(BULLDOZER) && defined(ARCH_X86_64)
#define DGEMM_DEFAULT_P 248
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(COMPLEX)
#define DGEMM_DEFAULT_P 384
#else
#define DGEMM_DEFAULT_P 224
#endif

#define QGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#define ZGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56

#define SGEMM_DEFAULT_Q 224

#if defined(BULLDOZER) && defined(ARCH_X86_64)
#define DGEMM_DEFAULT_Q 248
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(COMPLEX)
#define DGEMM_DEFAULT_Q 168
#else
#define DGEMM_DEFAULT_Q 224
#endif

#define QGEMM_DEFAULT_Q 224
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 224
@@ -230,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#define HAVE_EXCLUSIVE_CACHE

#define GEMM_THREAD gemm_thread_mn
#define GEMM_THREAD gemm_thread_m

#endif



Loading…
Cancel
Save