Browse Source

Merge pull request #390 from wernsaar/develop

Ref #103: enhancements for small matrix dimensions. Fixed some bugs. Enabled the optimized sgemm kernel for Sandy Bridge (SNB) and the optimized dgemm kernel for Nehalem.
tags/v0.2.10.rc1^2
Zhang Xianyi 12 years ago
parent
commit
d10db52edb
30 changed files with 3375 additions and 75 deletions
  1. +4
    -0
      Makefile
  2. +2
    -1
      Makefile.rule
  3. +65
    -3
      Makefile.system
  4. +2
    -0
      cpuid_x86.c
  5. +1
    -1
      driver/others/divtable.c
  6. +56
    -7
      interface/gemm.c
  7. +5
    -4
      interface/ger.c
  8. +1
    -1
      interface/rotmg.c
  9. +4
    -4
      interface/zger.c
  10. +3
    -0
      kernel/x86_64/KERNEL.BARCELONA
  11. +3
    -0
      kernel/x86_64/KERNEL.BULLDOZER
  12. +4
    -0
      kernel/x86_64/KERNEL.HASWELL
  13. +15
    -12
      kernel/x86_64/KERNEL.NEHALEM
  14. +3
    -0
      kernel/x86_64/KERNEL.PILEDRIVER
  15. +8
    -6
      kernel/x86_64/KERNEL.SANDYBRIDGE
  16. +1
    -2
      kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
  17. +1
    -3
      kernel/x86_64/cgemm_kernel_4x2_piledriver.S
  18. +1
    -3
      kernel/x86_64/cgemm_kernel_8x2_haswell.S
  19. +1
    -1
      kernel/x86_64/dgemm_kernel_4x4_haswell.S
  20. +2
    -2
      kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
  21. +2
    -2
      kernel/x86_64/dgemm_kernel_8x2_piledriver.S
  22. +2
    -2
      kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
  23. +2
    -2
      kernel/x86_64/sgemm_kernel_16x2_piledriver.S
  24. +1
    -3
      kernel/x86_64/sgemm_kernel_16x4_haswell.S
  25. +3167
    -0
      kernel/x86_64/sgemm_kernel_16x4_sandy.S
  26. +1
    -3
      kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
  27. +1
    -3
      kernel/x86_64/zgemm_kernel_2x2_piledriver.S
  28. +1
    -3
      kernel/x86_64/zgemm_kernel_4x2_haswell.S
  29. +1
    -1
      lapack-netlib/TESTING/nep.in
  30. +15
    -6
      param.h

+ 4
- 0
Makefile View File

@@ -36,9 +36,13 @@ ifndef BINARY64
else else
@echo " BINARY ... 64bit " @echo " BINARY ... 64bit "
endif endif

ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif endif
endif

@echo " C compiler ... $(C_COMPILER) (command line : $(CC))" @echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN ifndef NOFORTRAN
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"


+ 2
- 1
Makefile.rule View File

@@ -133,7 +133,8 @@ NO_AFFINITY = 1
# COMMON_OPT = -O2 # COMMON_OPT = -O2


# gfortran option for LAPACK # gfortran option for LAPACK
FCOMMON_OPT = -frecursive
# Enable this flag only on 64-bit Linux, and only if you need a thread-safe LAPACK library
# FCOMMON_OPT = -frecursive


# Profiling flags # Profiling flags
COMMON_PROF = -pg COMMON_PROF = -pg


+ 65
- 3
Makefile.system View File

@@ -46,15 +46,55 @@ ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET) GETARCH_FLAGS := -DFORCE_$(TARGET)
endif endif


# Force fallbacks for 32bit

ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif


#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
# #
ifdef TARGET_CORE ifdef TARGET_CORE
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
endif endif


# Force fallbacks for 32bit

ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif




ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
GETARCH_FLAGS += -DUSE64BITINT GETARCH_FLAGS += -DUSE64BITINT
endif endif
endif


ifndef GEMM_MULTITHREAD_THRESHOLD ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4 GEMM_MULTITHREAD_THRESHOLD=4
@@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX GETARCH_FLAGS += -DNO_AVX
endif endif


ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
endif

ifeq ($(DEBUG), 1) ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g GETARCH_FLAGS += -g
endif endif
@@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif endif


ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
@@ -503,8 +544,10 @@ else
ifdef BINARY64 ifdef BINARY64
FCOMMON_OPT += -m64 FCOMMON_OPT += -m64
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8 FCOMMON_OPT += -fdefault-integer-8
endif endif
endif
else else
FCOMMON_OPT += -m32 FCOMMON_OPT += -m32
endif endif
@@ -517,8 +560,10 @@ endif
ifeq ($(F_COMPILER), INTEL) ifeq ($(F_COMPILER), INTEL)
CCOMMON_OPT += -DF_INTERFACE_INTEL CCOMMON_OPT += -DF_INTERFACE_INTEL
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8 FCOMMON_OPT += -i8
endif endif
endif
ifdef USE_OPENMP ifdef USE_OPENMP
FCOMMON_OPT += -openmp FCOMMON_OPT += -openmp
endif endif
@@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
ifdef BINARY64 ifdef BINARY64
FCOMMON_OPT += -q64 FCOMMON_OPT += -q64
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -qintsize=8 FCOMMON_OPT += -qintsize=8
endif endif
endif
else else
FCOMMON_OPT += -q32 FCOMMON_OPT += -q32
endif endif
@@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
COMMON_PROF += -DPGICOMPILER COMMON_PROF += -DPGICOMPILER
ifdef BINARY64 ifdef BINARY64
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8 FCOMMON_OPT += -i8
endif endif
endif
FCOMMON_OPT += -tp p7-64 FCOMMON_OPT += -tp p7-64
else else
FCOMMON_OPT += -tp p7 FCOMMON_OPT += -tp p7
@@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
ifdef BINARY64 ifdef BINARY64
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8 FCOMMON_OPT += -i8
endif endif
endif endif
endif


ifneq ($(ARCH), mips64) ifneq ($(ARCH), mips64)
ifndef BINARY64 ifndef BINARY64
@@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64)
CCOMMON_OPT += -DF_INTERFACE_OPEN64 CCOMMON_OPT += -DF_INTERFACE_OPEN64
ifdef BINARY64 ifdef BINARY64
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8 FCOMMON_OPT += -i8
endif endif
endif endif
endif


ifeq ($(ARCH), mips64) ifeq ($(ARCH), mips64)
ifndef BINARY64 ifndef BINARY64
@@ -682,10 +735,12 @@ endif


ifdef BINARY64 ifdef BINARY64
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
CCOMMON_OPT += CCOMMON_OPT +=
#-DUSE64BITINT #-DUSE64BITINT
endif endif
endif endif
endif


ifeq ($(NEED_PIC), 1) ifeq ($(NEED_PIC), 1)
ifeq ($(C_COMPILER), IBM) ifeq ($(C_COMPILER), IBM)
@@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX CCOMMON_OPT += -DNO_AVX
endif endif


ifeq ($(BINARY), 32)
CCOMMON_OPT += -DNO_AVX
endif

ifdef SMP ifdef SMP
CCOMMON_OPT += -DSMP_SERVER CCOMMON_OPT += -DSMP_SERVER


@@ -872,8 +931,11 @@ endif
LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
LAPACK_CFLAGS += -DLAPACK_ILP64 LAPACK_CFLAGS += -DLAPACK_ILP64
endif endif
endif

ifdef OS_WINDOWS ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
endif endif


+ 2
- 0
cpuid_x86.c View File

@@ -40,6 +40,7 @@
#include <string.h> #include <string.h>
#include "cpuid.h" #include "cpuid.h"


/*
#ifdef NO_AVX #ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM #define CORE_HASWELL CORE_NEHALEM
@@ -50,6 +51,7 @@
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA #define CORE_PILEDRIVER CORE_BARCELONA
#endif #endif
*/


#ifndef CPUIDEMU #ifndef CPUIDEMU




+ 1
- 1
driver/others/divtable.c View File

@@ -39,7 +39,7 @@
#include "common.h" #include "common.h"


#ifdef SMP #ifdef SMP
#ifndef USE64BITINT
#if !defined(USE64BITINT) || defined(ARCH_X86)
unsigned int blas_quick_divide_table[] = { unsigned int blas_quick_divide_table[] = {
0x00000000, 0x00000001, 0x80000001, 0x55555556, 0x00000000, 0x00000001, 0x80000001, 0x55555556,
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,


+ 56
- 7
interface/gemm.c View File

@@ -72,7 +72,7 @@
#endif #endif


#ifndef GEMM_MULTITHREAD_THRESHOLD #ifndef GEMM_MULTITHREAD_THRESHOLD
# define GEMM_MULTITHREAD_THRESHOLD 4
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif #endif


static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
@@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT);


args.common = NULL;
int nthreads_max = num_cpu_avail(3);
int nthreads_avail = nthreads_max;


if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
args.nthreads = 1;
}else{
args.nthreads = num_cpu_avail(3);
#ifndef COMPLEX
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
} }
#else
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}

#endif
args.common = NULL;

if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else
args.nthreads = nthreads_max;


if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif


+ 5
- 4
interface/ger.c View File

@@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY; blasint incy = *INCY;
blasint lda = *LDA; blasint lda = *LDA;
FLOAT *buffer; FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads; int nthreads;
#endif #endif


@@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order,


FLOAT *buffer; FLOAT *buffer;
blasint info, t; blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads; int nthreads;
#endif #endif


@@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order,


buffer = (FLOAT *)blas_memory_alloc(1); buffer = (FLOAT *)blas_memory_alloc(1);


#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2); nthreads = num_cpu_avail(2);



if (nthreads == 1) { if (nthreads == 1) {
#endif #endif


GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);


#ifdef SMP
#ifdef SMPBUG
} else { } else {
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);


+ 1
- 1
interface/rotmg.c View File

@@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){


#endif #endif


FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;


if(*dd1 < ZERO) if(*dd1 < ZERO)
{ {


+ 4
- 4
interface/zger.c View File

@@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY; blasint incy = *INCY;
blasint lda = *LDA; blasint lda = *LDA;
FLOAT *buffer; FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads; int nthreads;
#endif #endif


@@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,


FLOAT *buffer; FLOAT *buffer;
blasint info, t; blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads; int nthreads;
#endif #endif


@@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,


buffer = (FLOAT *)blas_memory_alloc(1); buffer = (FLOAT *)blas_memory_alloc(1);


#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2); nthreads = num_cpu_avail(2);


if (nthreads == 1) { if (nthreads == 1) {
@@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order,
} }
#endif #endif


#ifdef SMP
#ifdef SMPBUG


} else { } else {




+ 3
- 0
kernel/x86_64/KERNEL.BARCELONA View File

@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t_dup.S




+ 3
- 0
kernel/x86_64/KERNEL.BULLDOZER View File

@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t_dup.S




+ 4
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -1,3 +1,7 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S


SGEMMKERNEL = sgemm_kernel_16x4_haswell.S SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c SGEMMITCOPY = ../generic/gemm_tcopy_16.c


+ 15
- 12
kernel/x86_64/KERNEL.NEHALEM View File

@@ -1,3 +1,7 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S


SGEMMKERNEL = gemm_kernel_4x8_nehalem.S SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S SGEMMITCOPY = gemm_tcopy_4.S
@@ -9,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)




DGEMMKERNEL = gemm_kernel_4x4_core2.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)


@@ -44,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S


DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S


CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S


+ 3
- 0
kernel/x86_64/KERNEL.PILEDRIVER View File

@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S ZGEMVTKERNEL = zgemv_t_dup.S




+ 8
- 6
kernel/x86_64/KERNEL.SANDYBRIDGE View File

@@ -1,14 +1,16 @@
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)



DGEMMKERNEL = dgemm_kernel_4x8_sandy.S DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c DGEMMITCOPY = ../generic/gemm_tcopy_8.c


+ 1
- 2
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S View File

@@ -79,8 +79,7 @@
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)


+ 1
- 3
kernel/x86_64/cgemm_kernel_4x2_piledriver.S View File

@@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)
@@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp) #define KK 72(%rsp)
#define KKK 80(%rsp) #define KKK 80(%rsp)
#define BUFFER1 128(%rsp) #define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384 #if L_BUFFER_SIZE > 16384


+ 1
- 3
kernel/x86_64/cgemm_kernel_8x2_haswell.S View File

@@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)
@@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp) #define KK 72(%rsp)
#define KKK 80(%rsp) #define KKK 80(%rsp)
#define BUFFER1 128(%rsp) #define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384 #if L_BUFFER_SIZE > 16384


+ 1
- 1
kernel/x86_64/dgemm_kernel_4x4_haswell.S View File

@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else #else
#define STACKSIZE 256 #define STACKSIZE 256
#define L_BUFFER_SIZE 128*8*12+4096
#define L_BUFFER_SIZE 128*8*12+512
#define OLD_A 40 + STACKSIZE(%rsp) #define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp) #define OLD_B 48 + STACKSIZE(%rsp)


+ 2
- 2
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S View File

@@ -148,8 +148,8 @@
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)


+ 2
- 2
kernel/x86_64/dgemm_kernel_8x2_piledriver.S View File

@@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)


+ 2
- 2
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S View File

@@ -78,8 +78,8 @@
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)


+ 2
- 2
kernel/x86_64/sgemm_kernel_16x2_piledriver.S View File

@@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)


+ 1
- 3
kernel/x86_64/sgemm_kernel_16x4_haswell.S View File

@@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)
@@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 64(%rsp) #define KK 64(%rsp)
#define KKK 72(%rsp) #define KKK 72(%rsp)
#define BUFFER1 128(%rsp) #define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384 #if L_BUFFER_SIZE > 16384


+ 3167
- 0
kernel/x86_64/sgemm_kernel_16x4_sandy.S
File diff suppressed because it is too large
View File


+ 1
- 3
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S View File

@@ -79,8 +79,7 @@
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)
@@ -91,7 +90,6 @@
#define KK 72(%rsp) #define KK 72(%rsp)
#define KKK 80(%rsp) #define KKK 80(%rsp)
#define BUFFER1 128(%rsp) #define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384 #if L_BUFFER_SIZE > 16384


+ 1
- 3
kernel/x86_64/zgemm_kernel_2x2_piledriver.S View File

@@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)
@@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp) #define KK 72(%rsp)
#define KKK 80(%rsp) #define KKK 80(%rsp)
#define BUFFER1 128(%rsp) #define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384 #if L_BUFFER_SIZE > 16384


+ 1
- 3
kernel/x86_64/zgemm_kernel_4x2_haswell.S View File

@@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp) #define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp) #define Nmod6 32(%rsp)
@@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp) #define KK 72(%rsp)
#define KKK 80(%rsp) #define KKK 80(%rsp)
#define BUFFER1 128(%rsp) #define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384 #if L_BUFFER_SIZE > 16384


+ 1
- 1
lapack-netlib/TESTING/nep.in View File

@@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines
0 5 7 3 200 Values of INIBL (nibble crossover point) 0 5 7 3 200 Values of INIBL (nibble crossover point)
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
20.0 Threshold value
70.0 Threshold value
T Put T to test the error exits T Put T to test the error exits
1 Code to interpret the seed 1 Code to interpret the seed
NEP 21 NEP 21

+ 15
- 6
param.h View File

@@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1
#else #else
#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1


#define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4
@@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#define GETRF_FACTOR 0.72 #define GETRF_FACTOR 0.72


#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 2
#define ZGEMM3M_DEFAULT_UNROLL_M 8
#endif #endif




@@ -1104,14 +1108,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1
#else #else
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1


#define SGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_N 4
@@ -1119,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1
#endif #endif


#define SGEMM_DEFAULT_P 512
#define SGEMM_DEFAULT_P 768
#define SGEMM_DEFAULT_R sgemm_r #define SGEMM_DEFAULT_R sgemm_r
//#define SGEMM_DEFAULT_R 1024 //#define SGEMM_DEFAULT_R 1024


@@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_R xgemm_r


#define SGEMM_DEFAULT_Q 256
#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128 #define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 256 #define CGEMM_DEFAULT_Q 256
#define ZGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128


#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 2
#define ZGEMM3M_DEFAULT_UNROLL_M 8

#define GETRF_FACTOR 0.72 #define GETRF_FACTOR 0.72


#endif #endif


Loading…
Cancel
Save