Browse Source

Merge pull request #390 from wernsaar/develop

Ref #103: enhancement for small matrix dimensions. Fixed some bugs. Enable sgemm for SNB and dgemm for NEHALEM
tags/v0.2.10.rc1^2
Zhang Xianyi 12 years ago
parent
commit
d10db52edb
30 changed files with 3375 additions and 75 deletions
  1. +4
    -0
      Makefile
  2. +2
    -1
      Makefile.rule
  3. +65
    -3
      Makefile.system
  4. +2
    -0
      cpuid_x86.c
  5. +1
    -1
      driver/others/divtable.c
  6. +56
    -7
      interface/gemm.c
  7. +5
    -4
      interface/ger.c
  8. +1
    -1
      interface/rotmg.c
  9. +4
    -4
      interface/zger.c
  10. +3
    -0
      kernel/x86_64/KERNEL.BARCELONA
  11. +3
    -0
      kernel/x86_64/KERNEL.BULLDOZER
  12. +4
    -0
      kernel/x86_64/KERNEL.HASWELL
  13. +15
    -12
      kernel/x86_64/KERNEL.NEHALEM
  14. +3
    -0
      kernel/x86_64/KERNEL.PILEDRIVER
  15. +8
    -6
      kernel/x86_64/KERNEL.SANDYBRIDGE
  16. +1
    -2
      kernel/x86_64/cgemm_kernel_4x2_bulldozer.S
  17. +1
    -3
      kernel/x86_64/cgemm_kernel_4x2_piledriver.S
  18. +1
    -3
      kernel/x86_64/cgemm_kernel_8x2_haswell.S
  19. +1
    -1
      kernel/x86_64/dgemm_kernel_4x4_haswell.S
  20. +2
    -2
      kernel/x86_64/dgemm_kernel_8x2_bulldozer.S
  21. +2
    -2
      kernel/x86_64/dgemm_kernel_8x2_piledriver.S
  22. +2
    -2
      kernel/x86_64/sgemm_kernel_16x2_bulldozer.S
  23. +2
    -2
      kernel/x86_64/sgemm_kernel_16x2_piledriver.S
  24. +1
    -3
      kernel/x86_64/sgemm_kernel_16x4_haswell.S
  25. +3167
    -0
      kernel/x86_64/sgemm_kernel_16x4_sandy.S
  26. +1
    -3
      kernel/x86_64/zgemm_kernel_2x2_bulldozer.S
  27. +1
    -3
      kernel/x86_64/zgemm_kernel_2x2_piledriver.S
  28. +1
    -3
      kernel/x86_64/zgemm_kernel_4x2_haswell.S
  29. +1
    -1
      lapack-netlib/TESTING/nep.in
  30. +15
    -6
      param.h

+ 4
- 0
Makefile View File

@@ -36,9 +36,13 @@ ifndef BINARY64
else
@echo " BINARY ... 64bit "
endif

ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif
endif

@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"


+ 2
- 1
Makefile.rule View File

@@ -133,7 +133,8 @@ NO_AFFINITY = 1
# COMMON_OPT = -O2

# gfortran option for LAPACK
FCOMMON_OPT = -frecursive
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
# FCOMMON_OPT = -frecursive

# Profiling flags
COMMON_PROF = -pg


+ 65
- 3
Makefile.system View File

@@ -46,15 +46,55 @@ ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
endif

# Force fallbacks for 32bit

ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif


#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
ifdef TARGET_CORE
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
endif

# Force fallbacks for 32bit

ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif




ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
GETARCH_FLAGS += -DUSE64BITINT
endif
endif

ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
@@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif

ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
endif

ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
@@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif

ifeq ($(ARCH), x86_64)
@@ -503,8 +544,10 @@ else
ifdef BINARY64
FCOMMON_OPT += -m64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8
endif
endif
else
FCOMMON_OPT += -m32
endif
@@ -517,8 +560,10 @@ endif
ifeq ($(F_COMPILER), INTEL)
CCOMMON_OPT += -DF_INTERFACE_INTEL
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifdef USE_OPENMP
FCOMMON_OPT += -openmp
endif
@@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
ifdef BINARY64
FCOMMON_OPT += -q64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -qintsize=8
endif
endif
else
FCOMMON_OPT += -q32
endif
@@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
COMMON_PROF += -DPGICOMPILER
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
@@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif

ifneq ($(ARCH), mips64)
ifndef BINARY64
@@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64)
CCOMMON_OPT += -DF_INTERFACE_OPEN64
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif

ifeq ($(ARCH), mips64)
ifndef BINARY64
@@ -682,10 +735,12 @@ endif

ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
CCOMMON_OPT +=
#-DUSE64BITINT
endif
endif
endif

ifeq ($(NEED_PIC), 1)
ifeq ($(C_COMPILER), IBM)
@@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif

ifeq ($(BINARY), 32)
CCOMMON_OPT += -DNO_AVX
endif

ifdef SMP
CCOMMON_OPT += -DSMP_SERVER

@@ -872,8 +931,11 @@ endif
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
endif

ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
endif


+ 2
- 0
cpuid_x86.c View File

@@ -40,6 +40,7 @@
#include <string.h>
#include "cpuid.h"

/*
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
@@ -50,6 +51,7 @@
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif
*/

#ifndef CPUIDEMU



+ 1
- 1
driver/others/divtable.c View File

@@ -39,7 +39,7 @@
#include "common.h"

#ifdef SMP
#ifndef USE64BITINT
#if !defined(USE64BITINT) || defined(ARCH_X86)
unsigned int blas_quick_divide_table[] = {
0x00000000, 0x00000001, 0x80000001, 0x55555556,
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,


+ 56
- 7
interface/gemm.c View File

@@ -72,7 +72,7 @@
#endif

#ifndef GEMM_MULTITHREAD_THRESHOLD
# define GEMM_MULTITHREAD_THRESHOLD 4
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif

static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
@@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);

args.common = NULL;
int nthreads_max = num_cpu_avail(3);
int nthreads_avail = nthreads_max;

if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
args.nthreads = 1;
}else{
args.nthreads = num_cpu_avail(3);
#ifndef COMPLEX
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}
#else
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}

#endif
args.common = NULL;

if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else
args.nthreads = nthreads_max;


if (args.nthreads == 1) {
#endif


+ 5
- 4
interface/ger.c View File

@@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

@@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order,

FLOAT *buffer;
blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

@@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order,

buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2);


if (nthreads == 1) {
#endif

GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);

#ifdef SMP
#ifdef SMPBUG
} else {
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);


+ 1
- 1
interface/rotmg.c View File

@@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){

#endif

FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;

if(*dd1 < ZERO)
{


+ 4
- 4
interface/zger.c View File

@@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

@@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,

FLOAT *buffer;
blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif

@@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,

buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2);

if (nthreads == 1) {
@@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif

#ifdef SMP
#ifdef SMPBUG

} else {



+ 3
- 0
kernel/x86_64/KERNEL.BARCELONA View File

@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S



+ 3
- 0
kernel/x86_64/KERNEL.BULLDOZER View File

@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S



+ 4
- 0
kernel/x86_64/KERNEL.HASWELL View File

@@ -1,3 +1,7 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S


SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c


+ 15
- 12
kernel/x86_64/KERNEL.NEHALEM View File

@@ -1,3 +1,7 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S


SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
@@ -9,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)


DGEMMKERNEL = gemm_kernel_4x4_core2.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

@@ -44,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S

DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S

DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S

CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S


+ 3
- 0
kernel/x86_64/KERNEL.PILEDRIVER View File

@@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S



+ 8
- 6
kernel/x86_64/KERNEL.SANDYBRIDGE View File

@@ -1,14 +1,16 @@
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S

SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)


DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c


+ 1
- 2
kernel/x86_64/cgemm_kernel_4x2_bulldozer.S View File

@@ -79,8 +79,7 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)


+ 1
- 3
kernel/x86_64/cgemm_kernel_4x2_piledriver.S View File

@@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384


+ 1
- 3
kernel/x86_64/cgemm_kernel_8x2_haswell.S View File

@@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384


+ 1
- 1
kernel/x86_64/dgemm_kernel_4x4_haswell.S View File

@@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define STACKSIZE 256
#define L_BUFFER_SIZE 128*8*12+4096
#define L_BUFFER_SIZE 128*8*12+512
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)


+ 2
- 2
kernel/x86_64/dgemm_kernel_8x2_bulldozer.S View File

@@ -148,8 +148,8 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)


+ 2
- 2
kernel/x86_64/dgemm_kernel_8x2_piledriver.S View File

@@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)


+ 2
- 2
kernel/x86_64/sgemm_kernel_16x2_bulldozer.S View File

@@ -78,8 +78,8 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)


+ 2
- 2
kernel/x86_64/sgemm_kernel_16x2_piledriver.S View File

@@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)


+ 1
- 3
kernel/x86_64/sgemm_kernel_16x4_haswell.S View File

@@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384


+ 3167
- 0
kernel/x86_64/sgemm_kernel_16x4_sandy.S
File diff suppressed because it is too large
View File


+ 1
- 3
kernel/x86_64/zgemm_kernel_2x2_bulldozer.S View File

@@ -79,8 +79,7 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -91,7 +90,6 @@
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384


+ 1
- 3
kernel/x86_64/zgemm_kernel_2x2_piledriver.S View File

@@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384


+ 1
- 3
kernel/x86_64/zgemm_kernel_4x2_haswell.S View File

@@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384


+ 1
- 1
lapack-netlib/TESTING/nep.in View File

@@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines
0 5 7 3 200 Values of INIBL (nibble crossover point)
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
20.0 Threshold value
70.0 Threshold value
T Put T to test the error exits
1 Code to interpret the seed
NEP 21

+ 15
- 6
param.h View File

@@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1
#else
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1

#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
@@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define GETRF_FACTOR 0.72

#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 2
#define ZGEMM3M_DEFAULT_UNROLL_M 8
#endif


@@ -1104,14 +1108,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#else
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1

#define SGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
@@ -1119,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1
#endif

#define SGEMM_DEFAULT_P 512
#define SGEMM_DEFAULT_P 768
#define SGEMM_DEFAULT_R sgemm_r
//#define SGEMM_DEFAULT_R 1024

@@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r

#define SGEMM_DEFAULT_Q 256
#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 256
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128

#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 2
#define ZGEMM3M_DEFAULT_UNROLL_M 8

#define GETRF_FACTOR 0.72

#endif


Loading…
Cancel
Save