Ref #103: enhancement for small matrix dimensions. Fixed some bugs. Enable sgemm for SNB and dgemm for NEHALEM (tags/v0.2.10.rc1^2)
| @@ -36,9 +36,13 @@ ifndef BINARY64 | |||
| else | |||
| @echo " BINARY ... 64bit " | |||
| endif | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " | |||
| endif | |||
| endif | |||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | |||
| ifndef NOFORTRAN | |||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | |||
| @@ -133,7 +133,8 @@ NO_AFFINITY = 1 | |||
| # COMMON_OPT = -O2 | |||
| # gfortran option for LAPACK | |||
| FCOMMON_OPT = -frecursive | |||
| # enable this flag only on 64bit Linux and if you need a thread safe lapack library | |||
| # FCOMMON_OPT = -frecursive | |||
| # Profiling flags | |||
| COMMON_PROF = -pg | |||
| @@ -46,15 +46,55 @@ ifdef TARGET | |||
| GETARCH_FLAGS := -DFORCE_$(TARGET) | |||
| endif | |||
| # Force fallbacks for 32bit | |||
| ifeq ($(BINARY), 32) | |||
| ifeq ($(TARGET), HASWELL) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), BULLDOZER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| ifeq ($(TARGET), PILEDRIVER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| endif | |||
| #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | |||
| # | |||
| ifdef TARGET_CORE | |||
| GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) | |||
| endif | |||
| # Force fallbacks for 32bit | |||
| ifeq ($(BINARY), 32) | |||
| ifeq ($(TARGET_CORE), HASWELL) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), BULLDOZER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| ifeq ($(TARGET_CORE), PILEDRIVER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| endif | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| GETARCH_FLAGS += -DUSE64BITINT | |||
| endif | |||
| endif | |||
| ifndef GEMM_MULTITHREAD_THRESHOLD | |||
| GEMM_MULTITHREAD_THRESHOLD=4 | |||
| @@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1) | |||
| GETARCH_FLAGS += -DNO_AVX | |||
| endif | |||
| ifeq ($(BINARY), 32) | |||
| GETARCH_FLAGS += -DNO_AVX | |||
| endif | |||
| ifeq ($(DEBUG), 1) | |||
| GETARCH_FLAGS += -g | |||
| endif | |||
| @@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(ARCH), x86) | |||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| @@ -503,8 +544,10 @@ else | |||
| ifdef BINARY64 | |||
| FCOMMON_OPT += -m64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -fdefault-integer-8 | |||
| endif | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -m32 | |||
| endif | |||
| @@ -517,8 +560,10 @@ endif | |||
| ifeq ($(F_COMPILER), INTEL) | |||
| CCOMMON_OPT += -DF_INTERFACE_INTEL | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| ifdef USE_OPENMP | |||
| FCOMMON_OPT += -openmp | |||
| endif | |||
| @@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM | |||
| ifdef BINARY64 | |||
| FCOMMON_OPT += -q64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -qintsize=8 | |||
| endif | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -q32 | |||
| endif | |||
| @@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI | |||
| COMMON_PROF += -DPGICOMPILER | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| FCOMMON_OPT += -tp p7-64 | |||
| else | |||
| FCOMMON_OPT += -tp p7 | |||
| @@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE) | |||
| CCOMMON_OPT += -DF_INTERFACE_PATHSCALE | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| endif | |||
| ifneq ($(ARCH), mips64) | |||
| ifndef BINARY64 | |||
| @@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64) | |||
| CCOMMON_OPT += -DF_INTERFACE_OPEN64 | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| ifndef BINARY64 | |||
| @@ -682,10 +735,12 @@ endif | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| CCOMMON_OPT += | |||
| #-DUSE64BITINT | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(NEED_PIC), 1) | |||
| ifeq ($(C_COMPILER), IBM) | |||
| @@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1) | |||
| CCOMMON_OPT += -DNO_AVX | |||
| endif | |||
| ifeq ($(BINARY), 32) | |||
| CCOMMON_OPT += -DNO_AVX | |||
| endif | |||
| ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| @@ -872,8 +931,11 @@ endif | |||
| LAPACK_CFLAGS = $(CFLAGS) | |||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| LAPACK_CFLAGS += -DLAPACK_ILP64 | |||
| endif | |||
| endif | |||
| ifdef OS_WINDOWS | |||
| LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS | |||
| endif | |||
| @@ -40,6 +40,7 @@ | |||
| #include <string.h> | |||
| #include "cpuid.h" | |||
| /* | |||
| #ifdef NO_AVX | |||
| #define CPUTYPE_HASWELL CPUTYPE_NEHALEM | |||
| #define CORE_HASWELL CORE_NEHALEM | |||
| @@ -50,6 +51,7 @@ | |||
| #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA | |||
| #define CORE_PILEDRIVER CORE_BARCELONA | |||
| #endif | |||
| */ | |||
| #ifndef CPUIDEMU | |||
| @@ -39,7 +39,7 @@ | |||
| #include "common.h" | |||
| #ifdef SMP | |||
| #ifndef USE64BITINT | |||
| #if !defined(USE64BITINT) || defined(ARCH_X86) | |||
| unsigned int blas_quick_divide_table[] = { | |||
| 0x00000000, 0x00000001, 0x80000001, 0x55555556, | |||
| 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, | |||
| @@ -72,7 +72,7 @@ | |||
| #endif | |||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | |||
| # define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| #endif | |||
| static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | |||
| @@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| mode |= (transa << BLAS_TRANSA_SHIFT); | |||
| mode |= (transb << BLAS_TRANSB_SHIFT); | |||
| args.common = NULL; | |||
| int nthreads_max = num_cpu_avail(3); | |||
| int nthreads_avail = nthreads_max; | |||
| if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD | |||
| || args.k <=GEMM_MULTITHREAD_THRESHOLD){ | |||
| args.nthreads = 1; | |||
| }else{ | |||
| args.nthreads = num_cpu_avail(3); | |||
| #ifndef COMPLEX | |||
| double MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| nthreads_max = 1; | |||
| else | |||
| { | |||
| if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| { | |||
| nthreads_max = 4; | |||
| if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD ) | |||
| { | |||
| nthreads_max = 2; | |||
| if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||
| if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||
| if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||
| } | |||
| else | |||
| { | |||
| if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2; | |||
| } | |||
| } | |||
| } | |||
| #else | |||
| double MNK = (double) args.m * (double) args.n * (double) args.k; | |||
| if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| nthreads_max = 1; | |||
| else | |||
| { | |||
| if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| { | |||
| nthreads_max = 4; | |||
| if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) | |||
| { | |||
| nthreads_max = 2; | |||
| if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||
| if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||
| if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||
| } | |||
| else | |||
| { | |||
| if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2; | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| args.common = NULL; | |||
| if ( nthreads_max > nthreads_avail ) | |||
| args.nthreads = nthreads_avail; | |||
| else | |||
| args.nthreads = nthreads_max; | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha, | |||
| blasint incy = *INCY; | |||
| blasint lda = *LDA; | |||
| FLOAT *buffer; | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| int nthreads; | |||
| #endif | |||
| @@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| FLOAT *buffer; | |||
| blasint info, t; | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| int nthreads; | |||
| #endif | |||
| @@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order, | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| } else { | |||
| GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); | |||
| @@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| #endif | |||
| FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp; | |||
| FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp; | |||
| if(*dd1 < ZERO) | |||
| { | |||
| @@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha, | |||
| blasint incy = *INCY; | |||
| blasint lda = *LDA; | |||
| FLOAT *buffer; | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| int nthreads; | |||
| #endif | |||
| @@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| FLOAT *buffer; | |||
| blasint info, t; | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| int nthreads; | |||
| #endif | |||
| @@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| @@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| #ifdef SMP | |||
| #ifdef SMPBUG | |||
| } else { | |||
| @@ -1,3 +1,6 @@ | |||
| SGEMVNKERNEL = sgemv_n.S | |||
| SGEMVTKERNEL = sgemv_t.S | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
| @@ -1,3 +1,6 @@ | |||
| SGEMVNKERNEL = sgemv_n.S | |||
| SGEMVTKERNEL = sgemv_t.S | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
| @@ -1,3 +1,7 @@ | |||
| SGEMVNKERNEL = sgemv_n.S | |||
| SGEMVTKERNEL = sgemv_t.S | |||
| SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| @@ -1,3 +1,7 @@ | |||
| SGEMVNKERNEL = sgemv_n.S | |||
| SGEMVTKERNEL = sgemv_t.S | |||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
| SGEMMINCOPY = gemm_ncopy_4.S | |||
| SGEMMITCOPY = gemm_tcopy_4.S | |||
| @@ -9,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_4x4_core2.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMKERNEL = gemm_kernel_2x8_nehalem.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -44,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | |||
| @@ -1,3 +1,6 @@ | |||
| SGEMVNKERNEL = sgemv_n.S | |||
| SGEMVTKERNEL = sgemv_t.S | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
| @@ -1,14 +1,16 @@ | |||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
| SGEMMINCOPY = gemm_ncopy_4.S | |||
| SGEMMITCOPY = gemm_tcopy_4.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMVNKERNEL = sgemv_n.S | |||
| SGEMVTKERNEL = sgemv_t.S | |||
| SGEMMKERNEL = sgemm_kernel_16x4_sandy.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| @@ -79,8 +79,7 @@ | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 256*8*4 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define KK 72(%rsp) | |||
| #define KKK 80(%rsp) | |||
| #define BUFFER1 128(%rsp) | |||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||
| #if defined(OS_WINDOWS) | |||
| #if L_BUFFER_SIZE > 16384 | |||
| @@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define KK 72(%rsp) | |||
| #define KKK 80(%rsp) | |||
| #define BUFFER1 128(%rsp) | |||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||
| #if defined(OS_WINDOWS) | |||
| #if L_BUFFER_SIZE > 16384 | |||
| @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #define STACKSIZE 256 | |||
| #define L_BUFFER_SIZE 128*8*12+4096 | |||
| #define L_BUFFER_SIZE 128*8*12+512 | |||
| #define OLD_A 40 + STACKSIZE(%rsp) | |||
| #define OLD_B 48 + STACKSIZE(%rsp) | |||
| @@ -148,8 +148,8 @@ | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define LB2_OFFSET 4096 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define LB2_OFFSET 4096 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -78,8 +78,8 @@ | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define LB2_OFFSET 4096 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define LB2_OFFSET 4096 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define KK 64(%rsp) | |||
| #define KKK 72(%rsp) | |||
| #define BUFFER1 128(%rsp) | |||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||
| #if defined(OS_WINDOWS) | |||
| #if L_BUFFER_SIZE > 16384 | |||
| @@ -79,8 +79,7 @@ | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -91,7 +90,6 @@ | |||
| #define KK 72(%rsp) | |||
| #define KKK 80(%rsp) | |||
| #define BUFFER1 128(%rsp) | |||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||
| #if defined(OS_WINDOWS) | |||
| #if L_BUFFER_SIZE > 16384 | |||
| @@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 256*8*4 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define KK 72(%rsp) | |||
| #define KKK 80(%rsp) | |||
| #define BUFFER1 128(%rsp) | |||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||
| #if defined(OS_WINDOWS) | |||
| #if L_BUFFER_SIZE > 16384 | |||
| @@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define L_BUFFER_SIZE 512*8*4 | |||
| #define LB2_OFFSET 512*8*2 | |||
| #define L_BUFFER_SIZE 8192 | |||
| #define Ndiv6 24(%rsp) | |||
| #define Nmod6 32(%rsp) | |||
| @@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define KK 72(%rsp) | |||
| #define KKK 80(%rsp) | |||
| #define BUFFER1 128(%rsp) | |||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||
| #if defined(OS_WINDOWS) | |||
| #if L_BUFFER_SIZE > 16384 | |||
| @@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines | |||
| 0 5 7 3 200 Values of INIBL (nibble crossover point) | |||
| 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) | |||
| 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) | |||
| 20.0 Threshold value | |||
| 70.0 Threshold value | |||
| T Put T to test the error exits | |||
| 1 Code to interpret the seed | |||
| NEP 21 | |||
| @@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GETRF_FACTOR 0.72 | |||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_M 8 | |||
| #endif | |||
| @@ -1104,14 +1108,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #else | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -1119,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||
| #endif | |||
| #define SGEMM_DEFAULT_P 512 | |||
| #define SGEMM_DEFAULT_P 768 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| //#define SGEMM_DEFAULT_R 1024 | |||
| @@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define XGEMM_DEFAULT_P 252 | |||
| #define XGEMM_DEFAULT_R xgemm_r | |||
| #define SGEMM_DEFAULT_Q 256 | |||
| #define SGEMM_DEFAULT_Q 384 | |||
| #define DGEMM_DEFAULT_Q 256 | |||
| #define QGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 256 | |||
| #define ZGEMM_DEFAULT_Q 192 | |||
| #define XGEMM_DEFAULT_Q 128 | |||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM3M_DEFAULT_UNROLL_M 8 | |||
| #define GETRF_FACTOR 0.72 | |||
| #endif | |||