Ref #103: enhancement for small matrix dimensions. Fixed some bugs. Enable sgemm for SNB and dgemm for NEHALEM
tags/v0.2.10.rc1^2
| @@ -36,9 +36,13 @@ ifndef BINARY64 | |||||
| else | else | ||||
| @echo " BINARY ... 64bit " | @echo " BINARY ... 64bit " | ||||
| endif | endif | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " | @echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) " | ||||
| endif | endif | ||||
| endif | |||||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | ||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | ||||
| @@ -133,7 +133,8 @@ NO_AFFINITY = 1 | |||||
| # COMMON_OPT = -O2 | # COMMON_OPT = -O2 | ||||
| # gfortran option for LAPACK | # gfortran option for LAPACK | ||||
| FCOMMON_OPT = -frecursive | |||||
| # enable this flag only on 64bit Linux and if you need a thread safe lapack library | |||||
| # FCOMMON_OPT = -frecursive | |||||
| # Profiling flags | # Profiling flags | ||||
| COMMON_PROF = -pg | COMMON_PROF = -pg | ||||
| @@ -46,15 +46,55 @@ ifdef TARGET | |||||
| GETARCH_FLAGS := -DFORCE_$(TARGET) | GETARCH_FLAGS := -DFORCE_$(TARGET) | ||||
| endif | endif | ||||
| # Force fallbacks for 32bit | |||||
| ifeq ($(BINARY), 32) | |||||
| ifeq ($(TARGET), HASWELL) | |||||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||||
| endif | |||||
| ifeq ($(TARGET), SANDYBRIDGE) | |||||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||||
| endif | |||||
| ifeq ($(TARGET), BULLDOZER) | |||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||||
| endif | |||||
| ifeq ($(TARGET), PILEDRIVER) | |||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||||
| endif | |||||
| endif | |||||
| #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | ||||
| # | # | ||||
| ifdef TARGET_CORE | ifdef TARGET_CORE | ||||
| GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) | GETARCH_FLAGS := -DFORCE_$(TARGET_CORE) | ||||
| endif | endif | ||||
| # Force fallbacks for 32bit | |||||
| ifeq ($(BINARY), 32) | |||||
| ifeq ($(TARGET_CORE), HASWELL) | |||||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||||
| endif | |||||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||||
| endif | |||||
| ifeq ($(TARGET_CORE), BULLDOZER) | |||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||||
| endif | |||||
| ifeq ($(TARGET_CORE), PILEDRIVER) | |||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||||
| endif | |||||
| endif | |||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| GETARCH_FLAGS += -DUSE64BITINT | GETARCH_FLAGS += -DUSE64BITINT | ||||
| endif | endif | ||||
| endif | |||||
| ifndef GEMM_MULTITHREAD_THRESHOLD | ifndef GEMM_MULTITHREAD_THRESHOLD | ||||
| GEMM_MULTITHREAD_THRESHOLD=4 | GEMM_MULTITHREAD_THRESHOLD=4 | ||||
| @@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1) | |||||
| GETARCH_FLAGS += -DNO_AVX | GETARCH_FLAGS += -DNO_AVX | ||||
| endif | endif | ||||
| ifeq ($(BINARY), 32) | |||||
| GETARCH_FLAGS += -DNO_AVX | |||||
| endif | |||||
| ifeq ($(DEBUG), 1) | ifeq ($(DEBUG), 1) | ||||
| GETARCH_FLAGS += -g | GETARCH_FLAGS += -g | ||||
| endif | endif | ||||
| @@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||||
| ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| @@ -503,8 +544,10 @@ else | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| FCOMMON_OPT += -m64 | FCOMMON_OPT += -m64 | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -fdefault-integer-8 | FCOMMON_OPT += -fdefault-integer-8 | ||||
| endif | endif | ||||
| endif | |||||
| else | else | ||||
| FCOMMON_OPT += -m32 | FCOMMON_OPT += -m32 | ||||
| endif | endif | ||||
| @@ -517,8 +560,10 @@ endif | |||||
| ifeq ($(F_COMPILER), INTEL) | ifeq ($(F_COMPILER), INTEL) | ||||
| CCOMMON_OPT += -DF_INTERFACE_INTEL | CCOMMON_OPT += -DF_INTERFACE_INTEL | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | |||||
| ifdef USE_OPENMP | ifdef USE_OPENMP | ||||
| FCOMMON_OPT += -openmp | FCOMMON_OPT += -openmp | ||||
| endif | endif | ||||
| @@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| FCOMMON_OPT += -q64 | FCOMMON_OPT += -q64 | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -qintsize=8 | FCOMMON_OPT += -qintsize=8 | ||||
| endif | endif | ||||
| endif | |||||
| else | else | ||||
| FCOMMON_OPT += -q32 | FCOMMON_OPT += -q32 | ||||
| endif | endif | ||||
| @@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI | |||||
| COMMON_PROF += -DPGICOMPILER | COMMON_PROF += -DPGICOMPILER | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | |||||
| FCOMMON_OPT += -tp p7-64 | FCOMMON_OPT += -tp p7-64 | ||||
| else | else | ||||
| FCOMMON_OPT += -tp p7 | FCOMMON_OPT += -tp p7 | ||||
| @@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE) | |||||
| CCOMMON_OPT += -DF_INTERFACE_PATHSCALE | CCOMMON_OPT += -DF_INTERFACE_PATHSCALE | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifneq ($(ARCH), mips64) | ifneq ($(ARCH), mips64) | ||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| @@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64) | |||||
| CCOMMON_OPT += -DF_INTERFACE_OPEN64 | CCOMMON_OPT += -DF_INTERFACE_OPEN64 | ||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | FCOMMON_OPT += -i8 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(ARCH), mips64) | ifeq ($(ARCH), mips64) | ||||
| ifndef BINARY64 | ifndef BINARY64 | ||||
| @@ -682,10 +735,12 @@ endif | |||||
| ifdef BINARY64 | ifdef BINARY64 | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| CCOMMON_OPT += | CCOMMON_OPT += | ||||
| #-DUSE64BITINT | #-DUSE64BITINT | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(NEED_PIC), 1) | ifeq ($(NEED_PIC), 1) | ||||
| ifeq ($(C_COMPILER), IBM) | ifeq ($(C_COMPILER), IBM) | ||||
| @@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1) | |||||
| CCOMMON_OPT += -DNO_AVX | CCOMMON_OPT += -DNO_AVX | ||||
| endif | endif | ||||
| ifeq ($(BINARY), 32) | |||||
| CCOMMON_OPT += -DNO_AVX | |||||
| endif | |||||
| ifdef SMP | ifdef SMP | ||||
| CCOMMON_OPT += -DSMP_SERVER | CCOMMON_OPT += -DSMP_SERVER | ||||
| @@ -872,8 +931,11 @@ endif | |||||
| LAPACK_CFLAGS = $(CFLAGS) | LAPACK_CFLAGS = $(CFLAGS) | ||||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | ||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | |||||
| LAPACK_CFLAGS += -DLAPACK_ILP64 | LAPACK_CFLAGS += -DLAPACK_ILP64 | ||||
| endif | endif | ||||
| endif | |||||
| ifdef OS_WINDOWS | ifdef OS_WINDOWS | ||||
| LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS | LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS | ||||
| endif | endif | ||||
| @@ -40,6 +40,7 @@ | |||||
| #include <string.h> | #include <string.h> | ||||
| #include "cpuid.h" | #include "cpuid.h" | ||||
| /* | |||||
| #ifdef NO_AVX | #ifdef NO_AVX | ||||
| #define CPUTYPE_HASWELL CPUTYPE_NEHALEM | #define CPUTYPE_HASWELL CPUTYPE_NEHALEM | ||||
| #define CORE_HASWELL CORE_NEHALEM | #define CORE_HASWELL CORE_NEHALEM | ||||
| @@ -50,6 +51,7 @@ | |||||
| #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA | #define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA | ||||
| #define CORE_PILEDRIVER CORE_BARCELONA | #define CORE_PILEDRIVER CORE_BARCELONA | ||||
| #endif | #endif | ||||
| */ | |||||
| #ifndef CPUIDEMU | #ifndef CPUIDEMU | ||||
| @@ -39,7 +39,7 @@ | |||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef SMP | #ifdef SMP | ||||
| #ifndef USE64BITINT | |||||
| #if !defined(USE64BITINT) || defined(ARCH_X86) | |||||
| unsigned int blas_quick_divide_table[] = { | unsigned int blas_quick_divide_table[] = { | ||||
| 0x00000000, 0x00000001, 0x80000001, 0x55555556, | 0x00000000, 0x00000001, 0x80000001, 0x55555556, | ||||
| 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, | 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, | ||||
| @@ -72,7 +72,7 @@ | |||||
| #endif | #endif | ||||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | #ifndef GEMM_MULTITHREAD_THRESHOLD | ||||
| # define GEMM_MULTITHREAD_THRESHOLD 4 | |||||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||||
| #endif | #endif | ||||
| static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | ||||
| @@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| mode |= (transa << BLAS_TRANSA_SHIFT); | mode |= (transa << BLAS_TRANSA_SHIFT); | ||||
| mode |= (transb << BLAS_TRANSB_SHIFT); | mode |= (transb << BLAS_TRANSB_SHIFT); | ||||
| args.common = NULL; | |||||
| int nthreads_max = num_cpu_avail(3); | |||||
| int nthreads_avail = nthreads_max; | |||||
| if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD | |||||
| || args.k <=GEMM_MULTITHREAD_THRESHOLD){ | |||||
| args.nthreads = 1; | |||||
| }else{ | |||||
| args.nthreads = num_cpu_avail(3); | |||||
| #ifndef COMPLEX | |||||
| double MNK = (double) args.m * (double) args.n * (double) args.k; | |||||
| if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||||
| nthreads_max = 1; | |||||
| else | |||||
| { | |||||
| if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||||
| { | |||||
| nthreads_max = 4; | |||||
| if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD ) | |||||
| { | |||||
| nthreads_max = 2; | |||||
| if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||||
| if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||||
| if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2; | |||||
| } | |||||
| } | |||||
| } | } | ||||
| #else | |||||
| double MNK = (double) args.m * (double) args.n * (double) args.k; | |||||
| if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||||
| nthreads_max = 1; | |||||
| else | |||||
| { | |||||
| if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||||
| { | |||||
| nthreads_max = 4; | |||||
| if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) | |||||
| { | |||||
| nthreads_max = 2; | |||||
| if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||||
| if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||||
| if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1; | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2; | |||||
| } | |||||
| } | |||||
| } | |||||
| #endif | |||||
| args.common = NULL; | |||||
| if ( nthreads_max > nthreads_avail ) | |||||
| args.nthreads = nthreads_avail; | |||||
| else | |||||
| args.nthreads = nthreads_max; | |||||
| if (args.nthreads == 1) { | if (args.nthreads == 1) { | ||||
| #endif | #endif | ||||
| @@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha, | |||||
| blasint incy = *INCY; | blasint incy = *INCY; | ||||
| blasint lda = *LDA; | blasint lda = *LDA; | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| int nthreads; | int nthreads; | ||||
| #endif | #endif | ||||
| @@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| blasint info, t; | blasint info, t; | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| int nthreads; | int nthreads; | ||||
| #endif | #endif | ||||
| @@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| buffer = (FLOAT *)blas_memory_alloc(1); | buffer = (FLOAT *)blas_memory_alloc(1); | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| nthreads = num_cpu_avail(2); | nthreads = num_cpu_avail(2); | ||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| #endif | #endif | ||||
| GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); | GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| } else { | } else { | ||||
| GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); | GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); | ||||
| @@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||||
| #endif | #endif | ||||
| FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp; | |||||
| FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp; | |||||
| if(*dd1 < ZERO) | if(*dd1 < ZERO) | ||||
| { | { | ||||
| @@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha, | |||||
| blasint incy = *INCY; | blasint incy = *INCY; | ||||
| blasint lda = *LDA; | blasint lda = *LDA; | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| int nthreads; | int nthreads; | ||||
| #endif | #endif | ||||
| @@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| blasint info, t; | blasint info, t; | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| int nthreads; | int nthreads; | ||||
| #endif | #endif | ||||
| @@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| buffer = (FLOAT *)blas_memory_alloc(1); | buffer = (FLOAT *)blas_memory_alloc(1); | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| nthreads = num_cpu_avail(2); | nthreads = num_cpu_avail(2); | ||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| @@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| } | } | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| #ifdef SMPBUG | |||||
| } else { | } else { | ||||
| @@ -1,3 +1,6 @@ | |||||
| SGEMVNKERNEL = sgemv_n.S | |||||
| SGEMVTKERNEL = sgemv_t.S | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | ZGEMVNKERNEL = zgemv_n_dup.S | ||||
| ZGEMVTKERNEL = zgemv_t_dup.S | ZGEMVTKERNEL = zgemv_t_dup.S | ||||
| @@ -1,3 +1,6 @@ | |||||
| SGEMVNKERNEL = sgemv_n.S | |||||
| SGEMVTKERNEL = sgemv_t.S | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | ZGEMVNKERNEL = zgemv_n_dup.S | ||||
| ZGEMVTKERNEL = zgemv_t_dup.S | ZGEMVTKERNEL = zgemv_t_dup.S | ||||
| @@ -1,3 +1,7 @@ | |||||
| SGEMVNKERNEL = sgemv_n.S | |||||
| SGEMVTKERNEL = sgemv_t.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | SGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | SGEMMITCOPY = ../generic/gemm_tcopy_16.c | ||||
| @@ -1,3 +1,7 @@ | |||||
| SGEMVNKERNEL = sgemv_n.S | |||||
| SGEMVTKERNEL = sgemv_t.S | |||||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | ||||
| SGEMMINCOPY = gemm_ncopy_4.S | SGEMMINCOPY = gemm_ncopy_4.S | ||||
| SGEMMITCOPY = gemm_tcopy_4.S | SGEMMITCOPY = gemm_tcopy_4.S | ||||
| @@ -9,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL = gemm_kernel_4x4_core2.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMKERNEL = gemm_kernel_2x8_nehalem.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| @@ -44,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | ||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S | ||||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S | ||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S | ||||
| @@ -1,3 +1,6 @@ | |||||
| SGEMVNKERNEL = sgemv_n.S | |||||
| SGEMVTKERNEL = sgemv_t.S | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | ZGEMVNKERNEL = zgemv_n_dup.S | ||||
| ZGEMVTKERNEL = zgemv_t_dup.S | ZGEMVTKERNEL = zgemv_t_dup.S | ||||
| @@ -1,14 +1,16 @@ | |||||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||||
| SGEMMINCOPY = gemm_ncopy_4.S | |||||
| SGEMMITCOPY = gemm_tcopy_4.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMVNKERNEL = sgemv_n.S | |||||
| SGEMVTKERNEL = sgemv_t.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x4_sandy.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | DGEMMKERNEL = dgemm_kernel_4x8_sandy.S | ||||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | DGEMMINCOPY = ../generic/gemm_ncopy_8.c | ||||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | DGEMMITCOPY = ../generic/gemm_tcopy_8.c | ||||
| @@ -79,8 +79,7 @@ | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 256*8*4 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define KK 72(%rsp) | #define KK 72(%rsp) | ||||
| #define KKK 80(%rsp) | #define KKK 80(%rsp) | ||||
| #define BUFFER1 128(%rsp) | #define BUFFER1 128(%rsp) | ||||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| @@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define KK 72(%rsp) | #define KK 72(%rsp) | ||||
| #define KKK 80(%rsp) | #define KKK 80(%rsp) | ||||
| #define BUFFER1 128(%rsp) | #define BUFFER1 128(%rsp) | ||||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| @@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #define STACKSIZE 256 | #define STACKSIZE 256 | ||||
| #define L_BUFFER_SIZE 128*8*12+4096 | |||||
| #define L_BUFFER_SIZE 128*8*12+512 | |||||
| #define OLD_A 40 + STACKSIZE(%rsp) | #define OLD_A 40 + STACKSIZE(%rsp) | ||||
| #define OLD_B 48 + STACKSIZE(%rsp) | #define OLD_B 48 + STACKSIZE(%rsp) | ||||
| @@ -148,8 +148,8 @@ | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define LB2_OFFSET 4096 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define LB2_OFFSET 4096 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -78,8 +78,8 @@ | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define LB2_OFFSET 4096 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define LB2_OFFSET 4096 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define KK 64(%rsp) | #define KK 64(%rsp) | ||||
| #define KKK 72(%rsp) | #define KKK 72(%rsp) | ||||
| #define BUFFER1 128(%rsp) | #define BUFFER1 128(%rsp) | ||||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| @@ -79,8 +79,7 @@ | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -91,7 +90,6 @@ | |||||
| #define KK 72(%rsp) | #define KK 72(%rsp) | ||||
| #define KKK 80(%rsp) | #define KKK 80(%rsp) | ||||
| #define BUFFER1 128(%rsp) | #define BUFFER1 128(%rsp) | ||||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| @@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 256*8*4 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define KK 72(%rsp) | #define KK 72(%rsp) | ||||
| #define KKK 80(%rsp) | #define KKK 80(%rsp) | ||||
| #define BUFFER1 128(%rsp) | #define BUFFER1 128(%rsp) | ||||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| @@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define L_BUFFER_SIZE 512*8*4 | |||||
| #define LB2_OFFSET 512*8*2 | |||||
| #define L_BUFFER_SIZE 8192 | |||||
| #define Ndiv6 24(%rsp) | #define Ndiv6 24(%rsp) | ||||
| #define Nmod6 32(%rsp) | #define Nmod6 32(%rsp) | ||||
| @@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define KK 72(%rsp) | #define KK 72(%rsp) | ||||
| #define KKK 80(%rsp) | #define KKK 80(%rsp) | ||||
| #define BUFFER1 128(%rsp) | #define BUFFER1 128(%rsp) | ||||
| #define BUFFER2 LB2_OFFSET+128(%rsp) | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| @@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines | |||||
| 0 5 7 3 200 Values of INIBL (nibble crossover point) | 0 5 7 3 200 Values of INIBL (nibble crossover point) | ||||
| 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) | 1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts) | ||||
| 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) | 0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2) | ||||
| 20.0 Threshold value | |||||
| 70.0 Threshold value | |||||
| T Put T to test the error exits | T Put T to test the error exits | ||||
| 1 Code to interpret the seed | 1 Code to interpret the seed | ||||
| NEP 21 | NEP 21 | ||||
| @@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | #define XGEMM_DEFAULT_UNROLL_N 1 | ||||
| #else | #else | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | #define SGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | #define QGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | #define CGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | #define ZGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | #define XGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | #define QGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | #define CGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | #define ZGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GETRF_FACTOR 0.72 | #define GETRF_FACTOR 0.72 | ||||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||||
| #define ZGEMM3M_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM3M_DEFAULT_UNROLL_M 8 | |||||
| #endif | #endif | ||||
| @@ -1104,14 +1108,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | #define ZGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | #define XGEMM_DEFAULT_UNROLL_N 1 | ||||
| #else | #else | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | #define DGEMM_DEFAULT_UNROLL_M 8 | ||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | #define QGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | #define CGEMM_DEFAULT_UNROLL_M 2 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | #define ZGEMM_DEFAULT_UNROLL_M 4 | ||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | #define XGEMM_DEFAULT_UNROLL_M 1 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | #define QGEMM_DEFAULT_UNROLL_N 2 | ||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | #define CGEMM_DEFAULT_UNROLL_N 4 | ||||
| @@ -1119,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | #define XGEMM_DEFAULT_UNROLL_N 1 | ||||
| #endif | #endif | ||||
| #define SGEMM_DEFAULT_P 512 | |||||
| #define SGEMM_DEFAULT_P 768 | |||||
| #define SGEMM_DEFAULT_R sgemm_r | #define SGEMM_DEFAULT_R sgemm_r | ||||
| //#define SGEMM_DEFAULT_R 1024 | //#define SGEMM_DEFAULT_R 1024 | ||||
| @@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define XGEMM_DEFAULT_P 252 | #define XGEMM_DEFAULT_P 252 | ||||
| #define XGEMM_DEFAULT_R xgemm_r | #define XGEMM_DEFAULT_R xgemm_r | ||||
| #define SGEMM_DEFAULT_Q 256 | |||||
| #define SGEMM_DEFAULT_Q 384 | |||||
| #define DGEMM_DEFAULT_Q 256 | #define DGEMM_DEFAULT_Q 256 | ||||
| #define QGEMM_DEFAULT_Q 128 | #define QGEMM_DEFAULT_Q 128 | ||||
| #define CGEMM_DEFAULT_Q 256 | #define CGEMM_DEFAULT_Q 256 | ||||
| #define ZGEMM_DEFAULT_Q 192 | #define ZGEMM_DEFAULT_Q 192 | ||||
| #define XGEMM_DEFAULT_Q 128 | #define XGEMM_DEFAULT_Q 128 | ||||
| #define CGEMM3M_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM3M_DEFAULT_UNROLL_M 8 | |||||
| #define ZGEMM3M_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM3M_DEFAULT_UNROLL_M 8 | |||||
| #define GETRF_FACTOR 0.72 | #define GETRF_FACTOR 0.72 | ||||
| #endif | #endif | ||||