From 09b241784854e48620bea2a329fcc79344ca9d9b Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 27 Mar 2012 14:17:13 +0800 Subject: [PATCH 001/162] Fixed a typo in license file. --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index f5e4f35a7..1e93a6a73 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without From 19a48b82cf3c4aa25659ea89dce494e2d78fed25 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 30 Mar 2012 20:01:03 +0800 Subject: [PATCH 002/162] Init Sandybridge codes based on Nehalem. --- Makefile.system | 5 +- TargetList.txt | 2 + cpuid.h | 3 + cpuid_x86.c | 11 +++- driver/others/parameter.c | 13 ++++- getarch.c | 14 +++++ kernel/setparam-ref.c | 16 ++++++ kernel/x86/KERNEL.SANDYBRIDGE | 1 + kernel/x86/gemm_kernel_2x4_penryn.S | 6 ++ kernel/x86/gemm_kernel_4x4_penryn.S | 8 ++- kernel/x86/gemv_n_sse.S | 2 +- kernel/x86/gemv_n_sse2.S | 2 +- kernel/x86/gemv_t_sse.S | 2 +- kernel/x86/gemv_t_sse2.S | 2 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/zgemm_kernel_1x2_penryn.S | 2 +- kernel/x86/zgemm_kernel_2x2_penryn.S | 2 +- kernel/x86/zgemv_n_sse.S | 2 +- kernel/x86/zgemv_n_sse2.S | 2 +- kernel/x86/zgemv_t_sse.S | 2 +- kernel/x86/zgemv_t_sse2.S | 2 +- kernel/x86/zscal_sse.S | 4 +- kernel/x86/zscal_sse2.S | 4 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- 
kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 59 ++++++++++++++++++++ kernel/x86_64/dgemm_ncopy_2.S | 6 ++ kernel/x86_64/dgemm_ncopy_4.S | 2 +- kernel/x86_64/dgemm_ncopy_8.S | 6 ++ kernel/x86_64/dgemm_tcopy_2.S | 7 +++ kernel/x86_64/dgemm_tcopy_4.S | 6 ++ kernel/x86_64/dgemm_tcopy_8.S | 7 +++ kernel/x86_64/gemm_ncopy_2.S | 7 +++ kernel/x86_64/gemm_ncopy_4.S | 2 +- kernel/x86_64/gemm_tcopy_2.S | 7 +++ kernel/x86_64/gemm_tcopy_4.S | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zscal_sse.S | 2 +- kernel/x86_64/zscal_sse2.S | 4 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 7 +++ l2param.h | 11 ++++ param.h | 74 +++++++++++++++++++++++++ 57 files changed, 309 insertions(+), 45 deletions(-) create mode 100644 kernel/x86/KERNEL.SANDYBRIDGE create mode 100644 kernel/x86_64/KERNEL.SANDYBRIDGE diff --git a/Makefile.system b/Makefile.system index 0fd223d60..7c6dce4a5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -226,11 +226,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO endif ifndef DYNAMIC_CORE @@ -740,6 +740,7 @@ export HAVE_SSE4_1 export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 +export HAVE_AVX export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/TargetList.txt b/TargetList.txt index 1c3d7c5b9..9e0db4866 100644 --- 
a/TargetList.txt +++ b/TargetList.txt @@ -18,6 +18,7 @@ CORE2 PENRYN DUNNINGTON NEHALEM +SANDYBRIDGE ATOM b)AMD CPU: @@ -47,6 +48,7 @@ CELL 3.MIPS64 CPU: SICORTEX LOONGSON3A +LOONGSON3B 4.IA64 CPU: ITANIUM2 diff --git a/cpuid.h b/cpuid.h index 665ede077..c0f21698d 100644 --- a/cpuid.h +++ b/cpuid.h @@ -103,6 +103,7 @@ #define CORE_NEHALEM 17 #define CORE_ATOM 18 #define CORE_NANO 19 +#define CORE_SANDYBRIDGE 20 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -122,6 +123,7 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) +#define HAVE_AVX (1 << 18) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -188,4 +190,5 @@ typedef struct { #define CPUTYPE_NSGEODE 41 #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 +#define CPUTYPE_SANDYBRIDGE 44 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index e183e9fc3..9916a662b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -189,6 +189,7 @@ int get_cputype(int gettype){ if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; + if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); @@ -983,7 +984,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; @@ -1140,6 +1141,7 @@ static char *cpuname[] = { "NSGEODE", "VIAC3", "NANO", + "SANDYBRIDGE", }; static char *lowercpuname[] = { @@ -1186,6 +1188,7 @@ static char *lowercpuname[] = { "tms3x00", "nsgeode", "nano", + "sandybridge", }; static char *corename[] = { @@ -1209,6 +1212,7 @@ static char *corename[] = { "NEHALEM", "ATOM", "NANO", + "SANDYBRIDGE", }; static char *corename_lower[] = { @@ -1232,6 +1236,7 @@ static char *corename_lower[] = { "nehalem", "atom", "nano", + "sandybridge", }; 
@@ -1315,7 +1320,7 @@ int get_coretype(void){ return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; @@ -1414,6 +1419,7 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); @@ -1479,6 +1485,7 @@ void get_sse(void){ if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 21f56e889..5ff1f2934 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -165,7 +165,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -384,6 +384,17 @@ void blas_set_parameter(void){ #endif #endif +#if defined(SANDYBRIDGE) + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + #if defined(CORE_PRESCOTT) || defined(GENERIC) size >>= 6; diff --git a/getarch.c b/getarch.c index 5b614472a..d8f467f03 100644 --- 
a/getarch.c +++ b/getarch.c @@ -278,6 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "NEHALEM" #endif +#ifdef FORCE_SANDYBRIDGE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index aa45d47f8..e841bb171 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -746,6 +746,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SANDYBRIDGE + +#ifdef DEBUG + fprintf(stderr, "Sandybridge\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON #ifdef DEBUG diff --git a/kernel/x86/KERNEL.SANDYBRIDGE b/kernel/x86/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.SANDYBRIDGE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S index 263aea042..0bdc9185c 100644 --- a/kernel/x86/gemm_kernel_2x4_penryn.S +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -76,6 +76,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif diff --git 
a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S index 6775d1d18..2d51d9711 100644 --- a/kernel/x86/gemm_kernel_4x4_penryn.S +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -69,6 +69,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif @@ -262,7 +268,7 @@ movaps -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 -#ifndef NEHALEM +#if !(defined(NEHALEM) || defined(SANDYBRIDGE)) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) #endif pshufd $0x93, %xmm1, %xmm2 diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index aae49a22d..0891657fa 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 669c5ac6c..5f5fa5a51 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index a4990116d..5bacb7da8 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || 
defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index 9960b5c0c..c7e685dd8 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 6645b790e..ebd1377f1 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index bb33918ef..6fa7d410e 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 55c69e49f..9ce4cd8d4 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 11cc104e2..a1a35a7a5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ 
b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 01876a515..a5333640d 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 40a9604d3..c3619ec3d 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S index 849361956..70b38dc79 100644 --- a/kernel/x86/zgemm_kernel_1x2_penryn.S +++ b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S index edd89b112..715eb4d4f 100644 --- a/kernel/x86/zgemm_kernel_2x2_penryn.S +++ b/kernel/x86/zgemm_kernel_2x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 340b9d375..8e28bb8e6 100644 --- 
a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 441fbb0c0..607c51de0 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index 4312ed173..fb98226ee 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index 78ca14cab..e2f391a82 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index 849d787f6..53abb697b 100644 --- a/kernel/x86/zscal_sse.S +++ 
b/kernel/x86/zscal_sse.S @@ -55,7 +55,7 @@ #define XX %edi #define FLAG %ebp -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -697,7 +697,7 @@ cmpl $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index 5b1da61e6..26ef693a0 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -57,7 +57,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -860,7 +860,7 @@ cmpl $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e5949aa6e..53e53c3ce 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index b01498f78..3c056cdff 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 
3668ee2bb..1efa1fd25 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 13064166f..849afed73 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index ebff425c0..c1833abe2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..58a883243 --- /dev/null +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nehalem.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x8_nehalem.S +DGEMMINCOPY = dgemm_ncopy_2.S +DGEMMITCOPY = dgemm_tcopy_2.S +DGEMMONCOPY = dgemm_ncopy_8.S +DGEMMOTCOPY = dgemm_tcopy_8.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMINCOPY = zgemm_ncopy_1.S +ZGEMMITCOPY = zgemm_tcopy_1.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S index 2724cfe92..e4bde49bd 100644 --- a/kernel/x86_64/dgemm_ncopy_2.S +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define 
PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S index 52115bd4d..1e4431664 100644 --- a/kernel/x86_64/dgemm_ncopy_4.S +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -45,7 +45,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S index 5d3627230..f35c3c5af 100644 --- a/kernel/x86_64/dgemm_ncopy_8.S +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S index 06e59991d..b0b3590aa 100644 --- a/kernel/x86_64/dgemm_tcopy_2.S +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -52,6 +52,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S index 8b81c41c0..85b0253d7 100644 --- a/kernel/x86_64/dgemm_tcopy_4.S +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -51,6 +51,12 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S index 
976033714..3d411cda5 100644 --- a/kernel/x86_64/dgemm_tcopy_8.S +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -46,6 +46,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S index 72c2b9d20..06a0feae9 100644 --- a/kernel/x86_64/gemm_ncopy_2.S +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S index a04542f6a..cac647fa0 100644 --- a/kernel/x86_64/gemm_ncopy_4.S +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S index 8bfaca265..190cebb29 100644 --- a/kernel/x86_64/gemm_tcopy_2.S +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S index 877969ff5..c2308162f 100644 --- a/kernel/x86_64/gemm_tcopy_4.S +++ 
b/kernel/x86_64/gemm_tcopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 901a5ad31..9db45a642 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bfe7ebd69..ca03f86b7 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 2df76f1cb..01ad2d96e 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index bbba0b427..60c1ea778 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S index eb2092dc7..393988e73 100644 --- 
a/kernel/x86_64/zscal_sse.S +++ b/kernel/x86_64/zscal_sse.S @@ -685,7 +685,7 @@ cmpq $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 23d2da73d..a553bbd39 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -55,7 +55,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -803,7 +803,7 @@ cmpq $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 39f0ff46f..fc54dc4a5 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 711907711..eae31b955 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 175912c71..4d6ad3326 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM 
+#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 3e4b17030..2623bfe6d 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/l1param.h b/l1param.h index f1d223ea7..61c61aa94 100644 --- a/l1param.h +++ b/l1param.h @@ -9,6 +9,13 @@ #define ALIGNED_ACCESS #endif +#ifdef SANDYBRIDGE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 12) +#define ALIGNED_ACCESS +#endif + #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHW prefetchw diff --git a/l2param.h b/l2param.h index af9d17179..a371b2ded 100644 --- a/l2param.h +++ b/l2param.h @@ -63,6 +63,17 @@ #define PREFETCHSIZE 64 * 3 #endif +#ifdef SANDYBRIDGE +#define MOVUPS_A movups +#define MOVUPS_XL movups +#define MOVUPS_XS movups +#define MOVUPS_YL movups +#define MOVUPS_YS movups +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw diff --git a/param.h b/param.h index 72d721d4e..53159a4fd 100644 --- a/param.h +++ b/param.h @@ -913,6 +913,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef SANDYBRIDGE + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 32 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 504 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P 504 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 252 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P 252 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + + + #ifdef ATOM #define SNUMOPT 2 From fad089ffff6cc8dd3d2f70397f6e4054372fe6f0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 5 Apr 2012 16:21:40 +0800 Subject: [PATCH 003/162] Fixed #84 the MD5 command line bug on Mac OSX. 
--- Makefile | 13 +++++++++++-- Makefile.system | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ba04aa989..afa8d2efb 100644 --- a/Makefile +++ b/Makefile @@ -232,18 +232,27 @@ endif lapack-3.4.0 : lapack-3.4.0.tgz ifndef NOFORTRAN +ifndef NO_LAPACK @if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ echo $(TAR) zxf $< ;\ $(TAR) zxf $< && (cd lapack-3.4.0; $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ else \ - echo " lapack-3.4.0.tgz check sum is wrong (Please use orignal)." ;\ rm -rf lapack-3.4.0 ;\ + echo " Cannot download lapack-3.4.0.tgz or the MD5 check sum is wrong (Please use orignal)."; \ + exit 1; \ fi endif +endif + +LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.0.tgz lapack-3.4.0.tgz : ifndef NOFORTRAN - -wget http://www.netlib.org/lapack/lapack-3.4.0.tgz +ifeq ($(OSNAME), Darwin) + curl -O $(LAPACK_URL) +else + wget $(LAPACK_URL) +endif endif large.tgz : diff --git a/Makefile.system b/Makefile.system index 0fd223d60..bbd31e5ca 100644 --- a/Makefile.system +++ b/Makefile.system @@ -101,6 +101,7 @@ DLLWRAP = $(CROSS_SUFFIX)dllwrap ifeq ($(OSNAME), Darwin) export MACOSX_DEPLOYMENT_TARGET=10.2 +MD5SUM = md5 -r endif ifeq ($(OSNAME), Linux) @@ -651,7 +652,10 @@ PATCH = patch GREP = grep endif +ifndef MD5SUM MD5SUM = md5sum +endif + AWK = awk REVISION = -r$(VERSION) From 03b0eb19f719320773449b00183bbefb56fa21ad Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 5 Apr 2012 18:16:18 +0800 Subject: [PATCH 004/162] Refs #86. Test alpha=Nan in x86/x86_64 dscale. 
--- kernel/x86/scal_sse2.S | 3 ++- kernel/x86_64/scal_sse2.S | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S index dab543470..67c1f437b 100644 --- a/kernel/x86/scal_sse2.S +++ b/kernel/x86/scal_sse2.S @@ -76,7 +76,8 @@ xorps %xmm1, %xmm1 comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO - + jp .L100 # For Alpha = NaN + /* Alpha == ZERO */ cmpl $SIZE, INCX jne .L50 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index b0abb4533..8f5612081 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -74,7 +74,8 @@ xorps %xmm1, %xmm1 comisd %xmm0, %xmm1 jne .L100 # Alpha != ZERO - + jp .L100 # For Alpha = NaN + /* Alpha == ZERO */ cmpq $SIZE, INCX jne .L50 From 435420d6d53d908e51daed5a7467dc694ea5ed06 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sat, 7 Apr 2012 10:39:09 +0200 Subject: [PATCH 005/162] Fixed #87. Export missing and new LAPACK 3.4.0 functions in shared library. --- exports/gensymbol | 96 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 30 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 6b2a00672..edf4632d8 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -93,7 +93,7 @@ sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, - sgeqp3, sgeqpf, sgeqr2, sgeqrf, sgerfs, sgerq2, sgerqf, + sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, @@ -108,8 +108,8 @@ slapll, slapmt, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, - slaqtr, slar1v, slar2v, - slarf, slarfb, slarfg, slarft, slarfx, slargv, + slaqtr, slar1v, slar2v, ilaslr, ilaslc, + slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, slarrv, slartv, slarz, slarzb, 
slarzt, slasy2, slasyf, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, @@ -120,7 +120,8 @@ sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, spbstf, spbsv, spbsvx, spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, - sposvx, spotrs, sppcon, sppequ, + sposvx, spotrs, spstrf, spstf2, + sppcon, sppequ, spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, @@ -129,19 +130,27 @@ ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, - ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytrs, stbcon, + ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytri2, ssytri2x, + ssyswapr, ssytrs, ssytrs2, ssyconv, stbcon, stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, stptrs, strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, strtrs, stzrqf, stzrzf, sstemr, + slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp, + stfttr, stpttf, stpttr, strttf, strttp, + sgejsv, sgesvj, sgsvj0, sgsvj1, + sgeequb, ssyequb, spoequb, sgbequb, + sbbcsd, slapmr, sorbdb, sorcsd, + sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, + stpqrt, stpqrt2, stpmqrt, stprfb, cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, - cgeqpf, cgeqr2, cgeqrf, cgerfs, cgerq2, cgerqf, + cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, @@ -152,7 +161,8 @@ checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, chetf2, chetrd, - chetrf, chetri, chetrs, chgeqz, chpcon, 
chpev, chpevd, + chetrf, chetri, chetri2, chetri2x, cheswapr, + chetrs, chetrs2, chgeqz, chpcon, chpev, chpevd, chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, chpsvx, chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, @@ -166,20 +176,22 @@ clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, claqhb, claqhe, claqhp, claqp2, claqps, claqsb, claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, - claqsp, claqsy, clar1v, clar2v, clarf, clarfb, clarfg, clarft, + claqsp, claqsy, clar1v, clar2v, ilaclr, ilaclc, + clarf, clarfb, clarfg, clarfgp, clarft, clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, - cposv, cposvx, cpotrs, cppcon, - cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, + cposv, cposvx, cpotrs, cpstrf, cpstf2, + cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, crot, cspcon, cspmv, cspr, csprfs, cspsv, cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, - cstegr, cstein, csteqr, csycon, + cstegr, cstein, csteqr, csycon, csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, - csytrs, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + csytri2, csytri2x, csyswapr, + csytrs, csytrs2, csyconv, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, @@ -188,13 +200,19 @@ cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, cunmtr, cupgtr, cupmtr, icmax1, scsum1, cstemr, - - dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, + chfrk, ctfttp, clanhf, cpftrf, cpftri, cpftrs, ctfsm, ctftri, + ctfttr, ctpttf, ctpttr, ctrttf, ctrttp, + cgeequb, cgbequb, csyequb, cpoequb, cheequb, + cbbcsd, clapmr, cunbdb, cuncsd, + cgeqrt, cgeqrt2, 
cgeqrt3, cgemqrt, + ctpqrt, ctpqrt2, ctpmqrt, ctprfb, + + dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, - dgeqp3, dgeqpf, dgeqr2, dgeqrf, dgerfs, dgerq2, dgerqf, + dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, @@ -209,8 +227,8 @@ dlapll, dlapmt, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, - dlaqtr, dlar1v, dlar2v, - dlarf, dlarfb, dlarfg, dlarft, dlarfx, dlargv, + dlaqtr, dlar1v, dlar2v, iladlr, iladlc, + dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, dlarrv, dlartv, dlarz, dlarzb, dlarzt, dlasy2, dlasyf, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, @@ -221,7 +239,8 @@ dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, dpbstf, dpbsv, dpbsvx, dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, - dposvx, dpotrs, dppcon, dppequ, + dposvx, dpotrs, dpstrf, dpstf2, + dppcon, dppequ, dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, @@ -231,20 +250,28 @@ dstevx, dsycon, dsyev, dsyevd, dsyevr, dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, dsysv, dsysvx, - dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dtbcon, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dsytrs2, + dsytri2, dsytri2x, dsyswapr, dsyconv, dtbcon, dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, dtrtrs, dtzrqf, dtzrzf, dstemr, - dsgesv, dlag2s, slag2d, - + dsgesv, dsposv, dlag2s, slag2d, dlat2s, + dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp, + 
dtfttr, dtpttf, dtpttr, dtrttf, dtrttp, + dgejsv, dgesvj, dgsvj0, dgsvj1, + dgeequb, dsyequb, dpoequb, dgbequb, + dbbcsd, dlapmr, dorbdb, dorcsd, + dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, + dtpqrt, dtpqrt2, dtpmqrt, dtprfb, + zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf, zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3, - zgeqpf, zgeqr2, zgeqrf, zgerfs, zgerq2, zgerqf, + zgeqpf, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, zgetri, zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, @@ -255,7 +282,8 @@ zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, zhetf2, zhetrd, - zhetrf, zhetri, zhetrs, zhgeqz, zhpcon, zhpev, zhpevd, + zhetrf, zhetri, zhetri2, zhetri2x, zheswapr, + zhetrs, zhetrs2, zhgeqz, zhpcon, zhpev, zhpevd, zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, zhpsvx, zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, @@ -270,22 +298,24 @@ zlantp, zlantr, zlapll, zlapmt, zlaqgb, zlaqge, zlaqhb, zlaqhe, zlaqhp, zlaqp2, zlaqps, zlaqsb, zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, - zlaqsp, zlaqsy, zlar1v, zlar2v, zlarcm, zlarf, zlarfb, - zlarfg, zlarft, + zlaqsp, zlaqsy, zlar1v, zlar2v, ilazlr, ilazlc, + zlarcm, zlarf, zlarfb, + zlarfg, zlarfgp, zlarft, zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, - zposv, zposvx, zpotrs, zppcon, - zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, + zposv, zposvx, zpotrs, zpstrf, zpstf2, + zppcon, zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, zrot, 
zspcon, zspmv, zspr, zsprfs, zspsv, zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, - zstegr, zstein, zsteqr, zsycon, + zstegr, zstein, zsteqr, zsycon, zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, - zsytrs, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + zsytri2, zsytri2x, zsyswapr, + zsytrs, zsytrs2, zsyconv, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, @@ -295,7 +325,13 @@ zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, zunmtr, zupgtr, zupmtr, izmax1, dzsum1, zstemr, - zcgesv, zlag2c, clag2z, + zcgesv, zcposv, zlag2c, clag2z, zlat2c, + zhfrk, ztfttp, zlanhf, zpftrf, zpftri, zpftrs, ztfsm, ztftri, + ztfttr, ztpttf, ztpttr, ztrttf, ztrttp, + zgeequb, zgbequb, zsyequb, zpoequb, zheequb, + zbbcsd, zlapmr, zunbdb, zuncsd, + zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, + ztpqrt, ztpqrt2, ztpmqrt, ztprfb, ); if ($ARGV[5] == 1) { From 78914475aeecc2ad51da1da9b9ab0958c9179cfd Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sat, 7 Apr 2012 10:40:46 +0200 Subject: [PATCH 006/162] Fixed #88. Build LAPACKE: C Interface to LAPACK. --- Makefile | 93 +- Makefile.install | 7 + Makefile.rule | 3 + Makefile.system | 8 + exports/Makefile | 16 +- exports/gensymbol | 2152 +++++++++++++++++++++++++++++++++++++++- patch.for_lapack-3.4.0 | 155 +++ 7 files changed, 2371 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index afa8d2efb..6f0a255ab 100644 --- a/Makefile +++ b/Makefile @@ -26,10 +26,10 @@ endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench -.PHONY : all libs netlib test ctest shared install +.PHONY : all libs netlib lapacke test ctest shared install .NOTPARALLEL : all libs prof lapack-test install -all :: libs netlib tests shared +all :: libs netlib lapacke tests shared @echo @echo " OpenBLAS build complete." 
@echo @@ -203,31 +203,54 @@ ifeq ($(NO_LAPACK), 1) netlib : else -netlib : lapack-3.4.0 patch.for_lapack-3.4.0 lapack-3.4.0/make.inc +netlib : lapack-3.4.0 patch.for_lapack-3.4.0 $(NETLIB_LAPACK_DIR)/make.inc ifndef NOFORTRAN - -@$(MAKE) -C lapack-3.4.0 lapacklib + -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib endif endif -prof_lapack : lapack-3.4.0 lapack-3.4.0/make.inc - -@$(MAKE) -C lapack-3.4.0 lapack_prof +ifeq ($(NO_LAPACKE), 1) +lapacke : -lapack-3.4.0/make.inc : +else +lapacke : lapack-3.4.0 $(NETLIB_LAPACK_DIR)/lapacke/make.inc +ifndef NOFORTRAN + -@$(MAKE) -C $(NETLIB_LAPACK_DIR)/lapacke +endif +endif + +prof_lapack : lapack-3.4.0 $(NETLIB_LAPACK_DIR)/make.inc + -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof + +$(NETLIB_LAPACK_DIR)/make.inc : +ifndef NOFORTRAN + -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc + -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc +endif + +$(NETLIB_LAPACK_DIR)/lapacke/make.inc : ifndef NOFORTRAN - -@echo "FORTRAN = $(FC)" > lapack-3.4.0/make.inc - -@echo "OPTS = $(FFLAGS)" >> lapack-3.4.0/make.inc - -@echo "POPTS = $(FPFLAGS)" >> lapack-3.4.0/make.inc - -@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.4.0/make.inc - -@echo "PNOOPT = $(FPFLAGS) 
-O0" >> lapack-3.4.0/make.inc - -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.4.0/make.inc - -@echo "ARCH = $(AR)" >> lapack-3.4.0/make.inc - -@echo "RANLIB = $(RANLIB)" >> lapack-3.4.0/make.inc - -@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.4.0/make.inc - -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.4.0/make.inc - -@echo "SUFFIX = $(SUFFIX)" >> lapack-3.4.0/make.inc - -@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.4.0/make.inc -# -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.4.0/make.inc - -@cat make.inc >> lapack-3.4.0/make.inc + -@echo "CC = $(CC)" > $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "LINKER = $(FC)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "LAPACKE = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "LIBS = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc endif lapack-3.4.0 : lapack-3.4.0.tgz @@ -235,9 +258,9 @@ ifndef NOFORTRAN ifndef NO_LAPACK @if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ echo $(TAR) zxf $< ;\ - $(TAR) zxf $< && (cd lapack-3.4.0; $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ + $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ else \ - rm -rf lapack-3.4.0 ;\ + rm -rf $(NETLIB_LAPACK_DIR) ;\ echo " Cannot download lapack-3.4.0.tgz or the MD5 check sum is wrong (Please use orignal)."; \ exit 1; \ fi @@ -267,19 +290,19 @@ endif lapack-timing : lapack-3.4.0 large.tgz timing.tgz ifndef NOFORTRAN - (cd lapack-3.4.0; $(TAR) zxf ../timing.tgz TIMING) - (cd lapack-3.4.0/TIMING; $(TAR) zxf ../../large.tgz ) - make -C lapack-3.4.0 tmglib - make -C 
lapack-3.4.0/TIMING + (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) + (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) + make -C $(NETLIB_LAPACK_DIR) tmglib + make -C $(NETLIB_LAPACK_DIR)/TIMING endif lapack-test : - $(MAKE) -C lapack-3.4.0 tmglib - $(MAKE) -C lapack-3.4.0/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc - @rm -f lapack-3.4.0/TESTING/*.out - $(MAKE) -j 1 -C lapack-3.4.0/TESTING - $(GREP) failed lapack-3.4.0/TESTING/*.out + $(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib + $(MAKE) -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc + @rm -f $(NETLIB_LAPACK_DIR)/TESTING/*.out + $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING + $(GREP) failed $(NETLIB_LAPACK_DIR)/TESTING/*.out dummy : @@ -298,9 +321,9 @@ clean :: @$(MAKE) -C reference clean @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib - @if test -d lapack-3.4.0; then \ - echo deleting lapack-3.4.0; \ - rm -rf lapack-3.4.0 ;\ + @if test -d $(NETLIB_LAPACK_DIR); then \ + echo deleting $(NETLIB_LAPACK_DIR); \ + rm -rf $(NETLIB_LAPACK_DIR) ;\ fi @rm -f *.grd Makefile.conf_last config_last.h @echo Done. 
\ No newline at end of file diff --git a/Makefile.install b/Makefile.install index 46105fc39..2ba10d0dc 100644 --- a/Makefile.install +++ b/Makefile.install @@ -35,6 +35,13 @@ install : lib.grd @echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR) @sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h +ifndef NO_LAPACKE + @echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR) + @cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h + @cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h + @cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h +endif + #for install static library @echo Copy the static library to $(OPENBLAS_LIBRARY_DIR) @cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR) diff --git a/Makefile.rule b/Makefile.rule index 650478a07..843888b4c 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -51,6 +51,9 @@ VERSION = 0.1.0 # If you don't need LAPACK, please comment it in. # NO_LAPACK = 1 +# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. +# NO_LAPACKE = 1 + # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 diff --git a/Makefile.system b/Makefile.system index bbd31e5ca..bc5b20d86 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,10 @@ ifndef TOPDIR TOPDIR = . 
endif +ifndef NETLIB_LAPACK_DIR +NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.0 +endif + # Default C compiler CC = gcc @@ -536,6 +540,10 @@ ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK endif +ifeq ($(NO_LAPACKE), 1) +CCOMMON_OPT += -DNO_LAPACKE +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/exports/Makefile b/exports/Makefile index 873e8b270..c4d2abd63 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -76,13 +76,13 @@ libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) libopenblas.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) libgoto2_shared.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) $(LIBDYNNAME) : ../$(LIBNAME) osx.def $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) @@ -163,23 +163,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) linux.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) aix.def : gensymbol 
../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F) symbol.S : gensymbol - perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > symbol.S + perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S test : linktest.c $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* diff --git a/exports/gensymbol b/exports/gensymbol index edf4632d8..626827e4e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -334,24 +334,2136 @@ ztpqrt, ztpqrt2, ztpmqrt, ztprfb, ); +@lapackeobjs = ( + lapack_make_complex_double, + lapack_make_complex_float, + LAPACKE_c_nancheck, + LAPACKE_cbbcsd, + LAPACKE_cbbcsd_work, + LAPACKE_cbdsqr, + LAPACKE_cbdsqr_work, + LAPACKE_cgb_nancheck, + LAPACKE_cgb_trans, + LAPACKE_cgbbrd, + LAPACKE_cgbbrd_work, + LAPACKE_cgbcon, + LAPACKE_cgbcon_work, + LAPACKE_cgbequ, + LAPACKE_cgbequ_work, + LAPACKE_cgbequb, + LAPACKE_cgbequb_work, + LAPACKE_cgbrfs, + LAPACKE_cgbrfs_work, + LAPACKE_cgbsv, + LAPACKE_cgbsv_work, + LAPACKE_cgbsvx, + LAPACKE_cgbsvx_work, + LAPACKE_cgbtrf, + LAPACKE_cgbtrf_work, + LAPACKE_cgbtrs, + LAPACKE_cgbtrs_work, + LAPACKE_cge_nancheck, + LAPACKE_cge_trans, + LAPACKE_cgebak, + LAPACKE_cgebak_work, + LAPACKE_cgebal, + LAPACKE_cgebal_work, + LAPACKE_cgebrd, + LAPACKE_cgebrd_work, + LAPACKE_cgecon, + LAPACKE_cgecon_work, + LAPACKE_cgeequ, + LAPACKE_cgeequ_work, + LAPACKE_cgeequb, + LAPACKE_cgeequb_work, + LAPACKE_cgees, + LAPACKE_cgees_work, + LAPACKE_cgeesx, + 
LAPACKE_cgeesx_work, + LAPACKE_cgeev, + LAPACKE_cgeev_work, + LAPACKE_cgeevx, + LAPACKE_cgeevx_work, + LAPACKE_cgehrd, + LAPACKE_cgehrd_work, + LAPACKE_cgelq2, + LAPACKE_cgelq2_work, + LAPACKE_cgelqf, + LAPACKE_cgelqf_work, + LAPACKE_cgels, + LAPACKE_cgels_work, + LAPACKE_cgelsd, + LAPACKE_cgelsd_work, + LAPACKE_cgelss, + LAPACKE_cgelss_work, + LAPACKE_cgelsy, + LAPACKE_cgelsy_work, + LAPACKE_cgemqrt, + LAPACKE_cgemqrt_work, + LAPACKE_cgeqlf, + LAPACKE_cgeqlf_work, + LAPACKE_cgeqp3, + LAPACKE_cgeqp3_work, + LAPACKE_cgeqpf, + LAPACKE_cgeqpf_work, + LAPACKE_cgeqr2, + LAPACKE_cgeqr2_work, + LAPACKE_cgeqrf, + LAPACKE_cgeqrf_work, + LAPACKE_cgeqrfp, + LAPACKE_cgeqrfp_work, + LAPACKE_cgeqrt, + LAPACKE_cgeqrt2, + LAPACKE_cgeqrt2_work, + LAPACKE_cgeqrt3, + LAPACKE_cgeqrt3_work, + LAPACKE_cgeqrt_work, + LAPACKE_cgerfs, + LAPACKE_cgerfs_work, + LAPACKE_cgerqf, + LAPACKE_cgerqf_work, + LAPACKE_cgesdd, + LAPACKE_cgesdd_work, + LAPACKE_cgesv, + LAPACKE_cgesv_work, + LAPACKE_cgesvd, + LAPACKE_cgesvd_work, + LAPACKE_cgesvx, + LAPACKE_cgesvx_work, + LAPACKE_cgetf2, + LAPACKE_cgetf2_work, + LAPACKE_cgetrf, + LAPACKE_cgetrf_work, + LAPACKE_cgetri, + LAPACKE_cgetri_work, + LAPACKE_cgetrs, + LAPACKE_cgetrs_work, + LAPACKE_cgg_nancheck, + LAPACKE_cgg_trans, + LAPACKE_cggbak, + LAPACKE_cggbak_work, + LAPACKE_cggbal, + LAPACKE_cggbal_work, + LAPACKE_cgges, + LAPACKE_cgges_work, + LAPACKE_cggesx, + LAPACKE_cggesx_work, + LAPACKE_cggev, + LAPACKE_cggev_work, + LAPACKE_cggevx, + LAPACKE_cggevx_work, + LAPACKE_cggglm, + LAPACKE_cggglm_work, + LAPACKE_cgghrd, + LAPACKE_cgghrd_work, + LAPACKE_cgglse, + LAPACKE_cgglse_work, + LAPACKE_cggqrf, + LAPACKE_cggqrf_work, + LAPACKE_cggrqf, + LAPACKE_cggrqf_work, + LAPACKE_cggsvd, + LAPACKE_cggsvd_work, + LAPACKE_cggsvp, + LAPACKE_cggsvp_work, + LAPACKE_cgt_nancheck, + LAPACKE_cgtcon, + LAPACKE_cgtcon_work, + LAPACKE_cgtrfs, + LAPACKE_cgtrfs_work, + LAPACKE_cgtsv, + LAPACKE_cgtsv_work, + LAPACKE_cgtsvx, + LAPACKE_cgtsvx_work, + LAPACKE_cgttrf, + 
LAPACKE_cgttrf_work, + LAPACKE_cgttrs, + LAPACKE_cgttrs_work, + LAPACKE_chb_nancheck, + LAPACKE_chb_trans, + LAPACKE_chbev, + LAPACKE_chbev_work, + LAPACKE_chbevd, + LAPACKE_chbevd_work, + LAPACKE_chbevx, + LAPACKE_chbevx_work, + LAPACKE_chbgst, + LAPACKE_chbgst_work, + LAPACKE_chbgv, + LAPACKE_chbgv_work, + LAPACKE_chbgvd, + LAPACKE_chbgvd_work, + LAPACKE_chbgvx, + LAPACKE_chbgvx_work, + LAPACKE_chbtrd, + LAPACKE_chbtrd_work, + LAPACKE_che_nancheck, + LAPACKE_che_trans, + LAPACKE_checon, + LAPACKE_checon_work, + LAPACKE_cheequb, + LAPACKE_cheequb_work, + LAPACKE_cheev, + LAPACKE_cheev_work, + LAPACKE_cheevd, + LAPACKE_cheevd_work, + LAPACKE_cheevr, + LAPACKE_cheevr_work, + LAPACKE_cheevx, + LAPACKE_cheevx_work, + LAPACKE_chegst, + LAPACKE_chegst_work, + LAPACKE_chegv, + LAPACKE_chegv_work, + LAPACKE_chegvd, + LAPACKE_chegvd_work, + LAPACKE_chegvx, + LAPACKE_chegvx_work, + LAPACKE_cherfs, + LAPACKE_cherfs_work, + LAPACKE_chesv, + LAPACKE_chesv_work, + LAPACKE_chesvx, + LAPACKE_chesvx_work, + LAPACKE_cheswapr, + LAPACKE_cheswapr_work, + LAPACKE_chetrd, + LAPACKE_chetrd_work, + LAPACKE_chetrf, + LAPACKE_chetrf_work, + LAPACKE_chetri, + LAPACKE_chetri2, + LAPACKE_chetri2_work, + LAPACKE_chetri2x, + LAPACKE_chetri2x_work, + LAPACKE_chetri_work, + LAPACKE_chetrs, + LAPACKE_chetrs2, + LAPACKE_chetrs2_work, + LAPACKE_chetrs_work, + LAPACKE_chfrk, + LAPACKE_chfrk_work, + LAPACKE_chgeqz, + LAPACKE_chgeqz_work, + LAPACKE_chp_nancheck, + LAPACKE_chp_trans, + LAPACKE_chpcon, + LAPACKE_chpcon_work, + LAPACKE_chpev, + LAPACKE_chpev_work, + LAPACKE_chpevd, + LAPACKE_chpevd_work, + LAPACKE_chpevx, + LAPACKE_chpevx_work, + LAPACKE_chpgst, + LAPACKE_chpgst_work, + LAPACKE_chpgv, + LAPACKE_chpgv_work, + LAPACKE_chpgvd, + LAPACKE_chpgvd_work, + LAPACKE_chpgvx, + LAPACKE_chpgvx_work, + LAPACKE_chprfs, + LAPACKE_chprfs_work, + LAPACKE_chpsv, + LAPACKE_chpsv_work, + LAPACKE_chpsvx, + LAPACKE_chpsvx_work, + LAPACKE_chptrd, + LAPACKE_chptrd_work, + LAPACKE_chptrf, + LAPACKE_chptrf_work, + 
LAPACKE_chptri, + LAPACKE_chptri_work, + LAPACKE_chptrs, + LAPACKE_chptrs_work, + LAPACKE_chs_nancheck, + LAPACKE_chs_trans, + LAPACKE_chsein, + LAPACKE_chsein_work, + LAPACKE_chseqr, + LAPACKE_chseqr_work, + LAPACKE_clacgv, + LAPACKE_clacgv_work, + LAPACKE_clacpy, + LAPACKE_clacpy_work, + LAPACKE_clag2z, + LAPACKE_clag2z_work, + LAPACKE_clange, + LAPACKE_clange_work, + LAPACKE_clanhe, + LAPACKE_clanhe_work, + LAPACKE_clansy, + LAPACKE_clansy_work, + LAPACKE_clantr, + LAPACKE_clantr_work, + LAPACKE_clapmr, + LAPACKE_clapmr_work, + LAPACKE_clarfb, + LAPACKE_clarfb_work, + LAPACKE_clarfg, + LAPACKE_clarfg_work, + LAPACKE_clarft, + LAPACKE_clarft_work, + LAPACKE_clarfx, + LAPACKE_clarfx_work, + LAPACKE_clarnv, + LAPACKE_clarnv_work, + LAPACKE_claset, + LAPACKE_claset_work, + LAPACKE_claswp, + LAPACKE_claswp_work, + LAPACKE_clauum, + LAPACKE_clauum_work, + LAPACKE_cpb_nancheck, + LAPACKE_cpb_trans, + LAPACKE_cpbcon, + LAPACKE_cpbcon_work, + LAPACKE_cpbequ, + LAPACKE_cpbequ_work, + LAPACKE_cpbrfs, + LAPACKE_cpbrfs_work, + LAPACKE_cpbstf, + LAPACKE_cpbstf_work, + LAPACKE_cpbsv, + LAPACKE_cpbsv_work, + LAPACKE_cpbsvx, + LAPACKE_cpbsvx_work, + LAPACKE_cpbtrf, + LAPACKE_cpbtrf_work, + LAPACKE_cpbtrs, + LAPACKE_cpbtrs_work, + LAPACKE_cpf_nancheck, + LAPACKE_cpf_trans, + LAPACKE_cpftrf, + LAPACKE_cpftrf_work, + LAPACKE_cpftri, + LAPACKE_cpftri_work, + LAPACKE_cpftrs, + LAPACKE_cpftrs_work, + LAPACKE_cpo_nancheck, + LAPACKE_cpo_trans, + LAPACKE_cpocon, + LAPACKE_cpocon_work, + LAPACKE_cpoequ, + LAPACKE_cpoequ_work, + LAPACKE_cpoequb, + LAPACKE_cpoequb_work, + LAPACKE_cporfs, + LAPACKE_cporfs_work, + LAPACKE_cposv, + LAPACKE_cposv_work, + LAPACKE_cposvx, + LAPACKE_cposvx_work, + LAPACKE_cpotrf, + LAPACKE_cpotrf_work, + LAPACKE_cpotri, + LAPACKE_cpotri_work, + LAPACKE_cpotrs, + LAPACKE_cpotrs_work, + LAPACKE_cpp_nancheck, + LAPACKE_cpp_trans, + LAPACKE_cppcon, + LAPACKE_cppcon_work, + LAPACKE_cppequ, + LAPACKE_cppequ_work, + LAPACKE_cpprfs, + LAPACKE_cpprfs_work, + 
LAPACKE_cppsv, + LAPACKE_cppsv_work, + LAPACKE_cppsvx, + LAPACKE_cppsvx_work, + LAPACKE_cpptrf, + LAPACKE_cpptrf_work, + LAPACKE_cpptri, + LAPACKE_cpptri_work, + LAPACKE_cpptrs, + LAPACKE_cpptrs_work, + LAPACKE_cpstrf, + LAPACKE_cpstrf_work, + LAPACKE_cpt_nancheck, + LAPACKE_cptcon, + LAPACKE_cptcon_work, + LAPACKE_cpteqr, + LAPACKE_cpteqr_work, + LAPACKE_cptrfs, + LAPACKE_cptrfs_work, + LAPACKE_cptsv, + LAPACKE_cptsv_work, + LAPACKE_cptsvx, + LAPACKE_cptsvx_work, + LAPACKE_cpttrf, + LAPACKE_cpttrf_work, + LAPACKE_cpttrs, + LAPACKE_cpttrs_work, + LAPACKE_csp_nancheck, + LAPACKE_csp_trans, + LAPACKE_cspcon, + LAPACKE_cspcon_work, + LAPACKE_csprfs, + LAPACKE_csprfs_work, + LAPACKE_cspsv, + LAPACKE_cspsv_work, + LAPACKE_cspsvx, + LAPACKE_cspsvx_work, + LAPACKE_csptrf, + LAPACKE_csptrf_work, + LAPACKE_csptri, + LAPACKE_csptri_work, + LAPACKE_csptrs, + LAPACKE_csptrs_work, + LAPACKE_cst_nancheck, + LAPACKE_cstedc, + LAPACKE_cstedc_work, + LAPACKE_cstegr, + LAPACKE_cstegr_work, + LAPACKE_cstein, + LAPACKE_cstein_work, + LAPACKE_cstemr, + LAPACKE_cstemr_work, + LAPACKE_csteqr, + LAPACKE_csteqr_work, + LAPACKE_csy_nancheck, + LAPACKE_csy_trans, + LAPACKE_csycon, + LAPACKE_csycon_work, + LAPACKE_csyconv, + LAPACKE_csyconv_work, + LAPACKE_csyequb, + LAPACKE_csyequb_work, + LAPACKE_csyrfs, + LAPACKE_csyrfs_work, + LAPACKE_csysv, + LAPACKE_csysv_work, + LAPACKE_csysvx, + LAPACKE_csysvx_work, + LAPACKE_csyswapr, + LAPACKE_csyswapr_work, + LAPACKE_csytrf, + LAPACKE_csytrf_work, + LAPACKE_csytri, + LAPACKE_csytri2, + LAPACKE_csytri2_work, + LAPACKE_csytri2x, + LAPACKE_csytri2x_work, + LAPACKE_csytri_work, + LAPACKE_csytrs, + LAPACKE_csytrs2, + LAPACKE_csytrs2_work, + LAPACKE_csytrs_work, + LAPACKE_ctb_nancheck, + LAPACKE_ctb_trans, + LAPACKE_ctbcon, + LAPACKE_ctbcon_work, + LAPACKE_ctbrfs, + LAPACKE_ctbrfs_work, + LAPACKE_ctbtrs, + LAPACKE_ctbtrs_work, + LAPACKE_ctf_nancheck, + LAPACKE_ctf_trans, + LAPACKE_ctfsm, + LAPACKE_ctfsm_work, + LAPACKE_ctftri, + LAPACKE_ctftri_work, + 
LAPACKE_ctfttp, + LAPACKE_ctfttp_work, + LAPACKE_ctfttr, + LAPACKE_ctfttr_work, + LAPACKE_ctgevc, + LAPACKE_ctgevc_work, + LAPACKE_ctgexc, + LAPACKE_ctgexc_work, + LAPACKE_ctgsen, + LAPACKE_ctgsen_work, + LAPACKE_ctgsja, + LAPACKE_ctgsja_work, + LAPACKE_ctgsna, + LAPACKE_ctgsna_work, + LAPACKE_ctgsyl, + LAPACKE_ctgsyl_work, + LAPACKE_ctp_nancheck, + LAPACKE_ctp_trans, + LAPACKE_ctpcon, + LAPACKE_ctpcon_work, + LAPACKE_ctpmqrt, + LAPACKE_ctpmqrt_work, + LAPACKE_ctpqrt, + LAPACKE_ctpqrt2, + LAPACKE_ctpqrt2_work, + LAPACKE_ctpqrt_work, + LAPACKE_ctprfb, + LAPACKE_ctprfb_work, + LAPACKE_ctprfs, + LAPACKE_ctprfs_work, + LAPACKE_ctptri, + LAPACKE_ctptri_work, + LAPACKE_ctptrs, + LAPACKE_ctptrs_work, + LAPACKE_ctpttf, + LAPACKE_ctpttf_work, + LAPACKE_ctpttr, + LAPACKE_ctpttr_work, + LAPACKE_ctr_nancheck, + LAPACKE_ctr_trans, + LAPACKE_ctrcon, + LAPACKE_ctrcon_work, + LAPACKE_ctrevc, + LAPACKE_ctrevc_work, + LAPACKE_ctrexc, + LAPACKE_ctrexc_work, + LAPACKE_ctrrfs, + LAPACKE_ctrrfs_work, + LAPACKE_ctrsen, + LAPACKE_ctrsen_work, + LAPACKE_ctrsna, + LAPACKE_ctrsna_work, + LAPACKE_ctrsyl, + LAPACKE_ctrsyl_work, + LAPACKE_ctrtri, + LAPACKE_ctrtri_work, + LAPACKE_ctrtrs, + LAPACKE_ctrtrs_work, + LAPACKE_ctrttf, + LAPACKE_ctrttf_work, + LAPACKE_ctrttp, + LAPACKE_ctrttp_work, + LAPACKE_ctzrzf, + LAPACKE_ctzrzf_work, + LAPACKE_cunbdb, + LAPACKE_cunbdb_work, + LAPACKE_cuncsd, + LAPACKE_cuncsd_work, + LAPACKE_cungbr, + LAPACKE_cungbr_work, + LAPACKE_cunghr, + LAPACKE_cunghr_work, + LAPACKE_cunglq, + LAPACKE_cunglq_work, + LAPACKE_cungql, + LAPACKE_cungql_work, + LAPACKE_cungqr, + LAPACKE_cungqr_work, + LAPACKE_cungrq, + LAPACKE_cungrq_work, + LAPACKE_cungtr, + LAPACKE_cungtr_work, + LAPACKE_cunmbr, + LAPACKE_cunmbr_work, + LAPACKE_cunmhr, + LAPACKE_cunmhr_work, + LAPACKE_cunmlq, + LAPACKE_cunmlq_work, + LAPACKE_cunmql, + LAPACKE_cunmql_work, + LAPACKE_cunmqr, + LAPACKE_cunmqr_work, + LAPACKE_cunmrq, + LAPACKE_cunmrq_work, + LAPACKE_cunmrz, + LAPACKE_cunmrz_work, + LAPACKE_cunmtr, + 
LAPACKE_cunmtr_work, + LAPACKE_cupgtr, + LAPACKE_cupgtr_work, + LAPACKE_cupmtr, + LAPACKE_cupmtr_work, + LAPACKE_d_nancheck, + LAPACKE_dbbcsd, + LAPACKE_dbbcsd_work, + LAPACKE_dbdsdc, + LAPACKE_dbdsdc_work, + LAPACKE_dbdsqr, + LAPACKE_dbdsqr_work, + LAPACKE_ddisna, + LAPACKE_ddisna_work, + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dgbbrd, + LAPACKE_dgbbrd_work, + LAPACKE_dgbcon, + LAPACKE_dgbcon_work, + LAPACKE_dgbequ, + LAPACKE_dgbequ_work, + LAPACKE_dgbequb, + LAPACKE_dgbequb_work, + LAPACKE_dgbrfs, + LAPACKE_dgbrfs_work, + LAPACKE_dgbsv, + LAPACKE_dgbsv_work, + LAPACKE_dgbsvx, + LAPACKE_dgbsvx_work, + LAPACKE_dgbtrf, + LAPACKE_dgbtrf_work, + LAPACKE_dgbtrs, + LAPACKE_dgbtrs_work, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgebak, + LAPACKE_dgebak_work, + LAPACKE_dgebal, + LAPACKE_dgebal_work, + LAPACKE_dgebrd, + LAPACKE_dgebrd_work, + LAPACKE_dgecon, + LAPACKE_dgecon_work, + LAPACKE_dgeequ, + LAPACKE_dgeequ_work, + LAPACKE_dgeequb, + LAPACKE_dgeequb_work, + LAPACKE_dgees, + LAPACKE_dgees_work, + LAPACKE_dgeesx, + LAPACKE_dgeesx_work, + LAPACKE_dgeev, + LAPACKE_dgeev_work, + LAPACKE_dgeevx, + LAPACKE_dgeevx_work, + LAPACKE_dgehrd, + LAPACKE_dgehrd_work, + LAPACKE_dgejsv, + LAPACKE_dgejsv_work, + LAPACKE_dgelq2, + LAPACKE_dgelq2_work, + LAPACKE_dgelqf, + LAPACKE_dgelqf_work, + LAPACKE_dgels, + LAPACKE_dgels_work, + LAPACKE_dgelsd, + LAPACKE_dgelsd_work, + LAPACKE_dgelss, + LAPACKE_dgelss_work, + LAPACKE_dgelsy, + LAPACKE_dgelsy_work, + LAPACKE_dgemqrt, + LAPACKE_dgemqrt_work, + LAPACKE_dgeqlf, + LAPACKE_dgeqlf_work, + LAPACKE_dgeqp3, + LAPACKE_dgeqp3_work, + LAPACKE_dgeqpf, + LAPACKE_dgeqpf_work, + LAPACKE_dgeqr2, + LAPACKE_dgeqr2_work, + LAPACKE_dgeqrf, + LAPACKE_dgeqrf_work, + LAPACKE_dgeqrfp, + LAPACKE_dgeqrfp_work, + LAPACKE_dgeqrt, + LAPACKE_dgeqrt2, + LAPACKE_dgeqrt2_work, + LAPACKE_dgeqrt3, + LAPACKE_dgeqrt3_work, + LAPACKE_dgeqrt_work, + LAPACKE_dgerfs, + LAPACKE_dgerfs_work, + LAPACKE_dgerqf, + LAPACKE_dgerqf_work, + LAPACKE_dgesdd, 
+ LAPACKE_dgesdd_work, + LAPACKE_dgesv, + LAPACKE_dgesv_work, + LAPACKE_dgesvd, + LAPACKE_dgesvd_work, + LAPACKE_dgesvj, + LAPACKE_dgesvj_work, + LAPACKE_dgesvx, + LAPACKE_dgesvx_work, + LAPACKE_dgetf2, + LAPACKE_dgetf2_work, + LAPACKE_dgetrf, + LAPACKE_dgetrf_work, + LAPACKE_dgetri, + LAPACKE_dgetri_work, + LAPACKE_dgetrs, + LAPACKE_dgetrs_work, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dggbak, + LAPACKE_dggbak_work, + LAPACKE_dggbal, + LAPACKE_dggbal_work, + LAPACKE_dgges, + LAPACKE_dgges_work, + LAPACKE_dggesx, + LAPACKE_dggesx_work, + LAPACKE_dggev, + LAPACKE_dggev_work, + LAPACKE_dggevx, + LAPACKE_dggevx_work, + LAPACKE_dggglm, + LAPACKE_dggglm_work, + LAPACKE_dgghrd, + LAPACKE_dgghrd_work, + LAPACKE_dgglse, + LAPACKE_dgglse_work, + LAPACKE_dggqrf, + LAPACKE_dggqrf_work, + LAPACKE_dggrqf, + LAPACKE_dggrqf_work, + LAPACKE_dggsvd, + LAPACKE_dggsvd_work, + LAPACKE_dggsvp, + LAPACKE_dggsvp_work, + LAPACKE_dgt_nancheck, + LAPACKE_dgtcon, + LAPACKE_dgtcon_work, + LAPACKE_dgtrfs, + LAPACKE_dgtrfs_work, + LAPACKE_dgtsv, + LAPACKE_dgtsv_work, + LAPACKE_dgtsvx, + LAPACKE_dgtsvx_work, + LAPACKE_dgttrf, + LAPACKE_dgttrf_work, + LAPACKE_dgttrs, + LAPACKE_dgttrs_work, + LAPACKE_dhgeqz, + LAPACKE_dhgeqz_work, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_dhsein, + LAPACKE_dhsein_work, + LAPACKE_dhseqr, + LAPACKE_dhseqr_work, + LAPACKE_dlacpy, + LAPACKE_dlacpy_work, + LAPACKE_dlag2s, + LAPACKE_dlag2s_work, + LAPACKE_dlamch, + LAPACKE_dlamch_work, + LAPACKE_dlange, + LAPACKE_dlange_work, + LAPACKE_dlansy, + LAPACKE_dlansy_work, + LAPACKE_dlantr, + LAPACKE_dlantr_work, + LAPACKE_dlapmr, + LAPACKE_dlapmr_work, + LAPACKE_dlapy2, + LAPACKE_dlapy2_work, + LAPACKE_dlapy3, + LAPACKE_dlapy3_work, + LAPACKE_dlarfb, + LAPACKE_dlarfb_work, + LAPACKE_dlarfg, + LAPACKE_dlarfg_work, + LAPACKE_dlarft, + LAPACKE_dlarft_work, + LAPACKE_dlarfx, + LAPACKE_dlarfx_work, + LAPACKE_dlarnv, + LAPACKE_dlarnv_work, + LAPACKE_dlartgp, + LAPACKE_dlartgp_work, + LAPACKE_dlartgs, + 
LAPACKE_dlartgs_work, + LAPACKE_dlaset, + LAPACKE_dlaset_work, + LAPACKE_dlasrt, + LAPACKE_dlasrt_work, + LAPACKE_dlaswp, + LAPACKE_dlaswp_work, + LAPACKE_dlauum, + LAPACKE_dlauum_work, + LAPACKE_dopgtr, + LAPACKE_dopgtr_work, + LAPACKE_dopmtr, + LAPACKE_dopmtr_work, + LAPACKE_dorbdb, + LAPACKE_dorbdb_work, + LAPACKE_dorcsd, + LAPACKE_dorcsd_work, + LAPACKE_dorgbr, + LAPACKE_dorgbr_work, + LAPACKE_dorghr, + LAPACKE_dorghr_work, + LAPACKE_dorglq, + LAPACKE_dorglq_work, + LAPACKE_dorgql, + LAPACKE_dorgql_work, + LAPACKE_dorgqr, + LAPACKE_dorgqr_work, + LAPACKE_dorgrq, + LAPACKE_dorgrq_work, + LAPACKE_dorgtr, + LAPACKE_dorgtr_work, + LAPACKE_dormbr, + LAPACKE_dormbr_work, + LAPACKE_dormhr, + LAPACKE_dormhr_work, + LAPACKE_dormlq, + LAPACKE_dormlq_work, + LAPACKE_dormql, + LAPACKE_dormql_work, + LAPACKE_dormqr, + LAPACKE_dormqr_work, + LAPACKE_dormrq, + LAPACKE_dormrq_work, + LAPACKE_dormrz, + LAPACKE_dormrz_work, + LAPACKE_dormtr, + LAPACKE_dormtr_work, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpbcon, + LAPACKE_dpbcon_work, + LAPACKE_dpbequ, + LAPACKE_dpbequ_work, + LAPACKE_dpbrfs, + LAPACKE_dpbrfs_work, + LAPACKE_dpbstf, + LAPACKE_dpbstf_work, + LAPACKE_dpbsv, + LAPACKE_dpbsv_work, + LAPACKE_dpbsvx, + LAPACKE_dpbsvx_work, + LAPACKE_dpbtrf, + LAPACKE_dpbtrf_work, + LAPACKE_dpbtrs, + LAPACKE_dpbtrs_work, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpftrf, + LAPACKE_dpftrf_work, + LAPACKE_dpftri, + LAPACKE_dpftri_work, + LAPACKE_dpftrs, + LAPACKE_dpftrs_work, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpocon, + LAPACKE_dpocon_work, + LAPACKE_dpoequ, + LAPACKE_dpoequ_work, + LAPACKE_dpoequb, + LAPACKE_dpoequb_work, + LAPACKE_dporfs, + LAPACKE_dporfs_work, + LAPACKE_dposv, + LAPACKE_dposv_work, + LAPACKE_dposvx, + LAPACKE_dposvx_work, + LAPACKE_dpotrf, + LAPACKE_dpotrf_work, + LAPACKE_dpotri, + LAPACKE_dpotri_work, + LAPACKE_dpotrs, + LAPACKE_dpotrs_work, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dppcon, + 
LAPACKE_dppcon_work, + LAPACKE_dppequ, + LAPACKE_dppequ_work, + LAPACKE_dpprfs, + LAPACKE_dpprfs_work, + LAPACKE_dppsv, + LAPACKE_dppsv_work, + LAPACKE_dppsvx, + LAPACKE_dppsvx_work, + LAPACKE_dpptrf, + LAPACKE_dpptrf_work, + LAPACKE_dpptri, + LAPACKE_dpptri_work, + LAPACKE_dpptrs, + LAPACKE_dpptrs_work, + LAPACKE_dpstrf, + LAPACKE_dpstrf_work, + LAPACKE_dpt_nancheck, + LAPACKE_dptcon, + LAPACKE_dptcon_work, + LAPACKE_dpteqr, + LAPACKE_dpteqr_work, + LAPACKE_dptrfs, + LAPACKE_dptrfs_work, + LAPACKE_dptsv, + LAPACKE_dptsv_work, + LAPACKE_dptsvx, + LAPACKE_dptsvx_work, + LAPACKE_dpttrf, + LAPACKE_dpttrf_work, + LAPACKE_dpttrs, + LAPACKE_dpttrs_work, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsbev, + LAPACKE_dsbev_work, + LAPACKE_dsbevd, + LAPACKE_dsbevd_work, + LAPACKE_dsbevx, + LAPACKE_dsbevx_work, + LAPACKE_dsbgst, + LAPACKE_dsbgst_work, + LAPACKE_dsbgv, + LAPACKE_dsbgv_work, + LAPACKE_dsbgvd, + LAPACKE_dsbgvd_work, + LAPACKE_dsbgvx, + LAPACKE_dsbgvx_work, + LAPACKE_dsbtrd, + LAPACKE_dsbtrd_work, + LAPACKE_dsfrk, + LAPACKE_dsfrk_work, + LAPACKE_dsgesv, + LAPACKE_dsgesv_work, + LAPACKE_dsp_nancheck, + LAPACKE_dsp_trans, + LAPACKE_dspcon, + LAPACKE_dspcon_work, + LAPACKE_dspev, + LAPACKE_dspev_work, + LAPACKE_dspevd, + LAPACKE_dspevd_work, + LAPACKE_dspevx, + LAPACKE_dspevx_work, + LAPACKE_dspgst, + LAPACKE_dspgst_work, + LAPACKE_dspgv, + LAPACKE_dspgv_work, + LAPACKE_dspgvd, + LAPACKE_dspgvd_work, + LAPACKE_dspgvx, + LAPACKE_dspgvx_work, + LAPACKE_dsposv, + LAPACKE_dsposv_work, + LAPACKE_dsprfs, + LAPACKE_dsprfs_work, + LAPACKE_dspsv, + LAPACKE_dspsv_work, + LAPACKE_dspsvx, + LAPACKE_dspsvx_work, + LAPACKE_dsptrd, + LAPACKE_dsptrd_work, + LAPACKE_dsptrf, + LAPACKE_dsptrf_work, + LAPACKE_dsptri, + LAPACKE_dsptri_work, + LAPACKE_dsptrs, + LAPACKE_dsptrs_work, + LAPACKE_dst_nancheck, + LAPACKE_dstebz, + LAPACKE_dstebz_work, + LAPACKE_dstedc, + LAPACKE_dstedc_work, + LAPACKE_dstegr, + LAPACKE_dstegr_work, + LAPACKE_dstein, + LAPACKE_dstein_work, + 
LAPACKE_dstemr, + LAPACKE_dstemr_work, + LAPACKE_dsteqr, + LAPACKE_dsteqr_work, + LAPACKE_dsterf, + LAPACKE_dsterf_work, + LAPACKE_dstev, + LAPACKE_dstev_work, + LAPACKE_dstevd, + LAPACKE_dstevd_work, + LAPACKE_dstevr, + LAPACKE_dstevr_work, + LAPACKE_dstevx, + LAPACKE_dstevx_work, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dsycon, + LAPACKE_dsycon_work, + LAPACKE_dsyconv, + LAPACKE_dsyconv_work, + LAPACKE_dsyequb, + LAPACKE_dsyequb_work, + LAPACKE_dsyev, + LAPACKE_dsyev_work, + LAPACKE_dsyevd, + LAPACKE_dsyevd_work, + LAPACKE_dsyevr, + LAPACKE_dsyevr_work, + LAPACKE_dsyevx, + LAPACKE_dsyevx_work, + LAPACKE_dsygst, + LAPACKE_dsygst_work, + LAPACKE_dsygv, + LAPACKE_dsygv_work, + LAPACKE_dsygvd, + LAPACKE_dsygvd_work, + LAPACKE_dsygvx, + LAPACKE_dsygvx_work, + LAPACKE_dsyrfs, + LAPACKE_dsyrfs_work, + LAPACKE_dsysv, + LAPACKE_dsysv_work, + LAPACKE_dsysvx, + LAPACKE_dsysvx_work, + LAPACKE_dsyswapr, + LAPACKE_dsyswapr_work, + LAPACKE_dsytrd, + LAPACKE_dsytrd_work, + LAPACKE_dsytrf, + LAPACKE_dsytrf_work, + LAPACKE_dsytri, + LAPACKE_dsytri2, + LAPACKE_dsytri2_work, + LAPACKE_dsytri2x, + LAPACKE_dsytri2x_work, + LAPACKE_dsytri_work, + LAPACKE_dsytrs, + LAPACKE_dsytrs2, + LAPACKE_dsytrs2_work, + LAPACKE_dsytrs_work, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtbcon, + LAPACKE_dtbcon_work, + LAPACKE_dtbrfs, + LAPACKE_dtbrfs_work, + LAPACKE_dtbtrs, + LAPACKE_dtbtrs_work, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtfsm, + LAPACKE_dtfsm_work, + LAPACKE_dtftri, + LAPACKE_dtftri_work, + LAPACKE_dtfttp, + LAPACKE_dtfttp_work, + LAPACKE_dtfttr, + LAPACKE_dtfttr_work, + LAPACKE_dtgevc, + LAPACKE_dtgevc_work, + LAPACKE_dtgexc, + LAPACKE_dtgexc_work, + LAPACKE_dtgsen, + LAPACKE_dtgsen_work, + LAPACKE_dtgsja, + LAPACKE_dtgsja_work, + LAPACKE_dtgsna, + LAPACKE_dtgsna_work, + LAPACKE_dtgsyl, + LAPACKE_dtgsyl_work, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtpcon, + LAPACKE_dtpcon_work, + LAPACKE_dtpmqrt, + LAPACKE_dtpmqrt_work, + 
LAPACKE_dtpqrt, + LAPACKE_dtpqrt2, + LAPACKE_dtpqrt2_work, + LAPACKE_dtpqrt_work, + LAPACKE_dtprfb, + LAPACKE_dtprfb_work, + LAPACKE_dtprfs, + LAPACKE_dtprfs_work, + LAPACKE_dtptri, + LAPACKE_dtptri_work, + LAPACKE_dtptrs, + LAPACKE_dtptrs_work, + LAPACKE_dtpttf, + LAPACKE_dtpttf_work, + LAPACKE_dtpttr, + LAPACKE_dtpttr_work, + LAPACKE_dtr_nancheck, + LAPACKE_dtr_trans, + LAPACKE_dtrcon, + LAPACKE_dtrcon_work, + LAPACKE_dtrevc, + LAPACKE_dtrevc_work, + LAPACKE_dtrexc, + LAPACKE_dtrexc_work, + LAPACKE_dtrrfs, + LAPACKE_dtrrfs_work, + LAPACKE_dtrsen, + LAPACKE_dtrsen_work, + LAPACKE_dtrsna, + LAPACKE_dtrsna_work, + LAPACKE_dtrsyl, + LAPACKE_dtrsyl_work, + LAPACKE_dtrtri, + LAPACKE_dtrtri_work, + LAPACKE_dtrtrs, + LAPACKE_dtrtrs_work, + LAPACKE_dtrttf, + LAPACKE_dtrttf_work, + LAPACKE_dtrttp, + LAPACKE_dtrttp_work, + LAPACKE_dtzrzf, + LAPACKE_dtzrzf_work, + LAPACKE_lsame, + LAPACKE_s_nancheck, + LAPACKE_sbbcsd, + LAPACKE_sbbcsd_work, + LAPACKE_sbdsdc, + LAPACKE_sbdsdc_work, + LAPACKE_sbdsqr, + LAPACKE_sbdsqr_work, + LAPACKE_sdisna, + LAPACKE_sdisna_work, + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sgbbrd, + LAPACKE_sgbbrd_work, + LAPACKE_sgbcon, + LAPACKE_sgbcon_work, + LAPACKE_sgbequ, + LAPACKE_sgbequ_work, + LAPACKE_sgbequb, + LAPACKE_sgbequb_work, + LAPACKE_sgbrfs, + LAPACKE_sgbrfs_work, + LAPACKE_sgbsv, + LAPACKE_sgbsv_work, + LAPACKE_sgbsvx, + LAPACKE_sgbsvx_work, + LAPACKE_sgbtrf, + LAPACKE_sgbtrf_work, + LAPACKE_sgbtrs, + LAPACKE_sgbtrs_work, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgebak, + LAPACKE_sgebak_work, + LAPACKE_sgebal, + LAPACKE_sgebal_work, + LAPACKE_sgebrd, + LAPACKE_sgebrd_work, + LAPACKE_sgecon, + LAPACKE_sgecon_work, + LAPACKE_sgeequ, + LAPACKE_sgeequ_work, + LAPACKE_sgeequb, + LAPACKE_sgeequb_work, + LAPACKE_sgees, + LAPACKE_sgees_work, + LAPACKE_sgeesx, + LAPACKE_sgeesx_work, + LAPACKE_sgeev, + LAPACKE_sgeev_work, + LAPACKE_sgeevx, + LAPACKE_sgeevx_work, + LAPACKE_sgehrd, + LAPACKE_sgehrd_work, + LAPACKE_sgejsv, + 
LAPACKE_sgejsv_work, + LAPACKE_sgelq2, + LAPACKE_sgelq2_work, + LAPACKE_sgelqf, + LAPACKE_sgelqf_work, + LAPACKE_sgels, + LAPACKE_sgels_work, + LAPACKE_sgelsd, + LAPACKE_sgelsd_work, + LAPACKE_sgelss, + LAPACKE_sgelss_work, + LAPACKE_sgelsy, + LAPACKE_sgelsy_work, + LAPACKE_sgemqrt, + LAPACKE_sgemqrt_work, + LAPACKE_sgeqlf, + LAPACKE_sgeqlf_work, + LAPACKE_sgeqp3, + LAPACKE_sgeqp3_work, + LAPACKE_sgeqpf, + LAPACKE_sgeqpf_work, + LAPACKE_sgeqr2, + LAPACKE_sgeqr2_work, + LAPACKE_sgeqrf, + LAPACKE_sgeqrf_work, + LAPACKE_sgeqrfp, + LAPACKE_sgeqrfp_work, + LAPACKE_sgeqrt, + LAPACKE_sgeqrt2, + LAPACKE_sgeqrt2_work, + LAPACKE_sgeqrt3, + LAPACKE_sgeqrt3_work, + LAPACKE_sgeqrt_work, + LAPACKE_sgerfs, + LAPACKE_sgerfs_work, + LAPACKE_sgerqf, + LAPACKE_sgerqf_work, + LAPACKE_sgesdd, + LAPACKE_sgesdd_work, + LAPACKE_sgesv, + LAPACKE_sgesv_work, + LAPACKE_sgesvd, + LAPACKE_sgesvd_work, + LAPACKE_sgesvj, + LAPACKE_sgesvj_work, + LAPACKE_sgesvx, + LAPACKE_sgesvx_work, + LAPACKE_sgetf2, + LAPACKE_sgetf2_work, + LAPACKE_sgetrf, + LAPACKE_sgetrf_work, + LAPACKE_sgetri, + LAPACKE_sgetri_work, + LAPACKE_sgetrs, + LAPACKE_sgetrs_work, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sggbak, + LAPACKE_sggbak_work, + LAPACKE_sggbal, + LAPACKE_sggbal_work, + LAPACKE_sgges, + LAPACKE_sgges_work, + LAPACKE_sggesx, + LAPACKE_sggesx_work, + LAPACKE_sggev, + LAPACKE_sggev_work, + LAPACKE_sggevx, + LAPACKE_sggevx_work, + LAPACKE_sggglm, + LAPACKE_sggglm_work, + LAPACKE_sgghrd, + LAPACKE_sgghrd_work, + LAPACKE_sgglse, + LAPACKE_sgglse_work, + LAPACKE_sggqrf, + LAPACKE_sggqrf_work, + LAPACKE_sggrqf, + LAPACKE_sggrqf_work, + LAPACKE_sggsvd, + LAPACKE_sggsvd_work, + LAPACKE_sggsvp, + LAPACKE_sggsvp_work, + LAPACKE_sgt_nancheck, + LAPACKE_sgtcon, + LAPACKE_sgtcon_work, + LAPACKE_sgtrfs, + LAPACKE_sgtrfs_work, + LAPACKE_sgtsv, + LAPACKE_sgtsv_work, + LAPACKE_sgtsvx, + LAPACKE_sgtsvx_work, + LAPACKE_sgttrf, + LAPACKE_sgttrf_work, + LAPACKE_sgttrs, + LAPACKE_sgttrs_work, + LAPACKE_shgeqz, + 
LAPACKE_shgeqz_work, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_shsein, + LAPACKE_shsein_work, + LAPACKE_shseqr, + LAPACKE_shseqr_work, + LAPACKE_slacpy, + LAPACKE_slacpy_work, + LAPACKE_slag2d, + LAPACKE_slag2d_work, + LAPACKE_slamch, + LAPACKE_slamch_work, + LAPACKE_slange, + LAPACKE_slange_work, + LAPACKE_slansy, + LAPACKE_slansy_work, + LAPACKE_slantr, + LAPACKE_slantr_work, + LAPACKE_slapmr, + LAPACKE_slapmr_work, + LAPACKE_slapy2, + LAPACKE_slapy2_work, + LAPACKE_slapy3, + LAPACKE_slapy3_work, + LAPACKE_slarfb, + LAPACKE_slarfb_work, + LAPACKE_slarfg, + LAPACKE_slarfg_work, + LAPACKE_slarft, + LAPACKE_slarft_work, + LAPACKE_slarfx, + LAPACKE_slarfx_work, + LAPACKE_slarnv, + LAPACKE_slarnv_work, + LAPACKE_slartgp, + LAPACKE_slartgp_work, + LAPACKE_slartgs, + LAPACKE_slartgs_work, + LAPACKE_slaset, + LAPACKE_slaset_work, + LAPACKE_slasrt, + LAPACKE_slasrt_work, + LAPACKE_slaswp, + LAPACKE_slaswp_work, + LAPACKE_slauum, + LAPACKE_slauum_work, + LAPACKE_sopgtr, + LAPACKE_sopgtr_work, + LAPACKE_sopmtr, + LAPACKE_sopmtr_work, + LAPACKE_sorbdb, + LAPACKE_sorbdb_work, + LAPACKE_sorcsd, + LAPACKE_sorcsd_work, + LAPACKE_sorgbr, + LAPACKE_sorgbr_work, + LAPACKE_sorghr, + LAPACKE_sorghr_work, + LAPACKE_sorglq, + LAPACKE_sorglq_work, + LAPACKE_sorgql, + LAPACKE_sorgql_work, + LAPACKE_sorgqr, + LAPACKE_sorgqr_work, + LAPACKE_sorgrq, + LAPACKE_sorgrq_work, + LAPACKE_sorgtr, + LAPACKE_sorgtr_work, + LAPACKE_sormbr, + LAPACKE_sormbr_work, + LAPACKE_sormhr, + LAPACKE_sormhr_work, + LAPACKE_sormlq, + LAPACKE_sormlq_work, + LAPACKE_sormql, + LAPACKE_sormql_work, + LAPACKE_sormqr, + LAPACKE_sormqr_work, + LAPACKE_sormrq, + LAPACKE_sormrq_work, + LAPACKE_sormrz, + LAPACKE_sormrz_work, + LAPACKE_sormtr, + LAPACKE_sormtr_work, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spbcon, + LAPACKE_spbcon_work, + LAPACKE_spbequ, + LAPACKE_spbequ_work, + LAPACKE_spbrfs, + LAPACKE_spbrfs_work, + LAPACKE_spbstf, + LAPACKE_spbstf_work, + LAPACKE_spbsv, + LAPACKE_spbsv_work, 
+ LAPACKE_spbsvx, + LAPACKE_spbsvx_work, + LAPACKE_spbtrf, + LAPACKE_spbtrf_work, + LAPACKE_spbtrs, + LAPACKE_spbtrs_work, + LAPACKE_spf_nancheck, + LAPACKE_spf_trans, + LAPACKE_spftrf, + LAPACKE_spftrf_work, + LAPACKE_spftri, + LAPACKE_spftri_work, + LAPACKE_spftrs, + LAPACKE_spftrs_work, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spocon, + LAPACKE_spocon_work, + LAPACKE_spoequ, + LAPACKE_spoequ_work, + LAPACKE_spoequb, + LAPACKE_spoequb_work, + LAPACKE_sporfs, + LAPACKE_sporfs_work, + LAPACKE_sposv, + LAPACKE_sposv_work, + LAPACKE_sposvx, + LAPACKE_sposvx_work, + LAPACKE_spotrf, + LAPACKE_spotrf_work, + LAPACKE_spotri, + LAPACKE_spotri_work, + LAPACKE_spotrs, + LAPACKE_spotrs_work, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_sppcon, + LAPACKE_sppcon_work, + LAPACKE_sppequ, + LAPACKE_sppequ_work, + LAPACKE_spprfs, + LAPACKE_spprfs_work, + LAPACKE_sppsv, + LAPACKE_sppsv_work, + LAPACKE_sppsvx, + LAPACKE_sppsvx_work, + LAPACKE_spptrf, + LAPACKE_spptrf_work, + LAPACKE_spptri, + LAPACKE_spptri_work, + LAPACKE_spptrs, + LAPACKE_spptrs_work, + LAPACKE_spstrf, + LAPACKE_spstrf_work, + LAPACKE_spt_nancheck, + LAPACKE_sptcon, + LAPACKE_sptcon_work, + LAPACKE_spteqr, + LAPACKE_spteqr_work, + LAPACKE_sptrfs, + LAPACKE_sptrfs_work, + LAPACKE_sptsv, + LAPACKE_sptsv_work, + LAPACKE_sptsvx, + LAPACKE_sptsvx_work, + LAPACKE_spttrf, + LAPACKE_spttrf_work, + LAPACKE_spttrs, + LAPACKE_spttrs_work, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssbev, + LAPACKE_ssbev_work, + LAPACKE_ssbevd, + LAPACKE_ssbevd_work, + LAPACKE_ssbevx, + LAPACKE_ssbevx_work, + LAPACKE_ssbgst, + LAPACKE_ssbgst_work, + LAPACKE_ssbgv, + LAPACKE_ssbgv_work, + LAPACKE_ssbgvd, + LAPACKE_ssbgvd_work, + LAPACKE_ssbgvx, + LAPACKE_ssbgvx_work, + LAPACKE_ssbtrd, + LAPACKE_ssbtrd_work, + LAPACKE_ssfrk, + LAPACKE_ssfrk_work, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sspcon, + LAPACKE_sspcon_work, + LAPACKE_sspev, + LAPACKE_sspev_work, + LAPACKE_sspevd, + LAPACKE_sspevd_work, 
+ LAPACKE_sspevx, + LAPACKE_sspevx_work, + LAPACKE_sspgst, + LAPACKE_sspgst_work, + LAPACKE_sspgv, + LAPACKE_sspgv_work, + LAPACKE_sspgvd, + LAPACKE_sspgvd_work, + LAPACKE_sspgvx, + LAPACKE_sspgvx_work, + LAPACKE_ssprfs, + LAPACKE_ssprfs_work, + LAPACKE_sspsv, + LAPACKE_sspsv_work, + LAPACKE_sspsvx, + LAPACKE_sspsvx_work, + LAPACKE_ssptrd, + LAPACKE_ssptrd_work, + LAPACKE_ssptrf, + LAPACKE_ssptrf_work, + LAPACKE_ssptri, + LAPACKE_ssptri_work, + LAPACKE_ssptrs, + LAPACKE_ssptrs_work, + LAPACKE_sst_nancheck, + LAPACKE_sstebz, + LAPACKE_sstebz_work, + LAPACKE_sstedc, + LAPACKE_sstedc_work, + LAPACKE_sstegr, + LAPACKE_sstegr_work, + LAPACKE_sstein, + LAPACKE_sstein_work, + LAPACKE_sstemr, + LAPACKE_sstemr_work, + LAPACKE_ssteqr, + LAPACKE_ssteqr_work, + LAPACKE_ssterf, + LAPACKE_ssterf_work, + LAPACKE_sstev, + LAPACKE_sstev_work, + LAPACKE_sstevd, + LAPACKE_sstevd_work, + LAPACKE_sstevr, + LAPACKE_sstevr_work, + LAPACKE_sstevx, + LAPACKE_sstevx_work, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_ssycon, + LAPACKE_ssycon_work, + LAPACKE_ssyconv, + LAPACKE_ssyconv_work, + LAPACKE_ssyequb, + LAPACKE_ssyequb_work, + LAPACKE_ssyev, + LAPACKE_ssyev_work, + LAPACKE_ssyevd, + LAPACKE_ssyevd_work, + LAPACKE_ssyevr, + LAPACKE_ssyevr_work, + LAPACKE_ssyevx, + LAPACKE_ssyevx_work, + LAPACKE_ssygst, + LAPACKE_ssygst_work, + LAPACKE_ssygv, + LAPACKE_ssygv_work, + LAPACKE_ssygvd, + LAPACKE_ssygvd_work, + LAPACKE_ssygvx, + LAPACKE_ssygvx_work, + LAPACKE_ssyrfs, + LAPACKE_ssyrfs_work, + LAPACKE_ssysv, + LAPACKE_ssysv_work, + LAPACKE_ssysvx, + LAPACKE_ssysvx_work, + LAPACKE_ssyswapr, + LAPACKE_ssyswapr_work, + LAPACKE_ssytrd, + LAPACKE_ssytrd_work, + LAPACKE_ssytrf, + LAPACKE_ssytrf_work, + LAPACKE_ssytri, + LAPACKE_ssytri2, + LAPACKE_ssytri2_work, + LAPACKE_ssytri2x, + LAPACKE_ssytri2x_work, + LAPACKE_ssytri_work, + LAPACKE_ssytrs, + LAPACKE_ssytrs2, + LAPACKE_ssytrs2_work, + LAPACKE_ssytrs_work, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stbcon, + 
LAPACKE_stbcon_work, + LAPACKE_stbrfs, + LAPACKE_stbrfs_work, + LAPACKE_stbtrs, + LAPACKE_stbtrs_work, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stfsm, + LAPACKE_stfsm_work, + LAPACKE_stftri, + LAPACKE_stftri_work, + LAPACKE_stfttp, + LAPACKE_stfttp_work, + LAPACKE_stfttr, + LAPACKE_stfttr_work, + LAPACKE_stgevc, + LAPACKE_stgevc_work, + LAPACKE_stgexc, + LAPACKE_stgexc_work, + LAPACKE_stgsen, + LAPACKE_stgsen_work, + LAPACKE_stgsja, + LAPACKE_stgsja_work, + LAPACKE_stgsna, + LAPACKE_stgsna_work, + LAPACKE_stgsyl, + LAPACKE_stgsyl_work, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_stpcon, + LAPACKE_stpcon_work, + LAPACKE_stpmqrt, + LAPACKE_stpmqrt_work, + LAPACKE_stpqrt2, + LAPACKE_stpqrt2_work, + LAPACKE_stprfb, + LAPACKE_stprfb_work, + LAPACKE_stprfs, + LAPACKE_stprfs_work, + LAPACKE_stptri, + LAPACKE_stptri_work, + LAPACKE_stptrs, + LAPACKE_stptrs_work, + LAPACKE_stpttf, + LAPACKE_stpttf_work, + LAPACKE_stpttr, + LAPACKE_stpttr_work, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_strcon, + LAPACKE_strcon_work, + LAPACKE_strevc, + LAPACKE_strevc_work, + LAPACKE_strexc, + LAPACKE_strexc_work, + LAPACKE_strrfs, + LAPACKE_strrfs_work, + LAPACKE_strsen, + LAPACKE_strsen_work, + LAPACKE_strsna, + LAPACKE_strsna_work, + LAPACKE_strsyl, + LAPACKE_strsyl_work, + LAPACKE_strtri, + LAPACKE_strtri_work, + LAPACKE_strtrs, + LAPACKE_strtrs_work, + LAPACKE_strttf, + LAPACKE_strttf_work, + LAPACKE_strttp, + LAPACKE_strttp_work, + LAPACKE_stzrzf, + LAPACKE_stzrzf_work, + LAPACKE_xerbla, + LAPACKE_z_nancheck, + LAPACKE_zbbcsd, + LAPACKE_zbbcsd_work, + LAPACKE_zbdsqr, + LAPACKE_zbdsqr_work, + LAPACKE_zcgesv, + LAPACKE_zcgesv_work, + LAPACKE_zcposv, + LAPACKE_zcposv_work, + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zgbbrd, + LAPACKE_zgbbrd_work, + LAPACKE_zgbcon, + LAPACKE_zgbcon_work, + LAPACKE_zgbequ, + LAPACKE_zgbequ_work, + LAPACKE_zgbequb, + LAPACKE_zgbequb_work, + LAPACKE_zgbrfs, + LAPACKE_zgbrfs_work, + LAPACKE_zgbsv, + 
LAPACKE_zgbsv_work, + LAPACKE_zgbsvx, + LAPACKE_zgbsvx_work, + LAPACKE_zgbtrf, + LAPACKE_zgbtrf_work, + LAPACKE_zgbtrs, + LAPACKE_zgbtrs_work, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgebak, + LAPACKE_zgebak_work, + LAPACKE_zgebal, + LAPACKE_zgebal_work, + LAPACKE_zgebrd, + LAPACKE_zgebrd_work, + LAPACKE_zgecon, + LAPACKE_zgecon_work, + LAPACKE_zgeequ, + LAPACKE_zgeequ_work, + LAPACKE_zgeequb, + LAPACKE_zgeequb_work, + LAPACKE_zgees, + LAPACKE_zgees_work, + LAPACKE_zgeesx, + LAPACKE_zgeesx_work, + LAPACKE_zgeev, + LAPACKE_zgeev_work, + LAPACKE_zgeevx, + LAPACKE_zgeevx_work, + LAPACKE_zgehrd, + LAPACKE_zgehrd_work, + LAPACKE_zgelq2, + LAPACKE_zgelq2_work, + LAPACKE_zgelqf, + LAPACKE_zgelqf_work, + LAPACKE_zgels, + LAPACKE_zgels_work, + LAPACKE_zgelsd, + LAPACKE_zgelsd_work, + LAPACKE_zgelss, + LAPACKE_zgelss_work, + LAPACKE_zgelsy, + LAPACKE_zgelsy_work, + LAPACKE_zgemqrt, + LAPACKE_zgemqrt_work, + LAPACKE_zgeqlf, + LAPACKE_zgeqlf_work, + LAPACKE_zgeqp3, + LAPACKE_zgeqp3_work, + LAPACKE_zgeqpf, + LAPACKE_zgeqpf_work, + LAPACKE_zgeqr2, + LAPACKE_zgeqr2_work, + LAPACKE_zgeqrf, + LAPACKE_zgeqrf_work, + LAPACKE_zgeqrfp, + LAPACKE_zgeqrfp_work, + LAPACKE_zgeqrt, + LAPACKE_zgeqrt2, + LAPACKE_zgeqrt2_work, + LAPACKE_zgeqrt3, + LAPACKE_zgeqrt3_work, + LAPACKE_zgeqrt_work, + LAPACKE_zgerfs, + LAPACKE_zgerfs_work, + LAPACKE_zgerqf, + LAPACKE_zgerqf_work, + LAPACKE_zgesdd, + LAPACKE_zgesdd_work, + LAPACKE_zgesv, + LAPACKE_zgesv_work, + LAPACKE_zgesvd, + LAPACKE_zgesvd_work, + LAPACKE_zgesvx, + LAPACKE_zgesvx_work, + LAPACKE_zgetf2, + LAPACKE_zgetf2_work, + LAPACKE_zgetrf, + LAPACKE_zgetrf_work, + LAPACKE_zgetri, + LAPACKE_zgetri_work, + LAPACKE_zgetrs, + LAPACKE_zgetrs_work, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zggbak, + LAPACKE_zggbak_work, + LAPACKE_zggbal, + LAPACKE_zggbal_work, + LAPACKE_zgges, + LAPACKE_zgges_work, + LAPACKE_zggesx, + LAPACKE_zggesx_work, + LAPACKE_zggev, + LAPACKE_zggev_work, + LAPACKE_zggevx, + LAPACKE_zggevx_work, + 
LAPACKE_zggglm, + LAPACKE_zggglm_work, + LAPACKE_zgghrd, + LAPACKE_zgghrd_work, + LAPACKE_zgglse, + LAPACKE_zgglse_work, + LAPACKE_zggqrf, + LAPACKE_zggqrf_work, + LAPACKE_zggrqf, + LAPACKE_zggrqf_work, + LAPACKE_zggsvd, + LAPACKE_zggsvd_work, + LAPACKE_zggsvp, + LAPACKE_zggsvp_work, + LAPACKE_zgt_nancheck, + LAPACKE_zgtcon, + LAPACKE_zgtcon_work, + LAPACKE_zgtrfs, + LAPACKE_zgtrfs_work, + LAPACKE_zgtsv, + LAPACKE_zgtsv_work, + LAPACKE_zgtsvx, + LAPACKE_zgtsvx_work, + LAPACKE_zgttrf, + LAPACKE_zgttrf_work, + LAPACKE_zgttrs, + LAPACKE_zgttrs_work, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhbev, + LAPACKE_zhbev_work, + LAPACKE_zhbevd, + LAPACKE_zhbevd_work, + LAPACKE_zhbevx, + LAPACKE_zhbevx_work, + LAPACKE_zhbgst, + LAPACKE_zhbgst_work, + LAPACKE_zhbgv, + LAPACKE_zhbgv_work, + LAPACKE_zhbgvd, + LAPACKE_zhbgvd_work, + LAPACKE_zhbgvx, + LAPACKE_zhbgvx_work, + LAPACKE_zhbtrd, + LAPACKE_zhbtrd_work, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhecon, + LAPACKE_zhecon_work, + LAPACKE_zheequb, + LAPACKE_zheequb_work, + LAPACKE_zheev, + LAPACKE_zheev_work, + LAPACKE_zheevd, + LAPACKE_zheevd_work, + LAPACKE_zheevr, + LAPACKE_zheevr_work, + LAPACKE_zheevx, + LAPACKE_zheevx_work, + LAPACKE_zhegst, + LAPACKE_zhegst_work, + LAPACKE_zhegv, + LAPACKE_zhegv_work, + LAPACKE_zhegvd, + LAPACKE_zhegvd_work, + LAPACKE_zhegvx, + LAPACKE_zhegvx_work, + LAPACKE_zherfs, + LAPACKE_zherfs_work, + LAPACKE_zhesv, + LAPACKE_zhesv_work, + LAPACKE_zhesvx, + LAPACKE_zhesvx_work, + LAPACKE_zheswapr, + LAPACKE_zheswapr_work, + LAPACKE_zhetrd, + LAPACKE_zhetrd_work, + LAPACKE_zhetrf, + LAPACKE_zhetrf_work, + LAPACKE_zhetri, + LAPACKE_zhetri2, + LAPACKE_zhetri2_work, + LAPACKE_zhetri2x, + LAPACKE_zhetri2x_work, + LAPACKE_zhetri_work, + LAPACKE_zhetrs, + LAPACKE_zhetrs2, + LAPACKE_zhetrs2_work, + LAPACKE_zhetrs_work, + LAPACKE_zhfrk, + LAPACKE_zhfrk_work, + LAPACKE_zhgeqz, + LAPACKE_zhgeqz_work, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhpcon, + 
LAPACKE_zhpcon_work, + LAPACKE_zhpev, + LAPACKE_zhpev_work, + LAPACKE_zhpevd, + LAPACKE_zhpevd_work, + LAPACKE_zhpevx, + LAPACKE_zhpevx_work, + LAPACKE_zhpgst, + LAPACKE_zhpgst_work, + LAPACKE_zhpgv, + LAPACKE_zhpgv_work, + LAPACKE_zhpgvd, + LAPACKE_zhpgvd_work, + LAPACKE_zhpgvx, + LAPACKE_zhpgvx_work, + LAPACKE_zhprfs, + LAPACKE_zhprfs_work, + LAPACKE_zhpsv, + LAPACKE_zhpsv_work, + LAPACKE_zhpsvx, + LAPACKE_zhpsvx_work, + LAPACKE_zhptrd, + LAPACKE_zhptrd_work, + LAPACKE_zhptrf, + LAPACKE_zhptrf_work, + LAPACKE_zhptri, + LAPACKE_zhptri_work, + LAPACKE_zhptrs, + LAPACKE_zhptrs_work, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_zhsein, + LAPACKE_zhsein_work, + LAPACKE_zhseqr, + LAPACKE_zhseqr_work, + LAPACKE_zlacgv, + LAPACKE_zlacgv_work, + LAPACKE_zlacpy, + LAPACKE_zlacpy_work, + LAPACKE_zlag2c, + LAPACKE_zlag2c_work, + LAPACKE_zlange, + LAPACKE_zlange_work, + LAPACKE_zlanhe, + LAPACKE_zlanhe_work, + LAPACKE_zlansy, + LAPACKE_zlansy_work, + LAPACKE_zlantr, + LAPACKE_zlantr_work, + LAPACKE_zlapmr, + LAPACKE_zlapmr_work, + LAPACKE_zlarfb, + LAPACKE_zlarfb_work, + LAPACKE_zlarfg, + LAPACKE_zlarfg_work, + LAPACKE_zlarft, + LAPACKE_zlarft_work, + LAPACKE_zlarfx, + LAPACKE_zlarfx_work, + LAPACKE_zlarnv, + LAPACKE_zlarnv_work, + LAPACKE_zlaset, + LAPACKE_zlaset_work, + LAPACKE_zlaswp, + LAPACKE_zlaswp_work, + LAPACKE_zlauum, + LAPACKE_zlauum_work, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpbcon, + LAPACKE_zpbcon_work, + LAPACKE_zpbequ, + LAPACKE_zpbequ_work, + LAPACKE_zpbrfs, + LAPACKE_zpbrfs_work, + LAPACKE_zpbstf, + LAPACKE_zpbstf_work, + LAPACKE_zpbsv, + LAPACKE_zpbsv_work, + LAPACKE_zpbsvx, + LAPACKE_zpbsvx_work, + LAPACKE_zpbtrf, + LAPACKE_zpbtrf_work, + LAPACKE_zpbtrs, + LAPACKE_zpbtrs_work, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpftrf, + LAPACKE_zpftrf_work, + LAPACKE_zpftri, + LAPACKE_zpftri_work, + LAPACKE_zpftrs, + LAPACKE_zpftrs_work, + LAPACKE_zpo_nancheck, + LAPACKE_zpo_trans, + LAPACKE_zpocon, + LAPACKE_zpocon_work, + 
LAPACKE_zpoequ, + LAPACKE_zpoequ_work, + LAPACKE_zpoequb, + LAPACKE_zpoequb_work, + LAPACKE_zporfs, + LAPACKE_zporfs_work, + LAPACKE_zposv, + LAPACKE_zposv_work, + LAPACKE_zposvx, + LAPACKE_zposvx_work, + LAPACKE_zpotrf, + LAPACKE_zpotrf_work, + LAPACKE_zpotri, + LAPACKE_zpotri_work, + LAPACKE_zpotrs, + LAPACKE_zpotrs_work, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zppcon, + LAPACKE_zppcon_work, + LAPACKE_zppequ, + LAPACKE_zppequ_work, + LAPACKE_zpprfs, + LAPACKE_zpprfs_work, + LAPACKE_zppsv, + LAPACKE_zppsv_work, + LAPACKE_zppsvx, + LAPACKE_zppsvx_work, + LAPACKE_zpptrf, + LAPACKE_zpptrf_work, + LAPACKE_zpptri, + LAPACKE_zpptri_work, + LAPACKE_zpptrs, + LAPACKE_zpptrs_work, + LAPACKE_zpstrf, + LAPACKE_zpstrf_work, + LAPACKE_zpt_nancheck, + LAPACKE_zptcon, + LAPACKE_zptcon_work, + LAPACKE_zpteqr, + LAPACKE_zpteqr_work, + LAPACKE_zptrfs, + LAPACKE_zptrfs_work, + LAPACKE_zptsv, + LAPACKE_zptsv_work, + LAPACKE_zptsvx, + LAPACKE_zptsvx_work, + LAPACKE_zpttrf, + LAPACKE_zpttrf_work, + LAPACKE_zpttrs, + LAPACKE_zpttrs_work, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zspcon, + LAPACKE_zspcon_work, + LAPACKE_zsprfs, + LAPACKE_zsprfs_work, + LAPACKE_zspsv, + LAPACKE_zspsv_work, + LAPACKE_zspsvx, + LAPACKE_zspsvx_work, + LAPACKE_zsptrf, + LAPACKE_zsptrf_work, + LAPACKE_zsptri, + LAPACKE_zsptri_work, + LAPACKE_zsptrs, + LAPACKE_zsptrs_work, + LAPACKE_zst_nancheck, + LAPACKE_zstedc, + LAPACKE_zstedc_work, + LAPACKE_zstegr, + LAPACKE_zstegr_work, + LAPACKE_zstein, + LAPACKE_zstein_work, + LAPACKE_zstemr, + LAPACKE_zstemr_work, + LAPACKE_zsteqr, + LAPACKE_zsteqr_work, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_zsycon, + LAPACKE_zsycon_work, + LAPACKE_zsyconv, + LAPACKE_zsyconv_work, + LAPACKE_zsyequb, + LAPACKE_zsyequb_work, + LAPACKE_zsyrfs, + LAPACKE_zsyrfs_work, + LAPACKE_zsysv, + LAPACKE_zsysv_work, + LAPACKE_zsysvx, + LAPACKE_zsysvx_work, + LAPACKE_zsyswapr, + LAPACKE_zsyswapr_work, + LAPACKE_zsytrf, + LAPACKE_zsytrf_work, + 
LAPACKE_zsytri, + LAPACKE_zsytri2, + LAPACKE_zsytri2_work, + LAPACKE_zsytri2x, + LAPACKE_zsytri2x_work, + LAPACKE_zsytri_work, + LAPACKE_zsytrs, + LAPACKE_zsytrs2, + LAPACKE_zsytrs2_work, + LAPACKE_zsytrs_work, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztbcon, + LAPACKE_ztbcon_work, + LAPACKE_ztbrfs, + LAPACKE_ztbrfs_work, + LAPACKE_ztbtrs, + LAPACKE_ztbtrs_work, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztfsm, + LAPACKE_ztfsm_work, + LAPACKE_ztftri, + LAPACKE_ztftri_work, + LAPACKE_ztfttp, + LAPACKE_ztfttp_work, + LAPACKE_ztfttr, + LAPACKE_ztfttr_work, + LAPACKE_ztgevc, + LAPACKE_ztgevc_work, + LAPACKE_ztgexc, + LAPACKE_ztgexc_work, + LAPACKE_ztgsen, + LAPACKE_ztgsen_work, + LAPACKE_ztgsja, + LAPACKE_ztgsja_work, + LAPACKE_ztgsna, + LAPACKE_ztgsna_work, + LAPACKE_ztgsyl, + LAPACKE_ztgsyl_work, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztpcon, + LAPACKE_ztpcon_work, + LAPACKE_ztpmqrt, + LAPACKE_ztpmqrt_work, + LAPACKE_ztpqrt, + LAPACKE_ztpqrt2, + LAPACKE_ztpqrt2_work, + LAPACKE_ztpqrt_work, + LAPACKE_ztprfb, + LAPACKE_ztprfb_work, + LAPACKE_ztprfs, + LAPACKE_ztprfs_work, + LAPACKE_ztptri, + LAPACKE_ztptri_work, + LAPACKE_ztptrs, + LAPACKE_ztptrs_work, + LAPACKE_ztpttf, + LAPACKE_ztpttf_work, + LAPACKE_ztpttr, + LAPACKE_ztpttr_work, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + LAPACKE_ztrcon, + LAPACKE_ztrcon_work, + LAPACKE_ztrevc, + LAPACKE_ztrevc_work, + LAPACKE_ztrexc, + LAPACKE_ztrexc_work, + LAPACKE_ztrrfs, + LAPACKE_ztrrfs_work, + LAPACKE_ztrsen, + LAPACKE_ztrsen_work, + LAPACKE_ztrsna, + LAPACKE_ztrsna_work, + LAPACKE_ztrsyl, + LAPACKE_ztrsyl_work, + LAPACKE_ztrtri, + LAPACKE_ztrtri_work, + LAPACKE_ztrtrs, + LAPACKE_ztrtrs_work, + LAPACKE_ztrttf, + LAPACKE_ztrttf_work, + LAPACKE_ztrttp, + LAPACKE_ztrttp_work, + LAPACKE_ztzrzf, + LAPACKE_ztzrzf_work, + LAPACKE_zunbdb, + LAPACKE_zunbdb_work, + LAPACKE_zuncsd, + LAPACKE_zuncsd_work, + LAPACKE_zungbr, + LAPACKE_zungbr_work, + LAPACKE_zunghr, + LAPACKE_zunghr_work, + 
LAPACKE_zunglq, + LAPACKE_zunglq_work, + LAPACKE_zungql, + LAPACKE_zungql_work, + LAPACKE_zungqr, + LAPACKE_zungqr_work, + LAPACKE_zungrq, + LAPACKE_zungrq_work, + LAPACKE_zungtr, + LAPACKE_zungtr_work, + LAPACKE_zunmbr, + LAPACKE_zunmbr_work, + LAPACKE_zunmhr, + LAPACKE_zunmhr_work, + LAPACKE_zunmlq, + LAPACKE_zunmlq_work, + LAPACKE_zunmql, + LAPACKE_zunmql_work, + LAPACKE_zunmqr, + LAPACKE_zunmqr_work, + LAPACKE_zunmrq, + LAPACKE_zunmrq_work, + LAPACKE_zunmrz, + LAPACKE_zunmrz_work, + LAPACKE_zunmtr, + LAPACKE_zunmtr_work, + LAPACKE_zupgtr, + LAPACKE_zupgtr_work, + LAPACKE_zupmtr, + LAPACKE_zupmtr_work, + ); + if ($ARGV[5] == 1) { #NO_LAPACK=1 - @objs = (@blasobjs); + @underscore_objs = (@blasobjs); } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") { - @objs = (@blasobjs, @lapackobjs, @lapackobjs2); + @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2); } else { - @objs = (@blasobjs, @lapackobjs); + @underscore_objs = (@blasobjs, @lapackobjs); } -if ($ARGV[3] == 1){ @objs = (@objs, @exblasobjs); }; +if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; + +if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "X86_64"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "x86"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "ia64"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; -if ($ARGV[1] eq "MIPS"){ @objs = (@objs, @gemm3mobjs); }; +if ($ARGV[6] == 1) { + #NO_LAPACK=1 + @no_underscore_objs = (@cblasobjs); +} else { + @no_underscore_objs = (@cblasobjs, @lapackeobjs); +} @linuxobjs = ('__strtol_internal', 'exit', 'free', 'getenv', 'malloc', 'mmap', 'printf', 'sqrt', @@ -369,12 +2481,12 @@ $bu = $ARGV[2]; $bu = "" if (($bu eq "0") || ($bu eq "1")); if ($ARGV[0] 
eq "linux"){ - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print $objs, $bu, "\n"; } if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { + foreach $objs (@no_underscore_objs) { print $objs, "\n"; } } @@ -386,12 +2498,12 @@ if ($ARGV[0] eq "linux"){ } if ($ARGV[0] eq "osx"){ - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print "_", $objs, $bu, "\n"; } if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { + foreach $objs (@no_underscore_objs) { print "_", $objs, "\n"; } } @@ -399,12 +2511,12 @@ if ($ARGV[0] eq "osx"){ } if ($ARGV[0] eq "aix"){ - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print $objs, $bu, "\n"; } if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { + foreach $objs (@no_underscore_objs) { print $objs, "\n"; } } @@ -414,7 +2526,7 @@ if ($ARGV[0] eq "aix"){ if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t$objs=$objs","_ \@", $count, "\n"; @@ -426,7 +2538,7 @@ if ($ARGV[0] eq "win2k"){ } if ($ARGV[4] == 0) { - foreach $objs (@cblasobjs) { + foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; $count ++; } @@ -460,7 +2572,7 @@ if ($ARGV[0] eq "win2khpl"){ if ($ARGV[0] eq "microsoft"){ print "EXPORTS\n"; $count = 1; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t$objs = $objs","_\n"; @@ -477,7 +2589,7 @@ if ($ARGV[0] eq "microsoft"){ if ($ARGV[0] eq "win2kasm"){ print "\t.text\n"; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; print "\t.align 16\n"; @@ -490,11 +2602,11 @@ if ($ARGV[0] eq "win2kasm"){ if ($ARGV[0] eq "linktest"){ print "int main(void){\n"; - foreach $objs (@objs) { + foreach $objs (@underscore_objs) { print $objs, $bu, "();\n" if $objs ne "xerbla"; } if ($ARGV[4] == 0) { - foreach $objs 
(@cblasobjs) { + foreach $objs (@no_underscore_objs) { print $objs, "();\n"; } } diff --git a/patch.for_lapack-3.4.0 b/patch.for_lapack-3.4.0 index 9d3cd5f31..5d048f9c4 100644 --- a/patch.for_lapack-3.4.0 +++ b/patch.for_lapack-3.4.0 @@ -887,3 +887,158 @@ diff -ruN lapack-3.4.0.old/TESTING/LIN/Makefile lapack-3.4.0/TESTING/LIN/Makefil ../xlintsts: xlintsts mv xlintsts $@ + +diff -ruN lapack-3.4.0.old/lapacke/make.inc lapack-3.4.0/lapacke/make.inc +--- lapack-3.4.0.old/lapacke/make.inc 2011-11-09 23:56:15 +0100 ++++ lapack-3.4.0/lapacke/make.inc 1970-01-01 01:00:00 +0100 +@@ -1,66 +0,0 @@ +-############################################################################## +-# Copyright (c) 2010, Intel Corp. +-# All rights reserved. +-# +-# Redistribution and use in source and binary forms, with or without +-# modification, are permitted provided that the following conditions are met: +-# +-# * Redistributions of source code must retain the above copyright notice, +-# this list of conditions and the following disclaimer. +-# * Redistributions in binary form must reproduce the above copyright +-# notice, this list of conditions and the following disclaimer in the +-# documentation and/or other materials provided with the distribution. +-# * Neither the name of Intel Corporation nor the names of its contributors +-# may be used to endorse or promote products derived from this software +-# without specific prior written permission. +-# +-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +-# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +-# THE POSSIBILITY OF SUCH DAMAGE. +-############################################################################## +-# Contents: Native C interface to LAPACK +-# Author: Intel Corporation +-# September, 2010 +-############################################################################## +-# This is the make.inc example. The following settings are used: +-# +-# Compiler: gcc +-# Configuration file: turned off (default) +-# Complex types: C99 (default) +-# Name pattern: mixed case (default) +-# (64-bit) Data model: LP64 (default) +-# +-# Basic include options. +-# CC is the C compiler, normally invoked with options CFLAGS. +-# LINKER is the linker, invoked with LDFLAGS. +-# +-# If libraries lapack.a and blas.a are built with +-# - ifort, set: LINKER = ifort +-# LDFLAGS = -nofor-main +-# - gfortran, set: LINKER = gfortran +-# +-CC = gcc +-CFLAGS = +-LINKER = gfortran +-LDFLAGS = +-# +-# The name of the libraries to be created/linked to +-# Ensure that the libraries have the same data model (LP64/ILP64). +-# +-LAPACKE = lapacke.a +-LIBS = /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/lapack_LINUX.a \ +- /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/blas_LINUX.a -lm +-# The archiver and the flag(s) to use when building archive (library) +-# If your system has no ranlib, set RANLIB = echo. 
+-# +-ARCH = ar +-ARCHFLAGS = cr +-RANLIB = ranlib +diff -ruN lapack-3.4.0.old/lapacke/make.inc.example lapack-3.4.0/lapacke/make.inc.example +--- lapack-3.4.0.old/lapacke/make.inc.example 1970-01-01 01:00:00 +0100 ++++ lapack-3.4.0/lapacke/make.inc.example 2011-11-09 23:56:15 +0100 +@@ -0,0 +1,66 @@ ++############################################################################## ++# Copyright (c) 2010, Intel Corp. ++# All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are met: ++# ++# * Redistributions of source code must retain the above copyright notice, ++# this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# * Neither the name of Intel Corporation nor the names of its contributors ++# may be used to endorse or promote products derived from this software ++# without specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF ++# THE POSSIBILITY OF SUCH DAMAGE. 
++############################################################################## ++# Contents: Native C interface to LAPACK ++# Author: Intel Corporation ++# September, 2010 ++############################################################################## ++# This is the make.inc example. The following settings are used: ++# ++# Compiler: gcc ++# Configuration file: turned off (default) ++# Complex types: C99 (default) ++# Name pattern: mixed case (default) ++# (64-bit) Data model: LP64 (default) ++# ++# Basic include options. ++# CC is the C compiler, normally invoked with options CFLAGS. ++# LINKER is the linker, invoked with LDFLAGS. ++# ++# If libraries lapack.a and blas.a are built with ++# - ifort, set: LINKER = ifort ++# LDFLAGS = -nofor-main ++# - gfortran, set: LINKER = gfortran ++# ++CC = gcc ++CFLAGS = ++LINKER = gfortran ++LDFLAGS = ++# ++# The name of the libraries to be created/linked to ++# Ensure that the libraries have the same data model (LP64/ILP64). ++# ++LAPACKE = lapacke.a ++LIBS = /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/lapack_LINUX.a \ ++ /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/blas_LINUX.a -lm ++# The archiver and the flag(s) to use when building archive (library) ++# If your system has no ranlib, set RANLIB = echo. ++# ++ARCH = ar ++ARCHFLAGS = cr ++RANLIB = ranlib + +diff -ruN lapack-3.4.0.old/lapacke/src/Makefile lapack-3.4.0/lapacke/src/Makefile +--- lapack-3.4.0.old/lapacke/src/Makefile 2011-11-09 23:56:15 +0100 ++++ lapack-3.4.0/lapacke/src/Makefile 2012-04-06 13:57:50 +0200 +@@ -40,7 +40,8 @@ + all: lib + + lib: $(OBJ_FILES) +- $(ARCH) $(ARCHFLAGS) ../$(LAPACKE) $(OBJ_FILES) ++ # http://hackage.haskell.org/trac/gtk2hs/ticket/1146 ++ echo $(OBJ_FILES) | xargs --max-args=100 $(ARCH) $(ARCHFLAGS) ../$(LAPACKE) + $(RANLIB) ../$(LAPACKE) + + .c.o: From fd2ee0c9e21a0a7ce339ddd4e4527b675a7e7515 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 13 Apr 2012 23:12:06 +0800 Subject: [PATCH 007/162] Refs #88. 
Fixed the build bug about LAPACKE C Interface to LAPACKE. --- Makefile | 13 +- Makefile.install | 6 +- Makefile.rule | 3 +- Makefile.system | 2 + exports/Makefile | 8 + exports/gensymbol | 8 +- patch.for_lapack-3.4.0 | 2208 +++++++++++++++++++++++++++++++++++++--- 7 files changed, 2093 insertions(+), 155 deletions(-) diff --git a/Makefile b/Makefile index 6f0a255ab..5de7987e9 100644 --- a/Makefile +++ b/Makefile @@ -240,12 +240,20 @@ ifndef NOFORTRAN -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif +LAPACKE_CFLAGS = $(CFLAGS) +LAPACKE_LDFLAGS = $(FFLAGS) $(EXTRALIB) +ifeq ($(F_COMPILER), INTEL) +LAPACKE_LDFLAGS += -nofor-main +endif +ifdef INTERFACE64 +LAPACKE_CFLAGS += -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64 +endif $(NETLIB_LAPACK_DIR)/lapacke/make.inc : ifndef NOFORTRAN -@echo "CC = $(CC)" > $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "CFLAGS = $(LAPACKE_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc -@echo "LINKER = $(FC)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc + -@echo "LDFLAGS = $(LAPACKE_LDFLAGS)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc -@echo "LAPACKE = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc -@echo "LIBS = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc @@ -259,6 +267,7 @@ ifndef NO_LAPACK @if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ echo $(TAR) zxf $< ;\ $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ + rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ else \ rm -rf $(NETLIB_LAPACK_DIR) ;\ echo " Cannot download lapack-3.4.0.tgz or the MD5 check sum is wrong (Please use orignal)."; \ diff --git a/Makefile.install b/Makefile.install index 2ba10d0dc..6ecfd91ed 100644 --- a/Makefile.install +++ b/Makefile.install 
@@ -37,9 +37,9 @@ install : lib.grd ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR) - @cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h - @cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h + @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h + @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h endif #for install static library diff --git a/Makefile.rule b/Makefile.rule index 843888b4c..7a1e845fe 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -48,7 +48,8 @@ VERSION = 0.1.0 # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 -# If you don't need LAPACK, please comment it in. +# If you don't need LAPACK, please comment it in. +# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. 
diff --git a/Makefile.system b/Makefile.system index bc5b20d86..dbc8ffe07 100644 --- a/Makefile.system +++ b/Makefile.system @@ -538,6 +538,8 @@ endif ifeq ($(NO_LAPACK), 1) CCOMMON_OPT += -DNO_LAPACK +#Disable LAPACK C interface +NO_LAPACKE = 1 endif ifeq ($(NO_LAPACKE), 1) diff --git a/exports/Makefile b/exports/Makefile index c4d2abd63..971bd0bed 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -10,6 +10,14 @@ ifndef NO_CBLAS NO_CBLAS = 0 endif +ifndef NO_LAPACK +NO_LAPACK = 0 +endif + +ifndef NO_LAPACKE +NO_LAPACKE = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) EXTRALIB += -lgfortran diff --git a/exports/gensymbol b/exports/gensymbol index 626827e4e..d9d35de48 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -332,8 +332,14 @@ zbbcsd, zlapmr, zunbdb, zuncsd, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, + ); +@lapack_extendedprecision_objs = ( + zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, + dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, + ); + @lapackeobjs = ( lapack_make_complex_double, lapack_make_complex_float, @@ -2459,7 +2465,7 @@ if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[6] == 1) { - #NO_LAPACK=1 + #NO_LAPACKE=1 @no_underscore_objs = (@cblasobjs); } else { @no_underscore_objs = (@cblasobjs, @lapackeobjs); diff --git a/patch.for_lapack-3.4.0 b/patch.for_lapack-3.4.0 index 5d048f9c4..a3dc9b8a9 100644 --- a/patch.for_lapack-3.4.0 +++ b/patch.for_lapack-3.4.0 @@ -888,157 +888,2069 @@ diff -ruN lapack-3.4.0.old/TESTING/LIN/Makefile lapack-3.4.0/TESTING/LIN/Makefil ../xlintsts: xlintsts mv xlintsts $@ -diff -ruN lapack-3.4.0.old/lapacke/make.inc lapack-3.4.0/lapacke/make.inc ---- lapack-3.4.0.old/lapacke/make.inc 2011-11-09 23:56:15 +0100 -+++ lapack-3.4.0/lapacke/make.inc 1970-01-01 01:00:00 +0100 -@@ -1,66 +0,0 @@ 
--############################################################################## --# Copyright (c) 2010, Intel Corp. --# All rights reserved. --# --# Redistribution and use in source and binary forms, with or without --# modification, are permitted provided that the following conditions are met: --# --# * Redistributions of source code must retain the above copyright notice, --# this list of conditions and the following disclaimer. --# * Redistributions in binary form must reproduce the above copyright --# notice, this list of conditions and the following disclaimer in the --# documentation and/or other materials provided with the distribution. --# * Neither the name of Intel Corporation nor the names of its contributors --# may be used to endorse or promote products derived from this software --# without specific prior written permission. --# --# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" --# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE --# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR --# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF --# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS --# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN --# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) --# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF --# THE POSSIBILITY OF SUCH DAMAGE. --############################################################################## --# Contents: Native C interface to LAPACK --# Author: Intel Corporation --# September, 2010 --############################################################################## --# This is the make.inc example. 
The following settings are used: --# --# Compiler: gcc --# Configuration file: turned off (default) --# Complex types: C99 (default) --# Name pattern: mixed case (default) --# (64-bit) Data model: LP64 (default) --# --# Basic include options. --# CC is the C compiler, normally invoked with options CFLAGS. --# LINKER is the linker, invoked with LDFLAGS. --# --# If libraries lapack.a and blas.a are built with --# - ifort, set: LINKER = ifort --# LDFLAGS = -nofor-main --# - gfortran, set: LINKER = gfortran --# --CC = gcc --CFLAGS = --LINKER = gfortran --LDFLAGS = --# --# The name of the libraries to be created/linked to --# Ensure that the libraries have the same data model (LP64/ILP64). --# --LAPACKE = lapacke.a --LIBS = /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/lapack_LINUX.a \ -- /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/blas_LINUX.a -lm --# The archiver and the flag(s) to use when building archive (library) --# If your system has no ranlib, set RANLIB = echo. --# --ARCH = ar --ARCHFLAGS = cr --RANLIB = ranlib -diff -ruN lapack-3.4.0.old/lapacke/make.inc.example lapack-3.4.0/lapacke/make.inc.example ---- lapack-3.4.0.old/lapacke/make.inc.example 1970-01-01 01:00:00 +0100 -+++ lapack-3.4.0/lapacke/make.inc.example 2011-11-09 23:56:15 +0100 -@@ -0,0 +1,66 @@ -+############################################################################## -+# Copyright (c) 2010, Intel Corp. -+# All rights reserved. -+# -+# Redistribution and use in source and binary forms, with or without -+# modification, are permitted provided that the following conditions are met: -+# -+# * Redistributions of source code must retain the above copyright notice, -+# this list of conditions and the following disclaimer. -+# * Redistributions in binary form must reproduce the above copyright -+# notice, this list of conditions and the following disclaimer in the -+# documentation and/or other materials provided with the distribution. 
-+# * Neither the name of Intel Corporation nor the names of its contributors -+# may be used to endorse or promote products derived from this software -+# without specific prior written permission. -+# -+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -+# THE POSSIBILITY OF SUCH DAMAGE. -+############################################################################## -+# Contents: Native C interface to LAPACK -+# Author: Intel Corporation -+# September, 2010 -+############################################################################## -+# This is the make.inc example. The following settings are used: -+# -+# Compiler: gcc -+# Configuration file: turned off (default) -+# Complex types: C99 (default) -+# Name pattern: mixed case (default) -+# (64-bit) Data model: LP64 (default) -+# -+# Basic include options. -+# CC is the C compiler, normally invoked with options CFLAGS. -+# LINKER is the linker, invoked with LDFLAGS. -+# -+# If libraries lapack.a and blas.a are built with -+# - ifort, set: LINKER = ifort -+# LDFLAGS = -nofor-main -+# - gfortran, set: LINKER = gfortran -+# -+CC = gcc -+CFLAGS = -+LINKER = gfortran -+LDFLAGS = -+# -+# The name of the libraries to be created/linked to -+# Ensure that the libraries have the same data model (LP64/ILP64). 
-+# -+LAPACKE = lapacke.a -+LIBS = /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/lapack_LINUX.a \ -+ /Users/julie/Documents/Boulot/lapack-dev/lapack/trunk/blas_LINUX.a -lm -+# The archiver and the flag(s) to use when building archive (library) -+# If your system has no ranlib, set RANLIB = echo. -+# -+ARCH = ar -+ARCHFLAGS = cr -+RANLIB = ranlib - diff -ruN lapack-3.4.0.old/lapacke/src/Makefile lapack-3.4.0/lapacke/src/Makefile ---- lapack-3.4.0.old/lapacke/src/Makefile 2011-11-09 23:56:15 +0100 -+++ lapack-3.4.0/lapacke/src/Makefile 2012-04-06 13:57:50 +0200 -@@ -40,7 +40,8 @@ +--- lapack-3.4.0.old/lapacke/src/Makefile 2011-11-10 06:56:15.000000000 +0800 ++++ lapack-3.4.0/lapacke/src/Makefile 2012-04-13 22:37:09.000000000 +0800 +@@ -35,12 +35,2060 @@ + include ../make.inc + + C_FILES := $(wildcard *.c) +-OBJ_FILES := $(C_FILES:.c=.o) ++ ++LAPACKE_OBJS := \ ++lapacke_cbbcsd.o \ ++lapacke_cbbcsd_work.o \ ++lapacke_cbdsqr.o \ ++lapacke_cbdsqr_work.o \ ++lapacke_cgbbrd.o \ ++lapacke_cgbbrd_work.o \ ++lapacke_cgbcon.o \ ++lapacke_cgbcon_work.o \ ++lapacke_cgbequb.o \ ++lapacke_cgbequb_work.o \ ++lapacke_cgbequ.o \ ++lapacke_cgbequ_work.o \ ++lapacke_cgbrfs.o \ ++lapacke_cgbrfs_work.o \ ++lapacke_cgbsv.o \ ++lapacke_cgbsv_work.o \ ++lapacke_cgbsvx.o \ ++lapacke_cgbsvx_work.o \ ++lapacke_cgbtrf.o \ ++lapacke_cgbtrf_work.o \ ++lapacke_cgbtrs.o \ ++lapacke_cgbtrs_work.o \ ++lapacke_cgebak.o \ ++lapacke_cgebak_work.o \ ++lapacke_cgebal.o \ ++lapacke_cgebal_work.o \ ++lapacke_cgebrd.o \ ++lapacke_cgebrd_work.o \ ++lapacke_cgecon.o \ ++lapacke_cgecon_work.o \ ++lapacke_cgeequb.o \ ++lapacke_cgeequb_work.o \ ++lapacke_cgeequ.o \ ++lapacke_cgeequ_work.o \ ++lapacke_cgees.o \ ++lapacke_cgees_work.o \ ++lapacke_cgeesx.o \ ++lapacke_cgeesx_work.o \ ++lapacke_cgeev.o \ ++lapacke_cgeev_work.o \ ++lapacke_cgeevx.o \ ++lapacke_cgeevx_work.o \ ++lapacke_cgehrd.o \ ++lapacke_cgehrd_work.o \ ++lapacke_cgelq2.o \ ++lapacke_cgelq2_work.o \ ++lapacke_cgelqf.o \ 
++lapacke_cgelqf_work.o \ ++lapacke_cgels.o \ ++lapacke_cgelsd.o \ ++lapacke_cgelsd_work.o \ ++lapacke_cgelss.o \ ++lapacke_cgelss_work.o \ ++lapacke_cgels_work.o \ ++lapacke_cgelsy.o \ ++lapacke_cgelsy_work.o \ ++lapacke_cgemqrt.o \ ++lapacke_cgemqrt_work.o \ ++lapacke_cgeqlf.o \ ++lapacke_cgeqlf_work.o \ ++lapacke_cgeqp3.o \ ++lapacke_cgeqp3_work.o \ ++lapacke_cgeqpf.o \ ++lapacke_cgeqpf_work.o \ ++lapacke_cgeqr2.o \ ++lapacke_cgeqr2_work.o \ ++lapacke_cgeqrf.o \ ++lapacke_cgeqrfp.o \ ++lapacke_cgeqrfp_work.o \ ++lapacke_cgeqrf_work.o \ ++lapacke_cgeqrt2.o \ ++lapacke_cgeqrt2_work.o \ ++lapacke_cgeqrt3.o \ ++lapacke_cgeqrt3_work.o \ ++lapacke_cgeqrt.o \ ++lapacke_cgeqrt_work.o \ ++lapacke_cgerfs.o \ ++lapacke_cgerfs_work.o \ ++lapacke_cgerqf.o \ ++lapacke_cgerqf_work.o \ ++lapacke_cgesdd.o \ ++lapacke_cgesdd_work.o \ ++lapacke_cgesv.o \ ++lapacke_cgesvd.o \ ++lapacke_cgesvd_work.o \ ++lapacke_cgesv_work.o \ ++lapacke_cgesvx.o \ ++lapacke_cgesvx_work.o \ ++lapacke_cgetf2.o \ ++lapacke_cgetf2_work.o \ ++lapacke_cgetrf.o \ ++lapacke_cgetrf_work.o \ ++lapacke_cgetri.o \ ++lapacke_cgetri_work.o \ ++lapacke_cgetrs.o \ ++lapacke_cgetrs_work.o \ ++lapacke_cggbak.o \ ++lapacke_cggbak_work.o \ ++lapacke_cggbal.o \ ++lapacke_cggbal_work.o \ ++lapacke_cgges.o \ ++lapacke_cgges_work.o \ ++lapacke_cggesx.o \ ++lapacke_cggesx_work.o \ ++lapacke_cggev.o \ ++lapacke_cggev_work.o \ ++lapacke_cggevx.o \ ++lapacke_cggevx_work.o \ ++lapacke_cggglm.o \ ++lapacke_cggglm_work.o \ ++lapacke_cgghrd.o \ ++lapacke_cgghrd_work.o \ ++lapacke_cgglse.o \ ++lapacke_cgglse_work.o \ ++lapacke_cggqrf.o \ ++lapacke_cggqrf_work.o \ ++lapacke_cggrqf.o \ ++lapacke_cggrqf_work.o \ ++lapacke_cggsvd.o \ ++lapacke_cggsvd_work.o \ ++lapacke_cggsvp.o \ ++lapacke_cggsvp_work.o \ ++lapacke_cgtcon.o \ ++lapacke_cgtcon_work.o \ ++lapacke_cgtrfs.o \ ++lapacke_cgtrfs_work.o \ ++lapacke_cgtsv.o \ ++lapacke_cgtsv_work.o \ ++lapacke_cgtsvx.o \ ++lapacke_cgtsvx_work.o \ ++lapacke_cgttrf.o \ ++lapacke_cgttrf_work.o \ 
++lapacke_cgttrs.o \ ++lapacke_cgttrs_work.o \ ++lapacke_chbev.o \ ++lapacke_chbevd.o \ ++lapacke_chbevd_work.o \ ++lapacke_chbev_work.o \ ++lapacke_chbevx.o \ ++lapacke_chbevx_work.o \ ++lapacke_chbgst.o \ ++lapacke_chbgst_work.o \ ++lapacke_chbgv.o \ ++lapacke_chbgvd.o \ ++lapacke_chbgvd_work.o \ ++lapacke_chbgv_work.o \ ++lapacke_chbgvx.o \ ++lapacke_chbgvx_work.o \ ++lapacke_chbtrd.o \ ++lapacke_chbtrd_work.o \ ++lapacke_checon.o \ ++lapacke_checon_work.o \ ++lapacke_cheequb.o \ ++lapacke_cheequb_work.o \ ++lapacke_cheev.o \ ++lapacke_cheevd.o \ ++lapacke_cheevd_work.o \ ++lapacke_cheevr.o \ ++lapacke_cheevr_work.o \ ++lapacke_cheev_work.o \ ++lapacke_cheevx.o \ ++lapacke_cheevx_work.o \ ++lapacke_chegst.o \ ++lapacke_chegst_work.o \ ++lapacke_chegv.o \ ++lapacke_chegvd.o \ ++lapacke_chegvd_work.o \ ++lapacke_chegv_work.o \ ++lapacke_chegvx.o \ ++lapacke_chegvx_work.o \ ++lapacke_cherfs.o \ ++lapacke_cherfs_work.o \ ++lapacke_chesv.o \ ++lapacke_chesv_work.o \ ++lapacke_chesvx.o \ ++lapacke_chesvx_work.o \ ++lapacke_cheswapr.o \ ++lapacke_cheswapr_work.o \ ++lapacke_chetrd.o \ ++lapacke_chetrd_work.o \ ++lapacke_chetrf.o \ ++lapacke_chetrf_work.o \ ++lapacke_chetri2.o \ ++lapacke_chetri2_work.o \ ++lapacke_chetri2x.o \ ++lapacke_chetri2x_work.o \ ++lapacke_chetri.o \ ++lapacke_chetri_work.o \ ++lapacke_chetrs2.o \ ++lapacke_chetrs2_work.o \ ++lapacke_chetrs.o \ ++lapacke_chetrs_work.o \ ++lapacke_chfrk.o \ ++lapacke_chfrk_work.o \ ++lapacke_chgeqz.o \ ++lapacke_chgeqz_work.o \ ++lapacke_chpcon.o \ ++lapacke_chpcon_work.o \ ++lapacke_chpev.o \ ++lapacke_chpevd.o \ ++lapacke_chpevd_work.o \ ++lapacke_chpev_work.o \ ++lapacke_chpevx.o \ ++lapacke_chpevx_work.o \ ++lapacke_chpgst.o \ ++lapacke_chpgst_work.o \ ++lapacke_chpgv.o \ ++lapacke_chpgvd.o \ ++lapacke_chpgvd_work.o \ ++lapacke_chpgv_work.o \ ++lapacke_chpgvx.o \ ++lapacke_chpgvx_work.o \ ++lapacke_chprfs.o \ ++lapacke_chprfs_work.o \ ++lapacke_chpsv.o \ ++lapacke_chpsv_work.o \ ++lapacke_chpsvx.o \ 
++lapacke_chpsvx_work.o \ ++lapacke_chptrd.o \ ++lapacke_chptrd_work.o \ ++lapacke_chptrf.o \ ++lapacke_chptrf_work.o \ ++lapacke_chptri.o \ ++lapacke_chptri_work.o \ ++lapacke_chptrs.o \ ++lapacke_chptrs_work.o \ ++lapacke_chsein.o \ ++lapacke_chsein_work.o \ ++lapacke_chseqr.o \ ++lapacke_chseqr_work.o \ ++lapacke_clacgv.o \ ++lapacke_clacgv_work.o \ ++lapacke_clacpy.o \ ++lapacke_clacpy_work.o \ ++lapacke_clag2z.o \ ++lapacke_clag2z_work.o \ ++lapacke_clange.o \ ++lapacke_clange_work.o \ ++lapacke_clanhe.o \ ++lapacke_clanhe_work.o \ ++lapacke_clansy.o \ ++lapacke_clansy_work.o \ ++lapacke_clantr.o \ ++lapacke_clantr_work.o \ ++lapacke_clapmr.o \ ++lapacke_clapmr_work.o \ ++lapacke_clarfb.o \ ++lapacke_clarfb_work.o \ ++lapacke_clarfg.o \ ++lapacke_clarfg_work.o \ ++lapacke_clarft.o \ ++lapacke_clarft_work.o \ ++lapacke_clarfx.o \ ++lapacke_clarfx_work.o \ ++lapacke_clarnv.o \ ++lapacke_clarnv_work.o \ ++lapacke_claset.o \ ++lapacke_claset_work.o \ ++lapacke_claswp.o \ ++lapacke_claswp_work.o \ ++lapacke_clauum.o \ ++lapacke_clauum_work.o \ ++lapacke_cpbcon.o \ ++lapacke_cpbcon_work.o \ ++lapacke_cpbequ.o \ ++lapacke_cpbequ_work.o \ ++lapacke_cpbrfs.o \ ++lapacke_cpbrfs_work.o \ ++lapacke_cpbstf.o \ ++lapacke_cpbstf_work.o \ ++lapacke_cpbsv.o \ ++lapacke_cpbsv_work.o \ ++lapacke_cpbsvx.o \ ++lapacke_cpbsvx_work.o \ ++lapacke_cpbtrf.o \ ++lapacke_cpbtrf_work.o \ ++lapacke_cpbtrs.o \ ++lapacke_cpbtrs_work.o \ ++lapacke_cpftrf.o \ ++lapacke_cpftrf_work.o \ ++lapacke_cpftri.o \ ++lapacke_cpftri_work.o \ ++lapacke_cpftrs.o \ ++lapacke_cpftrs_work.o \ ++lapacke_cpocon.o \ ++lapacke_cpocon_work.o \ ++lapacke_cpoequb.o \ ++lapacke_cpoequb_work.o \ ++lapacke_cpoequ.o \ ++lapacke_cpoequ_work.o \ ++lapacke_cporfs.o \ ++lapacke_cporfs_work.o \ ++lapacke_cposv.o \ ++lapacke_cposv_work.o \ ++lapacke_cposvx.o \ ++lapacke_cposvx_work.o \ ++lapacke_cpotrf.o \ ++lapacke_cpotrf_work.o \ ++lapacke_cpotri.o \ ++lapacke_cpotri_work.o \ ++lapacke_cpotrs.o \ ++lapacke_cpotrs_work.o \ 
++lapacke_cppcon.o \ ++lapacke_cppcon_work.o \ ++lapacke_cppequ.o \ ++lapacke_cppequ_work.o \ ++lapacke_cpprfs.o \ ++lapacke_cpprfs_work.o \ ++lapacke_cppsv.o \ ++lapacke_cppsv_work.o \ ++lapacke_cppsvx.o \ ++lapacke_cppsvx_work.o \ ++lapacke_cpptrf.o \ ++lapacke_cpptrf_work.o \ ++lapacke_cpptri.o \ ++lapacke_cpptri_work.o \ ++lapacke_cpptrs.o \ ++lapacke_cpptrs_work.o \ ++lapacke_cpstrf.o \ ++lapacke_cpstrf_work.o \ ++lapacke_cptcon.o \ ++lapacke_cptcon_work.o \ ++lapacke_cpteqr.o \ ++lapacke_cpteqr_work.o \ ++lapacke_cptrfs.o \ ++lapacke_cptrfs_work.o \ ++lapacke_cptsv.o \ ++lapacke_cptsv_work.o \ ++lapacke_cptsvx.o \ ++lapacke_cptsvx_work.o \ ++lapacke_cpttrf.o \ ++lapacke_cpttrf_work.o \ ++lapacke_cpttrs.o \ ++lapacke_cpttrs_work.o \ ++lapacke_cspcon.o \ ++lapacke_cspcon_work.o \ ++lapacke_csprfs.o \ ++lapacke_csprfs_work.o \ ++lapacke_cspsv.o \ ++lapacke_cspsv_work.o \ ++lapacke_cspsvx.o \ ++lapacke_cspsvx_work.o \ ++lapacke_csptrf.o \ ++lapacke_csptrf_work.o \ ++lapacke_csptri.o \ ++lapacke_csptri_work.o \ ++lapacke_csptrs.o \ ++lapacke_csptrs_work.o \ ++lapacke_cstedc.o \ ++lapacke_cstedc_work.o \ ++lapacke_cstegr.o \ ++lapacke_cstegr_work.o \ ++lapacke_cstein.o \ ++lapacke_cstein_work.o \ ++lapacke_cstemr.o \ ++lapacke_cstemr_work.o \ ++lapacke_csteqr.o \ ++lapacke_csteqr_work.o \ ++lapacke_csycon.o \ ++lapacke_csyconv.o \ ++lapacke_csyconv_work.o \ ++lapacke_csycon_work.o \ ++lapacke_csyequb.o \ ++lapacke_csyequb_work.o \ ++lapacke_csyrfs.o \ ++lapacke_csyrfs_work.o \ ++lapacke_csysv.o \ ++lapacke_csysv_work.o \ ++lapacke_csysvx.o \ ++lapacke_csysvx_work.o \ ++lapacke_csyswapr.o \ ++lapacke_csyswapr_work.o \ ++lapacke_csytrf.o \ ++lapacke_csytrf_work.o \ ++lapacke_csytri2.o \ ++lapacke_csytri2_work.o \ ++lapacke_csytri2x.o \ ++lapacke_csytri2x_work.o \ ++lapacke_csytri.o \ ++lapacke_csytri_work.o \ ++lapacke_csytrs2.o \ ++lapacke_csytrs2_work.o \ ++lapacke_csytrs.o \ ++lapacke_csytrs_work.o \ ++lapacke_ctbcon.o \ ++lapacke_ctbcon_work.o \ 
++lapacke_ctbrfs.o \ ++lapacke_ctbrfs_work.o \ ++lapacke_ctbtrs.o \ ++lapacke_ctbtrs_work.o \ ++lapacke_ctfsm.o \ ++lapacke_ctfsm_work.o \ ++lapacke_ctftri.o \ ++lapacke_ctftri_work.o \ ++lapacke_ctfttp.o \ ++lapacke_ctfttp_work.o \ ++lapacke_ctfttr.o \ ++lapacke_ctfttr_work.o \ ++lapacke_ctgevc.o \ ++lapacke_ctgevc_work.o \ ++lapacke_ctgexc.o \ ++lapacke_ctgexc_work.o \ ++lapacke_ctgsen.o \ ++lapacke_ctgsen_work.o \ ++lapacke_ctgsja.o \ ++lapacke_ctgsja_work.o \ ++lapacke_ctgsna.o \ ++lapacke_ctgsna_work.o \ ++lapacke_ctgsyl.o \ ++lapacke_ctgsyl_work.o \ ++lapacke_ctpcon.o \ ++lapacke_ctpcon_work.o \ ++lapacke_ctpmqrt.o \ ++lapacke_ctpmqrt_work.o \ ++lapacke_ctpqrt2.o \ ++lapacke_ctpqrt2_work.o \ ++lapacke_ctpqrt.o \ ++lapacke_ctpqrt_work.o \ ++lapacke_ctprfb.o \ ++lapacke_ctprfb_work.o \ ++lapacke_ctprfs.o \ ++lapacke_ctprfs_work.o \ ++lapacke_ctptri.o \ ++lapacke_ctptri_work.o \ ++lapacke_ctptrs.o \ ++lapacke_ctptrs_work.o \ ++lapacke_ctpttf.o \ ++lapacke_ctpttf_work.o \ ++lapacke_ctpttr.o \ ++lapacke_ctpttr_work.o \ ++lapacke_ctrcon.o \ ++lapacke_ctrcon_work.o \ ++lapacke_ctrevc.o \ ++lapacke_ctrevc_work.o \ ++lapacke_ctrexc.o \ ++lapacke_ctrexc_work.o \ ++lapacke_ctrrfs.o \ ++lapacke_ctrrfs_work.o \ ++lapacke_ctrsen.o \ ++lapacke_ctrsen_work.o \ ++lapacke_ctrsna.o \ ++lapacke_ctrsna_work.o \ ++lapacke_ctrsyl.o \ ++lapacke_ctrsyl_work.o \ ++lapacke_ctrtri.o \ ++lapacke_ctrtri_work.o \ ++lapacke_ctrtrs.o \ ++lapacke_ctrtrs_work.o \ ++lapacke_ctrttf.o \ ++lapacke_ctrttf_work.o \ ++lapacke_ctrttp.o \ ++lapacke_ctrttp_work.o \ ++lapacke_ctzrzf.o \ ++lapacke_ctzrzf_work.o \ ++lapacke_cunbdb.o \ ++lapacke_cunbdb_work.o \ ++lapacke_cuncsd.o \ ++lapacke_cuncsd_work.o \ ++lapacke_cungbr.o \ ++lapacke_cungbr_work.o \ ++lapacke_cunghr.o \ ++lapacke_cunghr_work.o \ ++lapacke_cunglq.o \ ++lapacke_cunglq_work.o \ ++lapacke_cungql.o \ ++lapacke_cungql_work.o \ ++lapacke_cungqr.o \ ++lapacke_cungqr_work.o \ ++lapacke_cungrq.o \ ++lapacke_cungrq_work.o \ ++lapacke_cungtr.o \ 
++lapacke_cungtr_work.o \ ++lapacke_cunmbr.o \ ++lapacke_cunmbr_work.o \ ++lapacke_cunmhr.o \ ++lapacke_cunmhr_work.o \ ++lapacke_cunmlq.o \ ++lapacke_cunmlq_work.o \ ++lapacke_cunmql.o \ ++lapacke_cunmql_work.o \ ++lapacke_cunmqr.o \ ++lapacke_cunmqr_work.o \ ++lapacke_cunmrq.o \ ++lapacke_cunmrq_work.o \ ++lapacke_cunmrz.o \ ++lapacke_cunmrz_work.o \ ++lapacke_cunmtr.o \ ++lapacke_cunmtr_work.o \ ++lapacke_cupgtr.o \ ++lapacke_cupgtr_work.o \ ++lapacke_cupmtr.o \ ++lapacke_cupmtr_work.o \ ++lapacke_dbbcsd.o \ ++lapacke_dbbcsd_work.o \ ++lapacke_dbdsdc.o \ ++lapacke_dbdsdc_work.o \ ++lapacke_dbdsqr.o \ ++lapacke_dbdsqr_work.o \ ++lapacke_ddisna.o \ ++lapacke_ddisna_work.o \ ++lapacke_dgbbrd.o \ ++lapacke_dgbbrd_work.o \ ++lapacke_dgbcon.o \ ++lapacke_dgbcon_work.o \ ++lapacke_dgbequb.o \ ++lapacke_dgbequb_work.o \ ++lapacke_dgbequ.o \ ++lapacke_dgbequ_work.o \ ++lapacke_dgbrfs.o \ ++lapacke_dgbrfs_work.o \ ++lapacke_dgbsv.o \ ++lapacke_dgbsv_work.o \ ++lapacke_dgbsvx.o \ ++lapacke_dgbsvx_work.o \ ++lapacke_dgbtrf.o \ ++lapacke_dgbtrf_work.o \ ++lapacke_dgbtrs.o \ ++lapacke_dgbtrs_work.o \ ++lapacke_dgebak.o \ ++lapacke_dgebak_work.o \ ++lapacke_dgebal.o \ ++lapacke_dgebal_work.o \ ++lapacke_dgebrd.o \ ++lapacke_dgebrd_work.o \ ++lapacke_dgecon.o \ ++lapacke_dgecon_work.o \ ++lapacke_dgeequb.o \ ++lapacke_dgeequb_work.o \ ++lapacke_dgeequ.o \ ++lapacke_dgeequ_work.o \ ++lapacke_dgees.o \ ++lapacke_dgees_work.o \ ++lapacke_dgeesx.o \ ++lapacke_dgeesx_work.o \ ++lapacke_dgeev.o \ ++lapacke_dgeev_work.o \ ++lapacke_dgeevx.o \ ++lapacke_dgeevx_work.o \ ++lapacke_dgehrd.o \ ++lapacke_dgehrd_work.o \ ++lapacke_dgejsv.o \ ++lapacke_dgejsv_work.o \ ++lapacke_dgelq2.o \ ++lapacke_dgelq2_work.o \ ++lapacke_dgelqf.o \ ++lapacke_dgelqf_work.o \ ++lapacke_dgels.o \ ++lapacke_dgelsd.o \ ++lapacke_dgelsd_work.o \ ++lapacke_dgelss.o \ ++lapacke_dgelss_work.o \ ++lapacke_dgels_work.o \ ++lapacke_dgelsy.o \ ++lapacke_dgelsy_work.o \ ++lapacke_dgemqrt.o \ ++lapacke_dgemqrt_work.o \ 
++lapacke_dgeqlf.o \ ++lapacke_dgeqlf_work.o \ ++lapacke_dgeqp3.o \ ++lapacke_dgeqp3_work.o \ ++lapacke_dgeqpf.o \ ++lapacke_dgeqpf_work.o \ ++lapacke_dgeqr2.o \ ++lapacke_dgeqr2_work.o \ ++lapacke_dgeqrf.o \ ++lapacke_dgeqrfp.o \ ++lapacke_dgeqrfp_work.o \ ++lapacke_dgeqrf_work.o \ ++lapacke_dgeqrt2.o \ ++lapacke_dgeqrt2_work.o \ ++lapacke_dgeqrt3.o \ ++lapacke_dgeqrt3_work.o \ ++lapacke_dgeqrt.o \ ++lapacke_dgeqrt_work.o \ ++lapacke_dgerfs.o \ ++lapacke_dgerfs_work.o \ ++lapacke_dgerqf.o \ ++lapacke_dgerqf_work.o \ ++lapacke_dgesdd.o \ ++lapacke_dgesdd_work.o \ ++lapacke_dgesv.o \ ++lapacke_dgesvd.o \ ++lapacke_dgesvd_work.o \ ++lapacke_dgesvj.o \ ++lapacke_dgesvj_work.o \ ++lapacke_dgesv_work.o \ ++lapacke_dgesvx.o \ ++lapacke_dgesvx_work.o \ ++lapacke_dgetf2.o \ ++lapacke_dgetf2_work.o \ ++lapacke_dgetrf.o \ ++lapacke_dgetrf_work.o \ ++lapacke_dgetri.o \ ++lapacke_dgetri_work.o \ ++lapacke_dgetrs.o \ ++lapacke_dgetrs_work.o \ ++lapacke_dggbak.o \ ++lapacke_dggbak_work.o \ ++lapacke_dggbal.o \ ++lapacke_dggbal_work.o \ ++lapacke_dgges.o \ ++lapacke_dgges_work.o \ ++lapacke_dggesx.o \ ++lapacke_dggesx_work.o \ ++lapacke_dggev.o \ ++lapacke_dggev_work.o \ ++lapacke_dggevx.o \ ++lapacke_dggevx_work.o \ ++lapacke_dggglm.o \ ++lapacke_dggglm_work.o \ ++lapacke_dgghrd.o \ ++lapacke_dgghrd_work.o \ ++lapacke_dgglse.o \ ++lapacke_dgglse_work.o \ ++lapacke_dggqrf.o \ ++lapacke_dggqrf_work.o \ ++lapacke_dggrqf.o \ ++lapacke_dggrqf_work.o \ ++lapacke_dggsvd.o \ ++lapacke_dggsvd_work.o \ ++lapacke_dggsvp.o \ ++lapacke_dggsvp_work.o \ ++lapacke_dgtcon.o \ ++lapacke_dgtcon_work.o \ ++lapacke_dgtrfs.o \ ++lapacke_dgtrfs_work.o \ ++lapacke_dgtsv.o \ ++lapacke_dgtsv_work.o \ ++lapacke_dgtsvx.o \ ++lapacke_dgtsvx_work.o \ ++lapacke_dgttrf.o \ ++lapacke_dgttrf_work.o \ ++lapacke_dgttrs.o \ ++lapacke_dgttrs_work.o \ ++lapacke_dhgeqz.o \ ++lapacke_dhgeqz_work.o \ ++lapacke_dhsein.o \ ++lapacke_dhsein_work.o \ ++lapacke_dhseqr.o \ ++lapacke_dhseqr_work.o \ ++lapacke_dlacpy.o \ 
++lapacke_dlacpy_work.o \ ++lapacke_dlag2s.o \ ++lapacke_dlag2s_work.o \ ++lapacke_dlamch.o \ ++lapacke_dlamch_work.o \ ++lapacke_dlange.o \ ++lapacke_dlange_work.o \ ++lapacke_dlansy.o \ ++lapacke_dlansy_work.o \ ++lapacke_dlantr.o \ ++lapacke_dlantr_work.o \ ++lapacke_dlapmr.o \ ++lapacke_dlapmr_work.o \ ++lapacke_dlapy2.o \ ++lapacke_dlapy2_work.o \ ++lapacke_dlapy3.o \ ++lapacke_dlapy3_work.o \ ++lapacke_dlarfb.o \ ++lapacke_dlarfb_work.o \ ++lapacke_dlarfg.o \ ++lapacke_dlarfg_work.o \ ++lapacke_dlarft.o \ ++lapacke_dlarft_work.o \ ++lapacke_dlarfx.o \ ++lapacke_dlarfx_work.o \ ++lapacke_dlarnv.o \ ++lapacke_dlarnv_work.o \ ++lapacke_dlartgp.o \ ++lapacke_dlartgp_work.o \ ++lapacke_dlartgs.o \ ++lapacke_dlartgs_work.o \ ++lapacke_dlaset.o \ ++lapacke_dlaset_work.o \ ++lapacke_dlasrt.o \ ++lapacke_dlasrt_work.o \ ++lapacke_dlaswp.o \ ++lapacke_dlaswp_work.o \ ++lapacke_dlauum.o \ ++lapacke_dlauum_work.o \ ++lapacke_dopgtr.o \ ++lapacke_dopgtr_work.o \ ++lapacke_dopmtr.o \ ++lapacke_dopmtr_work.o \ ++lapacke_dorbdb.o \ ++lapacke_dorbdb_work.o \ ++lapacke_dorcsd.o \ ++lapacke_dorcsd_work.o \ ++lapacke_dorgbr.o \ ++lapacke_dorgbr_work.o \ ++lapacke_dorghr.o \ ++lapacke_dorghr_work.o \ ++lapacke_dorglq.o \ ++lapacke_dorglq_work.o \ ++lapacke_dorgql.o \ ++lapacke_dorgql_work.o \ ++lapacke_dorgqr.o \ ++lapacke_dorgqr_work.o \ ++lapacke_dorgrq.o \ ++lapacke_dorgrq_work.o \ ++lapacke_dorgtr.o \ ++lapacke_dorgtr_work.o \ ++lapacke_dormbr.o \ ++lapacke_dormbr_work.o \ ++lapacke_dormhr.o \ ++lapacke_dormhr_work.o \ ++lapacke_dormlq.o \ ++lapacke_dormlq_work.o \ ++lapacke_dormql.o \ ++lapacke_dormql_work.o \ ++lapacke_dormqr.o \ ++lapacke_dormqr_work.o \ ++lapacke_dormrq.o \ ++lapacke_dormrq_work.o \ ++lapacke_dormrz.o \ ++lapacke_dormrz_work.o \ ++lapacke_dormtr.o \ ++lapacke_dormtr_work.o \ ++lapacke_dpbcon.o \ ++lapacke_dpbcon_work.o \ ++lapacke_dpbequ.o \ ++lapacke_dpbequ_work.o \ ++lapacke_dpbrfs.o \ ++lapacke_dpbrfs_work.o \ ++lapacke_dpbstf.o \ 
++lapacke_dpbstf_work.o \ ++lapacke_dpbsv.o \ ++lapacke_dpbsv_work.o \ ++lapacke_dpbsvx.o \ ++lapacke_dpbsvx_work.o \ ++lapacke_dpbtrf.o \ ++lapacke_dpbtrf_work.o \ ++lapacke_dpbtrs.o \ ++lapacke_dpbtrs_work.o \ ++lapacke_dpftrf.o \ ++lapacke_dpftrf_work.o \ ++lapacke_dpftri.o \ ++lapacke_dpftri_work.o \ ++lapacke_dpftrs.o \ ++lapacke_dpftrs_work.o \ ++lapacke_dpocon.o \ ++lapacke_dpocon_work.o \ ++lapacke_dpoequb.o \ ++lapacke_dpoequb_work.o \ ++lapacke_dpoequ.o \ ++lapacke_dpoequ_work.o \ ++lapacke_dporfs.o \ ++lapacke_dporfs_work.o \ ++lapacke_dposv.o \ ++lapacke_dposv_work.o \ ++lapacke_dposvx.o \ ++lapacke_dposvx_work.o \ ++lapacke_dpotrf.o \ ++lapacke_dpotrf_work.o \ ++lapacke_dpotri.o \ ++lapacke_dpotri_work.o \ ++lapacke_dpotrs.o \ ++lapacke_dpotrs_work.o \ ++lapacke_dppcon.o \ ++lapacke_dppcon_work.o \ ++lapacke_dppequ.o \ ++lapacke_dppequ_work.o \ ++lapacke_dpprfs.o \ ++lapacke_dpprfs_work.o \ ++lapacke_dppsv.o \ ++lapacke_dppsv_work.o \ ++lapacke_dppsvx.o \ ++lapacke_dppsvx_work.o \ ++lapacke_dpptrf.o \ ++lapacke_dpptrf_work.o \ ++lapacke_dpptri.o \ ++lapacke_dpptri_work.o \ ++lapacke_dpptrs.o \ ++lapacke_dpptrs_work.o \ ++lapacke_dpstrf.o \ ++lapacke_dpstrf_work.o \ ++lapacke_dptcon.o \ ++lapacke_dptcon_work.o \ ++lapacke_dpteqr.o \ ++lapacke_dpteqr_work.o \ ++lapacke_dptrfs.o \ ++lapacke_dptrfs_work.o \ ++lapacke_dptsv.o \ ++lapacke_dptsv_work.o \ ++lapacke_dptsvx.o \ ++lapacke_dptsvx_work.o \ ++lapacke_dpttrf.o \ ++lapacke_dpttrf_work.o \ ++lapacke_dpttrs.o \ ++lapacke_dpttrs_work.o \ ++lapacke_dsbev.o \ ++lapacke_dsbevd.o \ ++lapacke_dsbevd_work.o \ ++lapacke_dsbev_work.o \ ++lapacke_dsbevx.o \ ++lapacke_dsbevx_work.o \ ++lapacke_dsbgst.o \ ++lapacke_dsbgst_work.o \ ++lapacke_dsbgv.o \ ++lapacke_dsbgvd.o \ ++lapacke_dsbgvd_work.o \ ++lapacke_dsbgv_work.o \ ++lapacke_dsbgvx.o \ ++lapacke_dsbgvx_work.o \ ++lapacke_dsbtrd.o \ ++lapacke_dsbtrd_work.o \ ++lapacke_dsfrk.o \ ++lapacke_dsfrk_work.o \ ++lapacke_dsgesv.o \ ++lapacke_dsgesv_work.o \ 
++lapacke_dspcon.o \ ++lapacke_dspcon_work.o \ ++lapacke_dspev.o \ ++lapacke_dspevd.o \ ++lapacke_dspevd_work.o \ ++lapacke_dspev_work.o \ ++lapacke_dspevx.o \ ++lapacke_dspevx_work.o \ ++lapacke_dspgst.o \ ++lapacke_dspgst_work.o \ ++lapacke_dspgv.o \ ++lapacke_dspgvd.o \ ++lapacke_dspgvd_work.o \ ++lapacke_dspgv_work.o \ ++lapacke_dspgvx.o \ ++lapacke_dspgvx_work.o \ ++lapacke_dsposv.o \ ++lapacke_dsposv_work.o \ ++lapacke_dsprfs.o \ ++lapacke_dsprfs_work.o \ ++lapacke_dspsv.o \ ++lapacke_dspsv_work.o \ ++lapacke_dspsvx.o \ ++lapacke_dspsvx_work.o \ ++lapacke_dsptrd.o \ ++lapacke_dsptrd_work.o \ ++lapacke_dsptrf.o \ ++lapacke_dsptrf_work.o \ ++lapacke_dsptri.o \ ++lapacke_dsptri_work.o \ ++lapacke_dsptrs.o \ ++lapacke_dsptrs_work.o \ ++lapacke_dstebz.o \ ++lapacke_dstebz_work.o \ ++lapacke_dstedc.o \ ++lapacke_dstedc_work.o \ ++lapacke_dstegr.o \ ++lapacke_dstegr_work.o \ ++lapacke_dstein.o \ ++lapacke_dstein_work.o \ ++lapacke_dstemr.o \ ++lapacke_dstemr_work.o \ ++lapacke_dsteqr.o \ ++lapacke_dsteqr_work.o \ ++lapacke_dsterf.o \ ++lapacke_dsterf_work.o \ ++lapacke_dstev.o \ ++lapacke_dstevd.o \ ++lapacke_dstevd_work.o \ ++lapacke_dstevr.o \ ++lapacke_dstevr_work.o \ ++lapacke_dstev_work.o \ ++lapacke_dstevx.o \ ++lapacke_dstevx_work.o \ ++lapacke_dsycon.o \ ++lapacke_dsyconv.o \ ++lapacke_dsyconv_work.o \ ++lapacke_dsycon_work.o \ ++lapacke_dsyequb.o \ ++lapacke_dsyequb_work.o \ ++lapacke_dsyev.o \ ++lapacke_dsyevd.o \ ++lapacke_dsyevd_work.o \ ++lapacke_dsyevr.o \ ++lapacke_dsyevr_work.o \ ++lapacke_dsyev_work.o \ ++lapacke_dsyevx.o \ ++lapacke_dsyevx_work.o \ ++lapacke_dsygst.o \ ++lapacke_dsygst_work.o \ ++lapacke_dsygv.o \ ++lapacke_dsygvd.o \ ++lapacke_dsygvd_work.o \ ++lapacke_dsygv_work.o \ ++lapacke_dsygvx.o \ ++lapacke_dsygvx_work.o \ ++lapacke_dsyrfs.o \ ++lapacke_dsyrfs_work.o \ ++lapacke_dsysv.o \ ++lapacke_dsysv_work.o \ ++lapacke_dsysvx.o \ ++lapacke_dsysvx_work.o \ ++lapacke_dsyswapr.o \ ++lapacke_dsyswapr_work.o \ ++lapacke_dsytrd.o \ 
++lapacke_dsytrd_work.o \ ++lapacke_dsytrf.o \ ++lapacke_dsytrf_work.o \ ++lapacke_dsytri2.o \ ++lapacke_dsytri2_work.o \ ++lapacke_dsytri2x.o \ ++lapacke_dsytri2x_work.o \ ++lapacke_dsytri.o \ ++lapacke_dsytri_work.o \ ++lapacke_dsytrs2.o \ ++lapacke_dsytrs2_work.o \ ++lapacke_dsytrs.o \ ++lapacke_dsytrs_work.o \ ++lapacke_dtbcon.o \ ++lapacke_dtbcon_work.o \ ++lapacke_dtbrfs.o \ ++lapacke_dtbrfs_work.o \ ++lapacke_dtbtrs.o \ ++lapacke_dtbtrs_work.o \ ++lapacke_dtfsm.o \ ++lapacke_dtfsm_work.o \ ++lapacke_dtftri.o \ ++lapacke_dtftri_work.o \ ++lapacke_dtfttp.o \ ++lapacke_dtfttp_work.o \ ++lapacke_dtfttr.o \ ++lapacke_dtfttr_work.o \ ++lapacke_dtgevc.o \ ++lapacke_dtgevc_work.o \ ++lapacke_dtgexc.o \ ++lapacke_dtgexc_work.o \ ++lapacke_dtgsen.o \ ++lapacke_dtgsen_work.o \ ++lapacke_dtgsja.o \ ++lapacke_dtgsja_work.o \ ++lapacke_dtgsna.o \ ++lapacke_dtgsna_work.o \ ++lapacke_dtgsyl.o \ ++lapacke_dtgsyl_work.o \ ++lapacke_dtpcon.o \ ++lapacke_dtpcon_work.o \ ++lapacke_dtpmqrt.o \ ++lapacke_dtpmqrt_work.o \ ++lapacke_dtpqrt2.o \ ++lapacke_dtpqrt2_work.o \ ++lapacke_dtpqrt.o \ ++lapacke_dtpqrt_work.o \ ++lapacke_dtprfb.o \ ++lapacke_dtprfb_work.o \ ++lapacke_dtprfs.o \ ++lapacke_dtprfs_work.o \ ++lapacke_dtptri.o \ ++lapacke_dtptri_work.o \ ++lapacke_dtptrs.o \ ++lapacke_dtptrs_work.o \ ++lapacke_dtpttf.o \ ++lapacke_dtpttf_work.o \ ++lapacke_dtpttr.o \ ++lapacke_dtpttr_work.o \ ++lapacke_dtrcon.o \ ++lapacke_dtrcon_work.o \ ++lapacke_dtrevc.o \ ++lapacke_dtrevc_work.o \ ++lapacke_dtrexc.o \ ++lapacke_dtrexc_work.o \ ++lapacke_dtrrfs.o \ ++lapacke_dtrrfs_work.o \ ++lapacke_dtrsen.o \ ++lapacke_dtrsen_work.o \ ++lapacke_dtrsna.o \ ++lapacke_dtrsna_work.o \ ++lapacke_dtrsyl.o \ ++lapacke_dtrsyl_work.o \ ++lapacke_dtrtri.o \ ++lapacke_dtrtri_work.o \ ++lapacke_dtrtrs.o \ ++lapacke_dtrtrs_work.o \ ++lapacke_dtrttf.o \ ++lapacke_dtrttf_work.o \ ++lapacke_dtrttp.o \ ++lapacke_dtrttp_work.o \ ++lapacke_dtzrzf.o \ ++lapacke_dtzrzf_work.o \ ++lapacke_sbbcsd.o \ 
++lapacke_sbbcsd_work.o \ ++lapacke_sbdsdc.o \ ++lapacke_sbdsdc_work.o \ ++lapacke_sbdsqr.o \ ++lapacke_sbdsqr_work.o \ ++lapacke_sdisna.o \ ++lapacke_sdisna_work.o \ ++lapacke_sgbbrd.o \ ++lapacke_sgbbrd_work.o \ ++lapacke_sgbcon.o \ ++lapacke_sgbcon_work.o \ ++lapacke_sgbequb.o \ ++lapacke_sgbequb_work.o \ ++lapacke_sgbequ.o \ ++lapacke_sgbequ_work.o \ ++lapacke_sgbrfs.o \ ++lapacke_sgbrfs_work.o \ ++lapacke_sgbsv.o \ ++lapacke_sgbsv_work.o \ ++lapacke_sgbsvx.o \ ++lapacke_sgbsvx_work.o \ ++lapacke_sgbtrf.o \ ++lapacke_sgbtrf_work.o \ ++lapacke_sgbtrs.o \ ++lapacke_sgbtrs_work.o \ ++lapacke_sgebak.o \ ++lapacke_sgebak_work.o \ ++lapacke_sgebal.o \ ++lapacke_sgebal_work.o \ ++lapacke_sgebrd.o \ ++lapacke_sgebrd_work.o \ ++lapacke_sgecon.o \ ++lapacke_sgecon_work.o \ ++lapacke_sgeequb.o \ ++lapacke_sgeequb_work.o \ ++lapacke_sgeequ.o \ ++lapacke_sgeequ_work.o \ ++lapacke_sgees.o \ ++lapacke_sgees_work.o \ ++lapacke_sgeesx.o \ ++lapacke_sgeesx_work.o \ ++lapacke_sgeev.o \ ++lapacke_sgeev_work.o \ ++lapacke_sgeevx.o \ ++lapacke_sgeevx_work.o \ ++lapacke_sgehrd.o \ ++lapacke_sgehrd_work.o \ ++lapacke_sgejsv.o \ ++lapacke_sgejsv_work.o \ ++lapacke_sgelq2.o \ ++lapacke_sgelq2_work.o \ ++lapacke_sgelqf.o \ ++lapacke_sgelqf_work.o \ ++lapacke_sgels.o \ ++lapacke_sgelsd.o \ ++lapacke_sgelsd_work.o \ ++lapacke_sgelss.o \ ++lapacke_sgelss_work.o \ ++lapacke_sgels_work.o \ ++lapacke_sgelsy.o \ ++lapacke_sgelsy_work.o \ ++lapacke_sgemqrt.o \ ++lapacke_sgemqrt_work.o \ ++lapacke_sgeqlf.o \ ++lapacke_sgeqlf_work.o \ ++lapacke_sgeqp3.o \ ++lapacke_sgeqp3_work.o \ ++lapacke_sgeqpf.o \ ++lapacke_sgeqpf_work.o \ ++lapacke_sgeqr2.o \ ++lapacke_sgeqr2_work.o \ ++lapacke_sgeqrf.o \ ++lapacke_sgeqrfp.o \ ++lapacke_sgeqrfp_work.o \ ++lapacke_sgeqrf_work.o \ ++lapacke_sgeqrt2.o \ ++lapacke_sgeqrt2_work.o \ ++lapacke_sgeqrt3.o \ ++lapacke_sgeqrt3_work.o \ ++lapacke_sgeqrt.o \ ++lapacke_sgeqrt_work.o \ ++lapacke_sgerfs.o \ ++lapacke_sgerfs_work.o \ ++lapacke_sgerqf.o \ 
++lapacke_sgerqf_work.o \ ++lapacke_sgesdd.o \ ++lapacke_sgesdd_work.o \ ++lapacke_sgesv.o \ ++lapacke_sgesvd.o \ ++lapacke_sgesvd_work.o \ ++lapacke_sgesvj.o \ ++lapacke_sgesvj_work.o \ ++lapacke_sgesv_work.o \ ++lapacke_sgesvx.o \ ++lapacke_sgesvx_work.o \ ++lapacke_sgetf2.o \ ++lapacke_sgetf2_work.o \ ++lapacke_sgetrf.o \ ++lapacke_sgetrf_work.o \ ++lapacke_sgetri.o \ ++lapacke_sgetri_work.o \ ++lapacke_sgetrs.o \ ++lapacke_sgetrs_work.o \ ++lapacke_sggbak.o \ ++lapacke_sggbak_work.o \ ++lapacke_sggbal.o \ ++lapacke_sggbal_work.o \ ++lapacke_sgges.o \ ++lapacke_sgges_work.o \ ++lapacke_sggesx.o \ ++lapacke_sggesx_work.o \ ++lapacke_sggev.o \ ++lapacke_sggev_work.o \ ++lapacke_sggevx.o \ ++lapacke_sggevx_work.o \ ++lapacke_sggglm.o \ ++lapacke_sggglm_work.o \ ++lapacke_sgghrd.o \ ++lapacke_sgghrd_work.o \ ++lapacke_sgglse.o \ ++lapacke_sgglse_work.o \ ++lapacke_sggqrf.o \ ++lapacke_sggqrf_work.o \ ++lapacke_sggrqf.o \ ++lapacke_sggrqf_work.o \ ++lapacke_sggsvd.o \ ++lapacke_sggsvd_work.o \ ++lapacke_sggsvp.o \ ++lapacke_sggsvp_work.o \ ++lapacke_sgtcon.o \ ++lapacke_sgtcon_work.o \ ++lapacke_sgtrfs.o \ ++lapacke_sgtrfs_work.o \ ++lapacke_sgtsv.o \ ++lapacke_sgtsv_work.o \ ++lapacke_sgtsvx.o \ ++lapacke_sgtsvx_work.o \ ++lapacke_sgttrf.o \ ++lapacke_sgttrf_work.o \ ++lapacke_sgttrs.o \ ++lapacke_sgttrs_work.o \ ++lapacke_shgeqz.o \ ++lapacke_shgeqz_work.o \ ++lapacke_shsein.o \ ++lapacke_shsein_work.o \ ++lapacke_shseqr.o \ ++lapacke_shseqr_work.o \ ++lapacke_slacpy.o \ ++lapacke_slacpy_work.o \ ++lapacke_slag2d.o \ ++lapacke_slag2d_work.o \ ++lapacke_slamch.o \ ++lapacke_slamch_work.o \ ++lapacke_slange.o \ ++lapacke_slange_work.o \ ++lapacke_slansy.o \ ++lapacke_slansy_work.o \ ++lapacke_slantr.o \ ++lapacke_slantr_work.o \ ++lapacke_slapmr.o \ ++lapacke_slapmr_work.o \ ++lapacke_slapy2.o \ ++lapacke_slapy2_work.o \ ++lapacke_slapy3.o \ ++lapacke_slapy3_work.o \ ++lapacke_slarfb.o \ ++lapacke_slarfb_work.o \ ++lapacke_slarfg.o \ ++lapacke_slarfg_work.o \ 
++lapacke_slarft.o \ ++lapacke_slarft_work.o \ ++lapacke_slarfx.o \ ++lapacke_slarfx_work.o \ ++lapacke_slarnv.o \ ++lapacke_slarnv_work.o \ ++lapacke_slartgp.o \ ++lapacke_slartgp_work.o \ ++lapacke_slartgs.o \ ++lapacke_slartgs_work.o \ ++lapacke_slaset.o \ ++lapacke_slaset_work.o \ ++lapacke_slasrt.o \ ++lapacke_slasrt_work.o \ ++lapacke_slaswp.o \ ++lapacke_slaswp_work.o \ ++lapacke_slauum.o \ ++lapacke_slauum_work.o \ ++lapacke_sopgtr.o \ ++lapacke_sopgtr_work.o \ ++lapacke_sopmtr.o \ ++lapacke_sopmtr_work.o \ ++lapacke_sorbdb.o \ ++lapacke_sorbdb_work.o \ ++lapacke_sorcsd.o \ ++lapacke_sorcsd_work.o \ ++lapacke_sorgbr.o \ ++lapacke_sorgbr_work.o \ ++lapacke_sorghr.o \ ++lapacke_sorghr_work.o \ ++lapacke_sorglq.o \ ++lapacke_sorglq_work.o \ ++lapacke_sorgql.o \ ++lapacke_sorgql_work.o \ ++lapacke_sorgqr.o \ ++lapacke_sorgqr_work.o \ ++lapacke_sorgrq.o \ ++lapacke_sorgrq_work.o \ ++lapacke_sorgtr.o \ ++lapacke_sorgtr_work.o \ ++lapacke_sormbr.o \ ++lapacke_sormbr_work.o \ ++lapacke_sormhr.o \ ++lapacke_sormhr_work.o \ ++lapacke_sormlq.o \ ++lapacke_sormlq_work.o \ ++lapacke_sormql.o \ ++lapacke_sormql_work.o \ ++lapacke_sormqr.o \ ++lapacke_sormqr_work.o \ ++lapacke_sormrq.o \ ++lapacke_sormrq_work.o \ ++lapacke_sormrz.o \ ++lapacke_sormrz_work.o \ ++lapacke_sormtr.o \ ++lapacke_sormtr_work.o \ ++lapacke_spbcon.o \ ++lapacke_spbcon_work.o \ ++lapacke_spbequ.o \ ++lapacke_spbequ_work.o \ ++lapacke_spbrfs.o \ ++lapacke_spbrfs_work.o \ ++lapacke_spbstf.o \ ++lapacke_spbstf_work.o \ ++lapacke_spbsv.o \ ++lapacke_spbsv_work.o \ ++lapacke_spbsvx.o \ ++lapacke_spbsvx_work.o \ ++lapacke_spbtrf.o \ ++lapacke_spbtrf_work.o \ ++lapacke_spbtrs.o \ ++lapacke_spbtrs_work.o \ ++lapacke_spftrf.o \ ++lapacke_spftrf_work.o \ ++lapacke_spftri.o \ ++lapacke_spftri_work.o \ ++lapacke_spftrs.o \ ++lapacke_spftrs_work.o \ ++lapacke_spocon.o \ ++lapacke_spocon_work.o \ ++lapacke_spoequb.o \ ++lapacke_spoequb_work.o \ ++lapacke_spoequ.o \ ++lapacke_spoequ_work.o \ ++lapacke_sporfs.o \ 
++lapacke_sporfs_work.o \ ++lapacke_sposv.o \ ++lapacke_sposv_work.o \ ++lapacke_sposvx.o \ ++lapacke_sposvx_work.o \ ++lapacke_spotrf.o \ ++lapacke_spotrf_work.o \ ++lapacke_spotri.o \ ++lapacke_spotri_work.o \ ++lapacke_spotrs.o \ ++lapacke_spotrs_work.o \ ++lapacke_sppcon.o \ ++lapacke_sppcon_work.o \ ++lapacke_sppequ.o \ ++lapacke_sppequ_work.o \ ++lapacke_spprfs.o \ ++lapacke_spprfs_work.o \ ++lapacke_sppsv.o \ ++lapacke_sppsv_work.o \ ++lapacke_sppsvx.o \ ++lapacke_sppsvx_work.o \ ++lapacke_spptrf.o \ ++lapacke_spptrf_work.o \ ++lapacke_spptri.o \ ++lapacke_spptri_work.o \ ++lapacke_spptrs.o \ ++lapacke_spptrs_work.o \ ++lapacke_spstrf.o \ ++lapacke_spstrf_work.o \ ++lapacke_sptcon.o \ ++lapacke_sptcon_work.o \ ++lapacke_spteqr.o \ ++lapacke_spteqr_work.o \ ++lapacke_sptrfs.o \ ++lapacke_sptrfs_work.o \ ++lapacke_sptsv.o \ ++lapacke_sptsv_work.o \ ++lapacke_sptsvx.o \ ++lapacke_sptsvx_work.o \ ++lapacke_spttrf.o \ ++lapacke_spttrf_work.o \ ++lapacke_spttrs.o \ ++lapacke_spttrs_work.o \ ++lapacke_ssbev.o \ ++lapacke_ssbevd.o \ ++lapacke_ssbevd_work.o \ ++lapacke_ssbev_work.o \ ++lapacke_ssbevx.o \ ++lapacke_ssbevx_work.o \ ++lapacke_ssbgst.o \ ++lapacke_ssbgst_work.o \ ++lapacke_ssbgv.o \ ++lapacke_ssbgvd.o \ ++lapacke_ssbgvd_work.o \ ++lapacke_ssbgv_work.o \ ++lapacke_ssbgvx.o \ ++lapacke_ssbgvx_work.o \ ++lapacke_ssbtrd.o \ ++lapacke_ssbtrd_work.o \ ++lapacke_ssfrk.o \ ++lapacke_ssfrk_work.o \ ++lapacke_sspcon.o \ ++lapacke_sspcon_work.o \ ++lapacke_sspev.o \ ++lapacke_sspevd.o \ ++lapacke_sspevd_work.o \ ++lapacke_sspev_work.o \ ++lapacke_sspevx.o \ ++lapacke_sspevx_work.o \ ++lapacke_sspgst.o \ ++lapacke_sspgst_work.o \ ++lapacke_sspgv.o \ ++lapacke_sspgvd.o \ ++lapacke_sspgvd_work.o \ ++lapacke_sspgv_work.o \ ++lapacke_sspgvx.o \ ++lapacke_sspgvx_work.o \ ++lapacke_ssprfs.o \ ++lapacke_ssprfs_work.o \ ++lapacke_sspsv.o \ ++lapacke_sspsv_work.o \ ++lapacke_sspsvx.o \ ++lapacke_sspsvx_work.o \ ++lapacke_ssptrd.o \ ++lapacke_ssptrd_work.o \ 
++lapacke_ssptrf.o \ ++lapacke_ssptrf_work.o \ ++lapacke_ssptri.o \ ++lapacke_ssptri_work.o \ ++lapacke_ssptrs.o \ ++lapacke_ssptrs_work.o \ ++lapacke_sstebz.o \ ++lapacke_sstebz_work.o \ ++lapacke_sstedc.o \ ++lapacke_sstedc_work.o \ ++lapacke_sstegr.o \ ++lapacke_sstegr_work.o \ ++lapacke_sstein.o \ ++lapacke_sstein_work.o \ ++lapacke_sstemr.o \ ++lapacke_sstemr_work.o \ ++lapacke_ssteqr.o \ ++lapacke_ssteqr_work.o \ ++lapacke_ssterf.o \ ++lapacke_ssterf_work.o \ ++lapacke_sstev.o \ ++lapacke_sstevd.o \ ++lapacke_sstevd_work.o \ ++lapacke_sstevr.o \ ++lapacke_sstevr_work.o \ ++lapacke_sstev_work.o \ ++lapacke_sstevx.o \ ++lapacke_sstevx_work.o \ ++lapacke_ssycon.o \ ++lapacke_ssyconv.o \ ++lapacke_ssyconv_work.o \ ++lapacke_ssycon_work.o \ ++lapacke_ssyequb.o \ ++lapacke_ssyequb_work.o \ ++lapacke_ssyev.o \ ++lapacke_ssyevd.o \ ++lapacke_ssyevd_work.o \ ++lapacke_ssyevr.o \ ++lapacke_ssyevr_work.o \ ++lapacke_ssyev_work.o \ ++lapacke_ssyevx.o \ ++lapacke_ssyevx_work.o \ ++lapacke_ssygst.o \ ++lapacke_ssygst_work.o \ ++lapacke_ssygv.o \ ++lapacke_ssygvd.o \ ++lapacke_ssygvd_work.o \ ++lapacke_ssygv_work.o \ ++lapacke_ssygvx.o \ ++lapacke_ssygvx_work.o \ ++lapacke_ssyrfs.o \ ++lapacke_ssyrfs_work.o \ ++lapacke_ssysv.o \ ++lapacke_ssysv_work.o \ ++lapacke_ssysvx.o \ ++lapacke_ssysvx_work.o \ ++lapacke_ssyswapr.o \ ++lapacke_ssyswapr_work.o \ ++lapacke_ssytrd.o \ ++lapacke_ssytrd_work.o \ ++lapacke_ssytrf.o \ ++lapacke_ssytrf_work.o \ ++lapacke_ssytri2.o \ ++lapacke_ssytri2_work.o \ ++lapacke_ssytri2x.o \ ++lapacke_ssytri2x_work.o \ ++lapacke_ssytri.o \ ++lapacke_ssytri_work.o \ ++lapacke_ssytrs2.o \ ++lapacke_ssytrs2_work.o \ ++lapacke_ssytrs.o \ ++lapacke_ssytrs_work.o \ ++lapacke_stbcon.o \ ++lapacke_stbcon_work.o \ ++lapacke_stbrfs.o \ ++lapacke_stbrfs_work.o \ ++lapacke_stbtrs.o \ ++lapacke_stbtrs_work.o \ ++lapacke_stfsm.o \ ++lapacke_stfsm_work.o \ ++lapacke_stftri.o \ ++lapacke_stftri_work.o \ ++lapacke_stfttp.o \ ++lapacke_stfttp_work.o \ ++lapacke_stfttr.o 
\ ++lapacke_stfttr_work.o \ ++lapacke_stgevc.o \ ++lapacke_stgevc_work.o \ ++lapacke_stgexc.o \ ++lapacke_stgexc_work.o \ ++lapacke_stgsen.o \ ++lapacke_stgsen_work.o \ ++lapacke_stgsja.o \ ++lapacke_stgsja_work.o \ ++lapacke_stgsna.o \ ++lapacke_stgsna_work.o \ ++lapacke_stgsyl.o \ ++lapacke_stgsyl_work.o \ ++lapacke_stpcon.o \ ++lapacke_stpcon_work.o \ ++lapacke_stpmqrt.o \ ++lapacke_stpmqrt_work.o \ ++lapacke_stpqrt2.o \ ++lapacke_stpqrt2_work.o \ ++lapacke_stprfb.o \ ++lapacke_stprfb_work.o \ ++lapacke_stprfs.o \ ++lapacke_stprfs_work.o \ ++lapacke_stptri.o \ ++lapacke_stptri_work.o \ ++lapacke_stptrs.o \ ++lapacke_stptrs_work.o \ ++lapacke_stpttf.o \ ++lapacke_stpttf_work.o \ ++lapacke_stpttr.o \ ++lapacke_stpttr_work.o \ ++lapacke_strcon.o \ ++lapacke_strcon_work.o \ ++lapacke_strevc.o \ ++lapacke_strevc_work.o \ ++lapacke_strexc.o \ ++lapacke_strexc_work.o \ ++lapacke_strrfs.o \ ++lapacke_strrfs_work.o \ ++lapacke_strsen.o \ ++lapacke_strsen_work.o \ ++lapacke_strsna.o \ ++lapacke_strsna_work.o \ ++lapacke_strsyl.o \ ++lapacke_strsyl_work.o \ ++lapacke_strtri.o \ ++lapacke_strtri_work.o \ ++lapacke_strtrs.o \ ++lapacke_strtrs_work.o \ ++lapacke_strttf.o \ ++lapacke_strttf_work.o \ ++lapacke_strttp.o \ ++lapacke_strttp_work.o \ ++lapacke_stzrzf.o \ ++lapacke_stzrzf_work.o \ ++lapacke_zbbcsd.o \ ++lapacke_zbbcsd_work.o \ ++lapacke_zbdsqr.o \ ++lapacke_zbdsqr_work.o \ ++lapacke_zcgesv.o \ ++lapacke_zcgesv_work.o \ ++lapacke_zcposv.o \ ++lapacke_zcposv_work.o \ ++lapacke_zgbbrd.o \ ++lapacke_zgbbrd_work.o \ ++lapacke_zgbcon.o \ ++lapacke_zgbcon_work.o \ ++lapacke_zgbequb.o \ ++lapacke_zgbequb_work.o \ ++lapacke_zgbequ.o \ ++lapacke_zgbequ_work.o \ ++lapacke_zgbrfs.o \ ++lapacke_zgbrfs_work.o \ ++lapacke_zgbsv.o \ ++lapacke_zgbsv_work.o \ ++lapacke_zgbsvx.o \ ++lapacke_zgbsvx_work.o \ ++lapacke_zgbtrf.o \ ++lapacke_zgbtrf_work.o \ ++lapacke_zgbtrs.o \ ++lapacke_zgbtrs_work.o \ ++lapacke_zgebak.o \ ++lapacke_zgebak_work.o \ ++lapacke_zgebal.o \ 
++lapacke_zgebal_work.o \ ++lapacke_zgebrd.o \ ++lapacke_zgebrd_work.o \ ++lapacke_zgecon.o \ ++lapacke_zgecon_work.o \ ++lapacke_zgeequb.o \ ++lapacke_zgeequb_work.o \ ++lapacke_zgeequ.o \ ++lapacke_zgeequ_work.o \ ++lapacke_zgees.o \ ++lapacke_zgees_work.o \ ++lapacke_zgeesx.o \ ++lapacke_zgeesx_work.o \ ++lapacke_zgeev.o \ ++lapacke_zgeev_work.o \ ++lapacke_zgeevx.o \ ++lapacke_zgeevx_work.o \ ++lapacke_zgehrd.o \ ++lapacke_zgehrd_work.o \ ++lapacke_zgelq2.o \ ++lapacke_zgelq2_work.o \ ++lapacke_zgelqf.o \ ++lapacke_zgelqf_work.o \ ++lapacke_zgels.o \ ++lapacke_zgelsd.o \ ++lapacke_zgelsd_work.o \ ++lapacke_zgelss.o \ ++lapacke_zgelss_work.o \ ++lapacke_zgels_work.o \ ++lapacke_zgelsy.o \ ++lapacke_zgelsy_work.o \ ++lapacke_zgemqrt.o \ ++lapacke_zgemqrt_work.o \ ++lapacke_zgeqlf.o \ ++lapacke_zgeqlf_work.o \ ++lapacke_zgeqp3.o \ ++lapacke_zgeqp3_work.o \ ++lapacke_zgeqpf.o \ ++lapacke_zgeqpf_work.o \ ++lapacke_zgeqr2.o \ ++lapacke_zgeqr2_work.o \ ++lapacke_zgeqrf.o \ ++lapacke_zgeqrfp.o \ ++lapacke_zgeqrfp_work.o \ ++lapacke_zgeqrf_work.o \ ++lapacke_zgeqrt2.o \ ++lapacke_zgeqrt2_work.o \ ++lapacke_zgeqrt3.o \ ++lapacke_zgeqrt3_work.o \ ++lapacke_zgeqrt.o \ ++lapacke_zgeqrt_work.o \ ++lapacke_zgerfs.o \ ++lapacke_zgerfs_work.o \ ++lapacke_zgerqf.o \ ++lapacke_zgerqf_work.o \ ++lapacke_zgesdd.o \ ++lapacke_zgesdd_work.o \ ++lapacke_zgesv.o \ ++lapacke_zgesvd.o \ ++lapacke_zgesvd_work.o \ ++lapacke_zgesv_work.o \ ++lapacke_zgesvx.o \ ++lapacke_zgesvx_work.o \ ++lapacke_zgetf2.o \ ++lapacke_zgetf2_work.o \ ++lapacke_zgetrf.o \ ++lapacke_zgetrf_work.o \ ++lapacke_zgetri.o \ ++lapacke_zgetri_work.o \ ++lapacke_zgetrs.o \ ++lapacke_zgetrs_work.o \ ++lapacke_zggbak.o \ ++lapacke_zggbak_work.o \ ++lapacke_zggbal.o \ ++lapacke_zggbal_work.o \ ++lapacke_zgges.o \ ++lapacke_zgges_work.o \ ++lapacke_zggesx.o \ ++lapacke_zggesx_work.o \ ++lapacke_zggev.o \ ++lapacke_zggev_work.o \ ++lapacke_zggevx.o \ ++lapacke_zggevx_work.o \ ++lapacke_zggglm.o \ ++lapacke_zggglm_work.o \ 
++lapacke_zgghrd.o \ ++lapacke_zgghrd_work.o \ ++lapacke_zgglse.o \ ++lapacke_zgglse_work.o \ ++lapacke_zggqrf.o \ ++lapacke_zggqrf_work.o \ ++lapacke_zggrqf.o \ ++lapacke_zggrqf_work.o \ ++lapacke_zggsvd.o \ ++lapacke_zggsvd_work.o \ ++lapacke_zggsvp.o \ ++lapacke_zggsvp_work.o \ ++lapacke_zgtcon.o \ ++lapacke_zgtcon_work.o \ ++lapacke_zgtrfs.o \ ++lapacke_zgtrfs_work.o \ ++lapacke_zgtsv.o \ ++lapacke_zgtsv_work.o \ ++lapacke_zgtsvx.o \ ++lapacke_zgtsvx_work.o \ ++lapacke_zgttrf.o \ ++lapacke_zgttrf_work.o \ ++lapacke_zgttrs.o \ ++lapacke_zgttrs_work.o \ ++lapacke_zhbev.o \ ++lapacke_zhbevd.o \ ++lapacke_zhbevd_work.o \ ++lapacke_zhbev_work.o \ ++lapacke_zhbevx.o \ ++lapacke_zhbevx_work.o \ ++lapacke_zhbgst.o \ ++lapacke_zhbgst_work.o \ ++lapacke_zhbgv.o \ ++lapacke_zhbgvd.o \ ++lapacke_zhbgvd_work.o \ ++lapacke_zhbgv_work.o \ ++lapacke_zhbgvx.o \ ++lapacke_zhbgvx_work.o \ ++lapacke_zhbtrd.o \ ++lapacke_zhbtrd_work.o \ ++lapacke_zhecon.o \ ++lapacke_zhecon_work.o \ ++lapacke_zheequb.o \ ++lapacke_zheequb_work.o \ ++lapacke_zheev.o \ ++lapacke_zheevd.o \ ++lapacke_zheevd_work.o \ ++lapacke_zheevr.o \ ++lapacke_zheevr_work.o \ ++lapacke_zheev_work.o \ ++lapacke_zheevx.o \ ++lapacke_zheevx_work.o \ ++lapacke_zhegst.o \ ++lapacke_zhegst_work.o \ ++lapacke_zhegv.o \ ++lapacke_zhegvd.o \ ++lapacke_zhegvd_work.o \ ++lapacke_zhegv_work.o \ ++lapacke_zhegvx.o \ ++lapacke_zhegvx_work.o \ ++lapacke_zherfs.o \ ++lapacke_zherfs_work.o \ ++lapacke_zhesv.o \ ++lapacke_zhesv_work.o \ ++lapacke_zhesvx.o \ ++lapacke_zhesvx_work.o \ ++lapacke_zheswapr.o \ ++lapacke_zheswapr_work.o \ ++lapacke_zhetrd.o \ ++lapacke_zhetrd_work.o \ ++lapacke_zhetrf.o \ ++lapacke_zhetrf_work.o \ ++lapacke_zhetri2.o \ ++lapacke_zhetri2_work.o \ ++lapacke_zhetri2x.o \ ++lapacke_zhetri2x_work.o \ ++lapacke_zhetri.o \ ++lapacke_zhetri_work.o \ ++lapacke_zhetrs2.o \ ++lapacke_zhetrs2_work.o \ ++lapacke_zhetrs.o \ ++lapacke_zhetrs_work.o \ ++lapacke_zhfrk.o \ ++lapacke_zhfrk_work.o \ ++lapacke_zhgeqz.o \ 
++lapacke_zhgeqz_work.o \ ++lapacke_zhpcon.o \ ++lapacke_zhpcon_work.o \ ++lapacke_zhpev.o \ ++lapacke_zhpevd.o \ ++lapacke_zhpevd_work.o \ ++lapacke_zhpev_work.o \ ++lapacke_zhpevx.o \ ++lapacke_zhpevx_work.o \ ++lapacke_zhpgst.o \ ++lapacke_zhpgst_work.o \ ++lapacke_zhpgv.o \ ++lapacke_zhpgvd.o \ ++lapacke_zhpgvd_work.o \ ++lapacke_zhpgv_work.o \ ++lapacke_zhpgvx.o \ ++lapacke_zhpgvx_work.o \ ++lapacke_zhprfs.o \ ++lapacke_zhprfs_work.o \ ++lapacke_zhpsv.o \ ++lapacke_zhpsv_work.o \ ++lapacke_zhpsvx.o \ ++lapacke_zhpsvx_work.o \ ++lapacke_zhptrd.o \ ++lapacke_zhptrd_work.o \ ++lapacke_zhptrf.o \ ++lapacke_zhptrf_work.o \ ++lapacke_zhptri.o \ ++lapacke_zhptri_work.o \ ++lapacke_zhptrs.o \ ++lapacke_zhptrs_work.o \ ++lapacke_zhsein.o \ ++lapacke_zhsein_work.o \ ++lapacke_zhseqr.o \ ++lapacke_zhseqr_work.o \ ++lapacke_zlacgv.o \ ++lapacke_zlacgv_work.o \ ++lapacke_zlacpy.o \ ++lapacke_zlacpy_work.o \ ++lapacke_zlag2c.o \ ++lapacke_zlag2c_work.o \ ++lapacke_zlange.o \ ++lapacke_zlange_work.o \ ++lapacke_zlanhe.o \ ++lapacke_zlanhe_work.o \ ++lapacke_zlansy.o \ ++lapacke_zlansy_work.o \ ++lapacke_zlantr.o \ ++lapacke_zlantr_work.o \ ++lapacke_zlapmr.o \ ++lapacke_zlapmr_work.o \ ++lapacke_zlarfb.o \ ++lapacke_zlarfb_work.o \ ++lapacke_zlarfg.o \ ++lapacke_zlarfg_work.o \ ++lapacke_zlarft.o \ ++lapacke_zlarft_work.o \ ++lapacke_zlarfx.o \ ++lapacke_zlarfx_work.o \ ++lapacke_zlarnv.o \ ++lapacke_zlarnv_work.o \ ++lapacke_zlaset.o \ ++lapacke_zlaset_work.o \ ++lapacke_zlaswp.o \ ++lapacke_zlaswp_work.o \ ++lapacke_zlauum.o \ ++lapacke_zlauum_work.o \ ++lapacke_zpbcon.o \ ++lapacke_zpbcon_work.o \ ++lapacke_zpbequ.o \ ++lapacke_zpbequ_work.o \ ++lapacke_zpbrfs.o \ ++lapacke_zpbrfs_work.o \ ++lapacke_zpbstf.o \ ++lapacke_zpbstf_work.o \ ++lapacke_zpbsv.o \ ++lapacke_zpbsv_work.o \ ++lapacke_zpbsvx.o \ ++lapacke_zpbsvx_work.o \ ++lapacke_zpbtrf.o \ ++lapacke_zpbtrf_work.o \ ++lapacke_zpbtrs.o \ ++lapacke_zpbtrs_work.o \ ++lapacke_zpftrf.o \ ++lapacke_zpftrf_work.o \ 
++lapacke_zpftri.o \ ++lapacke_zpftri_work.o \ ++lapacke_zpftrs.o \ ++lapacke_zpftrs_work.o \ ++lapacke_zpocon.o \ ++lapacke_zpocon_work.o \ ++lapacke_zpoequb.o \ ++lapacke_zpoequb_work.o \ ++lapacke_zpoequ.o \ ++lapacke_zpoequ_work.o \ ++lapacke_zporfs.o \ ++lapacke_zporfs_work.o \ ++lapacke_zposv.o \ ++lapacke_zposv_work.o \ ++lapacke_zposvx.o \ ++lapacke_zposvx_work.o \ ++lapacke_zpotrf.o \ ++lapacke_zpotrf_work.o \ ++lapacke_zpotri.o \ ++lapacke_zpotri_work.o \ ++lapacke_zpotrs.o \ ++lapacke_zpotrs_work.o \ ++lapacke_zppcon.o \ ++lapacke_zppcon_work.o \ ++lapacke_zppequ.o \ ++lapacke_zppequ_work.o \ ++lapacke_zpprfs.o \ ++lapacke_zpprfs_work.o \ ++lapacke_zppsv.o \ ++lapacke_zppsv_work.o \ ++lapacke_zppsvx.o \ ++lapacke_zppsvx_work.o \ ++lapacke_zpptrf.o \ ++lapacke_zpptrf_work.o \ ++lapacke_zpptri.o \ ++lapacke_zpptri_work.o \ ++lapacke_zpptrs.o \ ++lapacke_zpptrs_work.o \ ++lapacke_zpstrf.o \ ++lapacke_zpstrf_work.o \ ++lapacke_zptcon.o \ ++lapacke_zptcon_work.o \ ++lapacke_zpteqr.o \ ++lapacke_zpteqr_work.o \ ++lapacke_zptrfs.o \ ++lapacke_zptrfs_work.o \ ++lapacke_zptsv.o \ ++lapacke_zptsv_work.o \ ++lapacke_zptsvx.o \ ++lapacke_zptsvx_work.o \ ++lapacke_zpttrf.o \ ++lapacke_zpttrf_work.o \ ++lapacke_zpttrs.o \ ++lapacke_zpttrs_work.o \ ++lapacke_zspcon.o \ ++lapacke_zspcon_work.o \ ++lapacke_zsprfs.o \ ++lapacke_zsprfs_work.o \ ++lapacke_zspsv.o \ ++lapacke_zspsv_work.o \ ++lapacke_zspsvx.o \ ++lapacke_zspsvx_work.o \ ++lapacke_zsptrf.o \ ++lapacke_zsptrf_work.o \ ++lapacke_zsptri.o \ ++lapacke_zsptri_work.o \ ++lapacke_zsptrs.o \ ++lapacke_zsptrs_work.o \ ++lapacke_zstedc.o \ ++lapacke_zstedc_work.o \ ++lapacke_zstegr.o \ ++lapacke_zstegr_work.o \ ++lapacke_zstein.o \ ++lapacke_zstein_work.o \ ++lapacke_zstemr.o \ ++lapacke_zstemr_work.o \ ++lapacke_zsteqr.o \ ++lapacke_zsteqr_work.o \ ++lapacke_zsycon.o \ ++lapacke_zsyconv.o \ ++lapacke_zsyconv_work.o \ ++lapacke_zsycon_work.o \ ++lapacke_zsyequb.o \ ++lapacke_zsyequb_work.o \ ++lapacke_zsyrfs.o \ 
++lapacke_zsyrfs_work.o \ ++lapacke_zsysv.o \ ++lapacke_zsysv_work.o \ ++lapacke_zsysvx.o \ ++lapacke_zsysvx_work.o \ ++lapacke_zsyswapr.o \ ++lapacke_zsyswapr_work.o \ ++lapacke_zsytrf.o \ ++lapacke_zsytrf_work.o \ ++lapacke_zsytri2.o \ ++lapacke_zsytri2_work.o \ ++lapacke_zsytri2x.o \ ++lapacke_zsytri2x_work.o \ ++lapacke_zsytri.o \ ++lapacke_zsytri_work.o \ ++lapacke_zsytrs2.o \ ++lapacke_zsytrs2_work.o \ ++lapacke_zsytrs.o \ ++lapacke_zsytrs_work.o \ ++lapacke_ztbcon.o \ ++lapacke_ztbcon_work.o \ ++lapacke_ztbrfs.o \ ++lapacke_ztbrfs_work.o \ ++lapacke_ztbtrs.o \ ++lapacke_ztbtrs_work.o \ ++lapacke_ztfsm.o \ ++lapacke_ztfsm_work.o \ ++lapacke_ztftri.o \ ++lapacke_ztftri_work.o \ ++lapacke_ztfttp.o \ ++lapacke_ztfttp_work.o \ ++lapacke_ztfttr.o \ ++lapacke_ztfttr_work.o \ ++lapacke_ztgevc.o \ ++lapacke_ztgevc_work.o \ ++lapacke_ztgexc.o \ ++lapacke_ztgexc_work.o \ ++lapacke_ztgsen.o \ ++lapacke_ztgsen_work.o \ ++lapacke_ztgsja.o \ ++lapacke_ztgsja_work.o \ ++lapacke_ztgsna.o \ ++lapacke_ztgsna_work.o \ ++lapacke_ztgsyl.o \ ++lapacke_ztgsyl_work.o \ ++lapacke_ztpcon.o \ ++lapacke_ztpcon_work.o \ ++lapacke_ztpmqrt.o \ ++lapacke_ztpmqrt_work.o \ ++lapacke_ztpqrt2.o \ ++lapacke_ztpqrt2_work.o \ ++lapacke_ztpqrt.o \ ++lapacke_ztpqrt_work.o \ ++lapacke_ztprfb.o \ ++lapacke_ztprfb_work.o \ ++lapacke_ztprfs.o \ ++lapacke_ztprfs_work.o \ ++lapacke_ztptri.o \ ++lapacke_ztptri_work.o \ ++lapacke_ztptrs.o \ ++lapacke_ztptrs_work.o \ ++lapacke_ztpttf.o \ ++lapacke_ztpttf_work.o \ ++lapacke_ztpttr.o \ ++lapacke_ztpttr_work.o \ ++lapacke_ztrcon.o \ ++lapacke_ztrcon_work.o \ ++lapacke_ztrevc.o \ ++lapacke_ztrevc_work.o \ ++lapacke_ztrexc.o \ ++lapacke_ztrexc_work.o \ ++lapacke_ztrrfs.o \ ++lapacke_ztrrfs_work.o \ ++lapacke_ztrsen.o \ ++lapacke_ztrsen_work.o \ ++lapacke_ztrsna.o \ ++lapacke_ztrsna_work.o \ ++lapacke_ztrsyl.o \ ++lapacke_ztrsyl_work.o \ ++lapacke_ztrtri.o \ ++lapacke_ztrtri_work.o \ ++lapacke_ztrtrs.o \ ++lapacke_ztrtrs_work.o \ ++lapacke_ztrttf.o \ 
++lapacke_ztrttf_work.o \ ++lapacke_ztrttp.o \ ++lapacke_ztrttp_work.o \ ++lapacke_ztzrzf.o \ ++lapacke_ztzrzf_work.o \ ++lapacke_zunbdb.o \ ++lapacke_zunbdb_work.o \ ++lapacke_zuncsd.o \ ++lapacke_zuncsd_work.o \ ++lapacke_zungbr.o \ ++lapacke_zungbr_work.o \ ++lapacke_zunghr.o \ ++lapacke_zunghr_work.o \ ++lapacke_zunglq.o \ ++lapacke_zunglq_work.o \ ++lapacke_zungql.o \ ++lapacke_zungql_work.o \ ++lapacke_zungqr.o \ ++lapacke_zungqr_work.o \ ++lapacke_zungrq.o \ ++lapacke_zungrq_work.o \ ++lapacke_zungtr.o \ ++lapacke_zungtr_work.o \ ++lapacke_zunmbr.o \ ++lapacke_zunmbr_work.o \ ++lapacke_zunmhr.o \ ++lapacke_zunmhr_work.o \ ++lapacke_zunmlq.o \ ++lapacke_zunmlq_work.o \ ++lapacke_zunmql.o \ ++lapacke_zunmql_work.o \ ++lapacke_zunmqr.o \ ++lapacke_zunmqr_work.o \ ++lapacke_zunmrq.o \ ++lapacke_zunmrq_work.o \ ++lapacke_zunmrz.o \ ++lapacke_zunmrz_work.o \ ++lapacke_zunmtr.o \ ++lapacke_zunmtr_work.o \ ++lapacke_zupgtr.o \ ++lapacke_zupgtr_work.o \ ++lapacke_zupmtr.o \ ++lapacke_zupmtr_work.o ++ ++ ++LAPACKE_EXTENDEDPRECSION_OBJS := \ ++ lapacke_dgbrfsx.o lapacke_dgbrfsx_work.o lapacke_zgbrfsx.o lapacke_zgbrfsx_work.o lapacke_zsyrfsx.o \ ++lapacke_zsyrfsx_work.o \ ++lapacke_zgerfsx.o \ ++lapacke_zgerfsx_work.o \ ++lapacke_zporfsx.o \ ++lapacke_zporfsx_work.o \ ++lapacke_sgerfsx.o \ ++lapacke_sgerfsx_work.o \ ++lapacke_zgesvxx.o \ ++lapacke_zgesvxx_work.o \ ++lapacke_sgbrfsx.o \ ++lapacke_sgbrfsx_work.o \ ++lapacke_ssysvxx.o \ ++lapacke_ssysvxx_work.o \ ++lapacke_sgesvxx.o \ ++lapacke_sgesvxx_work.o \ ++lapacke_cgbsvxx.o \ ++lapacke_cgbsvxx_work.o \ ++lapacke_cporfsx.o \ ++lapacke_cporfsx_work.o \ ++lapacke_cherfsx.o \ ++lapacke_cherfsx_work.o \ ++lapacke_dporfsx.o \ ++lapacke_dporfsx_work.o \ ++lapacke_sposvxx.o \ ++lapacke_sposvxx_work.o \ ++lapacke_sgbsvxx.o \ ++lapacke_sgbsvxx_work.o \ ++lapacke_zposvxx.o \ ++lapacke_zposvxx_work.o \ ++lapacke_chesvxx.o \ ++lapacke_chesvxx_work.o \ ++lapacke_cposvxx.o \ ++lapacke_cposvxx_work.o \ ++lapacke_cgesvxx.o \ 
++lapacke_cgesvxx_work.o \ ++lapacke_ssyrfsx.o \ ++lapacke_ssyrfsx_work.o \ ++lapacke_csyrfsx.o \ ++lapacke_csyrfsx_work.o \ ++lapacke_dsysvxx.o \ ++lapacke_dsysvxx_work.o \ ++lapacke_sporfsx.o \ ++lapacke_sporfsx_work.o \ ++lapacke_zherfsx.o \ ++lapacke_zherfsx_work.o \ ++lapacke_csysvxx.o \ ++lapacke_csysvxx_work.o \ ++lapacke_dposvxx.o \ ++lapacke_dposvxx_work.o \ ++lapacke_cgerfsx.o \ ++lapacke_cgerfsx_work.o \ ++lapacke_zgbsvxx.o \ ++lapacke_zgbsvxx_work.o \ ++lapacke_zsysvxx.o \ ++lapacke_zsysvxx_work.o \ ++lapacke_dgesvxx.o \ ++lapacke_dgesvxx_work.o \ ++lapacke_dgerfsx.o \ ++lapacke_dgerfsx_work.o \ ++lapacke_dsyrfsx.o \ ++lapacke_dsyrfsx_work.o \ ++lapacke_claghe.o \ ++lapacke_claghe_work.o \ ++lapacke_zhesvxx.o \ ++lapacke_zhesvxx_work.o \ ++lapacke_cgbrfsx.o \ ++lapacke_cgbrfsx_work.o \ ++lapacke_dgbsvxx.o \ ++lapacke_dgbsvxx_work.o ++ ++LAPACKE_TESTING_OBJS := \ ++ lapacke_slagge.o lapacke_slagge_work.o lapacke_clagge.o lapacke_clagge_work.o \ ++ lapacke_clatms.o lapacke_clatms_work.o lapacke_slatms.o lapacke_slatms_work.o lapacke_zlatms.o lapacke_zlatms_work.o \ ++ lapacke_clagsy.o lapacke_clagsy_work.o lapacke_slagsy.o lapacke_slagsy_work.o \ ++ lapacke_zlagsy.o lapacke_zlagsy_work.o lapacke_zlagge.o lapacke_zlagge_work.o \ ++ lapacke_dlatms.o lapacke_dlatms_work.o lapacke_zlaghe.o lapacke_zlaghe_work.o \ ++ lapacke_dlagsy.o lapacke_dlagsy_work.o lapacke_dlagge.o lapacke_dlagge_work.o ++ ++ ++OBJ_FILES := $(LAPACKE_OBJS) ++ ++ifdef LAPACKE_EXTENDED ++OBJ_FILES += $(LAPACKE_EXTENDEDPRECSION_OBJS) ++endif ++ ++ifdef LAPACKE_TESTING ++OBJ_FILES += $(LAPACK_TESTING_OBJS) ++endif + all: lib - + lib: $(OBJ_FILES) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKE) $(OBJ_FILES) -+ # http://hackage.haskell.org/trac/gtk2hs/ticket/1146 ++# http://hackage.haskell.org/trac/gtk2hs/ticket/1146 + echo $(OBJ_FILES) | xargs --max-args=100 $(ARCH) $(ARCHFLAGS) ../$(LAPACKE) - $(RANLIB) ../$(LAPACKE) - + $(RANLIB) ../$(LAPACKE) + .c.o: From 74306b54d7b5b2ec739a3ecd03b0d06e1fd9fa06 Mon 
Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 23 Apr 2012 11:26:42 +0800 Subject: [PATCH 008/162] Refs #90 auto detect Intel Sandy Bridge Core i7-3820 --- cpuid_x86.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index e183e9fc3..e7aa07b44 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -987,6 +987,9 @@ int get_cpuname(void){ case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; + case 13: + //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) + return CPUTYPE_NEHALEM; } break; } @@ -1319,6 +1322,9 @@ int get_coretype(void){ case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; + case 13: + //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) + return CORE_NEHALEM; } break; } From f4eee224d862918768801c4a2fd33f8f8e275fb2 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 22 Apr 2012 22:38:10 +0200 Subject: [PATCH 009/162] Refs #93. Upgraded LAPACK to 3.4.1 version. --- Makefile | 82 ++-- Makefile.system | 2 +- exports/gensymbol | 6 +- patch.for_lapack-3.4.1 | 932 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 967 insertions(+), 55 deletions(-) create mode 100644 patch.for_lapack-3.4.1 diff --git a/Makefile b/Makefile index 5de7987e9..efc8b33d8 100644 --- a/Makefile +++ b/Makefile @@ -26,10 +26,10 @@ endif SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench -.PHONY : all libs netlib lapacke test ctest shared install +.PHONY : all libs netlib test ctest shared install .NOTPARALLEL : all libs prof lapack-test install -all :: libs netlib lapacke tests shared +all :: libs netlib tests shared @echo @echo " OpenBLAS build complete." 
@echo @@ -203,82 +203,58 @@ ifeq ($(NO_LAPACK), 1) netlib : else -netlib : lapack-3.4.0 patch.for_lapack-3.4.0 $(NETLIB_LAPACK_DIR)/make.inc +netlib : lapack-3.4.1 patch.for_lapack-3.4.1 $(NETLIB_LAPACK_DIR)/make.inc ifndef NOFORTRAN -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib endif -endif - -ifeq ($(NO_LAPACKE), 1) -lapacke : - -else -lapacke : lapack-3.4.0 $(NETLIB_LAPACK_DIR)/lapacke/make.inc -ifndef NOFORTRAN - -@$(MAKE) -C $(NETLIB_LAPACK_DIR)/lapacke +ifndef NO_LAPACKE + -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif -prof_lapack : lapack-3.4.0 $(NETLIB_LAPACK_DIR)/make.inc +prof_lapack : lapack-3.4.1 $(NETLIB_LAPACK_DIR)/make.inc -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof $(NETLIB_LAPACK_DIR)/make.inc : ifndef NOFORTRAN - -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc + -@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "POPTS = $(FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "NOOPT = $(FFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "RANLIB = $(RANLIB)" >> 
$(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc +# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif -LAPACKE_CFLAGS = $(CFLAGS) -LAPACKE_LDFLAGS = $(FFLAGS) $(EXTRALIB) -ifeq ($(F_COMPILER), INTEL) -LAPACKE_LDFLAGS += -nofor-main -endif -ifdef INTERFACE64 -LAPACKE_CFLAGS += -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64 -endif -$(NETLIB_LAPACK_DIR)/lapacke/make.inc : -ifndef NOFORTRAN - -@echo "CC = $(CC)" > $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "CFLAGS = $(LAPACKE_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "LINKER = $(FC)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "LDFLAGS = $(LAPACKE_LDFLAGS)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "LAPACKE = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "LIBS = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc - -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/lapacke/make.inc -endif - -lapack-3.4.0 : lapack-3.4.0.tgz +lapack-3.4.1 : lapack-3.4.1.tgz ifndef NOFORTRAN ifndef NO_LAPACK - @if test `$(MD5SUM) lapack-3.4.0.tgz | $(AWK) '{print $$1}'` = 02d5706ec03ba885fc246e5fa10d8c70; then \ + @if test `$(MD5SUM) lapack-3.4.1.tgz | $(AWK) '{print $$1}'` = d33ace3ac27dc6b4502833ee4dd820db; then \ echo $(TAR) zxf $< ;\ - $(TAR) zxf $< && (cd 
$(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.0) ;\ + $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.1) ;\ rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ else \ rm -rf $(NETLIB_LAPACK_DIR) ;\ - echo " Cannot download lapack-3.4.0.tgz or the MD5 check sum is wrong (Please use orignal)."; \ + echo " Cannot download lapack-3.4.1.tgz or the MD5 check sum is wrong (Please use orignal)."; \ exit 1; \ fi endif endif -LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.0.tgz +LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz -lapack-3.4.0.tgz : +lapack-3.4.1.tgz : ifndef NOFORTRAN ifeq ($(OSNAME), Darwin) curl -O $(LAPACK_URL) @@ -297,7 +273,7 @@ ifndef NOFORTRAN -wget http://www.netlib.org/lapack/timing/timing.tgz endif -lapack-timing : lapack-3.4.0 large.tgz timing.tgz +lapack-timing : lapack-3.4.1 large.tgz timing.tgz ifndef NOFORTRAN (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) diff --git a/Makefile.system b/Makefile.system index dbc8ffe07..e2fe9f730 100644 --- a/Makefile.system +++ b/Makefile.system @@ -10,7 +10,7 @@ TOPDIR = . 
endif ifndef NETLIB_LAPACK_DIR -NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.0 +NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1 endif # Default C compiler diff --git a/exports/gensymbol b/exports/gensymbol index d9d35de48..702e047c1 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -737,6 +737,8 @@ LAPACKE_csyconv_work, LAPACKE_csyequb, LAPACKE_csyequb_work, + LAPACKE_csyr, + LAPACKE_csyr_work, LAPACKE_csyrfs, LAPACKE_csyrfs_work, LAPACKE_csysv, @@ -2307,6 +2309,8 @@ LAPACKE_zsyconv_work, LAPACKE_zsyequb, LAPACKE_zsyequb_work, + LAPACKE_zsyr, + LAPACKE_zsyr_work, LAPACKE_zsyrfs, LAPACKE_zsyrfs_work, LAPACKE_zsysv, @@ -2448,7 +2452,7 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 @underscore_objs = (@blasobjs); -} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0") { +} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1") { @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2); } else { @underscore_objs = (@blasobjs, @lapackobjs); diff --git a/patch.for_lapack-3.4.1 b/patch.for_lapack-3.4.1 new file mode 100644 index 000000000..e06129202 --- /dev/null +++ b/patch.for_lapack-3.4.1 @@ -0,0 +1,932 @@ +diff -ruN lapack-3.4.1.old/INSTALL/Makefile lapack-3.4.1/INSTALL/Makefile +--- lapack-3.4.1.old/INSTALL/Makefile 2011-10-01 04:37:03 +0200 ++++ lapack-3.4.1/INSTALL/Makefile 2012-04-22 21:48:48 +0200 +@@ -27,7 +27,7 @@ + $(LOADER) $(LOADOPTS) -o testversion ilaver.o LAPACK_version.o + + clean: +- rm -f *.o ++ rm -f *.o test* + .f.o: + $(FORTRAN) $(OPTS) -c $< -o $@ + +diff -ruN lapack-3.4.1.old/Makefile lapack-3.4.1/Makefile +--- lapack-3.4.1.old/Makefile 2012-04-13 20:13:07 +0200 ++++ lapack-3.4.1/Makefile 2012-04-22 21:48:07 +0200 +@@ -20,9 +20,12 @@ + blaslib: + ( cd BLAS/SRC; $(MAKE) ) + +-lapacklib: lapack_install ++lapacklib: + ( cd SRC; $(MAKE) ) + ++lapack_prof: ++ ( cd SRC; $(MAKE) lapack_prof) ++ + lapackelib: lapacklib + ( cd lapacke; $(MAKE) ) + +diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile +--- 
lapack-3.4.1.old/SRC/Makefile 2012-04-02 21:06:36 +0200 ++++ lapack-3.4.1/SRC/Makefile 2012-04-22 21:40:21 +0200 +@@ -54,363 +54,371 @@ + # + ####################################################################### + +-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \ +- ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ +- ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o ++ALLAUX = ilaenv.$(SUFFIX) ieeeck.$(SUFFIX) lsamen.$(SUFFIX) xerbla_array.$(SUFFIX) iparmq.$(SUFFIX) \ ++ ilaprec.$(SUFFIX) ilatrans.$(SUFFIX) ilauplo.$(SUFFIX) iladiag.$(SUFFIX) chla_transtype.$(SUFFIX) \ ++ ../INSTALL/ilaver.$(SUFFIX) + + SCLAUX = \ +- sbdsdc.o \ +- sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ +- slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ +- slaed7.o slaed8.o slaed9.o slaeda.o slaev2.o slagtf.o \ +- slagts.o slamrg.o slanst.o \ +- slapy2.o slapy3.o slarnv.o \ +- slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ +- slarrk.o slarrr.o slaneg.o \ +- slartg.o slaruv.o slas2.o slascl.o \ +- slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ +- slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ +- slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ +- slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ +- ssteqr.o ssterf.o slaisnan.o sisnan.o \ +- slartgp.o slartgs.o \ +- ../INSTALL/second_$(TIMER).o ++ sbdsdc.$(SUFFIX) \ ++ sbdsqr.$(SUFFIX) sdisna.$(SUFFIX) slabad.$(SUFFIX) slacpy.$(SUFFIX) sladiv.$(SUFFIX) slae2.$(SUFFIX) slaebz.$(SUFFIX) \ ++ slaed0.$(SUFFIX) slaed1.$(SUFFIX) slaed2.$(SUFFIX) slaed3.$(SUFFIX) slaed4.$(SUFFIX) slaed5.$(SUFFIX) slaed6.$(SUFFIX) \ ++ slaed7.$(SUFFIX) slaed8.$(SUFFIX) slaed9.$(SUFFIX) slaeda.$(SUFFIX) slaev2.$(SUFFIX) slagtf.$(SUFFIX) \ ++ slagts.$(SUFFIX) slamrg.$(SUFFIX) slanst.$(SUFFIX) \ ++ slapy2.$(SUFFIX) slapy3.$(SUFFIX) slarnv.$(SUFFIX) \ ++ slarra.$(SUFFIX) slarrb.$(SUFFIX) slarrc.$(SUFFIX) slarrd.$(SUFFIX) 
slarre.$(SUFFIX) slarrf.$(SUFFIX) slarrj.$(SUFFIX) \ ++ slarrk.$(SUFFIX) slarrr.$(SUFFIX) slaneg.$(SUFFIX) \ ++ slartg.$(SUFFIX) slaruv.$(SUFFIX) slas2.$(SUFFIX) slascl.$(SUFFIX) \ ++ slasd0.$(SUFFIX) slasd1.$(SUFFIX) slasd2.$(SUFFIX) slasd3.$(SUFFIX) slasd4.$(SUFFIX) slasd5.$(SUFFIX) slasd6.$(SUFFIX) \ ++ slasd7.$(SUFFIX) slasd8.$(SUFFIX) slasda.$(SUFFIX) slasdq.$(SUFFIX) slasdt.$(SUFFIX) \ ++ slaset.$(SUFFIX) slasq1.$(SUFFIX) slasq2.$(SUFFIX) slasq3.$(SUFFIX) slasq4.$(SUFFIX) slasq5.$(SUFFIX) slasq6.$(SUFFIX) \ ++ slasr.$(SUFFIX) slasrt.$(SUFFIX) slassq.$(SUFFIX) slasv2.$(SUFFIX) spttrf.$(SUFFIX) sstebz.$(SUFFIX) sstedc.$(SUFFIX) \ ++ ssteqr.$(SUFFIX) ssterf.$(SUFFIX) slaisnan.$(SUFFIX) sisnan.$(SUFFIX) \ ++ slartgp.$(SUFFIX) slartgs.$(SUFFIX) \ ++ ../INSTALL/second_$(TIMER).$(SUFFIX) + + DZLAUX = \ +- dbdsdc.o \ +- dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ +- dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ +- dlaed7.o dlaed8.o dlaed9.o dlaeda.o dlaev2.o dlagtf.o \ +- dlagts.o dlamrg.o dlanst.o \ +- dlapy2.o dlapy3.o dlarnv.o \ +- dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ +- dlarrk.o dlarrr.o dlaneg.o \ +- dlartg.o dlaruv.o dlas2.o dlascl.o \ +- dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o dlasd6.o \ +- dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ +- dlaset.o dlasq1.o dlasq2.o dlasq3.o dlasq4.o dlasq5.o dlasq6.o \ +- dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ +- dsteqr.o dsterf.o dlaisnan.o disnan.o \ +- dlartgp.o dlartgs.o \ +- ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ++ dbdsdc.$(SUFFIX) \ ++ dbdsqr.$(SUFFIX) ddisna.$(SUFFIX) dlabad.$(SUFFIX) dlacpy.$(SUFFIX) dladiv.$(SUFFIX) dlae2.$(SUFFIX) dlaebz.$(SUFFIX) \ ++ dlaed0.$(SUFFIX) dlaed1.$(SUFFIX) dlaed2.$(SUFFIX) dlaed3.$(SUFFIX) dlaed4.$(SUFFIX) dlaed5.$(SUFFIX) dlaed6.$(SUFFIX) \ ++ dlaed7.$(SUFFIX) dlaed8.$(SUFFIX) dlaed9.$(SUFFIX) dlaeda.$(SUFFIX) dlaev2.$(SUFFIX) dlagtf.$(SUFFIX) \ ++ dlagts.$(SUFFIX) 
dlamrg.$(SUFFIX) dlanst.$(SUFFIX) \ ++ dlapy2.$(SUFFIX) dlapy3.$(SUFFIX) dlarnv.$(SUFFIX) \ ++ dlarra.$(SUFFIX) dlarrb.$(SUFFIX) dlarrc.$(SUFFIX) dlarrd.$(SUFFIX) dlarre.$(SUFFIX) dlarrf.$(SUFFIX) dlarrj.$(SUFFIX) \ ++ dlarrk.$(SUFFIX) dlarrr.$(SUFFIX) dlaneg.$(SUFFIX) \ ++ dlartg.$(SUFFIX) dlaruv.$(SUFFIX) dlas2.$(SUFFIX) dlascl.$(SUFFIX) \ ++ dlasd0.$(SUFFIX) dlasd1.$(SUFFIX) dlasd2.$(SUFFIX) dlasd3.$(SUFFIX) dlasd4.$(SUFFIX) dlasd5.$(SUFFIX) dlasd6.$(SUFFIX) \ ++ dlasd7.$(SUFFIX) dlasd8.$(SUFFIX) dlasda.$(SUFFIX) dlasdq.$(SUFFIX) dlasdt.$(SUFFIX) \ ++ dlaset.$(SUFFIX) dlasq1.$(SUFFIX) dlasq2.$(SUFFIX) dlasq3.$(SUFFIX) dlasq4.$(SUFFIX) dlasq5.$(SUFFIX) dlasq6.$(SUFFIX) \ ++ dlasr.$(SUFFIX) dlasrt.$(SUFFIX) dlassq.$(SUFFIX) dlasv2.$(SUFFIX) dpttrf.$(SUFFIX) dstebz.$(SUFFIX) dstedc.$(SUFFIX) \ ++ dsteqr.$(SUFFIX) dsterf.$(SUFFIX) dlaisnan.$(SUFFIX) disnan.$(SUFFIX) \ ++ dlartgp.$(SUFFIX) dlartgs.$(SUFFIX) \ ++ ../INSTALL/dsecnd_$(TIMER).$(SUFFIX) + + SLASRC = \ +- sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ +- sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ +- sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ +- sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ +- sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ +- sgeqp3.o sgeqpf.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ +- sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o \ +- sgetc2.o sgetf2.o sgetri.o \ +- sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ +- sggglm.o sgghrd.o sgglse.o sggqrf.o \ +- sggrqf.o sggsvd.o sggsvp.o sgtcon.o sgtrfs.o sgtsv.o \ +- sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \ +- shsein.o shseqr.o slabrd.o slacon.o slacn2.o \ +- slaein.o slaexc.o slag2.o slags2.o slagtm.o slagv2.o slahqr.o \ +- slahrd.o slahr2.o slaic1.o slaln2.o slals0.o slalsa.o slalsd.o \ +- slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ +- slansy.o slantb.o slantp.o slantr.o slanv2.o \ +- slapll.o slapmt.o \ +- slaqgb.o slaqge.o 
slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ +- slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ +- slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ +- slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \ +- slarrv.o slartv.o \ +- slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o \ +- slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ +- slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ +- sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ +- sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ +- sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ +- sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ +- spbstf.o spbsv.o spbsvx.o \ +- spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ +- sposvx.o spotf2.o spotri.o spstrf.o spstf2.o \ +- sppcon.o sppequ.o \ +- spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o spptrs.o sptcon.o \ +- spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ +- ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ +- ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ +- sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ +- ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ +- sstevx.o \ +- ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ +- ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ +- ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytri2.o ssytri2x.o \ +- ssyswapr.o ssytrs.o ssytrs2.o ssyconv.o \ +- stbcon.o \ +- stbrfs.o stbtrs.o stgevc.o stgex2.o stgexc.o stgsen.o \ +- stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ +- stptrs.o \ +- strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ +- strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ +- slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ +- stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ +- sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ +- sgeequb.o ssyequb.o spoequb.o sgbequb.o \ +- 
sbbcsd.o slapmr.o sorbdb.o sorcsd.o \ +- sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ +- stpqrt.o stpqrt2.o stpmqrt.o stprfb.o ++ sgbbrd.$(SUFFIX) sgbcon.$(SUFFIX) sgbequ.$(SUFFIX) sgbrfs.$(SUFFIX) sgbsv.$(SUFFIX) \ ++ sgbsvx.$(SUFFIX) sgbtf2.$(SUFFIX) sgbtrf.$(SUFFIX) sgbtrs.$(SUFFIX) sgebak.$(SUFFIX) sgebal.$(SUFFIX) sgebd2.$(SUFFIX) \ ++ sgebrd.$(SUFFIX) sgecon.$(SUFFIX) sgeequ.$(SUFFIX) sgees.$(SUFFIX) sgeesx.$(SUFFIX) sgeev.$(SUFFIX) sgeevx.$(SUFFIX) \ ++ sgegs.$(SUFFIX) sgegv.$(SUFFIX) sgehd2.$(SUFFIX) sgehrd.$(SUFFIX) sgelq2.$(SUFFIX) sgelqf.$(SUFFIX) \ ++ sgels.$(SUFFIX) sgelsd.$(SUFFIX) sgelss.$(SUFFIX) sgelsx.$(SUFFIX) sgelsy.$(SUFFIX) sgeql2.$(SUFFIX) sgeqlf.$(SUFFIX) \ ++ sgeqp3.$(SUFFIX) sgeqpf.$(SUFFIX) sgeqr2.$(SUFFIX) sgeqr2p.$(SUFFIX) sgeqrf.$(SUFFIX) sgeqrfp.$(SUFFIX) sgerfs.$(SUFFIX) \ ++ sgerq2.$(SUFFIX) sgerqf.$(SUFFIX) sgesc2.$(SUFFIX) sgesdd.$(SUFFIX) sgesv.$(SUFFIX) sgesvd.$(SUFFIX) sgesvx.$(SUFFIX) \ ++ sgetc2.$(SUFFIX) sgetri.$(SUFFIX) \ ++ sggbak.$(SUFFIX) sggbal.$(SUFFIX) sgges.$(SUFFIX) sggesx.$(SUFFIX) sggev.$(SUFFIX) sggevx.$(SUFFIX) \ ++ sggglm.$(SUFFIX) sgghrd.$(SUFFIX) sgglse.$(SUFFIX) sggqrf.$(SUFFIX) \ ++ sggrqf.$(SUFFIX) sggsvd.$(SUFFIX) sggsvp.$(SUFFIX) sgtcon.$(SUFFIX) sgtrfs.$(SUFFIX) sgtsv.$(SUFFIX) \ ++ sgtsvx.$(SUFFIX) sgttrf.$(SUFFIX) sgttrs.$(SUFFIX) sgtts2.$(SUFFIX) shgeqz.$(SUFFIX) \ ++ shsein.$(SUFFIX) shseqr.$(SUFFIX) slabrd.$(SUFFIX) slacon.$(SUFFIX) slacn2.$(SUFFIX) \ ++ slaein.$(SUFFIX) slaexc.$(SUFFIX) slag2.$(SUFFIX) slags2.$(SUFFIX) slagtm.$(SUFFIX) slagv2.$(SUFFIX) slahqr.$(SUFFIX) \ ++ slahrd.$(SUFFIX) slahr2.$(SUFFIX) slaic1.$(SUFFIX) slaln2.$(SUFFIX) slals0.$(SUFFIX) slalsa.$(SUFFIX) slalsd.$(SUFFIX) \ ++ slangb.$(SUFFIX) slange.$(SUFFIX) slangt.$(SUFFIX) slanhs.$(SUFFIX) slansb.$(SUFFIX) slansp.$(SUFFIX) \ ++ slansy.$(SUFFIX) slantb.$(SUFFIX) slantp.$(SUFFIX) slantr.$(SUFFIX) slanv2.$(SUFFIX) \ ++ slapll.$(SUFFIX) slapmt.$(SUFFIX) \ ++ slaqgb.$(SUFFIX) slaqge.$(SUFFIX) slaqp2.$(SUFFIX) slaqps.$(SUFFIX) 
slaqsb.$(SUFFIX) slaqsp.$(SUFFIX) slaqsy.$(SUFFIX) \ ++ slaqr0.$(SUFFIX) slaqr1.$(SUFFIX) slaqr2.$(SUFFIX) slaqr3.$(SUFFIX) slaqr4.$(SUFFIX) slaqr5.$(SUFFIX) \ ++ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ ++ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ ++ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slaswp.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ ++ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ ++ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ ++ sorgrq.$(SUFFIX) sorgtr.$(SUFFIX) sorm2l.$(SUFFIX) sorm2r.$(SUFFIX) \ ++ sormbr.$(SUFFIX) sormhr.$(SUFFIX) sorml2.$(SUFFIX) sormlq.$(SUFFIX) sormql.$(SUFFIX) sormqr.$(SUFFIX) sormr2.$(SUFFIX) \ ++ sormr3.$(SUFFIX) sormrq.$(SUFFIX) sormrz.$(SUFFIX) sormtr.$(SUFFIX) spbcon.$(SUFFIX) spbequ.$(SUFFIX) spbrfs.$(SUFFIX) \ ++ spbstf.$(SUFFIX) spbsv.$(SUFFIX) spbsvx.$(SUFFIX) \ ++ spbtf2.$(SUFFIX) spbtrf.$(SUFFIX) spbtrs.$(SUFFIX) spocon.$(SUFFIX) spoequ.$(SUFFIX) sporfs.$(SUFFIX) sposv.$(SUFFIX) \ ++ sposvx.$(SUFFIX) spotri.$(SUFFIX) spstrf.$(SUFFIX) spstf2.$(SUFFIX) \ ++ sppcon.$(SUFFIX) sppequ.$(SUFFIX) \ ++ spprfs.$(SUFFIX) sppsv.$(SUFFIX) sppsvx.$(SUFFIX) spptrf.$(SUFFIX) spptri.$(SUFFIX) spptrs.$(SUFFIX) sptcon.$(SUFFIX) \ ++ spteqr.$(SUFFIX) sptrfs.$(SUFFIX) sptsv.$(SUFFIX) sptsvx.$(SUFFIX) spttrs.$(SUFFIX) sptts2.$(SUFFIX) srscl.$(SUFFIX) \ ++ ssbev.$(SUFFIX) ssbevd.$(SUFFIX) ssbevx.$(SUFFIX) ssbgst.$(SUFFIX) ssbgv.$(SUFFIX) ssbgvd.$(SUFFIX) ssbgvx.$(SUFFIX) \ ++ ssbtrd.$(SUFFIX) sspcon.$(SUFFIX) sspev.$(SUFFIX) sspevd.$(SUFFIX) sspevx.$(SUFFIX) sspgst.$(SUFFIX) \ ++ sspgv.$(SUFFIX) sspgvd.$(SUFFIX) sspgvx.$(SUFFIX) ssprfs.$(SUFFIX) 
sspsv.$(SUFFIX) sspsvx.$(SUFFIX) ssptrd.$(SUFFIX) \ ++ ssptrf.$(SUFFIX) ssptri.$(SUFFIX) ssptrs.$(SUFFIX) sstegr.$(SUFFIX) sstein.$(SUFFIX) sstev.$(SUFFIX) sstevd.$(SUFFIX) sstevr.$(SUFFIX) \ ++ sstevx.$(SUFFIX) \ ++ ssycon.$(SUFFIX) ssyev.$(SUFFIX) ssyevd.$(SUFFIX) ssyevr.$(SUFFIX) ssyevx.$(SUFFIX) ssygs2.$(SUFFIX) \ ++ ssygst.$(SUFFIX) ssygv.$(SUFFIX) ssygvd.$(SUFFIX) ssygvx.$(SUFFIX) ssyrfs.$(SUFFIX) ssysv.$(SUFFIX) ssysvx.$(SUFFIX) \ ++ ssytd2.$(SUFFIX) ssytf2.$(SUFFIX) ssytrd.$(SUFFIX) ssytrf.$(SUFFIX) ssytri.$(SUFFIX) ssytri2.$(SUFFIX) ssytri2x.$(SUFFIX) \ ++ ssyswapr.$(SUFFIX) ssytrs.$(SUFFIX) ssytrs2.$(SUFFIX) ssyconv.$(SUFFIX) \ ++ stbcon.$(SUFFIX) \ ++ stbrfs.$(SUFFIX) stbtrs.$(SUFFIX) stgevc.$(SUFFIX) stgex2.$(SUFFIX) stgexc.$(SUFFIX) stgsen.$(SUFFIX) \ ++ stgsja.$(SUFFIX) stgsna.$(SUFFIX) stgsy2.$(SUFFIX) stgsyl.$(SUFFIX) stpcon.$(SUFFIX) stprfs.$(SUFFIX) stptri.$(SUFFIX) \ ++ stptrs.$(SUFFIX) \ ++ strcon.$(SUFFIX) strevc.$(SUFFIX) strexc.$(SUFFIX) strrfs.$(SUFFIX) strsen.$(SUFFIX) strsna.$(SUFFIX) strsyl.$(SUFFIX) \ ++ strtrs.$(SUFFIX) stzrqf.$(SUFFIX) stzrzf.$(SUFFIX) sstemr.$(SUFFIX) \ ++ slansf.$(SUFFIX) spftrf.$(SUFFIX) spftri.$(SUFFIX) spftrs.$(SUFFIX) ssfrk.$(SUFFIX) stfsm.$(SUFFIX) stftri.$(SUFFIX) stfttp.$(SUFFIX) \ ++ stfttr.$(SUFFIX) stpttf.$(SUFFIX) stpttr.$(SUFFIX) strttf.$(SUFFIX) strttp.$(SUFFIX) \ ++ sgejsv.$(SUFFIX) sgesvj.$(SUFFIX) sgsvj0.$(SUFFIX) sgsvj1.$(SUFFIX) \ ++ sgeequb.$(SUFFIX) ssyequb.$(SUFFIX) spoequb.$(SUFFIX) sgbequb.$(SUFFIX) \ ++ sbbcsd.$(SUFFIX) slapmr.$(SUFFIX) sorbdb.$(SUFFIX) sorcsd.$(SUFFIX) \ ++ sgeqrt.$(SUFFIX) sgeqrt2.$(SUFFIX) sgeqrt3.$(SUFFIX) sgemqrt.$(SUFFIX) \ ++ stpqrt.$(SUFFIX) stpqrt2.$(SUFFIX) stpmqrt.$(SUFFIX) stprfb.$(SUFFIX) + +-DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o ++DSLASRC = spotrs.$(SUFFIX) + + ifdef USEXBLAS +-SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ +- sla_gercond.o sla_gerpvgrw.o ssysvxx.o ssyrfsx.o \ +- sla_syrfsx_extended.o sla_syamv.o sla_syrcond.o 
sla_syrpvgrw.o \ +- sposvxx.o sporfsx.o sla_porfsx_extended.o sla_porcond.o \ +- sla_porpvgrw.o sgbsvxx.o sgbrfsx.o sla_gbrfsx_extended.o \ +- sla_gbamv.o sla_gbrcond.o sla_gbrpvgrw.o sla_lin_berr.o slarscl2.o \ +- slascl2.o sla_wwaddw.o ++SXLASRC = sgesvxx.$(SUFFIX) sgerfsx.$(SUFFIX) sla_gerfsx_extended.$(SUFFIX) sla_geamv.$(SUFFIX) \ ++ sla_gercond.$(SUFFIX) sla_gerpvgrw.$(SUFFIX) ssysvxx.$(SUFFIX) ssyrfsx.$(SUFFIX) \ ++ sla_syrfsx_extended.$(SUFFIX) sla_syamv.$(SUFFIX) sla_syrcond.$(SUFFIX) sla_syrpvgrw.$(SUFFIX) \ ++ sposvxx.$(SUFFIX) sporfsx.$(SUFFIX) sla_porfsx_extended.$(SUFFIX) sla_porcond.$(SUFFIX) \ ++ sla_porpvgrw.$(SUFFIX) sgbsvxx.$(SUFFIX) sgbrfsx.$(SUFFIX) sla_gbrfsx_extended.$(SUFFIX) \ ++ sla_gbamv.$(SUFFIX) sla_gbrcond.$(SUFFIX) sla_gbrpvgrw.$(SUFFIX) sla_lin_berr.$(SUFFIX) slarscl2.$(SUFFIX) \ ++ slascl2.$(SUFFIX) sla_wwaddw.$(SUFFIX) + endif + + CLASRC = \ +- cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ +- cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ +- cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ +- cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ +- cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ +- cgeqpf.o cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ +- cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o \ +- cgesvx.o cgetc2.o cgetf2.o cgetri.o \ +- cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ +- cgghrd.o cgglse.o cggqrf.o cggrqf.o \ +- cggsvd.o cggsvp.o \ +- cgtcon.o cgtrfs.o cgtsv.o cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \ +- chbevd.o chbevx.o chbgst.o chbgv.o chbgvd.o chbgvx.o chbtrd.o \ +- checon.o cheev.o cheevd.o cheevr.o cheevx.o chegs2.o chegst.o \ +- chegv.o chegvd.o chegvx.o cherfs.o chesv.o chesvx.o chetd2.o \ +- chetf2.o chetrd.o \ +- chetrf.o chetri.o chetri2.o chetri2x.o cheswapr.o \ +- chetrs.o chetrs2.o chgeqz.o chpcon.o chpev.o chpevd.o \ +- chpevx.o chpgst.o chpgv.o chpgvd.o chpgvx.o chprfs.o chpsv.o \ +- chpsvx.o \ +- 
chptrd.o chptrf.o chptri.o chptrs.o chsein.o chseqr.o clabrd.o \ +- clacgv.o clacon.o clacn2.o clacp2.o clacpy.o clacrm.o clacrt.o cladiv.o \ +- claed0.o claed7.o claed8.o \ +- claein.o claesy.o claev2.o clags2.o clagtm.o \ +- clahef.o clahqr.o \ +- clahrd.o clahr2.o claic1.o clals0.o clalsa.o clalsd.o clangb.o clange.o clangt.o \ +- clanhb.o clanhe.o \ +- clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ +- clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ +- claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ +- claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ +- claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ +- clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ +- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ +- clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ +- claswp.o clasyf.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ +- clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ +- cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ +- cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ +- cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ +- cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ +- crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ +- cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ +- cstegr.o cstein.o csteqr.o \ +- csycon.o csymv.o \ +- csyr.o csyrfs.o csysv.o csysvx.o csytf2.o csytrf.o csytri.o csytri2.o csytri2x.o \ +- csyswapr.o csytrs.o csytrs2.o csyconv.o \ +- ctbcon.o ctbrfs.o ctbtrs.o ctgevc.o ctgex2.o \ +- ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ +- ctprfs.o ctptri.o \ +- ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ +- ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ +- cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ +- cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ +- 
cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ +- cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ +- chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ +- ctfttr.o ctpttf.o ctpttr.o ctrttf.o ctrttp.o \ +- cgeequb.o cgbequb.o csyequb.o cpoequb.o cheequb.o \ +- cbbcsd.o clapmr.o cunbdb.o cuncsd.o \ +- cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ +- ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o ++ cbdsqr.$(SUFFIX) cgbbrd.$(SUFFIX) cgbcon.$(SUFFIX) cgbequ.$(SUFFIX) cgbrfs.$(SUFFIX) cgbsv.$(SUFFIX) cgbsvx.$(SUFFIX) \ ++ cgbtf2.$(SUFFIX) cgbtrf.$(SUFFIX) cgbtrs.$(SUFFIX) cgebak.$(SUFFIX) cgebal.$(SUFFIX) cgebd2.$(SUFFIX) cgebrd.$(SUFFIX) \ ++ cgecon.$(SUFFIX) cgeequ.$(SUFFIX) cgees.$(SUFFIX) cgeesx.$(SUFFIX) cgeev.$(SUFFIX) cgeevx.$(SUFFIX) \ ++ cgegs.$(SUFFIX) cgegv.$(SUFFIX) cgehd2.$(SUFFIX) cgehrd.$(SUFFIX) cgelq2.$(SUFFIX) cgelqf.$(SUFFIX) \ ++ cgels.$(SUFFIX) cgelsd.$(SUFFIX) cgelss.$(SUFFIX) cgelsx.$(SUFFIX) cgelsy.$(SUFFIX) cgeql2.$(SUFFIX) cgeqlf.$(SUFFIX) cgeqp3.$(SUFFIX) \ ++ cgeqpf.$(SUFFIX) cgeqr2.$(SUFFIX) cgeqr2p.$(SUFFIX) cgeqrf.$(SUFFIX) cgeqrfp.$(SUFFIX) cgerfs.$(SUFFIX) \ ++ cgerq2.$(SUFFIX) cgerqf.$(SUFFIX) cgesc2.$(SUFFIX) cgesdd.$(SUFFIX) cgesv.$(SUFFIX) cgesvd.$(SUFFIX) \ ++ cgesvx.$(SUFFIX) cgetc2.$(SUFFIX) cgetri.$(SUFFIX) \ ++ cggbak.$(SUFFIX) cggbal.$(SUFFIX) cgges.$(SUFFIX) cggesx.$(SUFFIX) cggev.$(SUFFIX) cggevx.$(SUFFIX) cggglm.$(SUFFIX) \ ++ cgghrd.$(SUFFIX) cgglse.$(SUFFIX) cggqrf.$(SUFFIX) cggrqf.$(SUFFIX) \ ++ cggsvd.$(SUFFIX) cggsvp.$(SUFFIX) \ ++ cgtcon.$(SUFFIX) cgtrfs.$(SUFFIX) cgtsv.$(SUFFIX) cgtsvx.$(SUFFIX) cgttrf.$(SUFFIX) cgttrs.$(SUFFIX) cgtts2.$(SUFFIX) chbev.$(SUFFIX) \ ++ chbevd.$(SUFFIX) chbevx.$(SUFFIX) chbgst.$(SUFFIX) chbgv.$(SUFFIX) chbgvd.$(SUFFIX) chbgvx.$(SUFFIX) chbtrd.$(SUFFIX) \ ++ checon.$(SUFFIX) cheev.$(SUFFIX) cheevd.$(SUFFIX) cheevr.$(SUFFIX) cheevx.$(SUFFIX) chegs2.$(SUFFIX) chegst.$(SUFFIX) \ ++ chegv.$(SUFFIX) chegvd.$(SUFFIX) chegvx.$(SUFFIX) cherfs.$(SUFFIX) 
chesv.$(SUFFIX) chesvx.$(SUFFIX) chetd2.$(SUFFIX) \ ++ chetf2.$(SUFFIX) chetrd.$(SUFFIX) \ ++ chetrf.$(SUFFIX) chetri.$(SUFFIX) chetri2.$(SUFFIX) chetri2x.$(SUFFIX) cheswapr.$(SUFFIX) \ ++ chetrs.$(SUFFIX) chetrs2.$(SUFFIX) chgeqz.$(SUFFIX) chpcon.$(SUFFIX) chpev.$(SUFFIX) chpevd.$(SUFFIX) \ ++ chpevx.$(SUFFIX) chpgst.$(SUFFIX) chpgv.$(SUFFIX) chpgvd.$(SUFFIX) chpgvx.$(SUFFIX) chprfs.$(SUFFIX) chpsv.$(SUFFIX) \ ++ chpsvx.$(SUFFIX) \ ++ chptrd.$(SUFFIX) chptrf.$(SUFFIX) chptri.$(SUFFIX) chptrs.$(SUFFIX) chsein.$(SUFFIX) chseqr.$(SUFFIX) clabrd.$(SUFFIX) \ ++ clacgv.$(SUFFIX) clacon.$(SUFFIX) clacn2.$(SUFFIX) clacp2.$(SUFFIX) clacpy.$(SUFFIX) clacrm.$(SUFFIX) clacrt.$(SUFFIX) cladiv.$(SUFFIX) \ ++ claed0.$(SUFFIX) claed7.$(SUFFIX) claed8.$(SUFFIX) \ ++ claein.$(SUFFIX) claesy.$(SUFFIX) claev2.$(SUFFIX) clags2.$(SUFFIX) clagtm.$(SUFFIX) \ ++ clahef.$(SUFFIX) clahqr.$(SUFFIX) \ ++ clahrd.$(SUFFIX) clahr2.$(SUFFIX) claic1.$(SUFFIX) clals0.$(SUFFIX) clalsa.$(SUFFIX) clalsd.$(SUFFIX) clangb.$(SUFFIX) clange.$(SUFFIX) clangt.$(SUFFIX) \ ++ clanhb.$(SUFFIX) clanhe.$(SUFFIX) \ ++ clanhp.$(SUFFIX) clanhs.$(SUFFIX) clanht.$(SUFFIX) clansb.$(SUFFIX) clansp.$(SUFFIX) clansy.$(SUFFIX) clantb.$(SUFFIX) \ ++ clantp.$(SUFFIX) clantr.$(SUFFIX) clapll.$(SUFFIX) clapmt.$(SUFFIX) clarcm.$(SUFFIX) claqgb.$(SUFFIX) claqge.$(SUFFIX) \ ++ claqhb.$(SUFFIX) claqhe.$(SUFFIX) claqhp.$(SUFFIX) claqp2.$(SUFFIX) claqps.$(SUFFIX) claqsb.$(SUFFIX) \ ++ claqr0.$(SUFFIX) claqr1.$(SUFFIX) claqr2.$(SUFFIX) claqr3.$(SUFFIX) claqr4.$(SUFFIX) claqr5.$(SUFFIX) \ ++ claqsp.$(SUFFIX) claqsy.$(SUFFIX) clar1v.$(SUFFIX) clar2v.$(SUFFIX) ilaclr.$(SUFFIX) ilaclc.$(SUFFIX) \ ++ clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ ++ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ ++ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ ++ claswp.$(SUFFIX) 
clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ ++ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ ++ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ ++ cppcon.$(SUFFIX) cppequ.$(SUFFIX) cpprfs.$(SUFFIX) cppsv.$(SUFFIX) cppsvx.$(SUFFIX) cpptrf.$(SUFFIX) cpptri.$(SUFFIX) cpptrs.$(SUFFIX) \ ++ cptcon.$(SUFFIX) cpteqr.$(SUFFIX) cptrfs.$(SUFFIX) cptsv.$(SUFFIX) cptsvx.$(SUFFIX) cpttrf.$(SUFFIX) cpttrs.$(SUFFIX) cptts2.$(SUFFIX) \ ++ crot.$(SUFFIX) cspcon.$(SUFFIX) cspmv.$(SUFFIX) cspr.$(SUFFIX) csprfs.$(SUFFIX) cspsv.$(SUFFIX) \ ++ cspsvx.$(SUFFIX) csptrf.$(SUFFIX) csptri.$(SUFFIX) csptrs.$(SUFFIX) csrscl.$(SUFFIX) cstedc.$(SUFFIX) \ ++ cstegr.$(SUFFIX) cstein.$(SUFFIX) csteqr.$(SUFFIX) \ ++ csycon.$(SUFFIX) csymv.$(SUFFIX) \ ++ csyr.$(SUFFIX) csyrfs.$(SUFFIX) csysv.$(SUFFIX) csysvx.$(SUFFIX) csytf2.$(SUFFIX) csytrf.$(SUFFIX) csytri.$(SUFFIX) csytri2.$(SUFFIX) csytri2x.$(SUFFIX) \ ++ csyswapr.$(SUFFIX) csytrs.$(SUFFIX) csytrs2.$(SUFFIX) csyconv.$(SUFFIX) \ ++ ctbcon.$(SUFFIX) ctbrfs.$(SUFFIX) ctbtrs.$(SUFFIX) ctgevc.$(SUFFIX) ctgex2.$(SUFFIX) \ ++ ctgexc.$(SUFFIX) ctgsen.$(SUFFIX) ctgsja.$(SUFFIX) ctgsna.$(SUFFIX) ctgsy2.$(SUFFIX) ctgsyl.$(SUFFIX) ctpcon.$(SUFFIX) \ ++ ctprfs.$(SUFFIX) ctptri.$(SUFFIX) \ ++ ctptrs.$(SUFFIX) ctrcon.$(SUFFIX) ctrevc.$(SUFFIX) ctrexc.$(SUFFIX) ctrrfs.$(SUFFIX) ctrsen.$(SUFFIX) ctrsna.$(SUFFIX) \ ++ ctrsyl.$(SUFFIX) ctrtrs.$(SUFFIX) ctzrqf.$(SUFFIX) ctzrzf.$(SUFFIX) cung2l.$(SUFFIX) cung2r.$(SUFFIX) \ ++ cungbr.$(SUFFIX) cunghr.$(SUFFIX) cungl2.$(SUFFIX) cunglq.$(SUFFIX) cungql.$(SUFFIX) cungqr.$(SUFFIX) cungr2.$(SUFFIX) \ ++ cungrq.$(SUFFIX) cungtr.$(SUFFIX) cunm2l.$(SUFFIX) cunm2r.$(SUFFIX) cunmbr.$(SUFFIX) cunmhr.$(SUFFIX) cunml2.$(SUFFIX) \ ++ 
cunmlq.$(SUFFIX) cunmql.$(SUFFIX) cunmqr.$(SUFFIX) cunmr2.$(SUFFIX) cunmr3.$(SUFFIX) cunmrq.$(SUFFIX) cunmrz.$(SUFFIX) \ ++ cunmtr.$(SUFFIX) cupgtr.$(SUFFIX) cupmtr.$(SUFFIX) icmax1.$(SUFFIX) scsum1.$(SUFFIX) cstemr.$(SUFFIX) \ ++ chfrk.$(SUFFIX) ctfttp.$(SUFFIX) clanhf.$(SUFFIX) cpftrf.$(SUFFIX) cpftri.$(SUFFIX) cpftrs.$(SUFFIX) ctfsm.$(SUFFIX) ctftri.$(SUFFIX) \ ++ ctfttr.$(SUFFIX) ctpttf.$(SUFFIX) ctpttr.$(SUFFIX) ctrttf.$(SUFFIX) ctrttp.$(SUFFIX) \ ++ cgeequb.$(SUFFIX) cgbequb.$(SUFFIX) csyequb.$(SUFFIX) cpoequb.$(SUFFIX) cheequb.$(SUFFIX) \ ++ cbbcsd.$(SUFFIX) clapmr.$(SUFFIX) cunbdb.$(SUFFIX) cuncsd.$(SUFFIX) \ ++ cgeqrt.$(SUFFIX) cgeqrt2.$(SUFFIX) cgeqrt3.$(SUFFIX) cgemqrt.$(SUFFIX) \ ++ ctpqrt.$(SUFFIX) ctpqrt2.$(SUFFIX) ctpmqrt.$(SUFFIX) ctprfb.$(SUFFIX) + + ifdef USEXBLAS +-CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ +- cla_gercond_c.o cla_gercond_x.o cla_gerpvgrw.o \ +- csysvxx.o csyrfsx.o cla_syrfsx_extended.o cla_syamv.o \ +- cla_syrcond_c.o cla_syrcond_x.o cla_syrpvgrw.o \ +- cposvxx.o cporfsx.o cla_porfsx_extended.o \ +- cla_porcond_c.o cla_porcond_x.o cla_porpvgrw.o \ +- cgbsvxx.o cgbrfsx.o cla_gbrfsx_extended.o cla_gbamv.o \ +- cla_gbrcond_c.o cla_gbrcond_x.o cla_gbrpvgrw.o \ +- chesvxx.o cherfsx.o cla_herfsx_extended.o cla_heamv.o \ +- cla_hercond_c.o cla_hercond_x.o cla_herpvgrw.o \ +- cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o ++CXLASRC = cgesvxx.$(SUFFIX) cgerfsx.$(SUFFIX) cla_gerfsx_extended.$(SUFFIX) cla_geamv.$(SUFFIX) \ ++ cla_gercond_c.$(SUFFIX) cla_gercond_x.$(SUFFIX) cla_gerpvgrw.$(SUFFIX) \ ++ csysvxx.$(SUFFIX) csyrfsx.$(SUFFIX) cla_syrfsx_extended.$(SUFFIX) cla_syamv.$(SUFFIX) \ ++ cla_syrcond_c.$(SUFFIX) cla_syrcond_x.$(SUFFIX) cla_syrpvgrw.$(SUFFIX) \ ++ cposvxx.$(SUFFIX) cporfsx.$(SUFFIX) cla_porfsx_extended.$(SUFFIX) \ ++ cla_porcond_c.$(SUFFIX) cla_porcond_x.$(SUFFIX) cla_porpvgrw.$(SUFFIX) \ ++ cgbsvxx.$(SUFFIX) cgbrfsx.$(SUFFIX) cla_gbrfsx_extended.$(SUFFIX) cla_gbamv.$(SUFFIX) \ ++ 
cla_gbrcond_c.$(SUFFIX) cla_gbrcond_x.$(SUFFIX) cla_gbrpvgrw.$(SUFFIX) \ ++ chesvxx.$(SUFFIX) cherfsx.$(SUFFIX) cla_herfsx_extended.$(SUFFIX) cla_heamv.$(SUFFIX) \ ++ cla_hercond_c.$(SUFFIX) cla_hercond_x.$(SUFFIX) cla_herpvgrw.$(SUFFIX) \ ++ cla_lin_berr.$(SUFFIX) clarscl2.$(SUFFIX) clascl2.$(SUFFIX) cla_wwaddw.$(SUFFIX) + endif + +-ZCLASRC = cpotrs.o cgetrs.o cpotrf.o cgetrf.o ++ZCLASRC = cpotrs.$(SUFFIX) + + DLASRC = \ +- dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ +- dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ +- dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ +- dgegs.o dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ +- dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ +- dgeqp3.o dgeqpf.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ +- dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o \ +- dgetc2.o dgetf2.o dgetrf.o dgetri.o \ +- dgetrs.o dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ +- dggglm.o dgghrd.o dgglse.o dggqrf.o \ +- dggrqf.o dggsvd.o dggsvp.o dgtcon.o dgtrfs.o dgtsv.o \ +- dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \ +- dhsein.o dhseqr.o dlabrd.o dlacon.o dlacn2.o \ +- dlaein.o dlaexc.o dlag2.o dlags2.o dlagtm.o dlagv2.o dlahqr.o \ +- dlahrd.o dlahr2.o dlaic1.o dlaln2.o dlals0.o dlalsa.o dlalsd.o \ +- dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ +- dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ +- dlapll.o dlapmt.o \ +- dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ +- dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ +- dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ +- dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ +- dlargv.o dlarrv.o dlartv.o \ +- dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o dlasyf.o \ +- dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ +- dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ +- dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ +- dorgrq.o dorgtr.o dorm2l.o 
dorm2r.o \ +- dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ +- dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ +- dpbstf.o dpbsv.o dpbsvx.o \ +- dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ +- dposvx.o dpotf2.o dpotrf.o dpotri.o dpotrs.o dpstrf.o dpstf2.o \ +- dppcon.o dppequ.o \ +- dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ +- dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ +- dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ +- dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ +- dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ +- dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ +- dstevx.o \ +- dsycon.o dsyev.o dsyevd.o dsyevr.o \ +- dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ +- dsysv.o dsysvx.o \ +- dsytd2.o dsytf2.o dsytrd.o dsytrf.o dsytri.o dsytri2.o dsytri2x.o \ +- dsyswapr.o dsytrs.o dsytrs2.o dsyconv.o \ +- dtbcon.o dtbrfs.o dtbtrs.o dtgevc.o dtgex2.o dtgexc.o dtgsen.o \ +- dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ +- dtptrs.o \ +- dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ +- dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ +- dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ +- dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ +- dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ +- dgejsv.o dgesvj.o dgsvj0.o dgsvj1.o \ +- dgeequb.o dsyequb.o dpoequb.o dgbequb.o \ +- dbbcsd.o dlapmr.o dorbdb.o dorcsd.o \ +- dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ +- dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o ++ dgbbrd.$(SUFFIX) dgbcon.$(SUFFIX) dgbequ.$(SUFFIX) dgbrfs.$(SUFFIX) dgbsv.$(SUFFIX) \ ++ dgbsvx.$(SUFFIX) dgbtf2.$(SUFFIX) dgbtrf.$(SUFFIX) dgbtrs.$(SUFFIX) dgebak.$(SUFFIX) dgebal.$(SUFFIX) dgebd2.$(SUFFIX) \ ++ dgebrd.$(SUFFIX) dgecon.$(SUFFIX) dgeequ.$(SUFFIX) dgees.$(SUFFIX) dgeesx.$(SUFFIX) dgeev.$(SUFFIX) dgeevx.$(SUFFIX) \ ++ 
dgegs.$(SUFFIX) dgegv.$(SUFFIX) dgehd2.$(SUFFIX) dgehrd.$(SUFFIX) dgelq2.$(SUFFIX) dgelqf.$(SUFFIX) \ ++ dgels.$(SUFFIX) dgelsd.$(SUFFIX) dgelss.$(SUFFIX) dgelsx.$(SUFFIX) dgelsy.$(SUFFIX) dgeql2.$(SUFFIX) dgeqlf.$(SUFFIX) \ ++ dgeqp3.$(SUFFIX) dgeqpf.$(SUFFIX) dgeqr2.$(SUFFIX) dgeqr2p.$(SUFFIX) dgeqrf.$(SUFFIX) dgeqrfp.$(SUFFIX) dgerfs.$(SUFFIX) \ ++ dgerq2.$(SUFFIX) dgerqf.$(SUFFIX) dgesc2.$(SUFFIX) dgesdd.$(SUFFIX) dgesv.$(SUFFIX) dgesvd.$(SUFFIX) dgesvx.$(SUFFIX) \ ++ dgetc2.$(SUFFIX) dgetri.$(SUFFIX) \ ++ dggbak.$(SUFFIX) dggbal.$(SUFFIX) dgges.$(SUFFIX) dggesx.$(SUFFIX) dggev.$(SUFFIX) dggevx.$(SUFFIX) \ ++ dggglm.$(SUFFIX) dgghrd.$(SUFFIX) dgglse.$(SUFFIX) dggqrf.$(SUFFIX) \ ++ dggrqf.$(SUFFIX) dggsvd.$(SUFFIX) dggsvp.$(SUFFIX) dgtcon.$(SUFFIX) dgtrfs.$(SUFFIX) dgtsv.$(SUFFIX) \ ++ dgtsvx.$(SUFFIX) dgttrf.$(SUFFIX) dgttrs.$(SUFFIX) dgtts2.$(SUFFIX) dhgeqz.$(SUFFIX) \ ++ dhsein.$(SUFFIX) dhseqr.$(SUFFIX) dlabrd.$(SUFFIX) dlacon.$(SUFFIX) dlacn2.$(SUFFIX) \ ++ dlaein.$(SUFFIX) dlaexc.$(SUFFIX) dlag2.$(SUFFIX) dlags2.$(SUFFIX) dlagtm.$(SUFFIX) dlagv2.$(SUFFIX) dlahqr.$(SUFFIX) \ ++ dlahrd.$(SUFFIX) dlahr2.$(SUFFIX) dlaic1.$(SUFFIX) dlaln2.$(SUFFIX) dlals0.$(SUFFIX) dlalsa.$(SUFFIX) dlalsd.$(SUFFIX) \ ++ dlangb.$(SUFFIX) dlange.$(SUFFIX) dlangt.$(SUFFIX) dlanhs.$(SUFFIX) dlansb.$(SUFFIX) dlansp.$(SUFFIX) \ ++ dlansy.$(SUFFIX) dlantb.$(SUFFIX) dlantp.$(SUFFIX) dlantr.$(SUFFIX) dlanv2.$(SUFFIX) \ ++ dlapll.$(SUFFIX) dlapmt.$(SUFFIX) \ ++ dlaqgb.$(SUFFIX) dlaqge.$(SUFFIX) dlaqp2.$(SUFFIX) dlaqps.$(SUFFIX) dlaqsb.$(SUFFIX) dlaqsp.$(SUFFIX) dlaqsy.$(SUFFIX) \ ++ dlaqr0.$(SUFFIX) dlaqr1.$(SUFFIX) dlaqr2.$(SUFFIX) dlaqr3.$(SUFFIX) dlaqr4.$(SUFFIX) dlaqr5.$(SUFFIX) \ ++ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ ++ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ ++ dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) 
dlaswp.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ ++ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ ++ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ ++ dorgrq.$(SUFFIX) dorgtr.$(SUFFIX) dorm2l.$(SUFFIX) dorm2r.$(SUFFIX) \ ++ dormbr.$(SUFFIX) dormhr.$(SUFFIX) dorml2.$(SUFFIX) dormlq.$(SUFFIX) dormql.$(SUFFIX) dormqr.$(SUFFIX) dormr2.$(SUFFIX) \ ++ dormr3.$(SUFFIX) dormrq.$(SUFFIX) dormrz.$(SUFFIX) dormtr.$(SUFFIX) dpbcon.$(SUFFIX) dpbequ.$(SUFFIX) dpbrfs.$(SUFFIX) \ ++ dpbstf.$(SUFFIX) dpbsv.$(SUFFIX) dpbsvx.$(SUFFIX) \ ++ dpbtf2.$(SUFFIX) dpbtrf.$(SUFFIX) dpbtrs.$(SUFFIX) dpocon.$(SUFFIX) dpoequ.$(SUFFIX) dporfs.$(SUFFIX) dposv.$(SUFFIX) \ ++ dposvx.$(SUFFIX) dpotri.$(SUFFIX) dpotrs.$(SUFFIX) dpstrf.$(SUFFIX) dpstf2.$(SUFFIX) \ ++ dppcon.$(SUFFIX) dppequ.$(SUFFIX) \ ++ dpprfs.$(SUFFIX) dppsv.$(SUFFIX) dppsvx.$(SUFFIX) dpptrf.$(SUFFIX) dpptri.$(SUFFIX) dpptrs.$(SUFFIX) dptcon.$(SUFFIX) \ ++ dpteqr.$(SUFFIX) dptrfs.$(SUFFIX) dptsv.$(SUFFIX) dptsvx.$(SUFFIX) dpttrs.$(SUFFIX) dptts2.$(SUFFIX) drscl.$(SUFFIX) \ ++ dsbev.$(SUFFIX) dsbevd.$(SUFFIX) dsbevx.$(SUFFIX) dsbgst.$(SUFFIX) dsbgv.$(SUFFIX) dsbgvd.$(SUFFIX) dsbgvx.$(SUFFIX) \ ++ dsbtrd.$(SUFFIX) dspcon.$(SUFFIX) dspev.$(SUFFIX) dspevd.$(SUFFIX) dspevx.$(SUFFIX) dspgst.$(SUFFIX) \ ++ dspgv.$(SUFFIX) dspgvd.$(SUFFIX) dspgvx.$(SUFFIX) dsprfs.$(SUFFIX) dspsv.$(SUFFIX) dspsvx.$(SUFFIX) dsptrd.$(SUFFIX) \ ++ dsptrf.$(SUFFIX) dsptri.$(SUFFIX) dsptrs.$(SUFFIX) dstegr.$(SUFFIX) dstein.$(SUFFIX) dstev.$(SUFFIX) dstevd.$(SUFFIX) dstevr.$(SUFFIX) \ ++ dstevx.$(SUFFIX) \ ++ dsycon.$(SUFFIX) dsyev.$(SUFFIX) dsyevd.$(SUFFIX) dsyevr.$(SUFFIX) \ ++ dsyevx.$(SUFFIX) dsygs2.$(SUFFIX) dsygst.$(SUFFIX) dsygv.$(SUFFIX) dsygvd.$(SUFFIX) dsygvx.$(SUFFIX) dsyrfs.$(SUFFIX) \ ++ dsysv.$(SUFFIX) dsysvx.$(SUFFIX) \ ++ 
dsytd2.$(SUFFIX) dsytf2.$(SUFFIX) dsytrd.$(SUFFIX) dsytrf.$(SUFFIX) dsytri.$(SUFFIX) dsytri2.$(SUFFIX) dsytri2x.$(SUFFIX) \ ++ dsyswapr.$(SUFFIX) dsytrs.$(SUFFIX) dsytrs2.$(SUFFIX) dsyconv.$(SUFFIX) \ ++ dtbcon.$(SUFFIX) dtbrfs.$(SUFFIX) dtbtrs.$(SUFFIX) dtgevc.$(SUFFIX) dtgex2.$(SUFFIX) dtgexc.$(SUFFIX) dtgsen.$(SUFFIX) \ ++ dtgsja.$(SUFFIX) dtgsna.$(SUFFIX) dtgsy2.$(SUFFIX) dtgsyl.$(SUFFIX) dtpcon.$(SUFFIX) dtprfs.$(SUFFIX) dtptri.$(SUFFIX) \ ++ dtptrs.$(SUFFIX) \ ++ dtrcon.$(SUFFIX) dtrevc.$(SUFFIX) dtrexc.$(SUFFIX) dtrrfs.$(SUFFIX) dtrsen.$(SUFFIX) dtrsna.$(SUFFIX) dtrsyl.$(SUFFIX) \ ++ dtrtrs.$(SUFFIX) dtzrqf.$(SUFFIX) dtzrzf.$(SUFFIX) dstemr.$(SUFFIX) \ ++ dsgesv.$(SUFFIX) dsposv.$(SUFFIX) dlag2s.$(SUFFIX) slag2d.$(SUFFIX) dlat2s.$(SUFFIX) \ ++ dlansf.$(SUFFIX) dpftrf.$(SUFFIX) dpftri.$(SUFFIX) dpftrs.$(SUFFIX) dsfrk.$(SUFFIX) dtfsm.$(SUFFIX) dtftri.$(SUFFIX) dtfttp.$(SUFFIX) \ ++ dtfttr.$(SUFFIX) dtpttf.$(SUFFIX) dtpttr.$(SUFFIX) dtrttf.$(SUFFIX) dtrttp.$(SUFFIX) \ ++ dgejsv.$(SUFFIX) dgesvj.$(SUFFIX) dgsvj0.$(SUFFIX) dgsvj1.$(SUFFIX) \ ++ dgeequb.$(SUFFIX) dsyequb.$(SUFFIX) dpoequb.$(SUFFIX) dgbequb.$(SUFFIX) \ ++ dbbcsd.$(SUFFIX) dlapmr.$(SUFFIX) dorbdb.$(SUFFIX) dorcsd.$(SUFFIX) \ ++ dgeqrt.$(SUFFIX) dgeqrt2.$(SUFFIX) dgeqrt3.$(SUFFIX) dgemqrt.$(SUFFIX) \ ++ dtpqrt.$(SUFFIX) dtpqrt2.$(SUFFIX) dtpmqrt.$(SUFFIX) dtprfb.$(SUFFIX) + + ifdef USEXBLAS +-DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ +- dla_gercond.o dla_gerpvgrw.o dsysvxx.o dsyrfsx.o \ +- dla_syrfsx_extended.o dla_syamv.o dla_syrcond.o dla_syrpvgrw.o \ +- dposvxx.o dporfsx.o dla_porfsx_extended.o dla_porcond.o \ +- dla_porpvgrw.o dgbsvxx.o dgbrfsx.o dla_gbrfsx_extended.o \ +- dla_gbamv.o dla_gbrcond.o dla_gbrpvgrw.o dla_lin_berr.o dlarscl2.o \ +- dlascl2.o dla_wwaddw.o ++DXLASRC = dgesvxx.$(SUFFIX) dgerfsx.$(SUFFIX) dla_gerfsx_extended.$(SUFFIX) dla_geamv.$(SUFFIX) \ ++ dla_gercond.$(SUFFIX) dla_gerpvgrw.$(SUFFIX) dsysvxx.$(SUFFIX) dsyrfsx.$(SUFFIX) \ ++ 
dla_syrfsx_extended.$(SUFFIX) dla_syamv.$(SUFFIX) dla_syrcond.$(SUFFIX) dla_syrpvgrw.$(SUFFIX) \ ++ dposvxx.$(SUFFIX) dporfsx.$(SUFFIX) dla_porfsx_extended.$(SUFFIX) dla_porcond.$(SUFFIX) \ ++ dla_porpvgrw.$(SUFFIX) dgbsvxx.$(SUFFIX) dgbrfsx.$(SUFFIX) dla_gbrfsx_extended.$(SUFFIX) \ ++ dla_gbamv.$(SUFFIX) dla_gbrcond.$(SUFFIX) dla_gbrpvgrw.$(SUFFIX) dla_lin_berr.$(SUFFIX) dlarscl2.$(SUFFIX) \ ++ dlascl2.$(SUFFIX) dla_wwaddw.$(SUFFIX) + endif + + ZLASRC = \ +- zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ +- zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ +- zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ +- zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ +- zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ +- zgeqpf.o zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ +- zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o zgetf2.o zgetrf.o \ +- zgetri.o zgetrs.o \ +- zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ +- zgghrd.o zgglse.o zggqrf.o zggrqf.o \ +- zggsvd.o zggsvp.o \ +- zgtcon.o zgtrfs.o zgtsv.o zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \ +- zhbevd.o zhbevx.o zhbgst.o zhbgv.o zhbgvd.o zhbgvx.o zhbtrd.o \ +- zhecon.o zheev.o zheevd.o zheevr.o zheevx.o zhegs2.o zhegst.o \ +- zhegv.o zhegvd.o zhegvx.o zherfs.o zhesv.o zhesvx.o zhetd2.o \ +- zhetf2.o zhetrd.o \ +- zhetrf.o zhetri.o zhetri2.o zhetri2x.o zheswapr.o \ +- zhetrs.o zhetrs2.o zhgeqz.o zhpcon.o zhpev.o zhpevd.o \ +- zhpevx.o zhpgst.o zhpgv.o zhpgvd.o zhpgvx.o zhprfs.o zhpsv.o \ +- zhpsvx.o \ +- zhptrd.o zhptrf.o zhptri.o zhptrs.o zhsein.o zhseqr.o zlabrd.o \ +- zlacgv.o zlacon.o zlacn2.o zlacp2.o zlacpy.o zlacrm.o zlacrt.o zladiv.o \ +- zlaed0.o zlaed7.o zlaed8.o \ +- zlaein.o zlaesy.o zlaev2.o zlags2.o zlagtm.o \ +- zlahef.o zlahqr.o \ +- zlahrd.o zlahr2.o zlaic1.o zlals0.o zlalsa.o zlalsd.o zlangb.o zlange.o \ +- zlangt.o zlanhb.o \ +- zlanhe.o \ +- zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o 
zlansy.o zlantb.o \ +- zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ +- zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ +- zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ +- zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ +- zlarcm.o zlarf.o zlarfb.o \ +- zlarfg.o zlarft.o zlarfgp.o \ +- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ +- zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ +- zlassq.o zlaswp.o zlasyf.o \ +- zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ +- zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ +- zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ +- zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ +- zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ +- zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ +- zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ +- zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ +- zstegr.o zstein.o zsteqr.o \ +- zsycon.o zsymv.o \ +- zsyr.o zsyrfs.o zsysv.o zsysvx.o zsytf2.o zsytrf.o zsytri.o zsytri2.o zsytri2x.o \ +- zsyswapr.o zsytrs.o zsytrs2.o zsyconv.o \ +- ztbcon.o ztbrfs.o ztbtrs.o ztgevc.o ztgex2.o \ +- ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ +- ztprfs.o ztptri.o \ +- ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ +- ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ +- zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ +- zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ +- zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ +- zunmtr.o zupgtr.o \ +- zupmtr.o izmax1.o dzsum1.o zstemr.o \ +- zcgesv.o zcposv.o zlag2c.o clag2z.o zlat2c.o \ +- zhfrk.o ztfttp.o zlanhf.o zpftrf.o zpftri.o zpftrs.o ztfsm.o ztftri.o \ +- ztfttr.o ztpttf.o ztpttr.o ztrttf.o ztrttp.o \ +- zgeequb.o zgbequb.o zsyequb.o zpoequb.o zheequb.o \ +- zbbcsd.o 
zlapmr.o zunbdb.o zuncsd.o \ +- zgeqrt.o zgeqrt2.o zgeqrt3.o zgemqrt.o \ +- ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o ++ zbdsqr.$(SUFFIX) zgbbrd.$(SUFFIX) zgbcon.$(SUFFIX) zgbequ.$(SUFFIX) zgbrfs.$(SUFFIX) zgbsv.$(SUFFIX) zgbsvx.$(SUFFIX) \ ++ zgbtf2.$(SUFFIX) zgbtrf.$(SUFFIX) zgbtrs.$(SUFFIX) zgebak.$(SUFFIX) zgebal.$(SUFFIX) zgebd2.$(SUFFIX) zgebrd.$(SUFFIX) \ ++ zgecon.$(SUFFIX) zgeequ.$(SUFFIX) zgees.$(SUFFIX) zgeesx.$(SUFFIX) zgeev.$(SUFFIX) zgeevx.$(SUFFIX) \ ++ zgegs.$(SUFFIX) zgegv.$(SUFFIX) zgehd2.$(SUFFIX) zgehrd.$(SUFFIX) zgelq2.$(SUFFIX) zgelqf.$(SUFFIX) \ ++ zgels.$(SUFFIX) zgelsd.$(SUFFIX) zgelss.$(SUFFIX) zgelsx.$(SUFFIX) zgelsy.$(SUFFIX) zgeql2.$(SUFFIX) zgeqlf.$(SUFFIX) zgeqp3.$(SUFFIX) \ ++ zgeqpf.$(SUFFIX) zgeqr2.$(SUFFIX) zgeqr2p.$(SUFFIX) zgeqrf.$(SUFFIX) zgeqrfp.$(SUFFIX) zgerfs.$(SUFFIX) zgerq2.$(SUFFIX) zgerqf.$(SUFFIX) \ ++ zgesc2.$(SUFFIX) zgesdd.$(SUFFIX) zgesv.$(SUFFIX) zgesvd.$(SUFFIX) zgesvx.$(SUFFIX) zgetc2.$(SUFFIX) \ ++ zgetri.$(SUFFIX) \ ++ zggbak.$(SUFFIX) zggbal.$(SUFFIX) zgges.$(SUFFIX) zggesx.$(SUFFIX) zggev.$(SUFFIX) zggevx.$(SUFFIX) zggglm.$(SUFFIX) \ ++ zgghrd.$(SUFFIX) zgglse.$(SUFFIX) zggqrf.$(SUFFIX) zggrqf.$(SUFFIX) \ ++ zggsvd.$(SUFFIX) zggsvp.$(SUFFIX) \ ++ zgtcon.$(SUFFIX) zgtrfs.$(SUFFIX) zgtsv.$(SUFFIX) zgtsvx.$(SUFFIX) zgttrf.$(SUFFIX) zgttrs.$(SUFFIX) zgtts2.$(SUFFIX) zhbev.$(SUFFIX) \ ++ zhbevd.$(SUFFIX) zhbevx.$(SUFFIX) zhbgst.$(SUFFIX) zhbgv.$(SUFFIX) zhbgvd.$(SUFFIX) zhbgvx.$(SUFFIX) zhbtrd.$(SUFFIX) \ ++ zhecon.$(SUFFIX) zheev.$(SUFFIX) zheevd.$(SUFFIX) zheevr.$(SUFFIX) zheevx.$(SUFFIX) zhegs2.$(SUFFIX) zhegst.$(SUFFIX) \ ++ zhegv.$(SUFFIX) zhegvd.$(SUFFIX) zhegvx.$(SUFFIX) zherfs.$(SUFFIX) zhesv.$(SUFFIX) zhesvx.$(SUFFIX) zhetd2.$(SUFFIX) \ ++ zhetf2.$(SUFFIX) zhetrd.$(SUFFIX) \ ++ zhetrf.$(SUFFIX) zhetri.$(SUFFIX) zhetri2.$(SUFFIX) zhetri2x.$(SUFFIX) zheswapr.$(SUFFIX) \ ++ zhetrs.$(SUFFIX) zhetrs2.$(SUFFIX) zhgeqz.$(SUFFIX) zhpcon.$(SUFFIX) zhpev.$(SUFFIX) zhpevd.$(SUFFIX) \ ++ zhpevx.$(SUFFIX) 
zhpgst.$(SUFFIX) zhpgv.$(SUFFIX) zhpgvd.$(SUFFIX) zhpgvx.$(SUFFIX) zhprfs.$(SUFFIX) zhpsv.$(SUFFIX) \ ++ zhpsvx.$(SUFFIX) \ ++ zhptrd.$(SUFFIX) zhptrf.$(SUFFIX) zhptri.$(SUFFIX) zhptrs.$(SUFFIX) zhsein.$(SUFFIX) zhseqr.$(SUFFIX) zlabrd.$(SUFFIX) \ ++ zlacgv.$(SUFFIX) zlacon.$(SUFFIX) zlacn2.$(SUFFIX) zlacp2.$(SUFFIX) zlacpy.$(SUFFIX) zlacrm.$(SUFFIX) zlacrt.$(SUFFIX) zladiv.$(SUFFIX) \ ++ zlaed0.$(SUFFIX) zlaed7.$(SUFFIX) zlaed8.$(SUFFIX) \ ++ zlaein.$(SUFFIX) zlaesy.$(SUFFIX) zlaev2.$(SUFFIX) zlags2.$(SUFFIX) zlagtm.$(SUFFIX) \ ++ zlahef.$(SUFFIX) zlahqr.$(SUFFIX) \ ++ zlahrd.$(SUFFIX) zlahr2.$(SUFFIX) zlaic1.$(SUFFIX) zlals0.$(SUFFIX) zlalsa.$(SUFFIX) zlalsd.$(SUFFIX) zlangb.$(SUFFIX) zlange.$(SUFFIX) \ ++ zlangt.$(SUFFIX) zlanhb.$(SUFFIX) \ ++ zlanhe.$(SUFFIX) \ ++ zlanhp.$(SUFFIX) zlanhs.$(SUFFIX) zlanht.$(SUFFIX) zlansb.$(SUFFIX) zlansp.$(SUFFIX) zlansy.$(SUFFIX) zlantb.$(SUFFIX) \ ++ zlantp.$(SUFFIX) zlantr.$(SUFFIX) zlapll.$(SUFFIX) zlapmt.$(SUFFIX) zlaqgb.$(SUFFIX) zlaqge.$(SUFFIX) \ ++ zlaqhb.$(SUFFIX) zlaqhe.$(SUFFIX) zlaqhp.$(SUFFIX) zlaqp2.$(SUFFIX) zlaqps.$(SUFFIX) zlaqsb.$(SUFFIX) \ ++ zlaqr0.$(SUFFIX) zlaqr1.$(SUFFIX) zlaqr2.$(SUFFIX) zlaqr3.$(SUFFIX) zlaqr4.$(SUFFIX) zlaqr5.$(SUFFIX) \ ++ zlaqsp.$(SUFFIX) zlaqsy.$(SUFFIX) zlar1v.$(SUFFIX) zlar2v.$(SUFFIX) ilazlr.$(SUFFIX) ilazlc.$(SUFFIX) \ ++ zlarcm.$(SUFFIX) zlarf.$(SUFFIX) zlarfb.$(SUFFIX) \ ++ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ ++ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ ++ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlaswp.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ ++ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ ++ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) 
zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ ++ zposv.$(SUFFIX) zposvx.$(SUFFIX) zpotri.$(SUFFIX) zpotrs.$(SUFFIX) zpstrf.$(SUFFIX) zpstf2.$(SUFFIX) \ ++ zppcon.$(SUFFIX) zppequ.$(SUFFIX) zpprfs.$(SUFFIX) zppsv.$(SUFFIX) zppsvx.$(SUFFIX) zpptrf.$(SUFFIX) zpptri.$(SUFFIX) zpptrs.$(SUFFIX) \ ++ zptcon.$(SUFFIX) zpteqr.$(SUFFIX) zptrfs.$(SUFFIX) zptsv.$(SUFFIX) zptsvx.$(SUFFIX) zpttrf.$(SUFFIX) zpttrs.$(SUFFIX) zptts2.$(SUFFIX) \ ++ zrot.$(SUFFIX) zspcon.$(SUFFIX) zspmv.$(SUFFIX) zspr.$(SUFFIX) zsprfs.$(SUFFIX) zspsv.$(SUFFIX) \ ++ zspsvx.$(SUFFIX) zsptrf.$(SUFFIX) zsptri.$(SUFFIX) zsptrs.$(SUFFIX) zdrscl.$(SUFFIX) zstedc.$(SUFFIX) \ ++ zstegr.$(SUFFIX) zstein.$(SUFFIX) zsteqr.$(SUFFIX) \ ++ zsycon.$(SUFFIX) zsymv.$(SUFFIX) \ ++ zsyr.$(SUFFIX) zsyrfs.$(SUFFIX) zsysv.$(SUFFIX) zsysvx.$(SUFFIX) zsytf2.$(SUFFIX) zsytrf.$(SUFFIX) zsytri.$(SUFFIX) zsytri2.$(SUFFIX) zsytri2x.$(SUFFIX) \ ++ zsyswapr.$(SUFFIX) zsytrs.$(SUFFIX) zsytrs2.$(SUFFIX) zsyconv.$(SUFFIX) \ ++ ztbcon.$(SUFFIX) ztbrfs.$(SUFFIX) ztbtrs.$(SUFFIX) ztgevc.$(SUFFIX) ztgex2.$(SUFFIX) \ ++ ztgexc.$(SUFFIX) ztgsen.$(SUFFIX) ztgsja.$(SUFFIX) ztgsna.$(SUFFIX) ztgsy2.$(SUFFIX) ztgsyl.$(SUFFIX) ztpcon.$(SUFFIX) \ ++ ztprfs.$(SUFFIX) ztptri.$(SUFFIX) \ ++ ztptrs.$(SUFFIX) ztrcon.$(SUFFIX) ztrevc.$(SUFFIX) ztrexc.$(SUFFIX) ztrrfs.$(SUFFIX) ztrsen.$(SUFFIX) ztrsna.$(SUFFIX) \ ++ ztrsyl.$(SUFFIX) ztrtrs.$(SUFFIX) ztzrqf.$(SUFFIX) ztzrzf.$(SUFFIX) zung2l.$(SUFFIX) \ ++ zung2r.$(SUFFIX) zungbr.$(SUFFIX) zunghr.$(SUFFIX) zungl2.$(SUFFIX) zunglq.$(SUFFIX) zungql.$(SUFFIX) zungqr.$(SUFFIX) zungr2.$(SUFFIX) \ ++ zungrq.$(SUFFIX) zungtr.$(SUFFIX) zunm2l.$(SUFFIX) zunm2r.$(SUFFIX) zunmbr.$(SUFFIX) zunmhr.$(SUFFIX) zunml2.$(SUFFIX) \ ++ zunmlq.$(SUFFIX) zunmql.$(SUFFIX) zunmqr.$(SUFFIX) zunmr2.$(SUFFIX) zunmr3.$(SUFFIX) zunmrq.$(SUFFIX) zunmrz.$(SUFFIX) \ ++ zunmtr.$(SUFFIX) zupgtr.$(SUFFIX) \ ++ zupmtr.$(SUFFIX) izmax1.$(SUFFIX) dzsum1.$(SUFFIX) zstemr.$(SUFFIX) \ ++ zcgesv.$(SUFFIX) zcposv.$(SUFFIX) 
zlag2c.$(SUFFIX) clag2z.$(SUFFIX) zlat2c.$(SUFFIX) \ ++ zhfrk.$(SUFFIX) ztfttp.$(SUFFIX) zlanhf.$(SUFFIX) zpftrf.$(SUFFIX) zpftri.$(SUFFIX) zpftrs.$(SUFFIX) ztfsm.$(SUFFIX) ztftri.$(SUFFIX) \ ++ ztfttr.$(SUFFIX) ztpttf.$(SUFFIX) ztpttr.$(SUFFIX) ztrttf.$(SUFFIX) ztrttp.$(SUFFIX) \ ++ zgeequb.$(SUFFIX) zgbequb.$(SUFFIX) zsyequb.$(SUFFIX) zpoequb.$(SUFFIX) zheequb.$(SUFFIX) \ ++ zbbcsd.$(SUFFIX) zlapmr.$(SUFFIX) zunbdb.$(SUFFIX) zuncsd.$(SUFFIX) \ ++ zgeqrt.$(SUFFIX) zgeqrt2.$(SUFFIX) zgeqrt3.$(SUFFIX) zgemqrt.$(SUFFIX) \ ++ ztpqrt.$(SUFFIX) ztpqrt2.$(SUFFIX) ztpmqrt.$(SUFFIX) ztprfb.$(SUFFIX) + + ifdef USEXBLAS +-ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ +- zla_gercond_c.o zla_gercond_x.o zla_gerpvgrw.o zsysvxx.o zsyrfsx.o \ +- zla_syrfsx_extended.o zla_syamv.o zla_syrcond_c.o zla_syrcond_x.o \ +- zla_syrpvgrw.o zposvxx.o zporfsx.o zla_porfsx_extended.o \ +- zla_porcond_c.o zla_porcond_x.o zla_porpvgrw.o zgbsvxx.o zgbrfsx.o \ +- zla_gbrfsx_extended.o zla_gbamv.o zla_gbrcond_c.o zla_gbrcond_x.o \ +- zla_gbrpvgrw.o zhesvxx.o zherfsx.o zla_herfsx_extended.o \ +- zla_heamv.o zla_hercond_c.o zla_hercond_x.o zla_herpvgrw.o \ +- zla_lin_berr.o zlarscl2.o zlascl2.o zla_wwaddw.o ++ZXLASRC = zgesvxx.$(SUFFIX) zgerfsx.$(SUFFIX) zla_gerfsx_extended.$(SUFFIX) zla_geamv.$(SUFFIX) \ ++ zla_gercond_c.$(SUFFIX) zla_gercond_x.$(SUFFIX) zla_gerpvgrw.$(SUFFIX) zsysvxx.$(SUFFIX) zsyrfsx.$(SUFFIX) \ ++ zla_syrfsx_extended.$(SUFFIX) zla_syamv.$(SUFFIX) zla_syrcond_c.$(SUFFIX) zla_syrcond_x.$(SUFFIX) \ ++ zla_syrpvgrw.$(SUFFIX) zposvxx.$(SUFFIX) zporfsx.$(SUFFIX) zla_porfsx_extended.$(SUFFIX) \ ++ zla_porcond_c.$(SUFFIX) zla_porcond_x.$(SUFFIX) zla_porpvgrw.$(SUFFIX) zgbsvxx.$(SUFFIX) zgbrfsx.$(SUFFIX) \ ++ zla_gbrfsx_extended.$(SUFFIX) zla_gbamv.$(SUFFIX) zla_gbrcond_c.$(SUFFIX) zla_gbrcond_x.$(SUFFIX) \ ++ zla_gbrpvgrw.$(SUFFIX) zhesvxx.$(SUFFIX) zherfsx.$(SUFFIX) zla_herfsx_extended.$(SUFFIX) \ ++ zla_heamv.$(SUFFIX) zla_hercond_c.$(SUFFIX) 
zla_hercond_x.$(SUFFIX) zla_herpvgrw.$(SUFFIX) \ ++ zla_lin_berr.$(SUFFIX) zlarscl2.$(SUFFIX) zlascl2.$(SUFFIX) zla_wwaddw.$(SUFFIX) + endif + + ALLOBJ = $(SLASRC) $(DLASRC) $(DSLASRC) $(CLASRC) $(ZLASRC) $(ZCLASRC) \ + $(SCLAUX) $(DZLAUX) $(ALLAUX) + ++ALLOBJ_P = $(ALLOBJ:.$(SUFFIX)=.$(PSUFFIX)) ++ + ifdef USEXBLAS + ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) + endif + + all: ../$(LAPACKLIB) + ++lapack_prof: ../$(LAPACKLIB_P) ++ + ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) + $(RANLIB) $@ + ++../$(LAPACKLIB_P): $(ALLOBJ_P) ++ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P) ++ $(RANLIB) $@ ++ + single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) +@@ -451,15 +459,24 @@ + @FRC=$(FRC) + + clean: +- rm -f *.o ++ rm -f *.$(SUFFIX) *.$(PSUFFIX) + +-.f.o: ++%.$(SUFFIX): %.f + $(FORTRAN) $(OPTS) -c $< -o $@ + +-slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ ++%.$(PSUFFIX): %.f ++ $(FORTRAN) $(POPTS) -c $< -o $@ + ++slaruv.$(SUFFIX): slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++dlaruv.$(SUFFIX): dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++sla_wwaddw.$(SUFFIX): sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++dla_wwaddw.$(SUFFIX): dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++cla_wwaddw.$(SUFFIX): cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++zla_wwaddw.$(SUFFIX): zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++ ++slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++sla_wwaddw.$(PSUFFIX): sla_wwaddw.f 
; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++dla_wwaddw.$(PSUFFIX): dla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++cla_wwaddw.$(PSUFFIX): cla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++zla_wwaddw.$(PSUFFIX): zla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ +diff -ruN lapack-3.4.1.old/TESTING/EIG/Makefile lapack-3.4.1/TESTING/EIG/Makefile +--- lapack-3.4.1.old/TESTING/EIG/Makefile 2011-09-26 23:52:31 +0200 ++++ lapack-3.4.1/TESTING/EIG/Makefile 2012-04-22 21:41:45 +0200 +@@ -78,7 +78,7 @@ + cget35.o cget36.o cget37.o cget38.o cget51.o cget52.o \ + cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts.o \ + chbt21.o chet21.o chet22.o chpt21.o chst01.o \ +- clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ ++ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o \ + csgt01.o cslect.o \ + cstt21.o cstt22.o cunt01.o cunt03.o + +@@ -115,7 +115,7 @@ + zget35.o zget36.o zget37.o zget38.o zget51.o zget52.o \ + zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts.o \ + zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ +- zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ ++ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o \ + zsgt01.o zslect.o \ + zstt21.o zstt22.o zunt01.o zunt03.o + +@@ -129,22 +129,22 @@ + ../xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtsts \ + $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtsts $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtsts $@ + + ../xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtstc \ + $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstc $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstc $@ + + ../xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtstd \ + $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv 
xeigtstd $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstd $@ + + ../xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtstz \ + $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstz $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstz $@ + + $(AEIGTST): $(FRC) + $(SCIGTST): $(FRC) +diff -ruN lapack-3.4.1.old/TESTING/LIN/Makefile lapack-3.4.1/TESTING/LIN/Makefile +--- lapack-3.4.1.old/TESTING/LIN/Makefile 2012-04-02 21:06:36 +0200 ++++ lapack-3.4.1/TESTING/LIN/Makefile 2012-04-22 21:43:30 +0200 +@@ -109,7 +109,7 @@ + cqpt01.o cqrt01.o cqrt01p.o cqrt02.o cqrt03.o cqrt11.o \ + cqrt12.o cqrt13.o cqrt14.o cqrt15.o cqrt16.o \ + cqrt17.o crqt01.o crqt02.o crqt03.o crzt01.o crzt02.o \ +- csbmv.o cspt01.o \ ++ cspt01.o \ + cspt02.o cspt03.o csyt01.o csyt02.o csyt03.o \ + ctbt02.o ctbt03.o ctbt05.o ctbt06.o ctpt01.o \ + ctpt02.o ctpt03.o ctpt05.o ctpt06.o ctrt01.o \ +@@ -188,7 +188,7 @@ + zqpt01.o zqrt01.o zqrt01p.o zqrt02.o zqrt03.o zqrt11.o \ + zqrt12.o zqrt13.o zqrt14.o zqrt15.o zqrt16.o \ + zqrt17.o zrqt01.o zrqt02.o zrqt03.o zrzt01.o zrzt02.o \ +- zsbmv.o zspt01.o \ ++ zspt01.o \ + zspt02.o zspt03.o zsyt01.o zsyt02.o zsyt03.o \ + ztbt02.o ztbt03.o ztbt05.o ztbt06.o ztpt01.o \ + ztpt02.o ztpt03.o ztpt05.o ztpt06.o ztrt01.o \ +@@ -214,7 +214,7 @@ + zdrvab.o zdrvac.o zerrab.o zerrac.o zget08.o \ + alaerh.o alahd.o aladhd.o alareq.o \ + chkxer.o zget02.o zlarhs.o zlatb4.o \ +- zsbmv.o xerbla.o zpot06.o zlaipd.o ++ xerbla.o zpot06.o zlaipd.o + + SLINTSTRFP = schkrfp.o sdrvrfp.o sdrvrf1.o sdrvrf2.o sdrvrf3.o sdrvrf4.o serrrfp.o \ + slatb4.o slarhs.o sget04.o spot01.o spot03.o spot02.o \ +@@ -225,11 +225,11 @@ + chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o + + CLINTSTRFP = cchkrfp.o cdrvrfp.o cdrvrf1.o cdrvrf2.o cdrvrf3.o cdrvrf4.o cerrrfp.o \ +- claipd.o clatb4.o clarhs.o csbmv.o cget04.o cpot01.o cpot03.o cpot02.o \ ++ claipd.o clatb4.o clarhs.o 
cget04.o cpot01.o cpot03.o cpot02.o \ + chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o + + ZLINTSTRFP = zchkrfp.o zdrvrfp.o zdrvrf1.o zdrvrf2.o zdrvrf3.o zdrvrf4.o zerrrfp.o \ +- zlatb4.o zlaipd.o zlarhs.o zsbmv.o zget04.o zpot01.o zpot03.o zpot02.o \ ++ zlatb4.o zlaipd.o zlarhs.o zget04.o zpot01.o zpot03.o zpot02.o \ + chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o + + all: single double complex complex16 proto-single proto-double proto-complex proto-complex16 +@@ -246,43 +246,43 @@ + + xlintsts : $(ALINTST) $(SLINTST) $(SCLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(SLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstc : $(ALINTST) $(CLINTST) $(SCLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(CLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstd : $(ALINTST) $(DLINTST) $(DZLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $^ \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstz : $(ALINTST) $(ZLINTST) $(DZLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(ZLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstds : $(DSLINTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(DSLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstzc : $(ZCLINTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ZCLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfs : 
$(SLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(SLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfd : $(DLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(DLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfc : $(CLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(CLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfz : $(ZLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ZLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintsts: xlintsts + mv xlintsts $@ +diff -ruN lapack-3.4.1.old/lapacke/src/Makefile lapack-3.4.1/lapacke/src/Makefile +--- lapack-3.4.1.old/lapacke/src/Makefile 2012-04-02 22:16:32 +0200 ++++ lapack-3.4.1/lapacke/src/Makefile 2012-04-22 21:38:38 +0200 +@@ -2040,19 +2040,21 @@ + lapacke_zlagsy.o \ + lapacke_zlagsy_work.o + +-ALLOBJ = $(SRC_OBJ) $(MATGEN_OBJ) ++OBJ_FILES := $(SRC_OBJ) + +-ifdef USEXBLAS +-ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) ++ifdef LAPACKE_EXTENDED ++OBJ_FILES += $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) + endif + +- +-OBJ_FILES := $(C_FILES:.o=.o) ++ifdef LAPACKE_TESTING ++OBJ_FILES += $(MATGEN_OBJ) ++endif + + all: ../../$(LAPACKELIB) + +-../../$(LAPACKELIB): $(ALLOBJ) $(ALLXOBJ) +- $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJ) $(ALLXOBJ) ++../../$(LAPACKELIB): $(OBJ_FILES) ++# http://hackage.haskell.org/trac/gtk2hs/ticket/1146 ++ echo $(OBJ_FILES) | xargs --max-args=100 $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) + $(RANLIB) ../../$(LAPACKELIB) + + .c.o: From 001c2c322b0cdbed6befd09d4e747c2be6392c52 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 23 Apr 2012 17:38:54 +0800 Subject: [PATCH 
010/162] Refs #94. Auto-detecting Intel Xeon E7 Westmere-EX. --- cpuid_x86.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index e7aa07b44..7b36fdbdf 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -990,6 +990,9 @@ int get_cpuname(void){ case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) return CPUTYPE_NEHALEM; + case 15: + //Xeon Processor E7 (Westmere-EX) + return CPUTYPE_NEHALEM; } break; } @@ -1325,6 +1328,9 @@ int get_coretype(void){ case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) return CORE_NEHALEM; + case 15: + //Xeon Processor E7 (Westmere-EX) + return CORE_NEHALEM; } break; } From f93318a6c818c969a321bc47d1c8fad06d60429b Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 22 Apr 2012 21:16:03 +0200 Subject: [PATCH 011/162] Refs #95 cblas: compatibility for compilers without C99 complex number support (e.g. Visual Studio) --- Makefile.install | 2 +- cblas.h | 18 +++++++++--------- common.h | 22 ++++++++++++++++++++++ openblas_config_template.h | 31 +++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/Makefile.install b/Makefile.install index 6ecfd91ed..62ceda986 100644 --- a/Makefile.install +++ b/Makefile.install @@ -23,7 +23,7 @@ install : lib.grd @cat config_last.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat openblas_config_template.h >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo Generating f77blas.h in $(OPENBLAS_INCLUDE_DIR) @echo \#ifndef OPENBLAS_F77BLAS_H > $(OPENBLAS_INCLUDE_DIR)/f77blas.h diff --git a/cblas.h b/cblas.h index 34adc5e99..f3708a994 100644 --- a/cblas.h +++ b/cblas.h @@ -22,15 +22,15 @@ double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); float cblas_sdot(blasint n, 
float *x, blasint incx, float *y, blasint incy); double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); -float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); +openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); float cblas_sasum (blasint n, float *x, blasint incx); double cblas_dasum (blasint n, double *x, blasint incx); diff --git a/common.h b/common.h index e848f33f3..c6d30ddcf 100644 --- a/common.h +++ b/common.h @@ -374,6 +374,28 @@ typedef int blasint; 
#endif #endif +#ifndef ASSEMBLER +#ifndef NOINCLUDE +/* Inclusion of a standard header file is needed for definition of __STDC_* + predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs + as a side effect of including either <features.h> or <stdc-predef.h>. */ +#include <stdio.h> +#endif // NOINCLUDE + +/* C99 supports complex floating numbers natively, which GCC also offers as an + extension since version 3.0. If neither are available, use a compatible + structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ +#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 + #define OPENBLAS_COMPLEX_C99 + typedef float _Complex openblas_complex_float; + typedef double _Complex openblas_complex_double; +#else + #define OPENBLAS_COMPLEX_STRUCT + typedef struct { float real, imag; } openblas_complex_float; + typedef struct { double real, imag; } openblas_complex_double; +#endif +#endif // ASSEMBLER + #ifndef IFLUSH #define IFLUSH #endif diff --git a/openblas_config_template.h b/openblas_config_template.h index 8bf972593..caeccf026 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -39,3 +39,34 @@ typedef int blasint; #define FLOATRET float #endif #endif + +/* Inclusion of a standard header file is needed for definition of __STDC_* + predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs + as a side effect of including either <features.h> or <stdc-predef.h>. */ +#include <stdio.h> + +/* C99 supports complex floating numbers natively, which GCC also offers as an + extension since version 3.0. If neither are available, use a compatible + structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ +#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 + #define OPENBLAS_COMPLEX_C99 + #include <complex.h> + typedef float _Complex openblas_complex_float; + typedef double _Complex openblas_complex_double; + #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_complex_float_real(z) (creal(z)) + #define openblas_complex_float_imag(z) (cimag(z)) + #define openblas_complex_double_real(z) (creal(z)) + #define openblas_complex_double_imag(z) (cimag(z)) +#else + #define OPENBLAS_COMPLEX_STRUCT + typedef struct { float real, imag; } openblas_complex_float; + typedef struct { double real, imag; } openblas_complex_double; + #define openblas_make_complex_float(real, imag) {(real), (imag)} + #define openblas_make_complex_double(real, imag) {(real), (imag)} + #define openblas_complex_float_real(z) ((z).real) + #define openblas_complex_float_imag(z) ((z).imag) + #define openblas_complex_double_real(z) ((z).real) + #define openblas_complex_double_imag(z) ((z).imag) +#endif From 9037782a9c42ec1cfd6b39233424bd74f6830bfe Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 24 Apr 2012 12:03:08 +0800 Subject: [PATCH 012/162] Fixed the LAPACKE building bug on Mac OSX. 
--- patch.for_lapack-3.4.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patch.for_lapack-3.4.1 b/patch.for_lapack-3.4.1 index e06129202..79c74aad2 100644 --- a/patch.for_lapack-3.4.1 +++ b/patch.for_lapack-3.4.1 @@ -926,7 +926,7 @@ diff -ruN lapack-3.4.1.old/lapacke/src/Makefile lapack-3.4.1/lapacke/src/Makefil - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJ) $(ALLXOBJ) +../../$(LAPACKELIB): $(OBJ_FILES) +# http://hackage.haskell.org/trac/gtk2hs/ticket/1146 -+ echo $(OBJ_FILES) | xargs --max-args=100 $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) ++ echo $(OBJ_FILES) | xargs -n 100 $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(RANLIB) ../../$(LAPACKELIB) .c.o: From b2bdb6f7c4901aa63f65dfd74448cdca40faa7a4 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 26 Apr 2012 15:39:03 +0800 Subject: [PATCH 013/162] Automatically download CUnit 2.1.2-2 version from SF.net. --- utest/Makefile | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/utest/Makefile b/utest/Makefile index e7c5f3412..ee9aa6286 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -3,20 +3,50 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system TARGET=openblas_utest -CUNIT_LIB=/usr/local/lib/libcunit.a + +CUNIT_URL=http://downloads.sourceforge.net/project/cunit/CUnit/2.1-2/CUnit-2.1-2-src.tar.bz2 +CUNIT_DIR=$(CURDIR)/CUnit-2.1-2 + +CUNIT_LIB=$(CUNIT_DIR)/lib/libcunit.a + +CFLAGS+=-I$(CUNIT_DIR)/include OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o all : run_test -$(TARGET): $(OBJS) - $(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) +CUnit-2.1-2-src.tar.bz2: +ifeq ($(OSNAME), Darwin) + curl -O $(CUNIT_URL) +else + wget $(CUNIT_URL) +endif + +$(CUNIT_DIR): CUnit-2.1-2-src.tar.bz2 + @if test `$(MD5SUM) CUnit-2.1-2-src.tar.bz2 | $(AWK) '{print $$1}'` = 31c62bd7a65007737ba28b7aafc44d3a; then \ + echo $(TAR) xjf $< ;\ + $(TAR) xjf $< ; \ + else \ + rm -rf $(CUNIT_DIR) ;\ + echo " Cannot download CUnit-2.1-2-src.tar.bz2 or the MD5 check sum is wrong (Please use orignal)."; \ + exit 1; \ + fi + + +$(CUNIT_LIB): $(CUNIT_DIR) + (cd $(CUNIT_DIR); CC=$(CC) CFLAGS="$(CFLAGS)" ./configure --prefix=$(CUNIT_DIR)) + $(MAKE) -C $(CUNIT_DIR) + $(MAKE) -C $(CUNIT_DIR) install + +$(TARGET): $(CUNIT_LIB) $(OBJS) + $(CC) $(CFLAGS) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB) run_test: $(TARGET) ./$(TARGET) clean: - rm -f *.o $(TARGET) + -rm -f *.o $(TARGET) + -rm -rf $(CUNIT_DIR) libs: From 08570c42485c3a62ca3d344406d75047ba4ee296 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 26 Apr 2012 15:54:15 +0800 Subject: [PATCH 014/162] Fixed the utest bug for drotmg. 
--- utest/test_rotmg.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index e51e6b299..9a1a3d084 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -38,12 +38,18 @@ void test_drotmg() double te_d2, tr_d2; double te_x1, tr_x1; double te_y1, tr_y1; - double te_param[5],tr_param[5]; + double te_param[5]; + double tr_param[5]; int i=0; te_d1= tr_d1=0.21149573940783739; te_d2= tr_d2=0.046892057172954082; te_x1= tr_x1=-0.42272687517106533; te_y1= tr_y1=0.42211309121921659; + + for(i=0; i<5; i++){ + te_param[i]=tr_param[i]=0.0; + } + //OpenBLAS BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); //reference From 5656cca4f320f72e3e711118d420721c1821f3a1 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 26 Apr 2012 16:17:17 +0800 Subject: [PATCH 015/162] Added the test case for samax. --- common_reference.h | 2 ++ utest/Makefile | 2 +- utest/common_utest.h | 2 ++ utest/main.c | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common_reference.h b/common_reference.h index 4cc4be4fd..be151e0d6 100644 --- a/common_reference.h +++ b/common_reference.h @@ -63,5 +63,7 @@ double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *); double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*); + +FLOATRET BLASFUNC_REF(samax) (blasint *, float *, blasint *); #endif diff --git a/utest/Makefile b/utest/Makefile index ee9aa6286..3d120f5b3 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -11,7 +11,7 @@ CUNIT_LIB=$(CUNIT_DIR)/lib/libcunit.a CFLAGS+=-I$(CUNIT_DIR)/include -OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o +OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_amax.o all : run_test diff --git a/utest/common_utest.h b/utest/common_utest.h index 1332ef6ab..e57ae0556 100644 --- 
a/utest/common_utest.h +++ b/utest/common_utest.h @@ -61,4 +61,6 @@ void test_drotmg(void); void test_dsdot_n_1(void); +void test_samax(void); + #endif diff --git a/utest/main.c b/utest/main.c index 135709507..ece94dd71 100644 --- a/utest/main.c +++ b/utest/main.c @@ -58,6 +58,8 @@ CU_TestInfo test_level1[]={ {"Testing drotmg",test_drotmg}, {"Testing dsdot with n == 1",test_dsdot_n_1}, + + {"Testing samax", test_samax}, CU_TEST_INFO_NULL, }; From 14428af879c0e589506f75cd13b57573d48126a7 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 26 Apr 2012 16:40:44 +0800 Subject: [PATCH 016/162] Added the missing test_amax.c file. --- utest/test_amax.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 utest/test_amax.c diff --git a/utest/test_amax.c b/utest/test_amax.c new file mode 100644 index 000000000..8d163853a --- /dev/null +++ b/utest/test_amax.c @@ -0,0 +1,46 @@ +/***************************************************************************** +Copyright (c) 2011-2012, Lab of Parallel Software and Computational Science,ISCAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common_utest.h" + +void test_samax() +{ + int N=3, inc=1; + float te_max=0.0, tr_max=0.0; + float x[]={-1.1, 2.2, -3.3}; + + te_max=BLASFUNC(samax)(&N, x, &inc); + + tr_max=BLASFUNC_REF(samax)(&N, x, &inc); + + CU_ASSERT_DOUBLE_EQUAL(te_max, tr_max, CHECK_EPS); +} From 5d657c6e67c1588aad4c9f44633df90acc4a4973 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 26 Apr 2012 16:50:57 +0800 Subject: [PATCH 017/162] Fixed #96 a SEGFAULT bug in samax on x86. --- kernel/x86/amax_sse.S | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/x86/amax_sse.S b/kernel/x86/amax_sse.S index 65792cf45..05d21a7eb 100644 --- a/kernel/x86/amax_sse.S +++ b/kernel/x86/amax_sse.S @@ -495,7 +495,6 @@ ALIGN_4 .L999: - RESTOREREGISTERS subl $8, %esp movss %xmm0, (%esp) From 006200c9a4310abd9f2d53d5974de942dea16ba4 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 27 Apr 2012 09:55:21 +0800 Subject: [PATCH 018/162] Fixed #98 updated MD5 for new LAPACK 3.4.1 version on netlib.org. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index efc8b33d8..8d78a844b 100644 --- a/Makefile +++ b/Makefile @@ -240,7 +240,7 @@ endif lapack-3.4.1 : lapack-3.4.1.tgz ifndef NOFORTRAN ifndef NO_LAPACK - @if test `$(MD5SUM) lapack-3.4.1.tgz | $(AWK) '{print $$1}'` = d33ace3ac27dc6b4502833ee4dd820db; then \ + @if test `$(MD5SUM) lapack-3.4.1.tgz | $(AWK) '{print $$1}'` = 44c3869c38c8335c2b9c2a8bb276eb55; then \ echo $(TAR) zxf $< ;\ $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.1) ;\ rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ From c8a5d4b86fa0af789eed58b034aec3442b868788 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Thu, 26 Apr 2012 21:13:18 +0100 Subject: [PATCH 019/162] Refs #99. c_check/f_check: strip quotes from detected flags --- c_check | 2 ++ f_check | 2 ++ 2 files changed, 4 insertions(+) diff --git a/c_check b/c_check index 263efeb3d..6ce5e4cc0 100644 --- a/c_check +++ b/c_check @@ -174,6 +174,8 @@ $linker_a = ""; $link =~ s/\-Y\sP\,/\-Y/g; @flags = split(/[\s\,\n]/, $link); + # remove leading and trailing quotes from each flag. + @flags = map {s/^['"]|['"]$//g; $_} @flags; foreach $flags (@flags) { if ( diff --git a/f_check b/f_check index f5bb5a7f6..93c39ec88 100644 --- a/f_check +++ b/f_check @@ -237,6 +237,8 @@ if ($link ne "") { $link =~ s/\-rpath\s+/\-rpath\@/g; @flags = split(/[\s\,\n]/, $link); + # remove leading and trailing quotes from each flag. + @flags = map {s/^['"]|['"]$//g; $_} @flags; foreach $flags (@flags) { if ( From d02171b494bb26ceda4e894b49c047556c82450a Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 27 Apr 2012 11:15:24 +0800 Subject: [PATCH 020/162] Fixed the bug about NO_CBLAS=1 disabled exporting LAPACKE functions in shared library. 
--- exports/gensymbol | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 702e047c1..ac586630a 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2468,11 +2468,17 @@ if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; +if ($ARGV[4] == 0) { + @no_underscore_objs = (@cblasobjs); +}else{ + #NO_CBLAS=1 + @no_underscore_objs = (); +} if ($ARGV[6] == 1) { #NO_LAPACKE=1 - @no_underscore_objs = (@cblasobjs); + @no_underscore_objs = (@no_underscore_objs); } else { - @no_underscore_objs = (@cblasobjs, @lapackeobjs); + @no_underscore_objs = (@no_underscore_objs, @lapackeobjs); } @linuxobjs = ('__strtol_internal', 'exit', 'free', 'getenv', 'malloc', @@ -2495,11 +2501,11 @@ if ($ARGV[0] eq "linux"){ print $objs, $bu, "\n"; } - if ($ARGV[4] == 0) { +# if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print $objs, "\n"; } - } +# } foreach $objs (@linuxobjs) { print $objs, "\n"; @@ -2512,11 +2518,11 @@ if ($ARGV[0] eq "osx"){ print "_", $objs, $bu, "\n"; } - if ($ARGV[4] == 0) { +# if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print "_", $objs, "\n"; } - } +# } exit(0); } @@ -2525,11 +2531,11 @@ if ($ARGV[0] eq "aix"){ print $objs, $bu, "\n"; } - if ($ARGV[4] == 0) { +# if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print $objs, "\n"; } - } +# } exit(0); } @@ -2547,12 +2553,12 @@ if ($ARGV[0] eq "win2k"){ $count ++; } - if ($ARGV[4] == 0) { +# if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; $count ++; } - } +# } exit(0); } @@ -2615,11 +2621,11 @@ if ($ARGV[0] eq "linktest"){ foreach $objs (@underscore_objs) { print $objs, $bu, "();\n" if $objs ne "xerbla"; } - if ($ARGV[4] == 0) { +# if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print $objs, "();\n"; } - } +# } From 
d48a1d1928f040b42a3ff6c8ea68dcc347d6032e Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 28 Apr 2012 12:33:56 +0800 Subject: [PATCH 021/162] Fixed #101. Install the missing lapacke header with LAPACK-3.4.1. Thank Zaheer for this patch. --- Makefile.install | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.install b/Makefile.install index 62ceda986..a74f3d606 100644 --- a/Makefile.install +++ b/Makefile.install @@ -39,6 +39,7 @@ ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR) @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(OPENBLAS_INCLUDE_DIR)/lapacke.h @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(OPENBLAS_INCLUDE_DIR)/lapacke_config.h + @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h @-cp $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h endif From fcb89ad94ddb5b6103aa341178db17ff9030f4a5 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 29 Apr 2012 18:40:24 +0800 Subject: [PATCH 022/162] Refs #91. Updated the doc for 0.1.1 version. --- Changelog.txt | 13 +++++++++++++ Makefile.rule | 2 +- README | 3 ++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index e122300ec..0ed35b0e4 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,17 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.1.1 +29-Apr-2012 +common: + * Upgraded LAPACK to 3.4.1 version. (Thank Zaheer Chothia) + * Supported LAPACKE, a C interface to LAPACK. (Thank Zaheer Chothia) + * Fixed the build bug (MD5 and download) on Mac OSX. + * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. +x86/x86_64: + * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. + * Test alpha=NaN in dscale. + * Fixed a SEGFAULT bug in samax on x86 windows. 
+ ==================================================================== Version 0.1.0 23-Mar-2012 diff --git a/Makefile.rule b/Makefile.rule index 7a1e845fe..b6cf98a3e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.1.0 +VERSION = 0.1.1 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/README b/README index c8c2c2c55..6372e96bd 100644 --- a/README +++ b/README @@ -2,6 +2,7 @@ OpenBLAS Readme 1.Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) +Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki). 2.Intallation Download from project homepage. http://xianyi.github.com/OpenBLAS/ @@ -34,7 +35,7 @@ Additional support CPU: x86_64: Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. MIPS64: - ICT Loongson 3A //The initial version used GotoBLAS2 MIPS64 kernels. Thus, the performance is not good. + ICT Loongson 3A //Level 3 BLAS subroutines are optimized. 4.Usages Link with libopenblas.a or -lopenblas for shared library. From 0e39699c8c5618a54ce45fcf825c149307f8dfc9 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 30 Apr 2012 13:03:34 +0800 Subject: [PATCH 023/162] Fixed #102. Export the missing LAPACK functions (slapy2,slapy3,dlapy2,dlapy3) in shared library. 
--- exports/gensymbol | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index ac586630a..1f30d7b15 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -105,7 +105,7 @@ slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, - slapll, slapmt, + slapll, slapmt, slapy2, slapy3, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, @@ -224,7 +224,7 @@ dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, - dlapll, dlapmt, + dlapll, dlapmt, dlapy2, dlapy3, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, From 8218cbea2a2b706775c3c302ea1a4c361bd40bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= Date: Wed, 2 May 2012 11:33:06 +0200 Subject: [PATCH 024/162] Add Xianyi's patch for segfaults on kernel 2.6.32 and add documentation accordingly. --- GotoBLAS_03FAQ.txt | 8 ++++++++ segfaults.patch | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 segfaults.patch diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index b6033fe53..b45e6d095 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -90,6 +90,14 @@ number of threads will consume extra resource. I recommend you to specify minimum number of threads. +1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong? + + A This may be related to a bug in the Linux kernel 2.6.32. Try applying + the patch segfaults.patch using + + git am segfaults.patch + + and see if the crashes persist. 2. 
Architecture Specific issue or Implementation diff --git a/segfaults.patch b/segfaults.patch new file mode 100644 index 000000000..9585fa04b --- /dev/null +++ b/segfaults.patch @@ -0,0 +1,27 @@ +From ac40907baa90a0acc78139762ffa3c6f09274236 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= +Date: Wed, 2 May 2012 11:22:52 +0200 +Subject: [PATCH] Fix segfaults with kernel 2.6.32. This comes at the price of many compiler warnings. + +--- + common_linux.h | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/common_linux.h b/common_linux.h +index b0381d9..40a94cb 100644 +--- a/common_linux.h ++++ b/common_linux.h +@@ -76,8 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, + #endif + #else + //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 +-// unsigned long null_nodemask=0; +- return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); ++ unsigned long null_nodemask=0; ++ return syscall(SYS_mbind, addr, len, mode, &nodemask, maxnode, flags); + #endif + } + +-- +1.7.1 + From 4236d0d93836cd304f27646f18a28d309210e14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= Date: Wed, 2 May 2012 12:03:07 +0200 Subject: [PATCH 025/162] Add note on compiler warnings for the segfaults patch. --- GotoBLAS_03FAQ.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index b45e6d095..0213d8d58 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -97,7 +97,8 @@ git am segfaults.patch - and see if the crashes persist. + and see if the crashes persist. Note that this patch will lead to many + compiler warnings. 2. Architecture Specific issue or Implementation From 7f89edee3efce16b3a6db1a4382b432770acee21 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 3 May 2012 20:05:34 +0800 Subject: [PATCH 026/162] refs #103 Increase GEMM_MULTITHREAD_THRESHOLD to 50. 
--- Makefile.rule | 4 ++-- Makefile.system | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index b6cf98a3e..56cd63540 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -94,8 +94,8 @@ VERSION = 0.1.1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. -# GEMM_MULTITHREAD_THRESHOLD = 4 +# in small matrix sizes. The default value is 50. +# GEMM_MULTITHREAD_THRESHOLD = 50 # If you need santy check by comparing reference BLAS. It'll be very # slow (Not implemented yet). diff --git a/Makefile.system b/Makefile.system index e2fe9f730..ef2e8fcdd 100644 --- a/Makefile.system +++ b/Makefile.system @@ -45,7 +45,7 @@ GETARCH_FLAGS += -DUSE64BITINT endif ifndef GEMM_MULTITHREAD_THRESHOLD -GEMM_MULTITHREAD_THRESHOLD=4 +GEMM_MULTITHREAD_THRESHOLD=50 endif GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) From e7846547be05ad548e748e09403eeee5ca5e7a24 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 7 May 2012 16:38:44 +0800 Subject: [PATCH 027/162] Refs #85 #104. Disable my_mbind to fix this segfault issue.
--- segfaults.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/segfaults.patch b/segfaults.patch index 9585fa04b..f68d0438d 100644 --- a/segfaults.patch +++ b/segfaults.patch @@ -11,14 +11,14 @@ diff --git a/common_linux.h b/common_linux.h index b0381d9..40a94cb 100644 --- a/common_linux.h +++ b/common_linux.h -@@ -76,8 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, +@@ -76,9 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, #endif #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 -// unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); + unsigned long null_nodemask=0; -+ return syscall(SYS_mbind, addr, len, mode, &nodemask, maxnode, flags); ++ return 0; #endif } From dee74174ff1df9de22979fa4a76aef5272aeeb70 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 8 May 2012 23:50:46 +0800 Subject: [PATCH 028/162] Refs #85 #104. Use patch instead of git to apply this segfaults.patch. --- GotoBLAS_03FAQ.txt | 2 +- segfaults.patch | 25 +++++-------------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index 0213d8d58..be623d608 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -95,7 +95,7 @@ A This may be related to a bug in the Linux kernel 2.6.32. Try applying the patch segaults.patch using - git am segfaults.patch + patch < segfaults.patch and see if the crashes persist. Note that this patch will lead to many compiler warnings. diff --git a/segfaults.patch b/segfaults.patch index f68d0438d..375ab766c 100644 --- a/segfaults.patch +++ b/segfaults.patch @@ -1,27 +1,12 @@ -From ac40907baa90a0acc78139762ffa3c6f09274236 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= -Date: Wed, 2 May 2012 11:22:52 +0200 -Subject: [PATCH] Fix segfaults with kernel 2.6.32. This comes at the price of many compiler warnings. 
- ---- - common_linux.h | 4 ++-- - 1 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/common_linux.h b/common_linux.h -index b0381d9..40a94cb 100644 ---- a/common_linux.h -+++ b/common_linux.h -@@ -76,9 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, - #endif +diff -ruN common_linux.h.orig common_linux.h +--- common_linux.h.orig 2012-04-23 11:27:55.000000000 +0800 ++++ common_linux.h 2012-05-08 23:43:00.000000000 +0800 +@@ -77,7 +77,7 @@ #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 --// unsigned long null_nodemask=0; + // unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -+ unsigned long null_nodemask=0; + return 0; #endif } --- -1.7.1 - From 52485e5fd08f5420c13c54bbf3342aef277ea0e3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 10 May 2012 13:01:35 +0800 Subject: [PATCH 029/162] Refs #105. Export missing LAPACK functions in shared library. They are as following, slabad, dlabad, slacpy, dlacpy, slamch, dlamch, slartg, slartgp, slartgs, dlartg, dlartgp, dlartgs, slascl, dlascl, slaset, dlaset. 
--- exports/gensymbol | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 1f30d7b15..029dc8395 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -100,9 +100,10 @@ sggglm, sgghrd, sgglse, sggqrf, sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, - shsein, shseqr, slabrd, slacon, slacn2, + shsein, shseqr, slabad, slabrd, slacon, slacn2, slacpy, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, + slamch, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, slapll, slapmt, slapy2, slapy3, @@ -110,8 +111,9 @@ slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, - slarrv, slartv, - slarz, slarzb, slarzt, slasy2, slasyf, + slarnv, slarrv, slartg, slartgp, slartgs, slartv, + slarz, slarzb, slarzt, slascl, slasy2, slasyf, + slaset, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, sopgtr, sopmtr, sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, @@ -219,9 +221,10 @@ dggglm, dgghrd, dgglse, dggqrf, dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, - dhsein, dhseqr, dlabrd, dlacon, dlacn2, + dhsein, dhseqr, dlabad, dlabrd, dlacon, dlacn2, dlacpy, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, + dlamch, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, dlapll, dlapmt, dlapy2, dlapy3, @@ -229,8 +232,9 @@ dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, - dlarrv, dlartv, - dlarz, dlarzb, dlarzt, dlasy2, dlasyf, + dlarnv, dlarrv, dlartg, dlartgp, dlartgs, dlartv, + dlarz, dlarzb, dlarzt, dlascl, dlasy2, dlasyf, + dlaset, 
dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, dopgtr, dopmtr, dorg2l, dorg2r, dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, From 06e208c5c39911009f388275afe4097013d1abd2 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 13 May 2012 11:43:29 +0800 Subject: [PATCH 030/162] Refs #106. Fixed wget and md5 bug on FreeBSD and NetBSD. --- Makefile | 3 ++- Makefile.system | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8d78a844b..905d686a2 100644 --- a/Makefile +++ b/Makefile @@ -256,7 +256,8 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz lapack-3.4.1.tgz : ifndef NOFORTRAN -ifeq ($(OSNAME), Darwin) +#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or +ifeq ($(OSNAME), $(filter $(OSNAME),Darwin FreeBSD NetBSD)) curl -O $(LAPACK_URL) else wget $(LAPACK_URL) diff --git a/Makefile.system b/Makefile.system index ef2e8fcdd..c9e74faa6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -108,6 +108,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2 MD5SUM = md5 -r endif +ifeq ($(OSNAME), FreeBSD) +MD5SUM = md5 -r +endif + +ifeq ($(OSNAME), NetBSD) +MD5SUM = md5 -r +endif + ifeq ($(OSNAME), Linux) EXTRALIB += -lm endif From fc4927fa0f8821e65455113374b4aa2020501fb0 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Tue, 15 May 2012 23:58:22 +0200 Subject: [PATCH 031/162] Fixed #107. Export missing LAPACK auxiliary routines (ALLAUX, SCLAUX, DZLAUX) Added some documentation on how the symbol list is derived and synchronized with lapack-3.4.1 to minimize the differences. --- exports/gensymbol | 203 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 144 insertions(+), 59 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 029dc8395..735b73f4e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -73,6 +73,7 @@ ); @lapackobjs = ( + # These routines are provided by OpenBLAS. 
sgesv, dgesv, cgesv, zgesv, sgetf2, dgetf2, cgetf2, zgetf2, sgetrf, dgetrf, cgetrf, zgetrf, @@ -88,32 +89,85 @@ ); @lapackobjs2 = ( - sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, + # These routines are provided by LAPACK (reference implementation). + # + # This list is prepared by copying all routines listed in + # `lapack-3.4.1/SRC/Makefile` and replacing the '.o' suffix with a comma. + # Thereafter the following routines should be removed: + # - those provided by OpenBLAS (see @lapackobjs) + # - extra precision routines (see @lapack_extendedprecision_objs) + # Each of these have been marked individually with "already provided" or "excluded". + + # ALLAUX -- Auxiliary routines called from all precisions + # already provided by @blasobjs: xerbla, lsame + ilaenv, ieeeck, lsamen, xerbla_array, iparmq, + ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, + ilaver, slamch, + + # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. + # excluded: second_$(TIMER) + sbdsdc, + sbdsqr, sdisna, slabad, slacpy, sladiv, slae2, slaebz, + slaed0, slaed1, slaed2, slaed3, slaed4, slaed5, slaed6, + slaed7, slaed8, slaed9, slaeda, slaev2, slagtf, + slagts, slamrg, slanst, + slapy2, slapy3, slarnv, + slarra, slarrb, slarrc, slarrd, slarre, slarrf, slarrj, + slarrk, slarrr, slaneg, + slartg, slaruv, slas2, slascl, + slasd0, slasd1, slasd2, slasd3, slasd4, slasd5, slasd6, + slasd7, slasd8, slasda, slasdq, slasdt, + slaset, slasq1, slasq2, slasq3, slasq4, slasq5, slasq6, + slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, + ssteqr, ssterf, slaisnan, sisnan, + slartgp, slartgs, + + # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. 
+ # excluded: dsecnd_$(TIMER) + dbdsdc, + dbdsqr, ddisna, dlabad, dlacpy, dladiv, dlae2, dlaebz, + dlaed0, dlaed1, dlaed2, dlaed3, dlaed4, dlaed5, dlaed6, + dlaed7, dlaed8, dlaed9, dlaeda, dlaev2, dlagtf, + dlagts, dlamrg, dlanst, + dlapy2, dlapy3, dlarnv, + dlarra, dlarrb, dlarrc, dlarrd, dlarre, dlarrf, dlarrj, + dlarrk, dlarrr, dlaneg, + dlartg, dlaruv, dlas2, dlascl, + dlasd0, dlasd1, dlasd2, dlasd3, dlasd4, dlasd5, dlasd6, + dlasd7, dlasd8, dlasda, dlasdq, dlasdt, + dlaset, dlasq1, dlasq2, dlasq3, dlasq4, dlasq5, dlasq6, + dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, + dsteqr, dsterf, dlaisnan, disnan, + dlartgp, dlartgs, + dlamch, + + # SLASRC -- Single precision real LAPACK routines + # already provided by @lapackobjs: + # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri + sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, - sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, sgerq2, sgerqf, - sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, - sgetri, + sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, + sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, + sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, sggglm, sgghrd, sgglse, sggqrf, - sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, - shsein, shseqr, slabad, slabrd, slacon, slacn2, slacpy, + shsein, shseqr, slabrd, slacon, slacn2, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, - slamch, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, - slapll, slapmt, slapy2, slapy3, + slapll, slapmt, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, 
slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, - slarnv, slarrv, slartg, slartgp, slartgs, slartv, - slarz, slarzb, slarzt, slascl, slasy2, slasyf, - slaset, + slarrv, slartv, + slarz, slarzb, slarzt, slasy2, slasyf, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, sopgtr, sopmtr, sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, @@ -121,19 +175,21 @@ sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, spbstf, spbsv, spbsvx, - spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, - sposvx, spotrs, spstrf, spstf2, + spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, + sposvx, spstrf, spstf2, sppcon, sppequ, spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, - spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, + spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, - sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, + sstevx, + ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytri2, ssytri2x, - ssyswapr, ssytrs, ssytrs2, ssyconv, stbcon, + ssyswapr, ssytrs, ssytrs2, ssyconv, + stbcon, stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, stptrs, @@ -146,26 +202,38 @@ sbbcsd, slapmr, sorbdb, sorcsd, sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, stpqrt, stpqrt2, stpmqrt, stprfb, - + + # DSLASRC -- Double-single mixed precision real routines called from + # single, single-extra and double precision real LAPACK + # routines (i.e. from SLASRC, SXLASRC, DLASRC). 
+ # + # already provided by @lapackobjs: + # sgetrs, spotrf, sgetrf + spotrs, + + # CLASRC -- Single precision complex LAPACK routines + # already provided by @blasobjs: csymv + # already provided by @lapackobjs: + # cgesv, cgetf2, claswp, clauu2, clauum, cpotf2, cpotri, ctrti2, ctrtri cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, - cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, cgerq2, cgerqf, - cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, - cgetri, + cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, + cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, + cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, cgghrd, cgglse, cggqrf, cggrqf, cggsvd, cggsvp, - cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, + cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, chetf2, chetrd, - chetrf, chetri, chetri2, chetri2x, cheswapr, + chetrf, chetri, chetri2, chetri2x, cheswapr, chetrs, chetrs2, chgeqz, chpcon, chpev, chpevd, - chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, + chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, chpsvx, chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, @@ -179,21 +247,22 @@ claqhb, claqhe, claqhp, claqp2, claqps, claqsb, claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, claqsp, claqsy, clar1v, clar2v, ilaclr, ilaclc, - clarf, clarfb, clarfg, clarfgp, clarft, + clarf, clarfb, clarfg, clarft, clarfgp, clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, - 
clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, - cposv, cposvx, cpotrs, cpstrf, cpstf2, + cposv, cposvx, cpstrf, cpstf2, cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, - crot, cspcon, cspmv, cspr, csprfs, cspsv, + crot, cspcon, cspmv, cspr, csprfs, cspsv, cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, - cstegr, cstein, csteqr, csycon, - csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, - csytri2, csytri2x, csyswapr, - csytrs, csytrs2, csyconv, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + cstegr, cstein, csteqr, + csycon, + csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, csytri2, csytri2x, + csyswapr, csytrs, csytrs2, csyconv, + ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, @@ -209,32 +278,42 @@ cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, + # ZCLASRC -- Double-single mixed precision complex routines called from + # single, single-extra and double precision complex LAPACK + # routines (i.e. from CLASRC, CXLASRC, ZLASRC). 
+ # + # already provided by @lapackobjs: + # cgetrs, cpotrf, cgetrf + cpotrs, + + # DLASRC -- Double precision real LAPACK routines + # already provided by @lapackobjs: + # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, + # dtrti2, dtrtri dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, - dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, dgerq2, dgerqf, - dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, - dgetri, + dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, + dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, + dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, dggglm, dgghrd, dgglse, dggqrf, - dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, - dhsein, dhseqr, dlabad, dlabrd, dlacon, dlacn2, dlacpy, + dhsein, dhseqr, dlabrd, dlacon, dlacn2, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, - dlamch, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, - dlapll, dlapmt, dlapy2, dlapy3, + dlapll, dlapmt, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, - dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, - dlarnv, dlarrv, dlartg, dlartgp, dlartgs, dlartv, - dlarz, dlarzb, dlarzt, dlascl, dlasy2, dlasyf, - dlaset, + dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, + dlargv, dlarrv, dlartv, + dlarz, dlarzb, dlarzt, dlasy2, dlasyf, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, dopgtr, dopmtr, dorg2l, dorg2r, dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, @@ -242,21 +321,22 @@ dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, 
dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, dpbstf, dpbsv, dpbsvx, - dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, + dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, dposvx, dpotrs, dpstrf, dpstf2, dppcon, dppequ, dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, - dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, + dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, - dstevx, dsycon, dsyev, dsyevd, dsyevr, + dstevx, + dsycon, dsyev, dsyevd, dsyevr, dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, dsysv, dsysvx, - dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dsytrs2, - dsytri2, dsytri2x, dsyswapr, dsyconv, dtbcon, - dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytri2, dsytri2x, + dsyswapr, dsytrs, dsytrs2, dsyconv, + dtbcon, dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, @@ -270,6 +350,11 @@ dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, + # ZLASRC -- Double precision complex LAPACK routines + # already provided by @blasobjs: zsymv + # already provided by @lapackobjs: + # zgesv, zgetrs, zgetf2, zlaswp, zlauu2, zlauum, zpotf2, zpotrf, zpotri, + # ztrti2, ztrtri zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, @@ -281,14 +366,14 @@ zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, zgghrd, zgglse, zggqrf, zggrqf, zggsvd, zggsvp, - zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, + zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, 
zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, zhetf2, zhetrd, zhetrf, zhetri, zhetri2, zhetri2x, zheswapr, zhetrs, zhetrs2, zhgeqz, zhpcon, zhpev, zhpevd, - zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, + zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, zhpsvx, zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, @@ -304,22 +389,23 @@ zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, zlaqsp, zlaqsy, zlar1v, zlar2v, ilazlr, ilazlc, zlarcm, zlarf, zlarfb, - zlarfg, zlarfgp, zlarft, + zlarfg, zlarft, zlarfgp, zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, - zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, + zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, - zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, + zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, zposv, zposvx, zpotrs, zpstrf, zpstf2, zppcon, zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, - zrot, zspcon, zspmv, zspr, zsprfs, zspsv, + zrot, zspcon, zspmv, zspr, zsprfs, zspsv, zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, - zstegr, zstein, zsteqr, zsycon, - zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, - zsytri2, zsytri2x, zsyswapr, - zsytrs, zsytrs2, zsyconv, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + zstegr, zstein, zsteqr, + zsycon, + zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, zsytri2, zsytri2x, + zsyswapr, zsytrs, zsytrs2, zsyconv, + ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, @@ -336,7 +422,6 @@ zbbcsd, zlapmr, zunbdb, zuncsd, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, - ); @lapack_extendedprecision_objs = ( From 
f404a177878eee3acea0a5934fecddc75caaf5f3 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Wed, 16 May 2012 11:24:24 +0200 Subject: [PATCH 032/162] Symbol list: document how LAPACKE exports are derived and synchronize with lapack-3.4.1 This change adds the missing LAPACKE_[zc]syr routines but does not remove any exported functions. --- exports/gensymbol | 422 +++++++++++++++++++++++++++++----------------- 1 file changed, 271 insertions(+), 151 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 735b73f4e..dbd559473 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -430,15 +430,170 @@ ); @lapackeobjs = ( - lapack_make_complex_double, - lapack_make_complex_float, + # LAPACK C interface routines. + # + # This list is prepared in a similar manner to @lapackobjs2, however the + # functions all begin with an uppercase prefix (with the exception of the + # make_complex_* routines). + # + # The functions corresponding to @(MATGEN_OBJ) and @(SRCX_OBJ) are not + # exported since the respective LAPACK routines are not built by default. 
+ + # @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` + LAPACKE_cgb_nancheck, + LAPACKE_cgb_trans, + LAPACKE_cge_nancheck, + LAPACKE_cge_trans, + LAPACKE_cgg_nancheck, + LAPACKE_cgg_trans, + LAPACKE_cgt_nancheck, + LAPACKE_chb_nancheck, + LAPACKE_chb_trans, + LAPACKE_che_nancheck, + LAPACKE_che_trans, + LAPACKE_chp_nancheck, + LAPACKE_chp_trans, + LAPACKE_chs_nancheck, + LAPACKE_chs_trans, LAPACKE_c_nancheck, + LAPACKE_cpb_nancheck, + LAPACKE_cpb_trans, + LAPACKE_cpf_nancheck, + LAPACKE_cpf_trans, + LAPACKE_cpo_nancheck, + LAPACKE_cpo_trans, + LAPACKE_cpp_nancheck, + LAPACKE_cpp_trans, + LAPACKE_cpt_nancheck, + LAPACKE_csp_nancheck, + LAPACKE_csp_trans, + LAPACKE_cst_nancheck, + LAPACKE_csy_nancheck, + LAPACKE_csy_trans, + LAPACKE_ctb_nancheck, + LAPACKE_ctb_trans, + LAPACKE_ctf_nancheck, + LAPACKE_ctf_trans, + LAPACKE_ctp_nancheck, + LAPACKE_ctp_trans, + LAPACKE_ctr_nancheck, + LAPACKE_ctr_trans, + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dgt_nancheck, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_d_nancheck, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dpt_nancheck, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsp_nancheck, + LAPACKE_dsp_trans, + LAPACKE_dst_nancheck, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtr_nancheck, + LAPACKE_dtr_trans, + LAPACKE_lsame, + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sgt_nancheck, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_s_nancheck, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spf_nancheck, + 
LAPACKE_spf_trans, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_spt_nancheck, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sst_nancheck, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_xerbla, + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zgt_nancheck, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_z_nancheck, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpo_nancheck, + LAPACKE_zpo_trans, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zpt_nancheck, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zst_nancheck, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + lapack_make_complex_float, + lapack_make_complex_double, + + # @(SRC_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` LAPACKE_cbbcsd, LAPACKE_cbbcsd_work, LAPACKE_cbdsqr, LAPACKE_cbdsqr_work, - LAPACKE_cgb_nancheck, - LAPACKE_cgb_trans, LAPACKE_cgbbrd, LAPACKE_cgbbrd_work, LAPACKE_cgbcon, @@ -457,8 +612,6 @@ LAPACKE_cgbtrf_work, LAPACKE_cgbtrs, LAPACKE_cgbtrs_work, - LAPACKE_cge_nancheck, - LAPACKE_cge_trans, LAPACKE_cgebak, LAPACKE_cgebak_work, LAPACKE_cgebal, @@ -533,8 +686,6 @@ LAPACKE_cgetri_work, LAPACKE_cgetrs, LAPACKE_cgetrs_work, - LAPACKE_cgg_nancheck, - LAPACKE_cgg_trans, LAPACKE_cggbak, LAPACKE_cggbak_work, LAPACKE_cggbal, @@ -561,7 
+712,6 @@ LAPACKE_cggsvd_work, LAPACKE_cggsvp, LAPACKE_cggsvp_work, - LAPACKE_cgt_nancheck, LAPACKE_cgtcon, LAPACKE_cgtcon_work, LAPACKE_cgtrfs, @@ -574,8 +724,6 @@ LAPACKE_cgttrf_work, LAPACKE_cgttrs, LAPACKE_cgttrs_work, - LAPACKE_chb_nancheck, - LAPACKE_chb_trans, LAPACKE_chbev, LAPACKE_chbev_work, LAPACKE_chbevd, @@ -592,8 +740,6 @@ LAPACKE_chbgvx_work, LAPACKE_chbtrd, LAPACKE_chbtrd_work, - LAPACKE_che_nancheck, - LAPACKE_che_trans, LAPACKE_checon, LAPACKE_checon_work, LAPACKE_cheequb, @@ -640,8 +786,6 @@ LAPACKE_chfrk_work, LAPACKE_chgeqz, LAPACKE_chgeqz_work, - LAPACKE_chp_nancheck, - LAPACKE_chp_trans, LAPACKE_chpcon, LAPACKE_chpcon_work, LAPACKE_chpev, @@ -672,8 +816,6 @@ LAPACKE_chptri_work, LAPACKE_chptrs, LAPACKE_chptrs_work, - LAPACKE_chs_nancheck, - LAPACKE_chs_trans, LAPACKE_chsein, LAPACKE_chsein_work, LAPACKE_chseqr, @@ -710,8 +852,6 @@ LAPACKE_claswp_work, LAPACKE_clauum, LAPACKE_clauum_work, - LAPACKE_cpb_nancheck, - LAPACKE_cpb_trans, LAPACKE_cpbcon, LAPACKE_cpbcon_work, LAPACKE_cpbequ, @@ -728,16 +868,12 @@ LAPACKE_cpbtrf_work, LAPACKE_cpbtrs, LAPACKE_cpbtrs_work, - LAPACKE_cpf_nancheck, - LAPACKE_cpf_trans, LAPACKE_cpftrf, LAPACKE_cpftrf_work, LAPACKE_cpftri, LAPACKE_cpftri_work, LAPACKE_cpftrs, LAPACKE_cpftrs_work, - LAPACKE_cpo_nancheck, - LAPACKE_cpo_trans, LAPACKE_cpocon, LAPACKE_cpocon_work, LAPACKE_cpoequ, @@ -756,8 +892,6 @@ LAPACKE_cpotri_work, LAPACKE_cpotrs, LAPACKE_cpotrs_work, - LAPACKE_cpp_nancheck, - LAPACKE_cpp_trans, LAPACKE_cppcon, LAPACKE_cppcon_work, LAPACKE_cppequ, @@ -776,7 +910,6 @@ LAPACKE_cpptrs_work, LAPACKE_cpstrf, LAPACKE_cpstrf_work, - LAPACKE_cpt_nancheck, LAPACKE_cptcon, LAPACKE_cptcon_work, LAPACKE_cpteqr, @@ -791,8 +924,6 @@ LAPACKE_cpttrf_work, LAPACKE_cpttrs, LAPACKE_cpttrs_work, - LAPACKE_csp_nancheck, - LAPACKE_csp_trans, LAPACKE_cspcon, LAPACKE_cspcon_work, LAPACKE_csprfs, @@ -807,7 +938,6 @@ LAPACKE_csptri_work, LAPACKE_csptrs, LAPACKE_csptrs_work, - LAPACKE_cst_nancheck, LAPACKE_cstedc, 
LAPACKE_cstedc_work, LAPACKE_cstegr, @@ -818,16 +948,12 @@ LAPACKE_cstemr_work, LAPACKE_csteqr, LAPACKE_csteqr_work, - LAPACKE_csy_nancheck, - LAPACKE_csy_trans, LAPACKE_csycon, LAPACKE_csycon_work, LAPACKE_csyconv, LAPACKE_csyconv_work, LAPACKE_csyequb, LAPACKE_csyequb_work, - LAPACKE_csyr, - LAPACKE_csyr_work, LAPACKE_csyrfs, LAPACKE_csyrfs_work, LAPACKE_csysv, @@ -848,16 +974,12 @@ LAPACKE_csytrs2, LAPACKE_csytrs2_work, LAPACKE_csytrs_work, - LAPACKE_ctb_nancheck, - LAPACKE_ctb_trans, LAPACKE_ctbcon, LAPACKE_ctbcon_work, LAPACKE_ctbrfs, LAPACKE_ctbrfs_work, LAPACKE_ctbtrs, LAPACKE_ctbtrs_work, - LAPACKE_ctf_nancheck, - LAPACKE_ctf_trans, LAPACKE_ctfsm, LAPACKE_ctfsm_work, LAPACKE_ctftri, @@ -878,8 +1000,6 @@ LAPACKE_ctgsna_work, LAPACKE_ctgsyl, LAPACKE_ctgsyl_work, - LAPACKE_ctp_nancheck, - LAPACKE_ctp_trans, LAPACKE_ctpcon, LAPACKE_ctpcon_work, LAPACKE_ctpmqrt, @@ -900,8 +1020,6 @@ LAPACKE_ctpttf_work, LAPACKE_ctpttr, LAPACKE_ctpttr_work, - LAPACKE_ctr_nancheck, - LAPACKE_ctr_trans, LAPACKE_ctrcon, LAPACKE_ctrcon_work, LAPACKE_ctrevc, @@ -964,7 +1082,6 @@ LAPACKE_cupgtr_work, LAPACKE_cupmtr, LAPACKE_cupmtr_work, - LAPACKE_d_nancheck, LAPACKE_dbbcsd, LAPACKE_dbbcsd_work, LAPACKE_dbdsdc, @@ -973,8 +1090,6 @@ LAPACKE_dbdsqr_work, LAPACKE_ddisna, LAPACKE_ddisna_work, - LAPACKE_dgb_nancheck, - LAPACKE_dgb_trans, LAPACKE_dgbbrd, LAPACKE_dgbbrd_work, LAPACKE_dgbcon, @@ -993,8 +1108,6 @@ LAPACKE_dgbtrf_work, LAPACKE_dgbtrs, LAPACKE_dgbtrs_work, - LAPACKE_dge_nancheck, - LAPACKE_dge_trans, LAPACKE_dgebak, LAPACKE_dgebak_work, LAPACKE_dgebal, @@ -1073,8 +1186,6 @@ LAPACKE_dgetri_work, LAPACKE_dgetrs, LAPACKE_dgetrs_work, - LAPACKE_dgg_nancheck, - LAPACKE_dgg_trans, LAPACKE_dggbak, LAPACKE_dggbak_work, LAPACKE_dggbal, @@ -1101,7 +1212,6 @@ LAPACKE_dggsvd_work, LAPACKE_dggsvp, LAPACKE_dggsvp_work, - LAPACKE_dgt_nancheck, LAPACKE_dgtcon, LAPACKE_dgtcon_work, LAPACKE_dgtrfs, @@ -1116,8 +1226,6 @@ LAPACKE_dgttrs_work, LAPACKE_dhgeqz, LAPACKE_dhgeqz_work, - 
LAPACKE_dhs_nancheck, - LAPACKE_dhs_trans, LAPACKE_dhsein, LAPACKE_dhsein_work, LAPACKE_dhseqr, @@ -1200,8 +1308,6 @@ LAPACKE_dormrz_work, LAPACKE_dormtr, LAPACKE_dormtr_work, - LAPACKE_dpb_nancheck, - LAPACKE_dpb_trans, LAPACKE_dpbcon, LAPACKE_dpbcon_work, LAPACKE_dpbequ, @@ -1218,16 +1324,12 @@ LAPACKE_dpbtrf_work, LAPACKE_dpbtrs, LAPACKE_dpbtrs_work, - LAPACKE_dpf_nancheck, - LAPACKE_dpf_trans, LAPACKE_dpftrf, LAPACKE_dpftrf_work, LAPACKE_dpftri, LAPACKE_dpftri_work, LAPACKE_dpftrs, LAPACKE_dpftrs_work, - LAPACKE_dpo_nancheck, - LAPACKE_dpo_trans, LAPACKE_dpocon, LAPACKE_dpocon_work, LAPACKE_dpoequ, @@ -1246,8 +1348,6 @@ LAPACKE_dpotri_work, LAPACKE_dpotrs, LAPACKE_dpotrs_work, - LAPACKE_dpp_nancheck, - LAPACKE_dpp_trans, LAPACKE_dppcon, LAPACKE_dppcon_work, LAPACKE_dppequ, @@ -1266,7 +1366,6 @@ LAPACKE_dpptrs_work, LAPACKE_dpstrf, LAPACKE_dpstrf_work, - LAPACKE_dpt_nancheck, LAPACKE_dptcon, LAPACKE_dptcon_work, LAPACKE_dpteqr, @@ -1281,8 +1380,6 @@ LAPACKE_dpttrf_work, LAPACKE_dpttrs, LAPACKE_dpttrs_work, - LAPACKE_dsb_nancheck, - LAPACKE_dsb_trans, LAPACKE_dsbev, LAPACKE_dsbev_work, LAPACKE_dsbevd, @@ -1303,8 +1400,6 @@ LAPACKE_dsfrk_work, LAPACKE_dsgesv, LAPACKE_dsgesv_work, - LAPACKE_dsp_nancheck, - LAPACKE_dsp_trans, LAPACKE_dspcon, LAPACKE_dspcon_work, LAPACKE_dspev, @@ -1337,7 +1432,6 @@ LAPACKE_dsptri_work, LAPACKE_dsptrs, LAPACKE_dsptrs_work, - LAPACKE_dst_nancheck, LAPACKE_dstebz, LAPACKE_dstebz_work, LAPACKE_dstedc, @@ -1360,8 +1454,6 @@ LAPACKE_dstevr_work, LAPACKE_dstevx, LAPACKE_dstevx_work, - LAPACKE_dsy_nancheck, - LAPACKE_dsy_trans, LAPACKE_dsycon, LAPACKE_dsycon_work, LAPACKE_dsyconv, @@ -1406,16 +1498,12 @@ LAPACKE_dsytrs2, LAPACKE_dsytrs2_work, LAPACKE_dsytrs_work, - LAPACKE_dtb_nancheck, - LAPACKE_dtb_trans, LAPACKE_dtbcon, LAPACKE_dtbcon_work, LAPACKE_dtbrfs, LAPACKE_dtbrfs_work, LAPACKE_dtbtrs, LAPACKE_dtbtrs_work, - LAPACKE_dtf_nancheck, - LAPACKE_dtf_trans, LAPACKE_dtfsm, LAPACKE_dtfsm_work, LAPACKE_dtftri, @@ -1436,8 +1524,6 @@ 
LAPACKE_dtgsna_work, LAPACKE_dtgsyl, LAPACKE_dtgsyl_work, - LAPACKE_dtp_nancheck, - LAPACKE_dtp_trans, LAPACKE_dtpcon, LAPACKE_dtpcon_work, LAPACKE_dtpmqrt, @@ -1458,8 +1544,6 @@ LAPACKE_dtpttf_work, LAPACKE_dtpttr, LAPACKE_dtpttr_work, - LAPACKE_dtr_nancheck, - LAPACKE_dtr_trans, LAPACKE_dtrcon, LAPACKE_dtrcon_work, LAPACKE_dtrevc, @@ -1484,8 +1568,6 @@ LAPACKE_dtrttp_work, LAPACKE_dtzrzf, LAPACKE_dtzrzf_work, - LAPACKE_lsame, - LAPACKE_s_nancheck, LAPACKE_sbbcsd, LAPACKE_sbbcsd_work, LAPACKE_sbdsdc, @@ -1494,8 +1576,6 @@ LAPACKE_sbdsqr_work, LAPACKE_sdisna, LAPACKE_sdisna_work, - LAPACKE_sgb_nancheck, - LAPACKE_sgb_trans, LAPACKE_sgbbrd, LAPACKE_sgbbrd_work, LAPACKE_sgbcon, @@ -1514,8 +1594,6 @@ LAPACKE_sgbtrf_work, LAPACKE_sgbtrs, LAPACKE_sgbtrs_work, - LAPACKE_sge_nancheck, - LAPACKE_sge_trans, LAPACKE_sgebak, LAPACKE_sgebak_work, LAPACKE_sgebal, @@ -1594,8 +1672,6 @@ LAPACKE_sgetri_work, LAPACKE_sgetrs, LAPACKE_sgetrs_work, - LAPACKE_sgg_nancheck, - LAPACKE_sgg_trans, LAPACKE_sggbak, LAPACKE_sggbak_work, LAPACKE_sggbal, @@ -1622,7 +1698,6 @@ LAPACKE_sggsvd_work, LAPACKE_sggsvp, LAPACKE_sggsvp_work, - LAPACKE_sgt_nancheck, LAPACKE_sgtcon, LAPACKE_sgtcon_work, LAPACKE_sgtrfs, @@ -1637,8 +1712,6 @@ LAPACKE_sgttrs_work, LAPACKE_shgeqz, LAPACKE_shgeqz_work, - LAPACKE_shs_nancheck, - LAPACKE_shs_trans, LAPACKE_shsein, LAPACKE_shsein_work, LAPACKE_shseqr, @@ -1721,8 +1794,6 @@ LAPACKE_sormrz_work, LAPACKE_sormtr, LAPACKE_sormtr_work, - LAPACKE_spb_nancheck, - LAPACKE_spb_trans, LAPACKE_spbcon, LAPACKE_spbcon_work, LAPACKE_spbequ, @@ -1739,16 +1810,12 @@ LAPACKE_spbtrf_work, LAPACKE_spbtrs, LAPACKE_spbtrs_work, - LAPACKE_spf_nancheck, - LAPACKE_spf_trans, LAPACKE_spftrf, LAPACKE_spftrf_work, LAPACKE_spftri, LAPACKE_spftri_work, LAPACKE_spftrs, LAPACKE_spftrs_work, - LAPACKE_spo_nancheck, - LAPACKE_spo_trans, LAPACKE_spocon, LAPACKE_spocon_work, LAPACKE_spoequ, @@ -1767,8 +1834,6 @@ LAPACKE_spotri_work, LAPACKE_spotrs, LAPACKE_spotrs_work, - LAPACKE_spp_nancheck, - 
LAPACKE_spp_trans, LAPACKE_sppcon, LAPACKE_sppcon_work, LAPACKE_sppequ, @@ -1787,7 +1852,6 @@ LAPACKE_spptrs_work, LAPACKE_spstrf, LAPACKE_spstrf_work, - LAPACKE_spt_nancheck, LAPACKE_sptcon, LAPACKE_sptcon_work, LAPACKE_spteqr, @@ -1802,8 +1866,6 @@ LAPACKE_spttrf_work, LAPACKE_spttrs, LAPACKE_spttrs_work, - LAPACKE_ssb_nancheck, - LAPACKE_ssb_trans, LAPACKE_ssbev, LAPACKE_ssbev_work, LAPACKE_ssbevd, @@ -1822,8 +1884,6 @@ LAPACKE_ssbtrd_work, LAPACKE_ssfrk, LAPACKE_ssfrk_work, - LAPACKE_ssp_nancheck, - LAPACKE_ssp_trans, LAPACKE_sspcon, LAPACKE_sspcon_work, LAPACKE_sspev, @@ -1854,7 +1914,6 @@ LAPACKE_ssptri_work, LAPACKE_ssptrs, LAPACKE_ssptrs_work, - LAPACKE_sst_nancheck, LAPACKE_sstebz, LAPACKE_sstebz_work, LAPACKE_sstedc, @@ -1877,8 +1936,6 @@ LAPACKE_sstevr_work, LAPACKE_sstevx, LAPACKE_sstevx_work, - LAPACKE_ssy_nancheck, - LAPACKE_ssy_trans, LAPACKE_ssycon, LAPACKE_ssycon_work, LAPACKE_ssyconv, @@ -1923,16 +1980,12 @@ LAPACKE_ssytrs2, LAPACKE_ssytrs2_work, LAPACKE_ssytrs_work, - LAPACKE_stb_nancheck, - LAPACKE_stb_trans, LAPACKE_stbcon, LAPACKE_stbcon_work, LAPACKE_stbrfs, LAPACKE_stbrfs_work, LAPACKE_stbtrs, LAPACKE_stbtrs_work, - LAPACKE_stf_nancheck, - LAPACKE_stf_trans, LAPACKE_stfsm, LAPACKE_stfsm_work, LAPACKE_stftri, @@ -1953,8 +2006,6 @@ LAPACKE_stgsna_work, LAPACKE_stgsyl, LAPACKE_stgsyl_work, - LAPACKE_stp_nancheck, - LAPACKE_stp_trans, LAPACKE_stpcon, LAPACKE_stpcon_work, LAPACKE_stpmqrt, @@ -1973,8 +2024,6 @@ LAPACKE_stpttf_work, LAPACKE_stpttr, LAPACKE_stpttr_work, - LAPACKE_str_nancheck, - LAPACKE_str_trans, LAPACKE_strcon, LAPACKE_strcon_work, LAPACKE_strevc, @@ -1999,8 +2048,6 @@ LAPACKE_strttp_work, LAPACKE_stzrzf, LAPACKE_stzrzf_work, - LAPACKE_xerbla, - LAPACKE_z_nancheck, LAPACKE_zbbcsd, LAPACKE_zbbcsd_work, LAPACKE_zbdsqr, @@ -2009,8 +2056,6 @@ LAPACKE_zcgesv_work, LAPACKE_zcposv, LAPACKE_zcposv_work, - LAPACKE_zgb_nancheck, - LAPACKE_zgb_trans, LAPACKE_zgbbrd, LAPACKE_zgbbrd_work, LAPACKE_zgbcon, @@ -2029,8 +2074,6 @@ 
LAPACKE_zgbtrf_work, LAPACKE_zgbtrs, LAPACKE_zgbtrs_work, - LAPACKE_zge_nancheck, - LAPACKE_zge_trans, LAPACKE_zgebak, LAPACKE_zgebak_work, LAPACKE_zgebal, @@ -2105,8 +2148,6 @@ LAPACKE_zgetri_work, LAPACKE_zgetrs, LAPACKE_zgetrs_work, - LAPACKE_zgg_nancheck, - LAPACKE_zgg_trans, LAPACKE_zggbak, LAPACKE_zggbak_work, LAPACKE_zggbal, @@ -2133,7 +2174,6 @@ LAPACKE_zggsvd_work, LAPACKE_zggsvp, LAPACKE_zggsvp_work, - LAPACKE_zgt_nancheck, LAPACKE_zgtcon, LAPACKE_zgtcon_work, LAPACKE_zgtrfs, @@ -2146,8 +2186,6 @@ LAPACKE_zgttrf_work, LAPACKE_zgttrs, LAPACKE_zgttrs_work, - LAPACKE_zhb_nancheck, - LAPACKE_zhb_trans, LAPACKE_zhbev, LAPACKE_zhbev_work, LAPACKE_zhbevd, @@ -2164,8 +2202,6 @@ LAPACKE_zhbgvx_work, LAPACKE_zhbtrd, LAPACKE_zhbtrd_work, - LAPACKE_zhe_nancheck, - LAPACKE_zhe_trans, LAPACKE_zhecon, LAPACKE_zhecon_work, LAPACKE_zheequb, @@ -2212,8 +2248,6 @@ LAPACKE_zhfrk_work, LAPACKE_zhgeqz, LAPACKE_zhgeqz_work, - LAPACKE_zhp_nancheck, - LAPACKE_zhp_trans, LAPACKE_zhpcon, LAPACKE_zhpcon_work, LAPACKE_zhpev, @@ -2244,8 +2278,6 @@ LAPACKE_zhptri_work, LAPACKE_zhptrs, LAPACKE_zhptrs_work, - LAPACKE_zhs_nancheck, - LAPACKE_zhs_trans, LAPACKE_zhsein, LAPACKE_zhsein_work, LAPACKE_zhseqr, @@ -2282,8 +2314,6 @@ LAPACKE_zlaswp_work, LAPACKE_zlauum, LAPACKE_zlauum_work, - LAPACKE_zpb_nancheck, - LAPACKE_zpb_trans, LAPACKE_zpbcon, LAPACKE_zpbcon_work, LAPACKE_zpbequ, @@ -2300,16 +2330,12 @@ LAPACKE_zpbtrf_work, LAPACKE_zpbtrs, LAPACKE_zpbtrs_work, - LAPACKE_zpf_nancheck, - LAPACKE_zpf_trans, LAPACKE_zpftrf, LAPACKE_zpftrf_work, LAPACKE_zpftri, LAPACKE_zpftri_work, LAPACKE_zpftrs, LAPACKE_zpftrs_work, - LAPACKE_zpo_nancheck, - LAPACKE_zpo_trans, LAPACKE_zpocon, LAPACKE_zpocon_work, LAPACKE_zpoequ, @@ -2328,8 +2354,6 @@ LAPACKE_zpotri_work, LAPACKE_zpotrs, LAPACKE_zpotrs_work, - LAPACKE_zpp_nancheck, - LAPACKE_zpp_trans, LAPACKE_zppcon, LAPACKE_zppcon_work, LAPACKE_zppequ, @@ -2348,7 +2372,6 @@ LAPACKE_zpptrs_work, LAPACKE_zpstrf, LAPACKE_zpstrf_work, - LAPACKE_zpt_nancheck, 
LAPACKE_zptcon, LAPACKE_zptcon_work, LAPACKE_zpteqr, @@ -2363,8 +2386,6 @@ LAPACKE_zpttrf_work, LAPACKE_zpttrs, LAPACKE_zpttrs_work, - LAPACKE_zsp_nancheck, - LAPACKE_zsp_trans, LAPACKE_zspcon, LAPACKE_zspcon_work, LAPACKE_zsprfs, @@ -2379,7 +2400,6 @@ LAPACKE_zsptri_work, LAPACKE_zsptrs, LAPACKE_zsptrs_work, - LAPACKE_zst_nancheck, LAPACKE_zstedc, LAPACKE_zstedc_work, LAPACKE_zstegr, @@ -2390,16 +2410,12 @@ LAPACKE_zstemr_work, LAPACKE_zsteqr, LAPACKE_zsteqr_work, - LAPACKE_zsy_nancheck, - LAPACKE_zsy_trans, LAPACKE_zsycon, LAPACKE_zsycon_work, LAPACKE_zsyconv, LAPACKE_zsyconv_work, LAPACKE_zsyequb, LAPACKE_zsyequb_work, - LAPACKE_zsyr, - LAPACKE_zsyr_work, LAPACKE_zsyrfs, LAPACKE_zsyrfs_work, LAPACKE_zsysv, @@ -2420,16 +2436,12 @@ LAPACKE_zsytrs2, LAPACKE_zsytrs2_work, LAPACKE_zsytrs_work, - LAPACKE_ztb_nancheck, - LAPACKE_ztb_trans, LAPACKE_ztbcon, LAPACKE_ztbcon_work, LAPACKE_ztbrfs, LAPACKE_ztbrfs_work, LAPACKE_ztbtrs, LAPACKE_ztbtrs_work, - LAPACKE_ztf_nancheck, - LAPACKE_ztf_trans, LAPACKE_ztfsm, LAPACKE_ztfsm_work, LAPACKE_ztftri, @@ -2450,8 +2462,6 @@ LAPACKE_ztgsna_work, LAPACKE_ztgsyl, LAPACKE_ztgsyl_work, - LAPACKE_ztp_nancheck, - LAPACKE_ztp_trans, LAPACKE_ztpcon, LAPACKE_ztpcon_work, LAPACKE_ztpmqrt, @@ -2472,8 +2482,6 @@ LAPACKE_ztpttf_work, LAPACKE_ztpttr, LAPACKE_ztpttr_work, - LAPACKE_ztr_nancheck, - LAPACKE_ztr_trans, LAPACKE_ztrcon, LAPACKE_ztrcon_work, LAPACKE_ztrevc, @@ -2536,6 +2544,118 @@ LAPACKE_zupgtr_work, LAPACKE_zupmtr, LAPACKE_zupmtr_work, + LAPACKE_zsyr, + LAPACKE_csyr, + LAPACKE_zsyr_work, + LAPACKE_csyr_work, + + ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the + ## corresponding LAPACK extended precision routines. 
+ #LAPACKE_cgbrfsx, + #LAPACKE_cporfsx, + #LAPACKE_dgerfsx, + #LAPACKE_sgbrfsx, + #LAPACKE_ssyrfsx, + #LAPACKE_zherfsx, + #LAPACKE_cgbrfsx_work, + #LAPACKE_cporfsx_work, + #LAPACKE_dgerfsx_work, + #LAPACKE_sgbrfsx_work, + #LAPACKE_ssyrfsx_work, + #LAPACKE_zherfsx_work, + #LAPACKE_cgerfsx, + #LAPACKE_csyrfsx, + #LAPACKE_dporfsx, + #LAPACKE_sgerfsx, + #LAPACKE_zgbrfsx, + #LAPACKE_zporfsx, + #LAPACKE_cgerfsx_work, + #LAPACKE_csyrfsx_work, + #LAPACKE_dporfsx_work, + #LAPACKE_sgerfsx_work, + #LAPACKE_zgbrfsx_work, + #LAPACKE_zporfsx_work, + #LAPACKE_cherfsx, + #LAPACKE_dgbrfsx, + #LAPACKE_dsyrfsx, + #LAPACKE_sporfsx, + #LAPACKE_zgerfsx, + #LAPACKE_zsyrfsx, + #LAPACKE_cherfsx_work, + #LAPACKE_dgbrfsx_work, + #LAPACKE_dsyrfsx_work, + #LAPACKE_sporfsx_work, + #LAPACKE_zgerfsx_work, + #LAPACKE_zsyrfsx_work, + #LAPACKE_cgbsvxx, + #LAPACKE_cposvxx, + #LAPACKE_dgesvxx, + #LAPACKE_sgbsvxx, + #LAPACKE_ssysvxx, + #LAPACKE_zhesvxx, + #LAPACKE_cgbsvxx_work, + #LAPACKE_cposvxx_work, + #LAPACKE_dgesvxx_work, + #LAPACKE_sgbsvxx_work, + #LAPACKE_ssysvxx_work, + #LAPACKE_zhesvxx_work, + #LAPACKE_cgesvxx, + #LAPACKE_csysvxx, + #LAPACKE_dposvxx, + #LAPACKE_sgesvxx, + #LAPACKE_zgbsvxx, + #LAPACKE_zposvxx, + #LAPACKE_cgesvxx_work, + #LAPACKE_csysvxx_work, + #LAPACKE_dposvxx_work, + #LAPACKE_sgesvxx_work, + #LAPACKE_zgbsvxx_work, + #LAPACKE_zposvxx_work, + #LAPACKE_chesvxx, + #LAPACKE_dgbsvxx, + #LAPACKE_dsysvxx, + #LAPACKE_sposvxx, + #LAPACKE_zgesvxx, + #LAPACKE_zsysvxx, + #LAPACKE_chesvxx_work, + #LAPACKE_dgbsvxx_work, + #LAPACKE_dsysvxx_work, + #LAPACKE_sposvxx_work, + #LAPACKE_zgesvxx_work, + #LAPACKE_zsysvxx_work, + + ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg + ## (see `lapack-3.4.1/TESTING/MATGEN`). 
+ #LAPACKE_clatms, + #LAPACKE_clatms_work, + #LAPACKE_dlatms, + #LAPACKE_dlatms_work, + #LAPACKE_slatms, + #LAPACKE_slatms_work, + #LAPACKE_zlatms, + #LAPACKE_zlatms_work, + #LAPACKE_clagge, + #LAPACKE_clagge_work, + #LAPACKE_dlagge, + #LAPACKE_dlagge_work, + #LAPACKE_slagge, + #LAPACKE_slagge_work, + #LAPACKE_zlagge, + #LAPACKE_zlagge_work, + #LAPACKE_claghe, + #LAPACKE_claghe_work, + #LAPACKE_zlaghe, + #LAPACKE_zlaghe_work, + #LAPACKE_clagsy, + #LAPACKE_clagsy_work, + #LAPACKE_dlagsy, + #LAPACKE_dlagsy_work, + #LAPACKE_slagsy, + #LAPACKE_slagsy_work, + #LAPACKE_zlagsy, + #LAPACKE_zlagsy_work, ); if ($ARGV[5] == 1) { From be1692d64fdaaee300f81f8594f64edb512ff39f Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Sun, 20 May 2012 00:49:38 -0300 Subject: [PATCH 033/162] fix 'sched_yield' warnings on FreeBSD,NetBSD --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index c6d30ddcf..0868b594a 100644 --- a/common.h +++ b/common.h @@ -89,7 +89,7 @@ extern "C" { #include #endif -#ifdef OS_DARWIN +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) #include #endif From 14c3511e9271b06d57a7a3777dbe16b3717a48b7 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 20 May 2012 18:09:35 +0200 Subject: [PATCH 034/162] Respect C compiler set on the command line or inherited from the environment --- Makefile.system | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index c9e74faa6..c72326ed5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1 endif # Default C compiler +# - Only set if not specified on the command line or inherited from the environment. +# - CC is an implicit variable so neither '?=' or 'ifndef' can be used. +# http://stackoverflow.com/questions/4029274/mingw-and-make-variables +# - Default value is 'cc' which is not always a valid command (e.g. MinGW). 
+ifeq ($(origin CC),default) CC = gcc +endif + +# Default Fortran compiler (FC) is selected by f_check. ifndef MAKEFILE_RULE include $(TOPDIR)/Makefile.rule From 44124d3055fe09449ca591fad2db22a20a01d252 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 20 May 2012 18:11:34 +0200 Subject: [PATCH 035/162] Fix Fortran compiler detection - Test with '-x' operator to ensure file is executable. - 'break' is not a valid Perl keyword. --- f_check | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/f_check b/f_check index 93c39ec88..8e3855b10 100644 --- a/f_check +++ b/f_check @@ -32,11 +32,12 @@ if ($compiler eq "") { "pgf95", "pgf90", "pgf77", "ifort"); +OUTER: foreach $lists (@lists) { foreach $path (@path) { - if (-f $path . "/" . $lists) { + if (-x $path . "/" . $lists) { $compiler = $lists; - break; + last OUTER; } } } From e9be1fdd2bf373800c1cf3a5217b09a018284b21 Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Sun, 20 May 2012 21:44:15 -0300 Subject: [PATCH 036/162] FreeBSD: replace EXTRALIB -> FEXTRALIB --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 971bd0bed..83f2f5d0c 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
rm -f linktest From 10e25690b4591a4a25b7963fca27a899efc658cf Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 21 May 2012 12:10:26 +0200 Subject: [PATCH 037/162] Fix FreeBSD build (undefined reference to `pthread_create') --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 83f2f5d0c..971bd0bed 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) + -Wl,--retain-symbols-file=linux.def $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest From a27339b24443fa57d1afa09f2d7ecfa7757e1f42 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 21 May 2012 12:25:12 +0200 Subject: [PATCH 038/162] DLL: replace FEXTRALIB -> EXTRALIB (for consistency) --- exports/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 971bd0bed..b50b521f1 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:i386 /def:libopenblas.def else $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:X64 /def:libopenblas.def endif From 839b18aa260a4443f9b13615cb583c2f08af79b1 Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Mon, 21 May 2012 
16:56:28 -0400 Subject: [PATCH 039/162] FreeBSD: allow CC & FC to have different versions --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index b50b521f1..40a3a7c63 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest From 4e29b6ffc0a8e7c748975f44194098ad3d229f14 Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Mon, 21 May 2012 16:57:19 -0400 Subject: [PATCH 040/162] FreeBSD: fix OS_FreeBSD -> OS_FREEBSD typos --- common_x86.h | 2 +- common_x86_64.h | 2 +- driver/others/memory.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common_x86.h b/common_x86.h index fbb91f888..eaf395806 100644 --- a/common_x86.h +++ b/common_x86.h @@ -282,7 +282,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index 53b702185..735c9b294 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -353,7 +353,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/driver/others/memory.c b/driver/others/memory.c index 3f1a5f60a..9a925d290 100644 --- 
a/driver/others/memory.c +++ b/driver/others/memory.c @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_Darwin) #include #endif @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_Darwin) int get_num_procs(void) { @@ -215,7 +215,7 @@ int goto_get_num_procs (void) { int blas_get_cpu_number(void){ char *p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) int max_num; #endif int blas_goto_num = 0; @@ -223,7 +223,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) max_num = get_num_procs(); #endif @@ -250,7 +250,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From a431042475d414d8b786804f351467a33d24f1ae Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Wed, 23 May 2012 00:01:14 +0200 Subject: [PATCH 041/162] Fix inconsistent case for OS_* macros (Refs pull request #111) --- c_check | 8 ++++---- common.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest.c | 8 ++++---- driver/others/memory.c | 10 +++++----- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/c_check b/c_check index 6ce5e4cc0..b4105d19a 100644 --- a/c_check +++ b/c_check @@ -43,10 +43,10 
@@ $compiler = DEC if ($data =~ /COMPILER_DEC/); $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); -$os = FreeBSD if ($data =~ /OS_FreeBSD/); -$os = NetBSD if ($data =~ /OS_NetBSD/); -$os = Darwin if ($data =~ /OS_Darwin/); -$os = SunOS if ($data =~ /OS_SunOS/); +$os = FreeBSD if ($data =~ /OS_FREEBSD/); +$os = NetBSD if ($data =~ /OS_NETBSD/); +$os = Darwin if ($data =~ /OS_DARWIN/); +$os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); diff --git a/common.h b/common.h index 0868b594a..3718cdee4 100644 --- a/common.h +++ b/common.h @@ -68,7 +68,7 @@ extern "C" { #define SMP #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define WINDOWS_ABI #define OS_WINDOWS diff --git a/common_x86.h b/common_x86.h index eaf395806..468fc55eb 100644 --- a/common_x86.h +++ b/common_x86.h @@ -282,7 +282,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index 735c9b294..2dc788c93 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -353,7 +353,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/ctest.c b/ctest.c index 0c373bf2b..ac8283898 100644 --- a/ctest.c +++ b/ctest.c @@ -35,19 +35,19 @@ OS_LINUX #endif #if defined(__FreeBSD__) -OS_FreeBSD +OS_FREEBSD #endif #if defined(__NetBSD__) -OS_NetBSD +OS_NETBSD #endif #if defined(__sun) -OS_SunOS 
+OS_SUNOS #endif #if defined(__APPLE__) -OS_Darwin +OS_DARWIN #endif #if defined(_AIX) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9a925d290..9b8863f39 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) #include #endif @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) int get_num_procs(void) { @@ -215,7 +215,7 @@ int goto_get_num_procs (void) { int blas_get_cpu_number(void){ char *p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) int max_num; #endif int blas_goto_num = 0; @@ -223,7 +223,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) max_num = get_num_procs(); #endif @@ -250,7 +250,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 5199809bba04ebcd176f29d8769285a64b364c08 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Wed, 23 May 2012 00:04:04 +0200 Subject: [PATCH 042/162] Fix typo: OS_CYGWIN -> OS_CYGWIN_NT, OS_INERIX -> OS_INTERIX --- c_check | 2 +- common_x86.h | 4 ++-- ctest.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/c_check b/c_check index b4105d19a..4d82237d4 100644 --- a/c_check +++ b/c_check @@ -50,7 +50,7 @@ $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); -$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); +$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $architecture = x86 if ($data =~ /ARCH_X86/); diff --git a/common_x86.h b/common_x86.h index 468fc55eb..8f1a0308d 100644 --- a/common_x86.h +++ b/common_x86.h @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define PROFCODE #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define SAVEREGISTERS \ subl $32, %esp;\ movups %xmm6, 0(%esp);\ @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define RESTOREREGISTERS #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/ctest.c b/ctest.c index ac8283898..9fc0b0c40 100644 --- a/ctest.c +++ b/ctest.c @@ -63,7 +63,7 @@ OS_WINNT #endif #if defined(__CYGWIN__) -OS_CYGWIN +OS_CYGWIN_NT #endif #if defined(__INTERIX) From 8cc7f86cf7fd66ddf8f015e57c45315ae40daa17 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 25 May 2012 23:20:29 +0800 Subject: [PATCH 043/162] Detect Intel Core i7 3000 with Sandybridge. 
--- cpuid_x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 53b6f356c..0b9b5b6e6 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -990,7 +990,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1332,7 +1332,7 @@ int get_coretype(void){ return CORE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; From a6adbb299da0726eddaf95d4b32da8c5d0616227 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 29 May 2012 14:01:50 +0800 Subject: [PATCH 044/162] Refs #112. Improved setting thread affinity in Linux. Remove the limit (64) about the number of CPU cores. --- driver/others/init.c | 237 ++++++++++++++++++++++++++++++------------- 1 file changed, 166 insertions(+), 71 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 4adba661f..4a6f0aae8 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define MAX_NODES 16 #define MAX_CPUS 256 +#define NCPUBITS (8*sizeof(unsigned long)) +#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) +#define CPUELT(cpu) ((cpu) / NCPUBITS) +#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) + #define SH_MAGIC 0x510510 @@ -103,10 +108,10 @@ typedef struct { int num_nodes; int num_procs; int final_num_procs; - unsigned long avail; - + unsigned long avail [MAX_BITMASK_LEN]; + int avail_count; unsigned long cpu_info [MAX_CPUS]; - unsigned long node_info [MAX_NODES]; + unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; int cpu_use[MAX_CPUS]; } shm_t; @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; static int shmid, pshmid; static void *paddr; -static unsigned long lprocmask, lnodemask; +static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; +static int lprocmask_count = 0; static int numprocs = 1; static int numnodes = 1; @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { than sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. 
***/ -static inline unsigned long get_cpumap(int node) { +static inline void get_cpumap(int node, unsigned long * node_info) { int infile; - unsigned long affinity; + unsigned long affinity[32]; char name[160]; char cpumap[160]; - char *p, *dummy; + char *dummy; int i=0; + int count=0; + int k=0; sprintf(name, CPUMAP_NAME, node); infile = open(name, O_RDONLY); + for(i=0; i<32; i++){ + affinity[i] = 0; + } - affinity = 0; - if (infile != -1) { read(infile, cpumap, sizeof(cpumap)); - p = cpumap; - while (*p != '\n' && i<160){ - if(*p != ',') { - name[i++]=*p; - } - p++; - } - p = name; - // while ((*p == '0') || (*p == ',')) p++; + for(i=0; i<160; i++){ + if(cpumap[i] == '\n') + break; + if(cpumap[i] != ','){ + name[k++]=cpumap[i]; + + //Enough data for Hex + if(k >= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + } - affinity = strtoul(p, &dummy, 16); - + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + } - affinity = strtol(p, &p, 16); + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i num_nodes = 0; @@ -258,7 +309,9 @@ static int numa_check(void) { return 0; } - for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; + for (node = 0; node < MAX_NODES; node ++) { + for (j = 0; j node_info[node][j] = 0; + } while ((dir = readdir(dp)) != NULL) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { @@ -266,12 +319,12 @@ static int numa_check(void) { node = atoi(&dir -> d_name[4]); if (node > MAX_NODES) { - fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); + fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. 
Terminated.\n"); exit(1); } common -> num_nodes ++; - common -> node_info[node] = get_cpumap(node); + get_cpumap(node, common->node_info[node]); } } @@ -284,7 +337,7 @@ static int numa_check(void) { fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); for (node = 0; node < common -> num_nodes; node ++) - fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); + fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); #endif return common -> num_nodes; @@ -296,11 +349,13 @@ static void numa_mapping(void) { int i, j, h; unsigned long work, bit; int count = 0; + int bitmask_idx = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; for (cpu = 0; cpu < common -> num_procs; cpu ++) { - if (common -> node_info[node] & common -> avail & (1UL << cpu)) { + bitmask_idx = CPUELT(cpu); + if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); count ++; core ++; @@ -357,58 +412,89 @@ static void numa_mapping(void) { static void disable_hyperthread(void) { - unsigned long share; + unsigned long share[MAX_BITMASK_LEN]; int cpu; + int bitmask_idx = 0; + int i=0, count=0; + bitmask_idx = CPUELT(common -> num_procs); - if(common->num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); - exit(1); - }else if(common->num_procs == 64){ - common -> avail = 0xFFFFFFFFFFFFFFFFUL; - }else - common -> avail = (1UL << common -> num_procs) - 1; + for(i=0; i< bitmask_idx; i++){ + common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> num_procs) != 1){ + common -> avail[count++] = CPUMASK(common -> num_procs) - 1; + } + common -> avail_count = count; + + /* if(common->num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). 
Terminated.\n", common->num_procs); */ + /* exit(1); */ + /* }else if(common->num_procs == 64){ */ + /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* common -> avail = (1UL << common -> num_procs) - 1; */ #ifdef DEBUG - fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); + fprintf(stderr, "\nAvail CPUs : "); + for(i=0; i avail[i]); + fprintf(stderr, ".\n"); #endif for (cpu = 0; cpu < common -> num_procs; cpu ++) { - - share = (get_share(cpu, 1) & common -> avail); - - if (popcount(share) > 1) { + + get_share(cpu, 1, share); + + //When the shared cpu are in different element of share & avail array, this may be a bug. + for (i = 0; i < count ; i++){ + if (popcount(share[i]) > 1) { #ifdef DEBUG - fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", - cpu, share & ~(1UL << cpu)); + fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", + cpu, share[i] & ~(CPUMASK(cpu))); #endif - common -> avail &= ~((share & ~(1UL << cpu))); + common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); + } } } } static void disable_affinity(void) { - + int i=0; + int bitmask_idx=0; + int count=0; #ifdef DEBUG - fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); + fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); #endif - if(common->final_num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); - exit(1); - }else if(common->final_num_procs == 64){ - lprocmask = 0xFFFFFFFFFFFFFFFFUL; - }else - lprocmask = (1UL << common -> final_num_procs) - 1; + /* if(common->final_num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). 
Terminated.\n", common->final_num_procs); */ + /* exit(1); */ + /* }else if(common->final_num_procs == 64){ */ + /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* lprocmask = (1UL << common -> final_num_procs) - 1; */ + + bitmask_idx = CPUELT(common -> final_num_procs); + + for(i=0; i< bitmask_idx; i++){ + lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> final_num_procs) != 1){ + lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; + } + lprocmask_count = count; #ifndef USE_OPENMP - lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; + for(i=0; i< count; i++){ + lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; + } #endif #ifdef DEBUG - fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); + fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); #endif } @@ -498,7 +584,7 @@ static void create_pshmem(void) { static void local_cpu_map(void) { int cpu, id, mapping; - + int bitmask_idx = 0; cpu = 0; mapping = 0; @@ -508,8 +594,9 @@ static void local_cpu_map(void) { if (id > 0) { if (is_dead(id)) common -> cpu_use[cpu] = 0; } - - if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { + + bitmask_idx = CPUELT(cpu); + if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { common -> cpu_use[cpu] = pshmid; cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); @@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) { #ifndef USE_OPENMP cpu_set_t cpu_mask; #endif + int i; if (initialized) return; @@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) { common -> num_procs = get_nprocs(); + if(common -> num_procs > MAX_CPUS) { + fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). 
Terminated.\n", common->num_procs, MAX_CPUS); + exit(1); + } + for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; numa_check(); @@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); - common -> final_num_procs = popcount(common -> avail); + common -> final_num_procs = 0; + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; @@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) { disable_affinity(); - num_avail = popcount(lprocmask); + num_avail = 0; + for(i=0; i num_avail)) numprocs = num_avail; From a4daa34db77dd7410bd710be99cc22dd9dc5a5ce Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 30 May 2012 20:25:01 +0800 Subject: [PATCH 045/162] Refs #75. Use ffreep opcode directly. Please check out http://www.sandpile.org/x86/opc_fpu.htm . --- common_x86.h | 5 +++++ common_x86_64.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/common_x86.h b/common_x86.h index 8f1a0308d..4c17f3a04 100644 --- a/common_x86.h +++ b/common_x86.h @@ -356,4 +356,9 @@ REALNAME: #ifndef ALIGN_6 #define ALIGN_6 .align 64 + +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. +// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#define ffreep .byte 0xdf, 0xc0 # #endif diff --git a/common_x86_64.h b/common_x86_64.h index 2dc788c93..e61e37e6b 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -448,4 +448,8 @@ REALNAME: #define ALIGN_6 .align 64 #endif +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. +// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#define ffreep .byte 0xdf, 0xc0 # #endif From 37edae1c90c01d65e47ff57b3f98d6bedbfc766b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 31 May 2012 17:17:02 +0800 Subject: [PATCH 046/162] Refs #75. 
Check ffreep macro before the define. --- common_x86.h | 2 ++ common_x86_64.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/common_x86.h b/common_x86.h index 4c17f3a04..4316318ec 100644 --- a/common_x86.h +++ b/common_x86.h @@ -360,5 +360,7 @@ REALNAME: // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif +#endif diff --git a/common_x86_64.h b/common_x86_64.h index e61e37e6b..7b6d11f7d 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,5 +451,7 @@ REALNAME: // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif +#endif From d6cab3f37ecab53d562e931ef358934940ac22d3 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 31 May 2012 18:17:45 +0800 Subject: [PATCH 047/162] Refs #113. Support AMD Bobcate using Barcelona kernel codes. Replace 3DNow! with MMX. 
--- Makefile.system | 4 +- TargetList.txt | 1 + cpuid.h | 2 + cpuid_x86.c | 10 ++++- driver/others/parameter.c | 4 +- getarch.c | 18 +++++++- kernel/setparam-ref.c | 16 +++++++ kernel/x86/KERNEL.BOBCATE | 59 +++++++++++++++++++++++++ kernel/x86/trsm_kernel_LN_2x4_sse2.S | 10 ++--- kernel/x86/trsm_kernel_LN_4x4_sse.S | 22 +++++----- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 10 ++--- kernel/x86/trsm_kernel_LT_4x4_sse.S | 22 +++++----- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 10 ++--- kernel/x86/trsm_kernel_RT_4x4_sse.S | 22 +++++----- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 4 +- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 4 +- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 4 +- kernel/x86_64/KERNEL.BOBCATE | 62 +++++++++++++++++++++++++++ kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 7 +++ l2param.h | 2 +- param.h | 64 +++++++++++++++++++++++++++- 29 files changed, 303 insertions(+), 70 deletions(-) create mode 100644 kernel/x86/KERNEL.BOBCATE create mode 100644 kernel/x86_64/KERNEL.BOBCATE diff --git a/Makefile.system b/Makefile.system index e2c908e98..987bb83cf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -247,11 +247,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO endif ifndef DYNAMIC_CORE diff --git a/TargetList.txt 
b/TargetList.txt index 9e0db4866..19008b862 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -28,6 +28,7 @@ OPTERON_SSE3 BARCELONA SHANGHAI ISTANBUL +BOBCATE c)VIA CPU: SSE_GENERIC diff --git a/cpuid.h b/cpuid.h index c0f21698d..1678d0a7e 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,6 +104,7 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 +#define CORE_BOBCATE 21 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -191,4 +192,5 @@ typedef struct { #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 #define CPUTYPE_SANDYBRIDGE 44 +#define CPUTYPE_BOBCATE 45 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 0b9b5b6e6..d31146a98 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1028,6 +1028,8 @@ int get_cpuname(void){ case 1: case 10: return CPUTYPE_BARCELONA; + case 5: + return CPUTYPE_BOBCATE; } break; } @@ -1148,6 +1150,7 @@ static char *cpuname[] = { "VIAC3", "NANO", "SANDYBRIDGE", + "BOBCATE", }; static char *lowercpuname[] = { @@ -1195,6 +1198,7 @@ static char *lowercpuname[] = { "nsgeode", "nano", "sandybridge", + "bobcate", }; static char *corename[] = { @@ -1219,6 +1223,7 @@ static char *corename[] = { "ATOM", "NANO", "SANDYBRIDGE", + "BOBCATE", }; static char *corename_lower[] = { @@ -1243,6 +1248,7 @@ static char *corename_lower[] = { "atom", "nano", "sandybridge", + "bobcate", }; @@ -1351,7 +1357,9 @@ int get_coretype(void){ if (family <= 0x5) return CORE_80486; if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; + else if (exfamily == 5) return CORE_BOBCATE; + else return CORE_BARCELONA; } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 5ff1f2934..ab90b89f0 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || 
defined(BARCELONA) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) @@ -446,7 +446,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_BARCELONA) +#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE) size >>= 8; sgemm_p = 232 * size; diff --git a/getarch.c b/getarch.c index d8f467f03..a8c311035 100644 --- a/getarch.c +++ b/getarch.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ +/* #define FORCE_BOBCATE */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "BARCELONA" #endif +#if defined(FORCE_BOBCATE) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BOBCATE" +#define ARCHCONFIG "-DBOBCATE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" +#define LIBNAME "bobcate" +#define CORENAME "BOBCATE" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e841bb171..4f438d5af 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -794,6 +794,22 @@ static void init_parameter(void) { #endif #endif +#ifdef BOBCATE + +#ifdef DEBUG + fprintf(stderr, "Bobcate\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86/KERNEL.BOBCATE b/kernel/x86/KERNEL.BOBCATE new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BOBCATE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = 
dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index 9a7a466a6..f16dda05f 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define 
PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 147ed19bd..455096a63 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), 
%xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index e4f59819b..0222caccb 100644 --- 
a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 8d6189865..4c38714da 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif 
movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if 
defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index 6c2682a10..94a479474 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 0d2fcb6d2..95e3d469b 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW 
prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || 
defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index f77a06d6c..f75f0ae08 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 84d40ddec..be5aa54b9 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index bce0b0252..e0f37c3e2 100644 
--- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/KERNEL.BOBCATE b/kernel/x86_64/KERNEL.BOBCATE new file mode 100644 index 000000000..051a52286 --- /dev/null +++ b/kernel/x86_64/KERNEL.BOBCATE @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S 
+ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 9db45a642..af7afafcc 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index ca03f86b7..a01d4def6 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 01ad2d96e..958f26df8 100644 --- 
a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 60c1ea778..580f6d1f8 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index fc54dc4a5..aa46ba68b 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index eae31b955..14d696024 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 4d6ad3326..ded298a98 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) 
#define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 2623bfe6d..fb20a1a2a 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/l1param.h b/l1param.h index 61c61aa94..aef675633 100644 --- a/l1param.h +++ b/l1param.h @@ -67,6 +67,13 @@ #define ALIGNED_ACCESS #endif +#ifdef BOBCATE +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/l2param.h b/l2param.h index a371b2ded..a2b632e97 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps diff --git a/param.h b/param.h index 53159a4fd..f0e49cc8b 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#define SGEMM_DEFAULT_R sgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + +#if defined(BOBCATE) + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + + +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 224 +#define QGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#define ZGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define QGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + + #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R dgemm_r From d3b67d0bd85f7036954ebcda6d2d7dcc20c5da19 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 31 May 2012 22:40:15 +0800 Subject: [PATCH 048/162] Refs #113. 
Fixed the typo BOBCATE -> BOBCAT --- TargetList.txt | 2 +- cpuid.h | 4 ++-- cpuid_x86.c | 12 +++++----- driver/others/parameter.c | 4 ++-- getarch.c | 12 +++++----- kernel/setparam-ref.c | 2 +- kernel/x86/{KERNEL.BOBCATE => KERNEL.BOBCAT} | 0 kernel/x86/trsm_kernel_LN_2x4_sse2.S | 10 ++++----- kernel/x86/trsm_kernel_LN_4x4_sse.S | 22 +++++++++---------- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 10 ++++----- kernel/x86/trsm_kernel_LT_4x4_sse.S | 22 +++++++++---------- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 10 ++++----- kernel/x86/trsm_kernel_RT_4x4_sse.S | 22 +++++++++---------- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 4 ++-- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 4 ++-- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 4 ++-- .../x86_64/{KERNEL.BOBCATE => KERNEL.BOBCAT} | 0 kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 2 +- l2param.h | 2 +- param.h | 2 +- 28 files changed, 83 insertions(+), 83 deletions(-) rename kernel/x86/{KERNEL.BOBCATE => KERNEL.BOBCAT} (100%) rename kernel/x86_64/{KERNEL.BOBCATE => KERNEL.BOBCAT} (100%) diff --git a/TargetList.txt b/TargetList.txt index 19008b862..1a212e6ca 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -28,7 +28,7 @@ OPTERON_SSE3 BARCELONA SHANGHAI ISTANBUL -BOBCATE +BOBCAT c)VIA CPU: SSE_GENERIC diff --git a/cpuid.h b/cpuid.h index 1678d0a7e..fdcfcea00 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,7 +104,7 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 -#define CORE_BOBCATE 21 +#define CORE_BOBCAT 21 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -192,5 +192,5 @@ typedef struct { #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 #define CPUTYPE_SANDYBRIDGE 44 -#define CPUTYPE_BOBCATE 45 +#define CPUTYPE_BOBCAT 45 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 
d31146a98..204f41d51 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1029,7 +1029,7 @@ int get_cpuname(void){ case 10: return CPUTYPE_BARCELONA; case 5: - return CPUTYPE_BOBCATE; + return CPUTYPE_BOBCAT; } break; } @@ -1150,7 +1150,7 @@ static char *cpuname[] = { "VIAC3", "NANO", "SANDYBRIDGE", - "BOBCATE", + "BOBCAT", }; static char *lowercpuname[] = { @@ -1198,7 +1198,7 @@ static char *lowercpuname[] = { "nsgeode", "nano", "sandybridge", - "bobcate", + "bobcat", }; static char *corename[] = { @@ -1223,7 +1223,7 @@ static char *corename[] = { "ATOM", "NANO", "SANDYBRIDGE", - "BOBCATE", + "BOBCAT", }; static char *corename_lower[] = { @@ -1248,7 +1248,7 @@ static char *corename_lower[] = { "atom", "nano", "sandybridge", - "bobcate", + "bobcat", }; @@ -1358,7 +1358,7 @@ int get_coretype(void){ if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; - else if (exfamily == 5) return CORE_BOBCATE; + else if (exfamily == 5) return CORE_BOBCAT; else return CORE_BARCELONA; } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index ab90b89f0..d261e5a4e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) @@ -446,7 +446,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE) +#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) size >>= 8; sgemm_p = 232 * size; diff --git a/getarch.c b/getarch.c index a8c311035..7e08e774e 100644 --- a/getarch.c +++ b/getarch.c @@ -102,7 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ -/* #define FORCE_BOBCATE */ +/* #define FORCE_BOBCAT */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -364,19 +364,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "BARCELONA" #endif -#if defined(FORCE_BOBCATE) +#if defined(FORCE_BOBCAT) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" -#define SUBARCHITECTURE "BOBCATE" -#define ARCHCONFIG "-DBOBCATE " \ +#define SUBARCHITECTURE "BOBCAT" +#define ARCHCONFIG "-DBOBCAT " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" -#define LIBNAME "bobcate" -#define CORENAME "BOBCATE" +#define LIBNAME "bobcat" +#define CORENAME "BOBCAT" #endif #ifdef FORCE_SSE_GENERIC diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4f438d5af..f57b425e6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -794,7 +794,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BOBCATE +#ifdef BOBCAT #ifdef DEBUG fprintf(stderr, "Bobcate\n"); diff --git a/kernel/x86/KERNEL.BOBCATE b/kernel/x86/KERNEL.BOBCAT similarity index 100% rename from kernel/x86/KERNEL.BOBCATE rename to kernel/x86/KERNEL.BOBCAT diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index f16dda05f..2b6877a31 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ 
.L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 455096a63..82bb1d3ec 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 
(PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || 
defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index 0222caccb..d81177b7e 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 4c38714da..854c44e7a 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || 
defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) 
|| defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index 94a479474..f7a08c699 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if 
defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 95e3d469b..80dc2451c 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 
32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index f75f0ae08..ee9eb9d25 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git 
a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index be5aa54b9..9ef572470 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index e0f37c3e2..cd1bf2f53 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/KERNEL.BOBCATE b/kernel/x86_64/KERNEL.BOBCAT similarity index 100% rename from kernel/x86_64/KERNEL.BOBCATE rename to kernel/x86_64/KERNEL.BOBCAT diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index af7afafcc..5a123d7f6 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if 
defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index a01d4def6..8afdc87db 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 958f26df8..5aef6b461 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 580f6d1f8..fa1bfba85 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index aa46ba68b..6af65a4ba 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git 
a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 14d696024..71aca0198 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index ded298a98..4b8422d82 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index fb20a1a2a..33667f79e 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/l1param.h b/l1param.h index aef675633..6fe756f17 100644 --- a/l1param.h +++ b/l1param.h @@ -67,7 +67,7 @@ #define ALIGNED_ACCESS #endif -#ifdef BOBCATE +#ifdef BOBCAT #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 5) diff --git a/l2param.h b/l2param.h index a2b632e97..cdbd8805e 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps diff --git a/param.h b/param.h index f0e49cc8b..3add52615 100644 --- a/param.h +++ 
b/param.h @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BOBCATE) +#if defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 From eefd30881c3b1f46b3f9490815b1cd3286e63e4d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Jun 2012 21:34:23 +0800 Subject: [PATCH 049/162] Refs #113. Fixed the build bug on AMD Bobcat 64-bit OS. --- kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index fb428cbf5..b8caa9a44 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index e9edc29ac..2db8cbc5d 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index dabc97c3e..16c9ca828 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S 
@@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index 7375c3487..dbdbfe2e1 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 3ab9e5be8..181cdd29c 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index 85c0ac231..c28d02927 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta From f76f9525477785bb452699c07d1985ec14dc2b61 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 16:17:43 +0800 Subject: [PATCH 
050/162] Refs #83 #53. Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions. --- kernel/generic/zgemm_ncopy_4_sandy.c | 235 ++ kernel/generic/zgemm_ncopy_8_sandy.c | 401 +++ kernel/generic/zgemm_tcopy_4_sandy.c | 237 ++ kernel/generic/zgemm_tcopy_8_sandy.c | 370 ++ kernel/x86_64/KERNEL.SANDYBRIDGE | 103 +- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 4478 ++++++++++++++++++++++++ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 3186 +++++++++++++++++ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 3736 ++++++++++++++++++++ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 3257 +++++++++++++++++ param.h | 100 +- 10 files changed, 15985 insertions(+), 118 deletions(-) create mode 100644 kernel/generic/zgemm_ncopy_4_sandy.c create mode 100644 kernel/generic/zgemm_ncopy_8_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_4_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_8_sandy.c create mode 100644 kernel/x86_64/cgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/dgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/sgemm_kernel_8x8_sandy.S create mode 100644 kernel/x86_64/zgemm_kernel_4x4_sandy.S diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + **********************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j Date: Tue, 19 Jun 2012 17:05:16 +0800 Subject: [PATCH 051/162] Refs #113. Fixed BOBCATE typo in dynamic arch building. 
--- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 987bb83cf..425cbb68a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -247,11 +247,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifndef DYNAMIC_CORE From 996dc6d1c89e605a721294685db8549cd21e19b3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 19 Jun 2012 17:29:06 +0800 Subject: [PATCH 052/162] Fixed dynamic_arch building bug. --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index db9ec6a3b..27aeeb6ac 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 From 3ef96aa567e27ab76f07701b37da1ca0c0c59f39 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 20:37:22 +0800 Subject: [PATCH 053/162] Fixed bug in MOVQ redefine and ALIGN SIZE problem. 
--- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 137 +++++++++--------- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 163 +++++++++++----------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 185 +++++++++++++------------ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 118 ++++++++-------- 4 files changed, 304 insertions(+), 299 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 2b4e4dc64..56ebee120 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -305,7 +306,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -320,7 +321,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -367,7 +368,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -599,7 +600,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_loopB: ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -717,7 +718,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -875,7 +876,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; @@ -934,7 +935,7 @@ 
ADDQ $16*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1113,7 +1114,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L8_bodyB; -.align 32 +ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1250,7 +1251,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1323,7 +1324,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1527,7 +1528,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 32 +ALIGN_5 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1660,7 +1661,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1731,7 +1732,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 32 +ALIGN_5 
.L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1905,7 +1906,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1986,7 +1987,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2031,7 +2032,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2145,7 +2146,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2181,7 +2182,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2438,7 +2439,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ 
-2571,7 +2572,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2862,7 +2863,7 @@ MOVQ %rax, kkk; SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3010,7 +3011,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3089,7 +3090,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3267,7 +3268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3359,7 +3360,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3409,7 +3410,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: 
EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 32 +ALIGN_5 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3532,7 +3533,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3585,7 +3586,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3616,7 +3617,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -3695,7 +3696,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: MOVQ bb, ptrbb; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) @@ -3727,7 +3728,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3808,7 +3809,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3853,7 +3854,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3941,11 +3942,11 @@ ADDQ $8, kk; ADDQ $16*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $4, bm; 
JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3974,7 +3975,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4031,7 +4032,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4064,7 +4065,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4157,7 +4158,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4210,7 +4211,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4241,7 +4242,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4329,7 +4330,7 @@ MOVQ %rax, kkk; 
#endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4362,7 +4363,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4383,7 +4384,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index fea5ecb4a..c98879d7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JNE jne #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -265,7 +266,7 @@ movq %r11, kk MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -281,7 +282,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -328,7 +329,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb; @@ -459,7 +460,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -529,7 +530,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_bodyB:; 
#### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -588,7 +589,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32; +ALIGN_5; .L4_loopEx:; EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32 +ALIGN_5 .L1_loopE:; TEST $4, bm; # Rm = 4 JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -816,7 +817,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L6_loopE; -.align 32; +ALIGN_5; .L6_bodyB:; # Computing kernel @@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; DECQ k; JG .L6_bodyB; -.align 32 +ALIGN_5 .L6_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -896,7 +897,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB:; #### Untoll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -940,7 +941,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB:; #### Untoll time 1 #### MUL_DY yvec0, yvec2, yvec6; @@ -977,7 +978,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L8_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1014,7 +1015,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L8_loopEx:; EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1; .L5_loopE:; TEST $2, bm; JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1117,7 +1118,7 @@ 
MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32; +ALIGN_5; .L10_bodyB:; # Computing kernel @@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec9; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE:; #ifndef TRMMKERNEL TEST $2, bk @@ -1201,7 +1202,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB:; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; @@ -1248,7 +1249,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2; @@ -1285,7 +1286,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L12_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec13; @@ -1310,7 +1311,7 @@ ADDQ $2, kk ADDQ $2*SIZE, C0 ADDQ $2*SIZE, C1 JMP .L9_loopE; -.align 32 +ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14; @@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1; .L9_loopE:; TEST $1, bm JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1379,7 +1380,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1413,7 +1414,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1434,7 +1435,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32; +ALIGN_5; 
.L20_loopB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1501,7 +1502,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; # Rm = 8 JLE .L21_loopE; -.align 32; +ALIGN_5; .L21_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1538,7 +1539,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32; +ALIGN_5; .L211_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7; ADD_DX xvec7, xvec8; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1701,7 +1702,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L212_loopE; -.align 32; +ALIGN_5; .L212_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1788,7 +1789,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -1858,7 +1859,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -1956,7 +1957,7 @@ JG .L21_bodyB; .L21_loopE:; TEST $4, bm; # Rm = 4 JLE .L22_loopE; -.align 32; +ALIGN_5; .L22_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1989,7 +1990,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB:; # Computing kernel #### Unroll time 1 #### @@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec10; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -2080,7 +2081,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### 
Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2129,7 +2130,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2171,7 +2172,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L223_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2196,7 +2197,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L22_loopE; -.align 32 +ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1; .L22_loopE:; TEST $2, bm; # Rm = 2 JLE .L23_loopE; -.align 32; +ALIGN_5; .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2267,7 +2268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: # Computing kernel #### Unroll time 1 #### @@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2318,7 +2319,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2347,7 +2348,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2373,7 +2374,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L233_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2394,7 +2395,7 @@ ADDQ $2, kk; ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; JMP .L23_loopE; -.align 32 +ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; # Rm = 1 JLE .L24_loopE; -.align 32; +ALIGN_5; .L24_bodyB: #if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2454,7 +2455,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align 32 +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2488,7 +2489,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2509,7 +2510,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE:; TEST $1, bn; # Rn = 1 JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -2562,7 +2563,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2593,7 +2594,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2643,7 +2644,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2673,7 +2674,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2696,7 +2697,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L313_loopEx; -.align 32 
+ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; JMP .L31_loopE; -.align 32 +ALIGN_5 .L313_loopEx: EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2766,7 +2767,7 @@ JG .L31_bodyB; .L31_loopE: TEST $4, bm JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2796,7 +2797,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2830,7 +2831,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2852,7 +2853,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2870,7 +2871,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L323_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL @@ -2891,7 +2892,7 @@ ADDQ $4, kk #endif ADDQ $4*SIZE, C0; JMP .L32_loopE; -.align 32 +ALIGN_5 .L323_loopEx: #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; @@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $2, bm JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -2951,7 +2952,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, 
ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2,bk; @@ -2985,7 +2986,7 @@ MOVQ kkk, %rax; TEST $2, %rax #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3006,7 +3007,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0; .L33_loopE: TEST $1, bm JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3068,7 +3069,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3102,7 +3103,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3124,7 +3125,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 44f8f1802..4d16a60d0 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -273,7 +274,7 @@ movq %r11, kk MOVQ bn,j; SARQ $3,j; JLE .L0_loopE; -.align 16; +ALIGN_4; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -289,7 +290,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; JLE .L1_loopE; -.align 16; +ALIGN_4; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -342,7 +343,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 16; +ALIGN_4; .L2_bodyB:; # Computing kernel @@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_4 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -480,7 +481,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_4 .L3_loobB: #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -550,7 +551,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_4 .L4_loopB:; #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -609,7 +610,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; @@ -669,7 +670,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 16; +ALIGN_4; .L4_loopEx: LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1, yvec15, xvec7; @@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 16 +ALIGN_4 .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 16 +ALIGN_4 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -857,7 +858,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 16 +ALIGN_4 .L8_bodyB: #### Unroll time 1 #### @@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5; ADD_SX xvec5, xvec8; DECQ k; JG .L8_bodyB; -.align 16 +ALIGN_4 
.L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -991,7 +992,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 16 +ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1062,7 +1063,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 16 +ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1122,7 +1123,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L10_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL ADD_SX 0*SIZE(C0), xvec15; @@ -1155,7 +1156,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 16 +ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL @@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 16 +ALIGN_4 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1249,7 +1250,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 16 +ALIGN_4 .L11_bodyB: #### Computing kernel LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 @@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 16 +ALIGN_4 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1326,7 +1327,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 16 +ALIGN_4 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1368,7 +1369,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 16 +ALIGN_4 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 16 +ALIGN_4 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1465,7 +1466,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; 
-.align 16 +ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 16 +ALIGN_4 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1511,7 +1512,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 16 +ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1538,7 +1539,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 16 +ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 16; +ALIGN_4; .L0_loopE:; TEST $4, bn; # Rn = 4 JLE .L20_loopE; -.align 16; +ALIGN_4; .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1628,7 +1629,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 16 +ALIGN_4 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1668,7 +1669,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L211_loopE; -.align 16 +ALIGN_4 .L211_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; -.align 16 +ALIGN_4 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk @@ -1808,7 +1809,7 @@ TEST $2, bk TEST $2, kkk; #endif JLE .L212_loopE; -.align 16 +ALIGN_4 .L212_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1882,7 +1883,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 16 +ALIGN_4 .L213_bodyB: ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; @@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 16 +ALIGN_4 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 16 +ALIGN_4 .L22_bodyB: #if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2019,7 +2020,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 16 +ALIGN_4 .L221_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 16 +ALIGN_4 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2097,7 +2098,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 16 +ALIGN_4 .L222_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2139,7 +2140,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 16 +ALIGN_4 .L223_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 16 +ALIGN_4 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2234,7 +2235,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 16 +ALIGN_4 .L231_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 16 +ALIGN_4 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2282,7 +2283,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 16 +ALIGN_4 .L232_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2310,7 +2311,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 16 +ALIGN_4 .L233_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 16 +ALIGN_4 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2386,7 +2387,7 @@ MOVQ %rax, kkk; #endif 
SARQ $2, k; JLE .L241_loopE; -.align 16 +ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2419,7 +2420,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 16 +ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2440,7 +2441,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 16; +ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C; .L20_loopE: TEST $2, bn; JLE .L30_loopE; -.align 16 +ALIGN_4 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2503,7 +2504,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 16 +ALIGN_4 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2536,7 +2537,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 16 +ALIGN_4 .L311_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 16 +ALIGN_4 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2620,7 +2621,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 16 +ALIGN_4 .L312_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2666,7 +2667,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 16 +ALIGN_4 .L313_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L31_bodyB; -.align 16 +ALIGN_4 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 16 +ALIGN_4 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2766,7 +2767,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 16 +ALIGN_4 .L321_bodyB: LD_SX 0*SIZE(ptrba), xvec0; 
LD_SX 0*SIZE(ptrbb), xvec2; @@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 16 +ALIGN_4 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2814,7 +2815,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 16 +ALIGN_4 .L322_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2842,7 +2843,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 16 +ALIGN_4 .L323_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 16 +ALIGN_4 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2920,7 +2921,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 16 +ALIGN_4 .L331_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 16 +ALIGN_4 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2951,7 +2952,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 16 +ALIGN_4 .L332_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2972,7 +2973,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 16 +ALIGN_4 .L333_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 16 +ALIGN_4 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3062,7 +3063,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 16 +ALIGN_4 .L341_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba; addq $8*SIZE, ptrbb; decq k; jg 
.L341_bodyB; -.align 16 +ALIGN_4 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3112,7 +3113,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 16 +ALIGN_4 .L342_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3140,7 +3141,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 16 +ALIGN_4 .L343_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C; .L30_loopE: TEST $1, bn; JLE .L40_loopE; -.align 16 +ALIGN_4 .L40_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -3200,7 +3201,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L41_loopE; -.align 16 +ALIGN_4 .L41_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3230,7 +3231,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L411_loopE; -.align 16 +ALIGN_4 .L411_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L411_bodyB; -.align 16 +ALIGN_4 .L411_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3264,7 +3265,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L412_loopE; -.align 16 +ALIGN_4 .L412_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3285,7 +3286,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L413_loopE; -.align 16 +ALIGN_4 .L413_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3329,11 +3330,11 @@ ADDQ $8, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L41_bodyB; -.align 16 +ALIGN_4 .L41_loopE: TEST $4, bm; JLE .L42_loopE; -.align 16 +ALIGN_4 .L42_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3362,7 +3363,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L421_loopE; -.align 16 +ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3387,7 +3388,7 @@ 
ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L421_bodyB; -.align 16 +ALIGN_4 .L421_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3395,7 +3396,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L422_loopE; -.align 16 +ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3416,7 +3417,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L423_loopE; -.align 16 +ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0; .L42_loopE: TEST $2, bm; JLE .L43_loopE; -.align 16 +ALIGN_4 .L43_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3481,7 +3482,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L431_loopE; -.align 16 +ALIGN_4 .L431_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L431_bodyB; -.align 16 +ALIGN_4 .L431_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3526,7 +3527,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L432_loopE; -.align 16 +ALIGN_4 .L432_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3553,7 +3554,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L433_loopE; -.align 16 +ALIGN_4 .L433_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3592,7 +3593,7 @@ addq $2*SIZE, C0; .L43_loopE: TEST $1, bm; JLE .L44_loopE; -.align 16 +ALIGN_4 .L44_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3621,7 +3622,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L441_loopE; -.align 16 +ALIGN_4 .L441_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L441_bodyB; -.align 16 +ALIGN_4 .L441_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3654,7 +3655,7 @@ 
TEST $2, bk; TEST $2, kkk; #endif JLE .L442_loopE; -.align 16 +ALIGN_4 .L442_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3675,7 +3676,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L443_loopE; -.align 16 +ALIGN_4 .L443_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index 34abbb529..f6f9f707f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ +#define MOVQ movq #define XOR_SY vxorps #define XOR_DY vxorpd @@ -297,7 +299,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -312,7 +314,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $2,i; # Rm = 4 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -361,7 +363,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; #### Computing kernel #### @@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -592,7 +594,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -710,7 +712,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; #### Unroll time 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); @@ -852,7 +854,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 #### Store Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -912,7 +914,7 @@ ADDQ 
$8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $2, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1060,7 +1062,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #### Compute kernel #### #### Unroll times 1 #### @@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L7_bodyB; -.align 32 +ALIGN_5 .L7_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1202,7 +1204,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1276,7 +1278,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1364,7 +1366,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L9_loopEx; -.align 32 +ALIGN_5 #### Writing back #### EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1401,7 +1403,7 @@ ADDQ $2, kk; ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L9_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $1, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1496,7 +1498,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 
.L10_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1578,7 +1580,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1624,7 +1626,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1738,7 +1740,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $2, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1770,7 +1772,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrba; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1899,7 +1901,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1969,7 +1971,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2058,7 +2060,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0),xvec15; @@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST 
$2, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2187,7 +2189,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13; ADDQ $16*SIZE, ptrba; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2276,7 +2278,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2325,7 +2327,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $1, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2448,7 +2450,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2506,7 +2508,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2540,7 +2542,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, 
%rax; @@ -2625,7 +2627,7 @@ MOVQ C, C0; MOVQ bm, i; SARQ $2, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2655,7 +2657,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2740,7 +2742,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2787,7 +2789,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2877,11 +2879,11 @@ ADDQ $4, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $2, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2910,7 +2912,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2959,7 +2961,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2988,7 +2990,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $1, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: 
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3078,7 +3080,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3131,7 +3133,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3162,7 +3164,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; From 6cfcb54a2810b4607f9b9353e275345c2d64f27f Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 07:38:39 +0800 Subject: [PATCH 054/162] Fixed align problem in S and C precision GEMM kernels. 
--- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 2 +- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 56ebee120..5987b8e61 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -3578,7 +3578,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 4d16a60d0..23eda3af8 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -2412,7 +2412,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_4 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; From 88c272f6a739039460afbca3e47b55cd3555f585 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 20 Jun 2012 09:20:20 +0800 Subject: [PATCH 055/162] Refs #83. Added the missing ALIGN_5 macro on Mac OSX. However, it still exists SEGFAULT bug. --- common_x86_64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7b6d11f7d..19b0ac53c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -425,6 +425,7 @@ REALNAME: #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 +#define ALIGN_5 .align 5 #define ffreep fstp #endif From b8b922d334568ea2cf5d7c471be187715ddfb33f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 20 Jun 2012 11:07:36 +0800 Subject: [PATCH 056/162] Fixed #106. Use fetch instead of curl on FreeBSD. 
--- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 905d686a2..796217291 100644 --- a/Makefile +++ b/Makefile @@ -257,12 +257,16 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz lapack-3.4.1.tgz : ifndef NOFORTRAN #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),Darwin FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) curl -O $(LAPACK_URL) +else +ifeq ($(OSNAME), FreeBSD) + fetch $(LAPACK_URL) else wget $(LAPACK_URL) endif endif +endif large.tgz : ifndef NOFORTRAN From d34fce56e4a980fefe4ddafe5d371798ad948b59 Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 19:53:18 +0800 Subject: [PATCH 057/162] Refs #83 Fixed S/DGEMM calling conventions bug on windows. --- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 67 ++++++++++++++------------ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1 + 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index c98879d7c..603552464 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -162,7 +162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ST_SX movaps #define ST_DX movapd #define STL_DX movlpd +#define STL_DY vmovlpd #define STH_DX movhpd +#define STH_DY vmovhpd #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup @@ -242,6 +244,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc @@ -660,10 +663,10 @@ LDL_DY 2*SIZE(C1), xvec5, xvec5; LDH_DY 3*SIZE(C1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec15, 0*SIZE(C0); -STH_DX xvec15, 1*SIZE(C0); -STL_DX xvec7, 2*SIZE(C1); -STH_DX xvec7, 3*SIZE(C1); +STL_DY xvec15, 0*SIZE(C0); +STH_DY xvec15, 1*SIZE(C0); +STL_DY xvec7, 2*SIZE(C1); +STH_DY xvec7, 3*SIZE(C1); EXTRA_DY $1, yvec14, xvec4; #ifndef TRMMKERNEL @@ -674,10 +677,10 @@ LDL_DY 6*SIZE(C1), xvec2, xvec2; LDH_DY 7*SIZE(C1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec14, 4*SIZE(C0); -STH_DX xvec14, 5*SIZE(C0); -STL_DX xvec4, 6*SIZE(C1); -STH_DX xvec4, 7*SIZE(C1); +STL_DY xvec14, 4*SIZE(C0); +STH_DY xvec14, 5*SIZE(C0); +STL_DY xvec4, 6*SIZE(C1); +STH_DY xvec4, 7*SIZE(C1); EXTRA_DY $1, yvec13, xvec7; #ifndef TRMMKERNEL @@ -688,10 +691,10 @@ LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec13, 0*SIZE(C0, ldc, 1); -STH_DX xvec13, 1*SIZE(C0, ldc, 1); -STL_DX xvec7, 2*SIZE(C1, ldc, 1); -STH_DX xvec7, 3*SIZE(C1, ldc, 1); +STL_DY xvec13, 0*SIZE(C0, ldc, 1); +STH_DY xvec13, 1*SIZE(C0, ldc, 1); +STL_DY xvec7, 2*SIZE(C1, ldc, 1); +STH_DY xvec7, 3*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL @@ -702,10 +705,10 @@ LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec12, 4*SIZE(C0, ldc, 1); -STH_DX xvec12, 5*SIZE(C0, ldc ,1); -STL_DX xvec4, 6*SIZE(C1, ldc, 1); -STH_DX xvec4, 7*SIZE(C1, ldc, 1); +STL_DY xvec12, 4*SIZE(C0, ldc, 1); +STH_DY xvec12, 5*SIZE(C0, ldc ,1); +STL_DY xvec4, 6*SIZE(C1, ldc, 1); +STH_DY xvec4, 7*SIZE(C1, ldc, 1); 
EXTRA_DY $1, yvec11, xvec7; #ifndef TRMMKERNEL @@ -716,10 +719,10 @@ LDL_DY 2*SIZE(C0), xvec5, xvec5; LDH_DY 3*SIZE(C0), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec11, 0*SIZE(C1); -STH_DX xvec11, 1*SIZE(C1); -STL_DX xvec7, 2*SIZE(C0); -STH_DX xvec7, 3*SIZE(C0); +STL_DY xvec11, 0*SIZE(C1); +STH_DY xvec11, 1*SIZE(C1); +STL_DY xvec7, 2*SIZE(C0); +STH_DY xvec7, 3*SIZE(C0); EXTRA_DY $1, yvec10, xvec4; #ifndef TRMMKERNEL @@ -730,10 +733,10 @@ LDL_DY 6*SIZE(C0), xvec2, xvec2; LDH_DY 7*SIZE(C0), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec10, 4*SIZE(C1); -STH_DX xvec10, 5*SIZE(C1); -STL_DX xvec4, 6*SIZE(C0); -STH_DX xvec4, 7*SIZE(C0); +STL_DY xvec10, 4*SIZE(C1); +STH_DY xvec10, 5*SIZE(C1); +STL_DY xvec4, 6*SIZE(C0); +STH_DY xvec4, 7*SIZE(C0); EXTRA_DY $1, yvec9, xvec7; #ifndef TRMMKERNEL @@ -744,10 +747,10 @@ LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec9, 0*SIZE(C1, ldc, 1); -STH_DX xvec9, 1*SIZE(C1, ldc, 1); -STL_DX xvec7, 2*SIZE(C0, ldc, 1); -STH_DX xvec7, 3*SIZE(C0, ldc, 1); +STL_DY xvec9, 0*SIZE(C1, ldc, 1); +STH_DY xvec9, 1*SIZE(C1, ldc, 1); +STL_DY xvec7, 2*SIZE(C0, ldc, 1); +STH_DY xvec7, 3*SIZE(C0, ldc, 1); EXTRA_DY $1, yvec8, xvec4; #ifndef TRMMKERNEL @@ -758,10 +761,10 @@ LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec8, 4*SIZE(C1, ldc, 1); -STH_DX xvec8, 5*SIZE(C1, ldc, 1); -STL_DX xvec4, 6*SIZE(C0, ldc, 1); -STH_DX xvec4, 7*SIZE(C0, ldc, 1); +STL_DY xvec8, 4*SIZE(C1, ldc, 1); +STH_DY xvec8, 5*SIZE(C1, ldc, 1); +STL_DY xvec4, 6*SIZE(C0, ldc, 1); +STH_DY xvec4, 7*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 23eda3af8..59458effe 100644 --- 
a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -251,6 +251,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc From 037d995c4d8c2c5281d9141ce2905f44cc908ac2 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Tue, 19 Jun 2012 22:05:32 +0200 Subject: [PATCH 058/162] Fixed noisy warning with Clang ../common_thread.h:138:24: warning: equality comparison with extraneous parentheses [-Wparentheses-equality] if ((blas_cpu_number == 1) ~~~~~~~~~~~~~~~~^~~~ ../common_thread.h:138:24: note: remove extraneous parentheses around the comparison to silence this warning if ((blas_cpu_number == 1) ~ ^ ~ ../common_thread.h:138:24: note: use '=' to turn this equality comparison into an assignment if ((blas_cpu_number == 1) ^~ = --- common_thread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_thread.h b/common_thread.h index dc963a635..97e060976 100644 --- a/common_thread.h +++ b/common_thread.h @@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) { int openmp_nthreads=0; #endif - if ((blas_cpu_number == 1) + if (blas_cpu_number == 1 #ifdef USE_OPENMP || omp_in_parallel() From fda5e0da8a0a43234ef1f70e719f4a5dd60fad0d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 21 Jun 2012 08:25:52 +0800 Subject: [PATCH 059/162] Refs #83. Clang 3.1 works fine on Sandy Bridge Mac OSX. Edit the document. --- README | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README b/README index 6372e96bd..b3f1baa79 100644 --- a/README +++ b/README @@ -34,8 +34,10 @@ Please read GotoBLAS_01Readme.txt Additional support CPU: x86_64: Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. + Intel Sandy Bridge MIPS64: - ICT Loongson 3A //Level 3 BLAS subroutines are optimized. + ICT Loongson 3A + ICT Loongson 3B (Experimental) 4.Usages Link with libopenblas.a or -lopenblas for shared library. 
@@ -70,10 +72,10 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas 8.ChangeLog Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. -9.Known Issues -* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit - is 64. On 32 bits, it is 32. -* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS. +9.Troubleshooting +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/Cores should less than or equal to 256. +* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 10. Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). From 157cc5444981c60bd72e924eee0663fb96c6de48 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 08:04:58 +0800 Subject: [PATCH 060/162] Update git ignore file. 
--- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 6cfc5b3c1..118205ca2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,16 +1,23 @@ *.obj *.lib *.dll +*.dylib *.def *.o lapack-3.1.1 lapack-3.1.1.tgz +lapack-3.4.1 +lapack-3.4.1.tgz *.so *.a .svn *~ +lib.grd +nohup.out config.h Makefile.conf +Makefile.conf_last +config_last.h getarch getarch_2nd utest/openblas_utest From fe809c39f9b3696a45531734e85edd9ff5eb93ff Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 08:22:53 +0800 Subject: [PATCH 061/162] Update the docs for 0.2.0 --- Makefile.rule | 2 +- README => README.md | 80 ++++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 35 deletions(-) rename README => README.md (64%) diff --git a/Makefile.rule b/Makefile.rule index 56cd63540..299273773 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.1.1 +VERSION = 0.2.0 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/README b/README.md similarity index 64% rename from README rename to README.md index b3f1baa79..000bc4158 100644 --- a/README +++ b/README.md @@ -1,34 +1,41 @@ -OpenBLAS Readme +# OpenBLAS -1.Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn) -Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki). +## Introduction +OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS . -2.Intallation +Please read the documents on OpenBLAS wiki pages. + +## Intallation Download from project homepage. 
http://xianyi.github.com/OpenBLAS/ Or, check out codes from git://github.com/xianyi/OpenBLAS.git -1)Normal compile - (a) type "make" to detect the CPU automatically. +### Normal compile + * type "make" to detect the CPU automatically. or - (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. + * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt. -2)Cross compile +### Cross compile Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly. -examples: +Examples: + On X86 box, compile this library for loongson3a CPU. -make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A -3)Debug version -make DEBUG=1 + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + +### Debug version + + make DEBUG=1 + +### Intall to the directory (Optional) + +Example: + + make install PREFIX=your_installation_directory -4)Intall to the directory (Optional) -e.g. -make install PREFIX=your_installation_directory The default directory is /opt/OpenBLAS -3.Support CPU & OS +## Support CPU & OS Please read GotoBLAS_01Readme.txt Additional support CPU: @@ -39,45 +46,50 @@ MIPS64: ICT Loongson 3A ICT Loongson 3B (Experimental) -4.Usages +## Usages Link with libopenblas.a or -lopenblas for shared library. -4.1 Set the number of threads with environment variables. for example, -export OPENBLAS_NUM_THREADS=4 +### Set the number of threads with environment variables. + +Examples: + export OPENBLAS_NUM_THREADS=4 + or -export GOTO_NUM_THREADS=4 + + export GOTO_NUM_THREADS=4 + or -export OMP_NUM_THREADS=4 + + export OMP_NUM_THREADS=4 The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. 
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. -4.2 Set the number of threads with calling functions. for example, -void goto_set_num_threads(int num_threads); -or -void openblas_set_num_threads(int num_threads); +### Set the number of threads with calling functions. + +Examples: + void goto_set_num_threads(int num_threads); + void openblas_set_num_threads(int num_threads); If you compile this lib with USE_OPENMP=1, you should use the above functions, too. -5.Report Bugs +## Report Bugs Please add a issue in https://github.com/xianyi/OpenBLAS/issues -6.To-Do List: -Optimization on ICT Loongson 3A CPU - -7.Contact +## Contact OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas -8.ChangeLog +## ChangeLog Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. -9.Troubleshooting +## Troubleshooting +* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * The number of CPUs/Cores should less than or equal to 256. * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. -10. Specification of Git Branches +## Specification of Git Branches We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/). Now, there are 4 branches in github.com. * The master branch. This a main branch to reflect a production-ready state. From a6214c057e6b06783e08c3b450a24c3f86a63c31 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 08:34:40 +0800 Subject: [PATCH 062/162] Modified readme. 
--- README.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 000bc4158..80116c658 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,12 @@ ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS . -Please read the documents on OpenBLAS wiki pages. +Please read the documents on OpenBLAS wiki pages . ## Intallation Download from project homepage. http://xianyi.github.com/OpenBLAS/ -Or, -check out codes from git://github.com/xianyi/OpenBLAS.git + +Or, check out codes from git://github.com/xianyi/OpenBLAS.git ### Normal compile * type "make" to detect the CPU automatically. or @@ -38,13 +38,15 @@ The default directory is /opt/OpenBLAS ## Support CPU & OS Please read GotoBLAS_01Readme.txt -Additional support CPU: -x86_64: - Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes. - Intel Sandy Bridge -MIPS64: - ICT Loongson 3A - ICT Loongson 3B (Experimental) +### Additional support CPU: + +#### x86/x86-64: +* Intel Xeon 56xx (Westmere). Used GotoBLAS2 Nehalem codes. +* Intel Sandy Bridge. Optimized Level-3 BLAS with AVX on x86-64. +* AMD Bobcat. Used GotoBLAS2 Barcelona codes. +#### MIPS64: +* ICT Loongson 3A. Optimized Level-3 BLAS and the part of Level-1,2. +* ICT Loongson 3B (Experimental) ## Usages Link with libopenblas.a or -lopenblas for shared library. @@ -52,6 +54,7 @@ Link with libopenblas.a or -lopenblas for shared library. ### Set the number of threads with environment variables. Examples: + export OPENBLAS_NUM_THREADS=4 or @@ -69,7 +72,9 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro ### Set the number of threads with calling functions. 
Examples: + void goto_set_num_threads(int num_threads); + void openblas_set_num_threads(int num_threads); If you compile this lib with USE_OPENMP=1, you should use the above functions, too. From 544af1efec5602e7413c1211dd0deb92d97b5b26 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 09:35:19 +0800 Subject: [PATCH 063/162] Correct the error in readme --- README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 80116c658..a13e069ec 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,19 @@ Please read GotoBLAS_01Readme.txt ### Additional support CPU: #### x86/x86-64: -* Intel Xeon 56xx (Westmere). Used GotoBLAS2 Nehalem codes. -* Intel Sandy Bridge. Optimized Level-3 BLAS with AVX on x86-64. -* AMD Bobcat. Used GotoBLAS2 Barcelona codes. +- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. +- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. +- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. + #### MIPS64: -* ICT Loongson 3A. Optimized Level-3 BLAS and the part of Level-1,2. -* ICT Loongson 3B (Experimental) +- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. +- **ICT Loongson 3B**: Experimental + +### Support OS: +- **GNU/Linux** +- **MingWin/Windows**: Please read . +- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. +- **FreeBSD**: Supportted by community. We didn't test the library on this OS. ## Usages Link with libopenblas.a or -lopenblas for shared library. From 422359d09ac28b27bb652b303318485fb2c02cca Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Jun 2012 11:32:43 +0800 Subject: [PATCH 064/162] Export openblas_set_num_threads in shared library. 
--- cblas.h | 4 ++++ common_interface.h | 2 ++ driver/others/Makefile | 4 ++-- driver/others/openblas_set_num_threads.c | 13 ++++++++++--- exports/gensymbol | 18 +++++++++++++----- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cblas.h b/cblas.h index f3708a994..ee8bf08b2 100644 --- a/cblas.h +++ b/cblas.h @@ -9,6 +9,10 @@ extern "C" { #include #include "common.h" +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + #define CBLAS_INDEX size_t enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; diff --git a/common_interface.h b/common_interface.h index 898d91001..dbe0bb851 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,6 +45,8 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); +void BLASFUNC(openblas_set_num_threads)(int *); + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/driver/others/Makefile b/driver/others/Makefile index 75b552b65..2fdbb4a42 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,12 +1,12 @@ TOPDIR = ../.. 
include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 7ca3b7114..27de83ffc 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef SMP_SERVER -#ifdef OS_LINUX extern void openblas_set_num_threads(int num_threads) ; @@ -41,5 +40,13 @@ void NAME(int* num_threads){ openblas_set_num_threads(*num_threads); } -#endif +#else +//Single thread + +void openblas_set_num_threads(int num_threads) { +} + +void NAME(int* num_threads){ + +} #endif diff --git a/exports/gensymbol b/exports/gensymbol index dbd559473..61e7c8367 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -72,6 +72,14 @@ zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, ); +@misc_no_underscore_objs = ( + openblas_set_num_threads, goto_set_num_threads, + ); + +@misc_underscore_objs = ( + openblas_set_num_threads, + ); + @lapackobjs = ( # These routines are provided by OpenBLAS. 
sgesv, dgesv, cgesv, zgesv, @@ -2660,11 +2668,11 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 - @underscore_objs = (@blasobjs); + @underscore_objs = (@blasobjs, @misc_underscore_objs); } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1") { - @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2); + @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); } else { - @underscore_objs = (@blasobjs, @lapackobjs); + @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; @@ -2678,10 +2686,10 @@ if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[4] == 0) { - @no_underscore_objs = (@cblasobjs); + @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); }else{ #NO_CBLAS=1 - @no_underscore_objs = (); + @no_underscore_objs = (@misc_no_underscore_objs); } if ($ARGV[6] == 1) { #NO_LAPACKE=1 From 853d16ed7ec9169cf03ec024f5894e9a597c7da1 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 23 Jun 2012 13:07:38 +0800 Subject: [PATCH 065/162] Added openblas_set_num_threads dummy function on Windows. We plan to implement this feature in next version. 
--- driver/others/blas_server_win32.c | 8 ++++++++ exports/gensymbol | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 6708509e1..c71e7c276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,6 +63,14 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; +void goto_set_num_threads(int num) +{ +} + +void openblas_set_num_threads(int num) +{ +} + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ diff --git a/exports/gensymbol b/exports/gensymbol index 61e7c8367..e09a8b6ab 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2759,6 +2759,10 @@ if ($ARGV[0] eq "aix"){ if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; + + #remove openblas_set_num_threads + @underscore_objs = grep /[^openblas_set_num_threads]/,@underscore_objs; + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; @@ -2769,7 +2773,11 @@ if ($ARGV[0] eq "win2k"){ print "\t$uppercase=$objs", "_ \@", $count, "\n"; $count ++; } - + + #for openblas_set_num_threads + print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; + $count ++; + # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; From b39c51195b0ec09d17a0bcf345fcd7873f352acc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 25 Jun 2012 14:29:17 +0800 Subject: [PATCH 066/162] Fixed the build bug about Sandy Bridge on 32-bit. We used Nehalem/Penryn codes on Sandy Bridge 32-bit. --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 27aeeb6ac..5465c1cbd 100644 --- a/param.h +++ b/param.h @@ -928,14 +928,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From 0a958b6a02d22102822c580a4213db4486cda90c Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 25 Jun 2012 17:28:49 +0800 Subject: [PATCH 067/162] Refs #118. Detect AMD Bulldozer as Barcelona. --- cpuid.h | 2 ++ cpuid_x86.c | 6 ++++++ getarch.c | 7 ++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cpuid.h b/cpuid.h index fdcfcea00..bb57ad92d 100644 --- a/cpuid.h +++ b/cpuid.h @@ -105,6 +105,7 @@ #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 #define CORE_BOBCAT 21 +#define CORE_BULLDOZER 22 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -193,4 +194,5 @@ typedef struct { #define CPUTYPE_NANO 43 #define CPUTYPE_SANDYBRIDGE 44 #define CPUTYPE_BOBCAT 45 +#define CPUTYPE_BULLDOZER 46 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 204f41d51..ea1162e8f 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1027,6 +1027,7 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 10: + case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series return CPUTYPE_BARCELONA; case 5: return CPUTYPE_BOBCAT; @@ -1151,6 +1152,7 @@ static char *cpuname[] = { "NANO", "SANDYBRIDGE", "BOBCAT", + "BULLDOZER", }; static char *lowercpuname[] = { @@ -1199,6 +1201,7 @@ static char *lowercpuname[] = { "nano", "sandybridge", "bobcat", + "bulldozer", }; static char *corename[] = { @@ -1224,6 +1227,7 @@ static char *corename[] = { "NANO", "SANDYBRIDGE", "BOBCAT", + "BULLDOZER", }; static char *corename_lower[] = { @@ -1249,6 +1253,7 @@ static char *corename_lower[] = { "nano", "sandybridge", "bobcat", + 
"bulldozer", }; @@ -1359,6 +1364,7 @@ int get_coretype(void){ if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else if (exfamily == 5) return CORE_BOBCAT; + else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series else return CORE_BARCELONA; } } diff --git a/getarch.c b/getarch.c index 7e08e774e..5916a9a04 100644 --- a/getarch.c +++ b/getarch.c @@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ +/* #define FORCE_BULLDOZER */ /* #define FORCE_BOBCAT */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ @@ -349,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "OPTERON" #endif -#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) +#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -357,8 +358,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DBARCELONA " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ - "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ - "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" #define LIBNAME "barcelona" #define CORENAME "BARCELONA" From 857a0fa0df83cd3ff79a4047ad31a9f0f9e9f5da Mon Sep 17 00:00:00 2001 From: wangqian Date: Mon, 25 Jun 2012 19:00:37 +0800 Subject: [PATCH 068/162] Fixed the issue of mixing AVX and SSE codes in S/D/C/ZGEMM. 
--- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 1827 ++++++++++++------------ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 1041 +++++++------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1605 +++++++++++---------- kernel/x86_64/zgemm_kernel_4x4_sandy.S | 358 +++-- 4 files changed, 2379 insertions(+), 2452 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 5987b8e61..5a5588089 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -150,79 +150,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MOVQ movq #define XOR_SY vxorps -#define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_SX vxorps #define LD_SY vmovaps -#define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_SX movlps +#define LD_SX vmovaps +#define LDL_SX vmovlps #define LDL_SY vmovlps -#define LDH_SX movhps +#define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps -#define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_SX movlps +#define ST_SX vmovaps +#define STL_SX vmovlps #define STL_SY vmovlps -#define STH_SX movhps +#define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup -#define EDUP_DY vmovddup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup #define ADD_SY vaddps -#define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd -#define SUB_DY vsubpd +#define ADD_SX vaddps #define SUB_SY vsubps -#define SUB_DX subpd -#define SUB_SX subps +#define SUB_SX vsubps -#define ADDSUB_DY vaddsubpd -#define ADDSUB_DX addsubpd #define ADDSUB_SY vaddsubps -#define ADDSUB_SX addsubps +#define ADDSUB_SX vaddsubps #define MUL_SY vmulps -#define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_SX vmulps #define SHUF_SY vperm2f128 -#define SHUF_DY vperm2f128 -#define SHUF_DX pshufd 
-#define SHUF_SX pshufd +#define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps -#define VPERMILP_DY vpermilpd #define BROAD_SY vbroadcastss -#define BROAD_DY vbroadcastsd #define BROAD_SX vbroadcastss -#define BROAD_DX movddup #define MOV_SY vmovaps -#define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_SX vmovaps #define REVS_SY vshufps -#define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_SX vshufps #define EXTRA_SY vextractf128 -#define EXTRA_DY vextractf128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1_SY ADD_SY @@ -289,6 +264,8 @@ movq old_offset, %r11; #endif #endif +vzeroupper + vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm @@ -1417,64 +1394,64 @@ REVS_SY $0xe4,yvec7,yvec9,yvec9; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec6; -LDH_SX 2*SIZE(C0), xvec6; -ADD_SX xvec6, xvec15; +LDL_SX 0*SIZE(C0), xvec6, xvec6; +LDH_SX 2*SIZE(C0), xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C1), xvec4; -LDH_SX 6*SIZE(C1), xvec4; -ADD_SX xvec4, xvec7; +LDL_SX 4*SIZE(C1), xvec4, xvec4; +LDH_SX 6*SIZE(C1), xvec4, xvec4; +ADD_SX xvec4, xvec7, xvec7; #endif STL_SX xvec7, 4*SIZE(C1); STH_SX xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0, ldc, 1), xvec4; -LDH_SX 2*SIZE(C0, ldc, 1), xvec4; -ADD_SX xvec4, xvec13; +LDL_SX 0*SIZE(C0, ldc, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, ldc, 1), xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; #endif STL_SX xvec13, 0*SIZE(C0, ldc, 1); STH_SX xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C1, ldc, 1), xvec2; -LDH_SX 6*SIZE(C1, ldc, 1), xvec2; -ADD_SX xvec2, xvec5; +LDL_SX 4*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 6*SIZE(C1, ldc, 1), xvec2, xvec2; +ADD_SX xvec2, xvec5, xvec5; #endif STL_SX xvec5, 4*SIZE(C1, ldc, 1); STH_SX 
xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C1), xvec2; -ADD_SX xvec2, xvec11; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1), xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C0), xvec0; -LDH_SX 6*SIZE(C0), xvec0; -ADD_SX xvec0, xvec3; +LDL_SX 4*SIZE(C0), xvec0, xvec0; +LDH_SX 6*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec3, xvec3; #endif STL_SX xvec3, 4*SIZE(C0); STH_SX xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1, ldc, 1), xvec0; -LDH_SX 2*SIZE(C1, ldc, 1), xvec0; -ADD_SX xvec0, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, ldc, 1), xvec0, xvec0; +ADD_SX xvec0, xvec9, xvec9; #endif STL_SX xvec9, 0*SIZE(C1, ldc, 1); STH_SX xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C0, ldc, 1), xvec6; -LDH_SX 6*SIZE(C0, ldc, 1), xvec6; -ADD_SX xvec6, xvec1; +LDL_SX 4*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 6*SIZE(C0, ldc, 1), xvec6, xvec6; +ADD_SX xvec6, xvec1, xvec1; #endif STL_SX xvec1, 4*SIZE(C0, ldc, 1); STH_SX xvec1, 6*SIZE(C0, ldc, 1); @@ -1533,122 +1510,122 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX 
xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, 
xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 24*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX 
xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; @@ -1666,62 +1643,62 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX 
xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1737,32 +1714,32 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -1770,29 +1747,29 @@ ADDQ $8*SIZE, ptrbb; #### Handle #### #if 
defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec13, xvec7; +SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec12, xvec7; +SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -1800,16 +1777,16 @@ SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -1821,35 +1798,35 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX 
$0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; -MUL_SX xvec7, xvec13; -MUL_SX xvec6, xvec3; -ADDSUB_SX xvec3, xvec13; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; -MUL_SX xvec7, xvec12; -MUL_SX xvec6, xvec2; -ADDSUB_SX xvec2, xvec12; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0, ldc, 1), xvec0; -LDL_SX 0*SIZE(C0, ldc, 1), xvec1; -LDH_SX 2*SIZE(C0), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 0*SIZE(C1, ldc, 1), xvec3; -LDH_SX 2*SIZE(C1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0, ldc,1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc,1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0, ldc, 1); @@ -1911,70 +1888,70 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, 
xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 5*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 7*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; @@ -1992,36 +1969,36 @@ ALIGN_5 
BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2037,19 +2014,19 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2057,26 +2034,26 @@ ADDQ $8*SIZE, ptrbb; #### Handle #### #if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -2086,21 +2063,21 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 0*SIZE(C0, ldc, 1), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -LDH_SX 0*SIZE(C1, ldc, 1), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 1); @@ -2191,59 +2168,59 @@ SHUF_SX $0x4e, xvec5, xvec7; 
LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, 
xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; @@ -2252,59 +2229,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; 
-ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; @@ -2313,59 +2290,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 32*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 36*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 40*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, 
xvec9; LD_SX 44*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; @@ -2374,59 +2351,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 48*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 52*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 56*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX 
xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 60*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -2448,59 +2425,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, 
xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; @@ -2509,59 +2486,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, 
xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2581,59 +2558,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, 
xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb @@ -2641,53 +2618,53 @@ ADDQ $4*SIZE, ptrbb #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; 
-ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec9, xvec7; +ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec8, xvec7; +ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec13, xvec7; +SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec12, xvec7; +SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec10, xvec7; +SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec9, xvec7; +SUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec8, xvec7; +SUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -2699,28 +2676,28 @@ SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, 
xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec9, xvec7; +ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec8, xvec7; +ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -2736,50 +2713,50 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; -MUL_SX xvec7, xvec13; -MUL_SX xvec6, xvec3; -ADDSUB_SX xvec3, xvec13; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; -MUL_SX xvec7, xvec12; -MUL_SX xvec6, xvec2; -ADDSUB_SX xvec2, xvec12; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; -MUL_SX xvec7, xvec10; -MUL_SX xvec6, xvec0; -ADDSUB_SX xvec0, xvec10; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; VPERMILP_SX $0xb1,xvec9, xvec5; -MUL_SX xvec7, xvec9; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec9; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec9, xvec9; VPERMILP_SX $0xb1,xvec8, xvec4; -MUL_SX xvec7, xvec8; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, 
xvec8; +MUL_SX xvec7, xvec8, xvec8; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec8, xvec8; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 8*SIZE(C0), xvec2; -LDH_SX 10*SIZE(C1), xvec2; -LDL_SX 12*SIZE(C0), xvec3; -LDH_SX 14*SIZE(C1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C1), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2790,18 +2767,18 @@ STH_SX xvec13, 10*SIZE(C1); STL_SX xvec12, 12*SIZE(C0); STH_SX xvec12, 14*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -LDL_SX 8*SIZE(C1), xvec6; -LDH_SX 10*SIZE(C0), xvec6; -LDL_SX 12*SIZE(C1), xvec7; -LDH_SX 14*SIZE(C0), xvec7; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; -ADD_SX xvec6, xvec9; -ADD_SX xvec7, xvec8; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 8*SIZE(C1), xvec6, xvec6; +LDH_SX 10*SIZE(C0), xvec6, xvec6; +LDL_SX 12*SIZE(C1), xvec7, xvec7; +LDH_SX 14*SIZE(C0), xvec7, xvec7; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -2872,31 +2849,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX 
$0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -2906,31 +2883,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX 
xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; @@ -2940,31 +2917,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; @@ -2974,31 +2951,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; 
SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3020,31 +2997,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3054,31 +3031,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX 
xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3099,31 +3076,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3131,29 +3108,29 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || 
defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec10, xvec7; +SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -3161,16 +3138,16 @@ SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -3182,40 +3159,40 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX 
xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; -MUL_SX xvec7, xvec10; -MUL_SX xvec6, xvec0; -ADDSUB_SX xvec0, xvec10; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -3277,17 +3254,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; 
+ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3297,17 +3274,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; @@ -3317,17 +3294,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; @@ -3337,17 +3314,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3369,17 +3346,17 @@ 
SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3389,17 +3366,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3419,17 +3396,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3437,26 +3414,26 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, 
yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; @@ -3466,24 +3443,24 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -ADD_SX xvec4, xvec11; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -3538,42 +3515,42 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, 
xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 5*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 7*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3591,22 +3568,22 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, 
xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3622,12 +3599,12 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3635,14 +3612,14 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(NR) || defined(NC) || defined(TR) || defined(TC) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif @@ -3651,14 +3628,14 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 0*SIZE(C1), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C1); @@ -3908,18 +3885,18 @@ ADDSUB_SY yvec4, yvec14, yvec14; EXTRA_SY $1, yvec15, xvec7; EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C0), xvec1; -LDL_SX 8*SIZE(C0), xvec2; -LDH_SX 10*SIZE(C0), xvec2; -LDL_SX 12*SIZE(C0), xvec3; -LDH_SX 14*SIZE(C0), xvec3; -ADD_SX xvec0, xvec15; 
-ADD_SX xvec1, xvec7; -ADD_SX xvec2, xvec14; -ADD_SX xvec3, xvec6; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C0), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; +ADD_SX xvec2, xvec14, xvec14; +ADD_SX xvec3, xvec6, xvec6; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4103,12 +4080,12 @@ ADDSUB_SY yvec5, yvec15, yvec15; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C0), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec7; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4163,42 +4140,42 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 3 #### LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, 
xvec0, xvec1; BROAD_SX 5*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 4 #### LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 6*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 7*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -4216,22 +4193,22 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -4247,12 +4224,12 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -4260,14 +4237,14 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif 
defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif @@ -4276,13 +4253,13 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4335,22 +4312,22 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -4368,12 +4345,12 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, 
xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -4388,15 +4365,15 @@ ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; -LDL_SX 0*SIZE(ptrba), xvec0; -LDL_SX 0*SIZE(ptrbb), xvec2; +LDL_SX 0*SIZE(ptrba), xvec0, xvec0; +LDL_SX 0*SIZE(ptrbb), xvec2, xvec2; SHUF_SX $0xe0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xe1, xvec0, xvec1; SHUF_SX $0xe5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -4404,29 +4381,29 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; VPERMILP_SX $0xb1, xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; SHUF_SX $0x44, xvec15, xvec14; SHUF_SX $0xee, xvec15, xvec13; -ADD_SX xvec13, xvec14; +ADD_SX xvec13, xvec14, xvec14; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -ADD_SX xvec0, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec14, xvec14; #endif STL_SX xvec14, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -4458,6 +4435,8 @@ movq 24(%rsp), 
%r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index 603552464..3b1b2560e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -143,71 +143,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef MOVQ #define MOVQ movq -#define XOR_SY vxorps #define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_DX vxorpd -#define LD_SY vmovaps #define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_DX movlpd +#define LD_DX vmovapd +#define LDL_DX vmovlpd #define LDL_DY vmovlpd -#define LDH_DX movhpd +#define LDH_DX vmovhpd #define LDH_DY vmovhpd -#define ST_SY vmovaps #define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_DX movlpd +#define ST_DX vmovapd +#define STL_DX vmovlpd #define STL_DY vmovlpd -#define STH_DX movhpd +#define STH_DX vmovhpd #define STH_DY vmovhpd -#define EDUP_SY vmovsldup -#define ODUP_SY vmovshdup #define EDUP_DY vmovddup -#define ADD_SY vaddps #define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_DX vaddpd #define ADD1_DY vaddpd #define ADD2_DY vaddpd #define ADDSUB_DY vaddsubpd -#define ADDSUB_SY vaddsubps -#define MUL_SY vmulps #define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_DX vmulpd -#define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd +#define SHUF_DX vpshufd -#define VPERMILP_SY vpermilps #define VPERMILP_DY vpermilpd -#define BROAD_SY vbroadcastss #define BROAD_DY vbroadcastsd -#define BROAD_SX -#define BROAD_DX movddup +#define BROAD_DX vmovddup -#define MOV_SY vmovaps #define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_DX vmovapd -#define REVS_SY vshufps #define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd 
+#define REVS_DX vmovsd -#define EXTRA_SY vextractf128 #define EXTRA_DY vextractf128 PROLOGUE @@ -253,6 +231,8 @@ movq old_offset, %r11 #endif #endif +vzeroupper + vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn @@ -988,14 +968,14 @@ EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0),xvec15; -ADD_DX 2*SIZE(C1),xvec7; -ADD_DX 0*SIZE(C0,ldc,1),xvec13; -ADD_DX 2*SIZE(C1,ldc,1),xvec5; -ADD_DX 0*SIZE(C1),xvec11; -ADD_DX 2*SIZE(C0),xvec3; -ADD_DX 0*SIZE(C1,ldc,1),xvec9; -ADD_DX 2*SIZE(C0,ldc,1),xvec1; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec3, xvec3; +ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9; +ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C1); @@ -1025,18 +1005,18 @@ EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -LDL_DX 0*SIZE(C0, ldc, 1), xvec12; -LDH_DX 1*SIZE(C0, ldc, 1), xvec12; -LDL_DX 0*SIZE(C1), xvec10; -LDH_DX 1*SIZE(C1), xvec10; -LDL_DX 0*SIZE(C1, ldc, 1), xvec8; -LDH_DX 1*SIZE(C1, ldc, 1), xvec8; -ADD_DX xvec14, xvec15; -ADD_DX xvec12, xvec13; -ADD_DX xvec10, xvec11; -ADD_DX xvec8, xvec9; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec15, xvec15; +ADD_DX xvec12, xvec13, xvec13; +ADD_DX xvec10, xvec11, xvec11; +ADD_DX xvec8, xvec9, xvec9; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -1047,18 +1027,18 @@ STH_DX xvec11, 1*SIZE(C1); STL_DX xvec9, 0*SIZE(C1, ldc, 1); 
STH_DX xvec9, 1*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL -LDL_DX 2*SIZE(C0), xvec0; -LDH_DX 3*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0, ldc, 1), xvec2; -LDH_DX 3*SIZE(C0, ldc, 1), xvec2; -LDL_DX 2*SIZE(C1), xvec4; -LDH_DX 3*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1, ldc, 1), xvec6; -LDH_DX 3*SIZE(C1, ldc, 1), xvec6; -ADD_DX xvec0, xvec3; -ADD_DX xvec2, xvec1; -ADD_DX xvec4, xvec7; -ADD_DX xvec6, xvec5; +LDL_DX 2*SIZE(C0), xvec0, xvec0; +LDH_DX 3*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec4, xvec4; +LDH_DX 3*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX xvec0, xvec3, xvec3; +ADD_DX xvec2, xvec1, xvec1; +ADD_DX xvec4, xvec7, xvec7; +ADD_DX xvec6, xvec5, xvec5; #endif STL_DX xvec3, 2*SIZE(C0); STH_DX xvec3, 3*SIZE(C0); @@ -1128,72 +1108,72 @@ ALIGN_5; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 6*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; LD_DX 2*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 2 #### LD_DX 8*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 10*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 4*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX 
xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; ##### Unroll time 3 #### LD_DX 12*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 14*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $8*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 4 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; DECQ k; JG .L10_bodyB; ALIGN_5 @@ -1210,39 +1190,39 @@ ALIGN_5 ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 6*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $4*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 2 #### 
LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; .L11_loopE:; #ifndef TRMMKERNEL @@ -1255,35 +1235,35 @@ JLE .L12_loopE; ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $2*SIZE, ptrba; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; .L12_loopE:; #### Load Alpha #### BROAD_DX MEMALPHA, xvec7; #### Multiply Alpha #### -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec13; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec9; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec9, xvec9; #### Reverse the Results #### MOV_DX xvec15, xvec6; -REVS_DX xvec13, xvec15; -REVS_DX xvec6, xvec13; +REVS_DX xvec13, xvec15, xvec15; +REVS_DX xvec6, xvec13, xvec13; MOV_DX xvec11, xvec6; -REVS_DX xvec9, xvec11; -REVS_DX xvec6, xvec9; +REVS_DX xvec9, xvec11, xvec11; +REVS_DX xvec6, xvec9, xvec9; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1292,10 +1272,10 @@ JNE .L12_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec13; -ADD_DX 0*SIZE(C0, ldc, 1), xvec15; -ADD_DX 0*SIZE(C1), xvec9; -ADD_DX 0*SIZE(C1, ldc, 
1), xvec11; +ADD_DX 0*SIZE(C0), xvec13, xvec13; +ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; +ADD_DX 0*SIZE(C1), xvec9, xvec9; +ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11; #endif ST_DX xvec13, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C0, ldc, 1); @@ -1317,18 +1297,18 @@ JMP .L9_loopE; ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -LDL_DX 0*SIZE(C0, ldc, 1), xvec12; -LDH_DX 1*SIZE(C0, ldc, 1), xvec12; -LDL_DX 0*SIZE(C1), xvec10; -LDH_DX 1*SIZE(C1), xvec10; -LDL_DX 0*SIZE(C1, ldc, 1), xvec8; -LDH_DX 1*SIZE(C1, ldc, 1), xvec8; -ADD_DX xvec14, xvec13; -ADD_DX xvec12, xvec15; -ADD_DX xvec10, xvec9; -ADD_DX xvec8, xvec11; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec13, xvec13; +ADD_DX xvec12, xvec15, xvec15; +ADD_DX xvec10, xvec9, xvec9; +ADD_DX xvec8, xvec11, xvec11; #endif STL_DX xvec13, 0*SIZE(C0); STH_DX xvec13, 1*SIZE(C0); @@ -1455,12 +1435,12 @@ MUL_DY yvec15, yvec7, yvec15; #### Writing Back #### EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 0*SIZE(C0, ldc, 1), xvec0; -LDL_DX 0*SIZE(C1), xvec1; -LDH_DX 0*SIZE(C1, ldc, 1), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C0, ldc, 1); @@ -1549,151 +1529,151 @@ ALIGN_5; LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, 
xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, 
xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 3 #### LD_DX 16*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 18*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 20*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 22*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 4 #### LD_DX 24*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 26*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 28*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 30*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $32*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX 
xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; DECQ k; JG .L211_bodyB; ALIGN_5 @@ -1712,77 +1692,77 @@ ALIGN_5; LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, 
xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; .L212_loopE: #ifndef TRMMKERNEL @@ -1798,65 +1778,65 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, 
xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec14; -MUL_DX xvec7, xvec13; -MUL_DX xvec7, xvec12; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec10; -MUL_DX xvec7, xvec9; -MUL_DX xvec7, xvec8; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec12, xvec12; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; +MUL_DX xvec7, xvec9, xvec9; +MUL_DX xvec7, xvec8, xvec8; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; -REVS_DX xvec10, xvec14; -REVS_DX xvec6, xvec10; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; MOV_DX xvec13, xvec6; -REVS_DX xvec9, xvec13; -REVS_DX xvec6, xvec9; +REVS_DX xvec9, xvec13, xvec13; +REVS_DX xvec6, xvec9, xvec9; MOV_DX xvec12, xvec6; -REVS_DX xvec8, xvec12; -REVS_DX xvec6, xvec8; +REVS_DX xvec8, xvec12, xvec12; +REVS_DX xvec6, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1865,14 +1845,14 @@ JNE .L213_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 2*SIZE(C0), xvec10; -ADD_DX 4*SIZE(C0), xvec9; -ADD_DX 6*SIZE(C0), xvec8; -ADD_DX 0*SIZE(C1), xvec15; -ADD_DX 2*SIZE(C1), xvec14; -ADD_DX 4*SIZE(C1), xvec13; -ADD_DX 6*SIZE(C1), xvec12; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 4*SIZE(C0), xvec9, xvec9; +ADD_DX 6*SIZE(C0), xvec8, xvec8; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; +ADD_DX 4*SIZE(C1), xvec13, xvec13; +ADD_DX 6*SIZE(C1), xvec12, xvec12; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); @@ -1900,18 +1880,18 @@ JMP .L21_loopE; ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 
3*SIZE(C0), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C0), xvec3; -LDH_DX 7*SIZE(C0), xvec3; -ADD_DX xvec0, xvec11; -ADD_DX xvec1, xvec10; -ADD_DX xvec2, xvec9; -ADD_DX xvec3, xvec8; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; +ADD_DX xvec2, xvec9, xvec9; +ADD_DX xvec3, xvec8, xvec8; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); @@ -1922,18 +1902,18 @@ STH_DX xvec9, 5*SIZE(C0); STL_DX xvec8, 6*SIZE(C0); STH_DX xvec8, 7*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1), xvec5; -LDH_DX 3*SIZE(C1), xvec5; -LDL_DX 4*SIZE(C1), xvec6; -LDH_DX 5*SIZE(C1), xvec6; -LDL_DX 6*SIZE(C1), xvec7; -LDH_DX 7*SIZE(C1), xvec7; -ADD_DX xvec4, xvec15; -ADD_DX xvec5, xvec14; -ADD_DX xvec6, xvec13; -ADD_DX xvec7, xvec12; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +LDL_DX 4*SIZE(C1), xvec6, xvec6; +LDH_DX 5*SIZE(C1), xvec6, xvec6; +LDL_DX 6*SIZE(C1), xvec7, xvec7; +LDH_DX 7*SIZE(C1), xvec7, xvec7; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; +ADD_DX xvec6, xvec13, xvec13; +ADD_DX xvec7, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2000,79 +1980,79 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; 
+MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 3 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 4 #### LD_DX 12*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 14*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, 
xvec10; DECQ k; JG .L221_bodyB; ALIGN_5 @@ -2090,40 +2070,40 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; .L222_loopE: #ifndef TRMMKERNEL @@ -2139,37 +2119,37 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; .L223_loopE: #### Multiply 
Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec14; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec10; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; -REVS_DX xvec10, xvec14; -REVS_DX xvec6, xvec10; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -2178,10 +2158,10 @@ JNE .L223_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 2*SIZE(C0), xvec10; -ADD_DX 0*SIZE(C1), xvec15; -ADD_DX 2*SIZE(C1), xvec14; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); @@ -2203,24 +2183,24 @@ JMP .L22_loopE; ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -ADD_DX xvec0, xvec11; -ADD_DX xvec1, xvec10; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); STL_DX xvec10, 2*SIZE(C0); STH_DX xvec10, 3*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1), xvec5; -LDH_DX 3*SIZE(C1), xvec5; -ADD_DX xvec4, xvec15; -ADD_DX xvec5, xvec14; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ 
-2278,38 +2258,38 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 3 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 4 #### LD_DX 6*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; @@ -2328,20 +2308,20 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, 
xvec11, xvec11; ADDQ $4*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL @@ -2357,21 +2337,21 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $2*SIZE, ptrbb; .L233_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec11; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec11, xvec11; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -2380,8 +2360,8 @@ JNE .L233_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 0*SIZE(C1), xvec15; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 0*SIZE(C1), xvec15, xvec15; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C1); @@ -2401,16 +2381,16 @@ JMP .L23_loopE; ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -ADD_DX xvec0, xvec11; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec11, xvec11; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -ADD_DX xvec4, xvec15; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2462,23 +2442,23 @@ ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX 
xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; BROAD_DX 2*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 3*SIZE(ptrba), xvec1; LD_DX 6*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -2496,13 +2476,13 @@ ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L242_loopE: @@ -2517,18 +2497,18 @@ ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L243_loopE: BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; +MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 0*SIZE(C1), xvec0; -ADD_DX xvec0, xvec15; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C1), xvec0, xvec0; +ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C1); @@ -2705,10 +2685,10 @@ ALIGN_5 EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0), xvec13; -ADD_DX 4*SIZE(C0), xvec14; -ADD_DX 6*SIZE(C0), xvec12; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec13, xvec13; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C0), xvec12, xvec12; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec13, 2*SIZE(C0); @@ -2733,18 +2713,18 @@ ALIGN_5 EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL -LDL_DX 
0*SIZE(C0), xvec11; -LDH_DX 1*SIZE(C0), xvec11; -LDL_DX 2*SIZE(C0), xvec10; -LDH_DX 3*SIZE(C0), xvec10; -LDL_DX 4*SIZE(C0), xvec9; -LDH_DX 5*SIZE(C0), xvec9; -LDL_DX 6*SIZE(C0), xvec8; -LDH_DX 7*SIZE(C0), xvec8; -ADD_DX xvec11, xvec15; -ADD_DX xvec10, xvec13; -ADD_DX xvec9, xvec14; -ADD_DX xvec8, xvec12; +LDL_DX 0*SIZE(C0), xvec11, xvec11; +LDH_DX 1*SIZE(C0), xvec11, xvec11; +LDL_DX 2*SIZE(C0), xvec10, xvec10; +LDH_DX 3*SIZE(C0), xvec10, xvec10; +LDL_DX 4*SIZE(C0), xvec9, xvec9; +LDH_DX 5*SIZE(C0), xvec9, xvec9; +LDL_DX 6*SIZE(C0), xvec8, xvec8; +LDH_DX 7*SIZE(C0), xvec8, xvec8; +ADD_DX xvec11, xvec15, xvec15; +ADD_DX xvec10, xvec13, xvec13; +ADD_DX xvec9, xvec14, xvec14; +ADD_DX xvec8, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2878,8 +2858,8 @@ ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0), xvec14; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec14, xvec14; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec14, 2*SIZE(C0); @@ -2900,12 +2880,12 @@ ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec13; -LDH_DX 1*SIZE(C0), xvec13; -LDL_DX 2*SIZE(C0), xvec12; -LDH_DX 3*SIZE(C0), xvec12; -ADD_DX xvec13, xvec15; -ADD_DX xvec12, xvec14; +LDL_DX 0*SIZE(C0), xvec13, xvec13; +LDH_DX 1*SIZE(C0), xvec13, xvec13; +LDL_DX 2*SIZE(C0), xvec12, xvec12; +LDH_DX 3*SIZE(C0), xvec12, xvec12; +ADD_DX xvec13, xvec15, xvec15; +ADD_DX xvec12, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2959,23 +2939,23 @@ ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec4; 
BROAD_DX 2*SIZE(ptrbb), xvec5; -MUL_DX xvec4, xvec5; -ADD_DX xvec5, xvec15; +MUL_DX xvec4, xvec5, xvec5; +ADD_DX xvec5, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec6; BROAD_DX 3*SIZE(ptrbb), xvec7; -MUL_DX xvec6, xvec7; -ADD_DX xvec7, xvec15; +MUL_DX xvec6, xvec7, xvec7; +ADD_DX xvec7, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; @@ -2993,13 +2973,13 @@ ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L332_loopE: @@ -3014,18 +2994,18 @@ ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L333_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; +MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -ADD_DX xvec14, xvec15; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +ADD_DX xvec14, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3074,25 +3054,25 @@ SARQ $2, k; JLE .L341_loopE; ALIGN_5 .L341_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; - -movsd 1*SIZE(ptrba), xvec0; -movsd 1*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; - -movsd 2*SIZE(ptrba), xvec0; -movsd 2*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; - -movsd 3*SIZE(ptrba), xvec0; -movsd 3*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, 
xvec15, xvec15; + +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 2*SIZE(ptrba), xvec0; +vmovsd 2*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 3*SIZE(ptrba), xvec0; +vmovsd 3*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3108,15 +3088,15 @@ TEST $2, %rax; JLE .L342_loopE; ALIGN_5 .L342_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; - -movsd 1*SIZE(ptrba), xvec0; -movsd 1*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; + +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3130,20 +3110,20 @@ TEST $1, %rax; JLE .L343_loopE; ALIGN_5 .L343_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L343_loopE: #### Writing Back #### -movsd MEMALPHA, xvec7; -mulsd xvec7, xvec15; +vmovsd MEMALPHA, xvec7; +vmulsd xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -movsd 0*SIZE(C0), xvec0; -addsd xvec0, xvec15; +vmovsd 0*SIZE(C0), xvec0; +vaddsd xvec0, xvec15, xvec15; #endif movsd xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -3170,6 +3150,9 @@ movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S 
b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 59458effe..20ddcaa8e 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -146,75 +146,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MOVQ movq #define XOR_SY vxorps -#define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_SX vxorps #define LD_SY vmovaps -#define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_SX movlps +#define LD_SX vmovaps +#define LDL_SX vmovlps #define LDL_SY vmovlps -#define LDH_SX movhps +#define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps -#define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_SX movlps +#define ST_SX vmovaps +#define STL_SX vmovlps #define STL_SY vmovlps -#define STH_SX movhps +#define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup -#define EDUP_DY vmovddup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup #define ADD_SY vaddps -#define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_SX vaddps #define ADD1_DY vaddpd -#define ADD2_DY vaddpd -#define ADDSUB_DY vaddsubpd #define ADDSUB_SY vaddsubps #define MUL_SY vmulps -#define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_SX vmulps #define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps -#define VPERMILP_DY vpermilpd #define BROAD_SY vbroadcastss -#define BROAD_DY vbroadcastsd #define BROAD_SX vbroadcastss -#define BROAD_DX movddup #define MOV_SY vmovaps -#define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_SX vmovaps #define REVS_SY vshufps -#define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_SX vshufps #define EXTRA_SY 
vextractf128 -#define EXTRA_DY vextractf128 PROLOGUE @@ -260,6 +238,8 @@ movq old_offset, %r11 #endif #endif +vzeroupper + vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn @@ -864,125 +844,125 @@ ALIGN_4 #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; 
+MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 16*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 16*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 3 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 20*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 24*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 24*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; ADDQ $16*SIZE, ptrba; #### Unroll time 4 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, 
xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 28*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $32*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; DECQ k; JG .L8_bodyB; ALIGN_4 @@ -997,65 +977,65 @@ ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### ADDQ $8*SIZE, ptrba; 
SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; .L9_loopE: #ifndef TRMMKERNEL @@ -1068,57 +1048,57 @@ ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrbb; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, 
xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; .L10_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -MUL_SX xvec7, xvec11; -MUL_SX xvec7, xvec10; -MUL_SX xvec7, xvec9; -MUL_SX xvec7, xvec8; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; #### Reverse Result #### MOV_SX xvec15, xvec7; -REVS_SX $0xe4, xvec13, xvec15; -REVS_SX $0xe4, xvec7, xvec13; +REVS_SX $0xe4, xvec13, xvec15, xvec15; +REVS_SX $0xe4, xvec7, xvec13, xvec13; MOV_SX xvec14, xvec7; -REVS_SX $0xe4, xvec12, xvec14; -REVS_SX $0xe4, xvec7, xvec12; +REVS_SX $0xe4, xvec12, xvec14, xvec14; +REVS_SX $0xe4, xvec7, xvec12, xvec12; MOV_SX xvec11, xvec7; -REVS_SX $0xe4, xvec9, xvec11; -REVS_SX $0xe4, xvec7, xvec9; +REVS_SX $0xe4, xvec9, xvec11, xvec11; +REVS_SX $0xe4, xvec7, xvec9, xvec9; MOV_SX xvec10, xvec7; -REVS_SX $0xe4, xvec8, xvec10; -REVS_SX $0xe4, xvec7, xvec8; +REVS_SX $0xe4, xvec8, xvec10, xvec10; +REVS_SX $0xe4, xvec7, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1127,14 +1107,14 @@ JNE .L10_loopEx; ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL -ADD_SX 0*SIZE(C0), xvec15; -ADD_SX 0*SIZE(C0, ldc, 1), xvec14; -ADD_SX 0*SIZE(C0, ldc, 2), xvec13; -ADD_SX 0*SIZE(C0, %rax, 1), xvec12; -ADD_SX 0*SIZE(C1), xvec11; -ADD_SX 0*SIZE(C1, ldc, 1), xvec10; -ADD_SX 0*SIZE(C1, ldc, 2), xvec9; -ADD_SX 0*SIZE(C1, %rax, 1), xvec8; +ADD_SX 0*SIZE(C0), xvec15, xvec15; +ADD_SX 0*SIZE(C0, ldc,1), xvec14, xvec14; +ADD_SX 0*SIZE(C0, ldc,2), xvec13, xvec13; +ADD_SX 0*SIZE(C0, 
%rax,1), xvec12, xvec12; +ADD_SX 0*SIZE(C1), xvec11, xvec11; +ADD_SX 0*SIZE(C1, ldc,1), xvec10, xvec10; +ADD_SX 0*SIZE(C1, ldc,2), xvec9, xvec9; +ADD_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; #endif ST_SX xvec15, 0*SIZE(C0); ST_SX xvec14, 0*SIZE(C0, ldc, 1); @@ -1161,30 +1141,30 @@ ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec7; -LDH_SX 2*SIZE(C0), xvec7; -LDL_SX 0*SIZE(C0, ldc, 1), xvec6; -LDH_SX 2*SIZE(C0, ldc, 1), xvec6; -LDL_SX 0*SIZE(C0, ldc, 2), xvec5; -LDH_SX 2*SIZE(C0, ldc, 2), xvec5; -LDL_SX 0*SIZE(C0, %rax, 1), xvec4; -LDH_SX 2*SIZE(C0, %rax, 1), xvec4; -LDL_SX 0*SIZE(C1), xvec3; -LDH_SX 2*SIZE(C1), xvec3; -LDL_SX 0*SIZE(C1, ldc, 1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 0*SIZE(C1, ldc, 2), xvec1; -LDH_SX 2*SIZE(C1, ldc, 2), xvec1; -LDL_SX 0*SIZE(C1, %rax, 1), xvec0; -LDH_SX 2*SIZE(C1, %rax, 1), xvec0; -ADD_SX xvec7, xvec15; -ADD_SX xvec6, xvec14; -ADD_SX xvec5, xvec13; -ADD_SX xvec4, xvec12; -ADD_SX xvec3, xvec11; -ADD_SX xvec2, xvec10; -ADD_SX xvec1, xvec9; -ADD_SX xvec0, xvec8; +LDL_SX 0*SIZE(C0), xvec7, xvec7; +LDH_SX 2*SIZE(C0), xvec7, xvec7; +LDL_SX 0*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 0*SIZE(C0, ldc, 2), xvec5, xvec5; +LDH_SX 2*SIZE(C0, ldc, 2), xvec5, xvec5; +LDL_SX 0*SIZE(C0, %rax, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, %rax, 1), xvec4, xvec4; +LDL_SX 0*SIZE(C1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +LDL_SX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 2), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 2), xvec1, xvec1; +LDL_SX 0*SIZE(C1, %rax, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, %rax, 1), xvec0, xvec0; +ADD_SX xvec7, xvec15, xvec15; +ADD_SX xvec6, xvec14, xvec14; +ADD_SX xvec5, xvec13, xvec13; +ADD_SX xvec4, xvec12, xvec12; +ADD_SX xvec3, xvec11, xvec11; +ADD_SX xvec2, xvec10, xvec10; +ADD_SX xvec1, xvec9, xvec9; +ADD_SX xvec0, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 
2*SIZE(C0); @@ -1258,63 +1238,63 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0x44, xvec0, xvec1; EDUP_SX 16*SIZE(ptrbb), xvec2; ODUP_SX 16*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; ODUP_SX 20*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 24*SIZE(ptrbb), xvec2; ODUP_SX 24*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; 
-ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; ODUP_SX 28*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; @@ -1334,32 +1314,32 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1376,40 +1356,40 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX 
xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L13_loopE: LEAQ (ldc,ldc,2),%rax; #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -#ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec11; -LDH_SX 0*SIZE(C0, ldc, 2), xvec11; -LDL_SX 0*SIZE(C0, ldc, 1), xvec10; -LDH_SX 0*SIZE(C0, %rax, 1), xvec10; -LDL_SX 0*SIZE(C1), xvec9; -LDH_SX 0*SIZE(C1, ldc, 2), xvec9; -LDL_SX 0*SIZE(C1, ldc, 1), xvec8; -LDH_SX 0*SIZE(C1, %rax, 1), xvec8; -ADD_SX xvec11, xvec15; -ADD_SX xvec10, xvec14; -ADD_SX xvec9, xvec13; -ADD_SX xvec8, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +#ifndef TRMMKERNEL +LDL_SX 0*SIZE(C0), xvec11, xvec11; +LDH_SX 0*SIZE(C0, ldc, 2), xvec11, xvec11; +LDL_SX 0*SIZE(C0, ldc, 1), xvec10, xvec10; +LDH_SX 0*SIZE(C0, %rax, 1), xvec10, xvec10; +LDL_SX 0*SIZE(C1), xvec9, xvec9; +LDH_SX 0*SIZE(C1, ldc, 2), xvec9, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; +ADD_SX xvec11, xvec15, xvec15; +ADD_SX xvec10, xvec14, xvec14; +ADD_SX xvec9, xvec13, xvec13; +ADD_SX xvec8, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 2); @@ -1471,35 +1451,35 @@ ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; 
BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; LD_SX 24*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; @@ -1517,19 +1497,19 @@ ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1544,18 +1524,18 @@ ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $1, ptrba; ADDQ $4, ptrbb; 
.L16_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; LEAQ (ldc,ldc,2),%rax; SHUF_SX $0xff, xvec15, xvec13; @@ -1676,96 +1656,96 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; #### Unroll time 2 #### ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 8*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; 
-MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 16*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 20*SIZE(ptrba), xvec1; #### Unroll time 3 #### ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 12*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; ADDQ $16*SIZE, ptrbb; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 24*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 28*SIZE(ptrba), xvec1; ADDQ $32*SIZE, ptrba; @@ -1773,32 +1753,32 @@ ADDQ $32*SIZE, ptrba; ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; 
-MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; @@ -1816,33 +1796,33 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; ADDQ $16*SIZE, ptrba; @@ -1850,31 +1830,31 @@ ADDQ $16*SIZE, ptrba; ODUP_SX 
-4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; .L212_loopE: @@ -1889,70 +1869,70 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; MOV_SX xvec4, xvec6; ADDQ $8*SIZE, ptrba; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; 
-ADD_SX xvec5, xvec9; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -MUL_SX xvec7, xvec11; -MUL_SX xvec7, xvec10; -MUL_SX xvec7, xvec9; -MUL_SX xvec7, xvec8; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; #### Writing Back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 0*SIZE(C0, ldc, 1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 4*SIZE(C0, ldc, 1), xvec3; -LDH_SX 6*SIZE(C1, ldc, 1), xvec3; -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -LDL_SX 0*SIZE(C1, ldc, 1), xvec6; -LDH_SX 2*SIZE(C0, ldc, 1), xvec6; -LDL_SX 4*SIZE(C1, ldc, 1), xvec7; -LDH_SX 6*SIZE(C0, ldc, 1), xvec7; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; -ADD_SX xvec6, xvec9; -ADD_SX xvec7, xvec8; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 4*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_SX 6*SIZE(C1, ldc, 1), xvec3, xvec3; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 0*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 4*SIZE(C1, ldc, 1), xvec7, xvec7; +LDH_SX 6*SIZE(C0, ldc, 1), xvec7, xvec7; 
+ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2028,64 +2008,64 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; 
LD_SX 12*SIZE(ptrba), xvec1; EDUP_SX 12*SIZE(ptrbb), xvec2; ODUP_SX 12*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15 +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15 SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2106,32 +2086,32 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13 -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13 +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L222_loopE: @@ -2148,39 +2128,39 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, 
xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 0*SIZE(C0, ldc, 1), xvec1; -LDH_SX 2*SIZE(C1, ldc, 1), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C0), xvec2; -LDL_SX 0*SIZE(C1, ldc, 1), xvec3; -LDH_SX 2*SIZE(C0, ldc, 1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2242,35 +2222,35 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; 
-MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 12*SIZE(ptrbb), xvec6; ODUP_SX 12*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2290,18 +2270,18 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2318,10 +2298,10 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2392,23 +2372,23 @@ ALIGN_4 
.L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec2, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec4; LD_SX 8*SIZE(ptrbb), xvec5; -MUL_SX xvec4, xvec5; -ADD_SX xvec5, xvec15; +MUL_SX xvec4, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec6; LD_SX 12*SIZE(ptrbb), xvec7; -MUL_SX xvec6, xvec7; -ADD_SX xvec7, xvec15; +MUL_SX xvec6, xvec7, xvec7; +ADD_SX xvec7, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -2425,13 +2405,13 @@ ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec2, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2446,14 +2426,14 @@ ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L243_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; +MUL_SX xvec7, xvec15, xvec15; SHUF_SX $0xff, xvec15, xvec14; SHUF_SX $0xaa, xvec15, xvec13; SHUF_SX $0x55, xvec15, xvec12; @@ -2546,34 +2526,34 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, 
xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2581,34 +2561,34 @@ LD_SX 16*SIZE(ptrba), xvec0; LD_SX 20*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 24*SIZE(ptrba), xvec0; LD_SX 28*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, 
ptrbb; @@ -2630,34 +2610,34 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2676,40 +2656,40 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, 
xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #### Writing Back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C0), xvec2; -LDL_SX 4*SIZE(C1), xvec3; -LDH_SX 6*SIZE(C0), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 4*SIZE(C1), xvec3, xvec3; +LDH_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2774,35 +2754,35 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 12*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; 
-ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2822,18 +2802,18 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2850,25 +2830,25 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -LDH_SX 2*SIZE(C0), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2928,19 +2908,19 @@ LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 
0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 -MUL_SX xvec0, xvec2; # c00, c10 -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; # C01, c11 -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -2959,10 +2939,10 @@ LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 -MUL_SX xvec0, xvec2; # c00, c10 -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; # C01, c11 -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2985,7 +2965,7 @@ mulss xvec0, xvec2; addss xvec2, xvec15; mulss xvec1, xvec3; SHUF_SX $0xe1, xvec3, xvec4; -ADD_SX xvec4, xvec15; +ADD_SX xvec4, xvec15, xvec15; movss 1*SIZE(ptrbb), xvec5; XOR_SY yvec6, yvec6, yvec6; @@ -2994,26 +2974,26 @@ mulss xvec0, xvec5; addss xvec5, xvec14; mulss xvec1, xvec6; SHUF_SX $0xe1, xvec6, xvec7; -ADD_SX xvec7, xvec14 +ADD_SX xvec7, xvec14, xvec14 ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; SHUF_SX $0xee, xvec15, xvec13; SHUF_SX $0xee, xvec14, xvec12; SHUF_SX $0x44, xvec15, xvec11; SHUF_SX $0x44, xvec14, xvec10; -ADD_SX xvec13, xvec11; -ADD_SX xvec12, xvec10; +ADD_SX xvec13, xvec11, xvec11; +ADD_SX xvec12, xvec10, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -ADD_SX xvec0, xvec11; -ADD_SX xvec1, xvec10; +LDL_SX 
0*SIZE(C0), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec11, xvec11; +ADD_SX xvec1, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C0); STL_SX xvec10, 0*SIZE(C1); @@ -3305,14 +3285,14 @@ SHUF_SX $0xee, xvec15, xvec12; SHUF_SX $0x44, xvec14, xvec11; SHUF_SX $0xee, xvec14, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDL_SX 2*SIZE(C0), xvec1; -LDL_SX 4*SIZE(C0), xvec2; -LDL_SX 6*SIZE(C0), xvec3; -ADD_SX xvec0, xvec13; -ADD_SX xvec1, xvec12; -ADD_SX xvec2, xvec11; -ADD_SX xvec3, xvec10; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 2*SIZE(C0), xvec1, xvec1; +LDL_SX 4*SIZE(C0), xvec2, xvec2; +LDL_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec13, xvec13; +ADD_SX xvec1, xvec12, xvec12; +ADD_SX xvec2, xvec11, xvec11; +ADD_SX xvec3, xvec10, xvec10; #endif STL_SX xvec13, 0*SIZE(C0); STL_SX xvec12, 2*SIZE(C0); @@ -3368,23 +3348,23 @@ ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 3*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; @@ -3401,13 +3381,13 @@ ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ 
$2*SIZE, ptrbb; @@ -3422,19 +3402,19 @@ ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L423_loopE: #### Writing back #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; +MUL_SX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -3485,37 +3465,37 @@ SARQ $2, k; JLE .L431_loopE; ALIGN_4 .L431_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; - -movss 2*SIZE(ptrba), xvec3; -movss 3*SIZE(ptrba), xvec4; -movss 1*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; - -movss 4*SIZE(ptrba), xvec0; -movss 5*SIZE(ptrba), xvec1; -movss 2*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; - -movss 6*SIZE(ptrba), xvec3; -movss 7*SIZE(ptrba), xvec4; -movss 3*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; + +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; + +vmovss 4*SIZE(ptrba), xvec0; +vmovss 5*SIZE(ptrba), xvec1; +vmovss 2*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, 
xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; + +vmovss 6*SIZE(ptrba), xvec3; +vmovss 7*SIZE(ptrba), xvec4; +vmovss 3*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3530,21 +3510,21 @@ TEST $2, kkk; JLE .L432_loopE; ALIGN_4 .L432_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; - -movss 2*SIZE(ptrba), xvec3; -movss 3*SIZE(ptrba), xvec4; -movss 1*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; + +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; addq $4*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3557,28 +3537,28 @@ TEST $1, kkk; JLE .L433_loopE; ALIGN_4 .L433_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; addq $2*SIZE, ptrba; addq $1*SIZE, ptrbb; .L433_loopE: #### Writing Back #### -movss MEMALPHA, xvec7; -mulss xvec7, xvec15; -mulss xvec7, xvec14; +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; +vmulss xvec7, xvec14, xvec14; #ifndef TRMMKERNEL -addss 0*SIZE(C0), xvec15; -addss 1*SIZE(C0), xvec14; +vaddss 0*SIZE(C0), xvec15, xvec15; 
+vaddss 1*SIZE(C0), xvec14, xvec14; #endif -movss xvec15, 0*SIZE(C0); -movss xvec14, 1*SIZE(C0); +vmovss xvec15, 0*SIZE(C0); +vmovss xvec14, 1*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; @@ -3625,25 +3605,25 @@ SARQ $2, k; JLE .L441_loopE; ALIGN_4 .L441_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; - -movss 1*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; - -movss 2*SIZE(ptrba), xvec0; -movss 2*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; - -movss 3*SIZE(ptrba), xvec0; -movss 3*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 2*SIZE(ptrba), xvec0; +vmovss 2*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 3*SIZE(ptrba), xvec0; +vmovss 3*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3658,15 +3638,15 @@ TEST $2, kkk; JLE .L442_loopE; ALIGN_4 .L442_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; - -movss 1*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; + +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3679,21 +3659,21 @@ TEST $1, kkk; JLE .L443_loopE; ALIGN_4 .L443_bodyB: -movss 0*SIZE(ptrba), xvec0; 
-movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L443_loopE: #### Writing Back #### -movss MEMALPHA, xvec7; -mulss xvec7, xvec15; +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -addss 0*SIZE(C0), xvec15; +vaddss 0*SIZE(C0), xvec15, xvec15; #endif -movss xvec15, 0*SIZE(C0); +vmovss xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; @@ -3711,6 +3691,7 @@ MOV bk, k; SALQ $2, k; ADDQ k, bb; ADDQ ldc, C; + .L40_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; @@ -3718,6 +3699,9 @@ movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi @@ -3732,6 +3716,7 @@ movq 40(%rsp), %r15; movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif + addq $STACKSIZE, %rsp; ret diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index f6f9f707f..9f6fb8a5f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -148,74 +148,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef MOVQ #define MOVQ movq -#define XOR_SY vxorps #define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_DX vxorpd -#define LD_SY vmovaps #define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd +#define LD_DX vmovapd #define LDL_DY vmovlpd -#define LDL_DX movlpd +#define LDL_DX vmovlpd #define LDH_DY vmovhpd -#define LDH_DX movhpd +#define LDH_DX vmovhpd -#define ST_SY vmovaps #define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd +#define ST_DX vmovapd #define STL_DY vmovlpd -#define STL_DX movlpd +#define STL_DX vmovlpd #define STH_DY vmovhpd -#define STH_DX movhpd +#define STH_DX vmovhpd -#define EDUP_SY vmovsldup -#define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup #define EDUP_DY vmovddup -#define ADD_SY vaddps #define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_DX vaddpd #define SUB_DY vsubpd -#define SUB_DX subpd +#define SUB_DX vsubpd #define ADDSUB_DY vaddsubpd -#define ADDSUB_DX addsubpd -#define ADDSUB_SY vaddsubps +#define ADDSUB_DX vaddsubpd -#define MUL_SY vmulps #define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_DX vmulpd -#define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_DX vpshufd -#define VPERMILP_SY vpermilps -#define VPERMILP_SX vpermilps #define VPERMILP_DY vpermilpd -#define BROAD_SY vbroadcastss #define BROAD_DY vbroadcastsd -#define BROAD_SX vbroadcastss -#define BROAD_DX movddup +#define BROAD_DX vmovddup -#define MOV_SY vmovaps #define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_DX vmovapd -#define REVS_SY vshufps #define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_DX vmovsd #define EXTRA_DY vextractf128 @@ -282,6 +257,8 @@ movq old_offset, %r11; #endif #endif +vzeroupper + vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm @@ -1373,14 +1350,14 @@ EXTRA_DY $1, yvec14, 
xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0, ldc, 1), xvec7; -ADD_DX 0*SIZE(C0, ldc, 1), xvec13; -ADD_DX 2*SIZE(C0), xvec5; -ADD_DX 0*SIZE(C1), xvec14; -ADD_DX 2*SIZE(C1, ldc, 1), xvec6; -ADD_DX 0*SIZE(C1, ldc, 1), xvec12; -ADD_DX 2*SIZE(C1), xvec4; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec14, xvec14; +ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12; +ADD_DX 2*SIZE(C1), xvec4, xvec4; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C0, ldc, 1); @@ -1410,18 +1387,18 @@ EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $2, yvec12, xvec4; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0, ldc, 1), xvec1; -LDH_DX 3*SIZE(C0, ldc, 1), xvec1; -LDL_DX 0*SIZE(C0, ldc, 1), xvec2; -LDH_DX 1*SIZE(C0, ldc, 1), xvec2; -LDL_DX 2*SIZE(C0), xvec3; -LDH_DX 3*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec13; -ADD_DX xvec3, xvec5; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -1432,18 +1409,18 @@ STH_DX xvec13, 1*SIZE(C0, ldc, 1); STL_DX xvec6, 2*SIZE(C0); STH_DX xvec6, 3*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec0; -LDH_DX 1*SIZE(C1), xvec0; -LDL_DX 2*SIZE(C1, ldc, 1), xvec1; -LDH_DX 3*SIZE(C1, ldc, 1), xvec1; -LDL_DX 0*SIZE(C1, ldc, 1), xvec2; -LDH_DX 1*SIZE(C1, ldc, 1), xvec2; -LDL_DX 
2*SIZE(C1), xvec3; -LDH_DX 3*SIZE(C1), xvec3; -ADD_DX xvec0, xvec14; -ADD_DX xvec1, xvec6; -ADD_DX xvec2, xvec12; -ADD_DX xvec3, xvec4; +LDL_DX 0*SIZE(C1), xvec0, xvec0; +LDH_DX 1*SIZE(C1), xvec0, xvec0; +LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec3, xvec3; +LDH_DX 3*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec14, xvec14; +ADD_DX xvec1, xvec6, xvec6; +ADD_DX xvec2, xvec12, xvec12; +ADD_DX xvec3, xvec4, xvec4; #endif STL_DX xvec14, 0*SIZE(C1); STH_DX xvec14, 1*SIZE(C1); @@ -1680,18 +1657,18 @@ ADD2_DY yvec4, yvec14, yvec14; EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 0*SIZE(C0, ldc, 1), xvec1; -LDH_DX 1*SIZE(C0, ldc, 1), xvec1; -LDL_DX 0*SIZE(C1), xvec2; -LDH_DX 1*SIZE(C1), xvec2; -LDL_DX 0*SIZE(C1, ldc, 1), xvec3; -LDH_DX 1*SIZE(C1, ldc, 1), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2063,14 +2040,14 @@ JNE .L213_loopEx; ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0),xvec15; -ADD_DX 2*SIZE(C1),xvec7; -ADD_DX 4*SIZE(C0),xvec14; -ADD_DX 6*SIZE(C1),xvec6; -ADD_DX 0*SIZE(C1),xvec13; -ADD_DX 2*SIZE(C0),xvec5; -ADD_DX 4*SIZE(C1),xvec12; -ADD_DX 6*SIZE(C0),xvec4; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 
6*SIZE(C1), xvec6, xvec6; +ADD_DX 0*SIZE(C1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 4*SIZE(C1), xvec12, xvec12; +ADD_DX 6*SIZE(C0), xvec4, xvec4; #endif ST_DX xvec15,0*SIZE(C0); ST_DX xvec7,2*SIZE(C1); @@ -2098,18 +2075,18 @@ JMP .L21_loopE; ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C1), xvec1; -LDH_DX 3*SIZE(C1), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C1), xvec3; -LDH_DX 7*SIZE(C1), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C1), xvec3, xvec3; +LDH_DX 7*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2120,18 +2097,18 @@ STH_DX xvec14, 5*SIZE(C0); STL_DX xvec6, 6*SIZE(C1); STH_DX xvec6, 7*SIZE(C1); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec3; -LDH_DX 1*SIZE(C1), xvec3; -LDL_DX 2*SIZE(C0), xvec2; -LDH_DX 3*SIZE(C0), xvec2; -LDL_DX 4*SIZE(C1), xvec1; -LDH_DX 5*SIZE(C1), xvec1; -LDL_DX 6*SIZE(C0), xvec0; -LDH_DX 7*SIZE(C0), xvec0; -ADD_DX xvec3, xvec13; -ADD_DX xvec2, xvec5; -ADD_DX xvec1, xvec12; -ADD_DX xvec0, xvec4; +LDL_DX 0*SIZE(C1), xvec3, xvec3; +LDH_DX 1*SIZE(C1), xvec3, xvec3; +LDL_DX 2*SIZE(C0), xvec2, xvec2; +LDH_DX 3*SIZE(C0), xvec2, xvec2; +LDL_DX 4*SIZE(C1), xvec1, xvec1; +LDH_DX 5*SIZE(C1), xvec1, xvec1; +LDL_DX 6*SIZE(C0), xvec0, xvec0; +LDH_DX 7*SIZE(C0), xvec0, xvec0; +ADD_DX xvec3, xvec13, xvec13; +ADD_DX xvec2, xvec5, xvec5; +ADD_DX xvec1, xvec12, xvec12; +ADD_DX xvec0, xvec4, xvec4; #endif STL_DX xvec13, 0*SIZE(C1); STH_DX xvec13, 1*SIZE(C1); @@ -2384,18 +2361,18 @@ EXTRA_DY $1, yvec15, xvec7; 
EXTRA_DY $1, yvec13, xvec5; #### Write back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C1), xvec1; -LDH_DX 3*SIZE(C1), xvec1; -LDL_DX 0*SIZE(C1), xvec2; -LDH_DX 1*SIZE(C1), xvec2; -LDL_DX 2*SIZE(C0), xvec3; -LDH_DX 3*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec13; -ADD_DX xvec3, xvec5; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2582,12 +2559,12 @@ ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 0*SIZE(C1), xvec1; -LDH_DX 1*SIZE(C1), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 1*SIZE(C1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2845,18 +2822,18 @@ EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C0), xvec3; -LDH_DX 7*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; 
+LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3026,12 +3003,12 @@ ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3084,43 +3061,43 @@ ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec0; BROAD_DX 4*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 5*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec0; BROAD_DX 6*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, 
xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 7*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -3137,23 +3114,23 @@ ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3168,13 +3145,13 @@ ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -3182,14 +3159,14 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_DX xvec15, xvec7; +ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_DX xvec15, xvec7; +SUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_DX $0x4e, xvec15, xvec15; -ADDSUB_DX xvec15, xvec7; +ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, xvec15; SHUF_DX $0x4e, xvec15, xvec15; #endif @@ 
-3199,14 +3176,14 @@ BROAD_DX MEMALPHA_R,xvec7; BROAD_DX MEMALPHA_I,xvec6; #### Multiply Alpha #### SHUF_DX $0x4e, xvec15, xvec5; -MUL_DX xvec7, xvec15; -MUL_DX xvec6, xvec5; -ADDSUB_DX xvec5, xvec15; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec6, xvec5, xvec5; +ADDSUB_DX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -ADD_DX xvec0, xvec15; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3237,6 +3214,9 @@ movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi From 13f5f181406df3de4553d2481206df1b19a99b4a Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 26 Jun 2012 07:43:06 +0800 Subject: [PATCH 069/162] Updated the doc for 0.2.0 version. --- Changelog.txt | 15 +++++++++++++++ README.md | 6 +++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 0ed35b0e4..c222c7eee 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,17 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.0 +26-Jun-2012 +common: + * Removed the limitation (64) of numbers of CPU cores. + Now, it supports 256 cores at max. + * Supported clang compiler. + * Fixed some build bugs on FreeBSD +x86/x86-64: + * Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 by AVX instructions. + Please use gcc >= 4.6 or clang >=3.1. + * Support AMD Bobcat by using GotoBLAS2 AMD Barcelona codes. + ==================================================================== Version 0.1.1 29-Apr-2012 @@ -7,6 +20,8 @@ common: * Supported LAPACKE, a C interface to LAPACKE. (Thank Zaheer Chothia) * Fixed the build bug (MD5 and download) on Mac OSX. * Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1. 
+ * Fixed the compatibility issue for compilers without C99 complex number + (e.g. Visual Studio) x86/x86_64: * Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX. * Test alpha=Nan in dscale. diff --git a/README.md b/README.md index a13e069ec..82e9f528c 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenB Please read the documents on OpenBLAS wiki pages . -## Intallation +## Installation Download from project homepage. http://xianyi.github.com/OpenBLAS/ Or, check out codes from git://github.com/xianyi/OpenBLAS.git @@ -76,9 +76,9 @@ The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS. If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1. -### Set the number of threads with calling functions. +### Set the number of threads on runtime. -Examples: +We provided the below functions to control the number of threads on runtime. So far, we didn't support changing the number of threads on Windows. On Windows, these functions are dummy. void goto_set_num_threads(int num_threads); From a507b56ab1bf0a69d22f93e24df2a71e4e08c44c Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 29 Jun 2012 15:53:24 +0800 Subject: [PATCH 070/162] Refs #119 #118. Fixed disabling hyper threading bug. --- driver/others/init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/init.c b/driver/others/init.c index 4a6f0aae8..f6924d5f4 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -447,6 +447,9 @@ static void disable_hyperthread(void) { //When the shared cpu are in different element of share & avail array, this may be a bug.
for (i = 0; i < count ; i++){ + + share[i] &= common->avail[i]; + if (popcount(share[i]) > 1) { #ifdef DEBUG From a4308807298c650118f86e5cb8126aea6d41f863 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 30 Jun 2012 08:25:36 +0800 Subject: [PATCH 071/162] Updated the do for 0.2.1 version. --- Changelog.txt | 8 ++++++++ Makefile.rule | 2 +- README.md | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index c222c7eee..019870d8c 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,12 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.1 +30-Jun-2012 +common: +x86/x86-64: + * Fixed the SEGFAULT bug about hyper-threading + * Support AMD Bulldozer by using GotoBLAS2 AMD Barcelona codes + ==================================================================== Version 0.2.0 26-Jun-2012 diff --git a/Makefile.rule b/Makefile.rule index 299273773..082487835 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.0 +VERSION = 0.2.1 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/README.md b/README.md index 82e9f528c..52d098366 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Please read GotoBLAS_01Readme.txt - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. +- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. #### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
From f22bfe6a55dc6b52ab5c6de46e12b81926f5c227 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 2 Jul 2012 02:49:12 +0200 Subject: [PATCH 072/162] Set the tests for hurd similar to linux --- ctest.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctest.c b/ctest.c index 9fc0b0c40..d12e7be99 100644 --- a/ctest.c +++ b/ctest.c @@ -70,6 +70,11 @@ OS_CYGWIN_NT OS_INTERIX #endif +#if defined(__gnu_hurd__) +/* Hurd is very similar to GNU/Linux, it should work out of the box */ +OS_LINUX +#endif + #if defined(__i386) || defined(_X86) ARCH_X86 #endif From 5719b7a58d25c302a9190d5f936be7dae97851d5 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 2 Jul 2012 02:50:02 +0200 Subject: [PATCH 073/162] if SYS_gettid doesn't exist (like under HURD), use getpid() instead --- common_linux.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/common_linux.h b/common_linux.h index b0381d991..6766ff37c 100644 --- a/common_linux.h +++ b/common_linux.h @@ -86,7 +86,13 @@ static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned return syscall(SYS_set_mempolicy, mode, addr, flag); } -static inline int my_gettid(void) { return syscall(SYS_gettid); } +static inline int my_gettid(void) { +#ifdef SYS_gettid +return syscall(SYS_gettid); +#else +return getpid(); +#endif +} #endif #endif From 4b7677a9165f2417c602152ea9fa5dfe87c21685 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 2 Jul 2012 02:50:41 +0200 Subject: [PATCH 074/162] When dealing with the kfreebsd kernel, set the same behavior as FreeBSD --- ctest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest.c b/ctest.c index d12e7be99..95a5e8bb2 100644 --- a/ctest.c +++ b/ctest.c @@ -34,7 +34,7 @@ COMPILER_GNU OS_LINUX #endif -#if defined(__FreeBSD__) +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) OS_FREEBSD #endif From 3692b4d631cc34de62ad17dd0519d8332d6231a3 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Mon, 2 Jul 2012 
02:51:38 +0200 Subject: [PATCH 075/162] Improve the detection of sparc --- driver/others/memory.c | 2 +- param.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9b8863f39..af9b54eff 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1128,7 +1128,7 @@ static BLASULONG init_lock = 0UL; static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, void *sa, void *sb, BLASLONG pos) { -#ifndef ARCH_POWER +#if !defined(ARCH_POWER) && !defined(ARCH_SPARC) long size; BLASULONG buffer; diff --git a/param.h b/param.h index 5465c1cbd..c6cd354be 100644 --- a/param.h +++ b/param.h @@ -1482,7 +1482,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_THREAD gemm_thread_mn #endif -#if defined(SPARC) && defined(V9) +#if (defined(SPARC) && defined(V9)) || defined(__sparc_v9__) #define SNUMOPT 2 #define DNUMOPT 2 From 6fcdaa438727f2c6e4445216b8fe516197bd7535 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 5 Jul 2012 13:41:03 +0800 Subject: [PATCH 076/162] Detect Sandy Bridge 22nm desktop/mobile CPU. 
--- cpuid_x86.c | 54 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index ea1162e8f..b304cdade 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -975,27 +975,33 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: - switch (model) { - case 5: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - return CPUTYPE_NEHALEM; - case 10: - //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_SANDYBRIDGE; - case 12: - //Xeon Processor 5600 (Westmere-EP) - return CPUTYPE_NEHALEM; - case 13: - //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CPUTYPE_SANDYBRIDGE; - case 15: - //Xeon Processor E7 (Westmere-EX) - return CPUTYPE_NEHALEM; - } - break; + case 2: + switch (model) { + case 5: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + return CPUTYPE_NEHALEM; + case 10: + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + return CPUTYPE_SANDYBRIDGE; + case 12: + //Xeon Processor 5600 (Westmere-EP) + return CPUTYPE_NEHALEM; + case 13: + //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) + return CPUTYPE_SANDYBRIDGE; + case 15: + //Xeon Processor E7 (Westmere-EX) + return CPUTYPE_NEHALEM; + } + break; + case 3: + switch (model) { + case 10: + return CPUTYPE_SANDYBRIDGE; + } + break; } break; case 0x7: @@ -1349,6 +1355,12 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 3: + switch (model) { + case 10: + return CORE_SANDYBRIDGE; + } + break; } break; From 4a5d08d0cf042e41836eafd98a2b9bb3f1790374 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 6 Jul 2012 21:37:51 +0800 Subject: [PATCH 077/162] Refs #123. 
Fixed exporting DLL functions bug on Windows --- exports/gensymbol | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index e09a8b6ab..64c92d396 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2760,30 +2760,27 @@ if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; - #remove openblas_set_num_threads - @underscore_objs = grep /[^openblas_set_num_threads]/,@underscore_objs; - foreach $objs (@underscore_objs) { - $uppercase = $objs; - $uppercase =~ tr/[a-z]/[A-Z]/; - print "\t$objs=$objs","_ \@", $count, "\n"; - $count ++; - print "\t",$objs, "_=$objs","_ \@", $count, "\n"; - $count ++; - print "\t$uppercase=$objs", "_ \@", $count, "\n"; - $count ++; + unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs=$objs","_ \@", $count, "\n"; + $count ++; + print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + $count ++; + print "\t$uppercase=$objs", "_ \@", $count, "\n"; + $count ++; + } } - #for openblas_set_num_threads + #for openblas_set_num_threads print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; + $count ++; + + foreach $objs (@no_underscore_objs) { + print "\t",$objs,"=$objs"," \@", $count, "\n"; $count ++; - -# if ($ARGV[4] == 0) { - foreach $objs (@no_underscore_objs) { - print "\t",$objs,"=$objs"," \@", $count, "\n"; - $count ++; - } -# } + } exit(0); } From 50848e34ec2b67ff38ee4ddb057ec8884707a469 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 6 Jul 2012 22:08:35 +0800 Subject: [PATCH 078/162] Updated the doc for 0.2.2 version. 
--- Changelog.txt | 12 ++++++++++++ Makefile.rule | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 019870d8c..4e80473d6 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,16 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.2 +6-July-2012 +common: + * Fixed exporting DLL functions bug on Windows/MingW + * Support GNU Hurd (Thank Sylvestre Ledru) + * Support kfreebsd kernel (Thank Sylvestre Ledru) +x86/x86-64: + * Support Intel Sandy Bridge 22nm desktop/mobile CPU +SPARC: + * Improve the detection of SPARC (Thank Sylvestre Ledru) + ==================================================================== Version 0.2.1 30-Jun-2012 diff --git a/Makefile.rule b/Makefile.rule index 082487835..85abf584b 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.1 +VERSION = 0.2.2 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 25f1a573fdacef5131ecd2b06b9d0c00b0a8915f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 7 Jul 2012 12:12:24 +0800 Subject: [PATCH 079/162] Fixed the build bug when DYNAMIC_ARCH=0. 
--- Makefile | 8 ++++---- Makefile.system | 4 ++-- driver/others/Makefile | 4 ++-- kernel/Makefile | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 796217291..e538949db 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ include ./Makefile.system BLASDIRS = interface driver/level2 driver/level3 driver/others -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH), 1) BLASDIRS += kernel endif @@ -147,7 +147,7 @@ ifeq ($(EXPRECISION), 1) echo "#define EXPRECISION">> config_last.h endif ## -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ @@ -165,7 +165,7 @@ prof_blas : $(MAKE) -C $$d prof || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonprof || exit 1 endif @@ -184,7 +184,7 @@ hpl : $(MAKE) -C $$d $(@F) || exit 1 ; \ fi; \ done -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) $(MAKE) -C kernel commonlibs || exit 1 for d in $(DYNAMIC_CORE) ; \ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ diff --git a/Makefile.system b/Makefile.system index 425cbb68a..e4b92539c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -244,7 +244,7 @@ endif endif -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO @@ -705,7 +705,7 @@ ifndef LIBSUFFIX LIBSUFFIX = a endif -ifndef DYNAMIC_ARCH +ifneq ($(DYNAMIC_ARCH), 1) ifndef SMP LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) diff --git a/driver/others/Makefile b/driver/others/Makefile index 2fdbb4a42..921f47c9c 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -14,7 +14,7 @@ endif # COMMONOBJS += info.$(SUFFIX) -ifdef DYNAMIC_ARCH 
+ifeq ($(DYNAMIC_ARCH), 1) COMMONOBJS += dynamic.$(SUFFIX) else COMMONOBJS += parameter.$(SUFFIX) @@ -70,7 +70,7 @@ ifndef BLAS_SERVER BLAS_SERVER = blas_server.c endif -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) diff --git a/kernel/Makefile b/kernel/Makefile index aed145b60..41c5e89fd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,7 +48,7 @@ HPLOBJS = \ COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) -ifdef DYNAMIC_ARCH +ifeq ($(DYNAMIC_ARCH), 1) SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) CCOMMON_OPT += -DTS=$(TSUFFIX) endif From 3108a1853de34244a94aa01dc62e8464ae7a2b9e Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 13 Jul 2012 14:19:30 +0800 Subject: [PATCH 080/162] Added the doc for the conflict with R parallel. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 52d098366..befc14fc0 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * The number of CPUs/Cores should less than or equal to 256. +* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1. * On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 
## Specification of Git Branches From e8306f623aee1ba6b5eba9e2538f058419fb5992 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 30 Jul 2012 19:46:30 +0200 Subject: [PATCH 081/162] Refs #127. Generate DLL without a version suffix on Windows. --- Makefile | 2 -- Makefile.system | 2 +- exports/Makefile | 5 +++++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index e538949db..c152488bc 100644 --- a/Makefile +++ b/Makefile @@ -99,11 +99,9 @@ ifeq ($(OSNAME), Darwin) endif ifeq ($(OSNAME), WINNT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll - -ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll endif tests : diff --git a/Makefile.system b/Makefile.system index e4b92539c..63d3577a7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -724,8 +724,8 @@ endif endif +LIBDLLNAME = $(LIBPREFIX).dll LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) -LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) diff --git a/exports/Makefile b/exports/Makefile index 40a3a7c63..c507032e9 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -66,6 +66,11 @@ dll : ../$(LIBDLLNAME) dll2 : libgoto2_shared.dll +# On Windows, we only generate a DLL without a version suffix. This is because +# applications which link against the dynamic library reference a fixed DLL name +# in their import table. By instead using a stable name it is possible to +# upgrade between library versions, without needing to re-link an application. +# For more details see: https://github.com/xianyi/OpenBLAS/issues/127. 
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX) $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) From 1b056c532860a1cec7294dc98d199b181cd6e894 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 9 Aug 2012 20:06:51 +0800 Subject: [PATCH 082/162] Refs #130 Prevent reading ipiv array beyond the bound in ?laswp. Use laswp instead of laswp_oncopy in getrf. --- lapack/getrf/getrf_parallel.c | 5 +- lapack/getrf/getrf_parallel_omp.c | 11 + lapack/getrf/getrf_single.c | 2 +- lapack/laswp/generic/laswp_k_1.c | 97 ++- lapack/laswp/generic/laswp_k_2.c | 296 +++++-- lapack/laswp/generic/laswp_k_4.c | 446 ++++++++-- lapack/laswp/generic/laswp_k_8.c | 1300 +++++++++++++++++++++-------- lapack/laswp/generic/zlaswp_k_1.c | 133 ++- lapack/laswp/generic/zlaswp_k_2.c | 283 ++++++- lapack/laswp/generic/zlaswp_k_4.c | 778 ++++++++++++++--- lapack/laswp/x86/Makefile | 4 +- lapack/laswp/x86_64/Makefile | 4 +- patch.for_lapack-3.4.1 | 8 +- 13 files changed, 2693 insertions(+), 674 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 0db93da92..6f6672099 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -118,7 +118,7 @@ static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *ra min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, @@ -245,7 +245,8 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * min_jj = MIN(n_to, xxx + div_n) - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - if (GEMM_UNROLL_N <= 8) { + if (0 && GEMM_UNROLL_N <= 8) { + printf("helllo\n"); LASWP_NCOPY(min_jj, off + 1, off + k, b + (- off + jjs * lda) * COMPSIZE, lda, diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c index b637e6db5..4922b9b52 100644 --- 
a/lapack/getrf/getrf_parallel_omp.c +++ b/lapack/getrf/getrf_parallel_omp.c @@ -77,10 +77,21 @@ static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_jj = js + min_j - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; +#if 0 LASWP_NCOPY(min_jj, off + 1, off + k, c + (- off + jjs * lda) * COMPSIZE, lda, ipiv, sb + k * (jjs - js) * COMPSIZE); +#else + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sb + (jjs - js) * k * COMPSIZE); +#endif + for (is = 0; is < k; is += GEMM_P) { min_i = k - is; if (min_i > GEMM_P) min_i = GEMM_P; diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c index a761dee4c..fcea0ae89 100644 --- a/lapack/getrf/getrf_single.c +++ b/lapack/getrf/getrf_single.c @@ -113,7 +113,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; -#if 0 +#if 1 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index c19017631..1b0db5f8c 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -58,13 +58,34 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + 
for(j=0; j 0) { @@ -85,10 +106,11 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + + i--; + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -172,12 +194,69 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a1 -= 2; #endif i --; - } while (i > 0); } + + //Loop Ending + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 1105aee82..8a8a89bd1 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -50,7 +50,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3; FLOAT *b1, *b2, *b3, *b4; @@ -60,8 +60,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif @@ -69,6 +68,28 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG if 
(n <= 0) return 0; j = (n >> 1); + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { do { piv = ipiv; @@ -92,10 +113,13 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); + i = ((rows) >> 1); - if (i > 0) { - do { + // Loop pipeline + i--; + + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -202,12 +226,99 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -240,78 +351,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - 
do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + 
+ //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index e08d49667..86ee949c4 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -54,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -66,14 +66,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1 -; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 2); if (j > 0) { do { @@ -106,8 +127,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG i = ((k2 - k1) >> 1); - if (i > 0) { - do { + i--; //Loop pipeline + //Main Loop + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -259,12 +281,156 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a7 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + 
*a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + //Remain + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -312,10 +478,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = ((rows) >> 1); + i--; + + while (i > 0) { A1 = *a1; A2 = *a2; A3 = *a3; @@ -409,12 +575,97 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a3 -= 2; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } 
else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -445,78 +696,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + 
*b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index a4d4bce99..e3a05dbcc 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -60,9 +60,9 @@ #endif int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, - FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *a9, *a11, *a13, *a15; @@ -79,13 +79,35 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG k1 --; #ifndef MINUS - ipiv += k1; + ipiv += k1; #else ipiv -= (k2 - 1) * incx; #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv; + a1 = a + k1 + 1; + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 3); if (j > 0) { do { @@ -129,50 +151,51 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT 
dummy1, FLOAT *a, BLASLONG b15 = b1 + 7 * lda; b16 = b2 + 7 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + i = (rows >> 1); + i--; + //Loop pipeline + //Main Loop + while (i > 0) { + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - B9 = *b9; - B10 = *b10; - B11 = *b11; - B12 = *b12; - B13 = *b13; - B14 = *b14; - B15 = *b15; - B16 = *b16; + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = *a7; - A8 = *a8; - - A9 = *a9; - A10 = *a10; - A11 = *a11; - A12 = *a12; - A13 = *a13; - A14 = *a14; - A15 = *a15; - A16 = *a16; - - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; if (b1 == a1) { if (b2 == a1) { @@ -371,51 +394,316 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG } } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; - b9 = b1 + 4 * lda; - b10 = b2 + 4 * lda; - b11 = b1 + 5 * lda; - b12 = b2 + 5 * lda; - b13 = b1 + 6 * lda; - b14 = b2 + 6 * lda; - b15 = b1 + 7 * lda; - b16 = b2 + 7 * lda; + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 
2; - a9 += 2; - a11 += 2; - a13 += 2; - a15 += 2; + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; #else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; - a9 -= 2; - a11 -= 2; - a13 -= 2; - a15 -= 2; + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = *a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + + *a10 = B10; + *b10 = A10; + *a12 = B12; + *b12 = A12; + *a14 = B14; + *b14 = A14; + *a16 = B16; + *b16 = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + + *a9 = A10; + *a10 = B10; + *b10 = A9; + *a11 = A12; + *a12 = B12; + *b12 = A11; + *a13 = A14; + *a14 = B14; + *b14 = A13; + *a15 = A16; + *a16 = B16; + *b16 = A15; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + 
*b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + + *a9 = A10; + *a10 = B9; + *b9 = A9; + *a11 = A12; + *a12 = B11; + *b11 = A11; + *a13 = A14; + *a14 = B13; + *b13 = A13; + *a15 = A16; + *a16 = B15; + *b15 = A15; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + + *a9 = B9; + *a10 = A9; + *b9 = A10; + *a11 = B11; + *a12 = A11; + *b11 = A12; + *a13 = B13; + *a14 = A13; + *b13 = A14; + *a15 = B15; + *a16 = A15; + *b15 = A16; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + + *a9 = B9; + *a10 = B10; + *b9 = A9; + *b10 = A10; + *a11 = B11; + *a12 = B12; + *b11 = A11; + *b12 = A12; + *a13 = B13; + *a14 = B14; + *b13 = A13; + *b14 = A14; + *a15 = B15; + *a16 = B16; + *b15 = A15; + *b16 = A16; + } + } + + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + b9 = b1 + 4 * lda; + b11 = b1 + 5 * lda; + b13 = b1 + 6 * lda; + b15 = b1 + 7 * lda; + + A1 = *a1; B1 = *b1; A3 = *a3; @@ -487,187 +775,327 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; - A5 = *a5; - A6 = *a6; - A7 = 
*a7; - A8 = *a8; + i = (rows >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; - B5 = *b5; - B6 = *b6; - B7 = *b7; - B8 = *b8; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; *a3 = A4; - *a4 = A3; + *a4 = B3; + *b3 = A3; *a5 = A6; - *a6 = A5; + *a6 = B5; + *b5 = A5; *a7 = A8; - *a8 = A7; + *a8 = B7; + *b7 = A7; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - *a4 = B4; - *b4 = A4; - *a6 = B6; - *b6 = A6; - *a8 = B8; - *b8 = A8; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; *a2 = A1; - *a3 = A4; + *b1 = A2; + *a3 = B3; *a4 = A3; - *a5 = A6; + *b3 = A4; + *a5 = B5; *a6 = A5; - *a7 = A8; + *b5 = A6; + *a7 = B7; *a8 = A7; + *b7 = A8; } else { - *a1 = A2; + *a1 = B1; *a2 = B2; - *b2 = A1; - *a3 = A4; + *b1 = A1; + *b2 = A2; + *a3 = B3; *a4 = B4; - *b4 = A3; - *a5 = A6; + *b3 = A3; + *b4 = A4; + *a5 = B5; *a6 = B6; - *b6 = A5; - *a7 = A8; - 
*a8 = B8; - *b8 = A7; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; - *b1 = A1; - *a3 = A4; - *a4 = B3; - *b3 = A3; - *a5 = A6; - *a6 = B5; - *b5 = A5; - *a7 = A8; - *a8 = B7; - *b7 = A7; - } else - if (b2 == a2) { - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; *b5 = A5; + *b6 = A6; *a7 = B7; + *a8 = B8; *b7 = A7; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - *a5 = B5; - *a6 = A5; - *b5 = A6; - *a7 = B7; - *a8 = A7; - *b7 = A8; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - *a5 = B5; - *a6 = B6; - *b5 = A5; - *b6 = A6; - *a7 = B7; - *a8 = B8; - *b7 = A7; - *b8 = A8; - } - } + *b8 = A8; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; - b5 = b1 + 2 * lda; - b6 = b2 + 2 * lda; - b7 = b1 + 3 * lda; - b8 = b2 + 3 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; - a5 += 2; - a7 += 2; + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; #else - a1 -= 2; - a3 -= 2; - a5 -= 2; - a7 -= 2; + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; #endif - i --; - } while (i > 0); + i --; + } + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = 
A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } } + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + + i = (rows & 1); - i = ((k2 - k1) & 1); - - if (i > 0) { - A1 = *a1; - B1 = *b1; - A3 = *a3; - B3 = *b3; - A5 = *a5; - B5 = *b5; - A7 = *a7; - B7 = *b7; + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; - *a1 = B1; - *b1 = A1; - *a3 = B3; - *b3 = A3; - *a5 = B5; - *b5 = A5; - *a7 = B7; - *b7 = A7; - } + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } - a += 4 * lda; + a += 4 * lda; } if (n & 2) { @@ -692,109 +1120,194 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b3 = b1 + 1 * lda; b4 = b2 + 1 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - A3 = *a3; - A4 = *a4; + i = ((rows) >> 1); + i--; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; - B1 = *b1; - B2 = *b2; - B3 = *b3; - B4 = *b4; + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv 
+= incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; *a3 = A4; - *a4 = A3; + *a4 = B3; + *b3 = A3; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - *a4 = B4; - *b4 = A4; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - *a3 = A4; - *a4 = A3; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - *a3 = A4; - *a4 = B4; - *b4 = A3; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; - *a3 = A4; - *a4 = B3; + *a3 = B3; *b3 = A3; } else - if (b2 == a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { *a1 = B1; + *a2 = B2; *b1 = A1; + *b2 = A2; *a3 = B3; + *a4 = B4; *b3 = A3; - } else - if (b2 == b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - *a3 = B3; - *a4 = A3; - *b3 = A4; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - *a3 = B3; - *a4 = B4; - *b3 = A3; - *b4 = A4; - } - } + *b4 = A4; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + 1 * lda; - b4 = b2 + 1 * lda; + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; #ifndef MINUS - a1 += 2; - a3 += 2; + a1 += 2; + a3 += 2; #else - a1 -= 2; - a3 -= 2; + a1 -= 2; + a3 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); + //Loop Ending + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } 
else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + + i = ((rows) & 1); if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + b3 = b1 + 1 * lda; + A1 = *a1; B1 = *b1; A3 = *a3; @@ -825,78 +1338,135 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *a1; - A2 = *a2; - B1 = *b1; - B2 = *b2; + i = ((rows) >> 1); + i --; + + while (i > 0) { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; - ip1 = *piv; - piv += incx; - ip2 = *piv; - piv += incx; + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { if (b2 == a1) { *a1 = A2; - *a2 = A1; + *a2 = B1; + *b1 = A1; } else - if (b2 != a2) { - *a2 = B2; - *b2 = A2; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *a1 = A2; - *a2 = A1; - } else { - *a1 = A2; - *a2 = B2; - *b2 = A1; - } - } - } else { - if (b2 == a1) { - *a1 = A2; - *a2 = B1; + if (b2 == a2) { + *a1 = B1; *b1 = A1; } else - if (b2 == a2) { + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { *a1 = B1; + *a2 = B2; *b1 = A1; - } else - if (b2 
== b1) { - *a1 = B1; - *a2 = A1; - *b1 = A2; - } else { - *a1 = B1; - *a2 = B2; - *b1 = A1; - *b2 = A2; - } - } + *b2 = A2; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; #ifndef MINUS - a1 += 2; + a1 += 2; #else - a1 -= 2; + a1 -= 2; #endif - i --; - } while (i > 0); + i --; } - i = ((k2 - k1) & 1); - + //Loop Ending (n=1) + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + + //Remain + i = (rows & 1); + if (i > 0) { + ip1 = *piv; + b1 = a + ip1; + A1 = *a1; B1 = *b1; *a1 = B1; diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index 3dd653baf..7a62dd9b8 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -66,6 +66,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j 0) { @@ -87,9 +119,10 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT 
dummy4, b2 = a + ip2; i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i --; + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef OPTERON #ifndef MINUS asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); @@ -198,12 +231,98 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); + } - - i = ((k2 - k1) & 1); + + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index a877ef66b..0fa685859 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -49,7 +49,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, 
FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1; FLOAT *b1, *b2; @@ -68,6 +68,38 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 1); if (j > 0) { @@ -88,10 +120,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { #ifdef CORE2 #ifndef MINUS asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); @@ -246,12 +280,149 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } - - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a1 + 0 + lda); + A6 = *(a1 + 1 + lda); + A7 = *(a2 + 0 + lda); + A8 = *(a2 + 1 + lda); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b1 + 0 + lda); + B6 = *(b1 + 1 + lda); + B7 = *(b2 + 0 + lda); + B8 = *(b2 + 1 + lda); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) 
= A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A5; + *(b2 + 1 + lda) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B5; + *(a2 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + *(b1 + 0 + lda) = A7; + *(b1 + 1 + lda) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } + + + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a1 + 0 + lda); @@ -293,10 +464,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -384,12 +557,94 @@ int CNAME(BLASLONG n, 
BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 4dc559895..c63a8e2e0 100644 --- a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, FLOAT *a, BLASLONG lda, FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ - BLASLONG i, j, ip1, ip2; + BLASLONG i, j, ip1, ip2, rows; blasint *piv; FLOAT *a1, *a3, *a5, *a7; FLOAT *b1, *b2, *b3, *b4; @@ -76,6 +76,38 @@ int 
CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, #endif if (n <= 0) return 0; + rows = k2-k1; + if (rows <=0) return 0; + if (rows == 1) { + //Only have 1 row + ip1 = *ipiv * 2; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + b1 = a + ip1; + + if(a1 == b1) return 0; + + for(j=0; j> 2); if (j > 0) { @@ -107,10 +139,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b7 = b1 + 3 * lda; b8 = b2 + 3 * lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -366,12 +400,260 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a7 -= 4; #endif i --; - } while (i > 0); } - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + A9 = *(a5 + 0); + A10 = *(a5 + 1); + A11 = *(a6 + 0); + A12 = *(a6 + 1); + A13 = *(a7 + 0); + A14 = *(a7 + 1); + A15 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + B9 = *(b5 + 0); + B10 = *(b5 + 1); + B11 = *(b6 + 0); + B12 = *(b6 + 1); + B13 = *(b7 + 0); + B14 = *(b7 + 1); + B15 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = 
A11; + *(b6 + 1) = A12; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A9; + *(b6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A13; + *(b8 + 1) = A14; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B9; + *(a6 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B13; + *(a8 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + + *(a5 + 0) = B9; + *(a5 + 1) = B10; 
+ *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(b5 + 0) = A11; + *(b5 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + *(b7 + 0) = A15; + *(b7 + 1) = A16; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } + +#ifndef MINUS + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; +#else + a1 -= 4; + a3 -= 4; + a5 -= 4; + a7 -= 4; +#endif + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + b3 = b1 + 1 * lda; + b5 = b1 + 2 * lda; + b7 = b1 + 3 * lda; + + + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -435,161 +717,303 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b3 = b1 + lda; b4 = b2 + lda; - i = ((k2 - k1) >> 1); - - if (i > 0) { - do { - A1 = *(a1 + 0); - A2 = *(a1 + 1); - A3 = *(a2 + 0); - A4 = *(a2 + 1); + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); - A5 = *(a3 + 0); - A6 = *(a3 + 1); - A7 = *(a4 + 0); - A8 = *(a4 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); - B1 = *(b1 + 0); - B2 = *(b1 + 1); - B3 = *(b2 + 0); - B4 = *(b2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); - B5 = *(b3 + 0); - B6 = *(b3 + 1); - B7 = *(b4 + 0); - B8 = *(b4 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); - ip1 = *piv * 2; - piv += incx; - ip2 = *piv * 2; - piv 
+= incx; + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; - if (b1 == a1) { + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { if (b2 == a1) { *(a1 + 0) = A3; *(a1 + 1) = A4; - *(a2 + 0) = A1; - *(a2 + 1) = A2; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; *(a3 + 0) = A7; *(a3 + 1) = A8; - *(a4 + 0) = A5; - *(a4 + 1) = A6; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; } else - if (b2 != a2) { - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b2 + 0) = A3; - *(b2 + 1) = A4; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b4 + 0) = A7; - *(b4 + 1) = A8; - } - } else - if (b1 == a2) { - if (b2 != a1) { - if (b2 == a2) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - } else { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b2 + 0) = A1; - *(b2 + 1) = A2; - *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b4 + 0) = A5; - *(b4 + 1) = A6; - } - } - } else { - if (b2 == a1) { - *(a1 + 0) = A3; - *(a1 + 1) = A4; - *(a2 + 0) = B1; - *(a2 + 1) = B2; + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; *(b1 + 0) = A1; *(b1 + 1) = A2; 
- *(a3 + 0) = A7; - *(a3 + 1) = A8; - *(a4 + 0) = B5; - *(a4 + 1) = B6; + *(a3 + 0) = B5; + *(a3 + 1) = B6; *(b3 + 0) = A5; *(b3 + 1) = A6; } else - if (b2 == a2) { + if (b2 == b1) { *(a1 + 0) = B1; *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; *(b1 + 0) = A1; *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; *(a3 + 0) = B5; *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; *(b3 + 0) = A5; *(b3 + 1) = A6; - } else - if (b2 == b1) { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = A1; - *(a2 + 1) = A2; - *(b1 + 0) = A3; - *(b1 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = A5; - *(a4 + 1) = A6; - *(b3 + 0) = A7; - *(b3 + 1) = A8; - } else { - *(a1 + 0) = B1; - *(a1 + 1) = B2; - *(a2 + 0) = B3; - *(a2 + 1) = B4; - *(b1 + 0) = A1; - *(b1 + 1) = A2; - *(b2 + 0) = A3; - *(b2 + 1) = A4; - *(a3 + 0) = B5; - *(a3 + 1) = B6; - *(a4 + 0) = B7; - *(a4 + 1) = B8; - *(b3 + 0) = A5; - *(b3 + 1) = A6; - *(b4 + 0) = A7; - *(b4 + 1) = A8; - } - } + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } - b1 = a + ip1; - b2 = a + ip2; + b1 = a + ip1; + b2 = a + ip2; - b3 = b1 + lda; - b4 = b2 + lda; + b3 = b1 + lda; + b4 = b2 + lda; #ifndef MINUS - a1 += 4; - a3 += 4; + a1 += 4; + a3 += 4; #else - a1 -= 4; - a3 -= 4; + a1 -= 4; + a3 -= 4; #endif - i --; - } while (i > 0); + i --; } - - i = ((k2 - k1) & 1); + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 
1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + +#ifndef MINUS + a1 += 4; + a3 += 4; +#else + a1 -= 4; + a3 -= 4; +#endif + + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + + b1 = a + ip1; + b3 = b1 + lda; + A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a3 + 0); @@ -629,10 +1053,12 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, b1 = a + ip1; b2 = a + ip2; - i = ((k2 - k1) >> 
1); - - if (i > 0) { - do { + i = (rows >> 1); + i--; + + //Loop pipeline + //Main Loop + while (i > 0) { A1 = *(a1 + 0); A2 = *(a1 + 1); A3 = *(a2 + 0); @@ -720,12 +1146,94 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, a1 -= 4; #endif i --; - } while (i > 0); } + //Loop Ending + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif - i = ((k2 - k1) & 1); + //Remain + i = (rows & 1); if (i > 0) { + ip1 = *piv * 2; + b1 = a + ip1; + A1 = *(a1 + 0); A2 = *(a1 + 1); B1 = *(b1 + 0); diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile index 105ec4027..2e9db4052 100644 --- a/lapack/laswp/x86/Makefile +++ b/lapack/laswp/x86/Makefile @@ -17,11 +17,11 @@ ZLASWP = ../generic/zlaswp_k_1.c endif ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = 
../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile index ba07dcf4f..17fb1f961 100644 --- a/lapack/laswp/x86_64/Makefile +++ b/lapack/laswp/x86_64/Makefile @@ -22,11 +22,11 @@ ZLASWP = ../generic/zlaswp_k_1.c endif ifndef LASWP -LASWP = ../generic/laswp_k_1.c +LASWP = ../generic/laswp_k.c endif ifndef ZLASWP -ZLASWP = ../generic/zlaswp_k_1.c +ZLASWP = ../generic/zlaswp_k.c endif include ../generic/Makefile diff --git a/patch.for_lapack-3.4.1 b/patch.for_lapack-3.4.1 index 79c74aad2..ff4954b09 100644 --- a/patch.for_lapack-3.4.1 +++ b/patch.for_lapack-3.4.1 @@ -191,7 +191,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ + slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ + slarrv.$(SUFFIX) slartv.$(SUFFIX) \ -+ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slaswp.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ + slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ + sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ + sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ @@ -345,7 +345,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ + clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ + clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ -+ claswp.$(SUFFIX) clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) 
clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ + clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ + cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ + cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ @@ -484,7 +484,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ + dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ + dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ -+ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlaswp.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ + dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ + dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ + dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ @@ -643,7 +643,7 @@ diff -ruN lapack-3.4.1.old/SRC/Makefile lapack-3.4.1/SRC/Makefile + zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ + zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ + zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ -+ zlassq.$(SUFFIX) zlaswp.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ + zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ + zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) 
zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ + zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ From 7bd1834d592e6d9bc0c6cc17d1d7571f63231a93 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 9 Aug 2012 20:36:18 +0800 Subject: [PATCH 083/162] Refs #130 Fixed laswp building bug with DYNAMIC_ARCH=1. --- lapack/laswp/x86/Makefile | 5 +++++ lapack/laswp/x86_64/Makefile | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile index 2e9db4052..434c82a84 100644 --- a/lapack/laswp/x86/Makefile +++ b/lapack/laswp/x86/Makefile @@ -16,6 +16,11 @@ LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP LASWP = ../generic/laswp_k.c endif diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile index 17fb1f961..e6dae5344 100644 --- a/lapack/laswp/x86_64/Makefile +++ b/lapack/laswp/x86_64/Makefile @@ -21,6 +21,11 @@ LASWP = ../generic/laswp_k_1.c ZLASWP = ../generic/zlaswp_k_1.c endif +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + ifndef LASWP LASWP = ../generic/laswp_k.c endif From a92895939e2522b062c49e8b97caee07cb32e311 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 9 Aug 2012 20:37:45 +0800 Subject: [PATCH 084/162] Added the tip for Windows. 
--- quickbuild.win32 | 1 + quickbuild.win64 | 1 + 2 files changed, 2 insertions(+) diff --git a/quickbuild.win32 b/quickbuild.win32 index 29949c192..3d7db1770 100644 --- a/quickbuild.win32 +++ b/quickbuild.win32 @@ -1,3 +1,4 @@ #!/bin/bash +echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio " make BINARY=32 CC=gcc FC=gfortran diff --git a/quickbuild.win64 b/quickbuild.win64 index 88f748a8d..8f0189435 100644 --- a/quickbuild.win64 +++ b/quickbuild.win64 @@ -1,3 +1,4 @@ #!/bin/bash +echo " Please read https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio " make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran From d007cca61d415af7c16dc51f2ce1c013861b24a4 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 10 Aug 2012 11:54:21 +0800 Subject: [PATCH 085/162] Refs #134. Fixed the building bug on IBM Power. --- common_interface.h | 2 +- driver/others/openblas_set_num_threads.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common_interface.h b/common_interface.h index dbe0bb851..cc5771daa 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,7 +45,7 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); -void BLASFUNC(openblas_set_num_threads)(int *); +void openblas_set_num_threads_(int *); FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 27de83ffc..5e24cfcc7 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
extern void openblas_set_num_threads(int num_threads) ; -void NAME(int* num_threads){ +void openblas_set_num_threads_(int* num_threads){ openblas_set_num_threads(*num_threads); } @@ -46,7 +46,7 @@ void NAME(int* num_threads){ void openblas_set_num_threads(int num_threads) { } -void NAME(int* num_threads){ +void openblas_set_num_threads_(int* num_threads){ } #endif From 068861a927f204b4cbb9479d9c13697eeb802821 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 10 Aug 2012 14:36:26 +0800 Subject: [PATCH 086/162] Refs #133. Users can set COMMON_OPT flag to control CFLAGS and FFLAGS. --- Makefile.rule | 15 ++++++--------- Makefile.system | 9 +++++++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 85abf584b..b73f87d68 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -108,19 +108,16 @@ VERSION = 0.2.2 # The installation directory. # PREFIX = /opt/OpenBLAS -# Common Optimization Flag; -O2 is enough. -# DEBUG = 1 - -ifeq ($(DEBUG), 1) -COMMON_OPT += -g -# -DDEBUG -else -COMMON_OPT += -O2 -endif +# Common Optimization Flag; +# The default -O2 is enough. +# COMMON_OPT = -O2 # Profiling flags COMMON_PROF = -pg +# Build Debug version +# DEBUG = 1 + # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index 63d3577a7..b2180f30d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -687,6 +687,15 @@ AWK = awk REVISION = -r$(VERSION) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) +ifeq ($(DEBUG), 1) +COMMON_OPT += -g +endif + +ifndef COMMON_OPT +COMMON_OPT = -O2 +endif + + CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) From a55821a2ec514ca0a20a07e436bcc82d2f9c962e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 11 Aug 2012 21:33:15 +0800 Subject: [PATCH 087/162] Refs #132. Kill the threads when unload the library. 
--- driver/others/blas_server.c | 2 +- driver/others/blas_server_win32.c | 2 +- driver/others/memory.c | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 66067a05c..f16b827d3 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -435,7 +435,7 @@ static int blas_thread_server(void *arg){ blas_memory_free(buffer); - pthread_exit(NULL); + //pthread_exit(NULL); return 0; } diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index c71e7c276..16eed4c97 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -433,7 +433,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ /* Shutdown procedure, but user don't have to call this routine. The */ /* kernel automatically kill threads. */ -int blas_thread_shutdown_(void){ +int BLASFUNC(blas_thread_shutdown)(void){ int i; diff --git a/driver/others/memory.c b/driver/others/memory.c index af9b54eff..ff8cc248b 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1289,6 +1289,7 @@ void DESTRUCTOR gotoblas_quit(void) { moncontrol (1); #endif + blas_shutdown(); } #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) From 54cd65e47f514c9136194a29c3e68c700452bccc Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 13 Aug 2012 15:25:08 +0800 Subject: [PATCH 088/162] Use sandy bridge kernel when DYNAMIC_ARCH=1. 
--- Makefile | 2 +- driver/others/dynamic.c | 37 +++++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index c152488bc..d95373086 100644 --- a/Makefile +++ b/Makefile @@ -231,7 +231,7 @@ ifndef NOFORTRAN -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 0364d0374..aa4b867fd 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -60,6 +60,8 @@ extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_BARCELONA; +extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BOBCAT; #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -122,15 +124,24 @@ static gotoblas_t *get_coretype(void){ if (model == 12) return &gotoblas_ATOM; return NULL; - case 2: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - if (model == 5) return &gotoblas_NEHALEM; + case 2: + //Intel Core (Clarkdale) / Core (Arrandale) + // Pentium (Clarkdale) / Pentium Mobile (Arrandale) + // Xeon (Clarkdale), 32nm + if (model == 5) return &gotoblas_NEHALEM; - //Intel Xeon Processor 5600 (Westmere-EP) - if (model == 12) return &gotoblas_NEHALEM; - return NULL; + //Intel Xeon Processor 5600 (Westmere-EP) + //Xeon Processor E7 (Westmere-EX) + if (model == 12 || model == 15) return &gotoblas_NEHALEM; + + //Intel Core i5-2000 /i7-2000 (Sandy Bridge) + //Intel Core i7-3000 / Xeon E5 + if (model == 10 || model == 13) return &gotoblas_SANDYBRIDGE; + return NULL; + case 3: 
+ //Intel Sandy Bridge 22nm (Ivy Bridge?) + if (model == 10) return &gotoblas_SANDYBRIDGE; + return NULL; } case 0xf: if (model <= 0x2) return &gotoblas_NORTHWOOD; @@ -144,7 +155,9 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else { + } else if (exfamily == 5) { + return &gotoblas_BOBCAT; + } else { return &gotoblas_BARCELONA; } } @@ -178,6 +191,8 @@ static char *corename[] = { "Opteron(SSE3)", "Barcelona", "Nano", + "Sandybridge", + "Bobcat", }; char *gotoblas_corename(void) { @@ -197,7 +212,9 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_OPTERON) return corename[13]; if (gotoblas == &gotoblas_BARCELONA) return corename[14]; if (gotoblas == &gotoblas_NANO) return corename[15]; - + if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; + if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + return corename[0]; } From 801383effe1cb702913cbe4a1c4ca17535bbd858 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 14 Aug 2012 18:33:28 +0800 Subject: [PATCH 089/162] Fixed a hang bug when shutdown blas threads server on Windows. Added the feature about dynamic changing the number of threads on Windows. 
--- driver/others/blas_server_win32.c | 48 +++++++++++++++++++++++++------ driver/others/memory.c | 10 ++++++- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 16eed4c97..09b08e890 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,13 +63,7 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; -void goto_set_num_threads(int num) -{ -} -void openblas_set_num_threads(int num) -{ -} static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ @@ -187,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ do { action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); - } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); + } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1)); if (action == WAIT_OBJECT_0 + 1) break; @@ -271,7 +265,9 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else { legacy_exec(routine, queue -> mode, queue -> args, sb); } - } + }else{ + continue; //if queue == NULL + } #ifdef SMP_DEBUG fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); @@ -445,7 +441,7 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); - for(i = 0; i < blas_cpu_number - 1; i++){ + for(i = 0; i < blas_num_threads - 1; i++){ WaitForSingleObject(blas_threads[i], INFINITE); } @@ -456,3 +452,37 @@ int BLASFUNC(blas_thread_shutdown)(void){ return 0; } + +void goto_set_num_threads(int num_threads) +{ + long i; + + if (num_threads < 1) num_threads = blas_cpu_number; + + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + + if (num_threads > blas_num_threads) { + + LOCK_COMMAND(&server_lock); + + //increased_threads = 1; + + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + 
blas_num_threads = num_threads; + + UNLOCK_COMMAND(&server_lock); + } + + blas_cpu_number = num_threads; +} + +void openblas_set_num_threads(int num) +{ + goto_set_num_threads(num); +} \ No newline at end of file diff --git a/driver/others/memory.c b/driver/others/memory.c index ff8cc248b..d897fe7e0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -206,7 +206,15 @@ int get_num_procs(void) { #endif +/* +OpenBLAS uses the numbers of CPU cores in multithreading. +It can be set by openblas_set_num_threads(int num_threads); +*/ int blas_cpu_number = 0; +/* +The numbers of threads in the thread pool. +This value is equal or large than blas_cpu_number. This means some threads are sleep. +*/ int blas_num_threads = 0; int goto_get_num_procs (void) { From fe4ab95cd511577d6629e48536321c1ea306699c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 19 Aug 2012 23:50:54 +0800 Subject: [PATCH 090/162] Refs #136. Fixed a bug about controlling the number of threads on Windows. 
--- driver/others/blas_server_win32.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 09b08e890..9cbd7e219 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -466,7 +466,17 @@ void goto_set_num_threads(int num_threads) LOCK_COMMAND(&server_lock); //increased_threads = 1; + if (!blas_server_avail){ + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + blas_server_avail = 1; + } + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, From 3e87648de33e1ad9bdc14c96ae3897edf0816f03 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 20 Aug 2012 16:51:47 +0800 Subject: [PATCH 091/162] Updated the doc for 0.2.3 version. --- Changelog.txt | 13 +++++++++++++ Makefile.rule | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 4e80473d6..3d6151bb6 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,17 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.3 +20-Aug-2012 +common: + * Fixed LAPACK unstable bug about ?laswp. (#130) + * Fixed the shared library bug about unloading the library on + Linux (#132). + * Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2) + Please use gcc and IBM xlf. (#134) +x86/x86-64: + * Supported goto_set_num_threads and openblas_set_num_threads + APIs in Windows. They can set the number of threads on runtime. 
+ ==================================================================== Version 0.2.2 6-July-2012 diff --git a/Makefile.rule b/Makefile.rule index b73f87d68..57094377a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.2 +VERSION = 0.2.3 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From d0e731e8b825e7a554f245aa8f1118dcec9e2728 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 21 Aug 2012 00:31:12 -0400 Subject: [PATCH 092/162] provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make on the command line --- Makefile.system | 9 +++++---- Makefile.tail | 26 +++++++++++++------------- ctest/Makefile | 2 +- driver/others/Makefile | 4 ++-- interface/Makefile | 6 +++--- kernel/Makefile | 2 +- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Makefile.system b/Makefile.system index b2180f30d..e5279d407 100644 --- a/Makefile.system +++ b/Makefile.system @@ -696,11 +696,12 @@ COMMON_OPT = -O2 endif -CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) +override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) +override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) -FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) +override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) +override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) +#MAKEOVERRIDES = ifndef SUFFIX SUFFIX = o diff --git a/Makefile.tail b/Makefile.tail index 64f98ab0c..53dd0caad 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -22,19 +22,19 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX -$(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX -$(QBLASOBJS) $(QBLASOBJS_P) 
: CFLAGS += -DXDOUBLE -UCOMPLEX -$(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX -$(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX -$(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX - -$(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) -$(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX +$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX +$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX +$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX +$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX +$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX + +$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ diff --git a/ctest/Makefile b/ctest/Makefile index 1e07bd154..b1295640f 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -5,7 +5,7 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system -CFLAGS += -DADD$(BU) -DCBLAS +override CFLAGS += -DADD$(BU) -DCBLAS LIB = $(TOPDIR)/$(LIBNAME) diff --git a/driver/others/Makefile b/driver/others/Makefile index 921f47c9c..a1c7a504e 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -215,7 +215,7 @@ info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h $(CC) $(CFLAGS) -c $< -o $(@F) -hpl : CFLAGS += -DHPL -hpl_p : CFLAGS += -DHPL +hpl : override CFLAGS += -DHPL +hpl_p : override CFLAGS += -DHPL include $(TOPDIR)/Makefile.tail diff --git a/interface/Makefile b/interface/Makefile index 5cf11cd9b..93892206f 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -318,7 +318,7 @@ CZBLAS3OBJS = \ ifndef NO_CBLAS -CFLAGS += -I. +override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) @@ -400,7 +400,7 @@ all :: libs ifdef FUNCTION_PROFILE $(BLASOBJS) $(BLASOBJS_P) : functable.h -$(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) +$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) functable.h : Makefile ./create $(FUNCALLFILES) > functable.h @@ -420,7 +420,7 @@ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ -$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/kernel/Makefile b/kernel/Makefile index 41c5e89fd..55edcd287 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,7 +6,7 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system ifdef TARGET_CORE -CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) +override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) BUILD_KERNEL = 1 KDIR = TSUFFIX = _$(TARGET_CORE) From b695680a330a9bea4d9212ac57de355a300c8fc4 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 14 Sep 2012 14:06:14 +0800 Subject: [PATCH 093/162] Fixed #143. Don't generate cblas.h with NO_CBLAS. --- Makefile.install | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.install b/Makefile.install index a74f3d606..7f30d6b7c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -32,8 +32,10 @@ install : lib.grd @cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h @echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h +ifndef NO_CBLAS @echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR) @sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h +endif ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR) From 9419a43a7f59770970f284cd21a452d81209081f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 14 Sep 2012 15:15:08 +0800 Subject: [PATCH 094/162] Fixed #142. Added the gesvd and potrs function families to common_interface.h. 
--- common_interface.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/common_interface.h b/common_interface.h index cc5771daa..bff1a85a1 100644 --- a/common_interface.h +++ b/common_interface.h @@ -642,6 +642,8 @@ int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +/* Lapack routines */ + int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); @@ -677,6 +679,13 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); +int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint 
*, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); @@ -691,6 +700,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); From f76a38484152b1d28817f5c91a859f65bf2f6f73 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 17 Sep 2012 23:24:04 +0800 Subject: [PATCH 095/162] Refs #139. Added NO_AVX flag to use old Nehalem kernels on Sandy Bridge. 
For example, make NO_AVX=1 or make DYNAMIC_ARCH=1 NO_AVX=1 --- Makefile.rule | 4 ++++ Makefile.system | 18 ++++++++++++++++-- cpuid_x86.c | 7 +++++++ driver/others/dynamic.c | 8 +++++++- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 57094377a..37b6c8acc 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -71,6 +71,10 @@ VERSION = 0.2.3 # If you want to disable CPU/Memory affinity on Linux. # NO_AFFINITY = 1 +# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers +# and OS. However, the performance is low. +# NO_AVX = 1 + # If you would like to know minute performance report of GotoBLAS. # FUNCTION_PROFILE = 1 diff --git a/Makefile.system b/Makefile.system index e5279d407..1db4d9d2f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -57,6 +57,10 @@ GEMM_MULTITHREAD_THRESHOLD=50 endif GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) +ifeq ($(NO_AVX), 1) +GETARCH_FLAGS += -DNO_AVX +endif + # This operation is expensive, so execution should be once. 
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 @@ -247,11 +251,17 @@ endif ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +ifneq ($(NO_AVX), 1) +DYNAMIC_CORE += SANDYBRIDGE +endif endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO +ifneq ($(NO_AVX), 1) +DYNAMIC_CORE += SANDYBRIDGE +endif endif ifndef DYNAMIC_CORE @@ -562,6 +572,10 @@ ifeq ($(NO_LAPACKE), 1) CCOMMON_OPT += -DNO_LAPACKE endif +ifeq ($(NO_AVX), 1) +CCOMMON_OPT += -DNO_AVX +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/cpuid_x86.c b/cpuid_x86.c index b304cdade..79fd20e3f 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -40,6 +40,11 @@ #include #include "cpuid.h" +#ifdef NO_AVX +#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM +#define CORE_SANDYBRIDGE CORE_NEHALEM +#endif + #ifndef CPUIDEMU #if defined(__APPLE__) && defined(__i386__) @@ -189,7 +194,9 @@ int get_cputype(int gettype){ if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; +#ifndef NO_AVX if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; +#endif if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index aa4b867fd..45783c517 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -60,8 +60,14 @@ extern gotoblas_t gotoblas_NEHALEM; extern gotoblas_t gotoblas_OPTERON; extern gotoblas_t gotoblas_OPTERON_SSE3; extern gotoblas_t gotoblas_BARCELONA; -extern gotoblas_t gotoblas_SANDYBRIDGE; extern 
gotoblas_t gotoblas_BOBCAT; +#ifndef NO_AVX +extern gotoblas_t gotoblas_SANDYBRIDGE; +#else +//Use NEHALEM kernels for sandy bridge +#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#endif + #define VENDOR_INTEL 1 #define VENDOR_AMD 2 From 735ca38b8ffaa42be33e397761c57827aeaad5b9 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 18 Sep 2012 15:46:20 +0800 Subject: [PATCH 096/162] Refs #139. Check OS supporting AVX on runtime. --- cpuid_x86.c | 51 +++++++++++++++++++++++++++++++++++------ driver/others/dynamic.c | 39 ++++++++++++++++++++++++++++--- 2 files changed, 80 insertions(+), 10 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 79fd20e3f..ebbbe3fff 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -114,6 +114,25 @@ static inline int have_excpuid(void){ return eax & 0xffff; } +static inline void xgetbv(int op, int * eax, int * edx){ + __asm__ __volatile__ + ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} + +int support_avx(){ + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +} + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -195,7 +214,7 @@ int get_cputype(int gettype){ if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; #ifndef NO_AVX - if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; + if (support_avx()) feature |= HAVE_AVX; #endif if (have_excpuid() >= 0x01) { @@ -991,13 +1010,19 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_SANDYBRIDGE; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; //OS doesn't support AVX case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CPUTYPE_SANDYBRIDGE; + if(support_avx()) + 
return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1006,7 +1031,10 @@ int get_cpuname(void){ case 3: switch (model) { case 10: - return CPUTYPE_SANDYBRIDGE; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -1350,13 +1378,19 @@ int get_coretype(void){ return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CORE_SANDYBRIDGE; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; //OS doesn't support AVX case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CORE_SANDYBRIDGE; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; //OS doesn't support AVX case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; @@ -1365,7 +1399,10 @@ int get_coretype(void){ case 3: switch (model) { case 10: - return CORE_SANDYBRIDGE; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; //OS doesn't support AVX } break; } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 45783c517..468ab0dc8 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -76,6 +76,25 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) +static inline void xgetbv(int op, int * eax, int * edx){ + __asm__ __volatile__ + ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); +} + +int support_avx(){ + int eax, ebx, ecx, edx; + int ret=0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0){ + xgetbv(0, &eax, &edx); + if((eax & 6) == 6){ + ret=1; //OS support AVX + } + } + return ret; +} + static int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -142,11 +161,25 @@ static gotoblas_t *get_coretype(void){ //Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 - if (model 
== 10 || model == 13) return &gotoblas_SANDYBRIDGE; + if (model == 10 || model == 13) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; case 3: //Intel Sandy Bridge 22nm (Ivy Bridge?) - if (model == 10) return &gotoblas_SANDYBRIDGE; + if (model == 10) { + if(support_avx()) + return &gotoblas_SANDYBRIDGE; + else{ + fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: @@ -239,7 +272,7 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { gotoblas -> init(); } else { - fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); + fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); exit(1); } From 758e34efbb8582994216a6779cd27358d26a3c4b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 21 Sep 2012 10:14:07 +0000 Subject: [PATCH 097/162] Fixed the detection bug on Loongson 3A server. --- Makefile.system | 4 ++++ cpuid_mips.c | 22 +++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 1db4d9d2f..4c498ba1a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -61,6 +61,10 @@ ifeq ($(NO_AVX), 1) GETARCH_FLAGS += -DNO_AVX endif +ifeq ($(DEBUG), 1) +GETARCH_FLAGS += -g +endif + # This operation is expensive, so execution should be once. 
ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 diff --git a/cpuid_mips.c b/cpuid_mips.c index 217492dd7..45171da5e 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -101,12 +101,14 @@ int detect(void){ fclose(infile); + if(p != NULL){ if (strstr(p, "Loongson-3A")){ return CPU_LOONGSON3A; }else if(strstr(p, "Loongson-3B")){ return CPU_LOONGSON3B; }else if (strstr(p, "Loongson-3")){ infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("system type", buffer, 11)){ p = strchr(buffer, ':') + 2; @@ -119,6 +121,24 @@ int detect(void){ }else{ return CPU_SICORTEX; } + } + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + p = (char *)NULL; + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A")){ + return CPU_LOONGSON3A; + }else if(strstr(p, "Loongson-3B")){ + return CPU_LOONGSON3B; + } + } #endif return CPU_UNKNOWN; } From 1d72b8bf1be826652a6161ec608964f892a3e0fa Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 21 Sep 2012 11:49:07 +0000 Subject: [PATCH 098/162] Fixed generating shared library bug on MIPS. --- f_check | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/f_check b/f_check index 8e3855b10..83587e609 100644 --- a/f_check +++ b/f_check @@ -211,6 +211,10 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For gfortran MIPS + if ($?) 
{ + $link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } @@ -219,6 +223,10 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For gfortran MIPS + if ($?) { + $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } From 2573311308a43d0b1058c4be23173def5a9d4571 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 24 Sep 2012 20:34:33 +0800 Subject: [PATCH 099/162] refs #140. Fixed zdot incompatibility ABI issue with GCC 4.7 on Win 32. GCC 4.7 uses MSVC ABI on Win 32. This means the caller pops the hidden pointer for returning aggregate structures larger than 8 bytes. --- Makefile.system | 20 ++++++++++++++++++++ kernel/x86/zdot_sse2.S | 12 +++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 4c498ba1a..5ff174300 100644 --- a/Makefile.system +++ b/Makefile.system @@ -149,6 +149,26 @@ EXTRALIB += -defaultlib:advapi32 SUFFIX = obj PSUFFIX = pobj LIBSUFFIX = lib +ifeq ($(C_COMPILER), GCC) +#Test for supporting MS_ABI +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(GCCVERSIONGT4), 1) +# GCC Major version > 4 +# It is compatible with MSVC ABI. +CCOMMON_OPT += -DMS_ABI +endif + +ifeq ($(GCCVERSIONGTEQ4), 1) +ifeq ($(GCCMINORVERSIONGTEQ7), 1) +# GCC Version >=4.7 +# It is compatible with MSVC ABI. 
+CCOMMON_OPT += -DMS_ABI +endif +endif + +endif endif ifeq ($(OSNAME), Interix) diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S index efebe637b..61e1bfc27 100644 --- a/kernel/x86/zdot_sse2.S +++ b/kernel/x86/zdot_sse2.S @@ -1541,6 +1541,16 @@ popl %ebx popl %esi popl %edi -/*remove the hidden return value address from the stack.*/ +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) +#ifdef MS_ABI +/* For MingW GCC >= 4.7. It is compatible with MSVC ABI. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */ + ret +#else +/* remove the hidden return value address from the stack. For MingW GCC < 4.7 */ ret $0x4 +#endif +#else +/*remove the hidden return value address from the stack on Linux.*/ + ret $0x4 +#endif EPILOGUE From 08c177ca36a0c17cbce0b9308c10c0cf4ad33e11 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 29 Sep 2012 23:14:39 +0800 Subject: [PATCH 100/162] Refs #145. Update LAPACK to 3.4.2 version. --- Makefile | 18 +- Makefile.system | 2 +- patch.for_lapack-3.4.2 | 932 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 942 insertions(+), 10 deletions(-) create mode 100644 patch.for_lapack-3.4.2 diff --git a/Makefile b/Makefile index d95373086..9a2a8b765 100644 --- a/Makefile +++ b/Makefile @@ -201,7 +201,7 @@ ifeq ($(NO_LAPACK), 1) netlib : else -netlib : lapack-3.4.1 patch.for_lapack-3.4.1 $(NETLIB_LAPACK_DIR)/make.inc +netlib : lapack-3.4.2 patch.for_lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc ifndef NOFORTRAN -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib endif @@ -210,7 +210,7 @@ ifndef NO_LAPACKE endif endif -prof_lapack : lapack-3.4.1 $(NETLIB_LAPACK_DIR)/make.inc +prof_lapack : lapack-3.4.2 $(NETLIB_LAPACK_DIR)/make.inc -@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof $(NETLIB_LAPACK_DIR)/make.inc : @@ -235,24 +235,24 @@ ifndef NOFORTRAN -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif -lapack-3.4.1 : lapack-3.4.1.tgz +lapack-3.4.2 : lapack-3.4.2.tgz ifndef NOFORTRAN ifndef NO_LAPACK - @if test `$(MD5SUM) 
lapack-3.4.1.tgz | $(AWK) '{print $$1}'` = 44c3869c38c8335c2b9c2a8bb276eb55; then \ + @if test `$(MD5SUM) lapack-3.4.2.tgz | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \ echo $(TAR) zxf $< ;\ - $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.1) ;\ + $(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\ rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\ else \ rm -rf $(NETLIB_LAPACK_DIR) ;\ - echo " Cannot download lapack-3.4.1.tgz or the MD5 check sum is wrong (Please use orignal)."; \ + echo " Cannot download lapack-3.4.2.tgz or the MD5 check sum is wrong (Please use orignal)."; \ exit 1; \ fi endif endif -LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz +LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz -lapack-3.4.1.tgz : +lapack-3.4.2.tgz : ifndef NOFORTRAN #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) @@ -276,7 +276,7 @@ ifndef NOFORTRAN -wget http://www.netlib.org/lapack/timing/timing.tgz endif -lapack-timing : lapack-3.4.1 large.tgz timing.tgz +lapack-timing : lapack-3.4.2 large.tgz timing.tgz ifndef NOFORTRAN (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) diff --git a/Makefile.system b/Makefile.system index 5ff174300..5aaf84609 100644 --- a/Makefile.system +++ b/Makefile.system @@ -10,7 +10,7 @@ TOPDIR = . 
endif ifndef NETLIB_LAPACK_DIR -NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1 +NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.2 endif # Default C compiler diff --git a/patch.for_lapack-3.4.2 b/patch.for_lapack-3.4.2 new file mode 100644 index 000000000..f6c85c74b --- /dev/null +++ b/patch.for_lapack-3.4.2 @@ -0,0 +1,932 @@ +diff -ruN lapack-3.4.2.old/INSTALL/Makefile lapack-3.4.2/INSTALL/Makefile +--- lapack-3.4.2.old/INSTALL/Makefile 2011-10-01 04:37:03 +0200 ++++ lapack-3.4.2/INSTALL/Makefile 2012-04-22 21:48:48 +0200 +@@ -27,7 +27,7 @@ + $(LOADER) $(LOADOPTS) -o testversion ilaver.o LAPACK_version.o + + clean: +- rm -f *.o ++ rm -f *.o test* + .f.o: + $(FORTRAN) $(OPTS) -c $< -o $@ + +diff -ruN lapack-3.4.2.old/Makefile lapack-3.4.2/Makefile +--- lapack-3.4.2.old/Makefile 2012-04-13 20:13:07 +0200 ++++ lapack-3.4.2/Makefile 2012-04-22 21:48:07 +0200 +@@ -20,9 +20,12 @@ + blaslib: + ( cd BLAS/SRC; $(MAKE) ) + +-lapacklib: lapack_install ++lapacklib: + ( cd SRC; $(MAKE) ) + ++lapack_prof: ++ ( cd SRC; $(MAKE) lapack_prof) ++ + lapackelib: lapacklib + ( cd lapacke; $(MAKE) ) + +diff -ruN lapack-3.4.2.old/SRC/Makefile lapack-3.4.2/SRC/Makefile +--- lapack-3.4.2.old/SRC/Makefile 2012-04-02 21:06:36 +0200 ++++ lapack-3.4.2/SRC/Makefile 2012-04-22 21:40:21 +0200 +@@ -54,363 +54,371 @@ + # + ####################################################################### + +-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o xerbla_array.o iparmq.o \ +- ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \ +- ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o ++ALLAUX = ilaenv.$(SUFFIX) ieeeck.$(SUFFIX) lsamen.$(SUFFIX) xerbla_array.$(SUFFIX) iparmq.$(SUFFIX) \ ++ ilaprec.$(SUFFIX) ilatrans.$(SUFFIX) ilauplo.$(SUFFIX) iladiag.$(SUFFIX) chla_transtype.$(SUFFIX) \ ++ ../INSTALL/ilaver.$(SUFFIX) + + SCLAUX = \ +- sbdsdc.o \ +- sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ +- slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ +- slaed7.o slaed8.o 
slaed9.o slaeda.o slaev2.o slagtf.o \ +- slagts.o slamrg.o slanst.o \ +- slapy2.o slapy3.o slarnv.o \ +- slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ +- slarrk.o slarrr.o slaneg.o \ +- slartg.o slaruv.o slas2.o slascl.o \ +- slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ +- slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ +- slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ +- slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ +- ssteqr.o ssterf.o slaisnan.o sisnan.o \ +- slartgp.o slartgs.o \ +- ../INSTALL/second_$(TIMER).o ++ sbdsdc.$(SUFFIX) \ ++ sbdsqr.$(SUFFIX) sdisna.$(SUFFIX) slabad.$(SUFFIX) slacpy.$(SUFFIX) sladiv.$(SUFFIX) slae2.$(SUFFIX) slaebz.$(SUFFIX) \ ++ slaed0.$(SUFFIX) slaed1.$(SUFFIX) slaed2.$(SUFFIX) slaed3.$(SUFFIX) slaed4.$(SUFFIX) slaed5.$(SUFFIX) slaed6.$(SUFFIX) \ ++ slaed7.$(SUFFIX) slaed8.$(SUFFIX) slaed9.$(SUFFIX) slaeda.$(SUFFIX) slaev2.$(SUFFIX) slagtf.$(SUFFIX) \ ++ slagts.$(SUFFIX) slamrg.$(SUFFIX) slanst.$(SUFFIX) \ ++ slapy2.$(SUFFIX) slapy3.$(SUFFIX) slarnv.$(SUFFIX) \ ++ slarra.$(SUFFIX) slarrb.$(SUFFIX) slarrc.$(SUFFIX) slarrd.$(SUFFIX) slarre.$(SUFFIX) slarrf.$(SUFFIX) slarrj.$(SUFFIX) \ ++ slarrk.$(SUFFIX) slarrr.$(SUFFIX) slaneg.$(SUFFIX) \ ++ slartg.$(SUFFIX) slaruv.$(SUFFIX) slas2.$(SUFFIX) slascl.$(SUFFIX) \ ++ slasd0.$(SUFFIX) slasd1.$(SUFFIX) slasd2.$(SUFFIX) slasd3.$(SUFFIX) slasd4.$(SUFFIX) slasd5.$(SUFFIX) slasd6.$(SUFFIX) \ ++ slasd7.$(SUFFIX) slasd8.$(SUFFIX) slasda.$(SUFFIX) slasdq.$(SUFFIX) slasdt.$(SUFFIX) \ ++ slaset.$(SUFFIX) slasq1.$(SUFFIX) slasq2.$(SUFFIX) slasq3.$(SUFFIX) slasq4.$(SUFFIX) slasq5.$(SUFFIX) slasq6.$(SUFFIX) \ ++ slasr.$(SUFFIX) slasrt.$(SUFFIX) slassq.$(SUFFIX) slasv2.$(SUFFIX) spttrf.$(SUFFIX) sstebz.$(SUFFIX) sstedc.$(SUFFIX) \ ++ ssteqr.$(SUFFIX) ssterf.$(SUFFIX) slaisnan.$(SUFFIX) sisnan.$(SUFFIX) \ ++ slartgp.$(SUFFIX) slartgs.$(SUFFIX) \ ++ ../INSTALL/second_$(TIMER).$(SUFFIX) + + DZLAUX = \ +- dbdsdc.o \ +- dbdsqr.o 
ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ +- dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ +- dlaed7.o dlaed8.o dlaed9.o dlaeda.o dlaev2.o dlagtf.o \ +- dlagts.o dlamrg.o dlanst.o \ +- dlapy2.o dlapy3.o dlarnv.o \ +- dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ +- dlarrk.o dlarrr.o dlaneg.o \ +- dlartg.o dlaruv.o dlas2.o dlascl.o \ +- dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o dlasd6.o \ +- dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ +- dlaset.o dlasq1.o dlasq2.o dlasq3.o dlasq4.o dlasq5.o dlasq6.o \ +- dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ +- dsteqr.o dsterf.o dlaisnan.o disnan.o \ +- dlartgp.o dlartgs.o \ +- ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ++ dbdsdc.$(SUFFIX) \ ++ dbdsqr.$(SUFFIX) ddisna.$(SUFFIX) dlabad.$(SUFFIX) dlacpy.$(SUFFIX) dladiv.$(SUFFIX) dlae2.$(SUFFIX) dlaebz.$(SUFFIX) \ ++ dlaed0.$(SUFFIX) dlaed1.$(SUFFIX) dlaed2.$(SUFFIX) dlaed3.$(SUFFIX) dlaed4.$(SUFFIX) dlaed5.$(SUFFIX) dlaed6.$(SUFFIX) \ ++ dlaed7.$(SUFFIX) dlaed8.$(SUFFIX) dlaed9.$(SUFFIX) dlaeda.$(SUFFIX) dlaev2.$(SUFFIX) dlagtf.$(SUFFIX) \ ++ dlagts.$(SUFFIX) dlamrg.$(SUFFIX) dlanst.$(SUFFIX) \ ++ dlapy2.$(SUFFIX) dlapy3.$(SUFFIX) dlarnv.$(SUFFIX) \ ++ dlarra.$(SUFFIX) dlarrb.$(SUFFIX) dlarrc.$(SUFFIX) dlarrd.$(SUFFIX) dlarre.$(SUFFIX) dlarrf.$(SUFFIX) dlarrj.$(SUFFIX) \ ++ dlarrk.$(SUFFIX) dlarrr.$(SUFFIX) dlaneg.$(SUFFIX) \ ++ dlartg.$(SUFFIX) dlaruv.$(SUFFIX) dlas2.$(SUFFIX) dlascl.$(SUFFIX) \ ++ dlasd0.$(SUFFIX) dlasd1.$(SUFFIX) dlasd2.$(SUFFIX) dlasd3.$(SUFFIX) dlasd4.$(SUFFIX) dlasd5.$(SUFFIX) dlasd6.$(SUFFIX) \ ++ dlasd7.$(SUFFIX) dlasd8.$(SUFFIX) dlasda.$(SUFFIX) dlasdq.$(SUFFIX) dlasdt.$(SUFFIX) \ ++ dlaset.$(SUFFIX) dlasq1.$(SUFFIX) dlasq2.$(SUFFIX) dlasq3.$(SUFFIX) dlasq4.$(SUFFIX) dlasq5.$(SUFFIX) dlasq6.$(SUFFIX) \ ++ dlasr.$(SUFFIX) dlasrt.$(SUFFIX) dlassq.$(SUFFIX) dlasv2.$(SUFFIX) dpttrf.$(SUFFIX) dstebz.$(SUFFIX) dstedc.$(SUFFIX) \ ++ dsteqr.$(SUFFIX) dsterf.$(SUFFIX) 
dlaisnan.$(SUFFIX) disnan.$(SUFFIX) \ ++ dlartgp.$(SUFFIX) dlartgs.$(SUFFIX) \ ++ ../INSTALL/dsecnd_$(TIMER).$(SUFFIX) + + SLASRC = \ +- sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ +- sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ +- sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ +- sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ +- sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ +- sgeqp3.o sgeqpf.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \ +- sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o \ +- sgetc2.o sgetf2.o sgetri.o \ +- sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ +- sggglm.o sgghrd.o sgglse.o sggqrf.o \ +- sggrqf.o sggsvd.o sggsvp.o sgtcon.o sgtrfs.o sgtsv.o \ +- sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \ +- shsein.o shseqr.o slabrd.o slacon.o slacn2.o \ +- slaein.o slaexc.o slag2.o slags2.o slagtm.o slagv2.o slahqr.o \ +- slahrd.o slahr2.o slaic1.o slaln2.o slals0.o slalsa.o slalsd.o \ +- slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ +- slansy.o slantb.o slantp.o slantr.o slanv2.o \ +- slapll.o slapmt.o \ +- slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ +- slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ +- slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ +- slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slargv.o \ +- slarrv.o slartv.o \ +- slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o \ +- slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ +- slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ +- sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ +- sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ +- sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ +- sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ +- spbstf.o spbsv.o spbsvx.o \ +- spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ +- sposvx.o spotf2.o spotri.o spstrf.o spstf2.o \ +- sppcon.o sppequ.o 
\ +- spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o spptrs.o sptcon.o \ +- spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ +- ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ +- ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ +- sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ +- ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ +- sstevx.o \ +- ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ +- ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ +- ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytri2.o ssytri2x.o \ +- ssyswapr.o ssytrs.o ssytrs2.o ssyconv.o \ +- stbcon.o \ +- stbrfs.o stbtrs.o stgevc.o stgex2.o stgexc.o stgsen.o \ +- stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ +- stptrs.o \ +- strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ +- strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o \ +- slansf.o spftrf.o spftri.o spftrs.o ssfrk.o stfsm.o stftri.o stfttp.o \ +- stfttr.o stpttf.o stpttr.o strttf.o strttp.o \ +- sgejsv.o sgesvj.o sgsvj0.o sgsvj1.o \ +- sgeequb.o ssyequb.o spoequb.o sgbequb.o \ +- sbbcsd.o slapmr.o sorbdb.o sorcsd.o \ +- sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ +- stpqrt.o stpqrt2.o stpmqrt.o stprfb.o ++ sgbbrd.$(SUFFIX) sgbcon.$(SUFFIX) sgbequ.$(SUFFIX) sgbrfs.$(SUFFIX) sgbsv.$(SUFFIX) \ ++ sgbsvx.$(SUFFIX) sgbtf2.$(SUFFIX) sgbtrf.$(SUFFIX) sgbtrs.$(SUFFIX) sgebak.$(SUFFIX) sgebal.$(SUFFIX) sgebd2.$(SUFFIX) \ ++ sgebrd.$(SUFFIX) sgecon.$(SUFFIX) sgeequ.$(SUFFIX) sgees.$(SUFFIX) sgeesx.$(SUFFIX) sgeev.$(SUFFIX) sgeevx.$(SUFFIX) \ ++ sgegs.$(SUFFIX) sgegv.$(SUFFIX) sgehd2.$(SUFFIX) sgehrd.$(SUFFIX) sgelq2.$(SUFFIX) sgelqf.$(SUFFIX) \ ++ sgels.$(SUFFIX) sgelsd.$(SUFFIX) sgelss.$(SUFFIX) sgelsx.$(SUFFIX) sgelsy.$(SUFFIX) sgeql2.$(SUFFIX) sgeqlf.$(SUFFIX) \ ++ sgeqp3.$(SUFFIX) sgeqpf.$(SUFFIX) sgeqr2.$(SUFFIX) sgeqr2p.$(SUFFIX) sgeqrf.$(SUFFIX) sgeqrfp.$(SUFFIX) sgerfs.$(SUFFIX) \ ++ sgerq2.$(SUFFIX) sgerqf.$(SUFFIX) 
sgesc2.$(SUFFIX) sgesdd.$(SUFFIX) sgesv.$(SUFFIX) sgesvd.$(SUFFIX) sgesvx.$(SUFFIX) \ ++ sgetc2.$(SUFFIX) sgetri.$(SUFFIX) \ ++ sggbak.$(SUFFIX) sggbal.$(SUFFIX) sgges.$(SUFFIX) sggesx.$(SUFFIX) sggev.$(SUFFIX) sggevx.$(SUFFIX) \ ++ sggglm.$(SUFFIX) sgghrd.$(SUFFIX) sgglse.$(SUFFIX) sggqrf.$(SUFFIX) \ ++ sggrqf.$(SUFFIX) sggsvd.$(SUFFIX) sggsvp.$(SUFFIX) sgtcon.$(SUFFIX) sgtrfs.$(SUFFIX) sgtsv.$(SUFFIX) \ ++ sgtsvx.$(SUFFIX) sgttrf.$(SUFFIX) sgttrs.$(SUFFIX) sgtts2.$(SUFFIX) shgeqz.$(SUFFIX) \ ++ shsein.$(SUFFIX) shseqr.$(SUFFIX) slabrd.$(SUFFIX) slacon.$(SUFFIX) slacn2.$(SUFFIX) \ ++ slaein.$(SUFFIX) slaexc.$(SUFFIX) slag2.$(SUFFIX) slags2.$(SUFFIX) slagtm.$(SUFFIX) slagv2.$(SUFFIX) slahqr.$(SUFFIX) \ ++ slahrd.$(SUFFIX) slahr2.$(SUFFIX) slaic1.$(SUFFIX) slaln2.$(SUFFIX) slals0.$(SUFFIX) slalsa.$(SUFFIX) slalsd.$(SUFFIX) \ ++ slangb.$(SUFFIX) slange.$(SUFFIX) slangt.$(SUFFIX) slanhs.$(SUFFIX) slansb.$(SUFFIX) slansp.$(SUFFIX) \ ++ slansy.$(SUFFIX) slantb.$(SUFFIX) slantp.$(SUFFIX) slantr.$(SUFFIX) slanv2.$(SUFFIX) \ ++ slapll.$(SUFFIX) slapmt.$(SUFFIX) \ ++ slaqgb.$(SUFFIX) slaqge.$(SUFFIX) slaqp2.$(SUFFIX) slaqps.$(SUFFIX) slaqsb.$(SUFFIX) slaqsp.$(SUFFIX) slaqsy.$(SUFFIX) \ ++ slaqr0.$(SUFFIX) slaqr1.$(SUFFIX) slaqr2.$(SUFFIX) slaqr3.$(SUFFIX) slaqr4.$(SUFFIX) slaqr5.$(SUFFIX) \ ++ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) ilaslr.$(SUFFIX) ilaslc.$(SUFFIX) \ ++ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarfgp.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ ++ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ ++ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ ++ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ ++ sorgrq.$(SUFFIX) 
sorgtr.$(SUFFIX) sorm2l.$(SUFFIX) sorm2r.$(SUFFIX) \ ++ sormbr.$(SUFFIX) sormhr.$(SUFFIX) sorml2.$(SUFFIX) sormlq.$(SUFFIX) sormql.$(SUFFIX) sormqr.$(SUFFIX) sormr2.$(SUFFIX) \ ++ sormr3.$(SUFFIX) sormrq.$(SUFFIX) sormrz.$(SUFFIX) sormtr.$(SUFFIX) spbcon.$(SUFFIX) spbequ.$(SUFFIX) spbrfs.$(SUFFIX) \ ++ spbstf.$(SUFFIX) spbsv.$(SUFFIX) spbsvx.$(SUFFIX) \ ++ spbtf2.$(SUFFIX) spbtrf.$(SUFFIX) spbtrs.$(SUFFIX) spocon.$(SUFFIX) spoequ.$(SUFFIX) sporfs.$(SUFFIX) sposv.$(SUFFIX) \ ++ sposvx.$(SUFFIX) spotri.$(SUFFIX) spstrf.$(SUFFIX) spstf2.$(SUFFIX) \ ++ sppcon.$(SUFFIX) sppequ.$(SUFFIX) \ ++ spprfs.$(SUFFIX) sppsv.$(SUFFIX) sppsvx.$(SUFFIX) spptrf.$(SUFFIX) spptri.$(SUFFIX) spptrs.$(SUFFIX) sptcon.$(SUFFIX) \ ++ spteqr.$(SUFFIX) sptrfs.$(SUFFIX) sptsv.$(SUFFIX) sptsvx.$(SUFFIX) spttrs.$(SUFFIX) sptts2.$(SUFFIX) srscl.$(SUFFIX) \ ++ ssbev.$(SUFFIX) ssbevd.$(SUFFIX) ssbevx.$(SUFFIX) ssbgst.$(SUFFIX) ssbgv.$(SUFFIX) ssbgvd.$(SUFFIX) ssbgvx.$(SUFFIX) \ ++ ssbtrd.$(SUFFIX) sspcon.$(SUFFIX) sspev.$(SUFFIX) sspevd.$(SUFFIX) sspevx.$(SUFFIX) sspgst.$(SUFFIX) \ ++ sspgv.$(SUFFIX) sspgvd.$(SUFFIX) sspgvx.$(SUFFIX) ssprfs.$(SUFFIX) sspsv.$(SUFFIX) sspsvx.$(SUFFIX) ssptrd.$(SUFFIX) \ ++ ssptrf.$(SUFFIX) ssptri.$(SUFFIX) ssptrs.$(SUFFIX) sstegr.$(SUFFIX) sstein.$(SUFFIX) sstev.$(SUFFIX) sstevd.$(SUFFIX) sstevr.$(SUFFIX) \ ++ sstevx.$(SUFFIX) \ ++ ssycon.$(SUFFIX) ssyev.$(SUFFIX) ssyevd.$(SUFFIX) ssyevr.$(SUFFIX) ssyevx.$(SUFFIX) ssygs2.$(SUFFIX) \ ++ ssygst.$(SUFFIX) ssygv.$(SUFFIX) ssygvd.$(SUFFIX) ssygvx.$(SUFFIX) ssyrfs.$(SUFFIX) ssysv.$(SUFFIX) ssysvx.$(SUFFIX) \ ++ ssytd2.$(SUFFIX) ssytf2.$(SUFFIX) ssytrd.$(SUFFIX) ssytrf.$(SUFFIX) ssytri.$(SUFFIX) ssytri2.$(SUFFIX) ssytri2x.$(SUFFIX) \ ++ ssyswapr.$(SUFFIX) ssytrs.$(SUFFIX) ssytrs2.$(SUFFIX) ssyconv.$(SUFFIX) \ ++ stbcon.$(SUFFIX) \ ++ stbrfs.$(SUFFIX) stbtrs.$(SUFFIX) stgevc.$(SUFFIX) stgex2.$(SUFFIX) stgexc.$(SUFFIX) stgsen.$(SUFFIX) \ ++ stgsja.$(SUFFIX) stgsna.$(SUFFIX) stgsy2.$(SUFFIX) stgsyl.$(SUFFIX) stpcon.$(SUFFIX) 
stprfs.$(SUFFIX) stptri.$(SUFFIX) \ ++ stptrs.$(SUFFIX) \ ++ strcon.$(SUFFIX) strevc.$(SUFFIX) strexc.$(SUFFIX) strrfs.$(SUFFIX) strsen.$(SUFFIX) strsna.$(SUFFIX) strsyl.$(SUFFIX) \ ++ strtrs.$(SUFFIX) stzrqf.$(SUFFIX) stzrzf.$(SUFFIX) sstemr.$(SUFFIX) \ ++ slansf.$(SUFFIX) spftrf.$(SUFFIX) spftri.$(SUFFIX) spftrs.$(SUFFIX) ssfrk.$(SUFFIX) stfsm.$(SUFFIX) stftri.$(SUFFIX) stfttp.$(SUFFIX) \ ++ stfttr.$(SUFFIX) stpttf.$(SUFFIX) stpttr.$(SUFFIX) strttf.$(SUFFIX) strttp.$(SUFFIX) \ ++ sgejsv.$(SUFFIX) sgesvj.$(SUFFIX) sgsvj0.$(SUFFIX) sgsvj1.$(SUFFIX) \ ++ sgeequb.$(SUFFIX) ssyequb.$(SUFFIX) spoequb.$(SUFFIX) sgbequb.$(SUFFIX) \ ++ sbbcsd.$(SUFFIX) slapmr.$(SUFFIX) sorbdb.$(SUFFIX) sorcsd.$(SUFFIX) \ ++ sgeqrt.$(SUFFIX) sgeqrt2.$(SUFFIX) sgeqrt3.$(SUFFIX) sgemqrt.$(SUFFIX) \ ++ stpqrt.$(SUFFIX) stpqrt2.$(SUFFIX) stpmqrt.$(SUFFIX) stprfb.$(SUFFIX) + +-DSLASRC = spotrs.o sgetrs.o spotrf.o sgetrf.o ++DSLASRC = spotrs.$(SUFFIX) + + ifdef USEXBLAS +-SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ +- sla_gercond.o sla_gerpvgrw.o ssysvxx.o ssyrfsx.o \ +- sla_syrfsx_extended.o sla_syamv.o sla_syrcond.o sla_syrpvgrw.o \ +- sposvxx.o sporfsx.o sla_porfsx_extended.o sla_porcond.o \ +- sla_porpvgrw.o sgbsvxx.o sgbrfsx.o sla_gbrfsx_extended.o \ +- sla_gbamv.o sla_gbrcond.o sla_gbrpvgrw.o sla_lin_berr.o slarscl2.o \ +- slascl2.o sla_wwaddw.o ++SXLASRC = sgesvxx.$(SUFFIX) sgerfsx.$(SUFFIX) sla_gerfsx_extended.$(SUFFIX) sla_geamv.$(SUFFIX) \ ++ sla_gercond.$(SUFFIX) sla_gerpvgrw.$(SUFFIX) ssysvxx.$(SUFFIX) ssyrfsx.$(SUFFIX) \ ++ sla_syrfsx_extended.$(SUFFIX) sla_syamv.$(SUFFIX) sla_syrcond.$(SUFFIX) sla_syrpvgrw.$(SUFFIX) \ ++ sposvxx.$(SUFFIX) sporfsx.$(SUFFIX) sla_porfsx_extended.$(SUFFIX) sla_porcond.$(SUFFIX) \ ++ sla_porpvgrw.$(SUFFIX) sgbsvxx.$(SUFFIX) sgbrfsx.$(SUFFIX) sla_gbrfsx_extended.$(SUFFIX) \ ++ sla_gbamv.$(SUFFIX) sla_gbrcond.$(SUFFIX) sla_gbrpvgrw.$(SUFFIX) sla_lin_berr.$(SUFFIX) slarscl2.$(SUFFIX) \ ++ slascl2.$(SUFFIX) sla_wwaddw.$(SUFFIX) + endif 
+ + CLASRC = \ +- cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ +- cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ +- cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ +- cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ +- cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ +- cgeqpf.o cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \ +- cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o cgesvd.o \ +- cgesvx.o cgetc2.o cgetf2.o cgetri.o \ +- cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ +- cgghrd.o cgglse.o cggqrf.o cggrqf.o \ +- cggsvd.o cggsvp.o \ +- cgtcon.o cgtrfs.o cgtsv.o cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \ +- chbevd.o chbevx.o chbgst.o chbgv.o chbgvd.o chbgvx.o chbtrd.o \ +- checon.o cheev.o cheevd.o cheevr.o cheevx.o chegs2.o chegst.o \ +- chegv.o chegvd.o chegvx.o cherfs.o chesv.o chesvx.o chetd2.o \ +- chetf2.o chetrd.o \ +- chetrf.o chetri.o chetri2.o chetri2x.o cheswapr.o \ +- chetrs.o chetrs2.o chgeqz.o chpcon.o chpev.o chpevd.o \ +- chpevx.o chpgst.o chpgv.o chpgvd.o chpgvx.o chprfs.o chpsv.o \ +- chpsvx.o \ +- chptrd.o chptrf.o chptri.o chptrs.o chsein.o chseqr.o clabrd.o \ +- clacgv.o clacon.o clacn2.o clacp2.o clacpy.o clacrm.o clacrt.o cladiv.o \ +- claed0.o claed7.o claed8.o \ +- claein.o claesy.o claev2.o clags2.o clagtm.o \ +- clahef.o clahqr.o \ +- clahrd.o clahr2.o claic1.o clals0.o clalsa.o clalsd.o clangb.o clange.o clangt.o \ +- clanhb.o clanhe.o \ +- clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ +- clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ +- claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ +- claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ +- claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ +- clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ +- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ +- clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ +- claswp.o clasyf.o clatbs.o 
clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ +- clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ +- cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ +- cposv.o cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \ +- cppcon.o cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ +- cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ +- crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ +- cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ +- cstegr.o cstein.o csteqr.o \ +- csycon.o csymv.o \ +- csyr.o csyrfs.o csysv.o csysvx.o csytf2.o csytrf.o csytri.o csytri2.o csytri2x.o \ +- csyswapr.o csytrs.o csytrs2.o csyconv.o \ +- ctbcon.o ctbrfs.o ctbtrs.o ctgevc.o ctgex2.o \ +- ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ +- ctprfs.o ctptri.o \ +- ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ +- ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ +- cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ +- cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ +- cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ +- cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ +- chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ +- ctfttr.o ctpttf.o ctpttr.o ctrttf.o ctrttp.o \ +- cgeequb.o cgbequb.o csyequb.o cpoequb.o cheequb.o \ +- cbbcsd.o clapmr.o cunbdb.o cuncsd.o \ +- cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ +- ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o ++ cbdsqr.$(SUFFIX) cgbbrd.$(SUFFIX) cgbcon.$(SUFFIX) cgbequ.$(SUFFIX) cgbrfs.$(SUFFIX) cgbsv.$(SUFFIX) cgbsvx.$(SUFFIX) \ ++ cgbtf2.$(SUFFIX) cgbtrf.$(SUFFIX) cgbtrs.$(SUFFIX) cgebak.$(SUFFIX) cgebal.$(SUFFIX) cgebd2.$(SUFFIX) cgebrd.$(SUFFIX) \ ++ cgecon.$(SUFFIX) cgeequ.$(SUFFIX) cgees.$(SUFFIX) cgeesx.$(SUFFIX) cgeev.$(SUFFIX) cgeevx.$(SUFFIX) \ ++ cgegs.$(SUFFIX) cgegv.$(SUFFIX) cgehd2.$(SUFFIX) cgehrd.$(SUFFIX) cgelq2.$(SUFFIX) 
cgelqf.$(SUFFIX) \ ++ cgels.$(SUFFIX) cgelsd.$(SUFFIX) cgelss.$(SUFFIX) cgelsx.$(SUFFIX) cgelsy.$(SUFFIX) cgeql2.$(SUFFIX) cgeqlf.$(SUFFIX) cgeqp3.$(SUFFIX) \ ++ cgeqpf.$(SUFFIX) cgeqr2.$(SUFFIX) cgeqr2p.$(SUFFIX) cgeqrf.$(SUFFIX) cgeqrfp.$(SUFFIX) cgerfs.$(SUFFIX) \ ++ cgerq2.$(SUFFIX) cgerqf.$(SUFFIX) cgesc2.$(SUFFIX) cgesdd.$(SUFFIX) cgesv.$(SUFFIX) cgesvd.$(SUFFIX) \ ++ cgesvx.$(SUFFIX) cgetc2.$(SUFFIX) cgetri.$(SUFFIX) \ ++ cggbak.$(SUFFIX) cggbal.$(SUFFIX) cgges.$(SUFFIX) cggesx.$(SUFFIX) cggev.$(SUFFIX) cggevx.$(SUFFIX) cggglm.$(SUFFIX) \ ++ cgghrd.$(SUFFIX) cgglse.$(SUFFIX) cggqrf.$(SUFFIX) cggrqf.$(SUFFIX) \ ++ cggsvd.$(SUFFIX) cggsvp.$(SUFFIX) \ ++ cgtcon.$(SUFFIX) cgtrfs.$(SUFFIX) cgtsv.$(SUFFIX) cgtsvx.$(SUFFIX) cgttrf.$(SUFFIX) cgttrs.$(SUFFIX) cgtts2.$(SUFFIX) chbev.$(SUFFIX) \ ++ chbevd.$(SUFFIX) chbevx.$(SUFFIX) chbgst.$(SUFFIX) chbgv.$(SUFFIX) chbgvd.$(SUFFIX) chbgvx.$(SUFFIX) chbtrd.$(SUFFIX) \ ++ checon.$(SUFFIX) cheev.$(SUFFIX) cheevd.$(SUFFIX) cheevr.$(SUFFIX) cheevx.$(SUFFIX) chegs2.$(SUFFIX) chegst.$(SUFFIX) \ ++ chegv.$(SUFFIX) chegvd.$(SUFFIX) chegvx.$(SUFFIX) cherfs.$(SUFFIX) chesv.$(SUFFIX) chesvx.$(SUFFIX) chetd2.$(SUFFIX) \ ++ chetf2.$(SUFFIX) chetrd.$(SUFFIX) \ ++ chetrf.$(SUFFIX) chetri.$(SUFFIX) chetri2.$(SUFFIX) chetri2x.$(SUFFIX) cheswapr.$(SUFFIX) \ ++ chetrs.$(SUFFIX) chetrs2.$(SUFFIX) chgeqz.$(SUFFIX) chpcon.$(SUFFIX) chpev.$(SUFFIX) chpevd.$(SUFFIX) \ ++ chpevx.$(SUFFIX) chpgst.$(SUFFIX) chpgv.$(SUFFIX) chpgvd.$(SUFFIX) chpgvx.$(SUFFIX) chprfs.$(SUFFIX) chpsv.$(SUFFIX) \ ++ chpsvx.$(SUFFIX) \ ++ chptrd.$(SUFFIX) chptrf.$(SUFFIX) chptri.$(SUFFIX) chptrs.$(SUFFIX) chsein.$(SUFFIX) chseqr.$(SUFFIX) clabrd.$(SUFFIX) \ ++ clacgv.$(SUFFIX) clacon.$(SUFFIX) clacn2.$(SUFFIX) clacp2.$(SUFFIX) clacpy.$(SUFFIX) clacrm.$(SUFFIX) clacrt.$(SUFFIX) cladiv.$(SUFFIX) \ ++ claed0.$(SUFFIX) claed7.$(SUFFIX) claed8.$(SUFFIX) \ ++ claein.$(SUFFIX) claesy.$(SUFFIX) claev2.$(SUFFIX) clags2.$(SUFFIX) clagtm.$(SUFFIX) \ ++ clahef.$(SUFFIX) 
clahqr.$(SUFFIX) \ ++ clahrd.$(SUFFIX) clahr2.$(SUFFIX) claic1.$(SUFFIX) clals0.$(SUFFIX) clalsa.$(SUFFIX) clalsd.$(SUFFIX) clangb.$(SUFFIX) clange.$(SUFFIX) clangt.$(SUFFIX) \ ++ clanhb.$(SUFFIX) clanhe.$(SUFFIX) \ ++ clanhp.$(SUFFIX) clanhs.$(SUFFIX) clanht.$(SUFFIX) clansb.$(SUFFIX) clansp.$(SUFFIX) clansy.$(SUFFIX) clantb.$(SUFFIX) \ ++ clantp.$(SUFFIX) clantr.$(SUFFIX) clapll.$(SUFFIX) clapmt.$(SUFFIX) clarcm.$(SUFFIX) claqgb.$(SUFFIX) claqge.$(SUFFIX) \ ++ claqhb.$(SUFFIX) claqhe.$(SUFFIX) claqhp.$(SUFFIX) claqp2.$(SUFFIX) claqps.$(SUFFIX) claqsb.$(SUFFIX) \ ++ claqr0.$(SUFFIX) claqr1.$(SUFFIX) claqr2.$(SUFFIX) claqr3.$(SUFFIX) claqr4.$(SUFFIX) claqr5.$(SUFFIX) \ ++ claqsp.$(SUFFIX) claqsy.$(SUFFIX) clar1v.$(SUFFIX) clar2v.$(SUFFIX) ilaclr.$(SUFFIX) ilaclc.$(SUFFIX) \ ++ clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) clarfgp.$(SUFFIX) \ ++ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ ++ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ ++ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ ++ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotri.$(SUFFIX) cpstrf.$(SUFFIX) cpstf2.$(SUFFIX) \ ++ cppcon.$(SUFFIX) cppequ.$(SUFFIX) cpprfs.$(SUFFIX) cppsv.$(SUFFIX) cppsvx.$(SUFFIX) cpptrf.$(SUFFIX) cpptri.$(SUFFIX) cpptrs.$(SUFFIX) \ ++ cptcon.$(SUFFIX) cpteqr.$(SUFFIX) cptrfs.$(SUFFIX) cptsv.$(SUFFIX) cptsvx.$(SUFFIX) cpttrf.$(SUFFIX) cpttrs.$(SUFFIX) cptts2.$(SUFFIX) \ ++ crot.$(SUFFIX) cspcon.$(SUFFIX) cspmv.$(SUFFIX) cspr.$(SUFFIX) csprfs.$(SUFFIX) cspsv.$(SUFFIX) \ ++ cspsvx.$(SUFFIX) csptrf.$(SUFFIX) csptri.$(SUFFIX) csptrs.$(SUFFIX) 
csrscl.$(SUFFIX) cstedc.$(SUFFIX) \ ++ cstegr.$(SUFFIX) cstein.$(SUFFIX) csteqr.$(SUFFIX) \ ++ csycon.$(SUFFIX) csymv.$(SUFFIX) \ ++ csyr.$(SUFFIX) csyrfs.$(SUFFIX) csysv.$(SUFFIX) csysvx.$(SUFFIX) csytf2.$(SUFFIX) csytrf.$(SUFFIX) csytri.$(SUFFIX) csytri2.$(SUFFIX) csytri2x.$(SUFFIX) \ ++ csyswapr.$(SUFFIX) csytrs.$(SUFFIX) csytrs2.$(SUFFIX) csyconv.$(SUFFIX) \ ++ ctbcon.$(SUFFIX) ctbrfs.$(SUFFIX) ctbtrs.$(SUFFIX) ctgevc.$(SUFFIX) ctgex2.$(SUFFIX) \ ++ ctgexc.$(SUFFIX) ctgsen.$(SUFFIX) ctgsja.$(SUFFIX) ctgsna.$(SUFFIX) ctgsy2.$(SUFFIX) ctgsyl.$(SUFFIX) ctpcon.$(SUFFIX) \ ++ ctprfs.$(SUFFIX) ctptri.$(SUFFIX) \ ++ ctptrs.$(SUFFIX) ctrcon.$(SUFFIX) ctrevc.$(SUFFIX) ctrexc.$(SUFFIX) ctrrfs.$(SUFFIX) ctrsen.$(SUFFIX) ctrsna.$(SUFFIX) \ ++ ctrsyl.$(SUFFIX) ctrtrs.$(SUFFIX) ctzrqf.$(SUFFIX) ctzrzf.$(SUFFIX) cung2l.$(SUFFIX) cung2r.$(SUFFIX) \ ++ cungbr.$(SUFFIX) cunghr.$(SUFFIX) cungl2.$(SUFFIX) cunglq.$(SUFFIX) cungql.$(SUFFIX) cungqr.$(SUFFIX) cungr2.$(SUFFIX) \ ++ cungrq.$(SUFFIX) cungtr.$(SUFFIX) cunm2l.$(SUFFIX) cunm2r.$(SUFFIX) cunmbr.$(SUFFIX) cunmhr.$(SUFFIX) cunml2.$(SUFFIX) \ ++ cunmlq.$(SUFFIX) cunmql.$(SUFFIX) cunmqr.$(SUFFIX) cunmr2.$(SUFFIX) cunmr3.$(SUFFIX) cunmrq.$(SUFFIX) cunmrz.$(SUFFIX) \ ++ cunmtr.$(SUFFIX) cupgtr.$(SUFFIX) cupmtr.$(SUFFIX) icmax1.$(SUFFIX) scsum1.$(SUFFIX) cstemr.$(SUFFIX) \ ++ chfrk.$(SUFFIX) ctfttp.$(SUFFIX) clanhf.$(SUFFIX) cpftrf.$(SUFFIX) cpftri.$(SUFFIX) cpftrs.$(SUFFIX) ctfsm.$(SUFFIX) ctftri.$(SUFFIX) \ ++ ctfttr.$(SUFFIX) ctpttf.$(SUFFIX) ctpttr.$(SUFFIX) ctrttf.$(SUFFIX) ctrttp.$(SUFFIX) \ ++ cgeequb.$(SUFFIX) cgbequb.$(SUFFIX) csyequb.$(SUFFIX) cpoequb.$(SUFFIX) cheequb.$(SUFFIX) \ ++ cbbcsd.$(SUFFIX) clapmr.$(SUFFIX) cunbdb.$(SUFFIX) cuncsd.$(SUFFIX) \ ++ cgeqrt.$(SUFFIX) cgeqrt2.$(SUFFIX) cgeqrt3.$(SUFFIX) cgemqrt.$(SUFFIX) \ ++ ctpqrt.$(SUFFIX) ctpqrt2.$(SUFFIX) ctpmqrt.$(SUFFIX) ctprfb.$(SUFFIX) + + ifdef USEXBLAS +-CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ +- cla_gercond_c.o cla_gercond_x.o 
cla_gerpvgrw.o \ +- csysvxx.o csyrfsx.o cla_syrfsx_extended.o cla_syamv.o \ +- cla_syrcond_c.o cla_syrcond_x.o cla_syrpvgrw.o \ +- cposvxx.o cporfsx.o cla_porfsx_extended.o \ +- cla_porcond_c.o cla_porcond_x.o cla_porpvgrw.o \ +- cgbsvxx.o cgbrfsx.o cla_gbrfsx_extended.o cla_gbamv.o \ +- cla_gbrcond_c.o cla_gbrcond_x.o cla_gbrpvgrw.o \ +- chesvxx.o cherfsx.o cla_herfsx_extended.o cla_heamv.o \ +- cla_hercond_c.o cla_hercond_x.o cla_herpvgrw.o \ +- cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o ++CXLASRC = cgesvxx.$(SUFFIX) cgerfsx.$(SUFFIX) cla_gerfsx_extended.$(SUFFIX) cla_geamv.$(SUFFIX) \ ++ cla_gercond_c.$(SUFFIX) cla_gercond_x.$(SUFFIX) cla_gerpvgrw.$(SUFFIX) \ ++ csysvxx.$(SUFFIX) csyrfsx.$(SUFFIX) cla_syrfsx_extended.$(SUFFIX) cla_syamv.$(SUFFIX) \ ++ cla_syrcond_c.$(SUFFIX) cla_syrcond_x.$(SUFFIX) cla_syrpvgrw.$(SUFFIX) \ ++ cposvxx.$(SUFFIX) cporfsx.$(SUFFIX) cla_porfsx_extended.$(SUFFIX) \ ++ cla_porcond_c.$(SUFFIX) cla_porcond_x.$(SUFFIX) cla_porpvgrw.$(SUFFIX) \ ++ cgbsvxx.$(SUFFIX) cgbrfsx.$(SUFFIX) cla_gbrfsx_extended.$(SUFFIX) cla_gbamv.$(SUFFIX) \ ++ cla_gbrcond_c.$(SUFFIX) cla_gbrcond_x.$(SUFFIX) cla_gbrpvgrw.$(SUFFIX) \ ++ chesvxx.$(SUFFIX) cherfsx.$(SUFFIX) cla_herfsx_extended.$(SUFFIX) cla_heamv.$(SUFFIX) \ ++ cla_hercond_c.$(SUFFIX) cla_hercond_x.$(SUFFIX) cla_herpvgrw.$(SUFFIX) \ ++ cla_lin_berr.$(SUFFIX) clarscl2.$(SUFFIX) clascl2.$(SUFFIX) cla_wwaddw.$(SUFFIX) + endif + +-ZCLASRC = cpotrs.o cgetrs.o cpotrf.o cgetrf.o ++ZCLASRC = cpotrs.$(SUFFIX) + + DLASRC = \ +- dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ +- dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ +- dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ +- dgegs.o dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ +- dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ +- dgeqp3.o dgeqpf.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \ +- dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o \ +- dgetc2.o dgetf2.o dgetrf.o dgetri.o \ +- 
dgetrs.o dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ +- dggglm.o dgghrd.o dgglse.o dggqrf.o \ +- dggrqf.o dggsvd.o dggsvp.o dgtcon.o dgtrfs.o dgtsv.o \ +- dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \ +- dhsein.o dhseqr.o dlabrd.o dlacon.o dlacn2.o \ +- dlaein.o dlaexc.o dlag2.o dlags2.o dlagtm.o dlagv2.o dlahqr.o \ +- dlahrd.o dlahr2.o dlaic1.o dlaln2.o dlals0.o dlalsa.o dlalsd.o \ +- dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ +- dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ +- dlapll.o dlapmt.o \ +- dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ +- dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ +- dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ +- dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o \ +- dlargv.o dlarrv.o dlartv.o \ +- dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o dlasyf.o \ +- dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ +- dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ +- dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ +- dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ +- dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ +- dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ +- dpbstf.o dpbsv.o dpbsvx.o \ +- dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ +- dposvx.o dpotf2.o dpotrf.o dpotri.o dpotrs.o dpstrf.o dpstf2.o \ +- dppcon.o dppequ.o \ +- dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ +- dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ +- dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ +- dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ +- dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ +- dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ +- dstevx.o \ +- dsycon.o dsyev.o dsyevd.o dsyevr.o \ +- dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ +- dsysv.o dsysvx.o \ +- dsytd2.o dsytf2.o dsytrd.o dsytrf.o 
dsytri.o dsytri2.o dsytri2x.o \ +- dsyswapr.o dsytrs.o dsytrs2.o dsyconv.o \ +- dtbcon.o dtbrfs.o dtbtrs.o dtgevc.o dtgex2.o dtgexc.o dtgsen.o \ +- dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ +- dtptrs.o \ +- dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ +- dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ +- dsgesv.o dsposv.o dlag2s.o slag2d.o dlat2s.o \ +- dlansf.o dpftrf.o dpftri.o dpftrs.o dsfrk.o dtfsm.o dtftri.o dtfttp.o \ +- dtfttr.o dtpttf.o dtpttr.o dtrttf.o dtrttp.o \ +- dgejsv.o dgesvj.o dgsvj0.o dgsvj1.o \ +- dgeequb.o dsyequb.o dpoequb.o dgbequb.o \ +- dbbcsd.o dlapmr.o dorbdb.o dorcsd.o \ +- dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ +- dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o ++ dgbbrd.$(SUFFIX) dgbcon.$(SUFFIX) dgbequ.$(SUFFIX) dgbrfs.$(SUFFIX) dgbsv.$(SUFFIX) \ ++ dgbsvx.$(SUFFIX) dgbtf2.$(SUFFIX) dgbtrf.$(SUFFIX) dgbtrs.$(SUFFIX) dgebak.$(SUFFIX) dgebal.$(SUFFIX) dgebd2.$(SUFFIX) \ ++ dgebrd.$(SUFFIX) dgecon.$(SUFFIX) dgeequ.$(SUFFIX) dgees.$(SUFFIX) dgeesx.$(SUFFIX) dgeev.$(SUFFIX) dgeevx.$(SUFFIX) \ ++ dgegs.$(SUFFIX) dgegv.$(SUFFIX) dgehd2.$(SUFFIX) dgehrd.$(SUFFIX) dgelq2.$(SUFFIX) dgelqf.$(SUFFIX) \ ++ dgels.$(SUFFIX) dgelsd.$(SUFFIX) dgelss.$(SUFFIX) dgelsx.$(SUFFIX) dgelsy.$(SUFFIX) dgeql2.$(SUFFIX) dgeqlf.$(SUFFIX) \ ++ dgeqp3.$(SUFFIX) dgeqpf.$(SUFFIX) dgeqr2.$(SUFFIX) dgeqr2p.$(SUFFIX) dgeqrf.$(SUFFIX) dgeqrfp.$(SUFFIX) dgerfs.$(SUFFIX) \ ++ dgerq2.$(SUFFIX) dgerqf.$(SUFFIX) dgesc2.$(SUFFIX) dgesdd.$(SUFFIX) dgesv.$(SUFFIX) dgesvd.$(SUFFIX) dgesvx.$(SUFFIX) \ ++ dgetc2.$(SUFFIX) dgetri.$(SUFFIX) \ ++ dggbak.$(SUFFIX) dggbal.$(SUFFIX) dgges.$(SUFFIX) dggesx.$(SUFFIX) dggev.$(SUFFIX) dggevx.$(SUFFIX) \ ++ dggglm.$(SUFFIX) dgghrd.$(SUFFIX) dgglse.$(SUFFIX) dggqrf.$(SUFFIX) \ ++ dggrqf.$(SUFFIX) dggsvd.$(SUFFIX) dggsvp.$(SUFFIX) dgtcon.$(SUFFIX) dgtrfs.$(SUFFIX) dgtsv.$(SUFFIX) \ ++ dgtsvx.$(SUFFIX) dgttrf.$(SUFFIX) dgttrs.$(SUFFIX) dgtts2.$(SUFFIX) dhgeqz.$(SUFFIX) \ ++ dhsein.$(SUFFIX) 
dhseqr.$(SUFFIX) dlabrd.$(SUFFIX) dlacon.$(SUFFIX) dlacn2.$(SUFFIX) \ ++ dlaein.$(SUFFIX) dlaexc.$(SUFFIX) dlag2.$(SUFFIX) dlags2.$(SUFFIX) dlagtm.$(SUFFIX) dlagv2.$(SUFFIX) dlahqr.$(SUFFIX) \ ++ dlahrd.$(SUFFIX) dlahr2.$(SUFFIX) dlaic1.$(SUFFIX) dlaln2.$(SUFFIX) dlals0.$(SUFFIX) dlalsa.$(SUFFIX) dlalsd.$(SUFFIX) \ ++ dlangb.$(SUFFIX) dlange.$(SUFFIX) dlangt.$(SUFFIX) dlanhs.$(SUFFIX) dlansb.$(SUFFIX) dlansp.$(SUFFIX) \ ++ dlansy.$(SUFFIX) dlantb.$(SUFFIX) dlantp.$(SUFFIX) dlantr.$(SUFFIX) dlanv2.$(SUFFIX) \ ++ dlapll.$(SUFFIX) dlapmt.$(SUFFIX) \ ++ dlaqgb.$(SUFFIX) dlaqge.$(SUFFIX) dlaqp2.$(SUFFIX) dlaqps.$(SUFFIX) dlaqsb.$(SUFFIX) dlaqsp.$(SUFFIX) dlaqsy.$(SUFFIX) \ ++ dlaqr0.$(SUFFIX) dlaqr1.$(SUFFIX) dlaqr2.$(SUFFIX) dlaqr3.$(SUFFIX) dlaqr4.$(SUFFIX) dlaqr5.$(SUFFIX) \ ++ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) iladlr.$(SUFFIX) iladlc.$(SUFFIX) \ ++ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarfgp.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) \ ++ dlargv.$(SUFFIX) dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ ++ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ ++ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ ++ dorgrq.$(SUFFIX) dorgtr.$(SUFFIX) dorm2l.$(SUFFIX) dorm2r.$(SUFFIX) \ ++ dormbr.$(SUFFIX) dormhr.$(SUFFIX) dorml2.$(SUFFIX) dormlq.$(SUFFIX) dormql.$(SUFFIX) dormqr.$(SUFFIX) dormr2.$(SUFFIX) \ ++ dormr3.$(SUFFIX) dormrq.$(SUFFIX) dormrz.$(SUFFIX) dormtr.$(SUFFIX) dpbcon.$(SUFFIX) dpbequ.$(SUFFIX) dpbrfs.$(SUFFIX) \ ++ dpbstf.$(SUFFIX) dpbsv.$(SUFFIX) dpbsvx.$(SUFFIX) \ ++ dpbtf2.$(SUFFIX) dpbtrf.$(SUFFIX) dpbtrs.$(SUFFIX) dpocon.$(SUFFIX) dpoequ.$(SUFFIX) dporfs.$(SUFFIX) dposv.$(SUFFIX) \ ++ dposvx.$(SUFFIX) dpotri.$(SUFFIX) dpotrs.$(SUFFIX) 
dpstrf.$(SUFFIX) dpstf2.$(SUFFIX) \ ++ dppcon.$(SUFFIX) dppequ.$(SUFFIX) \ ++ dpprfs.$(SUFFIX) dppsv.$(SUFFIX) dppsvx.$(SUFFIX) dpptrf.$(SUFFIX) dpptri.$(SUFFIX) dpptrs.$(SUFFIX) dptcon.$(SUFFIX) \ ++ dpteqr.$(SUFFIX) dptrfs.$(SUFFIX) dptsv.$(SUFFIX) dptsvx.$(SUFFIX) dpttrs.$(SUFFIX) dptts2.$(SUFFIX) drscl.$(SUFFIX) \ ++ dsbev.$(SUFFIX) dsbevd.$(SUFFIX) dsbevx.$(SUFFIX) dsbgst.$(SUFFIX) dsbgv.$(SUFFIX) dsbgvd.$(SUFFIX) dsbgvx.$(SUFFIX) \ ++ dsbtrd.$(SUFFIX) dspcon.$(SUFFIX) dspev.$(SUFFIX) dspevd.$(SUFFIX) dspevx.$(SUFFIX) dspgst.$(SUFFIX) \ ++ dspgv.$(SUFFIX) dspgvd.$(SUFFIX) dspgvx.$(SUFFIX) dsprfs.$(SUFFIX) dspsv.$(SUFFIX) dspsvx.$(SUFFIX) dsptrd.$(SUFFIX) \ ++ dsptrf.$(SUFFIX) dsptri.$(SUFFIX) dsptrs.$(SUFFIX) dstegr.$(SUFFIX) dstein.$(SUFFIX) dstev.$(SUFFIX) dstevd.$(SUFFIX) dstevr.$(SUFFIX) \ ++ dstevx.$(SUFFIX) \ ++ dsycon.$(SUFFIX) dsyev.$(SUFFIX) dsyevd.$(SUFFIX) dsyevr.$(SUFFIX) \ ++ dsyevx.$(SUFFIX) dsygs2.$(SUFFIX) dsygst.$(SUFFIX) dsygv.$(SUFFIX) dsygvd.$(SUFFIX) dsygvx.$(SUFFIX) dsyrfs.$(SUFFIX) \ ++ dsysv.$(SUFFIX) dsysvx.$(SUFFIX) \ ++ dsytd2.$(SUFFIX) dsytf2.$(SUFFIX) dsytrd.$(SUFFIX) dsytrf.$(SUFFIX) dsytri.$(SUFFIX) dsytri2.$(SUFFIX) dsytri2x.$(SUFFIX) \ ++ dsyswapr.$(SUFFIX) dsytrs.$(SUFFIX) dsytrs2.$(SUFFIX) dsyconv.$(SUFFIX) \ ++ dtbcon.$(SUFFIX) dtbrfs.$(SUFFIX) dtbtrs.$(SUFFIX) dtgevc.$(SUFFIX) dtgex2.$(SUFFIX) dtgexc.$(SUFFIX) dtgsen.$(SUFFIX) \ ++ dtgsja.$(SUFFIX) dtgsna.$(SUFFIX) dtgsy2.$(SUFFIX) dtgsyl.$(SUFFIX) dtpcon.$(SUFFIX) dtprfs.$(SUFFIX) dtptri.$(SUFFIX) \ ++ dtptrs.$(SUFFIX) \ ++ dtrcon.$(SUFFIX) dtrevc.$(SUFFIX) dtrexc.$(SUFFIX) dtrrfs.$(SUFFIX) dtrsen.$(SUFFIX) dtrsna.$(SUFFIX) dtrsyl.$(SUFFIX) \ ++ dtrtrs.$(SUFFIX) dtzrqf.$(SUFFIX) dtzrzf.$(SUFFIX) dstemr.$(SUFFIX) \ ++ dsgesv.$(SUFFIX) dsposv.$(SUFFIX) dlag2s.$(SUFFIX) slag2d.$(SUFFIX) dlat2s.$(SUFFIX) \ ++ dlansf.$(SUFFIX) dpftrf.$(SUFFIX) dpftri.$(SUFFIX) dpftrs.$(SUFFIX) dsfrk.$(SUFFIX) dtfsm.$(SUFFIX) dtftri.$(SUFFIX) dtfttp.$(SUFFIX) \ ++ dtfttr.$(SUFFIX) 
dtpttf.$(SUFFIX) dtpttr.$(SUFFIX) dtrttf.$(SUFFIX) dtrttp.$(SUFFIX) \ ++ dgejsv.$(SUFFIX) dgesvj.$(SUFFIX) dgsvj0.$(SUFFIX) dgsvj1.$(SUFFIX) \ ++ dgeequb.$(SUFFIX) dsyequb.$(SUFFIX) dpoequb.$(SUFFIX) dgbequb.$(SUFFIX) \ ++ dbbcsd.$(SUFFIX) dlapmr.$(SUFFIX) dorbdb.$(SUFFIX) dorcsd.$(SUFFIX) \ ++ dgeqrt.$(SUFFIX) dgeqrt2.$(SUFFIX) dgeqrt3.$(SUFFIX) dgemqrt.$(SUFFIX) \ ++ dtpqrt.$(SUFFIX) dtpqrt2.$(SUFFIX) dtpmqrt.$(SUFFIX) dtprfb.$(SUFFIX) + + ifdef USEXBLAS +-DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ +- dla_gercond.o dla_gerpvgrw.o dsysvxx.o dsyrfsx.o \ +- dla_syrfsx_extended.o dla_syamv.o dla_syrcond.o dla_syrpvgrw.o \ +- dposvxx.o dporfsx.o dla_porfsx_extended.o dla_porcond.o \ +- dla_porpvgrw.o dgbsvxx.o dgbrfsx.o dla_gbrfsx_extended.o \ +- dla_gbamv.o dla_gbrcond.o dla_gbrpvgrw.o dla_lin_berr.o dlarscl2.o \ +- dlascl2.o dla_wwaddw.o ++DXLASRC = dgesvxx.$(SUFFIX) dgerfsx.$(SUFFIX) dla_gerfsx_extended.$(SUFFIX) dla_geamv.$(SUFFIX) \ ++ dla_gercond.$(SUFFIX) dla_gerpvgrw.$(SUFFIX) dsysvxx.$(SUFFIX) dsyrfsx.$(SUFFIX) \ ++ dla_syrfsx_extended.$(SUFFIX) dla_syamv.$(SUFFIX) dla_syrcond.$(SUFFIX) dla_syrpvgrw.$(SUFFIX) \ ++ dposvxx.$(SUFFIX) dporfsx.$(SUFFIX) dla_porfsx_extended.$(SUFFIX) dla_porcond.$(SUFFIX) \ ++ dla_porpvgrw.$(SUFFIX) dgbsvxx.$(SUFFIX) dgbrfsx.$(SUFFIX) dla_gbrfsx_extended.$(SUFFIX) \ ++ dla_gbamv.$(SUFFIX) dla_gbrcond.$(SUFFIX) dla_gbrpvgrw.$(SUFFIX) dla_lin_berr.$(SUFFIX) dlarscl2.$(SUFFIX) \ ++ dlascl2.$(SUFFIX) dla_wwaddw.$(SUFFIX) + endif + + ZLASRC = \ +- zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ +- zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ +- zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ +- zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ +- zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ +- zgeqpf.o zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \ +- zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o zgetf2.o zgetrf.o 
\ +- zgetri.o zgetrs.o \ +- zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ +- zgghrd.o zgglse.o zggqrf.o zggrqf.o \ +- zggsvd.o zggsvp.o \ +- zgtcon.o zgtrfs.o zgtsv.o zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \ +- zhbevd.o zhbevx.o zhbgst.o zhbgv.o zhbgvd.o zhbgvx.o zhbtrd.o \ +- zhecon.o zheev.o zheevd.o zheevr.o zheevx.o zhegs2.o zhegst.o \ +- zhegv.o zhegvd.o zhegvx.o zherfs.o zhesv.o zhesvx.o zhetd2.o \ +- zhetf2.o zhetrd.o \ +- zhetrf.o zhetri.o zhetri2.o zhetri2x.o zheswapr.o \ +- zhetrs.o zhetrs2.o zhgeqz.o zhpcon.o zhpev.o zhpevd.o \ +- zhpevx.o zhpgst.o zhpgv.o zhpgvd.o zhpgvx.o zhprfs.o zhpsv.o \ +- zhpsvx.o \ +- zhptrd.o zhptrf.o zhptri.o zhptrs.o zhsein.o zhseqr.o zlabrd.o \ +- zlacgv.o zlacon.o zlacn2.o zlacp2.o zlacpy.o zlacrm.o zlacrt.o zladiv.o \ +- zlaed0.o zlaed7.o zlaed8.o \ +- zlaein.o zlaesy.o zlaev2.o zlags2.o zlagtm.o \ +- zlahef.o zlahqr.o \ +- zlahrd.o zlahr2.o zlaic1.o zlals0.o zlalsa.o zlalsd.o zlangb.o zlange.o \ +- zlangt.o zlanhb.o \ +- zlanhe.o \ +- zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ +- zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ +- zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ +- zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ +- zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ +- zlarcm.o zlarf.o zlarfb.o \ +- zlarfg.o zlarft.o zlarfgp.o \ +- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ +- zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ +- zlassq.o zlaswp.o zlasyf.o \ +- zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ +- zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ +- zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ +- zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \ +- zppcon.o zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ +- zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ +- zrot.o zspcon.o zspmv.o zspr.o 
zsprfs.o zspsv.o \ +- zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ +- zstegr.o zstein.o zsteqr.o \ +- zsycon.o zsymv.o \ +- zsyr.o zsyrfs.o zsysv.o zsysvx.o zsytf2.o zsytrf.o zsytri.o zsytri2.o zsytri2x.o \ +- zsyswapr.o zsytrs.o zsytrs2.o zsyconv.o \ +- ztbcon.o ztbrfs.o ztbtrs.o ztgevc.o ztgex2.o \ +- ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ +- ztprfs.o ztptri.o \ +- ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ +- ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ +- zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ +- zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ +- zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ +- zunmtr.o zupgtr.o \ +- zupmtr.o izmax1.o dzsum1.o zstemr.o \ +- zcgesv.o zcposv.o zlag2c.o clag2z.o zlat2c.o \ +- zhfrk.o ztfttp.o zlanhf.o zpftrf.o zpftri.o zpftrs.o ztfsm.o ztftri.o \ +- ztfttr.o ztpttf.o ztpttr.o ztrttf.o ztrttp.o \ +- zgeequb.o zgbequb.o zsyequb.o zpoequb.o zheequb.o \ +- zbbcsd.o zlapmr.o zunbdb.o zuncsd.o \ +- zgeqrt.o zgeqrt2.o zgeqrt3.o zgemqrt.o \ +- ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o ++ zbdsqr.$(SUFFIX) zgbbrd.$(SUFFIX) zgbcon.$(SUFFIX) zgbequ.$(SUFFIX) zgbrfs.$(SUFFIX) zgbsv.$(SUFFIX) zgbsvx.$(SUFFIX) \ ++ zgbtf2.$(SUFFIX) zgbtrf.$(SUFFIX) zgbtrs.$(SUFFIX) zgebak.$(SUFFIX) zgebal.$(SUFFIX) zgebd2.$(SUFFIX) zgebrd.$(SUFFIX) \ ++ zgecon.$(SUFFIX) zgeequ.$(SUFFIX) zgees.$(SUFFIX) zgeesx.$(SUFFIX) zgeev.$(SUFFIX) zgeevx.$(SUFFIX) \ ++ zgegs.$(SUFFIX) zgegv.$(SUFFIX) zgehd2.$(SUFFIX) zgehrd.$(SUFFIX) zgelq2.$(SUFFIX) zgelqf.$(SUFFIX) \ ++ zgels.$(SUFFIX) zgelsd.$(SUFFIX) zgelss.$(SUFFIX) zgelsx.$(SUFFIX) zgelsy.$(SUFFIX) zgeql2.$(SUFFIX) zgeqlf.$(SUFFIX) zgeqp3.$(SUFFIX) \ ++ zgeqpf.$(SUFFIX) zgeqr2.$(SUFFIX) zgeqr2p.$(SUFFIX) zgeqrf.$(SUFFIX) zgeqrfp.$(SUFFIX) zgerfs.$(SUFFIX) zgerq2.$(SUFFIX) zgerqf.$(SUFFIX) \ ++ zgesc2.$(SUFFIX) zgesdd.$(SUFFIX) zgesv.$(SUFFIX) zgesvd.$(SUFFIX) zgesvx.$(SUFFIX) 
zgetc2.$(SUFFIX) \ ++ zgetri.$(SUFFIX) \ ++ zggbak.$(SUFFIX) zggbal.$(SUFFIX) zgges.$(SUFFIX) zggesx.$(SUFFIX) zggev.$(SUFFIX) zggevx.$(SUFFIX) zggglm.$(SUFFIX) \ ++ zgghrd.$(SUFFIX) zgglse.$(SUFFIX) zggqrf.$(SUFFIX) zggrqf.$(SUFFIX) \ ++ zggsvd.$(SUFFIX) zggsvp.$(SUFFIX) \ ++ zgtcon.$(SUFFIX) zgtrfs.$(SUFFIX) zgtsv.$(SUFFIX) zgtsvx.$(SUFFIX) zgttrf.$(SUFFIX) zgttrs.$(SUFFIX) zgtts2.$(SUFFIX) zhbev.$(SUFFIX) \ ++ zhbevd.$(SUFFIX) zhbevx.$(SUFFIX) zhbgst.$(SUFFIX) zhbgv.$(SUFFIX) zhbgvd.$(SUFFIX) zhbgvx.$(SUFFIX) zhbtrd.$(SUFFIX) \ ++ zhecon.$(SUFFIX) zheev.$(SUFFIX) zheevd.$(SUFFIX) zheevr.$(SUFFIX) zheevx.$(SUFFIX) zhegs2.$(SUFFIX) zhegst.$(SUFFIX) \ ++ zhegv.$(SUFFIX) zhegvd.$(SUFFIX) zhegvx.$(SUFFIX) zherfs.$(SUFFIX) zhesv.$(SUFFIX) zhesvx.$(SUFFIX) zhetd2.$(SUFFIX) \ ++ zhetf2.$(SUFFIX) zhetrd.$(SUFFIX) \ ++ zhetrf.$(SUFFIX) zhetri.$(SUFFIX) zhetri2.$(SUFFIX) zhetri2x.$(SUFFIX) zheswapr.$(SUFFIX) \ ++ zhetrs.$(SUFFIX) zhetrs2.$(SUFFIX) zhgeqz.$(SUFFIX) zhpcon.$(SUFFIX) zhpev.$(SUFFIX) zhpevd.$(SUFFIX) \ ++ zhpevx.$(SUFFIX) zhpgst.$(SUFFIX) zhpgv.$(SUFFIX) zhpgvd.$(SUFFIX) zhpgvx.$(SUFFIX) zhprfs.$(SUFFIX) zhpsv.$(SUFFIX) \ ++ zhpsvx.$(SUFFIX) \ ++ zhptrd.$(SUFFIX) zhptrf.$(SUFFIX) zhptri.$(SUFFIX) zhptrs.$(SUFFIX) zhsein.$(SUFFIX) zhseqr.$(SUFFIX) zlabrd.$(SUFFIX) \ ++ zlacgv.$(SUFFIX) zlacon.$(SUFFIX) zlacn2.$(SUFFIX) zlacp2.$(SUFFIX) zlacpy.$(SUFFIX) zlacrm.$(SUFFIX) zlacrt.$(SUFFIX) zladiv.$(SUFFIX) \ ++ zlaed0.$(SUFFIX) zlaed7.$(SUFFIX) zlaed8.$(SUFFIX) \ ++ zlaein.$(SUFFIX) zlaesy.$(SUFFIX) zlaev2.$(SUFFIX) zlags2.$(SUFFIX) zlagtm.$(SUFFIX) \ ++ zlahef.$(SUFFIX) zlahqr.$(SUFFIX) \ ++ zlahrd.$(SUFFIX) zlahr2.$(SUFFIX) zlaic1.$(SUFFIX) zlals0.$(SUFFIX) zlalsa.$(SUFFIX) zlalsd.$(SUFFIX) zlangb.$(SUFFIX) zlange.$(SUFFIX) \ ++ zlangt.$(SUFFIX) zlanhb.$(SUFFIX) \ ++ zlanhe.$(SUFFIX) \ ++ zlanhp.$(SUFFIX) zlanhs.$(SUFFIX) zlanht.$(SUFFIX) zlansb.$(SUFFIX) zlansp.$(SUFFIX) zlansy.$(SUFFIX) zlantb.$(SUFFIX) \ ++ zlantp.$(SUFFIX) zlantr.$(SUFFIX) zlapll.$(SUFFIX) 
zlapmt.$(SUFFIX) zlaqgb.$(SUFFIX) zlaqge.$(SUFFIX) \ ++ zlaqhb.$(SUFFIX) zlaqhe.$(SUFFIX) zlaqhp.$(SUFFIX) zlaqp2.$(SUFFIX) zlaqps.$(SUFFIX) zlaqsb.$(SUFFIX) \ ++ zlaqr0.$(SUFFIX) zlaqr1.$(SUFFIX) zlaqr2.$(SUFFIX) zlaqr3.$(SUFFIX) zlaqr4.$(SUFFIX) zlaqr5.$(SUFFIX) \ ++ zlaqsp.$(SUFFIX) zlaqsy.$(SUFFIX) zlar1v.$(SUFFIX) zlar2v.$(SUFFIX) ilazlr.$(SUFFIX) ilazlc.$(SUFFIX) \ ++ zlarcm.$(SUFFIX) zlarf.$(SUFFIX) zlarfb.$(SUFFIX) \ ++ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) zlarfgp.$(SUFFIX) \ ++ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ ++ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) zlauu2.$(SUFFIX) \ ++ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ ++ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ ++ zposv.$(SUFFIX) zposvx.$(SUFFIX) zpotri.$(SUFFIX) zpotrs.$(SUFFIX) zpstrf.$(SUFFIX) zpstf2.$(SUFFIX) \ ++ zppcon.$(SUFFIX) zppequ.$(SUFFIX) zpprfs.$(SUFFIX) zppsv.$(SUFFIX) zppsvx.$(SUFFIX) zpptrf.$(SUFFIX) zpptri.$(SUFFIX) zpptrs.$(SUFFIX) \ ++ zptcon.$(SUFFIX) zpteqr.$(SUFFIX) zptrfs.$(SUFFIX) zptsv.$(SUFFIX) zptsvx.$(SUFFIX) zpttrf.$(SUFFIX) zpttrs.$(SUFFIX) zptts2.$(SUFFIX) \ ++ zrot.$(SUFFIX) zspcon.$(SUFFIX) zspmv.$(SUFFIX) zspr.$(SUFFIX) zsprfs.$(SUFFIX) zspsv.$(SUFFIX) \ ++ zspsvx.$(SUFFIX) zsptrf.$(SUFFIX) zsptri.$(SUFFIX) zsptrs.$(SUFFIX) zdrscl.$(SUFFIX) zstedc.$(SUFFIX) \ ++ zstegr.$(SUFFIX) zstein.$(SUFFIX) zsteqr.$(SUFFIX) \ ++ zsycon.$(SUFFIX) zsymv.$(SUFFIX) \ ++ zsyr.$(SUFFIX) zsyrfs.$(SUFFIX) zsysv.$(SUFFIX) zsysvx.$(SUFFIX) zsytf2.$(SUFFIX) zsytrf.$(SUFFIX) zsytri.$(SUFFIX) zsytri2.$(SUFFIX) zsytri2x.$(SUFFIX) \ ++ zsyswapr.$(SUFFIX) zsytrs.$(SUFFIX) zsytrs2.$(SUFFIX) 
zsyconv.$(SUFFIX) \ ++ ztbcon.$(SUFFIX) ztbrfs.$(SUFFIX) ztbtrs.$(SUFFIX) ztgevc.$(SUFFIX) ztgex2.$(SUFFIX) \ ++ ztgexc.$(SUFFIX) ztgsen.$(SUFFIX) ztgsja.$(SUFFIX) ztgsna.$(SUFFIX) ztgsy2.$(SUFFIX) ztgsyl.$(SUFFIX) ztpcon.$(SUFFIX) \ ++ ztprfs.$(SUFFIX) ztptri.$(SUFFIX) \ ++ ztptrs.$(SUFFIX) ztrcon.$(SUFFIX) ztrevc.$(SUFFIX) ztrexc.$(SUFFIX) ztrrfs.$(SUFFIX) ztrsen.$(SUFFIX) ztrsna.$(SUFFIX) \ ++ ztrsyl.$(SUFFIX) ztrtrs.$(SUFFIX) ztzrqf.$(SUFFIX) ztzrzf.$(SUFFIX) zung2l.$(SUFFIX) \ ++ zung2r.$(SUFFIX) zungbr.$(SUFFIX) zunghr.$(SUFFIX) zungl2.$(SUFFIX) zunglq.$(SUFFIX) zungql.$(SUFFIX) zungqr.$(SUFFIX) zungr2.$(SUFFIX) \ ++ zungrq.$(SUFFIX) zungtr.$(SUFFIX) zunm2l.$(SUFFIX) zunm2r.$(SUFFIX) zunmbr.$(SUFFIX) zunmhr.$(SUFFIX) zunml2.$(SUFFIX) \ ++ zunmlq.$(SUFFIX) zunmql.$(SUFFIX) zunmqr.$(SUFFIX) zunmr2.$(SUFFIX) zunmr3.$(SUFFIX) zunmrq.$(SUFFIX) zunmrz.$(SUFFIX) \ ++ zunmtr.$(SUFFIX) zupgtr.$(SUFFIX) \ ++ zupmtr.$(SUFFIX) izmax1.$(SUFFIX) dzsum1.$(SUFFIX) zstemr.$(SUFFIX) \ ++ zcgesv.$(SUFFIX) zcposv.$(SUFFIX) zlag2c.$(SUFFIX) clag2z.$(SUFFIX) zlat2c.$(SUFFIX) \ ++ zhfrk.$(SUFFIX) ztfttp.$(SUFFIX) zlanhf.$(SUFFIX) zpftrf.$(SUFFIX) zpftri.$(SUFFIX) zpftrs.$(SUFFIX) ztfsm.$(SUFFIX) ztftri.$(SUFFIX) \ ++ ztfttr.$(SUFFIX) ztpttf.$(SUFFIX) ztpttr.$(SUFFIX) ztrttf.$(SUFFIX) ztrttp.$(SUFFIX) \ ++ zgeequb.$(SUFFIX) zgbequb.$(SUFFIX) zsyequb.$(SUFFIX) zpoequb.$(SUFFIX) zheequb.$(SUFFIX) \ ++ zbbcsd.$(SUFFIX) zlapmr.$(SUFFIX) zunbdb.$(SUFFIX) zuncsd.$(SUFFIX) \ ++ zgeqrt.$(SUFFIX) zgeqrt2.$(SUFFIX) zgeqrt3.$(SUFFIX) zgemqrt.$(SUFFIX) \ ++ ztpqrt.$(SUFFIX) ztpqrt2.$(SUFFIX) ztpmqrt.$(SUFFIX) ztprfb.$(SUFFIX) + + ifdef USEXBLAS +-ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ +- zla_gercond_c.o zla_gercond_x.o zla_gerpvgrw.o zsysvxx.o zsyrfsx.o \ +- zla_syrfsx_extended.o zla_syamv.o zla_syrcond_c.o zla_syrcond_x.o \ +- zla_syrpvgrw.o zposvxx.o zporfsx.o zla_porfsx_extended.o \ +- zla_porcond_c.o zla_porcond_x.o zla_porpvgrw.o zgbsvxx.o zgbrfsx.o \ +- 
zla_gbrfsx_extended.o zla_gbamv.o zla_gbrcond_c.o zla_gbrcond_x.o \ +- zla_gbrpvgrw.o zhesvxx.o zherfsx.o zla_herfsx_extended.o \ +- zla_heamv.o zla_hercond_c.o zla_hercond_x.o zla_herpvgrw.o \ +- zla_lin_berr.o zlarscl2.o zlascl2.o zla_wwaddw.o ++ZXLASRC = zgesvxx.$(SUFFIX) zgerfsx.$(SUFFIX) zla_gerfsx_extended.$(SUFFIX) zla_geamv.$(SUFFIX) \ ++ zla_gercond_c.$(SUFFIX) zla_gercond_x.$(SUFFIX) zla_gerpvgrw.$(SUFFIX) zsysvxx.$(SUFFIX) zsyrfsx.$(SUFFIX) \ ++ zla_syrfsx_extended.$(SUFFIX) zla_syamv.$(SUFFIX) zla_syrcond_c.$(SUFFIX) zla_syrcond_x.$(SUFFIX) \ ++ zla_syrpvgrw.$(SUFFIX) zposvxx.$(SUFFIX) zporfsx.$(SUFFIX) zla_porfsx_extended.$(SUFFIX) \ ++ zla_porcond_c.$(SUFFIX) zla_porcond_x.$(SUFFIX) zla_porpvgrw.$(SUFFIX) zgbsvxx.$(SUFFIX) zgbrfsx.$(SUFFIX) \ ++ zla_gbrfsx_extended.$(SUFFIX) zla_gbamv.$(SUFFIX) zla_gbrcond_c.$(SUFFIX) zla_gbrcond_x.$(SUFFIX) \ ++ zla_gbrpvgrw.$(SUFFIX) zhesvxx.$(SUFFIX) zherfsx.$(SUFFIX) zla_herfsx_extended.$(SUFFIX) \ ++ zla_heamv.$(SUFFIX) zla_hercond_c.$(SUFFIX) zla_hercond_x.$(SUFFIX) zla_herpvgrw.$(SUFFIX) \ ++ zla_lin_berr.$(SUFFIX) zlarscl2.$(SUFFIX) zlascl2.$(SUFFIX) zla_wwaddw.$(SUFFIX) + endif + + ALLOBJ = $(SLASRC) $(DLASRC) $(DSLASRC) $(CLASRC) $(ZLASRC) $(ZCLASRC) \ + $(SCLAUX) $(DZLAUX) $(ALLAUX) + ++ALLOBJ_P = $(ALLOBJ:.$(SUFFIX)=.$(PSUFFIX)) ++ + ifdef USEXBLAS + ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) + endif + + all: ../$(LAPACKLIB) + ++lapack_prof: ../$(LAPACKLIB_P) ++ + ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) + $(RANLIB) $@ + ++../$(LAPACKLIB_P): $(ALLOBJ_P) ++ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P) ++ $(RANLIB) $@ ++ + single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(SXLASRC) $(SCLAUX) $(ALLAUX) $(ALLXAUX) +@@ -451,15 +459,24 @@ + @FRC=$(FRC) + + clean: +- rm -f *.o ++ rm -f *.$(SUFFIX) *.$(PSUFFIX) + +-.f.o: ++%.$(SUFFIX): %.f + $(FORTRAN) $(OPTS) -c $< -o $@ + +-slaruv.o: slaruv.f 
; $(FORTRAN) $(NOOPT) -c $< -o $@ +-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ ++%.$(PSUFFIX): %.f ++ $(FORTRAN) $(POPTS) -c $< -o $@ + ++slaruv.$(SUFFIX): slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++dlaruv.$(SUFFIX): dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++sla_wwaddw.$(SUFFIX): sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++dla_wwaddw.$(SUFFIX): dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++cla_wwaddw.$(SUFFIX): cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++zla_wwaddw.$(SUFFIX): zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++ ++slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++sla_wwaddw.$(PSUFFIX): sla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++dla_wwaddw.$(PSUFFIX): dla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++cla_wwaddw.$(PSUFFIX): cla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++zla_wwaddw.$(PSUFFIX): zla_wwaddw.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ +diff -ruN lapack-3.4.2.old/TESTING/EIG/Makefile lapack-3.4.2/TESTING/EIG/Makefile +--- lapack-3.4.2.old/TESTING/EIG/Makefile 2011-09-26 23:52:31 +0200 ++++ lapack-3.4.2/TESTING/EIG/Makefile 2012-04-22 21:41:45 +0200 +@@ -78,7 +78,7 @@ + cget35.o cget36.o cget37.o cget38.o cget51.o cget52.o \ + cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts.o \ + chbt21.o chet21.o chet22.o chpt21.o chst01.o \ +- clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ ++ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o \ + csgt01.o cslect.o \ + cstt21.o cstt22.o cunt01.o cunt03.o + +@@ -115,7 +115,7 @@ + zget35.o zget36.o zget37.o zget38.o zget51.o zget52.o \ + zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts.o \ + zhbt21.o 
zhet21.o zhet22.o zhpt21.o zhst01.o \ +- zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ ++ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o \ + zsgt01.o zslect.o \ + zstt21.o zstt22.o zunt01.o zunt03.o + +@@ -129,22 +129,22 @@ + ../xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtsts \ + $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtsts $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtsts $@ + + ../xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtstc \ + $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstc $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstc $@ + + ../xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtstd \ + $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstd $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstd $@ + + ../xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(LAPACKLIB); \ + $(LOADER) $(LOADOPTS) -o xeigtstz \ + $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) && mv xeigtstz $@ ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) && mv xeigtstz $@ + + $(AEIGTST): $(FRC) + $(SCIGTST): $(FRC) +diff -ruN lapack-3.4.2.old/TESTING/LIN/Makefile lapack-3.4.2/TESTING/LIN/Makefile +--- lapack-3.4.2.old/TESTING/LIN/Makefile 2012-04-02 21:06:36 +0200 ++++ lapack-3.4.2/TESTING/LIN/Makefile 2012-04-22 21:43:30 +0200 +@@ -109,7 +109,7 @@ + cqpt01.o cqrt01.o cqrt01p.o cqrt02.o cqrt03.o cqrt11.o \ + cqrt12.o cqrt13.o cqrt14.o cqrt15.o cqrt16.o \ + cqrt17.o crqt01.o crqt02.o crqt03.o crzt01.o crzt02.o \ +- csbmv.o cspt01.o \ ++ cspt01.o \ + cspt02.o cspt03.o csyt01.o csyt02.o csyt03.o \ + ctbt02.o ctbt03.o ctbt05.o ctbt06.o ctpt01.o \ + ctpt02.o ctpt03.o ctpt05.o ctpt06.o ctrt01.o \ +@@ 
-188,7 +188,7 @@ + zqpt01.o zqrt01.o zqrt01p.o zqrt02.o zqrt03.o zqrt11.o \ + zqrt12.o zqrt13.o zqrt14.o zqrt15.o zqrt16.o \ + zqrt17.o zrqt01.o zrqt02.o zrqt03.o zrzt01.o zrzt02.o \ +- zsbmv.o zspt01.o \ ++ zspt01.o \ + zspt02.o zspt03.o zsyt01.o zsyt02.o zsyt03.o \ + ztbt02.o ztbt03.o ztbt05.o ztbt06.o ztpt01.o \ + ztpt02.o ztpt03.o ztpt05.o ztpt06.o ztrt01.o \ +@@ -214,7 +214,7 @@ + zdrvab.o zdrvac.o zerrab.o zerrac.o zget08.o \ + alaerh.o alahd.o aladhd.o alareq.o \ + chkxer.o zget02.o zlarhs.o zlatb4.o \ +- zsbmv.o xerbla.o zpot06.o zlaipd.o ++ xerbla.o zpot06.o zlaipd.o + + SLINTSTRFP = schkrfp.o sdrvrfp.o sdrvrf1.o sdrvrf2.o sdrvrf3.o sdrvrf4.o serrrfp.o \ + slatb4.o slarhs.o sget04.o spot01.o spot03.o spot02.o \ +@@ -225,11 +225,11 @@ + chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o + + CLINTSTRFP = cchkrfp.o cdrvrfp.o cdrvrf1.o cdrvrf2.o cdrvrf3.o cdrvrf4.o cerrrfp.o \ +- claipd.o clatb4.o clarhs.o csbmv.o cget04.o cpot01.o cpot03.o cpot02.o \ ++ claipd.o clatb4.o clarhs.o cget04.o cpot01.o cpot03.o cpot02.o \ + chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o + + ZLINTSTRFP = zchkrfp.o zdrvrfp.o zdrvrf1.o zdrvrf2.o zdrvrf3.o zdrvrf4.o zerrrfp.o \ +- zlatb4.o zlaipd.o zlarhs.o zsbmv.o zget04.o zpot01.o zpot03.o zpot02.o \ ++ zlatb4.o zlaipd.o zlarhs.o zget04.o zpot01.o zpot03.o zpot02.o \ + chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o + + all: single double complex complex16 proto-single proto-double proto-complex proto-complex16 +@@ -246,43 +246,43 @@ + + xlintsts : $(ALINTST) $(SLINTST) $(SCLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(SLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstc : $(ALINTST) $(CLINTST) $(SCLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(CLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) 
../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstd : $(ALINTST) $(DLINTST) $(DZLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $^ \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstz : $(ALINTST) $(ZLINTST) $(DZLNTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(ZLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstds : $(DSLINTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(DSLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstzc : $(ZCLINTST) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ZCLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfs : $(SLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(SLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfd : $(DLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(DLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfc : $(CLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(CLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + xlintstrfz : $(ZLINTSTRFP) ../../$(LAPACKLIB) + $(LOADER) $(LOADOPTS) $(ZLINTSTRFP) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintsts: xlintsts + mv xlintsts $@ +diff -ruN lapack-3.4.2.old/lapacke/src/Makefile lapack-3.4.2/lapacke/src/Makefile +--- 
lapack-3.4.2.old/lapacke/src/Makefile 2012-04-02 22:16:32 +0200 ++++ lapack-3.4.2/lapacke/src/Makefile 2012-04-22 21:38:38 +0200 +@@ -2041,19 +2041,21 @@ + lapacke_zlagsy.o \ + lapacke_zlagsy_work.o + +-ALLOBJ = $(SRC_OBJ) $(MATGEN_OBJ) ++OBJ_FILES := $(SRC_OBJ) + +-ifdef USEXBLAS +-ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) ++ifdef LAPACKE_EXTENDED ++OBJ_FILES += $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) + endif + +- +-OBJ_FILES := $(C_FILES:.o=.o) ++ifdef LAPACKE_TESTING ++OBJ_FILES += $(MATGEN_OBJ) ++endif + + all: ../../$(LAPACKELIB) + +-../../$(LAPACKELIB): $(ALLOBJ) $(ALLXOBJ) +- $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJ) $(ALLXOBJ) ++../../$(LAPACKELIB): $(OBJ_FILES) ++# http://hackage.haskell.org/trac/gtk2hs/ticket/1146 ++ echo $(OBJ_FILES) | xargs -n 100 $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) + $(RANLIB) ../../$(LAPACKELIB) + + .c.o: From b1a54a0107b61da4cfe50fa6947c73770b58b2c1 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 8 Oct 2012 12:48:20 +0800 Subject: [PATCH 101/162] Fixed #141. make f77blas.h compatible with compilers which lack C99 complex number. Apply the patch from Tony @tonyhill. Thank you. 
--- common.h | 2 ++ common_interface.h | 24 ++++++++++++------------ openblas_config_template.h | 32 ++++++++++++++++++++------------ 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/common.h b/common.h index 3718cdee4..b4dc5deba 100644 --- a/common.h +++ b/common.h @@ -389,10 +389,12 @@ typedef int blasint; #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; + typedef xdouble _Complex openblas_complex_xdouble; #else #define OPENBLAS_COMPLEX_STRUCT typedef struct { float real, imag; } openblas_complex_float; typedef struct { double real, imag; } openblas_complex_double; + typedef struct { xdouble real, imag; } openblas_complex_xdouble; #endif #endif // ASSEMBLER diff --git a/common_interface.h b/common_interface.h index bff1a85a1..14c2cf7a4 100644 --- a/common_interface.h +++ b/common_interface.h @@ -76,19 +76,19 @@ myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #elif defined RETURN_BY_STACK -void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); -void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); -void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); -void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); -void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *); +void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); +void 
BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); #else -float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); -float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); -double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); -double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); -xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); +openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); +openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); +openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); +openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); #endif void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/openblas_config_template.h b/openblas_config_template.h index caeccf026..a2b05696f 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -53,20 +53,28 @@ typedef int blasint; #include typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; - #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) - 
#define openblas_complex_float_real(z) (creal(z)) - #define openblas_complex_float_imag(z) (cimag(z)) - #define openblas_complex_double_real(z) (creal(z)) - #define openblas_complex_double_imag(z) (cimag(z)) + typedef xdouble _Complex openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_complex_float_real(z) (creal(z)) + #define openblas_complex_float_imag(z) (cimag(z)) + #define openblas_complex_double_real(z) (creal(z)) + #define openblas_complex_double_imag(z) (cimag(z)) + #define openblas_complex_xdouble_real(z) (creal(z)) + #define openblas_complex_xdouble_imag(z) (cimag(z)) #else #define OPENBLAS_COMPLEX_STRUCT typedef struct { float real, imag; } openblas_complex_float; typedef struct { double real, imag; } openblas_complex_double; - #define openblas_make_complex_float(real, imag) {(real), (imag)} - #define openblas_make_complex_double(real, imag) {(real), (imag)} - #define openblas_complex_float_real(z) ((z).real) - #define openblas_complex_float_imag(z) ((z).imag) - #define openblas_complex_double_real(z) ((z).real) - #define openblas_complex_double_imag(z) ((z).imag) + typedef struct { xdouble real, imag; } openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) {(real), (imag)} + #define openblas_make_complex_double(real, imag) {(real), (imag)} + #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} + #define openblas_complex_float_real(z) ((z).real) + #define openblas_complex_float_imag(z) ((z).imag) + #define openblas_complex_double_real(z) ((z).real) + #define openblas_complex_double_imag(z) ((z).imag) + #define openblas_complex_xdouble_real(z) ((z).real) + #define openblas_complex_xdouble_imag(z) ((z).imag) #endif From 0b719945c56bc2a25146706ff12542a0d6688d9b Mon Sep 17 00:00:00 
2001 From: Zhang Xianyi Date: Mon, 8 Oct 2012 13:37:44 +0800 Subject: [PATCH 102/162] Updated the doc for 0.2.4 version. --- Changelog.txt | 16 ++++++++++++++++ Makefile.rule | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 3d6151bb6..1b0a9b01e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,5 +1,21 @@ OpenBLAS ChangeLog ==================================================================== +Version 0.2.4 +8-Oct-2012 +common: + * Upgraded LAPACK to 3.4.2 version. (#145) + * Provided support for passing CFLAGS, FFLAGS, PFLAGS, + FPFLAGS to make. (#137) + * f77blas.h:compatibility for compilers without C99 complex + number support. (#141) +x86/x86-64: + * Added NO_AVX flag. Check OS supporting AVX on runtime. (#139) + * Fixed zdot incompatibility ABI issue with GCC 4.7 on + Windows 32-bit. (#140) +MIPS64: + * Fixed the generation of shared library bug. + * Fixed the detection bug on the Loongson 3A server. +==================================================================== Version 0.2.3 20-Aug-2012 common: diff --git a/Makefile.rule b/Makefile.rule index 37b6c8acc..077bf9267 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.3 +VERSION = 0.2.4 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From 6c5899dff5dc2e6f7fa6415046c1b99b4d6c0bbd Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 9 Oct 2012 14:52:35 +0800 Subject: [PATCH 103/162] Don't use xgetbv instruction when NO_AVX=1 --- README.md | 3 ++- cpuid_x86.c | 7 +++++++ driver/others/dynamic.c | 6 ++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index befc14fc0..ce2688f03 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro ### Set the number of threads on runtime. -We provided the below functions to controll the number of threads on runtime. So far, we didn't support changing the number of threads on Windows. On Windows, these functions are dummy. +We provided the below functions to controll the number of threads on runtime. void goto_set_num_threads(int num_threads); @@ -97,6 +97,7 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version. ## Troubleshooting +* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. * Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. * The number of CPUs/Cores should less than or equal to 256. 
diff --git a/cpuid_x86.c b/cpuid_x86.c index ebbbe3fff..6e4eae20d 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -114,12 +114,15 @@ static inline int have_excpuid(void){ return eax & 0xffff; } +#ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ __asm__ __volatile__ ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } +#endif int support_avx(){ +#ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; @@ -131,8 +134,12 @@ int support_avx(){ } } return ret; +#else + return 0; +#endif } + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 468ab0dc8..5d2bc782f 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -76,12 +76,15 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) +#ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ __asm__ __volatile__ ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } +#endif int support_avx(){ +#ifndef NO_AVX int eax, ebx, ecx, edx; int ret=0; @@ -93,6 +96,9 @@ int support_avx(){ } } return ret; +#else + return 0; +#endif } static int get_vendor(void){ From ca4136cf416157b8e8f5852c348b9bc8f361af4c Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Fri, 12 Oct 2012 23:44:23 +0200 Subject: [PATCH 104/162] Fixed #147: LAPACK symbols were not being exported for version 3.4.2 --- .gitignore | 2 ++ exports/gensymbol | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 118205ca2..aaa1b31ad 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ lapack-3.1.1 lapack-3.1.1.tgz lapack-3.4.1 lapack-3.4.1.tgz +lapack-3.4.2 +lapack-3.4.2.tgz *.so *.a .svn diff --git a/exports/gensymbol b/exports/gensymbol index 64c92d396..c492eefb5 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2669,7 +2669,8 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 @underscore_objs = (@blasobjs, @misc_underscore_objs); -} elsif (-d "../lapack-3.1.1" 
|| -d "../lapack-3.4.0" || -d "../lapack-3.4.1") { +} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" || + -d "../lapack-3.4.2") { @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); } else { @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); From 5c1efa1149c0b18dbb455cd5659d7f655d5a6cf2 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 15 Oct 2012 22:13:37 +0200 Subject: [PATCH 105/162] Fix installation step on Windows (regression from e8306f623a) Since the DLL now has a fixed name there is no need to install a versioned alias too. --- Makefile.install | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index 7f30d6b7c..87730a10c 100644 --- a/Makefile.install +++ b/Makefile.install @@ -71,11 +71,9 @@ ifeq ($(OSNAME), Darwin) endif ifeq ($(OSNAME), WINNT) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll endif ifeq ($(OSNAME), CYGWIN_NT) -cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR) - -ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll endif @echo Install OK! From 0f26a21624ac6d5b3ee2bc913f74edc6bd611ad6 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 15 Oct 2012 22:26:18 +0200 Subject: [PATCH 106/162] Alternative approach to avoid command-line length while archiving lapacke -- Thanks Michel! 
--- patch.for_lapack-3.4.2 | 66 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/patch.for_lapack-3.4.2 b/patch.for_lapack-3.4.2 index f6c85c74b..b1edcb84d 100644 --- a/patch.for_lapack-3.4.2 +++ b/patch.for_lapack-3.4.2 @@ -899,19 +899,64 @@ diff -ruN lapack-3.4.2.old/TESTING/LIN/Makefile lapack-3.4.2/TESTING/LIN/Makefil ../xlintsts: xlintsts mv xlintsts $@ diff -ruN lapack-3.4.2.old/lapacke/src/Makefile lapack-3.4.2/lapacke/src/Makefile ---- lapack-3.4.2.old/lapacke/src/Makefile 2012-04-02 22:16:32 +0200 -+++ lapack-3.4.2/lapacke/src/Makefile 2012-04-22 21:38:38 +0200 -@@ -2041,19 +2041,21 @@ +--- lapack-3.4.2.old/lapacke/src/Makefile 2012-09-21 04:21:29 +0200 ++++ lapack-3.4.2/lapacke/src/Makefile 2012-10-15 22:04:56 +0200 +@@ -34,7 +34,7 @@ + # + include ../../make.inc + +-SRC_OBJ = \ ++CSRC_OBJ = \ + lapacke_cbbcsd.o \ + lapacke_cbbcsd_work.o \ + lapacke_cbdsqr.o \ +@@ -526,7 +526,9 @@ + lapacke_cupgtr.o \ + lapacke_cupgtr_work.o \ + lapacke_cupmtr.o \ +-lapacke_cupmtr_work.o \ ++lapacke_cupmtr_work.o ++ ++DSRC_OBJ = \ + lapacke_dbbcsd.o \ + lapacke_dbbcsd_work.o \ + lapacke_dbdsdc.o \ +@@ -1012,7 +1014,9 @@ + lapacke_dtrttp.o \ + lapacke_dtrttp_work.o \ + lapacke_dtzrzf.o \ +-lapacke_dtzrzf_work.o \ ++lapacke_dtzrzf_work.o ++ ++SSRC_OBJ = \ + lapacke_sbbcsd.o \ + lapacke_sbbcsd_work.o \ + lapacke_sbdsdc.o \ +@@ -1492,7 +1496,9 @@ + lapacke_strttp.o \ + lapacke_strttp_work.o \ + lapacke_stzrzf.o \ +-lapacke_stzrzf_work.o \ ++lapacke_stzrzf_work.o ++ ++ZSRC_OBJ = \ + lapacke_zbbcsd.o \ + lapacke_zbbcsd_work.o \ + lapacke_zbdsqr.o \ +@@ -2041,19 +2047,29 @@ lapacke_zlagsy.o \ lapacke_zlagsy_work.o -ALLOBJ = $(SRC_OBJ) $(MATGEN_OBJ) -+OBJ_FILES := $(SRC_OBJ) ++COBJ_FILES := $(CSRC_OBJ) ++SOBJ_FILES := $(SSRC_OBJ) ++DOBJ_FILES := $(DSRC_OBJ) ++ZOBJ_FILES := $(ZSRC_OBJ) -ifdef USEXBLAS -ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) +ifdef LAPACKE_EXTENDED -+OBJ_FILES += $(SXLASRC) $(DXLASRC) $(CXLASRC) 
$(ZXLASRC) ++OBJ_FILES += $(SRCX_OBJ) endif - @@ -924,9 +969,14 @@ diff -ruN lapack-3.4.2.old/lapacke/src/Makefile lapack-3.4.2/lapacke/src/Makefil -../../$(LAPACKELIB): $(ALLOBJ) $(ALLXOBJ) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ALLOBJ) $(ALLXOBJ) -+../../$(LAPACKELIB): $(OBJ_FILES) -+# http://hackage.haskell.org/trac/gtk2hs/ticket/1146 -+ echo $(OBJ_FILES) | xargs -n 100 $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) ++../../$(LAPACKELIB): $(COBJ_FILES) $(DOBJ_FILES) $(SOBJ_FILES) $(ZOBJ_FILES) $(OBJ_FILES) ++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(COBJ_FILES) ++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(DOBJ_FILES) ++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(SOBJ_FILES) ++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(ZOBJ_FILES) ++ifneq ($(strip $(OBJ_FILES)),) ++ $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $(OBJ_FILES) ++endif $(RANLIB) ../../$(LAPACKELIB) .c.o: From 538c764d2b4b030503c4dd86b786f90dd6e12c4c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 6 Nov 2012 18:21:46 +0800 Subject: [PATCH 107/162] Refs #153. Restore the original CPU affinity when calling openblas_set_num_threads(1). Please read the issue on github.com for the detail. --- driver/others/blas_server.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index f16b827d3..c51e681a5 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -770,6 +770,19 @@ void goto_set_num_threads(int num_threads) { if (num_threads < 1) num_threads = blas_num_threads; +#ifndef NO_AFFINITY + if (num_threads == 1) { + if (blas_cpu_number == 1){ + //OpenBLAS is already single thread. 
+ return; + }else{ + //From multi-threads to single thread + //Restore the original affinity mask + gotoblas_set_affinity(-1); + } + } +#endif + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > blas_num_threads) { @@ -800,6 +813,13 @@ void goto_set_num_threads(int num_threads) { UNLOCK_COMMAND(&server_lock); } +#ifndef NO_AFFINITY + if(blas_cpu_number == 1 && num_threads > 1){ + //Restore the thread 0 affinity. + gotoblas_set_affinity(0); + } +#endif + blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) From 378acfe826e2759b1464bd9f9d7c02e22e14ea05 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 8 Nov 2012 22:08:01 +0800 Subject: [PATCH 108/162] Added NO_SHARED flag to disable generating the shared library. --- Makefile | 2 ++ Makefile.rule | 3 +++ 2 files changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 9a2a8b765..c49a328d9 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,7 @@ endif @echo shared : +ifndef NO_SHARED ifeq ($(OSNAME), Linux) $(MAKE) -C exports so -ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -103,6 +104,7 @@ endif ifeq ($(OSNAME), CYGWIN_NT) $(MAKE) -C exports dll endif +endif tests : ifndef NOFORTRAN diff --git a/Makefile.rule b/Makefile.rule index 077bf9267..f667c1661 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -45,6 +45,9 @@ VERSION = 0.2.4 # automatically detected by the the script. # NUM_THREADS = 24 +# if you don't need generate the shared library, please comment it in. +# NO_SHARED = 1 + # If you don't need CBLAS interface, please comment it in. # NO_CBLAS = 1 From 789f205177b6beee41e56f007bdb2a25cc1a0cb7 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 8 Nov 2012 22:15:04 +0800 Subject: [PATCH 109/162] Improved Makefile.rule for cross compiler. 
--- Makefile.rule | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index f667c1661..debc91dcf 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -24,10 +24,13 @@ VERSION = 0.2.4 # Fortran compiler. Default is g77. # FC = gfortran -# Even you can specify cross compiler +# Even you can specify cross compiler. Meanwhile, please set HOSTCC. # CC = x86_64-w64-mingw32-gcc # FC = x86_64-w64-mingw32-gfortran +# If you use the cross compiler, please set this host compiler. +# HOSTCC = gcc + # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 # BINARY=64 From e85549ee1171e9a4fc28a0a09dec49f87741fa51 Mon Sep 17 00:00:00 2001 From: Alexander Nasonov Date: Sat, 10 Nov 2012 23:20:44 +0000 Subject: [PATCH 110/162] Fix NetBSD build. --- Makefile.system | 2 +- common.h | 5 +++++ exports/Makefile | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 5aaf84609..27f30fa61 100644 --- a/Makefile.system +++ b/Makefile.system @@ -129,7 +129,7 @@ MD5SUM = md5 -r endif ifeq ($(OSNAME), NetBSD) -MD5SUM = md5 -r +MD5SUM = md5 -n endif ifeq ($(OSNAME), Linux) diff --git a/common.h b/common.h index b4dc5deba..003fde77f 100644 --- a/common.h +++ b/common.h @@ -351,7 +351,12 @@ typedef int blasint; #endif #define MMAP_ACCESS (PROT_READ | PROT_WRITE) + +#ifdef __NetBSD__ +#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) +#else #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) +#endif #include "param.h" #include "common_param.h" diff --git a/exports/Makefile b/exports/Makefile index c507032e9..5219560ee 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -119,7 +119,8 @@ so : ../$(LIBSONAME) endif -ifeq ($(OSNAME), FreeBSD) +#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or +ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD)) so : ../$(LIBSONAME) From b45d43d29526591372d0c3eaa05bc7b257c47bd9 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 13 Nov 
2012 00:53:26 +0800 Subject: [PATCH 111/162] Added the patch for lapacke example. --- patch.for_lapack-3.4.2 | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/patch.for_lapack-3.4.2 b/patch.for_lapack-3.4.2 index b1edcb84d..3f7d72ed3 100644 --- a/patch.for_lapack-3.4.2 +++ b/patch.for_lapack-3.4.2 @@ -980,3 +980,21 @@ diff -ruN lapack-3.4.2.old/lapacke/src/Makefile lapack-3.4.2/lapacke/src/Makefil $(RANLIB) ../../$(LAPACKELIB) .c.o: +diff -ruN lapack-3.4.2.old/lapacke/example/Makefile lapack-3.4.2/lapacke/example/Makefile +--- lapack-3.4.2.old/lapacke/example/Makefile 2012-03-23 06:55:22.000000000 +0800 ++++ lapack-3.4.2/lapacke/example/Makefile 2012-11-13 00:32:24.125449952 +0800 +@@ -4,12 +4,12 @@ + + xexample_DGESV_rowmajor: example_DGESV_rowmajor.o ../../$(LAPACKLIB) ../../$(LAPACKELIB) + $(LOADER) $(LOADOPTS) example_DGESV_rowmajor.o \ +- ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(LAPACKELIB) $(CEXTRALIB) -o $@ + ./$@ + + xexample_ZGESV_rowmajor: example_ZGESV_rowmajor.o ../../$(LAPACKLIB) ../../$(LAPACKELIB) + $(LOADER) $(LOADOPTS) example_ZGESV_rowmajor.o \ +- ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(LAPACKELIB) $(CEXTRALIB) -o $@ + ./$@ + + .c.o: From d5717a97eadd4b1bcba99e3a895aba2a8583d4ac Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 13 Nov 2012 00:54:20 +0800 Subject: [PATCH 112/162] Compile lapacke with ILP64 modle when INTERFACE64=1 --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index c49a328d9..39e3bbd65 100644 --- a/Makefile +++ b/Makefile @@ -224,7 +224,11 @@ ifndef NOFORTRAN -@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc +ifdef INTERFACE64 + -@echo "CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "CFLAGS = $(CFLAGS)" >> 
$(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc From 6751f7b9a793e9080396b1fec1739953017e3b8c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 13 Nov 2012 15:48:57 +0800 Subject: [PATCH 113/162] Fixed #157. Only detect the number of physical CPU cores on Mac OSX. --- driver/others/memory.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d897fe7e0..2070adf5d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) int get_num_procs(void) { @@ -206,6 +206,18 @@ int get_num_procs(void) { #endif +#if defined(OS_DARWIN) +int get_num_procs(void) { + static int nums = 0; + size_t len; + if (nums == 0){ + len = sizeof(int); + sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); + } + return nums; +} +#endif + /* OpenBLAS uses the numbers of CPU cores in multithreading. It can be set by openblas_set_num_threads(int num_threads); From 01e3c984cef12d881e4cb535f0d4f0045b8b2ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Villemot?= Date: Wed, 14 Nov 2012 21:04:05 +0100 Subject: [PATCH 114/162] Fix compilation with TARGET=GENERIC Patch applied to Debian package --- param.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/param.h b/param.h index c6cd354be..11c1a269e 100644 --- a/param.h +++ b/param.h @@ -1664,26 +1664,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define XGEMM_DEFAULT_UNROLL_M 1 #endif -#define SGEMM_P sgemm_p -#define DGEMM_P dgemm_p -#define QGEMM_P qgemm_p -#define CGEMM_P cgemm_p -#define ZGEMM_P zgemm_p -#define XGEMM_P xgemm_p - -#define SGEMM_R sgemm_r -#define DGEMM_R dgemm_r -#define QGEMM_R qgemm_r -#define CGEMM_R cgemm_r -#define ZGEMM_R zgemm_r -#define XGEMM_R xgemm_r - -#define SGEMM_Q 128 -#define DGEMM_Q 128 -#define QGEMM_Q 128 -#define CGEMM_Q 128 -#define ZGEMM_Q 128 -#define XGEMM_Q 128 +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 #define SYMV_P 16 From 5f0117385e1d4f986ad75fa66b873b014a7792c2 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 19 Nov 2012 22:32:27 +0800 Subject: [PATCH 115/162] Refs #154. Fixed a SEGFAULT bug in dgemv_t when m is very large. It overflowed the internal buffer. Thus, we split vector x into blocks when m is very large. Thanks to @wangqian for this patch.
--- kernel/x86_64/dgemv_t.S | 69 ++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S index 071920723..02601be0a 100644 --- a/kernel/x86_64/dgemv_t.S +++ b/kernel/x86_64/dgemv_t.S @@ -47,7 +47,7 @@ #ifndef WINDOWS_ABI -#define STACKSIZE 64 +#define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi @@ -57,7 +57,10 @@ #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) - +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) #else #define STACKSIZE 256 @@ -132,12 +135,44 @@ movq OLD_LDA, LDA movq OLD_X, X #else - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX +#endif +#ifdef HAVE_SSE3 +#ifndef WINDOWS_ABI + movddup %xmm0, ALPHA +#else + movddup %xmm3, ALPHA #endif +#else +#ifndef WINDOWS_ABI + movapd %xmm0, ALPHA +#else + movapd %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA +#endif + + +.L0x: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00: + movq LDAX,LDA + movq NN,N + movq AA,A movq STACK_INCX, INCX movq STACK_Y, Y movq STACK_INCY, INCY @@ -153,21 +188,6 @@ subq $-16 * SIZE, A -#ifdef HAVE_SSE3 -#ifndef WINDOWS_ABI - movddup %xmm0, ALPHA -#else - movddup %xmm3, ALPHA -#endif -#else -#ifndef WINDOWS_ABI - movapd %xmm0, ALPHA -#else - movapd %xmm3, ALPHA -#endif - unpcklpd ALPHA, ALPHA -#endif - testq M, M jle .L999 testq N, N @@ -854,7 +874,6 @@ .L21: #endif - subq $4, N leaq 16 * SIZE(BUFFER), X1 @@ -2461,6 +2480,12 @@ ALIGN_4 .L999: + leaq (, M, SIZE), %rax + addq %rax,AA + jmp .L0x; + ALIGN_4 + +.L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 From 2345bdec6850677dc363d2e1edbc3aa818375d1c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 26 Nov 2012 17:32:25 +0800 Subject: 
[PATCH 116/162] Update the doc for 0.2.5 version. --- Changelog.txt | 16 ++++++++++++++++ Makefile.rule | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 1b0a9b01e..db0732c4f 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,20 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.5 +26-Nov-2012 +common: + * Added NO_SHARED flag to disable generating the shared library. + * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) + * Export LAPACK 3.4.2 symbols in shared library. (#147) + * Only detect the number of physical CPU cores on Mac OSX. (#157) + * Fixed NetBSD build. (#155) + * Fixed compilation with TARGET=GENERIC. (#160) +x86/x86-64: + * Restore the original CPU affinity when calling + openblas_set_num_threads(1) (#153) + * Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154) +MIPS64: + ==================================================================== Version 0.2.4 8-Oct-2012 diff --git a/Makefile.rule b/Makefile.rule index debc91dcf..1240ab0ad 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.4 +VERSION = 0.2.5 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From cea1a885b5cd38bea67feb6437ef0c3622a96c58 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 27 Nov 2012 07:24:04 +0800 Subject: [PATCH 117/162] Refs #154. Fixed the build bug of dgemv_t on MinGW64.
--- kernel/x86_64/dgemv_t.S | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S index 02601be0a..48b3f17c4 100644 --- a/kernel/x86_64/dgemv_t.S +++ b/kernel/x86_64/dgemv_t.S @@ -74,6 +74,11 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) +//Temp variables for M,N,A,LDA +#define MMM 224(%rsp) +#define NN 232(%rsp) +#define AA 240(%rsp) +#define LDAX 248(%rsp) #endif @@ -134,6 +139,12 @@ movq OLD_A, A movq OLD_LDA, LDA movq OLD_X, X + + movq M, MMM + movq N, NN + movq A, AA + movq LDA, LDAX + #else movq OLD_M, MMM movq OLD_N, NN From 7110d171468cf8fe11463c3a3bcd5cc4cef54868 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 28 Nov 2012 12:52:28 +0800 Subject: [PATCH 118/162] Added -lgomp for generating DLL on Windows. --- exports/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 5219560ee..15041be86 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) EXTRALIB += -lgfortran endif +ifeq ($(USE_OPENMP), 1) +ifeq ($(C_COMPILER), GCC) +EXTRALIB += -lgomp +endif +endif endif ifeq ($(OSNAME), CYGWIN_NT) From b7c0fa6bd223085f1b7ddade0bef487bd5c15688 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 6 Dec 2012 07:29:54 -0500 Subject: [PATCH 119/162] Init AMD Bulldozer codebase. 
--- Makefile.system | 4 +-- cpuid.h | 3 +- cpuid_x86.c | 20 +++++++++-- driver/others/dynamic.c | 12 +++++++ getarch.c | 18 +++++++++- kernel/x86/KERNEL.BULLDOZER | 59 ++++++++++++++++++++++++++++++++ kernel/x86_64/KERNEL.BULLDOZER | 62 ++++++++++++++++++++++++++++++++++ param.h | 2 +- 8 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 kernel/x86/KERNEL.BULLDOZER create mode 100644 kernel/x86_64/KERNEL.BULLDOZER diff --git a/Makefile.system b/Makefile.system index 27f30fa61..75c0e0ad4 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER endif endif diff --git a/cpuid.h b/cpuid.h index bb57ad92d..c52d503cc 100644 --- a/cpuid.h +++ b/cpuid.h @@ -125,7 +125,8 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) -#define HAVE_AVX (1 << 18) +#define HAVE_AVX (1 << 18) +#define HAVE_FMA4 (1 << 19) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index 6e4eae20d..afc3b17b7 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -43,6 +43,8 @@ #ifdef NO_AVX #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM +#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA +#define CORE_BULLDOZER CORE_BARCELONA #endif #ifndef CPUIDEMU @@ -228,6 +230,9 @@ int get_cputype(int gettype){ cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; +#ifndef NO_AVX + if ((ecx & (1 << 16)) 
!= 0) feature |= HAVE_FMA4; +#endif if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; } @@ -1075,8 +1080,12 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 10: - case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series return CPUTYPE_BARCELONA; + case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CPUTYPE_BULLDOZER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. case 5: return CPUTYPE_BOBCAT; } @@ -1427,8 +1436,13 @@ int get_coretype(void){ if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else if (exfamily == 5) return CORE_BOBCAT; - else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series - else return CORE_BARCELONA; + else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CORE_BULLDOZER; + else + return CORE_BARCELONA; //OS don't support AVX. Use old kernels. + }else return CORE_BARCELONA; } } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5d2bc782f..1c0e1d3bb 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA #endif @@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){ else return &gotoblas_OPTERON; } else if (exfamily == 5) { return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. 
Use Barcelona kernels.\n"); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } } else { return &gotoblas_BARCELONA; } @@ -238,6 +248,7 @@ static char *corename[] = { "Nano", "Sandybridge", "Bobcat", + "Bulldozer", }; char *gotoblas_corename(void) { @@ -259,6 +270,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; return corename[0]; } diff --git a/getarch.c b/getarch.c index 5916a9a04..4daf260f0 100644 --- a/getarch.c +++ b/getarch.c @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "OPTERON" #endif -#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) +#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "BOBCAT" #endif +#if defined (FORCE_BULLDOZER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BULLDOZER" +#define ARCHCONFIG "-DBARCELONA " \ + "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ + "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" \ + "-DHAVE_AVX -DHAVE_FMA4" +#define LIBNAME "bulldozer" +#define CORENAME "BULLDOZER" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BULLDOZER @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c 
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER new file mode 100644 index 000000000..051a52286 --- /dev/null +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/param.h b/param.h index 11c1a269e..5b6a19ad5 100644 --- a/param.h +++ b/param.h @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define SNUMOPT 8 #define DNUMOPT 4 From bfaaa975e6789acbce20384d01bd34b122832d18 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 7 Dec 2012 00:53:31 +0800 Subject: [PATCH 120/162] Added BULLDOZER target. So far it uses barcelona kernels. --- TargetList.txt | 1 + driver/others/parameter.c | 2 +- getarch.c | 4 +-- kernel/setparam-ref.c | 16 ++++++++++ kernel/x86/gemm_kernel_4x4_barcelona.S | 20 ++++++------ kernel/x86/scal_sse.S | 2 +- kernel/x86/scal_sse2.S | 2 +- kernel/x86/trsm_kernel_LN_2x4_sse2.S | 10 +++--- kernel/x86/trsm_kernel_LN_4x4_sse.S | 22 ++++++------- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 10 +++--- kernel/x86/trsm_kernel_LT_4x4_sse.S | 22 ++++++------- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 10 +++--- kernel/x86/trsm_kernel_RT_4x4_sse.S | 22 ++++++------- kernel/x86/zgemm3m_kernel_4x4_barcelona.S | 22 ++++++------- kernel/x86/zgemv_n_sse.S | 2 +- kernel/x86/zgemv_n_sse2.S | 2 +- kernel/x86/zgemv_t_sse.S | 2 +- kernel/x86/zgemv_t_sse2.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 4 +-- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 4 +-- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 4 +-- kernel/x86_64/gemm_kernel_8x4_barcelona.S | 38 +++++++++++------------ kernel/x86_64/gemm_ncopy_4_opteron.S | 4 +-- kernel/x86_64/gemm_tcopy_4_opteron.S | 4 +-- kernel/x86_64/izamax_sse2.S | 2 +- kernel/x86_64/scal_sse.S | 2 +- kernel/x86_64/scal_sse2.S | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/trsm_kernel_LN_8x4_sse.S | 2 +- kernel/x86_64/trsm_kernel_LT_8x4_sse.S | 2 +- kernel/x86_64/trsm_kernel_RT_8x4_sse.S | 2 +- kernel/x86_64/zgemm_ncopy_2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 4 +-- kernel/x86_64/zsymv_U_sse.S | 4 +-- kernel/x86_64/zsymv_U_sse2.S | 4 +-- 
kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 2 +- l1param.h | 7 +++++ l2param.h | 2 +- 47 files changed, 156 insertions(+), 132 deletions(-) diff --git a/TargetList.txt b/TargetList.txt index 1a212e6ca..c859db082 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -29,6 +29,7 @@ BARCELONA SHANGHAI ISTANBUL BOBCAT +BULLDOZER c)VIA CPU: SSE_GENERIC diff --git a/driver/others/parameter.c b/driver/others/parameter.c index d261e5a4e..58e5fb11d 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) diff --git a/getarch.c b/getarch.c index 4daf260f0..2b9856338 100644 --- a/getarch.c +++ b/getarch.c @@ -385,12 +385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "BULLDOZER" -#define ARCHCONFIG "-DBARCELONA " \ +#define ARCHCONFIG "-DBULLDOZER " \ "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ - "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ "-DHAVE_AVX -DHAVE_FMA4" #define LIBNAME "bulldozer" #define CORENAME "BULLDOZER" diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f57b425e6..e8db76871 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -810,6 +810,22 @@ static void init_parameter(void) { #endif #endif +#ifdef BULLDOZER + +#ifdef DEBUG + fprintf(stderr, "Bulldozer\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S index 18b9a43bd..f081aec2a 100644 --- a/kernel/x86/gemm_kernel_4x4_barcelona.S +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -596,7 +596,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -842,7 +842,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1168,7 +1168,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if 
defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1198,7 +1198,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1347,7 +1347,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1531,7 +1531,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1778,7 +1778,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1793,7 +1793,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1924,7 +1924,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S index aa5ab760e..48edfc585 100644 --- a/kernel/x86/scal_sse.S +++ b/kernel/x86/scal_sse.S @@ -269,7 +269,7 @@ sarl $5, I jle .L113 -#if defined(BARCELONA) +#if 
defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S index 67c1f437b..35b79132c 100644 --- a/kernel/x86/scal_sse2.S +++ b/kernel/x86/scal_sse2.S @@ -253,7 +253,7 @@ sarl $4, I jle .L113 -#if defined(BARCELONA) +#if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index 2b6877a31..036e17338 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) 
* SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 82bb1d3ec..84da443a8 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || 
defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index d81177b7e..0bd924cba 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || 
defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 854c44e7a..de7c04593 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * 
SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index f7a08c699..f5d5ad465 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 80dc2451c..5c2dcd0d6 100644 --- 
a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 
(PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S index 29158df25..623f0beec 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -74,7 +74,7 @@ #define BB %ecx #define LDC %ebp -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define movsd movlps #endif @@ -625,7 +625,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -870,7 +870,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE 
+ 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1173,7 +1173,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1203,7 +1203,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1359,7 +1359,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1536,7 +1536,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1794,7 +1794,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1809,7 +1809,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1936,7 +1936,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 
8e28bb8e6..0087ac6f4 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 607c51de0..f0f2dc0ec 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index fb98226ee..c7ad91235 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index e2f391a82..6c4842893 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index ee9eb9d25..d32451574 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || 
defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 9ef572470..9f9449852 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index cd1bf2f53..dd0c5ab21 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S index b40c8bac7..becd19544 100644 --- 
a/kernel/x86_64/gemm_kernel_8x4_barcelona.S +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -930,7 +930,7 @@ .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 @@ -983,7 +983,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 @@ -1178,7 +1178,7 @@ .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -1423,7 +1423,7 @@ .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -1765,7 +1765,7 @@ .L62: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 @@ -1793,7 +1793,7 @@ addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -1822,7 +1822,7 @@ addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || 
defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 @@ -1851,7 +1851,7 @@ addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 @@ -2024,7 +2024,7 @@ .L72: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2208,7 +2208,7 @@ .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -2395,7 +2395,7 @@ .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -2670,7 +2670,7 @@ .L112: mulps %xmm9, %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2687,7 +2687,7 @@ addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 @@ -2704,7 +2704,7 @@ addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH 
(PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 @@ -2721,7 +2721,7 @@ addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 @@ -2857,7 +2857,7 @@ .L122: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 @@ -2873,7 +2873,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -3003,7 +3003,7 @@ .L132: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 @@ -3150,7 +3150,7 @@ .L142: mulss %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S index edde7e2c1..e5cbd62eb 100644 --- a/kernel/x86_64/gemm_ncopy_4_opteron.S +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (48 + 4) #define MOVNTQ MOVQ @@ -79,7 +79,7 @@ #define AO3 %r13 #define AO4 %rax -#if defined(BARCELONA) || 
defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S index 459eeb8c5..105fe3b47 100644 --- a/kernel/x86_64/gemm_tcopy_4_opteron.S +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4) #define MOVNTQ MOVQ @@ -96,7 +96,7 @@ #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S index 4e66e5338..404608256 100644 --- a/kernel/x86_64/izamax_sse2.S +++ b/kernel/x86_64/izamax_sse2.S @@ -469,7 +469,7 @@ ALIGN_4 .L71: -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) prefetch PREFETCHSIZE * SIZE(X) #endif diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S index 323e8b9dd..9c8dd9dc2 100644 --- a/kernel/x86_64/scal_sse.S +++ b/kernel/x86_64/scal_sse.S @@ -266,7 +266,7 @@ sarq $5, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index 8f5612081..3823b1fc9 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -251,7 +251,7 @@ sarq $4, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 
5a123d7f6..0f1ebd564 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 8afdc87db..9dd123c52 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 5aef6b461..93a66aaa7 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index fa1bfba85..f412b3e2f 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S index 513572ee9..552dbacdc 100644 --- a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 
#endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S index 526a78c57..7727fd591 100644 --- a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S index e96496fd6..699364941 100644 --- a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S index bf318b7ff..8876b61ff 100644 --- a/kernel/x86_64/zgemm_ncopy_2.S +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -85,7 +85,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 #endif diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 6af65a4ba..dcfe83189 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define MOVDDUP(a, 
b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 71aca0198..04605e3cb 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -167,7 +167,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 4b8422d82..e8b01ad7a 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define xt1 %xmm14 #define xt2 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 33667f79e..40246e52e 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if 
defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index b8caa9a44..79f20b641 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index 2db8cbc5d..f5c100ec1 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index 16c9ca828..18edeed57 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || 
defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index dbdbfe2e1..f58cecdf5 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 181cdd29c..1b589e0cf 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index c28d02927..2c47ce3fd 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/l1param.h b/l1param.h index 6fe756f17..0b216c7c5 100644 --- a/l1param.h +++ 
b/l1param.h @@ -74,6 +74,13 @@ #define ALIGNED_ACCESS #endif +#ifdef BULLDOZER +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/l2param.h b/l2param.h index cdbd8805e..01fe7943d 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps From f19af5ecc05080cd8de729490347e796b6a7af89 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 7 Dec 2012 00:58:03 +0800 Subject: [PATCH 121/162] Refs #54. Added AMD Bulldozer x86_64 dgemm kernel developed by Werner Saar Based on the dgemm kernel for AMD Barcelona, he used AVX and FMA4 instructions. Thank Werner Saar! --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/dgemm_kernel_4x4_bulldozer.S | 1860 ++++++++++++++++++++ 2 files changed, 1861 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_kernel_4x4_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 051a52286..d59668519 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,7 +10,7 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4_opteron.S diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S new file mode 100644 index 000000000..b06b07edf --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -0,0 +1,1860 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define movapd movaps +#define movupd movups + +#define KERNEL1(xx) \ + vfmaddpd %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\ + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\ + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm3,%xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\ +/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + 
vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ +/**/ vmovddup (BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4, %xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4, %xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5 ,%xmm12;\ +/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ +/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ + vmovaps %xmm2, %xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, 
%xmm6,%xmm11 ;\ + vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ + vmovaps %xmm2, %xmm6 ;\ + vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ +/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ +/**/ vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ + vmovaps %xmm2, %xmm7 ;\ + vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ + vmovaps %xmm2, %xmm7 ;\ + vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\ +/*A*/ vmovups 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ + vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 
;\ +/**/ vmovddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + vfmaddpd %xmm8, %xmm1, %xmm0,%xmm8 ;\ + vmovapd %xmm2, %xmm0 ;\ + vmovups -14 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12, %xmm2, %xmm1,%xmm12 ;\ + vmovddup -14 * SIZE(BO), %xmm1 ;\ + vfmaddpd %xmm9, %xmm3, %xmm0,%xmm9 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup -13 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10, %xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1 ,%xmm14 ;\ + vfmaddpd %xmm11, %xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15, %xmm2, %xmm3,%xmm15 ;\ + vmovups -12 * SIZE(AO), %xmm0 ;\ + vmovddup -12 * SIZE(BO), %xmm1 ;\ + vmovddup -11 * SIZE(BO), %xmm3 ;\ + vmovapd %xmm0, %xmm2 + + +#define KERNEL_SUB2(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -10 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -10 * SIZE(BO), %xmm1 ;\ + vmovddup -9 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups (AO), %xmm0 ;\ + vmovddup (BO), %xmm1 ;\ + vmovddup -7 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -6 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -6 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -5 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vmovups -4 * SIZE(AO), %xmm4 ;\ + vmovddup -4 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + 
vmovddup -3 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -2 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -2 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -1 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 1 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 2), CO2 # coffset2 = c + 2 * ldc + + leaq (C, LDC, 4), C # c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO # aoffset = a + + movq K, %rax + salq 
$BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + + .align 16 +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + vzeroall + prefetcht0 256(CO1) + prefetcht0 320(CO1) + prefetcht0 256(CO2) + prefetcht0 320(CO2) + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm3 + vmovups -8 * SIZE(AO), %xmm4 + vmovddup -8 * SIZE(BO), %xmm5 + + vmovaps %xmm0, %xmm2 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + + .align 16 +.L12: + prefetcht0 (AO,%rax,4) + prefetcht0 (BO,%rax,4) + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je 
.L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: + vmovddup ALPHA, %xmm7 # xmm7 = alpha in both lanes + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax # test bit 2 of the k count + je .L16 + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # rax &= 3 (leftover k iterations) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vmovaps %xmm2, %xmm0 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + // prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12,%xmm12 + .align 2 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm7, %xmm13,%xmm13 + .align 2 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd 2 * SIZE(CO2),%xmm7, 
%xmm14,%xmm14 + .align 2 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + vfmaddpd 2 * SIZE(CO2, LDC),%xmm7, %xmm15,%xmm15 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm13,%xmm13 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm14,%xmm14 + vmulpd %xmm7, %xmm11,%xmm11 + vmulpd %xmm7, %xmm15,%xmm15 + +#endif + + .align 2 + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + .align 2 + vmovups %xmm9, (CO1, LDC) + vmovups %xmm13, 2 * SIZE(CO1, LDC) + .align 2 + vmovups %xmm10, (CO2) + vmovups %xmm14, 2 * SIZE(CO2) + .align 2 + vmovups %xmm11, (CO2, LDC) + vmovups %xmm15, 2 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9 ,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10, %xmm10 + vmovddup -15 * SIZE(BO), %xmm5 + vxorps %xmm11, %xmm11, %xmm11 + vmovddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq 
%rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -9 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovddup (BO, %rax, 4), %xmm1 + vmovddup -7 * SIZE(BO, %rax, 4), %xmm5 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -6 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -5 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -3 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -2 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -1 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovddup 8 * SIZE(BO, %rax, 4), %xmm3 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + vmovddup ALPHA, %xmm7 # xmm7 = alpha in both lanes + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # rax &= 3 (leftover k iterations) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + 
vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm11,%xmm11 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO1, LDC) + + vmovups %xmm10, (CO2) + vmovups %xmm11, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9, %xmm9 + vmovddup -15 * SIZE(AO), %xmm4 + vxorps %xmm10, %xmm10,%xmm10 + vmovups -16 * SIZE(BO), %xmm1 + vxorps %xmm11, %xmm11,%xmm11 + vmovups -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), 
%rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + vfmaddpd %xmm10,%xmm4, %xmm1,%xmm10 + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 4), %xmm4,%xmm11 + vmovups (BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm4 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,-6 * SIZE(BO, %rax, 4), %xmm2,%xmm9 + vmovups -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -13 * SIZE(AO, %rax, 1), %xmm2 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,-2 * SIZE(BO, %rax, 4), %xmm2,%xmm11 + vmovups 8 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + vmovddup ALPHA, %xmm7 # xmm7 = alpha in both lanes + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # rax &= 3 (leftover k iterations) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO1, LDC), %xmm0,%xmm0 + vmovsd (CO2), %xmm1 + vmovhpd (CO2, LDC), %xmm1,%xmm1 + + + vfmaddpd %xmm0, %xmm7,%xmm8,%xmm8 + vfmaddpd %xmm1, %xmm7,%xmm9,%xmm9 +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO1, LDC) + vmovsd %xmm9, (CO2) + vmovhpd %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 
4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm5 + vmovddup -12 * SIZE(BO), %xmm3 + vxorps %xmm8, %xmm8,%xmm8 + vxorps %xmm9, %xmm9,%xmm9 + vxorps %xmm12, %xmm12,%xmm12 + vxorps %xmm13, %xmm13,%xmm13 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -8 * SIZE(AO), %xmm4 + // prefetcht0 256(CO1) + // prefetcht0 320(CO1) + // prefetcht0 256(CO2) + // prefetcht0 320(CO2) + // prefetchnta 24 * SIZE(CO1) + // prefetchnta 32 * SIZE(CO1) + // prefetchw 3 * SIZE(CO1) + vmovups %xmm0, %xmm2 + // prefetchw 3 * SIZE(CO2) + // prefetchnta -16 * SIZE(BB) + // prefetch -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, 
%xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovups -10 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vmovups (AO, %rax, 4), %xmm0 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm5 + vmovups -6 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm5 + vmovups -2 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovups 8 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -4 * SIZE(BO, %rax, 2), %xmm3 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + vmovddup ALPHA, %xmm7 # xmm7 = alpha in both lanes + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # rax &= 3 (leftover k iterations) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#ifndef TRMMKERNEL + vfmaddpd (CO1),%xmm7, %xmm8, %xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12, %xmm12 + 
vfmaddpd (CO2),%xmm7, %xmm9, %xmm9 + vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm13, %xmm13 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm13,%xmm13 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + + vmovups %xmm9, (CO2) + vmovups %xmm13, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -15 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + 
vmovddup -12 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm8,%xmm2, %xmm1,%xmm8 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm2, %xmm3,%xmm9 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm2, %xmm1,%xmm10 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm2, %xmm3,%xmm11 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + vmovddup ALPHA, %xmm7 # xmm7 = alpha in both lanes + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # rax &= 3 (leftover k iterations) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO2),%xmm7, %xmm9,%xmm9 + +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + 
movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup -15 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(AO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(BO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + vmovddup ALPHA, %xmm7 # xmm7 = alpha in both lanes + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # rax &= 3 (leftover k iterations) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO2), %xmm0,%xmm0 +#endif + + vmulpd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddpd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -8 * SIZE(AO), %xmm2 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm12, %xmm12,%xmm12 + vmovddup -14 * SIZE(BO), %xmm3 + vxorps %xmm13, %xmm13,%xmm13 + vmovddup -15 * SIZE(BO), %xmm5 + + // prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovapd -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm1 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vfmaddpd %xmm13,-10 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd (AO, %rax, 4), %xmm0 + vmovddup -13 
* SIZE(BO, %rax, 1), %xmm5 + vfmaddpd %xmm8,%xmm3, %xmm2,%xmm8 + vfmaddpd %xmm12,-6 * SIZE(AO, %rax, 4), %xmm3,%xmm12 + vmovapd -4 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm3 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vfmaddpd %xmm13,-2 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd 8 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm13, %xmm12,%xmm12 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7,%xmm12,%xmm12 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup 
-15 * SIZE(BO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(BO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(AO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(AO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(AO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + vmovddup -16 * SIZE(BO, %rax, 1), %xmm0 + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 +#else + vmulpd %xmm7, %xmm8,%xmm8 + +#endif + + vmovups %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 
* SIZE, CO1 # coffset += 4 + + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + movups -14 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 1), %xmm0,%xmm8 + vmovups -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 1), %xmm1,%xmm9 + vmovups -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + vmulsd -16 * SIZE(BO, %rax, 1), %xmm0,%xmm0 + vaddsd %xmm0, %xmm8,%xmm8 + vmovsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + vaddpd %xmm9, %xmm8,%xmm8 + vhaddpd %xmm8, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 +#endif + + vmulsd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddsd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), 
%rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE From bb10cb8442f84e76d1140f58b37fa1edc3393972 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Dec 2012 11:51:39 +0800 Subject: [PATCH 122/162] Refs #165. fall back of DTB_DEFAULT_ENTRIES for some virtual machines. --- cpuid_x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 6e4eae20d..2ffc5f1d5 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1494,6 +1494,9 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); + } else { + //fall back for some virtual machines. + printf("#define DTB_DEFAULT_ENTRIES 32\n"); } features = get_cputype(GET_FEATURE); From bdf8d9411e2e3698c7462a46e813c148c0e8aa98 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Dec 2012 15:49:01 +0800 Subject: [PATCH 123/162] Refs #163. Obtain the build configure on runtime. openblas_get_config function returns the configure string. So far, it supports USE64BITINT, NO_CBLAS, NO_LAPACK, NO_LAPACKE, DYNAMIC_ARCH, NO_AFFINITY. 
Example: #include <stdio.h> extern char * openblas_get_config(); void main() { printf("%s\n",openblas_get_config()); return; } --- cblas.h | 3 ++ driver/others/Makefile | 5 ++- driver/others/openblas_get_config.c | 59 +++++++++++++++++++++++++++++ exports/gensymbol | 1 + 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 driver/others/openblas_get_config.c diff --git a/cblas.h b/cblas.h index ee8bf08b2..e9664fe79 100644 --- a/cblas.h +++ b/cblas.h @@ -13,6 +13,9 @@ extern "C" { void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +/*Get the build configure on runtime.*/ +char* openblas_get_config(void); + #define CBLAS_INDEX size_t enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; diff --git a/driver/others/Makefile b/driver/others/Makefile index a1c7a504e..c449ec6c6 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_get_config.$(SUFFIX) : openblas_get_config.c + $(CC) $(CFLAGS) -c $< -o $(@F) + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c new file mode 100644 index 000000000..581ab1a43 --- /dev/null +++ b/driver/others/openblas_get_config.c @@ -0,0 +1,59 @@ +/***************************************************************************** +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +static char* openblas_config_str="" +#ifdef USE64BITINT + "USE64BITINT " +#endif +#ifdef NO_CBLAS + "NO_CBLAS " +#endif +#ifdef NO_LAPACK + "NO_LAPACK " +#endif +#ifdef NO_LAPACKE + "NO_LAPACKE " +#endif +#ifdef DYNAMIC_ARCH + "DYNAMIC_ARCH " +#endif +#ifdef NO_AFFINITY + "NO_AFFINITY " +#endif + ; + +char* CNAME() { + return openblas_config_str; +} + diff --git a/exports/gensymbol b/exports/gensymbol index c492eefb5..04cbd7d84 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -74,6 +74,7 @@ @misc_no_underscore_objs = ( openblas_set_num_threads, goto_set_num_threads, + openblas_get_config, ); @misc_underscore_objs = ( From 13f8fc0b1a4f843f0b77913caeb791cb15a3ae3c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 11 Dec 2012 10:55:10 +0100 Subject: [PATCH 124/162] Write FMA4 flag to the configure file. 
--- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index afc3b17b7..385114619 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1525,6 +1525,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); + if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); @@ -1591,5 +1592,6 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); + if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); } From 1138817dd2e38293f59e12c4fc7eb54a1a882c91 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Sat, 15 Dec 2012 13:29:46 +0100 Subject: [PATCH 125/162] add a sanity check on the detected cpu type if we have 64 bit pointers we can't have a 32 bit cpu, so fall back to the 64bit cpu fallback (prescott) E.g. 
the cpu detection fails in amd qemu64 emulation (family 6 model 2) causing it to use the uninitialized gotoblas_ATHLON --- driver/others/dynamic.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5d2bc782f..23de095ca 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -273,6 +273,15 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ + if (sizeof(void*) == 8) { + if (gotoblas == &gotoblas_KATMAI || + gotoblas == &gotoblas_COPPERMINE || + gotoblas == &gotoblas_NORTHWOOD || + gotoblas == &gotoblas_BANIAS || + gotoblas == &gotoblas_ATHLON) + gotoblas = &gotoblas_PRESCOTT; + } #endif if (gotoblas && gotoblas -> init) { From 9fb341a9f8d94e4d532d51b1216d92e74a67a569 Mon Sep 17 00:00:00 2001 From: Julian Taylor Date: Sat, 15 Dec 2012 16:05:33 +0100 Subject: [PATCH 126/162] set parameters for CORE_ATHLON else dgemm_p is set to zero leading to a segfault in alloc_mmap due to allocsize being zero --- kernel/setparam-ref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f57b425e6..4deabdacb 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -634,10 +634,10 @@ static void init_parameter(void) { TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; #endif -#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) #ifdef DEBUG - fprintf(stderr, "Katmai, Coppermine, Banias\n"); + fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif TABLE_NAME.sgemm_p = 64 * (l2 >> 7); From a4ee6f3915758e9272ec9f33206e86d7059bcd1e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 18 Dec 2012 08:57:46 
+0800 Subject: [PATCH 127/162] Fixed #172. Support Intel Xeon E7540. --- cpuid_x86.c | 4 ++++ driver/others/dynamic.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 2ffc5f1d5..b3352244b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1030,6 +1030,8 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 14: + // Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1398,6 +1400,8 @@ int get_coretype(void){ return CORE_SANDYBRIDGE; else return CORE_NEHALEM; //OS doesn't support AVX + case 14: + //Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 23de095ca..28fdd30d8 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -163,7 +163,8 @@ static gotoblas_t *get_coretype(void){ //Intel Xeon Processor 5600 (Westmere-EP) //Xeon Processor E7 (Westmere-EX) - if (model == 12 || model == 15) return &gotoblas_NEHALEM; + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; //Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 From fd3046b32a1f7049fcb2bfb255d72e4204e5522e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 23 Dec 2012 21:47:22 +0800 Subject: [PATCH 128/162] Refs #173. Fixed overflow internal buffer bug of gemv_t on x86. 
--- kernel/x86/gemv_t_sse.S | 69 +++++++++++++++++++++++++++++++------- kernel/x86/gemv_t_sse2.S | 71 +++++++++++++++++++++++++++++++++------- kernel/x86_64/sgemv_t.S | 61 +++++++++++++++++++++++++--------- 3 files changed, 164 insertions(+), 37 deletions(-) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 5bacb7da8..c72febe3d 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -89,17 +89,23 @@ #endif #define STACKSIZE 16 - -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define NN 4+STACKSIZE(%esp) +#define AA 8+STACKSIZE(%esp) +#define LDAX 12+STACKSIZE(%esp) #define I %eax #define J %ebx @@ -114,6 +120,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -122,6 +129,37 @@ PROFCODE movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $23,J # J=2^22 + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA + movl STACK_X, X movl 
STACK_INCX, INCX movl STACK_INCY, INCY @@ -628,10 +666,19 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index c7e685dd8..d46d7e43e 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -76,18 +76,24 @@ #endif #define STACKSIZE 16 +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define AA 4+STACKSIZE(%esp) +#define LDAX 8+STACKSIZE(%esp) +#define NN 12+STACKSIZE(%esp) -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) - #define I %eax #define J %ebx @@ -101,6 +107,8 @@ PROLOGUE + subl $ARGS,%esp + pushl %ebp pushl %edi pushl %esi @@ -108,7 +116,38 @@ PROFCODE + movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $22,J # J=2^22 + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA movl STACK_X, X movl STACK_INCX, INCX movl 
STACK_INCY, INCY @@ -117,6 +156,7 @@ leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA + subl $-16 * SIZE, A cmpl $0, N @@ -560,10 +600,19 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index 052ff1a79..06970a055 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -47,7 +47,7 @@ #ifndef WINDOWS_ABI -#define STACKSIZE 64 +#define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi @@ -57,6 +57,10 @@ #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) #else @@ -71,6 +75,10 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) +#defien MMM 216(%rsp) +#defien NN 224(%rsp) +#define AA 232(%rsp) +#define LDAX 240(%rsp) #endif @@ -127,29 +135,46 @@ movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX movq OLD_X, X #else - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX #endif - - movq STACK_INCX, INCX - movq STACK_Y, Y - movq STACK_INCY, INCY - movq STACK_BUFFER, BUFFER - #ifndef WINDOWS_ABI pshufd $0, %xmm0, ALPHA #else pshufd $0, %xmm3, ALPHA #endif + +.L0t: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00t + ALIGN_4 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00t: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA @@ -6341,6 +6366,12 @@ 
ALIGN_4 .L999: + leaq (,M,SIZE),%rax + addq %rax,AA + jmp .L0t + ALIGN_4 + +.L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 From 91ed4e4450ceabd71493e0bf80e7455df414bebf Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 23 Dec 2012 23:14:17 +0800 Subject: [PATCH 129/162] Refs #171. Prevent loading the dirty number from the buffer in sgemv_t x86 kernel. --- kernel/x86/gemv_t_sse.S | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index c72febe3d..42ed19998 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -236,6 +236,20 @@ jg .L06 ALIGN_4 +//Padding zero to prevent loading the dirty number from buffer. + movl M, I + movl $8, J + andl $7, I + xorps %xmm0, %xmm0 + subl I, J + ALIGN_2 +.L07: + movss %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl J + jg .L07 + ALIGN_4 + .L10: movl Y, Y1 From 0d1518add98bc3c0e83887be74cda3b23c8937ee Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 25 Dec 2012 09:10:17 +0800 Subject: [PATCH 130/162] Refs #173. 
Fixed overflow internal buffer bug of sgemv_t on x86 --- kernel/x86/gemv_t_sse.S | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 42ed19998..fa6cfc50b 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -89,7 +89,7 @@ #endif #define STACKSIZE 16 -#define ARGS 16 +#define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) @@ -106,6 +106,7 @@ #define NN 4+STACKSIZE(%esp) #define AA 8+STACKSIZE(%esp) #define LDAX 12+STACKSIZE(%esp) +#define XX 16+STACKSIZE(%esp) #define I %eax #define J %ebx @@ -130,6 +131,8 @@ movl STACK_LDA, LDA movl LDA,LDAX # backup LDA + movl STACK_X, X + movl X,XX movl N,J movl J,NN # backup N movl A,J @@ -139,7 +142,7 @@ .L0t: xorl J,J addl $1,J - sall $23,J # J=2^22 + sall $21,J # J=2^22 subl J,MMM # MMM=MMM-J movl J,M jge .L00t @@ -159,8 +162,8 @@ movl LDAX, LDA # reset LDA + movl XX,X - movl STACK_X, X movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -683,6 +686,9 @@ movl M,J leal (,J,SIZE),%eax addl %eax,AA + movl XX,J + addl %eax,J + movl J,XX jmp .L0t ALIGN_4 From 69200884e13e98b79487cfd1c78faf054278ec2f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 25 Dec 2012 09:27:49 +0800 Subject: [PATCH 131/162] Refs #173. 
Fixed overflow internal buffer bug of gemv_n on x86 --- kernel/x86/gemv_n_sse.S | 67 ++++++++++++++++++++++++++++++++------- kernel/x86/gemv_n_sse2.S | 68 +++++++++++++++++++++++++++++++++------- 2 files changed, 112 insertions(+), 23 deletions(-) diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index 0891657fa..3ff9203c8 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -89,17 +89,22 @@ #endif #define STACKSIZE 16 - -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) +#define LDAX 12+ARGS(%esp) #define I %eax #define J %ebx @@ -114,6 +119,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -121,7 +127,34 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $21,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y movl STACK_LDA, LDA + movl STACK_X, X movl STACK_INCX, INCX @@ -651,12 +684,22 @@ addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) ALIGN_3 - .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl 
YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 5f5fa5a51..980797d91 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -76,17 +76,22 @@ #endif #define STACKSIZE 16 - -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) #define I %eax #define J %ebx @@ -101,6 +106,8 @@ PROLOGUE + + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -108,6 +115,33 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $20,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -677,10 +711,22 @@ ALIGN_3 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: + popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE From 
8b122ff9dc8a7d3e695283f0d5c6b4d576e9356e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 3 Jan 2013 01:47:31 +0800 Subject: [PATCH 132/162] Refs #176. Fixed make.inc overriding RANLIB bug when cross-compiling LAPACK. --- make.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make.inc b/make.inc index 30004233f..01b9bde92 100644 --- a/make.inc +++ b/make.inc @@ -4,7 +4,7 @@ DRVOPTS = $(OPTS) LOADER = $(FORTRAN) TIMER = NONE ARCHFLAGS= -ru -RANLIB = ranlib +#RANLIB = ranlib BLASLIB = TMGLIB = tmglib.a EIGSRCLIB = eigsrc.a From 08bf6674d543db41c13053d1388602cb4d070373 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 5 Jan 2013 11:36:39 +0800 Subject: [PATCH 133/162] Refs #177. Fixed sgemv_t compiling bug on Win64. --- kernel/x86_64/sgemv_t.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index 06970a055..f516f08af 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -1,4 +1,5 @@ -/*********************************************************************/ + ;; 2c +1 /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -75,8 +76,8 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) -#defien MMM 216(%rsp) -#defien NN 224(%rsp) +#define MMM 216(%rsp) +#define NN 224(%rsp) #define AA 232(%rsp) #define LDAX 240(%rsp) @@ -137,8 +138,10 @@ movq OLD_M, MMM movq OLD_N, NN - movq OLD_A, AA - movq OLD_LDA, LDAX + movq OLD_A, X + movq X, AA + movq OLD_LDA, X + movq X, LDAX movq OLD_X, X #else movq OLD_M, MMM From 99d1978df7d9968db1e1f7ed147f67a9ec799d95 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 12 Jan 2013 12:31:14 +0800 Subject: [PATCH 134/162] Fixed #180. 
the typos in kernel/x86_64/sgemv_t.S --- kernel/x86_64/sgemv_t.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index f516f08af..854e0f295 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -1,5 +1,3 @@ - ;; 2c -1 /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ From 200e4acf152f11444cf32f8a2a93fde0bc700e9d Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 25 Jun 2012 13:51:46 +0200 Subject: [PATCH 135/162] cblas: typedef enums for improved compatibility with Intel MKL. Netlib style: enum CBLAS_XYZ {X=1, Y=2, Z=3}; Intel MKL style: typedef enum {X=1, Y=2, Z=3} CBLAS_XYZ; With this hybrid style, code written in the latter form won't need any modifications to be built with OpenBLAS. This change should not affect existing code, although a warning may be emitted for C code which does the following (does not occur with C++): typedef enum CBLAS_XYZ CBLAS_XYZ; warning: redefinition of typedef 'CBLAS_XYZ' [-pedantic] --- cblas.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cblas.h b/cblas.h index e9664fe79..5d50238e1 100644 --- a/cblas.h +++ b/cblas.h @@ -18,11 +18,11 @@ char* openblas_get_config(void); #define CBLAS_INDEX size_t -enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; -enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; -enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; -enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; -enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; +typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum CBLAS_SIDE {CblasLeft=141, 
CblasRight=142} CBLAS_SIDE; float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); From 0b08f7479e26ce0ef8e076185bb89f16479335e9 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 20 Jan 2013 21:22:12 +0800 Subject: [PATCH 136/162] Refs #154. Fixed gemv_t bug about overflow 16MB buffer on x86. --- kernel/x86/gemv_t_sse.S | 4 +++- kernel/x86/gemv_t_sse2.S | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index fa6cfc50b..326584bbc 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -142,7 +142,9 @@ .L0t: xorl J,J addl $1,J - sall $21,J # J=2^22 + sall $22,J # J=2^22*sizeof(float)=buffer size(16MB) + subl $8, J # Don't use last 8 float in the buffer. + # Now, split M by block J subl J,MMM # MMM=MMM-J movl J,M jge .L00t diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index d46d7e43e..60d6ef270 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -128,7 +128,9 @@ .L0t: xorl J,J addl $1,J - sall $22,J # J=2^22 + sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) + subl $4, J # Don't use last 4 double in the buffer. + # Now, split M by block J subl J,MMM # MMM=MMM-J movl J,M jge .L00t From 4db6660de4756b25d7b71c00d7893f2b15587f1c Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 20 Jan 2013 21:53:52 +0100 Subject: [PATCH 137/162] Refs #185. Add missing 'const' to declarations in cblas.h. Thanks to Dan Povey!
The 'const' modifications were done automatically using these scripts: https://kaldi.svn.sourceforge.net/svnroot/kaldi/sandbox/dan/tools/for_openblas --- Makefile | 2 +- Makefile.getarch => Makefile.prebuild | 7 +- Makefile.system | 2 +- cblas.h | 511 +++++++++++++------------- common.h | 3 +- 5 files changed, 265 insertions(+), 260 deletions(-) rename Makefile.getarch => Makefile.prebuild (81%) diff --git a/Makefile b/Makefile index 39e3bbd65..a84b54d4b 100644 --- a/Makefile +++ b/Makefile @@ -314,7 +314,7 @@ clean :: #endif @$(MAKE) -C reference clean @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h - @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d $(NETLIB_LAPACK_DIR); then \ echo deleting $(NETLIB_LAPACK_DIR); \ rm -rf $(NETLIB_LAPACK_DIR) ;\ diff --git a/Makefile.getarch b/Makefile.prebuild similarity index 81% rename from Makefile.getarch rename to Makefile.prebuild index dadfb5b1b..f4b0bb5af 100644 --- a/Makefile.getarch +++ b/Makefile.prebuild @@ -1,3 +1,5 @@ +# This is triggered by Makefile.system and runs before any of the code is built. + export BINARY export USE_OPENMP @@ -15,7 +17,7 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif -all: getarch_2nd +all: getarch_2nd cblas_noconst.h ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) @@ -36,4 +38,7 @@ else $(HOSTCC) -I.
$(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c endif +cblas_noconst.h : cblas.h + sed -e "s/\bconst\b\s*//g" cblas.h > cblas_noconst.h + dummy: diff --git a/Makefile.system b/Makefile.system index 27f30fa61..239047f36 100644 --- a/Makefile.system +++ b/Makefile.system @@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf diff --git a/cblas.h b/cblas.h index 5d50238e1..501e7d0d1 100644 --- a/cblas.h +++ b/cblas.h @@ -24,271 +24,270 @@ typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; -float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); -double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); -float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); -double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); - -openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void 
cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); - -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); - -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); - -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); - -void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); - -void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, 
blasint incx, double *y, blasint incy); - -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); +float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); +double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); + +openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); +openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); + +void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); + +float cblas_sasum (const blasint n, const float *x, const blasint incx); +double cblas_dasum (const blasint n, const double *x, const blasint incx); +float cblas_scasum(const blasint n, const float *x, const blasint 
incx); +double cblas_dzasum(const blasint n, const double *x, const blasint incx); + +float cblas_snrm2 (const blasint N, const float *X, const blasint incX); +double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); +float cblas_scnrm2(const blasint N, const float *X, const blasint incX); +double cblas_dznrm2(const blasint N, const double *X, const blasint incX); + +CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); +CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); + +void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); + +void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); + +void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); +void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint 
incy); + +void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); +void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); - -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); - -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); - -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); - -void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, 
blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); - -void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint 
N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); - -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); - -void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float 
*alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); - - -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, 
blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, 
float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - - -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); - -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); - -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); - -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); - -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double 
*X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum 
CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); - -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float 
*B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint 
ldc); - -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); - -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); +void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); + +void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); +void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); +void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); +void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); + +void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); +void 
cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); +void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); +void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); + +void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, 
const blasint lda); + +void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); + +void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); + +void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void 
cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); +void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); + +void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, + const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, + const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, + const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, + const double *Y, const blasint incY, double *A, const blasint lda); + +void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint 
incY); +void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); + +void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); + + +void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, 
const blasint K, const double *A, const blasint lda, double *X, const blasint incX); + +void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); + +void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); + +void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, 
const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); + +void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); + + +void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, + const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double 
alpha, const double *Ap, + const double *X, const blasint incX, const double beta, double *Y, const blasint incY); + +void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); +void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); + +void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); +void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); + +void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); +void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); +void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); +void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); + +void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *X, const 
blasint incX, const double *beta, double *Y, const blasint incY); + +void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); + +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); + +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float 
beta, float *C, const blasint ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); + +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); + +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, 
+ const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); + +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum 
CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); + +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); + +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); + +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO 
Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); + +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); void cblas_xerbla(blasint p, char *rout, char *form, ...); #ifdef __cplusplus } - #endif /* __cplusplus */ #endif diff --git a/common.h b/common.h index 003fde77f..4403af13d 100644 --- a/common.h +++ b/common.h @@ -557,7 +557,8 @@ typedef struct { #include "common_level3.h" #include "common_lapack.h" #ifdef CBLAS -#include "cblas.h" +/* This header file is generated from "cblas.h" (see Makefile.prebuild). */ +#include "cblas_noconst.h" #endif #ifndef ASSEMBLER From 8cdb79543823f1da894e18c6487a8d4d9cfdb1c3 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 22 Jan 2013 00:18:21 +0800 Subject: [PATCH 138/162] Refs #187. Use binary code for xgetbv, which is compatible with old compiler. 
--- cpuid_x86.c | 3 ++- driver/others/dynamic.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index b3352244b..a19dedeee 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -116,8 +116,9 @@ static inline int have_excpuid(void){ #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv __asm__ __volatile__ - ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 28fdd30d8..b6f27d0ad 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -78,8 +78,9 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv __asm__ __volatile__ - ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif From 36e098296684b264f8b0979268e775519be1c81e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 22 Jan 2013 00:29:54 +0800 Subject: [PATCH 139/162] Refs #187. Use perl to generate cblas_noconst.h instead of sed. Thanks to Dan Povey for the patch. 
https://github.com/xianyi/OpenBLAS/issues/187 --- Makefile.prebuild | 2 +- getarch_2nd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index f4b0bb5af..c7d0de70e 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -39,6 +39,6 @@ else endif cblas_noconst.h : cblas.h - sed -e "s/\bconst\b\s*//g" cblas.h > cblas_noconst.h + perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h dummy: diff --git a/getarch_2nd.c b/getarch_2nd.c index 5339af442..4bdd16a99 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,7 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif - printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); } return 0; From d311236dfdefa41f31a2e7fefa548abf47f0461c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 25 Jan 2013 16:18:27 +0800 Subject: [PATCH 140/162] Refs #189. Fixed the bug of s/cdot about invalid reading NAN on x86_64. 
--- kernel/x86_64/dot_sse.S | 7 ++++--- kernel/x86_64/zdot_sse.S | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S index 61c481064..985ce9fec 100644 --- a/kernel/x86_64/dot_sse.S +++ b/kernel/x86_64/dot_sse.S @@ -530,7 +530,7 @@ #endif movsd -32 * SIZE(Y), %xmm8 - pshufd $0x39, %xmm4, %xmm5 + pshufd $0x29, %xmm4, %xmm5 mulps %xmm8, %xmm5 addps %xmm5, %xmm3 @@ -750,7 +750,8 @@ xorps %xmm5, %xmm5 movhlps %xmm4, %xmm5 - mulps -32 * SIZE(Y), %xmm5 + movlps -32 * SIZE(Y), %xmm4 + mulps %xmm4, %xmm5 addps %xmm5, %xmm0 addq $2 * SIZE, X @@ -992,7 +993,7 @@ movsd -32 * SIZE(Y), %xmm8 movss %xmm5, %xmm4 - shufps $0x93, %xmm5, %xmm4 + shufps $0x93, %xmm4, %xmm4 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S index 13804e0f8..e2f153ab3 100644 --- a/kernel/x86_64/zdot_sse.S +++ b/kernel/x86_64/zdot_sse.S @@ -699,7 +699,7 @@ movsd -32 * SIZE(X), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0x59, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1336,7 +1336,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1697,7 +1697,7 @@ movsd -32 * SIZE(Y), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0xa9, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2024,7 +2024,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 From 875d520ccfcfbb6a77cc5166b8bd562c3d111718 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Thu, 31 Jan 2013 08:48:27 +0100 Subject: [PATCH 141/162] Refs #193. cblas: move #include out of extern "C" block. Standard headers may contain C++ templates which are not permitted inside an extern "C" block. 
This might be the case when we include <complex.h>. --- cblas.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cblas.h b/cblas.h index 501e7d0d1..6684262e2 100644 --- a/cblas.h +++ b/cblas.h @@ -1,14 +1,14 @@ #ifndef CBLAS_H #define CBLAS_H +#include <stddef.h> +#include "common.h" + #ifdef __cplusplus extern "C" { /* Assume C declarations for C++ */ #endif /* __cplusplus */ -#include <stddef.h> -#include "common.h" - /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); From 64ad8b9809e3768981d540d6f674a4642b86bf8d Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Fri, 1 Feb 2013 09:24:44 +0100 Subject: [PATCH 142/162] Refs #193. Don't use C99 complex numbers when building C++ code. --- common.h | 3 ++- openblas_config_template.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common.h b/common.h index 4403af13d..a822b7182 100644 --- a/common.h +++ b/common.h @@ -390,7 +390,8 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || + (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; diff --git a/openblas_config_template.h b/openblas_config_template.h index a2b05696f..0d1186819 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -48,7 +48,8 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ -#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || + (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 #include <complex.h> typedef float _Complex openblas_complex_float; From a9500d00793bc8a63939bfb634f46d4b1654a2ec Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Fri, 1 Feb 2013 09:34:12 +0100 Subject: [PATCH 143/162] Missing line continuation -- follow-up to last commit (64ad8b9809). --- common.h | 2 +- openblas_config_template.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common.h b/common.h index a822b7182..d46a5230a 100644 --- a/common.h +++ b/common.h @@ -390,7 +390,7 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; diff --git a/openblas_config_template.h b/openblas_config_template.h index 0d1186819..cf2c037cc 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -48,7 +48,7 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). 
*/ -#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 #include <complex.h> From 5155e3f5090aa313ce342f4bc0880db63208c5a5 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 13 Feb 2013 16:05:58 +0800 Subject: [PATCH 144/162] Refs #174. Fixed the overflowing buffer bug of multithreading hbmv and sbmv. Instead of using thread 0 buffer, each thread uses its own sb buffer. Thus, it can avoid overflowing thread 0 buffer. --- driver/level2/sbmv_thread.c | 10 ++++++---- driver/others/blas_server.c | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 222734d5e..7dfabfa81 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; - y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F n_from = 0; n_to = n; + //Use y as each thread's n* COMPSIZE elements in sb buffer + y = buffer; + buffer += ((COMPSIZE * n + 1023) & ~1023); + if (range_m) { n_from = *(range_m + 0); n_to = *(range_m + 1); @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a += n_from * lda * COMPSIZE; } - if (range_n) y += *range_n * COMPSIZE; if (incx != 1) { COPY_K(n, x, incx, buffer, 1); @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x if (num_cpu) { queue[0].sa = NULL; - queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, 
BLASLONG lda, FLOAT *x #else ONE, ZERO, #endif - buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); } AXPYU_K(n, 0, 0, diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index c51e681a5..2afcb742e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } #ifdef MONITOR From 4c2123c3343c523d5359853ac0ebe1bd6550a881 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Feb 2013 12:51:13 +0800 Subject: [PATCH 145/162] Fixed the overflowing bug in single thread cholesky factorization. --- lapack/potrf/potrf_L_single.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c index b88f8fc7a..d6d143623 100644 --- a/lapack/potrf/potrf_L_single.c +++ b/lapack/potrf/potrf_L_single.c @@ -66,7 +66,9 @@ static FLOAT dm1 = -1.; #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) -#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +//leave some space for GEMM_ALIGN in sb2 +#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ) #if 0 #define SHARED_ARRAY @@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, sa, sb2, a + (is + js * lda) * COMPSIZE, lda, - - is + js); + is - js); #endif } From 3cc6ae793eb9deba2d9b94e1326d2bc8b155f2f6 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 26 Feb 2013 00:48:21 +0800 Subject: [PATCH 146/162] Refs #174. Return sb pointer when OpenMP or Windows. 
--- driver/others/blas_server_omp.c | 1 + driver/others/blas_server_win32.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index c45856fd9..21bc5f78e 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -224,6 +224,7 @@ static void exec_threads(blas_queue_t *queue){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } } diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 9cbd7e219..bd1069c5e 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } #ifdef MONITOR @@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads) void openblas_set_num_threads(int num) { goto_set_num_threads(num); -} \ No newline at end of file +} From d744c9590ae18706b40e151a9adf2070639909fb Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 1 Mar 2013 14:36:47 +0800 Subject: [PATCH 147/162] In OpenMP threading, preallocate the thread buffer instead of allocating the buffer every time. This patch improved the performance slightly. 
--- driver/others/blas_server_omp.c | 50 +++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 21bc5f78e..c567ed688 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -49,8 +49,12 @@ int blas_server_avail = 0; +static void * blas_thread_buffer[MAX_CPU_NUMBER]; + void goto_set_num_threads(int num_threads) { + int i=0; + if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; omp_set_num_threads(blas_cpu_number); - + + //adjust buffer for each thread + for(i=0; i sa; sb = queue -> sb; @@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { - buffer = blas_memory_alloc(2); + pos = omp_get_thread_num(); + buffer = blas_thread_buffer[pos]; + + //fallback + if(buffer==NULL) { + buffer = blas_memory_alloc(2); + release_flag=1; + } if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); @@ -242,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ } - if (buffer != NULL) blas_memory_free(buffer); + if (release_flag) blas_memory_free(buffer); } From f1ce74ffdda640d31a58dd0b867e959672444811 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Mar 2013 14:15:54 +0800 Subject: [PATCH 148/162] Improved the print when OS don't support AVX. --- driver/others/dynamic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 6523abb4d..893dd0738 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -175,7 +175,7 @@ static gotoblas_t *get_coretype(void){ if(support_avx()) return &gotoblas_SANDYBRIDGE; else{ - fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. 
Use Nehalem kernels.\n"); + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } @@ -186,7 +186,7 @@ static gotoblas_t *get_coretype(void){ if(support_avx()) return &gotoblas_SANDYBRIDGE; else{ - fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } @@ -211,7 +211,7 @@ static gotoblas_t *get_coretype(void){ if(support_avx()) return &gotoblas_BULLDOZER; else{ - fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } else { From 0d0405b434808d8c8122474a4dcfa089f4962512 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Mar 2013 14:22:27 +0800 Subject: [PATCH 149/162] Updated the doc for 0.2.6 version. --- Changelog.txt | 18 ++++++++++++++++++ Makefile.rule | 2 +- README.md | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index db0732c4f..54b11ad81 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.6 +2-Mar-2013 +common: + * Improved OpenMP performance slightly. (d744c9) + * Improved cblas.h compatibility with Intel MKL.(#185) + * Fixed the overflowing bug in single thread cholesky factorization. 
+ * Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174) + +x86/x86-64: + * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) + We will tune the performance in future. + * Auto-detect Intel Xeon E7540. + * Fixed the overflowing buffer bug of gemv. (#173) + * Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189) + +MIPS64: + ==================================================================== Version 0.2.5 26-Nov-2012 diff --git a/Makefile.rule b/Makefile.rule index 1240ab0ad..4e238575a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.5 +VERSION = 0.2.6 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/README.md b/README.md index ce2688f03..ed5f196c7 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. -- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. +- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) #### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. From 529f1b500609a67a83843aa2c6374a3541b911f8 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Mar 2013 14:41:18 +0800 Subject: [PATCH 150/162] Refs#194. Export the missing LAPACK s/dlamc3 functions. 
--- exports/gensymbol | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 04cbd7d84..99609b356 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -111,7 +111,7 @@ # already provided by @blasobjs: xerbla, lsame ilaenv, ieeeck, lsamen, xerbla_array, iparmq, ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, - ilaver, slamch, + ilaver, slamch, slamc3, # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. # excluded: second_$(TIMER) @@ -148,7 +148,7 @@ dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, dsteqr, dsterf, dlaisnan, disnan, dlartgp, dlartgs, - dlamch, + dlamch, dlamc3, # SLASRC -- Single precision real LAPACK routines # already provided by @lapackobjs: From 9405f26f4b4feedf67a35b8aa0ce3e4882474d58 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Mon, 4 Mar 2013 17:37:38 +0100 Subject: [PATCH 151/162] new dgemm_kernel for bulldozer --- kernel/x86_64/dgemm_kernel_4x4_bulldozer.S | 231 +++++++++++---------- 1 file changed, 125 insertions(+), 106 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S index b06b07edf..9d0c613e4 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -88,151 +88,142 @@ #define movupd movups #define KERNEL1(xx) \ - vfmaddpd %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vmovaps %xmm2, %xmm0 ;\ - vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -16 * SIZE(AO, %rax, 4),%xmm0 ;\ + vfmaddpd %xmm8,%xmm0,%xmm1,%xmm8 ;\ + vmovaps %xmm2,%xmm0 ;\ + vmovddup -16 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -15 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm9,%xmm0,%xmm3,%xmm9 ;\ + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\ - vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\ + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ - 
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm3,%xmm0,%xmm11 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vfmaddpd %xmm10,%xmm0,%xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0,%xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ - vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\ vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm0, %xmm2 #define KERNEL2(xx) \ - vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ - vmovaps %xmm2, %xmm0 ;\ - vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\ -/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\ + vmovups -8 * SIZE(AO, %rax, 4),%xmm4 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ - vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ - vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ -/**/ vmovddup (BO, %rax, 4), %xmm1 ;\ - vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup -8 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm4, %xmm2 #define KERNEL3(xx) \ - vfmaddpd %xmm8,%xmm5, %xmm4, %xmm8 ;\ - vmovaps %xmm2, %xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm8, %xmm4, %xmm5, %xmm8 ;\ + vfmaddpd %xmm9, %xmm4, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ - vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + 
vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm11,%xmm4, %xmm3, %xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm11,%xmm3, %xmm4, %xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\ vmovddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ - vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm4, %xmm2 #define KERNEL4(xx) \ - vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ - vmovaps %xmm2, %xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm8,%xmm4, %xmm5,%xmm8 ;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm5 ,%xmm12;\ -/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\ - vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ - vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm11,%xmm4, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ -/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ - vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ +/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm6, %xmm2 +/**/ vmovddup (BO, %rax, 4), %xmm1 ;\ #define KERNEL5(xx) \ - vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ - vmovaps %xmm2, %xmm6 ;\ - vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm8,%xmm6, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9,%xmm6, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ - vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ - vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm10,%xmm6, %xmm1,%xmm10 ;\ + vfmaddpd 
%xmm11,%xmm6, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ - vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\ - vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\ - vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm6, %xmm2 + vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ #define KERNEL6(xx) \ - vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ - vmovaps %xmm2, %xmm6 ;\ - vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm8,%xmm6, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9,%xmm6, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ -/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\ - vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ - vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm10,%xmm6, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm6, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ -/**/ vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ - vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\ - vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ +/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ +/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm7, %xmm2 #define KERNEL7(xx) \ - vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ - vmovaps %xmm2, %xmm7 ;\ - vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm8,%xmm7, %xmm5,%xmm8 ;\ + vfmaddpd %xmm9,%xmm7, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ - vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + 
vfmaddpd %xmm10,%xmm7, %xmm5,%xmm10 ;\ + vfmaddpd %xmm11,%xmm7, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\ - vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm7, %xmm2 #define KERNEL8(xx) \ - vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ - vmovaps %xmm2, %xmm7 ;\ - vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\ -/*A*/ vmovups 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + vfmaddpd %xmm8,%xmm7, %xmm5,%xmm8 ;\ + vfmaddpd %xmm9,%xmm7, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ - vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ vmovddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm10,%xmm7, %xmm5,%xmm10 ;\ + vfmaddpd %xmm11,%xmm7, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ -/**/ vmovddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ - vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ +/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ vmovaps %xmm0, %xmm2 ;\ addq $8 * SIZE, %rax ;\ #define KERNEL_SUB1(xx) \ + vmovddup -15 * SIZE(BO), %xmm3 ;\ + vmovups -16 * SIZE(AO),%xmm0 ;\ vfmaddpd %xmm8, %xmm1, %xmm0,%xmm8 ;\ vmovapd %xmm2, %xmm0 ;\ vmovups -14 * SIZE(AO),%xmm2 ;\ @@ -255,17 +246,17 @@ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ vmovaps %xmm2, %xmm0 ;\ vmovups -10 * SIZE(AO),%xmm2 ;\ + vmovups -8 * SIZE(AO),%xmm4 ;\ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ vmovddup -10 * SIZE(BO), %xmm1 ;\ vmovddup -9 * SIZE(BO), %xmm3 ;\ + vmovddup -8 * 
SIZE(BO), %xmm5 ;\ vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovups (AO), %xmm0 ;\ - vmovddup (BO), %xmm1 ;\ vmovddup -7 * SIZE(BO), %xmm3 ;\ vmovaps %xmm4, %xmm2 @@ -291,11 +282,13 @@ vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ vmovaps %xmm2, %xmm4 ;\ vmovups -2 * SIZE(AO),%xmm2 ;\ + vmovups (AO), %xmm0 ;\ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ vmovddup -2 * SIZE(BO), %xmm5 ;\ vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ vmovddup -1 * SIZE(BO), %xmm3 ;\ + vmovddup (BO), %xmm1 ;\ vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ @@ -407,16 +400,26 @@ leaq (B, %rax, 4), BO #endif - vzeroall - prefetcht0 256(CO1) - prefetcht0 320(CO1) - prefetcht0 256(CO2) - prefetcht0 320(CO2) + vxorpd %xmm8, %xmm8,%xmm8 + vxorpd %xmm9, %xmm9,%xmm9 + vxorpd %xmm10, %xmm10,%xmm10 + vxorpd %xmm11, %xmm11,%xmm11 + vxorpd %xmm12, %xmm12,%xmm12 + vxorpd %xmm13, %xmm13,%xmm13 + vxorpd %xmm14, %xmm14,%xmm14 + vxorpd %xmm15, %xmm15,%xmm15 + + prefetcht0 (CO1) + prefetcht0 8*SIZE(CO1) + prefetcht0 (CO1,LDC) + prefetcht0 8*SIZE(CO1,LDC) + prefetcht0 (CO2) + prefetcht0 8*SIZE(CO2) + prefetcht0 (CO2,LDC) + prefetcht0 8*(CO2,LDC) vmovups -16 * SIZE(AO), %xmm0 vmovddup -16 * SIZE(BO), %xmm1 vmovddup -15 * SIZE(BO), %xmm3 - vmovups -8 * SIZE(AO), %xmm4 - vmovddup -8 * SIZE(BO), %xmm5 vmovaps %xmm0, %xmm2 @@ -448,8 +451,10 @@ .align 16 .L12: - prefetcht0 (AO,%rax,4) - prefetcht0 (BO,%rax,4) + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -460,6 +465,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ 
-470,6 +479,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -480,6 +493,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -490,6 +507,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -500,6 +521,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -510,6 +535,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -520,6 +549,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 + prefetcht0 24*SIZE(AO,%rax,4) + prefetcht0 32*SIZE(AO,%rax,4) + prefetcht0 24*SIZE(BO,%rax,4) + prefetcht0 32*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -619,16 +652,12 @@ #endif - .align 2 vmovups %xmm8, (CO1) vmovups %xmm12, 2 * SIZE(CO1) - .align 2 vmovups %xmm9, (CO1, LDC) vmovups %xmm13, 2 * SIZE(CO1, LDC) - .align 2 vmovups %xmm10, (CO2) vmovups %xmm14, 2 * SIZE(CO2) - .align 2 vmovups %xmm11, (CO2, LDC) vmovups %xmm15, 2 * SIZE(CO2, LDC) @@ -1019,17 +1048,7 @@ vxorps %xmm13, %xmm13,%xmm13 vmovups -16 * SIZE(AO), %xmm0 vmovups -8 * SIZE(AO), %xmm4 - // prefetcht0 256(CO1) - // prefetcht0 320(CO1) - // prefetcht0 256(CO2) - // prefetcht0 320(CO2) - // prefetchnta 24 * SIZE(CO1) - // prefetchnta 32 * SIZE(CO1) - // prefetchw 3 * SIZE(CO1) vmovups %xmm0, 
%xmm2 - // prefetchw 3 * SIZE(CO2) - // prefetchnta -16 * SIZE(BB) - // prefetch -16 * SIZE(BB) subq $-8 * SIZE, BB #ifndef TRMMKERNEL From 66e64131ed59df1313b7b3a0853744fb8e609ca2 Mon Sep 17 00:00:00 2001 From: wernsaar Date: Tue, 5 Mar 2013 19:51:37 +0100 Subject: [PATCH 152/162] optimized again bulldozer dgemm kernel --- kernel/x86_64/dgemm_kernel_4x4_bulldozer.S | 396 ++++++++++----------- 1 file changed, 191 insertions(+), 205 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S index 9d0c613e4..e43dad4e7 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -89,210 +89,199 @@ #define KERNEL1(xx) \ vmovups -16 * SIZE(AO, %rax, 4),%xmm0 ;\ - vfmaddpd %xmm8,%xmm0,%xmm1,%xmm8 ;\ - vmovaps %xmm2,%xmm0 ;\ + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ vmovddup -16 * SIZE(BO, %rax, 4), %xmm1 ;\ vmovddup -15 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm9,%xmm0,%xmm3,%xmm9 ;\ - vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup -14 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm8,%xmm0,%xmm1,%xmm8 ;\ vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\ + vmovddup -13 * SIZE(BO, %rax, 4), %xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 4),%xmm6 ;\ + vfmaddpd %xmm9,%xmm0,%xmm3,%xmm9 ;\ vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\ - vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm0,%xmm1,%xmm10 ;\ - vfmaddpd %xmm11,%xmm0,%xmm3,%xmm11 ;\ - vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\ - vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\ - vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm0,%xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0,%xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ #define KERNEL2(xx) \ - vmovups -8 * 
SIZE(AO, %rax, 4),%xmm4 ;\ - vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ - vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ - vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ - vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ - vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ - vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ - vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovddup -8 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -10 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup -9 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8, %xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9, %xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ #define KERNEL3(xx) \ + vmovups -8 * SIZE(AO, %rax, 4),%xmm0 ;\ vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ - vfmaddpd %xmm8, %xmm4, %xmm5, %xmm8 ;\ - vfmaddpd %xmm9, %xmm4, %xmm3,%xmm9 ;\ - vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -8 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ - vfmaddpd %xmm11,%xmm4, %xmm3, %xmm11 ;\ - vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup -5 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1, %xmm8 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\ - vmovddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovups -2 * SIZE(AO, %rax, 4),%xmm6 ;\ + vfmaddpd %xmm9, 
%xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm0, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0, %xmm7, %xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ #define KERNEL4(xx) \ - vfmaddpd %xmm8,%xmm4, %xmm5,%xmm8 ;\ - vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ - vfmaddpd %xmm12,%xmm2, %xmm5 ,%xmm12;\ - vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup -1 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1 ,%xmm12;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm6, %xmm3,%xmm13 ;\ vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ - vfmaddpd %xmm11,%xmm4, %xmm3,%xmm11 ;\ - vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ -/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\ - vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ - vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ -/**/ vmovddup (BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ #define KERNEL5(xx) \ - vfmaddpd %xmm8,%xmm6, %xmm1,%xmm8 ;\ - vfmaddpd %xmm9,%xmm6, %xmm3,%xmm9 ;\ + vmovups (AO, %rax, 4), %xmm0 ;\ + vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup (BO, %rax, 4), %xmm1 ;\ + vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 2 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup 3 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 ;\ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovups 4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups 6 * SIZE(AO, %rax, 4),%xmm6 ;\ + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ - vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm6, %xmm1,%xmm10 ;\ - vfmaddpd %xmm11,%xmm6, %xmm3,%xmm11 ;\ - vfmaddpd 
%xmm14,%xmm2, %xmm1,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\ - vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\ - vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm0, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm0, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ #define KERNEL6(xx) \ - vfmaddpd %xmm8,%xmm6, %xmm1,%xmm8 ;\ - vfmaddpd %xmm9,%xmm6, %xmm3,%xmm9 ;\ - vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ - vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ - vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm6, %xmm1,%xmm10 ;\ - vfmaddpd %xmm11,%xmm6, %xmm3,%xmm11 ;\ - vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ -/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\ - vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ -/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovddup 6 * SIZE(BO, %rax, 4), %xmm5 ;\ + vmovddup 7 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ #define KERNEL7(xx) \ - vfmaddpd %xmm8,%xmm7, %xmm5,%xmm8 ;\ - vfmaddpd %xmm9,%xmm7, %xmm3,%xmm9 ;\ - vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovups 8 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ + vmovddup 8 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm7, %xmm5,%xmm10 ;\ - 
vfmaddpd %xmm11,%xmm7, %xmm3,%xmm11 ;\ + vmovddup 11 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovups 14 * SIZE(AO, %rax, 4), %xmm6 ;\ + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm0, %xmm5,%xmm10 ;\ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\ - vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\ - vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm11,%xmm0, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm7,%xmm15 ;\ #define KERNEL8(xx) \ - vfmaddpd %xmm8,%xmm7, %xmm5,%xmm8 ;\ - vfmaddpd %xmm9,%xmm7, %xmm3,%xmm9 ;\ - vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup 12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ - vmovddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ - vfmaddpd %xmm10,%xmm7, %xmm5,%xmm10 ;\ - vfmaddpd %xmm11,%xmm7, %xmm3,%xmm11 ;\ - vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ -/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\ - vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ - vmovddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ - vmovaps %xmm0, %xmm2 ;\ + vmovddup 15 * SIZE(BO, %rax, 4), %xmm7 ;\ + vfmaddpd %xmm8,%xmm4, %xmm1,%xmm8 ;\ + vfmaddpd %xmm12,%xmm6, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm4, %xmm3,%xmm9 ;\ + vfmaddpd %xmm13, %xmm6, %xmm3,%xmm13 ;\ + vfmaddpd %xmm10,%xmm4, %xmm5,%xmm10 ;\ + vfmaddpd %xmm14,%xmm6, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm4, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm6, %xmm7,%xmm15 ;\ addq $8 * SIZE, %rax ;\ #define KERNEL_SUB1(xx) \ - vmovddup -15 * SIZE(BO), %xmm3 ;\ - vmovups -16 * SIZE(AO),%xmm0 ;\ - vfmaddpd %xmm8, %xmm1, %xmm0,%xmm8 ;\ - vmovapd %xmm2, %xmm0 ;\ - vmovups -14 * SIZE(AO),%xmm2 ;\ + vmovups -16 * 
SIZE(AO),%xmm0 ;\ + vmovups -14 * SIZE(AO),%xmm2 ;\ + vmovddup -16 * SIZE(BO), %xmm1 ;\ + vmovddup -15 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ vfmaddpd %xmm12, %xmm2, %xmm1,%xmm12 ;\ - vmovddup -14 * SIZE(BO), %xmm1 ;\ - vfmaddpd %xmm9, %xmm3, %xmm0,%xmm9 ;\ vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ - vmovddup -13 * SIZE(BO), %xmm3 ;\ - vfmaddpd %xmm10, %xmm1, %xmm0,%xmm10 ;\ - vfmaddpd %xmm14,%xmm2, %xmm1 ,%xmm14 ;\ - vfmaddpd %xmm11, %xmm3, %xmm0,%xmm11 ;\ + vmovddup -14 * SIZE(BO), %xmm1 ;\ + vmovddup -13 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10, %xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11, %xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14, %xmm2, %xmm1,%xmm14 ;\ vfmaddpd %xmm15, %xmm2, %xmm3,%xmm15 ;\ - vmovups -12 * SIZE(AO), %xmm0 ;\ - vmovddup -12 * SIZE(BO), %xmm1 ;\ - vmovddup -11 * SIZE(BO), %xmm3 ;\ - vmovapd %xmm0, %xmm2 #define KERNEL_SUB2(xx) \ - vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ - vmovaps %xmm2, %xmm0 ;\ - vmovups -10 * SIZE(AO),%xmm2 ;\ - vmovups -8 * SIZE(AO),%xmm4 ;\ + vmovups -12 * SIZE(AO), %xmm0 ;\ + vmovups -10 * SIZE(AO), %xmm2 ;\ + vmovddup -12 * SIZE(BO), %xmm1 ;\ + vmovddup -11 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ - vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ vmovddup -10 * SIZE(BO), %xmm1 ;\ vmovddup -9 * SIZE(BO), %xmm3 ;\ - vmovddup -8 * SIZE(BO), %xmm5 ;\ - vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ - vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovddup -7 * SIZE(BO), %xmm3 ;\ - vmovaps %xmm4, %xmm2 #define KERNEL_SUB3(xx) \ - vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ - vmovaps %xmm2, %xmm4 ;\ - vmovups -6 * SIZE(AO),%xmm2 ;\ - vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ - vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ - 
vmovddup -6 * SIZE(BO), %xmm5 ;\ + vmovups -8 * SIZE(AO),%xmm0 ;\ + vmovups -6 * SIZE(AO),%xmm2 ;\ + vmovddup -8 * SIZE(BO), %xmm1 ;\ + vmovddup -7 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -6 * SIZE(BO), %xmm1 ;\ vmovddup -5 * SIZE(BO), %xmm3 ;\ - vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ - vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ - vmovups -4 * SIZE(AO), %xmm4 ;\ - vmovddup -4 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ - vmovddup -3 * SIZE(BO), %xmm3 ;\ - vmovaps %xmm4, %xmm2 #define KERNEL_SUB4(xx) \ - vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ - vmovaps %xmm2, %xmm4 ;\ - vmovups -2 * SIZE(AO),%xmm2 ;\ - vmovups (AO), %xmm0 ;\ - vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vmovups -4 * SIZE(AO), %xmm0 ;\ + vmovups -2 * SIZE(AO), %xmm2 ;\ + vmovddup -4 * SIZE(BO), %xmm1 ;\ + vmovddup -3 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm8, %xmm0, %xmm1,%xmm8 ;\ + vfmaddpd %xmm9, %xmm0, %xmm3,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ - vmovddup -2 * SIZE(BO), %xmm5 ;\ - vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -2 * SIZE(BO), %xmm1 ;\ vmovddup -1 * SIZE(BO), %xmm3 ;\ - vmovddup (BO), %xmm1 ;\ - vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ - vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ - vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 ;\ + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups (AO), %xmm0 ;\ + vmovddup (BO), %xmm1 ;\ vmovddup 1 * SIZE(BO), %xmm3 ;\ vmovaps %xmm0, %xmm2 @@ -410,19 +399,9 @@ vxorpd %xmm15, %xmm15,%xmm15 prefetcht0 (CO1) - prefetcht0 8*SIZE(CO1) prefetcht0 (CO1,LDC) - prefetcht0 8*SIZE(CO1,LDC) 
prefetcht0 (CO2) - prefetcht0 8*SIZE(CO2) prefetcht0 (CO2,LDC) - prefetcht0 8*(CO2,LDC) - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -16 * SIZE(BO), %xmm1 - vmovddup -15 * SIZE(BO), %xmm3 - - vmovaps %xmm0, %xmm2 - #ifndef TRMMKERNEL movq K, %rax @@ -447,14 +426,18 @@ negq %rax NOBRANCH je .L15 - ALIGN_4 + // ALIGN_4 .align 16 + +#define PR1 16 +#define PR2 24 + .L12: - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -465,10 +448,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -479,10 +462,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -493,10 +476,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -507,10 +490,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 
PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -521,10 +504,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -535,10 +518,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -549,10 +532,10 @@ KERNEL8(16 * 0) NOBRANCH je .L15 - prefetcht0 24*SIZE(AO,%rax,4) - prefetcht0 32*SIZE(AO,%rax,4) - prefetcht0 24*SIZE(BO,%rax,4) - prefetcht0 32*SIZE(BO,%rax,4) + prefetcht0 PR1*SIZE(AO,%rax,4) + prefetcht0 PR2*SIZE(AO,%rax,4) + prefetcht0 PR1*SIZE(BO,%rax,4) + prefetcht0 PR2*SIZE(BO,%rax,4) KERNEL1(16 * 0) KERNEL2(16 * 0) KERNEL3(16 * 0) @@ -601,23 +584,26 @@ ALIGN_4 .L17: - vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 - vmovaps %xmm2, %xmm0 - vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vmovups -16 * SIZE(AO, %rax, 4), %xmm0 + vmovups -14 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -16 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -15 * SIZE(BO, %rax, 4), %xmm3 + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 - vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 - vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 - vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 - vfmaddpd 
%xmm11,%xmm3, %xmm0,%xmm11 - vmovups -12 * SIZE(AO, %rax, 4), %xmm0 - vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 +/* + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 - vmovaps %xmm0, %xmm2 - + vmovaps %xmm0, %xmm2 +*/ addq $SIZE, %rax jl .L17 ALIGN_4 From f300ce3df52656424d2a253732932ef691f7728a Mon Sep 17 00:00:00 2001 From: wernsaar Date: Wed, 6 Mar 2013 17:26:03 +0100 Subject: [PATCH 153/162] new optimization of dgemm kernel for bulldozer: 10% performance increase --- kernel/x86_64/dgemm_kernel_4x4_bulldozer.S | 138 +++------------------ 1 file changed, 20 insertions(+), 118 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S index e43dad4e7..f8a316b64 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -340,7 +340,7 @@ vmovsd %xmm0, ALPHA - salq $BASE_SHIFT, LDC + salq $BASE_SHIFT, LDC # LDC << 3 # LDC * 8 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET @@ -350,7 +350,7 @@ #endif #endif movq N, J - sarq $2, J # j = (n >> 2) + sarq $2, J # j = (n >> 2) # j = n / 4 jle .L40 ALIGN_4 @@ -434,104 +434,6 @@ #define PR2 24 .L12: - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 
0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 - prefetcht0 PR1*SIZE(AO,%rax,4) - prefetcht0 PR2*SIZE(AO,%rax,4) - prefetcht0 PR1*SIZE(BO,%rax,4) - prefetcht0 PR2*SIZE(BO,%rax,4) - KERNEL1(16 * 0) - KERNEL2(16 * 0) - KERNEL3(16 * 0) - KERNEL4(16 * 0) - KERNEL5(16 * 0) - KERNEL6(16 * 0) - KERNEL7(16 * 0) - KERNEL8(16 * 0) - NOBRANCH - je .L15 prefetcht0 PR1*SIZE(AO,%rax,4) prefetcht0 PR2*SIZE(AO,%rax,4) prefetcht0 PR1*SIZE(BO,%rax,4) @@ -986,15 +888,15 @@ jg .L01 ALIGN_4 -.L40: - testq $3, N - je .L999 +.L40: # N % 4 + testq $3, N # N % 4 == 3 + je .L999 # Jump to end if N % 4 == 0 - testq $2, N + testq $2, N # N % 4 == 2 je .L80 ALIGN_4 -.L41: +.L41: # N % 4 > 1 #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK @@ -1002,14 +904,14 @@ movq C, CO1 # coffset1 = c leaq (C, LDC, 1), CO2 # coffset2 = c + ldc - movq A, AO # aoffset = a + movq A, AO # aoffset = a movq K, %rax - salq $BASE_SHIFT + 1, %rax + salq $BASE_SHIFT + 1, %rax # k << 4 leaq (B, %rax), BB movq M, I - 
sarq $2, I # i = (m >> 2) + sarq $2, I # i = (m >> 2) jle .L60 ALIGN_4 @@ -1063,12 +965,12 @@ je .L56 ALIGN_4 -.L52: +.L52: # Loop for (N % 4) == 2 vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 - vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 - vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 @@ -1076,15 +978,15 @@ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 - vmovups (AO, %rax, 4), %xmm0 - vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vmovups (AO, %rax, 4), %xmm0 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 vmovddup -11 * SIZE(BO, %rax, 2), %xmm5 - vmovups -6 * SIZE(AO, %rax, 4), %xmm2 + vmovups -6 * SIZE(AO, %rax, 4), %xmm2 vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 - vmovups -4 * SIZE(AO, %rax, 4), %xmm4 + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 vmovddup -10 * SIZE(BO, %rax, 2), %xmm3 vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 vmovddup -9 * SIZE(BO, %rax, 2), %xmm5 @@ -1093,7 +995,7 @@ vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 - vmovups 8 * SIZE(AO, %rax, 4), %xmm4 + vmovups 8 * SIZE(AO, %rax, 4), %xmm4 vmovddup -4 * SIZE(BO, %rax, 2), %xmm3 vmovddup -7 * SIZE(BO, %rax, 2), %xmm5 vmovaps %xmm0, %xmm2 @@ -1455,8 +1357,8 @@ ALIGN_4 .L80: - testq $1, N - je .L999 + testq $1, N # N % 4 == 1 + je .L999 # Jump to end if N % 4 == 0 ALIGN_4 .L81: From 724ae159cedfe8e080c8b18ed21b1769c83cc2c0 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 8 Mar 2013 22:28:34 +0800 Subject: [PATCH 154/162] Fixed the Windows x86_64 ABI bug in s/daxpy kernels. 
--- kernel/x86_64/axpy_sse.S | 6 +++++- kernel/x86_64/axpy_sse2.S | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S index 9a7512575..2a9e928ed 100644 --- a/kernel/x86_64/axpy_sse.S +++ b/kernel/x86_64/axpy_sse.S @@ -69,7 +69,7 @@ #endif movaps %xmm0, ALPHA #else - movaps %xmm3, ALPHA + movq 40(%rsp), X movq 48(%rsp), INCX @@ -79,6 +79,10 @@ SAVEREGISTERS +#ifdef WINDOWS_ABI + movaps %xmm3, ALPHA +#endif + shufps $0, ALPHA, ALPHA leaq (, INCX, SIZE), INCX diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S index dea8d0382..45c7b0380 100644 --- a/kernel/x86_64/axpy_sse2.S +++ b/kernel/x86_64/axpy_sse2.S @@ -69,7 +69,6 @@ #endif movaps %xmm0, ALPHA #else - movaps %xmm3, ALPHA movq 40(%rsp), X movq 48(%rsp), INCX @@ -79,6 +78,10 @@ SAVEREGISTERS +#ifdef WINDOWS_ABI + movaps %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA leaq (, INCX, SIZE), INCX From d831b2ff8b60bb55e5697cc00f49018dc7b90154 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 10 Mar 2013 01:01:16 +0800 Subject: [PATCH 155/162] Override CFLAGS in LAPACK make.in. 
--- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a84b54d4b..fc6f3db01 100644 --- a/Makefile +++ b/Makefile @@ -225,9 +225,9 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc ifdef INTERFACE64 - -@echo "CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override CFLAGS = $(CFLAGS) -DHAVE_LAPACK_CONFIG_H -DLAPACK_ILP64" >> $(NETLIB_LAPACK_DIR)/make.inc else - -@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc From 773c01f4966b2ca0d9cbb654a7f188358372da0c Mon Sep 17 00:00:00 2001 From: Explorer09 Date: Sun, 17 Mar 2013 22:48:24 +0800 Subject: [PATCH 156/162] Typo correction in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ed5f196c7..f932b5141 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ On X86 box, compile this library for loongson3a CPU. 
make DEBUG=1 -### Intall to the directory (Optional) +### Install to the directory (Optional) Example: From 309f90e563776d54787e04c449f38a55cd9240e4 Mon Sep 17 00:00:00 2001 From: Explorer09 Date: Sun, 17 Mar 2013 23:03:05 +0800 Subject: [PATCH 157/162] TargetList.txt: minor re-ordering --- TargetList.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index c859db082..ce35a3faa 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -8,8 +8,8 @@ Supported List: 1.X86/X86_64 a)Intel CPU: P2 -COPPERMINE KATMAI +COPPERMINE NORTHWOOD PRESCOTT BANIAS From b47f13ee4c2e33ba45a2f869fb4c845cb3a2074f Mon Sep 17 00:00:00 2001 From: Explorer09 Date: Sun, 17 Mar 2013 23:07:48 +0800 Subject: [PATCH 158/162] getarch.c: Minor re-ordering of architecture list --- getarch.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/getarch.c b/getarch.c index 2b9856338..2e0f2ed42 100644 --- a/getarch.c +++ b/getarch.c @@ -96,6 +96,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PENRYN */ /* #define FORCE_DUNNINGTON */ /* #define FORCE_NEHALEM */ +/* #define FORCE_SANDYBRIDGE */ +/* #define FORCE_ATOM */ /* #define FORCE_ATHLON */ /* #define FORCE_OPTERON */ /* #define FORCE_OPTERON_SSE3 */ @@ -103,7 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ /* #define FORCE_BULLDOZER */ -/* #define FORCE_BOBCAT */ +/* #define FORCE_BOBCAT */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -118,12 +120,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3A */ +/* #define FORCE_LOONGSON3B */ /* #define FORCE_ITANIUM2 */ -/* #define FORCE_GENERIC */ /* #define FORCE_SPARC */ /* #define FORCE_SPARCV7 */ +/* #define FORCE_GENERIC */ #ifdef FORCE_P2 #define FORCE @@ -139,32 +141,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "P5" #endif -#ifdef FORCE_COPPERMINE +#ifdef FORCE_KATMAI #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM3" #define ARCHCONFIG "-DPENTIUM3 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " -#define LIBNAME "coppermine" -#define CORENAME "COPPERMINE" +#define LIBNAME "katmai" +#define CORENAME "KATMAI" #endif -#ifdef FORCE_KATMAI +#ifdef FORCE_COPPERMINE #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "PENTIUM3" #define ARCHCONFIG "-DPENTIUM3 " \ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " -#define LIBNAME "katmai" -#define CORENAME "KATMAI" +#define LIBNAME "coppermine" +#define CORENAME "COPPERMINE" #endif #ifdef FORCE_NORTHWOOD From 53588bc78604a88b29fc9296723ed3e0b0ff40b8 Mon Sep 17 00:00:00 2001 From: Explorer09 Date: Sun, 17 Mar 2013 23:09:23 +0800 Subject: [PATCH 159/162] getarch.c: Minor re-ordering of architecture list --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 2e0f2ed42..ac10f1cd5 100644 --- a/getarch.c +++ b/getarch.c @@ -104,8 +104,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 
OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ -/* #define FORCE_BULLDOZER */ /* #define FORCE_BOBCAT */ +/* #define FORCE_BULLDOZER */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ From 7a9b94b51973325bed0ac824906332b8760a8d72 Mon Sep 17 00:00:00 2001 From: wlbksy Date: Sat, 23 Mar 2013 14:41:26 +0800 Subject: [PATCH 160/162] Fix #204 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fc6f3db01..bde7cf376 100644 --- a/Makefile +++ b/Makefile @@ -267,7 +267,7 @@ else ifeq ($(OSNAME), FreeBSD) fetch $(LAPACK_URL) else - wget $(LAPACK_URL) + wget -O $@ $(LAPACK_URL) endif endif endif @@ -320,4 +320,4 @@ clean :: rm -rf $(NETLIB_LAPACK_DIR) ;\ fi @rm -f *.grd Makefile.conf_last config_last.h - @echo Done. \ No newline at end of file + @echo Done. From 6b01d587127536673e0db38231e92945701afeb0 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 30 Mar 2013 20:12:43 +0000 Subject: [PATCH 161/162] Disable the optimization of muli-threading gemm on the Loongson3A. 
--- driver/level3/gemm_thread_n.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c index f9007f831..3e11f9aba 100644 --- a/driver/level3/gemm_thread_n.c +++ b/driver/level3/gemm_thread_n.c @@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( queue[num_cpu].args = arg; queue[num_cpu].range_m = range_m; queue[num_cpu].range_n = &range[num_cpu]; -#if defined(LOONGSON3A) +#if 0 //defined(LOONGSON3A) queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu; queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5; #else @@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( } if (num_cpu) { -#if defined(LOONGSON3A) +#if 0 //defined(LOONGSON3A) queue[0].sa = sa; queue[0].sb = sa + GEMM_OFFSET_A1 * 5; #else From 1a57717b1a19efd0d8dd7a01ae0b1c242e0a2742 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 7 Apr 2013 15:42:07 +0800 Subject: [PATCH 162/162] Added the configuration of Loongcc compiler for Loongson 3 CPU. 
--- Makefile.system | 35 +++++++++++++++++++++++++++++++++++ README.md | 4 ++++ 2 files changed, 39 insertions(+) diff --git a/Makefile.system b/Makefile.system index eac61e961..5f8c251b0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -514,11 +514,28 @@ ifdef INTERFACE64 FCOMMON_OPT += -i8 endif endif + +ifeq ($(ARCH), mips64) +ifndef BINARY64 +FCOMMON_OPT += -n32 +else +FCOMMON_OPT += -n64 +endif +ifeq ($(CORE), LOONGSON3A) +FCOMMON_OPT += -loongson3 +endif + +ifeq ($(CORE), LOONGSON3B) +FCOMMON_OPT += -loongson3 +endif + +else ifndef BINARY64 FCOMMON_OPT += -m32 else FCOMMON_OPT += -m64 endif +endif ifdef USE_OPENMP FEXTRALIB += -lstdc++ @@ -527,12 +544,30 @@ endif endif ifeq ($(C_COMPILER), OPEN64) + +ifeq ($(ARCH), mips64) +ifndef BINARY64 +CCOMMON_OPT += -n32 +else +CCOMMON_OPT += -n64 +endif +ifeq ($(CORE), LOONGSON3A) +CCOMMON_OPT += -loongson3 +endif + +ifeq ($(CORE), LOONGSON3B) +CCOMMON_OPT += -loongson3 +endif + +else + ifndef BINARY64 CCOMMON_OPT += -m32 else CCOMMON_OPT += -m64 endif endif +endif ifeq ($(C_COMPILER), SUN) CCOMMON_OPT += -w diff --git a/README.md b/README.md index f932b5141..0e3a49530 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,10 @@ On X86 box, compile this library for loongson3a CPU. make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A +On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler. + + make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 + ### Debug version make DEBUG=1