| @@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) | |||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | ||||
| ifneq ($(NO_AVX), 1) | ifneq ($(NO_AVX), 1) | ||||
| DYNAMIC_CORE += SANDYBRIDGE | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -125,7 +125,8 @@ | |||||
| #define HAVE_MISALIGNSSE (1 << 15) | #define HAVE_MISALIGNSSE (1 << 15) | ||||
| #define HAVE_128BITFPU (1 << 16) | #define HAVE_128BITFPU (1 << 16) | ||||
| #define HAVE_FASTMOVU (1 << 17) | #define HAVE_FASTMOVU (1 << 17) | ||||
| #define HAVE_AVX (1 << 18) | |||||
| #define HAVE_AVX (1 << 18) | |||||
| #define HAVE_FMA4 (1 << 19) | |||||
| #define CACHE_INFO_L1_I 1 | #define CACHE_INFO_L1_I 1 | ||||
| #define CACHE_INFO_L1_D 2 | #define CACHE_INFO_L1_D 2 | ||||
| @@ -43,6 +43,8 @@ | |||||
/*
 * When NO_AVX is defined, the Sandy Bridge and Bulldozer identifiers are
 * aliased to Nehalem and Barcelona respectively, for both the CPUTYPE_*
 * and the CORE_* families of constants.
 */
#if defined(NO_AVX)
#  define CPUTYPE_SANDYBRIDGE  CPUTYPE_NEHALEM
#  define CPUTYPE_BULLDOZER    CPUTYPE_BARCELONA
#  define CORE_SANDYBRIDGE     CORE_NEHALEM
#  define CORE_BULLDOZER       CORE_BARCELONA
#endif
| #ifndef CPUIDEMU | #ifndef CPUIDEMU | ||||
| @@ -228,6 +230,9 @@ int get_cputype(int gettype){ | |||||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||||
| if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | ||||
| if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | ||||
| #ifndef NO_AVX | |||||
| if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||||
| #endif | |||||
| if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | ||||
| if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | ||||
| } | } | ||||
| @@ -1075,8 +1080,12 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_OPTERON; | return CPUTYPE_OPTERON; | ||||
| case 1: | case 1: | ||||
| case 10: | case 10: | ||||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CPUTYPE_BULLDOZER; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS doesn't support AVX. | |||||
| case 5: | case 5: | ||||
| return CPUTYPE_BOBCAT; | return CPUTYPE_BOBCAT; | ||||
| } | } | ||||
| @@ -1427,8 +1436,13 @@ int get_coretype(void){ | |||||
| if (family == 0xf){ | if (family == 0xf){ | ||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | ||||
| else if (exfamily == 5) return CORE_BOBCAT; | else if (exfamily == 5) return CORE_BOBCAT; | ||||
| else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| else return CORE_BARCELONA; | |||||
| else if (exfamily == 6) { | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return CORE_BULLDOZER; | |||||
| else | |||||
| return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| }else return CORE_BARCELONA; | |||||
| } | } | ||||
| } | } | ||||
| @@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; | |||||
| extern gotoblas_t gotoblas_BOBCAT; | extern gotoblas_t gotoblas_BOBCAT; | ||||
| #ifndef NO_AVX | #ifndef NO_AVX | ||||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | extern gotoblas_t gotoblas_SANDYBRIDGE; | ||||
| extern gotoblas_t gotoblas_BULLDOZER; | |||||
| #else | #else | ||||
| //Use NEHALEM kernels for sandy bridge | //Use NEHALEM kernels for sandy bridge | ||||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | ||||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||||
| #endif | #endif | ||||
| @@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){ | |||||
| else return &gotoblas_OPTERON; | else return &gotoblas_OPTERON; | ||||
| } else if (exfamily == 5) { | } else if (exfamily == 5) { | ||||
| return &gotoblas_BOBCAT; | return &gotoblas_BOBCAT; | ||||
| } else if (exfamily == 6) { | |||||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||||
| if(support_avx()) | |||||
| return &gotoblas_BULLDOZER; | |||||
| else{ | |||||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } else { | } else { | ||||
| return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
| } | } | ||||
| @@ -238,6 +248,7 @@ static char *corename[] = { | |||||
| "Nano", | "Nano", | ||||
| "Sandybridge", | "Sandybridge", | ||||
| "Bobcat", | "Bobcat", | ||||
| "Bulldozer", | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -259,6 +270,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | if (gotoblas == &gotoblas_NANO) return corename[15]; | ||||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | ||||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | ||||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "OPTERON" | #define CORENAME "OPTERON" | ||||
| #endif | #endif | ||||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) | |||||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| @@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "BOBCAT" | #define CORENAME "BOBCAT" | ||||
| #endif | #endif | ||||
/*
 * Forced-target configuration selected when FORCE_BULLDOZER is defined.
 * Defines the architecture/sub-architecture names, the library and core
 * names, and ARCHCONFIG, the string of -D options describing the target.
 *
 * ARCHCONFIG is assembled from adjacent string literals, which the compiler
 * joins with no separator. Every fragment except the last must therefore end
 * in a space; without it two options fuse into one meaningless token
 * (previously "-DHAVE_FASTMOVU-DHAVE_AVX").
 *
 * NOTE(review): ARCHCONFIG passes -DBARCELONA although SUBARCHITECTURE and
 * CORENAME say BULLDOZER -- presumably so Barcelona tuning keeps applying;
 * confirm this is intended.
 */
#if defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE    "X86"
#define SUBARCHITECTURE "BULLDOZER"
#define ARCHCONFIG   "-DBARCELONA " \
		     "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
		     "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
		     "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
		     "-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME   "bulldozer"
#define CORENAME  "BULLDOZER"
#endif
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -0,0 +1,59 @@ | |||||
# Kernel selection for the Bulldozer target.
# Every entry names an existing Barcelona, SSE/SSE2 or generic source file;
# no Bulldozer-specific kernel is referenced in this list.
# Comments are kept on their own lines so that no assignment picks up
# trailing whitespace in its value.

# Single-precision real GEMM (INCOPY/ITCOPY and their objects left empty).
SGEMMKERNEL     = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY     =
SGEMMITCOPY     =
SGEMMONCOPY     = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY     = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ  =
SGEMMITCOPYOBJ  =
SGEMMONCOPYOBJ  = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ  = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Double-precision real GEMM.
DGEMMKERNEL     = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY     = ../generic/gemm_ncopy_2.c
DGEMMITCOPY     = ../generic/gemm_tcopy_2.c
DGEMMONCOPY     = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY     = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ  = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ  = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ  = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ  = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Single-precision complex GEMM (INCOPY/ITCOPY and their objects left empty).
CGEMMKERNEL     = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY     =
CGEMMITCOPY     =
CGEMMONCOPY     = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY     = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ  =
CGEMMITCOPYOBJ  =
CGEMMONCOPYOBJ  = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ  = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Double-precision complex GEMM.
ZGEMMKERNEL     = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY     = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY     = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY     = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY     = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ  = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ  = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ  = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ  = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# TRSM kernels. In every precision the RN variant is assigned the LT source.
STRSMKERNEL_LN  = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT  = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN  = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT  = trsm_kernel_RT_4x4_sse.S

DTRSMKERNEL_LN  = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT  = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN  = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT  = trsm_kernel_RT_2x4_sse2.S

CTRSMKERNEL_LN  = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT  = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN  = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT  = ztrsm_kernel_RT_2x2_sse.S

# NOTE(review): unlike the other precisions, ZTRSM LN is also assigned the
# LT source here -- confirm this is intentional.
ZTRSMKERNEL_LN  = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT  = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN  = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT  = ztrsm_kernel_RT_1x2_sse2.S

# Complex GEMM3M kernels.
CGEMM3MKERNEL   = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL   = zgemm3m_kernel_2x4_barcelona.S
| @@ -0,0 +1,62 @@ | |||||
# Kernel selection for the Bulldozer target.
# Every entry names an existing Barcelona, Opteron, SSE/SSE2, generic or
# "dup" source file; no Bulldozer-specific kernel is referenced in this list.
# Comments are kept on their own lines so that no assignment picks up
# trailing whitespace in its value.

# Double-precision complex GEMV (N and T variants).
ZGEMVNKERNEL    = zgemv_n_dup.S
ZGEMVTKERNEL    = zgemv_t_dup.S

# Single-precision real GEMM.
SGEMMKERNEL     = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY     = ../generic/gemm_ncopy_8.c
SGEMMITCOPY     = ../generic/gemm_tcopy_8.c
SGEMMONCOPY     = gemm_ncopy_4_opteron.S
SGEMMOTCOPY     = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ  = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ  = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ  = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ  = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Double-precision real GEMM (INCOPY/ITCOPY and their objects left empty).
DGEMMKERNEL     = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY     =
DGEMMITCOPY     =
DGEMMONCOPY     = gemm_ncopy_4_opteron.S
DGEMMOTCOPY     = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ  =
DGEMMITCOPYOBJ  =
DGEMMONCOPYOBJ  = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ  = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Single-precision complex GEMM.
CGEMMKERNEL     = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY     = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY     = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY     = zgemm_ncopy_2.S
CGEMMOTCOPY     = zgemm_tcopy_2.S
CGEMMINCOPYOBJ  = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ  = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ  = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ  = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

# Double-precision complex GEMM (INCOPY/ITCOPY and their objects left empty).
ZGEMMKERNEL     = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY     =
ZGEMMITCOPY     =
ZGEMMONCOPY     = zgemm_ncopy_2.S
ZGEMMOTCOPY     = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ  =
ZGEMMITCOPYOBJ  =
ZGEMMONCOPYOBJ  = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ  = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# TRSM kernels. In every precision the RN variant is assigned the LT source.
STRSMKERNEL_LN  = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT  = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN  = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT  = trsm_kernel_RT_8x4_sse.S

DTRSMKERNEL_LN  = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT  = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN  = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT  = trsm_kernel_RT_4x4_barcelona.S

CTRSMKERNEL_LN  = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT  = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN  = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT  = ztrsm_kernel_RT_4x2_sse.S

ZTRSMKERNEL_LN  = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT  = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN  = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT  = ztrsm_kernel_RT_2x2_sse2.S

# Complex GEMM3M kernels.
CGEMM3MKERNEL   = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL   = zgemm3m_kernel_4x4_barcelona.S
| @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||||
| #define SNUMOPT 8 | #define SNUMOPT 8 | ||||
| #define DNUMOPT 4 | #define DNUMOPT 4 | ||||