| @@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) | |||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
| endif | |||
| endif | |||
| @@ -125,7 +125,8 @@ | |||
| #define HAVE_MISALIGNSSE (1 << 15) | |||
| #define HAVE_128BITFPU (1 << 16) | |||
| #define HAVE_FASTMOVU (1 << 17) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -43,6 +43,8 @@ | |||
| #ifdef NO_AVX | |||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | |||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
| #define CORE_BULLDOZER CORE_BARCELONA | |||
| #endif | |||
| #ifndef CPUIDEMU | |||
| @@ -228,6 +230,9 @@ int get_cputype(int gettype){ | |||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
| if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | |||
| if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | |||
| #ifndef NO_AVX | |||
| if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||
| #endif | |||
| if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | |||
| if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | |||
| } | |||
| @@ -1075,8 +1080,12 @@ int get_cpuname(void){ | |||
| return CPUTYPE_OPTERON; | |||
| case 1: | |||
| case 10: | |||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| return CPUTYPE_BARCELONA; | |||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return CPUTYPE_BULLDOZER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| case 5: | |||
| return CPUTYPE_BOBCAT; | |||
| } | |||
| @@ -1427,8 +1436,13 @@ int get_coretype(void){ | |||
| if (family == 0xf){ | |||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||
| else if (exfamily == 5) return CORE_BOBCAT; | |||
| else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| else return CORE_BARCELONA; | |||
| else if (exfamily == 6) { | |||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return CORE_BULLDOZER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. Use old kernels. | |||
| }else return CORE_BARCELONA; | |||
| } | |||
| } | |||
| @@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #ifndef NO_AVX | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| #else | |||
| //Use NEHALEM kernels for sandy bridge | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #endif | |||
| @@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){ | |||
| else return &gotoblas_OPTERON; | |||
| } else if (exfamily == 5) { | |||
| return &gotoblas_BOBCAT; | |||
| } else if (exfamily == 6) { | |||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return &gotoblas_BULLDOZER; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| @@ -238,6 +248,7 @@ static char *corename[] = { | |||
| "Nano", | |||
| "Sandybridge", | |||
| "Bobcat", | |||
| "Bulldozer", | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -259,6 +270,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | |||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| return corename[0]; | |||
| } | |||
| @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "OPTERON" | |||
| #endif | |||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) | |||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| @@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "BOBCAT" | |||
| #endif | |||
| #if defined (FORCE_BULLDOZER) | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "BULLDOZER" | |||
| #define ARCHCONFIG "-DBARCELONA " \ | |||
| "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" \ | |||
| "-DHAVE_AVX -DHAVE_FMA4" | |||
| #define LIBNAME "bulldozer" | |||
| #define CORENAME "BULLDOZER" | |||
| #endif | |||
| #ifdef FORCE_SSE_GENERIC | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -0,0 +1,59 @@ | |||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| CGEMMINCOPY = | |||
| CGEMMITCOPY = | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = | |||
| CGEMMITCOPYOBJ = | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||
| @@ -0,0 +1,62 @@ | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define SNUMOPT 8 | |||
| #define DNUMOPT 4 | |||