| @@ -59,6 +59,9 @@ endif | |||
| @$(CC) --version > /dev/null 2>&1;\ | |||
| if [ $$? -eq 0 ]; then \ | |||
| cverinfo=`$(CC) --version | sed -n '1p'`; \ | |||
| if [ -z "$${cverinfo}" ]; then \ | |||
| cverinfo=`$(CC) --version | sed -n '2p'`; \ | |||
| fi; \ | |||
| echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ | |||
| else \ | |||
| echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ | |||
| @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @$(FC) --version > /dev/null 2>&1;\ | |||
| if [ $$? -eq 0 ]; then \ | |||
| fverinfo=`$(FC) --version | sed -n '1p'`; \ | |||
| if [ -z "$${fverinfo}" ]; then \ | |||
| fverinfo=`$(FC) --version | sed -n '2p'`; \ | |||
| fi; \ | |||
| echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ | |||
| else \ | |||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | |||
| @@ -10,9 +10,11 @@ USE_OPENMP = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| @@ -181,7 +181,7 @@ endif | |||
| # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch. | |||
| ifeq ($(HOSTARCH), x86_64) | |||
| ifeq ($(findstring pgcc,$(HOSTCC)),) | |||
| ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) | |||
| GETARCH_FLAGS += -march=native | |||
| endif | |||
| endif | |||
| @@ -663,6 +663,7 @@ endif | |||
| endif # ARCH zarch | |||
| ifeq ($(ARCH), power) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| @@ -689,6 +690,10 @@ else | |||
| # No comma after the function name: "$(info, ...)" is not recognised as the info function and expands to nothing. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| endif | |||
| else | |||
| DYNAMIC_CORE = POWER8 | |||
| DYNAMIC_CORE += POWER9 | |||
| endif | |||
| endif | |||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
| @@ -847,9 +852,19 @@ endif | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| # Decide whether the PGI/NVIDIA compiler is recent enough to set NEWPGI. | |||
| # Each probe parses the second line of `$(CC) --version`, keeps only digits and dots, and lets expr print 1 or 0. | |||
| PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) | |||
| PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20) | |||
| # NOTE(review): despite the GE11 name this compares characters 4-5 for equality with 11; confirm whether >= was intended. | |||
| PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11) | |||
| # Digits are GT20, GTEQ20, MINOR11: 110 and 111 mean major > 20, 011 means major == 20 with the minor test true. | |||
| PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11) | |||
| ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011)) | |||
| NEWPGI := 1 | |||
| endif | |||
| ifdef BINARY64 | |||
| ifeq ($(ARCH), x86_64) | |||
| CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm | |||
| CCOMMON_OPT += -tp p7-64 | |||
| ifneq ($(NEWPGI),1) | |||
| CCOMMON_OPT += -D__MMX__ -Mnollvm | |||
| endif | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER8) | |||
| @@ -1029,18 +1044,24 @@ ifeq ($(ARCH), x86_64) | |||
| FCOMMON_OPT += -tp p7-64 | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER6) | |||
| $(warning NVIDIA HPC compilers do not support POWER6.) | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| FCOMMON_OPT += -tp pwr8 | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| FCOMMON_OPT += -tp pwr9 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| $(warning NVIDIA HPC compilers do not support POWER10.) | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -tp p7 | |||
| endif | |||
| FCOMMON_OPT += -Mrecursive | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -mp | |||
| endif | |||
| @@ -13,7 +13,7 @@ Drone CI: [ { | |||
| #ifndef C_PGI | |||
| if (gotoblas == &gotoblas_POWER6) return corename[1]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| @@ -38,10 +40,157 @@ char *gotoblas_corename(void) { | |||
| return corename[0]; | |||
| } | |||
| #ifdef C_PGI | |||
| /* | |||
| * NV HPC compilers do not yet implement __builtin_cpu_is(). | |||
| * Fake a version here for use in the CPU detection code below. | |||
| * | |||
| * Strategy here is to first check the CPU to see what it actually is, | |||
| * and then test the input to see if what the CPU actually is matches | |||
| * what was requested. | |||
| */ | |||
| #include <string.h> | |||
| /* | |||
| * Define POWER processor version table. | |||
| * | |||
| * Each entry pairs a mask and an expected value for the processor version | |||
| * register (PVR) with a display name and the CPU_* class it maps to. | |||
| * __builtin_cpu_is() below uses the first entry for which | |||
| * (pvr & pvr_mask) == pvr_value, so narrower masks must be listed before | |||
| * broader ones for the same family, and the all-zero entry must stay last. | |||
| * Only CPU_POWER8 and CPU_POWER9 are compared against in __builtin_cpu_is(). | |||
| * | |||
| * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time | |||
| */ | |||
| #define CPU_UNKNOWN 0 /* class of the catch-all last table entry */ | |||
| #define CPU_POWER5 5 | |||
| #define CPU_POWER6 6 /* also assigned to the POWER7 and POWER7+ entries */ | |||
| #define CPU_POWER8 8 | |||
| #define CPU_POWER9 9 | |||
| #define CPU_POWER10 10 | |||
| static struct { | |||
| uint32_t pvr_mask; /* PVR bits that take part in the comparison */ | |||
| uint32_t pvr_value; /* expected value of the masked PVR */ | |||
| const char* cpu_name; /* name shown by the DEBUG printout */ | |||
| uint32_t cpu_type; /* one of the CPU_* classes above */ | |||
| } pvrPOWER [] = { | |||
| { /* POWER6 in P5+ mode; 2.04-compliant processor; full PVR must match */ | |||
| .pvr_mask = 0xffffffff, | |||
| .pvr_value = 0x0f000001, | |||
| .cpu_name = "POWER5+", | |||
| .cpu_type = CPU_POWER5, | |||
| }, | |||
| { /* Power6 aka POWER6X */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x003e0000, | |||
| .cpu_name = "POWER6 (raw)", | |||
| .cpu_type = CPU_POWER6, | |||
| }, | |||
| { /* Power7; classified as CPU_POWER6 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x003f0000, | |||
| .cpu_name = "POWER7 (raw)", | |||
| .cpu_type = CPU_POWER6, | |||
| }, | |||
| { /* Power7+; classified as CPU_POWER6 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004A0000, | |||
| .cpu_name = "POWER7+ (raw)", | |||
| .cpu_type = CPU_POWER6, | |||
| }, | |||
| { /* Power8E */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004b0000, | |||
| .cpu_name = "POWER8E (raw)", | |||
| .cpu_type = CPU_POWER8, | |||
| }, | |||
| { /* Power8NVL */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004c0000, | |||
| .cpu_name = "POWER8NVL (raw)", | |||
| .cpu_type = CPU_POWER8, | |||
| }, | |||
| { /* Power8 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004d0000, | |||
| .cpu_name = "POWER8 (raw)", | |||
| .cpu_type = CPU_POWER8, | |||
| }, | |||
| { /* Power9 DD2.0; listed before the generic Power9 entry, which would also match */ | |||
| .pvr_mask = 0xffffefff, | |||
| .pvr_value = 0x004e0200, | |||
| .cpu_name = "POWER9 (raw)", | |||
| .cpu_type = CPU_POWER9, | |||
| }, | |||
| { /* Power9 DD2.1; listed before the generic Power9 entry, which would also match */ | |||
| .pvr_mask = 0xffffefff, | |||
| .pvr_value = 0x004e0201, | |||
| .cpu_name = "POWER9 (raw)", | |||
| .cpu_type = CPU_POWER9, | |||
| }, | |||
| { /* Power9 DD2.2 or later; matches on the upper 16 bits only */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x004e0000, | |||
| .cpu_name = "POWER9 (raw)", | |||
| .cpu_type = CPU_POWER9, | |||
| }, | |||
| { /* Power10 */ | |||
| .pvr_mask = 0xffff0000, | |||
| .pvr_value = 0x00800000, | |||
| .cpu_name = "POWER10 (raw)", | |||
| .cpu_type = CPU_POWER10, | |||
| }, | |||
| { /* End of table, pvr_mask and pvr_value must be zero so that this entry matches any PVR */ | |||
| .pvr_mask = 0x0, | |||
| .pvr_value = 0x0, | |||
| .cpu_name = "Unknown", | |||
| .cpu_type = CPU_UNKNOWN, | |||
| }, | |||
| }; | |||
| /* | |||
| * Report whether the running CPU belongs to the class named by `cpu`. | |||
| * | |||
| * Reads the processor version register, classifies it with the first | |||
| * matching pvrPOWER entry, and compares that class with the request. | |||
| * Only "power8" and "power9" are recognised; any other name returns 0. | |||
| * | |||
| * Returns nonzero when the CPU matches the requested name, 0 otherwise. | |||
| */ | |||
| static int __builtin_cpu_is(const char *cpu) { | |||
| size_t i; | |||
| uint32_t pvr; | |||
| uint32_t cpu_type; | |||
| asm("mfpvr %0" : "=r"(pvr)); | |||
| /* The last table entry has a zero mask and value, so it matches any PVR and i always ends on a valid index. */ | |||
| for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { | |||
| if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { | |||
| break; | |||
| } | |||
| } | |||
| #if defined(DEBUG) | |||
| /* cpu_type is an integer class, not a pointer, so print it with %u. */ | |||
| printf("%s: returning CPU=%s, cpu_type=%u\n", __func__, | |||
| pvrPOWER[i].cpu_name, (unsigned)pvrPOWER[i].cpu_type); | |||
| #endif | |||
| cpu_type = pvrPOWER[i].cpu_type; | |||
| if (!strcmp(cpu, "power8")) | |||
| return cpu_type == CPU_POWER8; | |||
| if (!strcmp(cpu, "power9")) | |||
| return cpu_type == CPU_POWER9; | |||
| return 0; | |||
| } | |||
| #endif /* C_PGI */ | |||
| static gotoblas_t *get_coretype(void) { | |||
| #ifndef C_PGI | |||
| if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | |||
| return &gotoblas_POWER6; | |||
| #endif | |||
| if (__builtin_cpu_is("power8")) | |||
| return &gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| @@ -77,7 +226,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||
| switch (found) | |||
| { | |||
| #ifndef C_PGI | |||
| case 1: return (&gotoblas_POWER6); | |||
| #endif | |||
| case 2: return (&gotoblas_POWER8); | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| case 3: return (&gotoblas_POWER9); | |||
| @@ -32,7 +32,7 @@ if ($compiler eq "") { | |||
| "xlf95", "xlf90", "xlf", | |||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | |||
| "pathf90", "pathf95", | |||
| "pgf95", "pgf90", "pgf77", | |||
| "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", | |||
| "flang", "egfortran", | |||
| "ifort"); | |||
| @@ -64,7 +64,6 @@ if ($compiler eq "") { | |||
| if (!$?) { | |||
| $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; | |||
| if ($data =~ /zhoge_/) { | |||
| $bu = "_"; | |||
| } | |||
| @@ -87,7 +86,7 @@ if ($compiler eq "") { | |||
| if ($compiler =~ /flang/) { | |||
| $vendor = FLANG; | |||
| $openmp = "-fopenmp"; | |||
| } elsif ($compiler =~ /pgf/) { | |||
| } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } else { | |||
| @@ -123,7 +122,7 @@ if ($compiler eq "") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($data =~ /PGF/) { | |||
| if ($data =~ /PGF/ || $data =~ /NVF/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } | |||
| @@ -177,7 +176,7 @@ if ($compiler eq "") { | |||
| $openmp = "-mp"; | |||
| } | |||
| if ($compiler =~ /pgf/) { | |||
| if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { | |||
| $vendor = PGI; | |||
| $bu = "_"; | |||
| $openmp = "-mp"; | |||
| @@ -330,7 +329,7 @@ if ($link ne "") { | |||
| $flags =~ s/\@/\,/g; | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { | |||
| if ($flags =~ /-lgomp/ && $CC =~ /clang/) { | |||
| $flags = "-lomp"; | |||
| } | |||
| @@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| override CFLAGS += -march=cooperlake | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| @@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| endif | |||
| endif | |||
| else ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| @@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| @@ -153,12 +153,12 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| # Real precisions (S, D) use nrm2.S and complex precisions (C, Z) use znrm2.S. | |||
| SNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DNRM2KERNEL = nrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| @@ -153,13 +153,16 @@ IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| #SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| #CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ##DNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| ##ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| #DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| #ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||