| @@ -43,6 +43,18 @@ matrix: | |||
| - TARGET_BOX=IBMZ_LINUX | |||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - <<: *test-ubuntu | |||
| os: linux | |||
| dist: focal | |||
| arch: s390x | |||
| compiler: clang | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=IBMZ_LINUX | |||
| - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" | |||
| - <<: *test-ubuntu | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| @@ -187,6 +187,7 @@ In chronological order: | |||
| * Marius Hillenbrand <https://github.com/mhillenibm> | |||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | |||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | |||
| * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support | |||
| * Danfeng Zhang <https://github.com/craft-zhang> | |||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | |||
| @@ -295,7 +295,6 @@ endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| @@ -594,34 +593,36 @@ endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| else ifeq ($(GCCVERSIONEQ5), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| endif | |||
| endif | |||
| ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) | |||
| ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| endif | |||
| endif | |||
| ifeq ($(ZARCH_SUPPORT_Z13), 1) | |||
| # if the compiler accepts -march=arch11 or -march=z13 and can compile a file | |||
| # with z13-specific inline assembly, then we can include support for Z13. | |||
| # note: -march=z13 is equivalent to -march=arch11 yet some compiler releases | |||
| # only support one or the other. | |||
| # note: LLVM version 6.x supported -march=z13 yet could not handle vector | |||
| # registers in inline assembly, so the check for supporting the -march flag is | |||
| # not enough. | |||
| ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null | |||
| ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) | |||
| ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) | |||
| ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) | |||
| DYNAMIC_CORE += Z13 | |||
| CCOMMON_OPT += -DDYN_Z13 | |||
| else | |||
| $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) | |||
| $(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| # as above for z13, check for -march=arch12 and z14 support in the compiler. | |||
| ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) | |||
| ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) | |||
| ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) | |||
| DYNAMIC_CORE += Z14 | |||
| CCOMMON_OPT += -DDYN_Z14 | |||
| else | |||
| $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) | |||
| endif | |||
| $(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) | |||
| endif | |||
| endif # ARCH zarch | |||
| ifeq ($(ARCH), power) | |||
| DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| @@ -1,18 +1,6 @@ | |||
| #include "common.h" | |||
| #include <stdbool.h> | |||
| // Gate kernels for z13 and z14 on gcc version | |||
| #if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ | |||
| /* RHEL 7 since 7.3: */ \ | |||
| (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ | |||
| __GNUC_RH_RELEASE__ >= 11) | |||
| #define HAVE_Z13_SUPPORT | |||
| #endif | |||
| #if __GNUC__ >= 7 | |||
| #define HAVE_Z14_SUPPORT | |||
| #endif | |||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||
| #ifdef __GLIBC__ | |||
| #include <features.h> | |||
| @@ -47,10 +35,10 @@ static unsigned long get_hwcap(void) { | |||
| #endif // __GLIBC__ | |||
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| #ifdef DYN_Z13 | |||
| extern gotoblas_t gotoblas_Z13; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| #ifdef DYN_Z14 | |||
| extern gotoblas_t gotoblas_Z14; | |||
| #endif | |||
| @@ -66,10 +54,10 @@ static char* corename[] = { | |||
| }; | |||
| char* gotoblas_corename(void) { | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| #ifdef DYN_Z13 | |||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| #ifdef DYN_Z14 | |||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||
| @@ -77,6 +65,10 @@ char* gotoblas_corename(void) { | |||
| return corename[0]; | |||
| } | |||
| #ifndef HWCAP_S390_VXE | |||
| #define HWCAP_S390_VXE 8192 | |||
| #endif | |||
| /** | |||
| * Detect the fitting set of kernels by retrieving the CPU features supported by | |||
| * the OS from the auxiliary value AT_HWCAP and choosing the set of kernels | |||
| @@ -89,15 +81,15 @@ static gotoblas_t* get_coretype(void) { | |||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||
| #ifdef DYN_Z14 | |||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
| return &gotoblas_Z14; | |||
| #endif | |||
| #ifdef DYN_Z13 | |||
| // z13: Vector Facility (SIMD for double) | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| if (hwcap & HWCAP_S390_VX) | |||
| return &gotoblas_Z13; | |||
| #endif | |||
| @@ -123,19 +115,27 @@ static gotoblas_t* force_coretype(char* coretype) { | |||
| } | |||
| } | |||
| switch (found) | |||
| { | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| case 1: return (&gotoblas_Z13); | |||
| if (found == 1) { | |||
| #ifdef DYN_Z13 | |||
| return &gotoblas_Z13; | |||
| #else | |||
| openblas_warning(1, "Z13 support not compiled in"); | |||
| return NULL; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| case 2: return (&gotoblas_Z14); | |||
| } else if (found == 2) { | |||
| #ifdef DYN_Z14 | |||
| return &gotoblas_Z14; | |||
| #else | |||
| openblas_warning(1, "Z14 support not compiled in"); | |||
| return NULL; | |||
| #endif | |||
| case 3: return (&gotoblas_ZARCH_GENERIC); | |||
| default: return NULL; | |||
| } else if (found == 3) { | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| @@ -1014,8 +1014,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the 1-stage. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL CLACPY( 'U', N, N, A, LDA, V, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -1048,8 +1048,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the 1-stage. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL CLACPY( 'L', N, N, A, LDA, V, LDU ) | |||
| CALL CHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | |||
| $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | |||
| @@ -670,8 +670,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the DSBTRD. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -743,8 +743,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the DSBTRD. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -999,8 +999,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the 1-stage. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL DLACPY( "U", N, N, A, LDA, V, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -1032,8 +1032,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the 1-stage. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL DLACPY( "L", N, N, A, LDA, V, LDU ) | |||
| CALL DSYTRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | |||
| $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | |||
| @@ -680,8 +680,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the DSBTRD. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -753,8 +753,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the DSBTRD. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -1014,8 +1014,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the 1-stage. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL ZLACPY( 'U', N, N, A, LDA, V, LDU ) | |||
| LH = MAX(1, 4*N) | |||
| LW = LWORK - LH | |||
| @@ -1048,8 +1048,8 @@ | |||
| * the one from above. Compare it with D1 computed | |||
| * using the 1-stage. | |||
| * | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||
| CALL ZLACPY( 'L', N, N, A, LDA, V, LDU ) | |||
| CALL ZHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | |||
| $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | |||