| @@ -43,6 +43,18 @@ matrix: | |||||
| - TARGET_BOX=IBMZ_LINUX | - TARGET_BOX=IBMZ_LINUX | ||||
| - BTYPE="BINARY=64 USE_OPENMP=1" | - BTYPE="BINARY=64 USE_OPENMP=1" | ||||
| - <<: *test-ubuntu | |||||
| os: linux | |||||
| dist: focal | |||||
| arch: s390x | |||||
| compiler: clang | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" | |||||
| env: | |||||
| # for matrix annotation only | |||||
| - TARGET_BOX=IBMZ_LINUX | |||||
| - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" | |||||
| - <<: *test-ubuntu | - <<: *test-ubuntu | ||||
| env: | env: | ||||
| - TARGET_BOX=LINUX64 | - TARGET_BOX=LINUX64 | ||||
| @@ -187,6 +187,7 @@ In chronological order: | |||||
| * Marius Hillenbrand <https://github.com/mhillenibm> | * Marius Hillenbrand <https://github.com/mhillenibm> | ||||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | * [2020-05-12] Revise dynamic architecture detection for IBM z | ||||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | ||||
| * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support | |||||
| * Danfeng Zhang <https://github.com/craft-zhang> | * Danfeng Zhang <https://github.com/craft-zhang> | ||||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | ||||
| @@ -295,7 +295,6 @@ endif | |||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | ||||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | ||||
| GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) | |||||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | ||||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | ||||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | ||||
| @@ -594,34 +593,36 @@ endif | |||||
| ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
| DYNAMIC_CORE = ZARCH_GENERIC | DYNAMIC_CORE = ZARCH_GENERIC | ||||
| # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer | |||||
| ifeq ($(GCCVERSIONGT5), 1) | |||||
| ZARCH_SUPPORT_Z13 := 1 | |||||
| else ifeq ($(GCCVERSIONEQ5), 1) | |||||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||||
| ZARCH_SUPPORT_Z13 := 1 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) | |||||
| ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) | |||||
| ZARCH_SUPPORT_Z13 := 1 | |||||
| endif | |||||
| endif | |||||
| ifeq ($(ZARCH_SUPPORT_Z13), 1) | |||||
| # if the compiler accepts -march=arch11 or -march=z13 and can compile a file | |||||
| # with z13-specific inline assembly, then we can include support for Z13. | |||||
| # note: -march=z13 is equivalent to -march=arch11 yet some compiler releases | |||||
| # only support one or the other. | |||||
| # note: LLVM version 6.x supported -march=z13 yet could not handle vector | |||||
| # registers in inline assembly, so the check for supporting the -march flag is | |||||
| # not enough. | |||||
| ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null | |||||
| ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) | |||||
| ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) | |||||
| ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) | |||||
| DYNAMIC_CORE += Z13 | DYNAMIC_CORE += Z13 | ||||
| CCOMMON_OPT += -DDYN_Z13 | |||||
| else | else | ||||
| $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) | |||||
| $(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) | |||||
| endif | endif | ||||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||||
| # as above for z13, check for -march=arch12 and z14 support in the compiler. | |||||
| ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) | |||||
| ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) | |||||
| ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) | |||||
| DYNAMIC_CORE += Z14 | DYNAMIC_CORE += Z14 | ||||
| CCOMMON_OPT += -DDYN_Z14 | |||||
| else | else | ||||
| $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) | |||||
| endif | |||||
| $(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) | |||||
| endif | endif | ||||
| endif # ARCH zarch | |||||
| ifeq ($(ARCH), power) | ifeq ($(ARCH), power) | ||||
| DYNAMIC_CORE = POWER6 | DYNAMIC_CORE = POWER6 | ||||
| DYNAMIC_CORE += POWER8 | DYNAMIC_CORE += POWER8 | ||||
| @@ -1,18 +1,6 @@ | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <stdbool.h> | #include <stdbool.h> | ||||
| // Gate kernels for z13 and z14 on gcc version | |||||
| #if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ | |||||
| /* RHEL 7 since 7.3: */ \ | |||||
| (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ | |||||
| __GNUC_RH_RELEASE__ >= 11) | |||||
| #define HAVE_Z13_SUPPORT | |||||
| #endif | |||||
| #if __GNUC__ >= 7 | |||||
| #define HAVE_Z14_SUPPORT | |||||
| #endif | |||||
| // Guard the use of getauxval() on glibc version >= 2.16 | // Guard the use of getauxval() on glibc version >= 2.16 | ||||
| #ifdef __GLIBC__ | #ifdef __GLIBC__ | ||||
| #include <features.h> | #include <features.h> | ||||
| @@ -47,10 +35,10 @@ static unsigned long get_hwcap(void) { | |||||
| #endif // __GLIBC | #endif // __GLIBC | ||||
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | extern gotoblas_t gotoblas_ZARCH_GENERIC; | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| #ifdef DYN_Z13 | |||||
| extern gotoblas_t gotoblas_Z13; | extern gotoblas_t gotoblas_Z13; | ||||
| #endif | #endif | ||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| #ifdef DYN_Z14 | |||||
| extern gotoblas_t gotoblas_Z14; | extern gotoblas_t gotoblas_Z14; | ||||
| #endif | #endif | ||||
| @@ -66,10 +54,10 @@ static char* corename[] = { | |||||
| }; | }; | ||||
| char* gotoblas_corename(void) { | char* gotoblas_corename(void) { | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| #ifdef DYN_Z13 | |||||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | if (gotoblas == &gotoblas_Z13) return corename[1]; | ||||
| #endif | #endif | ||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| #ifdef DYN_Z14 | |||||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | if (gotoblas == &gotoblas_Z14) return corename[2]; | ||||
| #endif | #endif | ||||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | ||||
| @@ -77,6 +65,10 @@ char* gotoblas_corename(void) { | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| #ifndef HWCAP_S390_VXE | |||||
| #define HWCAP_S390_VXE 8192 | |||||
| #endif | |||||
| /** | /** | ||||
| * Detect the fitting set of kernels by retrieving the CPU features supported by | * Detect the fitting set of kernels by retrieving the CPU features supported by | ||||
| * OS from the auxiliary value AT_HWCAP and choosing the set of kernels | * OS from the auxiliary value AT_HWCAP and choosing the set of kernels | ||||
| @@ -89,15 +81,15 @@ static gotoblas_t* get_coretype(void) { | |||||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | unsigned long hwcap __attribute__((unused)) = get_hwcap(); | ||||
| #ifdef DYN_Z14 | |||||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | // z14 and z15 systems: exploit Vector Facility (SIMD) and | ||||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | ||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | ||||
| return &gotoblas_Z14; | return &gotoblas_Z14; | ||||
| #endif | #endif | ||||
| #ifdef DYN_Z13 | |||||
| // z13: Vector Facility (SIMD for double) | // z13: Vector Facility (SIMD for double) | ||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| if (hwcap & HWCAP_S390_VX) | if (hwcap & HWCAP_S390_VX) | ||||
| return &gotoblas_Z13; | return &gotoblas_Z13; | ||||
| #endif | #endif | ||||
| @@ -123,19 +115,27 @@ static gotoblas_t* force_coretype(char* coretype) { | |||||
| } | } | ||||
| } | } | ||||
| switch (found) | |||||
| { | |||||
| #ifdef HAVE_Z13_SUPPORT | |||||
| case 1: return (&gotoblas_Z13); | |||||
| if (found == 1) { | |||||
| #ifdef DYN_Z13 | |||||
| return &gotoblas_Z13; | |||||
| #else | |||||
| openblas_warning(1, "Z13 support not compiled in"); | |||||
| return NULL; | |||||
| #endif | #endif | ||||
| #ifdef HAVE_Z14_SUPPORT | |||||
| case 2: return (&gotoblas_Z14); | |||||
| } else if (found == 2) { | |||||
| #ifdef DYN_Z14 | |||||
| return &gotoblas_Z14; | |||||
| #else | |||||
| openblas_warning(1, "Z14 support not compiled in"); | |||||
| return NULL; | |||||
| #endif | #endif | ||||
| case 3: return (&gotoblas_ZARCH_GENERIC); | |||||
| default: return NULL; | |||||
| } else if (found == 3) { | |||||
| return &gotoblas_ZARCH_GENERIC; | |||||
| } | } | ||||
| snprintf(message, 128, "Core not found: %s\n", coretype); | snprintf(message, 128, "Core not found: %s\n", coretype); | ||||
| openblas_warning(1, message); | openblas_warning(1, message); | ||||
| return NULL; | |||||
| } | } | ||||
| void gotoblas_dynamic_init(void) { | void gotoblas_dynamic_init(void) { | ||||
| @@ -1014,8 +1014,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the 1-stage. | * using the 1-stage. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL CLACPY( 'U', N, N, A, LDA, V, LDU ) | CALL CLACPY( 'U', N, N, A, LDA, V, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -1048,8 +1048,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the 1-stage. | * using the 1-stage. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL CLACPY( 'L', N, N, A, LDA, V, LDU ) | CALL CLACPY( 'L', N, N, A, LDA, V, LDU ) | ||||
| CALL CHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | CALL CHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | ||||
| $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | ||||
| @@ -670,8 +670,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the DSBTRD. | * using the DSBTRD. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) | CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -743,8 +743,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the DSBTRD. | * using the DSBTRD. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) | CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -999,8 +999,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the 1-stage. | * using the 1-stage. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL DLACPY( "U", N, N, A, LDA, V, LDU ) | CALL DLACPY( "U", N, N, A, LDA, V, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -1032,8 +1032,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the 1-stage. | * using the 1-stage. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL DLACPY( "L", N, N, A, LDA, V, LDU ) | CALL DLACPY( "L", N, N, A, LDA, V, LDU ) | ||||
| CALL DSYTRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | CALL DSYTRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | ||||
| $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | ||||
| @@ -680,8 +680,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the DSBTRD. | * using the DSBTRD. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) | CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -753,8 +753,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the DSBTRD. | * using the DSBTRD. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) | CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -1014,8 +1014,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the 1-stage. | * using the 1-stage. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL ZLACPY( 'U', N, N, A, LDA, V, LDU ) | CALL ZLACPY( 'U', N, N, A, LDA, V, LDU ) | ||||
| LH = MAX(1, 4*N) | LH = MAX(1, 4*N) | ||||
| LW = LWORK - LH | LW = LWORK - LH | ||||
| @@ -1048,8 +1048,8 @@ | |||||
| * the one from above. Compare it with D1 computed | * the one from above. Compare it with D1 computed | ||||
| * using the 1-stage. | * using the 1-stage. | ||||
| * | * | ||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) | |||||
| CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) | |||||
| CALL ZLACPY( 'L', N, N, A, LDA, V, LDU ) | CALL ZLACPY( 'L', N, N, A, LDA, V, LDU ) | ||||
| CALL ZHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | CALL ZHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, | ||||
| $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | $ WORK, LH, WORK( LH+1 ), LW, IINFO ) | ||||