| @@ -9,6 +9,16 @@ else | |||
| USE_OPENMP = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| @@ -604,6 +604,7 @@ DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| DYNAMIC_CORE += POWER9 | |||
| DYNAMIC_CORE += POWER10 | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| @@ -611,6 +612,12 @@ DYNAMIC_CORE += POWER9 | |||
| else | |||
| $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
| endif | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| DYNAMIC_CORE += POWER10 | |||
| else | |||
| $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -49,6 +49,7 @@ POWER6 | |||
| POWER7 | |||
| POWER8 | |||
| POWER9 | |||
| POWER10 | |||
| PPCG4 | |||
| PPC970 | |||
| PPC970MP | |||
| @@ -49,7 +49,7 @@ if (DYNAMIC_ARCH) | |||
| endif () | |||
| if (POWER) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | |||
| endif () | |||
| if (X86) | |||
| @@ -420,7 +420,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 8) | |||
| elseif ("${TCORE}" STREQUAL "POWER9") | |||
| elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| "#define L1_DATA_LINESIZE 128\n" | |||
| @@ -360,13 +360,8 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| #ifdef POWER8 | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| #ifdef POWER9 | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| @@ -68,7 +68,7 @@ | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define RMB __asm__ __volatile__ ("eieio":::"memory") | |||
| @@ -272,7 +272,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -294,7 +294,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define L1_DUALFETCH | |||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||
| #define L1_PREFETCH dcbtst | |||
| @@ -843,7 +843,7 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) || defined(POWER9) | |||
| #elif defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| @@ -57,6 +57,7 @@ | |||
| #define CPUTYPE_PPCG4 7 | |||
| #define CPUTYPE_POWER8 8 | |||
| #define CPUTYPE_POWER9 9 | |||
| #define CPUTYPE_POWER10 10 | |||
| char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -68,7 +69,8 @@ char *cpuname[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| char *lowercpuname[] = { | |||
| @@ -81,7 +83,8 @@ char *lowercpuname[] = { | |||
| "cell", | |||
| "ppcg4", | |||
| "power8", | |||
| "power9" | |||
| "power9", | |||
| "power10" | |||
| }; | |||
| char *corename[] = { | |||
| @@ -94,7 +97,8 @@ char *corename[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| int detect(void){ | |||
| @@ -125,6 +129,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -157,6 +162,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| return CPUTYPE_POWER5; | |||
| @@ -179,6 +185,9 @@ int detect(void){ | |||
| int id; | |||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||
| switch ( id >> 16 ) { | |||
| case 0x80: // POWER10 | |||
| return CPUTYPE_POWER10; | |||
| break; | |||
| case 0x4e: // POWER9 | |||
| return CPUTYPE_POWER9; | |||
| break; | |||
| @@ -6,6 +6,9 @@ extern gotoblas_t gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| extern gotoblas_t gotoblas_POWER10; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char *msg); | |||
| @@ -13,7 +16,8 @@ static char *corename[] = { | |||
| "unknown", | |||
| "POWER6", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| #define NUM_CORETYPES 4 | |||
| @@ -23,6 +27,9 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| if (gotoblas == &gotoblas_POWER10) return corename[4]; | |||
| #endif | |||
| return corename[0]; | |||
| } | |||
| @@ -36,6 +43,10 @@ static gotoblas_t *get_coretype(void) { | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (__builtin_cpu_is("power9")) | |||
| return &gotoblas_POWER9; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| if (__builtin_cpu_is("isa_3_1") && __builtin_cpu_supports ("mma")) | |||
| return &gotoblas_POWER10; | |||
| #endif | |||
| return NULL; | |||
| } | |||
| @@ -61,6 +72,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||
| case 2: return (&gotoblas_POWER8); | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| case 3: return (&gotoblas_POWER9); | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| case 4: return (&gotoblas_POWER10); | |||
| #endif | |||
| default: return NULL; | |||
| } | |||
| @@ -650,6 +650,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER9" | |||
| #endif | |||
| #if defined(FORCE_POWER10) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER10" | |||
| #define SUBDIRNAME "power" | |||
| #define ARCHCONFIG "-DPOWER10 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "power10" | |||
| #define CORENAME "POWER10" | |||
| #endif | |||
| #ifdef FORCE_PPCG4 | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| @@ -130,7 +130,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| @@ -51,6 +51,10 @@ ifeq ($(CORE), POWER9) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -0,0 +1,214 @@ | |||
| ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| include $(KERNELDIR)/KERNEL.POWER8 | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| STRMMKERNEL = sgemm_kernel_power9.S | |||
| DTRMMKERNEL = dgemm_kernel_power9.S | |||
| CTRMMKERNEL = cgemm_kernel_power9.S | |||
| ZTRMMKERNEL = zgemm_kernel_power9.S | |||
| SGEMMKERNEL = sgemm_kernel_power9.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_power9.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_power9.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_power9.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
| #Pure C for other kernels | |||
| #SAMAXKERNEL = ../arm/amax.c | |||
| #DAMAXKERNEL = ../arm/amax.c | |||
| #CAMAXKERNEL = ../arm/zamax.c | |||
| #ZAMAXKERNEL = ../arm/zamax.c | |||
| # | |||
| #SAMINKERNEL = ../arm/amin.c | |||
| #DAMINKERNEL = ../arm/amin.c | |||
| #CAMINKERNEL = ../arm/zamin.c | |||
| #ZAMINKERNEL = ../arm/zamin.c | |||
| # | |||
| #SMAXKERNEL = ../arm/max.c | |||
| #DMAXKERNEL = ../arm/max.c | |||
| # | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ISAMAXKERNEL = isamax_power9.S | |||
| else | |||
| ISAMAXKERNEL = isamax.c | |||
| endif | |||
| IDAMAXKERNEL = idamax.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ICAMAXKERNEL = icamax_power9.S | |||
| else | |||
| ICAMAXKERNEL = icamax.c | |||
| endif | |||
| IZAMAXKERNEL = izamax.c | |||
| # | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ISAMINKERNEL = isamin_power9.S | |||
| else | |||
| ISAMINKERNEL = isamin.c | |||
| endif | |||
| IDAMINKERNEL = idamin.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ICAMINKERNEL = icamin_power9.S | |||
| else | |||
| ICAMINKERNEL = icamin.c | |||
| endif | |||
| IZAMINKERNEL = izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| #IDMAXKERNEL = ../arm/imax.c | |||
| # | |||
| #ISMINKERNEL = ../arm/imin.c | |||
| #IDMINKERNEL = ../arm/imin.c | |||
| # | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CAXPYKERNEL = caxpy_power9.S | |||
| else | |||
| CAXPYKERNEL = caxpy.c | |||
| endif | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| DCOPYKERNEL = dcopy.c | |||
| CCOPYKERNEL = ccopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| # | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSDOTKERNEL = sdot.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CDOTKERNEL = cdot_power9.S | |||
| else | |||
| CDOTKERNEL = cdot.c | |||
| endif | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| CNRM2KERNEL = ../arm/znrm2.c | |||
| ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| CROTKERNEL = crot.c | |||
| ZROTKERNEL = zrot.c | |||
| # | |||
| SSCALKERNEL = sscal.c | |||
| DSCALKERNEL = dscal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| # | |||
| SSWAPKERNEL = sswap.c | |||
| DSWAPKERNEL = dswap.c | |||
| CSWAPKERNEL = cswap.c | |||
| ZSWAPKERNEL = zswap.c | |||
| # | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| CGEMVNKERNEL = cgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| # | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| CGEMVTKERNEL = cgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| endif | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "casum_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "ccopy_microk_power8.c" | |||
| #endif | |||
| @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "cswap_microk_power8.c" | |||
| #endif | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dasum_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "daxpy_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dcopy_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "ddot_microk_power8.c" | |||
| #endif | |||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dgemv_n_microk_power8.c" | |||
| #endif | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "drot_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dscal_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dswap_microk_power8.c" | |||
| #endif | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "sasum_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "scopy_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "sdot_microk_power8.c" | |||
| #endif | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "srot_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "sscal_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "sswap_microk_power8.c" | |||
| #endif | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "zasum_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "zaxpy_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "zcopy_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "zdot_microk_power8.c" | |||
| #endif | |||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "zswap_microk_power8.c" | |||
| #endif | |||
| @@ -14,7 +14,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c | |||
| DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_skylakex.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| @@ -0,0 +1,129 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <immintrin.h> | |||
| int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){ | |||
| double *src1, *src2, *src3, *src4, *dst1; | |||
| __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4; | |||
| BLASLONG dim1_count, dim2_count, src_inc; | |||
| src_inc = 4 * lead_dim - dim_first; | |||
| src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim; | |||
| for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){ | |||
| dst1 = dst + 16 * (dim_second - dim2_count); | |||
| for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ | |||
| z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; | |||
| z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; | |||
| z5 = _mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16; | |||
| z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16; | |||
| _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); | |||
| _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); | |||
| _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6); | |||
| _mm512_storeu_pd(dst1+48,z7); _mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second; | |||
| } | |||
| dst1 -= 8 * (dim_second - dim2_count); | |||
| if(dim1_count>7){ | |||
| z1 = _mm512_loadu_pd(src1); src1 += 8; | |||
| z2 = _mm512_loadu_pd(src2); src2 += 8; | |||
| z3 = _mm512_loadu_pd(src3); src3 += 8; | |||
| z4 = _mm512_loadu_pd(src4); src4 += 8; | |||
| _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); | |||
| _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second; | |||
| dim1_count -= 8; | |||
| } | |||
| dst1 -= 4 * (dim_second - dim2_count); | |||
| if(dim1_count>3){ | |||
| y1 = _mm256_loadu_pd(src1); src1 += 4; | |||
| y2 = _mm256_loadu_pd(src2); src2 += 4; | |||
| y3 = _mm256_loadu_pd(src3); src3 += 4; | |||
| y4 = _mm256_loadu_pd(src4); src4 += 4; | |||
| _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); | |||
| _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second; | |||
| dim1_count -= 4; | |||
| } | |||
| dst1 -= 2 * (dim_second - dim2_count); | |||
| if(dim1_count>1){ | |||
| x1 = _mm_loadu_pd(src1); src1 += 2; | |||
| x2 = _mm_loadu_pd(src2); src2 += 2; | |||
| x3 = _mm_loadu_pd(src3); src3 += 2; | |||
| x4 = _mm_loadu_pd(src4); src4 += 2; | |||
| _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); | |||
| _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second; | |||
| dim1_count -= 2; | |||
| } | |||
| dst1 -= dim_second - dim2_count; | |||
| if(dim1_count>0){ | |||
| s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++; | |||
| dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4; | |||
| } | |||
| src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc; | |||
| } | |||
| src_inc -= 2 * lead_dim; | |||
| for(; dim2_count>1; dim2_count-=2){ | |||
| dst1 = dst + 16 * (dim_second - dim2_count); | |||
| for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ | |||
| z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; | |||
| z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; | |||
| _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); | |||
| _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second; | |||
| } | |||
| dst1 -= 8 * (dim_second - dim2_count); | |||
| if(dim1_count>7){ | |||
| z1 = _mm512_loadu_pd(src1); src1 += 8; | |||
| z2 = _mm512_loadu_pd(src2); src2 += 8; | |||
| _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second; | |||
| dim1_count -= 8; | |||
| } | |||
| dst1 -= 4 * (dim_second - dim2_count); | |||
| if(dim1_count>3){ | |||
| y1 = _mm256_loadu_pd(src1); src1 += 4; | |||
| y2 = _mm256_loadu_pd(src2); src2 += 4; | |||
| _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second; | |||
| dim1_count -= 4; | |||
| } | |||
| dst1 -= 2 * (dim_second - dim2_count); | |||
| if(dim1_count>1){ | |||
| x1 = _mm_loadu_pd(src1); src1 += 2; | |||
| x2 = _mm_loadu_pd(src2); src2 += 2; | |||
| _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second; | |||
| dim1_count -= 2; | |||
| } | |||
| dst1 -= dim_second - dim2_count; | |||
| if(dim1_count>0){ | |||
| s1 = *src1; src1++; s2 = *src2; src2++; | |||
| dst1[0] = s1; dst1[1] = s2; | |||
| } | |||
| src1 += src_inc; src2 += src_inc; | |||
| } | |||
| src_inc -= lead_dim; | |||
| for(; dim2_count>0; dim2_count--){ | |||
| dst1 = dst + 16 * (dim_second - dim2_count); | |||
| for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ | |||
| z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; | |||
| _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second; | |||
| } | |||
| dst1 -= 8 * (dim_second - dim2_count); | |||
| if(dim1_count>7){ | |||
| z1 = _mm512_loadu_pd(src1); src1 += 8; | |||
| _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second; | |||
| dim1_count -= 8; | |||
| } | |||
| dst1 -= 4 * (dim_second - dim2_count); | |||
| if(dim1_count>3){ | |||
| y1 = _mm256_loadu_pd(src1); src1 += 4; | |||
| _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second; | |||
| dim1_count -= 4; | |||
| } | |||
| dst1 -= 2 * (dim_second - dim2_count); | |||
| if(dim1_count>1){ | |||
| x1 = _mm_loadu_pd(src1); src1 += 2; | |||
| _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second; | |||
| dim1_count -= 2; | |||
| } | |||
| dst1 -= dim_second - dim2_count; | |||
| if(dim1_count>0){ | |||
| s1 = *src1; src1++; | |||
| dst1[0] = s1; | |||
| } | |||
| src1 += src_inc; | |||
| } | |||
| } | |||
| @@ -2260,7 +2260,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER9) | |||
| #if defined(POWER9) || defined(POWER10) | |||
| #define SNUMOPT 16 | |||
| #define DNUMOPT 8 | |||