| @@ -44,6 +44,11 @@ jobs: | |||||
| if: github.event_name != 'pull_request' | if: github.event_name != 'pull_request' | ||||
| run: brew update || true | run: brew update || true | ||||
- name: unlink installed gcc to allow updating
  # Best effort: `brew unlink` exits non-zero when a formula is not installed,
  # and a runner image without gcc@8 / gcc@9 must not fail the whole job.
  run: |
    brew unlink gcc@8 || true
    brew unlink gcc@9 || true
| - name: Install prerequisites | - name: Install prerequisites | ||||
| run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | ||||
| @@ -1,4 +1,4 @@ | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| ifeq ($(CORE), ARMV8) | ifeq ($(CORE), ARMV8) | ||||
| CCOMMON_OPT += -march=armv8-a | CCOMMON_OPT += -march=armv8-a | ||||
| FCOMMON_OPT += -march=armv8-a | FCOMMON_OPT += -march=armv8-a | ||||
| @@ -77,4 +77,4 @@ CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| @@ -1279,6 +1279,10 @@ CCOMMON_OPT += -DUSE_PAPI | |||||
| EXTRALIB += -lpapi -lperfctr | EXTRALIB += -lpapi -lperfctr | ||||
| endif | endif | ||||
| ifdef BUFFERSIZE | |||||
| CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) | |||||
| endif | |||||
| ifdef DYNAMIC_THREADS | ifdef DYNAMIC_THREADS | ||||
| CCOMMON_OPT += -DDYNAMIC_THREADS | CCOMMON_OPT += -DDYNAMIC_THREADS | ||||
| endif | endif | ||||
| @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, | |||||
| void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | ||||
| void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | ||||
| void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||||
| void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||||
| void cblas_srotg(float *a, float *b, float *c, float *s); | void cblas_srotg(float *a, float *b, float *c, float *s); | ||||
| void cblas_drotg(double *a, double *b, double *c, double *s); | void cblas_drotg(double *a, double *b, double *c, double *s); | ||||
| void cblas_crotg(void *a, void *b, float *c, void *s); | |||||
| void cblas_zrotg(void *a, void *b, double *c, void *s); | |||||
| void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | ||||
| void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | ||||
| @@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | ||||
| if (NOT "${line_match}" STREQUAL "") | if (NOT "${line_match}" STREQUAL "") | ||||
| # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | ||||
# The Makefile-side variable C_COMPILER is compared against a compiler family
# name (e.g. "PGI"). Redirect the lookup to CMake's compiler ID rather than
# CMAKE_C_COMPILER, which holds the path of the compiler executable and would
# therefore never compare equal to a family name.
# Both operands are quoted so an empty match cannot produce a malformed if().
if ("${CMAKE_MATCH_1}" STREQUAL "C_COMPILER")
  set (CMAKE_MATCH_1 CMAKE_C_COMPILER_ID)
endif ()
| if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | ||||
| # message (STATUS "condition is true") | # message (STATUS "condition is true") | ||||
| set (IfElse 1) | set (IfElse 1) | ||||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define INLINE inline | #define INLINE inline | ||||
| #ifdef F_INTERFACE_FLANG | |||||
| #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) | |||||
| #define RETURN_BY_STACK | #define RETURN_BY_STACK | ||||
| #else | #else | ||||
| #define RETURN_BY_COMPLEX | #define RETURN_BY_COMPLEX | ||||
| @@ -1436,6 +1436,15 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_SANDYBRIDGE; | return CPUTYPE_SANDYBRIDGE; | ||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 7: // Rocket Lake | |||||
| if(support_avx512()) | |||||
| return CPUTYPE_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return CPUTYPE_HASWELL; | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -2014,6 +2023,19 @@ int get_coretype(void){ | |||||
| #endif | #endif | ||||
| else | else | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| case 7:// Rocket Lake | |||||
| #ifndef NO_AVX512 | |||||
| if(support_avx512()) | |||||
| return CORE_SKYLAKEX; | |||||
| #endif | |||||
| #ifndef NO_AVX2 | |||||
| if(support_avx2()) | |||||
| return CORE_HASWELL; | |||||
| #endif | |||||
| if(support_avx()) | |||||
| return CORE_SANDYBRIDGE; | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| } | } | ||||
| case 5: | case 5: | ||||
| switch (model) { | switch (model) { | ||||
| @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| case 10: | case 10: | ||||
| if (model == 5 || model == 6) { | |||||
| if (model == 5 || model == 6) { | |||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
| if(support_avx()) { | if(support_avx()) { | ||||
| @@ -666,7 +666,20 @@ static gotoblas_t *get_coretype(void){ | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | ||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | |||||
| } | |||||
| if (model == 7) { | |||||
| if (support_avx512()) | |||||
| return &gotoblas_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return &gotoblas_HASWELL; | |||||
| if(support_avx()) { | |||||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| } else { | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| case 0xf: | case 0xf: | ||||
| @@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||||
| #endif | #endif | ||||
| #define get_cpu_ftr(id, var) ({ \ | #define get_cpu_ftr(id, var) ({ \ | ||||
| __asm__("mrs %0, "#id : "=r" (var)); \ | |||||
| __asm__ __volatile__("mrs %0, "#id : "=r" (var)); \ | |||||
| }) | }) | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ | |||||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | ||||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | ||||
| cblas_caxpby.$(SUFFIX) \ | cblas_caxpby.$(SUFFIX) \ | ||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||||
| CCBLAS2OBJS = \ | CCBLAS2OBJS = \ | ||||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | ||||
| @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ | |||||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | ||||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | ||||
| cblas_zaxpby.$(SUFFIX) \ | cblas_zaxpby.$(SUFFIX) \ | ||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||||
| CZBLAS2OBJS = \ | CZBLAS2OBJS = \ | ||||
| @@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c | |||||
| cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
# CBLAS wrappers for the complex Givens-rotation generators. Both the regular
# and the profiled object carry the cblas_ prefix, matching the sibling
# cblas_srotg / cblas_drotg rules; an unprefixed profile target would collide
# with the object name of the non-CBLAS interface built from the same source.
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)

cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
	$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
| cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| @@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c | |||||
| cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c | cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) | ||||
| cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| ifeq ($(BUILD_BFLOAT16),1) | ifeq ($(BUILD_BFLOAT16),1) | ||||
| cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c | cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c | ||||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | ||||
| @@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| endif () | endif () | ||||
| # Makefile.L3 | # Makefile.L3 | ||||
set(USE_TRMM false)
# Upper-case the core name once so the checks below are case-insensitive.
# Quoted so that an empty TARGET_CORE cannot break the string() call.
string(TOUPPER "${TARGET_CORE}" UC_TARGET_CORE)
# Architectures / cores that enable USE_TRMM (presumably mirroring
# Makefile.L3 -- verify when adding a core). MATCHES is a regex search, so
# each alternative matches anywhere in the core name, exactly as the separate
# MATCHES tests did. "LONGSOON3B" is the historical spelling used here; the
# correctly spelled LOONGSON3B is accepted as well so that core is not missed.
if (ARM OR ARM64 OR ZARCH OR
    (UC_TARGET_CORE MATCHES "LOONGSON3B|LONGSOON3B|GENERIC|HASWELL|ZEN|SKYLAKEX|COOPERLAKE|POWER8|POWER9|POWER10"))
  set(USE_TRMM true)
endif ()
| @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| dot[0]=0.0; | dot[0]=0.0; | ||||
| dot[1]=0.0; | dot[1]=0.0; | ||||
| #if !defined(__PPC__) && !defined(__SunOS) | |||||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||||
| CREAL(result) = 0.0 ; | CREAL(result) = 0.0 ; | ||||
| CIMAG(result) = 0.0 ; | CIMAG(result) = 0.0 ; | ||||
| #else | #else | ||||
| @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| i++ ; | i++ ; | ||||
| } | } | ||||
| #if !defined(__PPC__) && !defined(__SunOS) | |||||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||||
| CREAL(result) = dot[0]; | CREAL(result) = dot[0]; | ||||
| CIMAG(result) = dot[1]; | CIMAG(result) = dot[1]; | ||||
| #else | #else | ||||
| @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S | |||||
| ZNRM2KERNEL = znrm2.S | ZNRM2KERNEL = znrm2.S | ||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| SDOTKERNEL = ../generic/dot.c | SDOTKERNEL = ../generic/dot.c | ||||
| else | |||||
| SDOTKERNEL = dot.S | |||||
| endif | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| DGEMM_BETA = dgemm_beta.S | DGEMM_BETA = dgemm_beta.S | ||||
| @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S | |||||
| CNRM2KERNEL = znrm2.S | CNRM2KERNEL = znrm2.S | ||||
| ZNRM2KERNEL = znrm2.S | ZNRM2KERNEL = znrm2.S | ||||
| DDOTKERNEL = dot.S | |||||
| SDOTKERNEL = ../generic/dot.c | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| DSDOTKERNEL = dot.S | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| SDOTKERNEL = ../generic/dot.c | |||||
| else | |||||
| SDOTKERNEL = dot.S | |||||
| endif | |||||
| DDOTKERNEL = dot.S | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | |||||
| DGEMM_BETA = dgemm_beta.S | DGEMM_BETA = dgemm_beta.S | ||||
| SGEMM_BETA = sgemm_beta.S | SGEMM_BETA = sgemm_beta.S | ||||
| @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S | |||||
| CCOPYKERNEL = copy.S | CCOPYKERNEL = copy.S | ||||
| ZCOPYKERNEL = copy.S | ZCOPYKERNEL = copy.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| SDOTKERNEL = ../generic/dot.c | SDOTKERNEL = ../generic/dot.c | ||||
| else | |||||
| SDOTKERNEL = dot.S | |||||
| endif | |||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| SNRM2KERNEL = nrm2.S | SNRM2KERNEL = nrm2.S | ||||
| @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S | |||||
| SDOTKERNEL = dot_thunderx.c | SDOTKERNEL = dot_thunderx.c | ||||
| DDOTKERNEL = ddot_thunderx.c | DDOTKERNEL = ddot_thunderx.c | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| SNRM2KERNEL = nrm2.S | SNRM2KERNEL = nrm2.S | ||||
| @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S | |||||
| SDOTKERNEL = dot.S | SDOTKERNEL = dot.S | ||||
| DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
| ifneq ($(C_COMPILER), PGI) | |||||
| CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
| ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
| else | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| endif | |||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| SNRM2KERNEL = nrm2.S | SNRM2KERNEL = nrm2.S | ||||
| @@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c | |||||
| SDOTKERNEL = sdot_power10.c | SDOTKERNEL = sdot_power10.c | ||||
| DDOTKERNEL = ddot_power10.c | DDOTKERNEL = ddot_power10.c | ||||
| DSDOTKERNEL = sdot_power10.c | DSDOTKERNEL = sdot_power10.c | ||||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||||
| CDOTKERNEL = cdot_power9.S | |||||
| else | |||||
| CDOTKERNEL = cdot.c | CDOTKERNEL = cdot.c | ||||
| endif | |||||
| ZDOTKERNEL = zdot.c | ZDOTKERNEL = zdot.c | ||||
| # | # | ||||
| SNRM2KERNEL = ../arm/nrm2.c | SNRM2KERNEL = ../arm/nrm2.c | ||||
| @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER10) | |||||
| #include "cdot_microk_power10.c" | |||||
| #else | |||||
| #ifndef HAVE_KERNEL_8 | #ifndef HAVE_KERNEL_8 | ||||
| #include <altivec.h> | #include <altivec.h> | ||||
| @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||||
| } | } | ||||
| #endif | #endif | ||||
| #endif | |||||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | ||||
| @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||||
| if ((inc_x == 1) && (inc_y == 1)) { | if ((inc_x == 1) && (inc_y == 1)) { | ||||
| #if defined(POWER10) | |||||
| BLASLONG n1 = n & -16; | |||||
| #else | |||||
| BLASLONG n1 = n & -8; | BLASLONG n1 = n & -8; | ||||
| #endif | |||||
| BLASLONG j=0; | BLASLONG j=0; | ||||
| if (n1){ | if (n1){ | ||||
| @@ -0,0 +1,177 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||||
| { | |||||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||||
| __asm__ | |||||
| ( | |||||
| "dcbt 0, %2 \n\t" | |||||
| "dcbt 0, %3 \n\t" | |||||
| "xxlxor 32, 32, 32 \n\t" | |||||
| "xxlxor 33, 33, 33 \n\t" | |||||
| "xxlxor 34, 34, 34 \n\t" | |||||
| "xxlxor 35, 35, 35 \n\t" | |||||
| "xxlxor 36, 36, 36 \n\t" | |||||
| "xxlxor 37, 37, 37 \n\t" | |||||
| "xxlxor 38, 38, 38 \n\t" | |||||
| "xxlxor 39, 39, 39 \n\t" | |||||
| "lxvp 40, 0(%2) \n\t" | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "lxvp 48, 0(%3) \n\t" | |||||
| "lxvp 50, 32(%3) \n\t" | |||||
| "lxvp 52, 64(%3) \n\t" | |||||
| "lxvp 54, 96(%3) \n\t" | |||||
| "xxperm 56, 48, %x7 \n\t" | |||||
| "xxperm 57, 49, %x7 \n\t" | |||||
| "xxperm 58, 50, %x7 \n\t" | |||||
| "xxperm 59, 51, %x7 \n\t" | |||||
| "xxperm 60, 52, %x7 \n\t" | |||||
| "xxperm 61, 53, %x7 \n\t" | |||||
| "xxperm 62, 54, %x7 \n\t" | |||||
| "xxperm 63, 55, %x7 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "ble two%= \n\t" | |||||
| ".align 5 \n" | |||||
| "one%=: \n\t" | |||||
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "lxvp 48, 0(%3) \n\t" | |||||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "lxvp 50, 32(%3) \n\t" | |||||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "lxvp 40, 0(%2) \n\t" | |||||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "lxvp 42, 32(%2) \n\t" | |||||
| "xxperm 56, 48, %x7 \n\t" | |||||
| "xxperm 57, 49, %x7 \n\t" | |||||
| "xxperm 58, 50, %x7 \n\t" | |||||
| "xxperm 59, 51, %x7 \n\t" | |||||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "lxvp 52, 64(%3) \n\t" | |||||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "lxvp 54, 96(%3) \n\t" | |||||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "lxvp 44, 64(%2) \n\t" | |||||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "lxvp 46, 96(%2) \n\t" | |||||
| "xxperm 60, 52, %x7 \n\t" | |||||
| "xxperm 61, 53, %x7 \n\t" | |||||
| "xxperm 62, 54, %x7 \n\t" | |||||
| "xxperm 63, 55, %x7 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %1, %1, -16 \n\t" | |||||
| "bgt one%= \n" | |||||
| "two%=: \n\t" | |||||
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "xvaddsp 32, 32, 34 \n\t" | |||||
| "xvaddsp 36, 36, 38 \n\t" | |||||
| "xvaddsp 33, 33, 35 \n\t" | |||||
| "xvaddsp 37, 37, 39 \n\t" | |||||
| "xvaddsp 35, 32, 36 \n\t" | |||||
| "xvaddsp 34, 33, 37 \n\t" | |||||
| "xxswapd 32, 35 \n\t" | |||||
| "xxswapd 33, 34 \n\t" | |||||
| "xvaddsp 35, 35, 32 \n\t" | |||||
| "xvaddsp 34, 34, 33 \n\t" | |||||
| "xxpermdi 34, 34, 35, 2 \n\t" | |||||
| "stxv 34, 0(%6) \n\t" | |||||
| "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" | |||||
| : | |||||
| "=m" (*dot), | |||||
| "+r" (n), // 1 | |||||
| "+b" (x), // 2 | |||||
| "+b" (y) // 3 | |||||
| : | |||||
| "m" (*x), | |||||
| "m" (*y), | |||||
| "b" (dot), // 6 | |||||
| "wa" (mask) | |||||
| : | |||||
| "cr0", | |||||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||||
| ); | |||||
| } | |||||
| @@ -2399,6 +2399,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | #define GEMM_DEFAULT_OFFSET_B 65536 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | #define GEMM_DEFAULT_ALIGN 0x0ffffUL | ||||
| #define SWITCH_RATIO 16 | |||||
| #define GEMM_PREFERED_SIZE 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 16 | #define DGEMM_DEFAULT_UNROLL_M 16 | ||||
| @@ -2435,6 +2438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 65536 | #define GEMM_DEFAULT_OFFSET_B 65536 | ||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | #define GEMM_DEFAULT_ALIGN 0x0ffffUL | ||||
| #define SWITCH_RATIO 16 | |||||
| #define GEMM_PREFERED_SIZE 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | #define SGEMM_DEFAULT_UNROLL_M 16 | ||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | #define SGEMM_DEFAULT_UNROLL_N 8 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | #define DGEMM_DEFAULT_UNROLL_M 8 | ||||