| @@ -44,6 +44,11 @@ jobs: | |||
| if: github.event_name != 'pull_request' | |||
| run: brew update || true | |||
| - name: unlink installed gcc to allow updating | |||
| run: | | |||
| brew unlink gcc@8 | |||
| brew unlink gcc@9 | |||
| - name: Install prerequisites | |||
| run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | |||
| @@ -1,4 +1,4 @@ | |||
| ifneq ($(C_COMPILER), PGI) | |||
| ifeq ($(CORE), ARMV8) | |||
| CCOMMON_OPT += -march=armv8-a | |||
| FCOMMON_OPT += -march=armv8-a | |||
| @@ -77,4 +77,4 @@ CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -1279,6 +1279,10 @@ CCOMMON_OPT += -DUSE_PAPI | |||
| EXTRALIB += -lpapi -lperfctr | |||
| endif | |||
| ifdef BUFFERSIZE | |||
| CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) | |||
| endif | |||
| ifdef DYNAMIC_THREADS | |||
| CCOMMON_OPT += -DDYNAMIC_THREADS | |||
| endif | |||
| @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, | |||
| void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||
| void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||
| void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||
| void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||
| void cblas_srotg(float *a, float *b, float *c, float *s); | |||
| void cblas_drotg(double *a, double *b, double *c, double *s); | |||
| void cblas_crotg(void *a, void *b, float *c, void *s); | |||
| void cblas_zrotg(void *a, void *b, double *c, void *s); | |||
| void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | |||
| void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | |||
| @@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
| # The parsed Makefiles test $(C_COMPILER) against a compiler name (e.g. PGI). | |||
| # Redirect the lookup to CMake's compiler identifier; CMAKE_C_COMPILER holds the | |||
| # compiler's path and would never compare equal to such a name. | |||
| # Both operands are quoted so if() does not re-dereference them as variable names. | |||
| if ("${CMAKE_MATCH_1}" STREQUAL "C_COMPILER") | |||
| set (CMAKE_MATCH_1 CMAKE_C_COMPILER_ID) | |||
| endif () | |||
| if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | |||
| # message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define INLINE inline | |||
| #ifdef F_INTERFACE_FLANG | |||
| #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) | |||
| #define RETURN_BY_STACK | |||
| #else | |||
| #define RETURN_BY_COMPLEX | |||
| @@ -1436,6 +1436,15 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: // Rocket Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| @@ -2014,6 +2023,19 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 7:// Rocket Lake | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| #endif | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| #endif | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 5: | |||
| switch (model) { | |||
| @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| case 10: | |||
| if (model == 5 || model == 6) { | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| @@ -666,7 +666,20 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } | |||
| if (model == 7) { | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| } | |||
| case 0xf: | |||
| @@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #endif | |||
| /* Read the system register named by `id` into `var` with a single `mrs`. | |||
| * The asm is volatile so the compiler cannot drop the read or merge it with | |||
| * another one; the earlier non-volatile duplicate read has been removed. */ | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__ __volatile__("mrs %0, "#id : "=r" (var)); \ | |||
| }) | |||
| static char *corename[] = { | |||
| @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | |||
| @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||
| CZBLAS2OBJS = \ | |||
| @@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c | |||
| cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| # CBLAS builds of the complex rotation generators. Both the regular and the | |||
| # profiled object carry the cblas_ prefix, matching the neighbouring CBLAS rules, | |||
| # so the profiled targets do not collide with the non-CBLAS crotg/zrotg objects. | |||
| cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c | |||
| cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) | |||
| cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| @@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| endif () | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) | |||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
| if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) | |||
| if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| dot[0]=0.0; | |||
| dot[1]=0.0; | |||
| #if !defined(__PPC__) && !defined(__SunOS) | |||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||
| CREAL(result) = 0.0 ; | |||
| CIMAG(result) = 0.0 ; | |||
| #else | |||
| @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| i++ ; | |||
| } | |||
| #if !defined(__PPC__) && !defined(__SunOS) | |||
| #if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| #else | |||
| @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| SDOTKERNEL = ../generic/dot.c | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| DSDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S | |||
| SDOTKERNEL = dot_thunderx.c | |||
| DDOTKERNEL = ddot_thunderx.c | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S | |||
| SDOTKERNEL = dot.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| @@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c | |||
| SDOTKERNEL = sdot_power10.c | |||
| DDOTKERNEL = ddot_power10.c | |||
| DSDOTKERNEL = sdot_power10.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CDOTKERNEL = cdot_power9.S | |||
| else | |||
| CDOTKERNEL = cdot.c | |||
| endif | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #include "common.h" | |||
| #if defined(POWER10) | |||
| #include "cdot_microk_power10.c" | |||
| #else | |||
| #ifndef HAVE_KERNEL_8 | |||
| #include <altivec.h> | |||
| @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||
| } | |||
| #endif | |||
| #endif | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||
| @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| #if defined(POWER10) | |||
| BLASLONG n1 = n & -16; | |||
| #else | |||
| BLASLONG n1 = n & -8; | |||
| #endif | |||
| BLASLONG j=0; | |||
| if (n1){ | |||
| @@ -0,0 +1,177 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
| * cdot_kernel_8: inline-assembly kernel that multiplies single-precision data | |||
| * from x and y and accumulates the products in vector registers vs32..vs39, | |||
| * then reduces them and stores one vector register at dot. | |||
| * | |||
| * n   - element counter; decremented by 16 per 128-byte step of x and y. | |||
| * x,y - input pointers; both are advanced by the assembly as it runs. | |||
| * dot - destination of the final vector store. | |||
| * | |||
| * The prologue loads 128 bytes from each of x and y before n is tested, so at | |||
| * least that much data must be readable. Presumably callers pass a positive | |||
| * multiple of 16 for n - TODO confirm against the call site. | |||
| * | |||
| * NOTE(review): the memory operands name only *dot, *x and *y (one float each) | |||
| * and there is no "memory" clobber, while the assembly reads 128-byte chunks | |||
| * and stores a whole vector register at dot. Confirm this is sufficient for | |||
| * the compilers in use. | |||
| */ | |||
| static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
| { | |||
| /* Byte-permute control passed to xxperm as operand %x7; per the inline | |||
| * comments below, the permuted copy of y pairs x_r with y_i and x_i with y_r. */ | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| "dcbt 0, %3 \n\t" | |||
| /* Clear the eight accumulators vs32..vs39. */ | |||
| "xxlxor 32, 32, 32 \n\t" | |||
| "xxlxor 33, 33, 33 \n\t" | |||
| "xxlxor 34, 34, 34 \n\t" | |||
| "xxlxor 35, 35, 35 \n\t" | |||
| "xxlxor 36, 36, 36 \n\t" | |||
| "xxlxor 37, 37, 37 \n\t" | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| /* Preload the first 128 bytes of x into vs40..vs47 and of y into vs48..vs55. */ | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "lxvp 54, 96(%3) \n\t" | |||
| /* Build the permuted copies of the y vectors in vs56..vs63. */ | |||
| "xxperm 56, 48, %x7 \n\t" | |||
| "xxperm 57, 49, %x7 \n\t" | |||
| "xxperm 58, 50, %x7 \n\t" | |||
| "xxperm 59, 51, %x7 \n\t" | |||
| "xxperm 60, 52, %x7 \n\t" | |||
| "xxperm 61, 53, %x7 \n\t" | |||
| "xxperm 62, 54, %x7 \n\t" | |||
| "xxperm 63, 55, %x7 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| /* Account for the preloaded chunk; if nothing remains, skip the loop and | |||
| * go straight to the tail at label two. */ | |||
| "addic. %1, %1, -16 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| /* Main loop: accumulate the chunk already in registers while loading the | |||
| * next 128 bytes of x and y and re-permuting the new y vectors. */ | |||
| "one%=: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "lxvp 48, 0(%3) \n\t" | |||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "lxvp 50, 32(%3) \n\t" | |||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "lxvp 42, 32(%2) \n\t" | |||
| "xxperm 56, 48, %x7 \n\t" | |||
| "xxperm 57, 49, %x7 \n\t" | |||
| "xxperm 58, 50, %x7 \n\t" | |||
| "xxperm 59, 51, %x7 \n\t" | |||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "lxvp 52, 64(%3) \n\t" | |||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "lxvp 54, 96(%3) \n\t" | |||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "lxvp 44, 64(%2) \n\t" | |||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "lxvp 46, 96(%2) \n\t" | |||
| "xxperm 60, 52, %x7 \n\t" | |||
| "xxperm 61, 53, %x7 \n\t" | |||
| "xxperm 62, 54, %x7 \n\t" | |||
| "xxperm 63, 55, %x7 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %1, %1, -16 \n\t" | |||
| "bgt one%= \n" | |||
| /* Tail: accumulate the last chunk that is already loaded; no further loads. */ | |||
| "two%=: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| /* Reduction: sum the even accumulators (32,34,36,38) into vs35 and the odd | |||
| * ones (33,35,37,39) into vs34, fold each with its doubleword-swapped copy, | |||
| * merge the two results into vs34 and store that register at dot. */ | |||
| "xvaddsp 32, 32, 34 \n\t" | |||
| "xvaddsp 36, 36, 38 \n\t" | |||
| "xvaddsp 33, 33, 35 \n\t" | |||
| "xvaddsp 37, 37, 39 \n\t" | |||
| "xvaddsp 35, 32, 36 \n\t" | |||
| "xvaddsp 34, 33, 37 \n\t" | |||
| "xxswapd 32, 35 \n\t" | |||
| "xxswapd 33, 34 \n\t" | |||
| "xvaddsp 35, 35, 32 \n\t" | |||
| "xvaddsp 34, 34, 33 \n\t" | |||
| "xxpermdi 34, 34, 35, 2 \n\t" | |||
| "stxv 34, 0(%6) \n\t" | |||
| "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" | |||
| /* Outputs: operand 0 is the memory at dot; n, x and y are read-write. */ | |||
| : | |||
| "=m" (*dot), | |||
| "+r" (n), // 1 | |||
| "+b" (x), // 2 | |||
| "+b" (y) // 3 | |||
| /* Inputs: operands 4 and 5 are the memory at x and y, 6 is the dot pointer, | |||
| * 7 is the permute mask. */ | |||
| : | |||
| "m" (*x), | |||
| "m" (*y), | |||
| "b" (dot), // 6 | |||
| "wa" (mask) | |||
| /* Clobbers: condition register field cr0 (written by addic.) and vs32..vs63. */ | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", | |||
| "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" | |||
| ); | |||
| } | |||
| @@ -2399,6 +2399,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define SWITCH_RATIO 16 | |||
| #define GEMM_PREFERED_SIZE 16 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| @@ -2435,6 +2438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 65536 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define SWITCH_RATIO 16 | |||
| #define GEMM_PREFERED_SIZE 16 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||