| @@ -219,6 +219,7 @@ In chronological order: | |||
| * Mark Seminatore <https://github.com/mseminatore> | |||
| * [2023-11-09] Improve Windows threading performance scaling | |||
| * [2024-02-09] Introduce MT_TRACE facility and improve code consistency | |||
| * Dirreke <https://github.com/Dirreke> | |||
| * [2024-01-16] Add basic support for the CSKY architecture | |||
| @@ -59,6 +59,22 @@ ifeq ($(TARGET), CK860FV) | |||
| TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float | |||
| endif | |||
| ifeq ($(TARGET), x280) | |||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL256B) | |||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL128B) | |||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_GENERIC) | |||
| TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | |||
| endif | |||
| all: getarch_2nd | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| @@ -2,3 +2,19 @@ ifeq ($(CORE), C910V) | |||
| CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
| FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
| endif | |||
| ifeq ($(CORE), x280) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math | |||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL256B) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL128B) | |||
| CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_GENERIC) | |||
| CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | |||
| endif | |||
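For reference, building for one of the new fixed-vector-length targets follows the same pattern as the x280 example in the README hunk below. A sketch, assuming a `riscv64-unknown-linux-gnu` GCC toolchain with RVV support (the toolchain prefix is taken from the x280 example; GCC support for the ZVL targets is an assumption):

```sh
# Hypothetical cross-build: HOSTCC compiles the native getarch helpers,
# while CC/FC target RISC-V with the TARGET_FLAGS defined above.
make HOSTCC=gcc TARGET=RISCV64_ZVL256B \
     CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
```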
| @@ -198,6 +198,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| ``` | |||
| (also known to work on C906, provided you use only single-precision functions; its double-precision instruction support appears to be incomplete) | |||
| - **x280**: Level-1, 2, and 3 BLAS are optimized using the RISC-V Vector extension 1.0. | |||
| ```sh | |||
| make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran | |||
| ``` | |||
| ### Support for multiple targets in a single library | |||
| OpenBLAS can be built for multiple targets with runtime detection of the target CPU by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line, or as `-DDYNAMIC_ARCH=TRUE` in cmake. A minimal sketch of both spellings follows. | |||
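```sh
# gmake: one library containing kernels for several CPUs,
# selected at load time by runtime detection
make DYNAMIC_ARCH=1

# cmake equivalent
cmake -DDYNAMIC_ARCH=TRUE ..
```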
| @@ -118,8 +118,11 @@ Z13 | |||
| Z14 | |||
| 10.RISC-V 64: | |||
| RISCV64_GENERIC | |||
| RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54) | |||
| RISCV64_ZVL128B | |||
| C910V | |||
| x280 | |||
| RISCV64_ZVL256B | |||
| 11.LOONGARCH64: | |||
| LOONGSONGENERIC | |||
| @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib | |||
| #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | |||
| LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | |||
| # x280 temporary workaround for gfortran | |||
| ifeq ($(TARGET), x280) | |||
| CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) | |||
| endif | |||
| ifneq ($(NO_LAPACK), 1) | |||
| GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ | |||
| @@ -265,9 +271,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||
| ismax.goto idmax.goto \ | |||
| isamin.goto idamin.goto icamin.goto izamin.goto \ | |||
| ismin.goto idmin.goto \ | |||
| samax.goto damax.goto scamax.goto dzamax.goto \ | |||
| samax.goto damax.goto camax.goto zamax.goto \ | |||
| smax.goto dmax.goto \ | |||
| samin.goto damin.goto scamin.goto dzamin.goto \ | |||
| samin.goto damin.goto camin.goto zamin.goto \ | |||
| smin.goto dmin.goto \ | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) | |||
| @@ -2832,12 +2838,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME) | |||
| damax.goto : damax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| ############################################## SCAMAX ############################################## | |||
| scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME) | |||
| ############################################## CAMAX ############################################## | |||
| camax.goto : camax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| ############################################## DZAMAX ############################################## | |||
| dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME) | |||
| ############################################## ZAMAX ############################################## | |||
| zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| ############################################## SMAX ############################################## | |||
| @@ -2856,12 +2862,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME) | |||
| damin.goto : damin.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| ############################################## SCAMIN ############################################## | |||
| scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME) | |||
| ############################################## CAMIN ############################################## | |||
| camin.goto : camin.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| ############################################## DZAMIN ############################################## | |||
| dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME) | |||
| ############################################## ZAMIN ############################################## | |||
| zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| ############################################## SMIN ############################################## | |||
| @@ -3383,10 +3389,10 @@ samax.$(SUFFIX) : amax.c | |||
| damax.$(SUFFIX) : amax.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| scamax.$(SUFFIX) : amax.c | |||
| camax.$(SUFFIX) : amax.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| dzamax.$(SUFFIX) : amax.c | |||
| zamax.$(SUFFIX) : amax.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| @@ -3403,10 +3409,10 @@ samin.$(SUFFIX) : amin.c | |||
| damin.$(SUFFIX) : amin.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| scamin.$(SUFFIX) : amin.c | |||
| camin.$(SUFFIX) : amin.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| dzamin.$(SUFFIX) : amin.c | |||
| zamin.$(SUFFIX) : amin.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| @@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling | |||
| include $(TOPDIR)/Makefile.tail | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
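A usage sketch for the newly exported amax/amin entry points declared above (values are illustrative; per the `-DUSE_ABS` build rules later in this changeset, samax/samin return the largest and smallest absolute value):

```c
#include <stdio.h>
#include "cblas.h"

int main(void) {
    float x[4] = {-3.0f, 1.5f, -7.25f, 2.0f};
    /* expected output: amax = 7.25, amin = 1.5 */
    printf("amax = %f, amin = %f\n",
           cblas_samax(4, x, 1), cblas_samin(4, x, 1));
    return 0;
}
```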
| @@ -290,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA | |||
| void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
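A sketch of the new gemmt interface declared above (values illustrative): only the selected triangle of the M x M result C is computed, which is the point of gemmt over gemm.

```c
#include "cblas.h"

int main(void) {
    float A[6] = {1, 2, 3, 4, 5, 6}; /* M x K = 2 x 3, column-major */
    float B[6] = {1, 0, 0, 1, 1, 1}; /* K x M = 3 x 2, column-major */
    float C[4] = {0};                /* M x M = 2 x 2; the lower part is left untouched */
    cblas_sgemmt(CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
                 2, 3, 1.0f, A, 2, B, 3, 0.0f, C, 2);
    return 0;
}
```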
| @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * | |||
| void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, | |||
| xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); | |||
| void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, | |||
| double *, blasint *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, | |||
| double *, blasint *, double *, blasint *, double *, double *, blasint *); | |||
| int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, | |||
| float *, float *, blasint *, float *, blasint *, | |||
| float *, float *, blasint *); | |||
| @@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); | |||
| void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *); | |||
| void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *); | |||
| void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); | |||
| void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); | |||
| @@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define SEEK_ADDRESS | |||
| #if defined(C910V) | |||
| #include <riscv_vector.h> | |||
| #if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) | |||
| # include <riscv_vector.h> | |||
| #endif | |||
| #if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) | |||
| // the T-Head toolchain uses obsolete RVV intrinsics; C910V cannot be built without this | |||
| #define RISCV_0p10_INTRINSICS | |||
| #define RISCV_RVV(x) x | |||
| #else | |||
| #define RISCV_RVV(x) __riscv_ ## x | |||
| #endif | |||
| #if defined(C910V) || defined(RISCV64_ZVL256B) | |||
| # if !defined(DOUBLE) | |||
| # define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) | |||
| # else | |||
| # define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) | |||
| # endif | |||
| #else | |||
| # define EXTRACT_FLOAT(v) (v[0]) | |||
| #endif | |||
| #endif | |||
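A minimal sketch of how these helpers are meant to be used in kernels (the function here is hypothetical; it assumes the RISCV_RVV and EXTRACT_FLOAT definitions above are in scope and the file is compiled with the RVV flags shown earlier in this changeset):

```c
#include <riscv_vector.h>

/* Hypothetical helper: read lane 0 of a float32 m1 vector. On a
 * 0.10-era T-Head toolchain, EXTRACT_FLOAT expands to
 * vfmv_f_s_f32m1_f32(v); on current compilers, to
 * __riscv_vfmv_f_s_f32m1_f32(v) - the intrinsic name is spelled once
 * and RISCV_RVV supplies the __riscv_ prefix only where needed. */
static float first_lane(vfloat32m1_t v) {
    return EXTRACT_FLOAT(v);
}
```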
| @@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_x280 2 | |||
| #define CPU_RISCV64_ZVL256B 3 | |||
| #define CPU_RISCV64_ZVL128B 4 | |||
| static char *cpuname[] = { | |||
| "RISCV64_GENERIC", | |||
| "C910V" | |||
| "C910V", | |||
| "x280", | |||
| "CPU_RISCV64_ZVL256B", | |||
| "CPU_RISCV64_ZVL128B" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "riscv64_generic", | |||
| "c910v", | |||
| "x280", | |||
| "riscv64_zvl256b", | |||
| "riscv64_zvl128b" | |||
| }; | |||
| int detect(void){ | |||
| @@ -86,23 +100,29 @@ int detect(void){ | |||
| char *pmodel = NULL, *pisa = NULL; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| if (!infile) | |||
| return CPU_GENERIC; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if(!strncmp(buffer, "model name", 10)){ | |||
| strcpy(model_buffer, buffer); | |||
| pmodel = strchr(isa_buffer, ':') + 1; | |||
| pmodel = strchr(model_buffer, ':'); | |||
| if (pmodel) | |||
| pmodel++; | |||
| } | |||
| if(!strncmp(buffer, "isa", 3)){ | |||
| strcpy(isa_buffer, buffer); | |||
| pisa = strchr(isa_buffer, '4') + 1; | |||
| pisa = strchr(isa_buffer, '4'); | |||
| if (pisa) | |||
| pisa++; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (!pmodel) | |||
| if (!pmodel || !pisa) | |||
| return(CPU_GENERIC); | |||
| if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) | |||
| return CPU_C910V; | |||
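The parsing fix above guards against `strchr()` returning NULL before stepping past the match; a standalone sketch of the hardened pattern:

```c
#include <string.h>

/* Returns the text after the first occurrence of c, or NULL if absent.
 * The old code did strchr(s, c) + 1 unconditionally, which is undefined
 * behaviour when the character is missing. */
static const char *after_char(const char *s, int c) {
    const char *p = strchr(s, c);
    return p ? p + 1 : NULL;
}
```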
| @@ -140,5 +160,5 @@ void get_cpuconfig(void){ | |||
| } | |||
| void get_libname(void){ | |||
| printf("riscv64\n"); | |||
| printf("%s", cpuname_lower[detect()]); | |||
| } | |||
| @@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CEXTRALIB += -lgomp | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -96,7 +96,7 @@ | |||
| INTEGER ICAMAXTEST | |||
| EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST | |||
| * .. External Subroutines .. | |||
| EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 | |||
| EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX | |||
| * .. Common blocks .. | |||
| @@ -214,8 +214,8 @@ | |||
| CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), | |||
| + STRUE4(NP1),SFAC) | |||
| ELSE IF (ICASE.EQ.8) THEN | |||
| * .. CSCAL .. | |||
| CALL CSCAL(N,CA,CX,INCX) | |||
| * .. CSCALTEST .. | |||
| CALL CSCALTEST(N,CA,CX,INCX) | |||
| CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), | |||
| + SFAC) | |||
| ELSE IF (ICASE.EQ.9) THEN | |||
| @@ -236,14 +236,14 @@ | |||
| * | |||
| INCX = 1 | |||
| IF (ICASE.EQ.8) THEN | |||
| * CSCAL | |||
| * CSCALTEST | |||
| * Add a test for alpha equal to zero. | |||
| CA = (0.0E0,0.0E0) | |||
| DO 80 I = 1, 5 | |||
| MWPCT(I) = (0.0E0,0.0E0) | |||
| MWPCS(I) = (1.0E0,1.0E0) | |||
| 80 CONTINUE | |||
| CALL CSCAL(5,CA,CX,INCX) | |||
| CALL CSCALTEST(5,CA,CX,INCX) | |||
| CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) | |||
| ELSE IF (ICASE.EQ.9) THEN | |||
| * CSSCALTEST | |||
| @@ -440,6 +440,7 @@ static real c_b43 = (float)1.; | |||
| extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | |||
| static complex mwpcs[5], mwpct[5]; | |||
| extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | |||
| extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||
| static complex cx[8]; | |||
| extern real scnrm2test_(integer*, complex*, integer*); | |||
| static integer np1; | |||
| @@ -481,7 +482,7 @@ static real c_b43 = (float)1.; | |||
| stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); | |||
| } else if (combla_1.icase == 8) { | |||
| /* .. CSCAL .. */ | |||
| cscal_(&combla_1.n, &ca, cx, &combla_1.incx); | |||
| cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); | |||
| ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], | |||
| &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); | |||
| } else if (combla_1.icase == 9) { | |||
| @@ -515,7 +516,7 @@ static real c_b43 = (float)1.; | |||
| mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; | |||
| /* L80: */ | |||
| } | |||
| cscal_(&c__5, &ca, cx, &combla_1.incx); | |||
| cscaltest_(&c__5, &ca, cx, &combla_1.incx); | |||
| ctest_(&c__5, cx, mwpct, mwpcs, sfac); | |||
| } else if (combla_1.icase == 9) { | |||
| /* CSSCALTEST */ | |||
| @@ -48,6 +48,12 @@ | |||
| #endif | |||
| #endif | |||
| #ifdef SMP_DEBUG | |||
| # define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) | |||
| #else | |||
| # define MT_TRACE(...) | |||
| #endif | |||
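A self-contained illustration of the new facility (assuming SMP_DEBUG is supplied on the compiler command line, e.g. via `-DSMP_DEBUG`):

```c
#include <stdio.h>

#ifdef SMP_DEBUG
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
#else
# define MT_TRACE(...)
#endif

int main(void) {
    /* prints to stderr only when built with -DSMP_DEBUG; otherwise the
       macro expands to nothing and the call has zero cost */
    MT_TRACE("Server[%2d] Thread is started!\n", 0);
    return 0;
}
```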
| /* This is a thread implementation for Win32 lazy implementation */ | |||
| /* Thread server common information */ | |||
| @@ -68,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; | |||
| static DWORD blas_threads_id[MAX_CPU_NUMBER]; | |||
| static volatile int thread_target; // target num of live threads, volatile for cross-thread reads | |||
| #if defined (__GNUC__) && (__GNUC__ < 6) | |||
| #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) | |||
| #else | |||
| #if defined(_WIN64) | |||
| #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) | |||
| #else | |||
| #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) | |||
| #endif | |||
| #endif | |||
| // | |||
| // Legacy code path | |||
| // | |||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { | |||
| static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| if (!(mode & BLAS_COMPLEX)){ | |||
| if (!(mode & BLAS_COMPLEX)) { | |||
| #ifdef EXPRECISION | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| /* REAL / Extended Double */ | |||
| @@ -95,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> c, args -> ldc, sb); | |||
| } else | |||
| #endif | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| /* REAL / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| @@ -106,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { | |||
| /* REAL / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| @@ -118,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| #ifdef BUILD_BFLOAT16 | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -129,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { | |||
| /* REAL / BLAS_STOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -140,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> a, args -> lda, | |||
| args -> b, args -> ldb, | |||
| args -> c, args -> ldc, sb); | |||
| } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ | |||
| } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { | |||
| /* REAL / BLAS_DTOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| @@ -157,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } else { | |||
| #ifdef EXPRECISION | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
| /* COMPLEX / Extended Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| @@ -171,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| args -> c, args -> ldc, sb); | |||
| } else | |||
| #endif | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| /* COMPLEX / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| @@ -201,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| /* This is a main routine of threads. Each thread waits until job is */ | |||
| /* queued. */ | |||
| static DWORD WINAPI blas_thread_server(void *arg){ | |||
| // | |||
| // This is the main routine of the worker threads. Each thread waits until a job is queued. | |||
| // | |||
| static DWORD WINAPI blas_thread_server(void *arg) { | |||
| /* Thread identifier */ | |||
| BLASLONG cpu = (BLASLONG)arg; | |||
| @@ -215,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| /* Each server needs each buffer */ | |||
| buffer = blas_memory_alloc(2); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Thread is started!\n", cpu); | |||
| while (1){ | |||
| while (1) { | |||
| /* Waiting for Queue */ | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); | |||
| #endif | |||
| // event raised when work is added to the queue | |||
| WaitForSingleObject(kickoff_event, INFINITE); | |||
| MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); | |||
| if (cpu > thread_target - 2) | |||
| { | |||
| //printf("thread [%d] exiting.\n", cpu); | |||
| break; // excess thread, so worker thread exits | |||
| } | |||
| // event raised when work is added to the queue | |||
| WaitForSingleObject(kickoff_event, INFINITE); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); | |||
| #endif | |||
| if (cpu > thread_target - 2) { | |||
| //MT_TRACE("thread [%d] exiting.\n", cpu); | |||
| break; // excess thread, so worker thread exits | |||
| } | |||
| MT_TRACE("Server[%2ld] Got it.\n", cpu); | |||
| #if 1 | |||
| EnterCriticalSection(&queue_lock); | |||
| queue = work_queue; | |||
| @@ -247,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| work_queue = work_queue->next; | |||
| LeaveCriticalSection(&queue_lock); | |||
| #else | |||
| volatile blas_queue_t* queue_next; | |||
| INT_PTR prev_value; | |||
| do { | |||
| queue = (volatile blas_queue_t*)work_queue; | |||
| if (!queue) | |||
| break; | |||
| queue_next = (volatile blas_queue_t*)queue->next; | |||
| prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); | |||
| } while (prev_value != queue); | |||
| #endif | |||
| if (queue) { | |||
| if (queue) { | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||
| sa = queue -> sa; | |||
| sb = queue -> sb; | |||
| #ifdef CONSISTENT_FPCSR | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #ifdef CONSISTENT_FPCSR | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
| MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", | |||
| cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); | |||
| #endif | |||
| // fprintf(stderr, "queue start[%ld]!!!\n", cpu); | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sa == NULL) | |||
| sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sb == NULL) { | |||
| if (!(queue -> mode & BLAS_COMPLEX)){ | |||
| if (!(queue -> mode & BLAS_COMPLEX)) { | |||
| #ifdef EXPRECISION | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { | |||
| sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } else | |||
| #endif | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { | |||
| #ifdef BUILD_DOUBLE | |||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| @@ -327,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| /* Other types in future */ | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING2; | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING2; | |||
| #endif | |||
| if (!(queue -> mode & BLAS_LEGACY)) { | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| } else { | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| } | |||
| }else{ | |||
| continue; //if queue == NULL | |||
| } | |||
| } else { | |||
| continue; //if queue == NULL | |||
| } | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Finished!\n", cpu); | |||
| queue->finished = 1; | |||
| queue->finished = 1; | |||
| } | |||
| /* Shutdown procedure */ | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); | |||
| #endif | |||
| MT_TRACE("Server[%2ld] Shutdown!\n", cpu); | |||
| blas_memory_free(buffer); | |||
| return 0; | |||
| } | |||
| } | |||
| /* Initializing routine */ | |||
| int blas_thread_init(void){ | |||
| // | |||
| // Initializing routine | |||
| // | |||
| int blas_thread_init(void) { | |||
| BLASLONG i; | |||
| if (blas_server_avail || (blas_cpu_number <= 1)) return 0; | |||
| LOCK_COMMAND(&server_lock); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", | |||
| blas_cpu_number); | |||
| #endif | |||
| MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); | |||
| if (!blas_server_avail){ | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| if (!blas_server_avail) { | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| thread_target = blas_cpu_number; | |||
| thread_target = blas_cpu_number; | |||
| InitializeCriticalSection(&queue_lock); | |||
| for(i = 0; i < blas_cpu_number - 1; i++){ | |||
| //printf("thread_init: creating thread [%d]\n", i); | |||
| for(i = 0; i < blas_cpu_number - 1; i++) { | |||
| //MT_TRACE("thread_init: creating thread [%d]\n", i); | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -400,15 +371,12 @@ int blas_thread_init(void){ | |||
| return 0; | |||
| } | |||
| /* | |||
| User can call one of two routines. | |||
| exec_blas_async ... immediately returns after jobs are queued. | |||
| exec_blas ... returns after jobs are finished. | |||
| */ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| // | |||
| // The user can call one of two routines. | |||
| // exec_blas_async ... immediately returns after jobs are queued. | |||
| // exec_blas ... returns after jobs are finished. | |||
| // | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { | |||
| #if defined(SMP_SERVER) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| @@ -428,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); | |||
| #endif | |||
| current->finished = 0; | |||
| current->finished = 0; | |||
| current = current -> next; | |||
| pos ++; | |||
| } | |||
| @@ -437,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| if (!work_queue) | |||
| { | |||
| work_queue = queue; | |||
| work_queue = queue; | |||
| } | |||
| else | |||
| { | |||
| blas_queue_t *next_item = work_queue; | |||
| // find the end of the work queue | |||
| while (next_item) | |||
| next_item = next_item->next; | |||
| // find the end of the work queue | |||
| while (next_item->next) | |||
| next_item = next_item->next; | |||
| // add new work to the end | |||
| next_item = queue; | |||
| // add new work to the end | |||
| next_item->next = queue; | |||
| } | |||
| LeaveCriticalSection(&queue_lock); | |||
| @@ -458,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| // | |||
| // Join. Wait for all queued tasks to complete | |||
| // | |||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Synchronization Waiting.\n"); | |||
| #endif | |||
| MT_TRACE("Synchronization Waiting.\n"); | |||
| while (num){ | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Waiting Queue ..\n"); | |||
| #endif | |||
| while (!queue->finished) | |||
| YIELDING; | |||
| while (num) { | |||
| MT_TRACE("Waiting Queue ..\n"); | |||
| queue = queue->next; | |||
| num--; | |||
| } | |||
| while (!queue->finished) | |||
| YIELDING; | |||
| queue = queue->next; | |||
| num--; | |||
| } | |||
| MT_TRACE("Completely Done.\n\n"); | |||
| #ifdef SMP_DEBUG | |||
| fprintf(STDERR, "Completely Done.\n\n"); | |||
| #endif | |||
| // if work was added to the queue after this batch, we must not put the worker | |||
| // threads back to sleep by resetting the event | |||
| EnterCriticalSection(&queue_lock); | |||
| @@ -490,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | |||
| return 0; | |||
| } | |||
| /* Execute Threads */ | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| // | |||
| // Execute Threads | |||
| // | |||
| int exec_blas(BLASLONG num, blas_queue_t *queue) { | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| @@ -504,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| if ((num <= 0) || (queue == NULL)) return 0; | |||
| if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); | |||
| if ((num > 1) && queue -> next) | |||
| exec_blas_async(1, queue -> next); | |||
| routine = queue -> routine; | |||
| if (queue -> mode & BLAS_LEGACY) { | |||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
| } else | |||
| } else { | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, | |||
| queue -> sa, queue -> sb, 0); | |||
| queue -> sa, queue -> sb, 0); | |||
| } | |||
| if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); | |||
| if ((num > 1) && queue -> next) | |||
| exec_blas_async_wait(num - 1, queue -> next); | |||
| return 0; | |||
| } | |||
| /* Shutdown procedure, but user don't have to call this routine. The */ | |||
| /* kernel automatically kill threads. */ | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| // | |||
| // Shutdown procedure; the user doesn't have to call this routine. The | |||
| // kernel automatically kills the threads. | |||
| // | |||
| int BLASFUNC(blas_thread_shutdown)(void) { | |||
| int i; | |||
| @@ -534,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| LOCK_COMMAND(&server_lock); | |||
| if (blas_server_avail){ | |||
| if (blas_server_avail) { | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||
| // Could also just use WaitForMultipleObjects | |||
| DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); | |||
| @@ -558,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| return 0; | |||
| } | |||
| // | |||
| // Legacy function to set the number of threads | |||
| // | |||
| void goto_set_num_threads(int num_threads) | |||
| { | |||
| long i; | |||
| @@ -571,7 +547,7 @@ void goto_set_num_threads(int num_threads) | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| if (blas_server_avail && num_threads < blas_num_threads) { | |||
| if (blas_server_avail && num_threads < blas_num_threads) { | |||
| LOCK_COMMAND(&server_lock); | |||
| thread_target = num_threads; | |||
| @@ -579,11 +555,11 @@ void goto_set_num_threads(int num_threads) | |||
| SetEvent(kickoff_event); | |||
| for (i = num_threads - 1; i < blas_num_threads - 1; i++) { | |||
| //printf("set_num_threads: waiting on thread [%d] to quit.\n", i); | |||
| //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i); | |||
| WaitForSingleObject(blas_threads[i], INFINITE); | |||
| //printf("set_num_threads: thread [%d] has quit.\n", i); | |||
| //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i); | |||
| CloseHandle(blas_threads[i]); | |||
| } | |||
| @@ -601,8 +577,8 @@ void goto_set_num_threads(int num_threads) | |||
| thread_target = num_threads; | |||
| //increased_threads = 1; | |||
| if (!blas_server_avail){ | |||
| //increased_threads = 1; | |||
| if (!blas_server_avail) { | |||
| // create the kickoff Event | |||
| kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); | |||
| @@ -611,8 +587,8 @@ void goto_set_num_threads(int num_threads) | |||
| blas_server_avail = 1; | |||
| } | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| //printf("set_num_threads: creating thread [%d]\n", i); | |||
| for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { | |||
| //MT_TRACE("set_num_threads: creating thread [%d]\n", i); | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -627,6 +603,9 @@ void goto_set_num_threads(int num_threads) | |||
| blas_cpu_number = num_threads; | |||
| } | |||
| // | |||
| // OpenBLAS function to set the thread count | |||
| // | |||
| void openblas_set_num_threads(int num) | |||
| { | |||
| goto_set_num_threads(num); | |||
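Usage sketch for the public entry point (`openblas_set_num_threads` is declared in cblas.h; the calling program is hypothetical):

```c
#include "cblas.h"

int main(void) {
    /* Resizes the thread pool at runtime; shrinking lowers thread_target,
       and excess server threads observe it and exit, as shown in
       blas_thread_server above. */
    openblas_set_num_threads(4);
    return 0;
}
```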
| @@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR; | |||
| #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE | |||
| #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE | |||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||
| #define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE | |||
| #else | |||
| extern gotoblas_t gotoblas_HASWELL; | |||
| extern gotoblas_t gotoblas_ZEN; | |||
| @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { | |||
| #define CPU_POWER9 9 | |||
| #define CPU_POWER10 10 | |||
| #ifndef POWER_9 | |||
| #define POWER_9 0x20000 /* 9 class CPU */ | |||
| #endif | |||
| #ifndef POWER_10 | |||
| #define POWER_10 0x40000 /* 10 class CPU */ | |||
| #endif | |||
| #ifdef _AIX | |||
| #include <sys/systemcfg.h> | |||
| @@ -62,7 +69,7 @@ static int cpuid(void) | |||
| else if (arch == POWER_9) return CPU_POWER9; | |||
| #endif | |||
| #ifdef POWER_10 | |||
| else if (arch == POWER_10) return CPU_POWER10; | |||
| else if (arch >= POWER_10) return CPU_POWER10; | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| } | |||
| @@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) { | |||
| if (gotoblas && gotoblas -> init) { | |||
| strncpy(coren,gotoblas_corename(),20); | |||
| sprintf(coremsg, "Core: %s\n",coren); | |||
| if (getenv("GET_OPENBLAS_CORETYPE")) { | |||
| fprintf(stderr, "%s", coremsg); | |||
| } | |||
| openblas_warning(2, coremsg); | |||
| gotoblas -> init(); | |||
| } else { | |||
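The new diagnostic can be exercised from the shell; a sketch (the program name is hypothetical, and any non-empty value works since only the presence of the variable is checked via `getenv()`):

```sh
# Prints the detected core (the "Core: ..." message) to stderr at
# library initialization time.
GET_OPENBLAS_CORETYPE=1 ./my_blas_app
```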
| @@ -3214,7 +3214,7 @@ void blas_shutdown(void){ | |||
| #endif | |||
| memory[pos].lock = 0; | |||
| } | |||
| if (memory_overflowed) | |||
| if (memory_overflowed) { | |||
| for (pos = 0; pos < NEW_BUFFERS; pos ++){ | |||
| newmemory[pos].addr = (void *)0; | |||
| newmemory[pos].used = 0; | |||
| @@ -3222,6 +3222,10 @@ void blas_shutdown(void){ | |||
| newmemory[pos].pos = -1; | |||
| #endif | |||
| newmemory[pos].lock = 0; | |||
| } | |||
| free(newmemory); | |||
| newmemory = NULL; | |||
| memory_overflowed = 0; | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| @@ -60,6 +60,7 @@ cblasobjsc=" | |||
| cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv | |||
| cblas_scnrm2 cblas_scasum cblas_cgemmt | |||
| cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy | |||
| cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin | |||
| " | |||
| cblasobjsd=" | |||
| cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot | |||
| @@ -69,6 +70,7 @@ cblasobjsd=" | |||
| cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv | |||
| cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt | |||
| cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy | |||
| cblas_damax cblas_damin | |||
| " | |||
| cblasobjss=" | |||
| @@ -80,6 +82,7 @@ cblasobjss=" | |||
| cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm | |||
| cblas_strsv cblas_sgeadd cblas_sgemmt | |||
| cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy | |||
| cblas_samax cblas_samin | |||
| " | |||
| cblasobjsz=" | |||
| @@ -91,6 +94,7 @@ cblasobjsz=" | |||
| cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub | |||
| cblas_zaxpby cblas_zgeadd cblas_zgemmt | |||
| cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy | |||
| cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin | |||
| " | |||
| cblasobjs="cblas_xerbla" | |||
| @@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z | |||
| zgedmd | |||
| zgedmdq | |||
| " | |||
| #functions added post 3.11 | |||
| lapackobjs2c="$lapackobjs2c | |||
| claqp2rk | |||
| claqp3rk | |||
| ctrsyl3 | |||
| " | |||
| # claqz0 | |||
| # claqz1 | |||
| # claqz2 | |||
| # claqz3 | |||
| # clatrs3 | |||
| lapackobjs2d="$lapackobjs2d | |||
| dgelqs | |||
| dgelst | |||
| dgeqp3rk | |||
| dgeqrs | |||
| dlaqp2rk | |||
| dlaqp3rk | |||
| dlarmm | |||
| dlatrs3 | |||
| dtrsyl3 | |||
| " | |||
| # dlaqz0 | |||
| # dlaqz1 | |||
| # dlaqz2 | |||
| # dlaqz3 | |||
| # dlaqz4 | |||
| lapackobjs2z="$lapackobjs2z | |||
| zgelqs | |||
| zgelst | |||
| zgeqp3rk | |||
| zgeqrs | |||
| zlaqp2rk | |||
| zlaqp3rk | |||
| zlatrs3 | |||
| zrscl | |||
| ztrsyl3 | |||
| " | |||
| # zlaqz0 | |||
| # zlaqz1 | |||
| # zlaqz2 | |||
| # zlaqz3 | |||
| lapack_extendedprecision_objs=" | |||
| zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx | |||
| dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx | |||
| @@ -1622,6 +1673,14 @@ lapackeobjsc=" | |||
| LAPACKE_cgetsqrhrt_work | |||
| LAPACKE_cungtsqr_row | |||
| LAPACKE_cungtsqr_row_work | |||
| LAPACKE_clangb | |||
| LAPACKE_clangb_work | |||
| LAPACKE_ctrsyl3 | |||
| LAPACKE_ctrsyl3_work | |||
| LAPACKE_ctz_nancheck | |||
| LAPACKE_ctz_trans | |||
| LAPACKE_cunhr_col | |||
| LAPACKE_cunhr_col_work | |||
| " | |||
| lapackeobjsd=" | |||
| @@ -2239,6 +2298,14 @@ lapackeobjsd=" | |||
| LAPACKE_dgetsqrhrt_work | |||
| LAPACKE_dorgtsqr_row | |||
| LAPACKE_dorgtsqr_row_work | |||
| LAPACKE_dlangb | |||
| LAPACKE_dlangb_work | |||
| LAPACKE_dorhr_col | |||
| LAPACKE_dorhr_col_work | |||
| LAPACKE_dtrsyl3 | |||
| LAPACKE_dtrsyl3_work | |||
| LAPACKE_dtz_nancheck | |||
| LAPACKE_dtz_trans | |||
| " | |||
| lapackeobjss=" | |||
| @@ -2848,6 +2915,14 @@ lapackeobjss=" | |||
| LAPACKE_sgetsqrhrt_work | |||
| LAPACKE_sorgtsqr_row | |||
| LAPACKE_sorgtsqr_row_work | |||
| LAPACKE_slangb | |||
| LAPACKE_slangb_work | |||
| LAPACKE_sorhr_col | |||
| LAPACKE_sorhr_col_work | |||
| LAPACKE_strsyl3 | |||
| LAPACKE_strsyl3_work | |||
| LAPACKE_stz_nancheck | |||
| LAPACKE_stz_trans | |||
| " | |||
| lapackeobjsz=" | |||
| @@ -3515,6 +3590,14 @@ lapackeobjsz=" | |||
| LAPACKE_zgetsqrhrt_work | |||
| LAPACKE_zungtsqr_row | |||
| LAPACKE_zungtsqr_row_work | |||
| LAPACKE_zlangb | |||
| LAPACKE_zlangb_work | |||
| LAPACKE_ztrsyl3 | |||
| LAPACKE_ztrsyl3_work | |||
| LAPACKE_ztz_nancheck | |||
| LAPACKE_ztz_trans | |||
| LAPACKE_zunhr_col | |||
| LAPACKE_zunhr_col_work | |||
| " | |||
| ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | |||
| ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | |||
| @@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s=" | |||
| ssysv_aa_2stage ssytrf_aa_2stage | |||
| ssytrs_aa_2stage | |||
| slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col | |||
| slarfb_gett | |||
| " | |||
| lapack_embeded_underscore_objs_c=" | |||
| chetf2_rook chetrf_rook chetri_rook | |||
| @@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c=" | |||
| csysv_aa_2stage csytrf_aa_2stage | |||
| csytrs_aa_2stage | |||
| claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col | |||
| clarfb_gett | |||
| " | |||
| lapack_embeded_underscore_objs_d=" | |||
| dlasyf_rook | |||
| @@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d=" | |||
| dsysv_aa_2stage | |||
| dsytrf_aa_2stage dsytrs_aa_2stage | |||
| dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col | |||
| dlarfb_gett | |||
| " | |||
| lapack_embeded_underscore_objs_z=" | |||
| zhetf2_rook zhetrf_rook zhetri_rook | |||
| @@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z=" | |||
| zhetrs_aa_2stage zsysv_aa_2stage | |||
| zsytrf_aa_2stage zsytrs_aa_2stage | |||
| zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col | |||
| zlarfb_gett | |||
| " | |||
| dirname=`pwd -P`/../lapack-netlib | |||
| @@ -1679,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "c910v" | |||
| #define CORENAME "C910V" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_x280 | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "x280" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-Dx280 " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "x280" | |||
| #define CORENAME "x280" | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL256B | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL256B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL256B " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl256b" | |||
| #define CORENAME "RISCV64_ZVL256B" | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL128B | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL128B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL128B " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl128b" | |||
| #define CORENAME "RISCV64_ZVL128B" | |||
| #endif | |||
| #if defined(FORCE_E2K) || defined(__e2k__) | |||
| #define FORCE | |||
| @@ -119,6 +119,7 @@ endif () | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| @@ -130,6 +131,8 @@ endif () | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ | |||
| cblas_samin.$(SUFFIX) | |||
| CSBLAS2OBJS = \ | |||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | |||
| @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ | |||
| cblas_damin.$(SUFFIX) | |||
| CDBLAS2OBJS = \ | |||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | |||
| @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| @@ -340,12 +342,12 @@ CXERBLAOBJ = \ | |||
| CZBLAS1OBJS = \ | |||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) \ | |||
| cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ | |||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | |||
| @@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||
| cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c | |||
| cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||
| cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| @@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| char transA, transB, Uplo; | |||
| blasint nrowa, nrowb; | |||
| #if defined(COMPLEX) | |||
| blasint ncolb; | |||
| #endif | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| @@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| uplo = 0; | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = k; | |||
| #endif | |||
| } | |||
| info = 0; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowa)) | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowb)) | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
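The `& 1` tests in this hunk rely on OpenBLAS's internal transpose encoding, restated here as an assumption for readers of the diff:

```c
/* Assumed decode of 'N'/'T'/'R'/'C' into internal trans codes: */
enum { TRANS_N = 0, TRANS_T = 1, TRANS_R = 2, TRANS_C = 3 };

static int is_transposed(int trans) { return trans & 1; } /* T or C */
static int is_conjugated(int trans) { return trans > 1; } /* R or C */
/* hence the complex path below conjugates B via IMATCOPY when transb > 1 */
```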
| @@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| blasint info; | |||
| blasint lda, ldb; | |||
| FLOAT *a, *b; | |||
| #if defined(COMPLEX) | |||
| blasint nrowb, ncolb; | |||
| #endif | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| @@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| blasint nrowa, nrowb; | |||
| blasint nrowa; | |||
| #if !defined(COMPLEX) | |||
| blasint nrowb; | |||
| #endif | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = k; | |||
| #endif | |||
| } | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| @@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| blasint ncola, ncolb; | |||
| ncola = k; | |||
| if (transa) ncola = m; | |||
| ncolb = m; | |||
| if (transb) ncolb = k; | |||
| blasint ncola; | |||
| #if !defined(COMPLEX) | |||
| blasint ncolb; | |||
| #endif | |||
| ncola = m; | |||
| if (transa & 1) ncola = k; | |||
| ncolb = k; | |||
| #if defined(COMPLEX) | |||
| nrowb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| #if defined(COMPLEX) | |||
| nrowb = k; | |||
| #endif | |||
| ncolb = m; | |||
| } | |||
| if (ldc < MAX(1,m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, ncolb)) | |||
| info = 10; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 8; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 10; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 3; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| @@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_START; | |||
| const blasint incb = (transb == 0) ? 1 : ldb; | |||
| #if defined(COMPLEX) | |||
| if (transb > 1){ | |||
| #ifndef CBLAS | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #else | |||
| if (order == CblasColMajor) | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| if (order == CblasRowMajor) | |||
| IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #endif | |||
| } | |||
| #endif | |||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; | |||
| if (uplo == 1) { | |||
| for (i = 0; i < m; i++) { | |||
| @@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #if defined(COMPLEX) | |||
| aa = a + i * 2; | |||
| bb = b + i * ldb * 2; | |||
| if (transa) { | |||
| if (transa & 1) { | |||
| aa = a + lda * i * 2; | |||
| } | |||
| if (transb) | |||
| if (transb & 1) | |||
| bb = b + i * 2; | |||
| cc = c + i * 2 * ldc + i * 2; | |||
| #else | |||
| aa = a + i; | |||
| bb = b + i * ldb; | |||
| if (transa) { | |||
| if (transa & 1) { | |||
| aa = a + lda * i; | |||
| } | |||
| if (transb) | |||
| if (transb & 1) | |||
| bb = b + i; | |||
| cc = c + i * ldc + i; | |||
| #endif | |||
| @@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| return; | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| @@ -478,7 +528,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| @@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| @@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, | |||
| bb, incb, cc, 1, buffer); | |||
| else | |||
| @@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||
| lda, bb, incb, cc, | |||
| 1, buffer, | |||
| @@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| l = j; | |||
| #if defined COMPLEX | |||
| bb = b + i * ldb * 2; | |||
| if (transb) { | |||
| if (transb & 1) { | |||
| bb = b + i * 2; | |||
| } | |||
| cc = c + i * 2 * ldc; | |||
| #else | |||
| bb = b + i * ldb; | |||
| if (transb) { | |||
| if (transb & 1) { | |||
| bb = b + i; | |||
| } | |||
| cc = c + i * ldc; | |||
| @@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| return; | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| @@ -567,7 +617,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| @@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| @@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, | |||
| incb, cc, 1, buffer); | |||
| else | |||
| @@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||
| bb, incb, cc, 1, | |||
| buffer, nthreads); | |||
| @@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| } | |||
| #endif | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); | |||
| if ( *rows > *cols ) | |||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); | |||
| else | |||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
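The sizing fix above matters because the staging buffer for the in-place copy is addressed with stride `*ldb`; when the leading dimension exceeds the active dimension, the old `rows * cols` count under-allocates. The corrected rule, as a sketch (hypothetical helper name):

```c
#include <stddef.h>

/* Corrected scratch sizing for the in-place matcopy: the buffer is
 * indexed with stride ldb, so it must span max(rows, cols) * ldb
 * elements rather than rows * cols.  (Complex variants double this.) */
size_t imatcopy_scratch_bytes(size_t rows, size_t cols, size_t ldb,
                              size_t elem_size)
{
    size_t major = rows > cols ? rows : cols;
    return major * ldb * elem_size;
}
```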
| @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| #else | |||
| #ifdef COMPLEX | |||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||
| FLOAT *x = (FLOAT*) vx; | |||
| #else | |||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| #endif | |||
| FLOAT ret; | |||
| PRINT_DEBUG_CNAME; | |||
| @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| else | |||
| { | |||
| dp2 = *dd2 * dy1; | |||
| if(dp2 == ZERO) | |||
| { | |||
| dflag = -TWO; | |||
| dparam[0] = dflag; | |||
| return; | |||
| } | |||
| dp1 = *dd1 * *dx1; | |||
| dq2 = dp2 * dy1; | |||
| dq1 = dp1 * *dx1; | |||
| @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| dh12 = dp2 / dp1; | |||
| du = ONE - dh12 * dh21; | |||
| if(du > ZERO) | |||
| { | |||
| dflag = ZERO; | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } else { | |||
| dflag = -ONE; | |||
| dh11 = ZERO; | |||
| dh12 = ZERO; | |||
| dh21 = ZERO; | |||
| dh22 = ZERO; | |||
| *dd1 = ZERO; | |||
| *dd2 = ZERO; | |||
| *dx1 = ZERO; | |||
| } | |||
| dflag = ZERO; | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } | |||
| else | |||
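The two rotmg hunks above delete the `dp2 == 0` early exit and the `du > 0` guard, leaving the `dflag = ZERO` rescaling unconditional in that branch. The retained path builds the modified Givens matrix H = [[1, dh12], [dh21, 1]] (with dh21 = −dy1/dx1 as in reference BLAS, dh12 = dp2/dp1 as shown above) and rescales by du = 1 − dh12·dh21. A worked numeric check of that path; the sample values are arbitrary:

```c
/* Worked check of rotmg's surviving dflag == 0 branch:
 *   dh21 = -dy1/dx1, dh12 = dp2/dp1, du = 1 - dh12*dh21,
 *   then dd1 /= du, dd2 /= du, dx1 *= du. */
#include <stdio.h>

int main(void)
{
    double dd1 = 2.0, dd2 = 1.0, dx1 = 3.0, dy1 = 1.0;
    double dp1 = dd1 * dx1;            /* 6  */
    double dp2 = dd2 * dy1;            /* 1  */
    double dq1 = dp1 * dx1;            /* 18 */
    double dq2 = dp2 * dy1;            /* 1  -> |dq1| > |dq2|: dflag = 0 */
    double dh21 = -dy1 / dx1;          /* -1/3 */
    double dh12 = dp2 / dp1;           /* 1/6  */
    double du = 1.0 - dh12 * dh21;     /* 19/18 > 0 */
    dd1 /= du; dd2 /= du; dx1 *= du;
    printf("dq1=%g dq2=%g dd1=%g dd2=%g dx1=%g\n", dq1, dq2, dd1, dd2, dx1);
    return 0;
}
```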
| @@ -0,0 +1,447 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2024, The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #define ERROR_NAME "SBGEMMT " | |||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | |||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| #endif | |||
| #ifndef CBLAS | |||
| void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| blasint * M, blasint * K, | |||
| FLOAT * Alpha, | |||
| IFLOAT * a, blasint * ldA, | |||
| IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) | |||
| { | |||
| blasint m, k; | |||
| blasint lda, ldb, ldc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| char transA, transB, Uplo; | |||
| blasint nrowa, nrowb; | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| FLOAT alpha, beta; | |||
| PRINT_DEBUG_NAME; | |||
| m = *M; | |||
| k = *K; | |||
| alpha = *Alpha; | |||
| beta = *Beta; | |||
| lda = *ldA; | |||
| ldb = *ldB; | |||
| ldc = *ldC; | |||
| transA = *TRANSA; | |||
| transB = *TRANSB; | |||
| Uplo = *UPLO; | |||
| TOUPPER(transA); | |||
| TOUPPER(transB); | |||
| TOUPPER(Uplo); | |||
| transa = -1; | |||
| transb = -1; | |||
| uplo = -1; | |||
| if (transA == 'N') | |||
| transa = 0; | |||
| if (transA == 'T') | |||
| transa = 1; | |||
| if (transA == 'R') | |||
| transa = 0; | |||
| if (transA == 'C') | |||
| transa = 1; | |||
| if (transB == 'N') | |||
| transb = 0; | |||
| if (transB == 'T') | |||
| transb = 1; | |||
| if (transB == 'R') | |||
| transb = 0; | |||
| if (transB == 'C') | |||
| transb = 1; | |||
| if (Uplo == 'U') | |||
| uplo = 0; | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| nrowa = m; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb & 1) nrowb = m; | |||
| info = 0; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| if (info != 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #else | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, | |||
| blasint k, | |||
| FLOAT alpha, | |||
| IFLOAT * A, blasint LDA, | |||
| IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) | |||
| { | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| blasint lda, ldb; | |||
| IFLOAT *a, *b; | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| uplo = -1; | |||
| transa = -1; | |||
| transb = -1; | |||
| info = 0; | |||
| if (order == CblasColMajor) { | |||
| if (Uplo == CblasUpper) uplo = 0; | |||
| if (Uplo == CblasLower) uplo = 1; | |||
| if (TransA == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransA == CblasTrans) | |||
| transa = 1; | |||
| if (TransA == CblasConjNoTrans) | |||
| transa = 0; | |||
| if (TransA == CblasConjTrans) | |||
| transa = 1; | |||
| if (TransB == CblasNoTrans) | |||
| transb = 0; | |||
| if (TransB == CblasTrans) | |||
| transb = 1; | |||
| if (TransB == CblasConjNoTrans) | |||
| transb = 0; | |||
| if (TransB == CblasConjTrans) | |||
| transb = 1; | |||
| a = (void *)A; | |||
| b = (void *)B; | |||
| lda = LDA; | |||
| ldb = LDB; | |||
| info = -1; | |||
| blasint nrowa; | |||
| blasint nrowb; | |||
| nrowa = m; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb & 1) nrowb = m; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| if (order == CblasRowMajor) { | |||
| a = (void *)B; | |||
| b = (void *)A; | |||
| lda = LDB; | |||
| ldb = LDA; | |||
| if (Uplo == CblasUpper) uplo = 0; | |||
| if (Uplo == CblasLower) uplo = 1; | |||
| if (TransB == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasTrans) | |||
| transa = 1; | |||
| if (TransB == CblasConjNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasConjTrans) | |||
| transa = 1; | |||
| if (TransA == CblasNoTrans) | |||
| transb = 0; | |||
| if (TransA == CblasTrans) | |||
| transb = 1; | |||
| if (TransA == CblasConjNoTrans) | |||
| transb = 0; | |||
| if (TransA == CblasConjTrans) | |||
| transb = 1; | |||
| info = -1; | |||
| blasint ncola; | |||
| blasint ncolb; | |||
| ncola = m; | |||
| if (transa & 1) ncola = k; | |||
| ncolb = k; | |||
| if (transb & 1) { | |||
| ncolb = m; | |||
| } | |||
| if (ldc < MAX(1,m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, ncolb)) | |||
| info = 8; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 10; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 3; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| if (info >= 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #endif | |||
| int buffer_size; | |||
| blasint i, j; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| #endif | |||
| #ifdef SMP | |||
| static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, | |||
| BLASLONG, IFLOAT *, BLASLONG, FLOAT, | |||
| FLOAT *, BLASLONG, int) = { | |||
| sbgemv_thread_n, sbgemv_thread_t, | |||
| }; | |||
| #endif | |||
| int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, | |||
| IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { | |||
| SBGEMV_N, SBGEMV_T,}; | |||
| if (m == 0) | |||
| return; | |||
| IDEBUG_START; | |||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; | |||
| if (uplo == 1) { | |||
| for (i = 0; i < m; i++) { | |||
| j = m - i; | |||
| aa = a + i; | |||
| bb = b + i * ldb; | |||
| if (transa & 1) { | |||
| aa = a + lda * i; | |||
| } | |||
| if (transb & 1) | |||
| bb = b + i; | |||
| cc = c + i * ldc + i; | |||
| #if 0 | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) | |||
| continue; | |||
| #endif | |||
| IDEBUG_START; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, alpha, aa, lda, | |||
| bb, incb, beta, cc, 1); | |||
| else | |||
| (gemv[(int)transa]) (k, j, alpha, aa, lda, | |||
| bb, incb, beta, cc, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||
| lda, bb, incb, beta, cc, | |||
| 1, nthreads); | |||
| else | |||
| (gemv_thread[(int)transa]) (k, j, alpha, aa, | |||
| lda, bb, incb, beta, cc, | |||
| 1, nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| } | |||
| } else { | |||
| for (i = 0; i < m; i++) { | |||
| j = i + 1; | |||
| bb = b + i * ldb; | |||
| if (transb & 1) { | |||
| bb = b + i; | |||
| } | |||
| cc = c + i * ldc; | |||
| #if 0 | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) | |||
| continue; | |||
| #endif | |||
| IDEBUG_START; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||
| #ifdef SMP | |||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, alpha, a, lda, bb, | |||
| incb, beta, cc, 1); | |||
| else | |||
| (gemv[(int)transa]) (k, j, alpha, a, lda, bb, | |||
| incb, beta, cc, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||
| bb, incb, beta, cc, 1, | |||
| nthreads); | |||
| else | |||
| (gemv_thread[(int)transa]) (k, j, alpha, a, lda, | |||
| bb, incb, beta, cc, 1, | |||
| nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| } | |||
| } | |||
| IDEBUG_END; | |||
| return; | |||
| } | |||
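This new sbgemmt.c provides the bfloat16 GEMMT — bfloat16 inputs, single-precision accumulation and output — implemented row by row on top of SBGEMV_N/SBGEMV_T, matching the Makefile fix earlier that points `cblas_sbgemmt` at this source. A hedged usage sketch of the CBLAS entry point; the argument order is read off the CNAME definition above, and `bfloat16` is the 16-bit storage type OpenBLAS declares for BUILD_BFLOAT16 builds:

```c
/* Hedged usage sketch for cblas_sbgemmt: update only the lower triangle
 * of C (m x m, float) with A*B, where A is m x k and B is k x m, both
 * stored as bfloat16.  Argument order follows the CNAME definition above. */
#include <cblas.h>

void update_lower_triangle(int m, int k,
                           const bfloat16 *A, const bfloat16 *B, float *C)
{
    cblas_sbgemmt(CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
                  m, k, 1.0f, A, m, B, k, 0.0f, C, m);
}
```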
| @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef CBLAS | |||
| void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) | |||
| void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) | |||
| { | |||
| blasint n = *N; | |||
| blasint incx = *INCX; | |||
| blasint incy = *INCY; | |||
| FLOAT* ALPHA = (FLOAT*) VALPHA; | |||
| FLOAT* BETA = (FLOAT*) VBETA; | |||
| #else | |||
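The Fortran-callable entry now receives alpha and beta as `void *` and casts them back to `FLOAT *` internally; plausibly this avoids tying the public signature to any particular C complex type while keeping the interleaved (re, im) layout. A sketch of the convention (names are illustrative, not from the source):

```c
/* The void* scalar convention: a complex alpha travels as a pointer to
 * an interleaved (re, im) pair and the callee casts, mirroring the
 * VALPHA/VBETA casts above. */
#include <stdio.h>

void axpby_like(int n, const void *valpha, const float *x, float *y)
{
    const float *alpha = (const float *)valpha;  /* alpha[0]=re, [1]=im */
    for (int i = 0; i < n; i++) {
        y[2*i]   += alpha[0] * x[2*i]   - alpha[1] * x[2*i+1];
        y[2*i+1] += alpha[0] * x[2*i+1] + alpha[1] * x[2*i];
    }
}

int main(void)
{
    float alpha[2] = {0.0f, 1.0f};               /* alpha = i */
    float x[2] = {1.0f, 0.0f}, y[2] = {0.0f, 0.0f};
    axpby_like(1, alpha, x, y);
    printf("%g %g\n", y[0], y[1]);               /* 0 1 */
    return 0;
}
```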
| @@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| } | |||
| #endif | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; | |||
| if ( *rows > *cols ) | |||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; | |||
| else | |||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| @@ -1349,6 +1349,9 @@ endif () | |||
| set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") | |||
| get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) | |||
| set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") | |||
| if (USE_GEMM3M) | |||
| target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | |||
| endif() | |||
| endfunction () | |||
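`USE_GEMM3M` enables the 3M complex-GEMM formulation, which obtains a complex product from three real multiplications instead of four: with p1 = a_r·b_r, p2 = a_i·b_i, p3 = (a_r + a_i)(b_r + b_i), the product is (p1 − p2) + (p3 − p1 − p2)i; the matrix version replaces each scalar product with a real GEMM. A scalar check:

```c
/* The 3M identity behind USE_GEMM3M: three real multiplies per complex
 * multiply.  (2+3i)(5+7i) = -11 + 29i. */
#include <stdio.h>

int main(void)
{
    double ar = 2, ai = 3, br = 5, bi = 7;
    double p1 = ar * br;                   /* 10 */
    double p2 = ai * bi;                   /* 21 */
    double p3 = (ar + ai) * (br + bi);     /* 60 */
    printf("re=%g im=%g\n", p1 - p2, p3 - p1 - p2);  /* -11 29 */
    return 0;
}
```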
| @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, | |||
| if ( rows <= 0 ) return(0); | |||
| if ( cols <= 0 ) return(0); | |||
| if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); | |||
| aptr = a; | |||
| lda *= 2; | |||
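The deleted line above returned early whenever alpha == 1 + 0i, but an identity scale is only a no-op when the source and destination leading dimensions agree; an in-place imatcopy with differing strides still has to move every element. A minimal real-valued illustration:

```c
/* Why alpha == 1 is not automatically a no-op for imatcopy: with
 * different leading dimensions the data still moves.  Real-valued,
 * no-transpose sketch of the column-major kernel. */
#include <stdio.h>

void imatcopy_cn_sketch(int rows, int cols, double alpha,
                        const double *a, int lda, double *b, int ldb)
{
    for (int j = 0; j < cols; j++)
        for (int i = 0; i < rows; i++)
            b[i + j * ldb] = alpha * a[i + j * lda];
}

int main(void)
{
    double m[8] = {1, 2, 0, 0, 3, 4, 0, 0};    /* 2x2, stored with lda = 4 */
    imatcopy_cn_sketch(2, 2, 1.0, m, 4, m, 2); /* repack in place to ldb = 2 */
    printf("%g %g %g %g\n", m[0], m[1], m[2], m[3]);  /* 1 2 3 4 */
    return 0;
}
```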
| @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lsx.S | |||
| SAXPBYKERNEL = axpby_lsx.S | |||
| DAXPBYKERNEL = axpby_lsx.S | |||
| CAXPBYKERNEL = caxpby_lsx.S | |||
| ZAXPBYKERNEL = caxpby_lsx.S | |||
| SSUMKERNEL = sum_lsx.S | |||
| DSUMKERNEL = sum_lsx.S | |||
| @@ -98,9 +100,13 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CGEMMKERNEL = cgemm_kernel_2x2_lsx.S | |||
| CGEMMONCOPY = cgemm_ncopy_2_lsx.S | |||
| CGEMMOTCOPY = cgemm_tcopy_2_lsx.S | |||
| CGEMMKERNEL = cgemm_kernel_8x4_lsx.S | |||
| CGEMMINCOPY = cgemm_ncopy_8_lsx.S | |||
| CGEMMITCOPY = cgemm_tcopy_8_lsx.S | |||
| CGEMMONCOPY = cgemm_ncopy_4_lsx.S | |||
| CGEMMOTCOPY = cgemm_tcopy_4_lsx.S | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -109,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S | |||
| ZGEMMONCOPY = zgemm_ncopy_4_lsx.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lasx.S | |||
| SAXPBYKERNEL = axpby_lasx.S | |||
| DAXPBYKERNEL = axpby_lasx.S | |||
| CAXPBYKERNEL = caxpby_lasx.S | |||
| ZAXPBYKERNEL = caxpby_lasx.S | |||
| SSUMKERNEL = sum_lasx.S | |||
| DSUMKERNEL = sum_lasx.S | |||
| @@ -120,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S | |||
| ZGEMMONCOPY = zgemm_ncopy_2_lasx.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S | |||
| ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S | |||
| ZGEMMINCOPY = zgemm_ncopy_8_lasx.S | |||
| ZGEMMITCOPY = zgemm_tcopy_8_lasx.S | |||
| ZGEMMONCOPY = zgemm_ncopy_4_lasx.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -124,7 +124,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L13: | |||
| FABS $f0, $f0 | |||
| SUB $f0, $f0, $f0 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| bge $r0, N, .L999 | |||
| li.d TEMP, 1 | |||
| movgr2fr.d a1, $r0 | |||
| ffint.s.l a1, a1 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| MTG t1, ALPHA | |||
| @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvreplgr2vr.w VXB, t2 | |||
| xvreplgr2vr.w VXZ, t3 | |||
| #endif | |||
| // If incx == 0 || incy == 0, do one by one | |||
| and TEMP, INCX, INCY | |||
| or I, N, N | |||
| beqz TEMP, .L998 | |||
| li.d TEMP, 1 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 | |||
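The new guard (`and TEMP, INCX, INCY; beqz TEMP, .L998`) diverts to the element-by-element path whenever the bitwise AND of the two (already byte-shifted) increments is zero. That catches every incx == 0 or incy == 0 case — where a zero stride makes the vectorized strided accesses invalid — and, conservatively, some mixed nonzero strides whose bit patterns happen not to overlap; all of them are handled correctly by the scalar loop, which in C terms is:

```c
/* Scalar axpby fallback the guard branches to: correct for any
 * increments, including zero (x pinned to one element, or every
 * iteration rewriting y[0]) and negative strides.  Real-valued sketch. */
void axpby_scalar(int n, float alpha, const float *x, int incx,
                  float beta, float *y, int incy)
{
    const float *px = x;
    float *py = y;
    for (int i = 0; i < n; i++) {
        *py = alpha * *px + beta * *py;
        px += incx;
        py += incy;
    }
}
```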
| @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| bge $r0, N, .L999 | |||
| li.d TEMP, 1 | |||
| movgr2fr.d a1, $r0 | |||
| ffint.s.l a1, a1 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| MTG t1, ALPHA | |||
| @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vreplgr2vr.w VXB, t2 | |||
| vreplgr2vr.w VXZ, t3 | |||
| #endif | |||
| // If incx == 0 || incy == 0, do one by one | |||
| and TEMP, INCX, INCY | |||
| or I, N, N | |||
| beqz TEMP, .L998 | |||
| li.d TEMP, 1 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 | |||
| @@ -0,0 +1,341 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r19 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define D0 $vr8 | |||
| #define D1 $vr9 | |||
| #define D2 $vr10 | |||
| #define D3 $vr11 | |||
| #define D4 $vr12 | |||
| #define D5 $vr13 | |||
| #define D6 $vr14 | |||
| #define D7 $vr15 | |||
| #define D8 $vr16 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TD, DST //boffset | |||
| move TS, SRC //aoffset | |||
| slli.d TL, LDA, 0x02 | |||
| slli.d TL, TL, 0x01 | |||
| srai.d J, N, 0x02 | |||
| beq J, ZERO, .L_N0 | |||
| .L_J1: /* J-- */ | |||
| move S1, TS | |||
| add.d S2, S1, TL | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| slli.d T0, TL, 0x02 | |||
| add.d TS, TS, T0 | |||
| srai.d I, M, 0x02 | |||
| beq I, ZERO, .L_I3 | |||
| .L_I1: /* I-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vand.v D0, U2, U2 | |||
| vand.v D1, U3, U3 | |||
| vand.v D2, U2, U2 | |||
| vand.v D3, U3, U3 | |||
| vand.v D4, U6, U6 | |||
| vand.v D5, U7, U7 | |||
| vand.v D6, U6, U6 | |||
| vand.v D7, U7, U7 | |||
| vpermi.w D0, U0, 0x44 | |||
| vpermi.w D4, U4, 0x44 | |||
| vpermi.w D2, U0, 0xee | |||
| vpermi.w D6, U4, 0xee | |||
| vpermi.w D1, U1, 0x44 | |||
| vpermi.w D5, U5, 0x44 | |||
| vpermi.w D3, U1, 0xee | |||
| vpermi.w D7, U5, 0xee | |||
| vst D0, TD, 0x00 | |||
| vst D4, TD, 0x10 | |||
| vst D2, TD, 0x20 | |||
| vst D6, TD, 0x30 | |||
| vst D1, TD, 0x40 | |||
| vst D5, TD, 0x50 | |||
| vst D3, TD, 0x60 | |||
| vst D7, TD, 0x70 | |||
| addi.d S1, S1, 0x20 // a_offset | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x80 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I3: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_II20 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vand.v D0, U1, U1 | |||
| vand.v D1, U1, U1 | |||
| vand.v D2, U3, U3 | |||
| vand.v D3, U3, U3 | |||
| vpermi.w D0, U0, 0x44 | |||
| vpermi.w D2, U2, 0x44 | |||
| vpermi.w D1, U0, 0xee | |||
| vpermi.w D3, U2, 0xee | |||
| vst D0, TD, 0x00 | |||
| vst D2, TD, 0x10 | |||
| vst D1, TD, 0x20 | |||
| vst D3, TD, 0x30 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d TD, TD, 0x40 | |||
| .L_II20: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_J0 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fld.s F4, S3, 0x00 | |||
| fld.s F5, S3, 0x04 | |||
| fld.s F6, S4, 0x00 | |||
| fld.s F7, S4, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0c | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1c | |||
| addi.d TD, TD, 0x20 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N0: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N20 | |||
| move S1, TS | |||
| add.d S2, S1, TL | |||
| slli.d T0, TL, 0x01 | |||
| add.d TS, TS, T0 | |||
| srai.d I, M, 0x02 | |||
| beq ZERO, I, .L_N10 | |||
| .L_N11: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vand.v D0, U2, U2 | |||
| vand.v D1, U3, U3 | |||
| vand.v D2, U2, U2 | |||
| vand.v D3, U3, U3 | |||
| vpermi.w D0, U0, 0x44 | |||
| vpermi.w D2, U0, 0xee | |||
| vpermi.w D1, U1, 0x44 | |||
| vpermi.w D3, U1, 0xee | |||
| vst D0, TD, 0x00 | |||
| vst D2, TD, 0x10 | |||
| vst D1, TD, 0x20 | |||
| vst D3, TD, 0x30 | |||
| addi.d S1, S1, 0x20 // a_offset | |||
| addi.d S2, S2, 0x20 | |||
| addi.d TD, TD, 0x40 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N11 | |||
| .L_N10: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_N130 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vand.v D0, U1, U1 | |||
| vpermi.w D0, U0, 0x44 | |||
| vpermi.w U1, U0, 0xee | |||
| vst D0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| addi.d S1, S1, 0x10 // a_offset | |||
| addi.d S2, S2, 0x10 | |||
| addi.d TD, TD, 0x20 // b_offset | |||
| .L_N130: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N20 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0c | |||
| addi.d TD, TD, 0x10 | |||
| .L_N20: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_N00 | |||
| move S1, TS | |||
| srai.d I, M, 0x02 | |||
| beq I, ZERO, .L_N30 | |||
| .L_N21: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| addi.d S1, S1, 0x20 // aoffset1 | |||
| addi.d TD, TD, 0x20 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N21 | |||
| .L_N30: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_N330 | |||
| vld U0, S1, 0x00 | |||
| vst U0, TD, 0x00 | |||
| addi.d S1, S1, 0x10 // aoffset1 | |||
| addi.d TD, TD, 0x10 // b_offset | |||
| .L_N330: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N00 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| .L_N00: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
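The new LSX files in this batch are GEMM packing kernels: they walk the source matrix a few lines at a time and emit contiguous panels in the order the matching complex GEMM kernels consume, with the `vpermi.w` shuffles doing the interleaving in registers. As a rough plain-C picture only — the exact element order each kernel emits is dictated by its companion GEMM kernel, and the row/column roles below are an assumption:

```c
#include <stddef.h>

/* Rough sketch of a 4-wide complex panel pack (ncopy-style): source is
 * column-major with leading dimension lda counted in complex elements;
 * for each panel of four columns, the four same-row elements are stored
 * back to back.  The LSX kernels above realize a reordering of this
 * kind with vector loads plus vpermi.w shuffles, then handle the
 * m- and n-remainder tails with narrower loads, as in their .L_N* paths. */
void cgemm_ncopy_4_sketch(int m, int n, const float *a, int lda, float *dst)
{
    for (int j = 0; j + 4 <= n; j += 4)          /* 4-column panels */
        for (int i = 0; i < m; i++)              /* walk the rows */
            for (int c = 0; c < 4; c++) {        /* columns j .. j+3 */
                const float *src = a + 2 * (i + (size_t)(j + c) * lda);
                *dst++ = src[0];                 /* real part */
                *dst++ = src[1];                 /* imaginary part */
            }
}
```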
| @@ -0,0 +1,263 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define D0 $vr8 | |||
| #define D1 $vr9 | |||
| #define D2 $vr10 | |||
| #define D3 $vr11 | |||
| #define D4 $vr12 | |||
| #define D5 $vr13 | |||
| #define D6 $vr14 | |||
| #define D7 $vr15 | |||
| #define D8 $vr16 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TD, DST //boffset | |||
| move TS, SRC //aoffset | |||
| slli.d TL, LDA, 0x02 //lda | |||
| slli.d TL, TL, 0x01 | |||
| slli.d T0, TL, 0x03 | |||
| srai.d J, N, 0x03 //j | |||
| beq J, ZERO, .L_N1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| add.d S5, S4, TL | |||
| add.d S6, S5, TL | |||
| add.d S7, S6, TL | |||
| add.d S8, S7, TL | |||
| add.d TS, TS, T0 | |||
| beq I, ZERO, .L_J11 | |||
| .L_I1: /* if(i>0) i--*/ | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fld.s F4, S3, 0x00 | |||
| fld.s F5, S3, 0x04 | |||
| fld.s F6, S4, 0x00 | |||
| fld.s F7, S4, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0c | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1c | |||
| fld.s F0, S5, 0x00 | |||
| fld.s F1, S5, 0x04 | |||
| fld.s F2, S6, 0x00 | |||
| fld.s F3, S6, 0x04 | |||
| fld.s F4, S7, 0x00 | |||
| fld.s F5, S7, 0x04 | |||
| fld.s F6, S8, 0x00 | |||
| fld.s F7, S8, 0x04 | |||
| fst.s F0, TD, 0x20 | |||
| fst.s F1, TD, 0x24 | |||
| fst.s F2, TD, 0x28 | |||
| fst.s F3, TD, 0x2c | |||
| fst.s F4, TD, 0x30 | |||
| fst.s F5, TD, 0x34 | |||
| fst.s F6, TD, 0x38 | |||
| fst.s F7, TD, 0x3c | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d S5, S5, 0x08 | |||
| addi.d S6, S6, 0x08 | |||
| addi.d S7, S7, 0x08 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_J11: /* j--*/ | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N1: /* if(n&4)*/ | |||
| andi I, N, 0x04 | |||
| beq I, ZERO, .L_N2 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| add.d TS, S4, TL | |||
| beq I, ZERO, .L_N2 | |||
| .L_N11: /* if(i>0)*/ | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fld.s F4, S3, 0x00 | |||
| fld.s F5, S3, 0x04 | |||
| fld.s F6, S4, 0x00 | |||
| fld.s F7, S4, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0c | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1c | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N11 | |||
| .L_N2: /* if(n&2)*/ | |||
| andi I, N, 0x02 | |||
| beq I, ZERO, .L_N3 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| move I, M | |||
| add.d TS, S2, TL | |||
| beq I, ZERO, .L_N3 | |||
| .L_N21: /* if(i>0)*/ | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0c | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N21 | |||
| .L_N3: /* if(n&1)*/ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_N0 | |||
| move S1, TS | |||
| move I, M | |||
| beq I, ZERO, .L_N0 | |||
| .L_N31: /* if(i>0)*/ | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d TD, TD, 0x08 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N31 | |||
| .L_N0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,324 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define TD $r16 | |||
| #define TS $r17 | |||
| #define TL $r18 | |||
| #define T0 $r19 | |||
| #define S8 $r20 | |||
| #define S9 $r23 | |||
| #define S10 $r11 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define U8 $vr8 | |||
| #define U9 $vr9 | |||
| #define U10 $vr10 | |||
| #define U11 $vr11 | |||
| #define U12 $vr12 | |||
| #define U13 $vr13 | |||
| #define U14 $vr14 | |||
| #define U15 $vr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TS, SRC //aoffset | |||
| move TD, DST //boffset | |||
| slli.d TL, LDA, 0x02 //lda | |||
| slli.d TL, TL, 0x01 //lda | |||
| ori T0, ZERO, 0x03 | |||
| andn T0, N, T0 | |||
| mul.w T0, M, T0 | |||
| slli.d T0, T0, 0x01 | |||
| slli.d T0, T0, 0x02 | |||
| add.d S9, DST, T0 //boffset2 | |||
| ori T0, ZERO, 0x01 | |||
| andn T0, N, T0 | |||
| mul.w T0, M, T0 | |||
| slli.d T0, T0, 0x01 | |||
| slli.d T0, T0, 0x02 | |||
| add.d S10, DST, T0 //boffset3 | |||
| srai.d J, M, 0x02 //j | |||
| beq J, ZERO, .L_M1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS //aoffset1 | |||
| add.d S2, S1, TL | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| slli.d T0, TL, 0x02 | |||
| add.d TS, TS, T0 | |||
| move S8, TD //boffset1 | |||
| addi.d TD, TD, 0x80 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_JN1 | |||
| .L_JI1: /* if(i>0) i--*/ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vst U0, S8, 0x00 | |||
| vst U1, S8, 0x10 | |||
| vst U2, S8, 0x20 | |||
| vst U3, S8, 0x30 | |||
| vst U4, S8, 0x40 | |||
| vst U5, S8, 0x50 | |||
| vst U6, S8, 0x60 | |||
| vst U7, S8, 0x70 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| slli.d T0, M, 0x05 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_JI1 | |||
| .L_JN1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_JN2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vst U0, S9, 0x00 | |||
| vst U1, S9, 0x10 | |||
| vst U2, S9, 0x20 | |||
| vst U3, S9, 0x30 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d S9, S9, 0x40 | |||
| .L_JN2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_J0 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fld.s F4, S3, 0x00 | |||
| fld.s F5, S3, 0x04 | |||
| fld.s F6, S4, 0x00 | |||
| fld.s F7, S4, 0x04 | |||
| fst.s F0, S10, 0x00 | |||
| fst.s F1, S10, 0x04 | |||
| fst.s F2, S10, 0x08 | |||
| fst.s F3, S10, 0x0c | |||
| fst.s F4, S10, 0x10 | |||
| fst.s F5, S10, 0x14 | |||
| fst.s F6, S10, 0x18 | |||
| fst.s F7, S10, 0x1c | |||
| addi.d S10, S10, 0x20 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_M1: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq ZERO, I, .L_M2 | |||
| move S1, TS //aoffset1 | |||
| add.d S2, S1, TL | |||
| slli.d T0, TL, 0x01 | |||
| add.d TS, TS, T0 | |||
| move S8, TD //boffset1 | |||
| addi.d TD, TD, 0x40 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_M1N1 | |||
| .L_M1I1: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vst U0, S8, 0x00 | |||
| vst U1, S8, 0x10 | |||
| vst U2, S8, 0x20 | |||
| vst U3, S8, 0x30 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| slli.d T0, M, 0x05 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_M1I1 | |||
| .L_M1N1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1N2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vst U0, S9, 0x00 | |||
| vst U1, S9, 0x10 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S9, S9, 0x20 | |||
| .L_M1N2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M2 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fst.s F0, S10, 0x00 | |||
| fst.s F1, S10, 0x04 | |||
| fst.s F2, S10, 0x08 | |||
| fst.s F3, S10, 0x0c | |||
| addi.d S10, S10, 0x10 | |||
| .L_M2: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| move S1, TS //aoffset1 | |||
| move S8, TD //boffset1 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_M2N1 | |||
| .L_M2I1: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, S8, 0x00 | |||
| vst U1, S8, 0x10 | |||
| addi.d S1, S1, 0x20 | |||
| slli.d T0, M, 0x05 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_M2I1 | |||
| .L_M2N1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2N2 | |||
| vld U0, S1, 0x00 | |||
| vst U0, S9, 0x00 | |||
| addi.d S1, S1, 0x10 | |||
| .L_M2N2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fst.s F0, S10, 0x00 | |||
| fst.s F1, S10, 0x04 | |||
| .L_M0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,277 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define D0 $vr8 | |||
| #define D1 $vr9 | |||
| #define D2 $vr10 | |||
| #define D3 $vr11 | |||
| #define D4 $vr12 | |||
| #define D5 $vr13 | |||
| #define D6 $vr14 | |||
| #define D7 $vr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TS, SRC //aoffset | |||
| move TD, DST //boffset | |||
| slli.d TL, LDA, 0x02 //lda | |||
| slli.d TL, TL, 0x01 | |||
| srai.d J, N, 0x03 //j | |||
| beq J, ZERO, .L_N1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| addi.d TS, TS, 0x40 | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_J1M1 | |||
| .L_J1I1: /* if(i>0) i--*/ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| vst U2, TD, 0x20 | |||
| vst U3, TD, 0x30 | |||
| vst U4, TD, 0x40 | |||
| vst U5, TD, 0x50 | |||
| vst U6, TD, 0x60 | |||
| vst U7, TD, 0x70 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_J1I1 | |||
| .L_J1M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_J0 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| vst U2, TD, 0x20 | |||
| vst U3, TD, 0x30 | |||
| addi.d TD, TD, 0x40 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N1: /* if(n&4) */ | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N2 | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| addi.d TS, TS, 0x20 | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_N1M1 | |||
| .L_N1I1: /* if(i>0) i-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| vst U2, TD, 0x20 | |||
| vst U3, TD, 0x30 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N1I1 | |||
| .L_N1M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| .L_N2: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N3 | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| addi.d TS, TS, 0x10 | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_N2M1 | |||
| .L_N2I1: /* if(i>0) i-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N2I1 | |||
| .L_N2M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N3 | |||
| vld U0, S1, 0x00 | |||
| vst U0, TD, 0x00 | |||
| addi.d TD, TD, 0x10 | |||
| .L_N3: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_N3M1 | |||
| .L_N3I1: /* if(i>0) i-- */ | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fld.s F2, S2, 0x00 | |||
| fld.s F3, S2, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0c | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x10 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N3I1 | |||
| .L_N3M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| .L_N0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,320 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r19 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr8 | |||
| #define D1 $xr9 | |||
| #define D2 $xr10 | |||
| #define D3 $xr11 | |||
| #define D4 $xr12 | |||
| #define D5 $xr13 | |||
| #define D6 $xr14 | |||
| #define D7 $xr15 | |||
| #define D8 $xr16 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TD, DST //boffset | |||
| move TS, SRC //aoffset | |||
| slli.d TL, LDA, 0x03 | |||
| slli.d TL, TL, 0x01 | |||
| srai.d J, N, 0x02 | |||
| beq J, ZERO, .L_N0 | |||
| .L_J1: /* J-- */ | |||
| move S1, TS | |||
| add.d S2, S1, TL | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| slli.d T0, TL, 0x02 | |||
| add.d TS, TS, T0 | |||
| srai.d I, M, 0x02 | |||
| beq I, ZERO, .L_I3 | |||
| .L_I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
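| // xvand.v xd, xj, xj is a 256-bit register move; the xvpermi.q sequence | |||
| // below recombines the 128-bit halves of the four sources' data so the | |||
| // stores that follow write their elements interleaved. | |||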
| xvand.v D0, U0, U0 | |||
| xvand.v D1, U1, U1 | |||
| xvand.v D2, U2, U2 | |||
| xvand.v D3, U3, U3 | |||
| xvand.v D4, U4, U4 | |||
| xvand.v D5, U5, U5 | |||
| xvand.v D6, U6, U6 | |||
| xvand.v D7, U7, U7 | |||
| xvpermi.q D0, U2, 0x02 | |||
| xvpermi.q D4, U6, 0x02 | |||
| xvpermi.q D2, U0, 0x31 | |||
| xvpermi.q D6, U4, 0x31 | |||
| xvpermi.q D1, U3, 0x02 | |||
| xvpermi.q D5, U7, 0x02 | |||
| xvpermi.q D3, U1, 0x31 | |||
| xvpermi.q D7, U5, 0x31 | |||
| xvst D0, TD, 0x00 | |||
| xvst D4, TD, 0x20 | |||
| xvst D2, TD, 0x40 | |||
| xvst D6, TD, 0x60 | |||
| xvst D1, TD, 0x80 | |||
| xvst D5, TD, 0xa0 | |||
| xvst D3, TD, 0xc0 | |||
| xvst D7, TD, 0xe0 | |||
| addi.d S1, S1, 0x40 // a_offset | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d TD, TD, 0x100 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I3: | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_II20 | |||
| .L_II1: /* if(m&2) */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvand.v D0, U0, U0 | |||
| xvand.v D1, U1, U1 | |||
| xvand.v D2, U2, U2 | |||
| xvand.v D3, U3, U3 | |||
| xvpermi.q D0, U1, 0x02 | |||
| xvpermi.q D2, U3, 0x02 | |||
| xvpermi.q D1, U0, 0x31 | |||
| xvpermi.q D3, U2, 0x31 | |||
| xvst D0, TD, 0x00 | |||
| xvst D2, TD, 0x20 | |||
| xvst D1, TD, 0x40 | |||
| xvst D3, TD, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x80 | |||
| .L_II20: | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_J0 | |||
| .L_II2: /* if(m&1) */ | |||
| vld $vr0, S1, 0x00 | |||
| vld $vr1, S2, 0x00 | |||
| vld $vr2, S3, 0x00 | |||
| vld $vr3, S4, 0x00 | |||
| vst $vr0, TD, 0x00 | |||
| vst $vr1, TD, 0x10 | |||
| vst $vr2, TD, 0x20 | |||
| vst $vr3, TD, 0x30 | |||
| addi.d TD, TD, 0x40 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N0: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N20 | |||
| move S1, TS | |||
| add.d S2, S1, TL | |||
| slli.d T0, TL, 0x01 | |||
| add.d TS, TS, T0 | |||
| srai.d I, M, 0x02 | |||
| beq ZERO, I, .L_N10 | |||
| .L_N11: /* if(i>0) */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvand.v D0, U0, U0 | |||
| xvand.v D1, U1, U1 | |||
| xvand.v D2, U2, U2 | |||
| xvand.v D3, U3, U3 | |||
| xvpermi.q D0, U2, 0x02 | |||
| xvpermi.q D2, U0, 0x31 | |||
| xvpermi.q D1, U3, 0x02 | |||
| xvpermi.q D3, U1, 0x31 | |||
| xvst D0, TD, 0x00 | |||
| xvst D2, TD, 0x20 | |||
| xvst D1, TD, 0x40 | |||
| xvst D3, TD, 0x60 | |||
| addi.d S1, S1, 0x40 // a_offset | |||
| addi.d S2, S2, 0x40 | |||
| addi.d TD, TD, 0x80 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N11 | |||
| .L_N10: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_N130 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvand.v D0, U0, U0 | |||
| xvpermi.q D0, U1, 0x02 | |||
| xvpermi.q U1, U0, 0x31 | |||
| xvst D0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| addi.d S1, S1, 0x20 // a_offset | |||
| addi.d S2, S2, 0x20 | |||
| addi.d TD, TD, 0x40 // b_offset | |||
| .L_N130: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N20 | |||
| vld $vr0, S1, 0x00 | |||
| vld $vr1, S2, 0x00 | |||
| vst $vr0, TD, 0x00 | |||
| vst $vr1, TD, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| .L_N20: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_N00 | |||
| move S1, TS | |||
| srai.d I, M, 0x02 | |||
| beq I, ZERO, .L_N30 | |||
| .L_N21: /* if(i>0) */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvst U0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| addi.d S1, S1, 0x40 // aoffset1 | |||
| addi.d TD, TD, 0x40 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N21 | |||
| .L_N30: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_N330 | |||
| xvld U0, S1, 0x00 | |||
| xvst U0, TD, 0x00 | |||
| addi.d S1, S1, 0x20 // aoffset1 | |||
| addi.d TD, TD, 0x20 // b_offset | |||
| .L_N330: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N00 | |||
| vld $vr0, S1, 0x00 | |||
| vst $vr0, TD, 0x00 | |||
| .L_N00: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,332 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
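| /* LSX counterpart of the LASX packing above: each 128-bit vld/vst moves | |||
|    exactly one 16-byte element, so the interleaving is achieved purely | |||
|    through the store offsets and no lane permutes are needed. */ | |||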
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r19 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define U8 $vr8 | |||
| #define U9 $vr9 | |||
| #define U10 $vr10 | |||
| #define U11 $vr11 | |||
| #define U12 $vr12 | |||
| #define U13 $vr13 | |||
| #define U14 $vr14 | |||
| #define U15 $vr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TD, DST //boffset | |||
| move TS, SRC //aoffset | |||
| slli.d TL, LDA, 0x03 | |||
| slli.d TL, TL, 0x01 | |||
| srai.d J, N, 0x02 | |||
| beq J, ZERO, .L_N0 | |||
| .L_J1: /* J-- */ | |||
| move S1, TS | |||
| add.d S2, S1, TL | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| slli.d T0, TL, 0x02 | |||
| add.d TS, TS, T0 | |||
| srai.d I, M, 0x02 | |||
| beq I, ZERO, .L_I3 | |||
| .L_I1: /* I-- */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vld U8, S3, 0x00 | |||
| vld U9, S3, 0x10 | |||
| vld U10, S3, 0x20 | |||
| vld U11, S3, 0x30 | |||
| vld U12, S4, 0x00 | |||
| vld U13, S4, 0x10 | |||
| vld U14, S4, 0x20 | |||
| vld U15, S4, 0x30 | |||
| vst U0, TD, 0x00 | |||
| vst U4, TD, 0x10 | |||
| vst U8, TD, 0x20 | |||
| vst U12, TD, 0x30 | |||
| vst U1, TD, 0x40 | |||
| vst U5, TD, 0x50 | |||
| vst U9, TD, 0x60 | |||
| vst U13, TD, 0x70 | |||
| vst U2, TD, 0x80 | |||
| vst U6, TD, 0x90 | |||
| vst U10, TD, 0xa0 | |||
| vst U14, TD, 0xb0 | |||
| vst U3, TD, 0xc0 | |||
| vst U7, TD, 0xd0 | |||
| vst U11, TD, 0xe0 | |||
| vst U15, TD, 0xf0 | |||
| addi.d S1, S1, 0x40 // a_offset | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d TD, TD, 0x100 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I3: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_II20 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vst U0, TD, 0x00 | |||
| vst U2, TD, 0x10 | |||
| vst U4, TD, 0x20 | |||
| vst U6, TD, 0x30 | |||
| vst U1, TD, 0x40 | |||
| vst U3, TD, 0x50 | |||
| vst U5, TD, 0x60 | |||
| vst U7, TD, 0x70 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x80 | |||
| .L_II20: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_J0 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| vst U2, TD, 0x20 | |||
| vst U3, TD, 0x30 | |||
| addi.d TD, TD, 0x40 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N0: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N20 | |||
| move S1, TS | |||
| add.d S2, S1, TL | |||
| slli.d T0, TL, 0x01 | |||
| add.d TS, TS, T0 | |||
| srai.d I, M, 0x02 | |||
| beq ZERO, I, .L_N10 | |||
| .L_N11: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vst U0, TD, 0x00 | |||
| vst U4, TD, 0x10 | |||
| vst U1, TD, 0x20 | |||
| vst U5, TD, 0x30 | |||
| vst U2, TD, 0x40 | |||
| vst U6, TD, 0x50 | |||
| vst U3, TD, 0x60 | |||
| vst U7, TD, 0x70 | |||
| addi.d S1, S1, 0x40 // a_offset | |||
| addi.d S2, S2, 0x40 | |||
| addi.d TD, TD, 0x80 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N11 | |||
| .L_N10: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_N130 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vst U0, TD, 0x00 | |||
| vst U2, TD, 0x10 | |||
| vst U1, TD, 0x20 | |||
| vst U3, TD, 0x30 | |||
| addi.d S1, S1, 0x20 // a_offset | |||
| addi.d S2, S2, 0x20 | |||
| addi.d TD, TD, 0x40 // b_offset | |||
| .L_N130: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N20 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| .L_N20: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_N00 | |||
| move S1, TS | |||
| srai.d I, M, 0x02 | |||
| beq I, ZERO, .L_N30 | |||
| .L_N21: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| vst U2, TD, 0x20 | |||
| vst U3, TD, 0x30 | |||
| addi.d S1, S1, 0x40 // aoffset1 | |||
| addi.d TD, TD, 0x40 // b_offset | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N21 | |||
| .L_N30: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq I, ZERO, .L_N330 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, TD, 0x00 | |||
| vst U1, TD, 0x10 | |||
| addi.d S1, S1, 0x20 // aoffset1 | |||
| addi.d TD, TD, 0x20 // b_offset | |||
| .L_N330: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N00 | |||
| vld U0, S1, 0x00 | |||
| vst U0, TD, 0x00 | |||
| .L_N00: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,263 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
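| /* Scalar variant of the panel copy: walks eight source pointers spaced | |||
|    lda apart (S1..S8) and moves one 16-byte element from each per | |||
|    iteration with paired fld.d/fst.d, then handles the n&4, n&2 and | |||
|    n&1 remainders. */ | |||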
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr8 | |||
| #define D1 $xr9 | |||
| #define D2 $xr10 | |||
| #define D3 $xr11 | |||
| #define D4 $xr12 | |||
| #define D5 $xr13 | |||
| #define D6 $xr14 | |||
| #define D7 $xr15 | |||
| #define D8 $xr16 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TD, DST //boffset | |||
| move TS, SRC //aoffset | |||
| slli.d TL, LDA, 0x03 //lda * 8 | |||
| slli.d TL, TL, 0x01 //TL = lda * 16 bytes (16-byte elements) | |||
| slli.d T0, TL, 0x03 | |||
| srai.d J, N, 0x03 //j | |||
| beq J, ZERO, .L_N1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| add.d S5, S4, TL | |||
| add.d S6, S5, TL | |||
| add.d S7, S6, TL | |||
| add.d S8, S7, TL | |||
| add.d TS, TS, T0 | |||
| beq I, ZERO, .L_J11 | |||
| .L_I1: /* if(i>0) i--*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fld.d F2, S2, 0x00 | |||
| fld.d F3, S2, 0x08 | |||
| fld.d F4, S3, 0x00 | |||
| fld.d F5, S3, 0x08 | |||
| fld.d F6, S4, 0x00 | |||
| fld.d F7, S4, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| fst.d F4, TD, 0x20 | |||
| fst.d F5, TD, 0x28 | |||
| fst.d F6, TD, 0x30 | |||
| fst.d F7, TD, 0x38 | |||
| fld.d F0, S5, 0x00 | |||
| fld.d F1, S5, 0x08 | |||
| fld.d F2, S6, 0x00 | |||
| fld.d F3, S6, 0x08 | |||
| fld.d F4, S7, 0x00 | |||
| fld.d F5, S7, 0x08 | |||
| fld.d F6, S8, 0x00 | |||
| fld.d F7, S8, 0x08 | |||
| fst.d F0, TD, 0x40 | |||
| fst.d F1, TD, 0x48 | |||
| fst.d F2, TD, 0x50 | |||
| fst.d F3, TD, 0x58 | |||
| fst.d F4, TD, 0x60 | |||
| fst.d F5, TD, 0x68 | |||
| fst.d F6, TD, 0x70 | |||
| fst.d F7, TD, 0x78 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d S5, S5, 0x10 | |||
| addi.d S6, S6, 0x10 | |||
| addi.d S7, S7, 0x10 | |||
| addi.d S8, S8, 0x10 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_J11: /* j--*/ | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N1: /* if(n&4)*/ | |||
| andi I, N, 0x04 | |||
| beq I, ZERO, .L_N2 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| add.d TS, S4, TL | |||
| beq I, ZERO, .L_N2 | |||
| .L_N11: /* if(i>0)*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fld.d F2, S2, 0x00 | |||
| fld.d F3, S2, 0x08 | |||
| fld.d F4, S3, 0x00 | |||
| fld.d F5, S3, 0x08 | |||
| fld.d F6, S4, 0x00 | |||
| fld.d F7, S4, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| fst.d F4, TD, 0x20 | |||
| fst.d F5, TD, 0x28 | |||
| fst.d F6, TD, 0x30 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N11 | |||
| .L_N2: /* if(n&2)*/ | |||
| andi I, N, 0x02 | |||
| beq I, ZERO, .L_N3 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| move I, M | |||
| add.d TS, S2, TL | |||
| beq I, ZERO, .L_N3 | |||
| .L_N21: /* if(i>0)*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fld.d F2, S2, 0x00 | |||
| fld.d F3, S2, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N21 | |||
| .L_N3: /* if(n&1)*/ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_N0 | |||
| move S1, TS | |||
| move I, M | |||
| beq I, ZERO, .L_N0 | |||
| .L_N31: /* if(i>0)*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d TD, TD, 0x10 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N31 | |||
| .L_N0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,302 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
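| /* Packing with a split destination (LASX): m is handled in blocks of | |||
|    four; full n-blocks go to DST while the n&2 remainder goes to | |||
|    boffset2 and the n&1 remainder to boffset3, so remainders accumulate | |||
|    past the regular panels. */ | |||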
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define TD $r16 | |||
| #define TS $r17 | |||
| #define TL $r18 | |||
| #define T0 $r19 | |||
| #define S8 $r20 | |||
| #define S9 $r23 | |||
| #define S10 $r11 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr8 | |||
| #define D1 $xr9 | |||
| #define D2 $xr10 | |||
| #define D3 $xr11 | |||
| #define D4 $xr12 | |||
| #define D5 $xr13 | |||
| #define D6 $xr14 | |||
| #define D7 $xr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TS, SRC //aoffset | |||
| move TD, DST //boffset | |||
| slli.d TL, LDA, 0x03 //lda * 8 | |||
| slli.d TL, TL, 0x01 //TL = lda * 16 bytes (16-byte elements) | |||
| ori T0, ZERO, 0x03 | |||
| andn T0, N, T0 | |||
| mul.d T0, M, T0 | |||
| slli.d T0, T0, 0x01 | |||
| slli.d T0, T0, 0x03 | |||
| add.d S9, DST, T0 //boffset2 | |||
| ori T0, ZERO, 0x01 | |||
| andn T0, N, T0 | |||
| mul.d T0, M, T0 | |||
| slli.d T0, T0, 0x01 | |||
| slli.d T0, T0, 0x03 | |||
| add.d S10, DST, T0 //boffset3 | |||
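| // andn clears the low bits of N, so boffset2 = DST + M*(N & ~3)*16 and | |||
| // boffset3 = DST + M*(N & ~1)*16: the bytes just past the packed 4-wide | |||
| // (respectively 2-wide) column blocks. | |||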
| srai.d J, M, 0x02 //j | |||
| beq J, ZERO, .L_M1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS //aoffset1 | |||
| add.d S2, S1, TL | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| slli.d T0, TL, 0x02 | |||
| add.d TS, TS, T0 | |||
| move S8, TD //boffset1 | |||
| addi.d TD, TD, 0x100 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_JN1 | |||
| .L_JI1: /* if(i>0) i--*/ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U0, S8, 0x00 | |||
| xvst U1, S8, 0x20 | |||
| xvst U2, S8, 0x40 | |||
| xvst U3, S8, 0x60 | |||
| xvst U4, S8, 0x80 | |||
| xvst U5, S8, 0xa0 | |||
| xvst U6, S8, 0xc0 | |||
| xvst U7, S8, 0xe0 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
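| // step S8 to this row-block's slot in the next four-column panel: | |||
| // M rows * 4 elements * 16 bytes = M*64 bytes | |||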
| slli.d T0, M, 0x06 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_JI1 | |||
| .L_JN1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_JN2 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvst U0, S9, 0x00 | |||
| xvst U1, S9, 0x20 | |||
| xvst U2, S9, 0x40 | |||
| xvst U3, S9, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d S9, S9, 0x80 | |||
| .L_JN2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_J0 | |||
| vld $vr0, S1, 0x00 | |||
| vld $vr1, S2, 0x00 | |||
| vld $vr2, S3, 0x00 | |||
| vld $vr3, S4, 0x00 | |||
| vst $vr0, S10, 0x00 | |||
| vst $vr1, S10, 0x10 | |||
| vst $vr2, S10, 0x20 | |||
| vst $vr3, S10, 0x30 | |||
| addi.d S10, S10, 0x40 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_M1: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq ZERO, I, .L_M2 | |||
| move S1, TS //aoffset1 | |||
| add.d S2, S1, TL | |||
| slli.d T0, TL, 0x01 | |||
| add.d TS, TS, T0 | |||
| move S8, TD //boffset1 | |||
| addi.d TD, TD, 0x80 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_M1N1 | |||
| .L_M1I1: /* if(i>0) */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, S8, 0x00 | |||
| xvst U1, S8, 0x20 | |||
| xvst U2, S8, 0x40 | |||
| xvst U3, S8, 0x60 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| slli.d T0, M, 0x06 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_M1I1 | |||
| .L_M1N1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1N2 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvst U0, S9, 0x00 | |||
| xvst U1, S9, 0x20 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S9, S9, 0x40 | |||
| .L_M1N2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M2 | |||
| vld $vr0, S1, 0x00 | |||
| vld $vr1, S2, 0x00 | |||
| vst $vr0, S10, 0x00 | |||
| vst $vr1, S10, 0x10 | |||
| addi.d S10, S10, 0x20 | |||
| .L_M2: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| move S1, TS //aoffset1 | |||
| move S8, TD //boffset1 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_M2N1 | |||
| .L_M2I1: /* if(i>0) */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvst U0, S8, 0x00 | |||
| xvst U1, S8, 0x20 | |||
| addi.d S1, S1, 0x40 | |||
| slli.d T0, M, 0x06 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_M2I1 | |||
| .L_M2N1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2N2 | |||
| xvld U0, S1, 0x00 | |||
| xvst U0, S9, 0x00 | |||
| addi.d S1, S1, 0x20 | |||
| .L_M2N2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| vld $vr0, S1, 0x00 | |||
| vst $vr0, S10, 0x00 | |||
| .L_M0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,355 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
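| /* LSX version of the split-destination packing above: same boffset2 / | |||
|    boffset3 layout for the n&2 and n&1 remainders, with each 128-bit | |||
|    vld/vst moving a single 16-byte element. */ | |||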
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define TD $r16 | |||
| #define TS $r17 | |||
| #define TL $r18 | |||
| #define T0 $r19 | |||
| #define S8 $r20 | |||
| #define S9 $r23 | |||
| #define S10 $r11 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LSX vectors */ | |||
| #define U0 $vr0 | |||
| #define U1 $vr1 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define U8 $vr8 | |||
| #define U9 $vr9 | |||
| #define U10 $vr10 | |||
| #define U11 $vr11 | |||
| #define U12 $vr12 | |||
| #define U13 $vr13 | |||
| #define U14 $vr14 | |||
| #define U15 $vr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TS, SRC //aoffset | |||
| move TD, DST //boffset | |||
| slli.d TL, LDA, 0x03 //lda * 8 | |||
| slli.d TL, TL, 0x01 //TL = lda * 16 bytes (16-byte elements) | |||
| ori T0, ZERO, 0x03 | |||
| andn T0, N, T0 | |||
| mul.d T0, M, T0 | |||
| slli.d T0, T0, 0x01 | |||
| slli.d T0, T0, 0x03 | |||
| add.d S9, DST, T0 //boffset2 | |||
| ori T0, ZERO, 0x01 | |||
| andn T0, N, T0 | |||
| mul.d T0, M, T0 | |||
| slli.d T0, T0, 0x01 | |||
| slli.d T0, T0, 0x03 | |||
| add.d S10, DST, T0 //boffset3 | |||
| srai.d J, M, 0x02 //j | |||
| beq J, ZERO, .L_M1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS //aoffset1 | |||
| add.d S2, S1, TL | |||
| add.d S3, S2, TL | |||
| add.d S4, S3, TL | |||
| slli.d T0, TL, 0x02 | |||
| add.d TS, TS, T0 | |||
| move S8, TD //boffset1 | |||
| addi.d TD, TD, 0x100 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_JN1 | |||
| .L_JI1: /* if(i>0) i--*/ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vld U8, S3, 0x00 | |||
| vld U9, S3, 0x10 | |||
| vld U10, S3, 0x20 | |||
| vld U11, S3, 0x30 | |||
| vld U12, S4, 0x00 | |||
| vld U13, S4, 0x10 | |||
| vld U14, S4, 0x20 | |||
| vld U15, S4, 0x30 | |||
| vst U0, S8, 0x00 | |||
| vst U1, S8, 0x10 | |||
| vst U2, S8, 0x20 | |||
| vst U3, S8, 0x30 | |||
| vst U4, S8, 0x40 | |||
| vst U5, S8, 0x50 | |||
| vst U6, S8, 0x60 | |||
| vst U7, S8, 0x70 | |||
| vst U8, S8, 0x80 | |||
| vst U9, S8, 0x90 | |||
| vst U10, S8, 0xa0 | |||
| vst U11, S8, 0xb0 | |||
| vst U12, S8, 0xc0 | |||
| vst U13, S8, 0xd0 | |||
| vst U14, S8, 0xe0 | |||
| vst U15, S8, 0xf0 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| slli.d T0, M, 0x06 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_JI1 | |||
| .L_JN1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_JN2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vld U4, S3, 0x00 | |||
| vld U5, S3, 0x10 | |||
| vld U6, S4, 0x00 | |||
| vld U7, S4, 0x10 | |||
| vst U0, S9, 0x00 | |||
| vst U1, S9, 0x10 | |||
| vst U2, S9, 0x20 | |||
| vst U3, S9, 0x30 | |||
| vst U4, S9, 0x40 | |||
| vst U5, S9, 0x50 | |||
| vst U6, S9, 0x60 | |||
| vst U7, S9, 0x70 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d S9, S9, 0x80 | |||
| .L_JN2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_J0 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vld U2, S3, 0x00 | |||
| vld U3, S4, 0x00 | |||
| vst U0, S10, 0x00 | |||
| vst U1, S10, 0x10 | |||
| vst U2, S10, 0x20 | |||
| vst U3, S10, 0x30 | |||
| addi.d S10, S10, 0x40 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_M1: /* if(m&2) */ | |||
| andi I, M, 0x02 | |||
| beq ZERO, I, .L_M2 | |||
| move S1, TS //aoffset1 | |||
| add.d S2, S1, TL | |||
| slli.d T0, TL, 0x01 | |||
| add.d TS, TS, T0 | |||
| move S8, TD //boffset1 | |||
| addi.d TD, TD, 0x80 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_M1N1 | |||
| .L_M1I1: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vld U4, S2, 0x00 | |||
| vld U5, S2, 0x10 | |||
| vld U6, S2, 0x20 | |||
| vld U7, S2, 0x30 | |||
| vst U0, S8, 0x00 | |||
| vst U1, S8, 0x10 | |||
| vst U2, S8, 0x20 | |||
| vst U3, S8, 0x30 | |||
| vst U4, S8, 0x40 | |||
| vst U5, S8, 0x50 | |||
| vst U6, S8, 0x60 | |||
| vst U7, S8, 0x70 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| slli.d T0, M, 0x06 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_M1I1 | |||
| .L_M1N1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1N2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S2, 0x00 | |||
| vld U3, S2, 0x10 | |||
| vst U0, S9, 0x00 | |||
| vst U1, S9, 0x10 | |||
| vst U2, S9, 0x20 | |||
| vst U3, S9, 0x30 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S9, S9, 0x40 | |||
| .L_M1N2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S2, 0x00 | |||
| vst U0, S10, 0x00 | |||
| vst U1, S10, 0x10 | |||
| addi.d S10, S10, 0x20 | |||
| .L_M2: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| move S1, TS //aoffset1 | |||
| move S8, TD //boffset1 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_M2N1 | |||
| .L_M2I1: /* if(i>0) */ | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vld U2, S1, 0x20 | |||
| vld U3, S1, 0x30 | |||
| vst U0, S8, 0x00 | |||
| vst U1, S8, 0x10 | |||
| vst U2, S8, 0x20 | |||
| vst U3, S8, 0x30 | |||
| addi.d S1, S1, 0x40 | |||
| slli.d T0, M, 0x06 | |||
| add.d S8, S8, T0 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_M2I1 | |||
| .L_M2N1: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2N2 | |||
| vld U0, S1, 0x00 | |||
| vld U1, S1, 0x10 | |||
| vst U0, S9, 0x00 | |||
| vst U1, S9, 0x10 | |||
| addi.d S1, S1, 0x20 | |||
| .L_M2N2: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| vld U0, S1, 0x00 | |||
| vst U0, S10, 0x00 | |||
| .L_M0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,268 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
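| /* Straight panel copy (LASX): eight consecutive 16-byte elements (0x80 | |||
|    bytes) are copied verbatim from each of two lda-spaced source | |||
|    pointers per iteration, with n&4, n&2 and n&1 remainder paths; no | |||
|    element reordering is done. */ | |||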
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr8 | |||
| #define D1 $xr9 | |||
| #define D2 $xr10 | |||
| #define D3 $xr11 | |||
| #define D4 $xr12 | |||
| #define D5 $xr13 | |||
| #define D6 $xr14 | |||
| #define D7 $xr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move TS, SRC //aoffset | |||
| move TD, DST //boffset | |||
| slli.d TL, LDA, 0x03 //lda * 8 | |||
| slli.d TL, TL, 0x01 //TL = lda * 16 bytes (16-byte elements) | |||
| srai.d J, N, 0x03 //j | |||
| beq J, ZERO, .L_N1 | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| addi.d TS, TS, 0x80 | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_J1M1 | |||
| .L_J1I1: /* if(i>0) i--*/ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S1, 0x40 | |||
| xvld U3, S1, 0x60 | |||
| xvld U4, S2, 0x00 | |||
| xvld U5, S2, 0x20 | |||
| xvld U6, S2, 0x40 | |||
| xvld U7, S2, 0x60 | |||
| xvst U0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| xvst U2, TD, 0x40 | |||
| xvst U3, TD, 0x60 | |||
| xvst U4, TD, 0x80 | |||
| xvst U5, TD, 0xa0 | |||
| xvst U6, TD, 0xc0 | |||
| xvst U7, TD, 0xe0 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x100 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_J1I1 | |||
| .L_J1M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_J0 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S1, 0x40 | |||
| xvld U3, S1, 0x60 | |||
| xvst U0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| xvst U2, TD, 0x40 | |||
| xvst U3, TD, 0x60 | |||
| addi.d TD, TD, 0x80 | |||
| .L_J0: | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N1: /* if(n&4) */ | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N2 | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| addi.d TS, TS, 0x40 | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_N1M1 | |||
| .L_N1I1: /* if(i>0) i-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| xvst U2, TD, 0x40 | |||
| xvst U3, TD, 0x60 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N1I1 | |||
| .L_N1M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N2 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvst U0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| addi.d TD, TD, 0x40 | |||
| .L_N2: /* if(n&2) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N3 | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| addi.d TS, TS, 0x20 | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_N2M1 | |||
| .L_N2I1: /* if(i>0) i-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvst U0, TD, 0x00 | |||
| xvst U1, TD, 0x20 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N2I1 | |||
| .L_N2M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N3 | |||
| xvld U0, S1, 0x00 | |||
| xvst U0, TD, 0x00 | |||
| addi.d TD, TD, 0x20 | |||
| .L_N3: /* if(n&1) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| move S1, TS //aoffset1 | |||
| slli.d T0, TL, 0x01 //2*lda | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x01 | |||
| beq ZERO, I, .L_N3M1 | |||
| .L_N3I1: /* if(i>0) i-- */ | |||
| vld $vr0, S1, 0x00 | |||
| vld $vr1, S2, 0x00 | |||
| vst $vr0, TD, 0x00 | |||
| vst $vr1, TD, 0x10 | |||
| add.d S1, S1, T0 | |||
| add.d S2, S2, T0 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N3I1 | |||
| .L_N3M1: /* if(m&1) */ | |||
| andi I, M, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| vld $vr0, S1, 0x00 | |||
| vst $vr0, TD, 0x00 | |||
| .L_N0: | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c | |||
| CSUMKERNEL = ../mips/zsum.c | |||
| ZSUMKERNEL = ../mips/zsum.c | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SASUMKERNEL = ../mips/sasum_msa.c | |||
| DASUMKERNEL = ../mips/dasum_msa.c | |||
| CASUMKERNEL = ../mips/casum_msa.c | |||
| @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c | |||
| ZASUMKERNEL = ../mips/zasum.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SAXPYKERNEL = ../mips/saxpy_msa.c | |||
| DAXPYKERNEL = ../mips/daxpy_msa.c | |||
| CAXPYKERNEL = ../mips/caxpy_msa.c | |||
| @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c | |||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SCOPYKERNEL = ../mips/scopy_msa.c | |||
| DCOPYKERNEL = ../mips/dcopy_msa.c | |||
| CCOPYKERNEL = ../mips/ccopy_msa.c | |||
| @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c | |||
| ZCOPYKERNEL = ../mips/zcopy.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SDOTKERNEL = ../mips/sdot_msa.c | |||
| DDOTKERNEL = ../mips/ddot_msa.c | |||
| CDOTKERNEL = ../mips/cdot_msa.c | |||
| @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c | |||
| CNRM2KERNEL = ../mips/znrm2.c | |||
| ZNRM2KERNEL = ../mips/znrm2.c | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SROTKERNEL = ../mips/srot_msa.c | |||
| DROTKERNEL = ../mips/drot_msa.c | |||
| CROTKERNEL = ../mips/crot_msa.c | |||
| @@ -100,7 +100,7 @@ CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SSCALKERNEL = ../mips/sscal_msa.c | |||
| DSCALKERNEL = ../mips/dscal_msa.c | |||
| #CSCALKERNEL = ../mips/cscal_msa.c | |||
| @@ -114,7 +114,7 @@ CSCALKERNEL = ../mips/zscal.c | |||
| ZSCALKERNEL = ../mips/zscal.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SSWAPKERNEL = ../mips/sswap_msa.c | |||
| DSWAPKERNEL = ../mips/dswap_msa.c | |||
| CSWAPKERNEL = ../mips/cswap_msa.c | |||
| @@ -126,7 +126,7 @@ CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||
| CGEMVNKERNEL = ../mips/cgemv_n_msa.c | |||
| @@ -138,7 +138,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c | |||
| ZGEMVNKERNEL = ../mips/zgemv_n.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||
| DGEMVTKERNEL = ../mips/dgemv_t_msa.c | |||
| CGEMVTKERNEL = ../mips/cgemv_t_msa.c | |||
| @@ -150,7 +150,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||
| @@ -164,7 +164,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||
| @@ -182,7 +182,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||
| @@ -200,7 +200,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||
| @@ -214,7 +214,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||
| @@ -226,7 +226,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | |||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | |||
| DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | |||
| @@ -238,7 +238,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| @@ -250,7 +250,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| @@ -1,4 +1,4 @@ | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SAXPYKERNEL = ../mips/saxpy_msa.c | |||
| DAXPYKERNEL = ../mips/daxpy_msa.c | |||
| CAXPYKERNEL = ../mips/caxpy_msa.c | |||
| @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S | |||
| DAXPYKERNEL = daxpy_loongson3a_simd.S | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SCOPYKERNEL = ../mips/scopy_msa.c | |||
| DCOPYKERNEL = ../mips/dcopy_msa.c | |||
| CCOPYKERNEL = ../mips/ccopy_msa.c | |||
| ZCOPYKERNEL = ../mips/zcopy_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SDOTKERNEL = ../mips/sdot_msa.c | |||
| DDOTKERNEL = ../mips/ddot_msa.c | |||
| CDOTKERNEL = ../mips/cdot_msa.c | |||
| @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c | |||
| endif | |||
| DSDOTKERNEL = ../mips/dot.c | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SROTKERNEL = ../mips/srot_msa.c | |||
| DROTKERNEL = ../mips/drot_msa.c | |||
| CROTKERNEL = ../mips/crot_msa.c | |||
| ZROTKERNEL = ../mips/zrot_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SSCALKERNEL = ../mips/sscal_msa.c | |||
| DSCALKERNEL = ../mips/dscal_msa.c | |||
| CSCALKERNEL = ../mips/cscal_msa.c | |||
| ZSCALKERNEL = ../mips/zscal_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | |||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | |||
| @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SASUMKERNEL = ../mips/sasum_msa.c | |||
| DASUMKERNEL = ../mips/dasum_msa.c | |||
| CASUMKERNEL = ../mips/casum_msa.c | |||
| ZASUMKERNEL = ../mips/zasum_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SSWAPKERNEL = ../mips/sswap_msa.c | |||
| DSWAPKERNEL = ../mips/dswap_msa.c | |||
| CSWAPKERNEL = ../mips/cswap_msa.c | |||
| ZSWAPKERNEL = ../mips/zswap_msa.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | |||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | |||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | |||
| @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | |||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | |||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | |||
| @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | |||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | |||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | |||
| @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | |||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | |||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | |||
| @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | |||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | |||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | |||
| @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | |||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | |||
| DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | |||
| @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| ifndef NO_MSA | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| @@ -25,7 +25,7 @@ ZTRMMKERNEL = zgemm_kernel_power10.S | |||
| endif | |||
| SGEMMKERNEL = sgemm_kernel_power10.c | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| @@ -50,7 +50,7 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S | |||
| ZTRMMKERNEL = zgemm_kernel_power9.S | |||
| SGEMMKERNEL = sgemm_kernel_power9.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| @@ -0,0 +1,482 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include <altivec.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp19; | |||
| IFLOAT ctemp21, ctemp23; | |||
| IFLOAT ctemp25, ctemp27; | |||
| IFLOAT ctemp29, ctemp31; | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = (n >> 4); | |||
| if (j > 0){ | |||
| do{ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset4 = aoffset3 + lda; | |||
| aoffset5 = aoffset4 + lda; | |||
| aoffset6 = aoffset5 + lda; | |||
| aoffset7 = aoffset6 + lda; | |||
| aoffset8 = aoffset7 + lda; | |||
| aoffset9 = aoffset8 + lda; | |||
| aoffset10 = aoffset9 + lda; | |||
| aoffset11 = aoffset10 + lda; | |||
| aoffset12 = aoffset11 + lda; | |||
| aoffset13 = aoffset12 + lda; | |||
| aoffset14 = aoffset13 + lda; | |||
| aoffset15 = aoffset14 + lda; | |||
| aoffset16 = aoffset15 + lda; | |||
| aoffset += 16 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| vector float c1, c2, c3, c4, c5, c6, c7, c8; | |||
| vector float c9, c10, c11, c12, c13, c14, c15, c16; | |||
| vector float t1, t2, t3, t4, t5, t6, t7, t8; | |||
| vector float t9, t10, t11, t12; | |||
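| /* 4x16 tile transpose: load 4 consecutive rows from each of the 16 | |||
|    columns, interleave them with vec_mergeh/vec_mergel, then combine | |||
|    64-bit halves with vec_xxpermdi so the stores emit each row of the | |||
|    panel as 16 contiguous floats (four vec_xst per row). */ | |||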
| do{ | |||
| c1 = vec_xl(0, aoffset1); | |||
| c2 = vec_xl(0, aoffset2); | |||
| c3 = vec_xl(0, aoffset3); | |||
| c4 = vec_xl(0, aoffset4); | |||
| c5 = vec_xl(0, aoffset5); | |||
| c6 = vec_xl(0, aoffset6); | |||
| c7 = vec_xl(0, aoffset7); | |||
| c8 = vec_xl(0, aoffset8); | |||
| c9 = vec_xl(0, aoffset9); | |||
| c10 = vec_xl(0, aoffset10); | |||
| c11 = vec_xl(0, aoffset11); | |||
| c12 = vec_xl(0, aoffset12); | |||
| c13 = vec_xl(0, aoffset13); | |||
| c14 = vec_xl(0, aoffset14); | |||
| c15 = vec_xl(0, aoffset15); | |||
| c16 = vec_xl(0, aoffset16); | |||
| t1 = vec_mergeh(c1, c2); | |||
| t2 = vec_mergeh(c3, c4); | |||
| t3 = vec_mergeh(c5, c6); | |||
| t4 = vec_mergeh(c7, c8); | |||
| t9 = vec_mergeh(c9, c10); | |||
| t10 = vec_mergeh(c11, c12); | |||
| t11 = vec_mergeh(c13, c14); | |||
| t12 = vec_mergeh(c15, c16); | |||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||
| t7 = vec_xxpermdi(t9, t10, 0b00); | |||
| t8 = vec_xxpermdi(t11, t12, 0b00); | |||
| vec_xst(t5, 0, boffset); | |||
| vec_xst(t6, 0, boffset+4); | |||
| vec_xst(t7, 0, boffset+8); | |||
| vec_xst(t8, 0, boffset+12); | |||
| t5 = vec_xxpermdi(t1, t2, 0b11); | |||
| t6 = vec_xxpermdi(t3, t4, 0b11); | |||
| t7 = vec_xxpermdi(t9, t10, 0b11); | |||
| t8 = vec_xxpermdi(t11, t12, 0b11); | |||
| vec_xst(t5, 0, boffset+16); | |||
| vec_xst(t6, 0, boffset+20); | |||
| vec_xst(t7, 0, boffset+24); | |||
| vec_xst(t8, 0, boffset+28); | |||
| t1 = vec_mergel(c1, c2); | |||
| t2 = vec_mergel(c3, c4); | |||
| t3 = vec_mergel(c5, c6); | |||
| t4 = vec_mergel(c7, c8); | |||
| t9 = vec_mergel(c9, c10); | |||
| t10 = vec_mergel(c11, c12); | |||
| t11 = vec_mergel(c13, c14); | |||
| t12 = vec_mergel(c15, c16); | |||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||
| t7 = vec_xxpermdi(t9, t10, 0b00); | |||
| t8 = vec_xxpermdi(t11, t12, 0b00); | |||
| vec_xst(t5, 0, boffset+32); | |||
| vec_xst(t6, 0, boffset+36); | |||
| vec_xst(t7, 0, boffset+40); | |||
| vec_xst(t8, 0, boffset+44); | |||
| t5 = vec_xxpermdi(t1, t2, 0b11); | |||
| t6 = vec_xxpermdi(t3, t4, 0b11); | |||
| t7 = vec_xxpermdi(t9, t10, 0b11); | |||
| t8 = vec_xxpermdi(t11, t12, 0b11); | |||
| vec_xst(t5, 0, boffset+48); | |||
| vec_xst(t6, 0, boffset+52); | |||
| vec_xst(t7, 0, boffset+56); | |||
| vec_xst(t8, 0, boffset+60); | |||
| aoffset1 += 4; | |||
| aoffset2 += 4; | |||
| aoffset3 += 4; | |||
| aoffset4 += 4; | |||
| aoffset5 += 4; | |||
| aoffset6 += 4; | |||
| aoffset7 += 4; | |||
| aoffset8 += 4; | |||
| aoffset9 += 4; | |||
| aoffset10 += 4; | |||
| aoffset11 += 4; | |||
| aoffset12 += 4; | |||
| aoffset13 += 4; | |||
| aoffset14 += 4; | |||
| aoffset15 += 4; | |||
| aoffset16 += 4; | |||
| boffset += 64; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp05 = *(aoffset3 + 0); | |||
| ctemp07 = *(aoffset4 + 0); | |||
| ctemp09 = *(aoffset5 + 0); | |||
| ctemp11 = *(aoffset6 + 0); | |||
| ctemp13 = *(aoffset7 + 0); | |||
| ctemp15 = *(aoffset8 + 0); | |||
| ctemp17 = *(aoffset9 + 0); | |||
| ctemp19 = *(aoffset10 + 0); | |||
| ctemp21 = *(aoffset11 + 0); | |||
| ctemp23 = *(aoffset12 + 0); | |||
| ctemp25 = *(aoffset13 + 0); | |||
| ctemp27 = *(aoffset14 + 0); | |||
| ctemp29 = *(aoffset15 + 0); | |||
| ctemp31 = *(aoffset16 + 0); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp03; | |||
| *(boffset + 2) = ctemp05; | |||
| *(boffset + 3) = ctemp07; | |||
| *(boffset + 4) = ctemp09; | |||
| *(boffset + 5) = ctemp11; | |||
| *(boffset + 6) = ctemp13; | |||
| *(boffset + 7) = ctemp15; | |||
| *(boffset + 8) = ctemp17; | |||
| *(boffset + 9) = ctemp19; | |||
| *(boffset + 10) = ctemp21; | |||
| *(boffset + 11) = ctemp23; | |||
| *(boffset + 12) = ctemp25; | |||
| *(boffset + 13) = ctemp27; | |||
| *(boffset + 14) = ctemp29; | |||
| *(boffset + 15) = ctemp31; | |||
| aoffset1+=1; | |||
| aoffset2+=1; | |||
| aoffset3+=1; | |||
| aoffset4+=1; | |||
| aoffset5+=1; | |||
| aoffset6+=1; | |||
| aoffset7+=1; | |||
| aoffset8+=1; | |||
| aoffset9+=1; | |||
| aoffset10+=1; | |||
| aoffset11+=1; | |||
| aoffset12+=1; | |||
| aoffset13+=1; | |||
| aoffset14+=1; | |||
| aoffset15+=1; | |||
| aoffset16+=1; | |||
| boffset += 16; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } /* end of if(j > 0) */ | |||
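| /* Remaining columns: the same packing applied to panel widths of 8, | |||
|    4, 2 and 1. */ | |||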
| if (n & 8){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset4 = aoffset3 + lda; | |||
| aoffset5 = aoffset4 + lda; | |||
| aoffset6 = aoffset5 + lda; | |||
| aoffset7 = aoffset6 + lda; | |||
| aoffset8 = aoffset7 + lda; | |||
| aoffset += 8 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| vector float c1, c2, c3, c4, c5, c6, c7, c8; | |||
| vector float t1, t2, t3, t4, t5, t6, t7, t8; | |||
| do{ | |||
| c1 = vec_xl(0, aoffset1); | |||
| c2 = vec_xl(0, aoffset2); | |||
| c3 = vec_xl(0, aoffset3); | |||
| c4 = vec_xl(0, aoffset4); | |||
| c5 = vec_xl(0, aoffset5); | |||
| c6 = vec_xl(0, aoffset6); | |||
| c7 = vec_xl(0, aoffset7); | |||
| c8 = vec_xl(0, aoffset8); | |||
| t1 = vec_mergeh(c1, c2); | |||
| t2 = vec_mergeh(c3, c4); | |||
| t3 = vec_mergeh(c5, c6); | |||
| t4 = vec_mergeh(c7, c8); | |||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||
| t7 = vec_xxpermdi(t1, t2, 0b11); | |||
| t8 = vec_xxpermdi(t3, t4, 0b11); | |||
| vec_xst(t5, 0, boffset); | |||
| vec_xst(t6, 0, boffset+4); | |||
| vec_xst(t7, 0, boffset+8); | |||
| vec_xst(t8, 0, boffset+12); | |||
| t1 = vec_mergel(c1, c2); | |||
| t2 = vec_mergel(c3, c4); | |||
| t3 = vec_mergel(c5, c6); | |||
| t4 = vec_mergel(c7, c8); | |||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||
| t7 = vec_xxpermdi(t1, t2, 0b11); | |||
| t8 = vec_xxpermdi(t3, t4, 0b11); | |||
| vec_xst(t5, 0, boffset+16); | |||
| vec_xst(t6, 0, boffset+20); | |||
| vec_xst(t7, 0, boffset+24); | |||
| vec_xst(t8, 0, boffset+28); | |||
| aoffset1 += 4; | |||
| aoffset2 += 4; | |||
| aoffset3 += 4; | |||
| aoffset4 += 4; | |||
| aoffset5 += 4; | |||
| aoffset6 += 4; | |||
| aoffset7 += 4; | |||
| aoffset8 += 4; | |||
| boffset += 32; | |||
| i--; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0) { | |||
| do { | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp05 = *(aoffset3 + 0); | |||
| ctemp07 = *(aoffset4 + 0); | |||
| ctemp09 = *(aoffset5 + 0); | |||
| ctemp11 = *(aoffset6 + 0); | |||
| ctemp13 = *(aoffset7 + 0); | |||
| ctemp15 = *(aoffset8 + 0); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp03; | |||
| *(boffset + 2) = ctemp05; | |||
| *(boffset + 3) = ctemp07; | |||
| *(boffset + 4) = ctemp09; | |||
| *(boffset + 5) = ctemp11; | |||
| *(boffset + 6) = ctemp13; | |||
| *(boffset + 7) = ctemp15; | |||
| aoffset1+=1; | |||
| aoffset2+=1; | |||
| aoffset3+=1; | |||
| aoffset4+=1; | |||
| aoffset5+=1; | |||
| aoffset6+=1; | |||
| aoffset7+=1; | |||
| aoffset8+=1; | |||
| boffset += 8; | |||
| i--; | |||
| } while (i > 0); | |||
| } | |||
| } | |||
| if (n & 4){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset4 = aoffset3 + lda; | |||
| aoffset += 4 * lda; | |||
| i = (m >> 2); | |||
| if (i > 0){ | |||
| vector float c1, c2, c3, c4; | |||
| vector float t1, t2, t3, t4; | |||
| do{ | |||
| c1 = vec_xl(0, aoffset1); | |||
| c2 = vec_xl(0, aoffset2); | |||
| c3 = vec_xl(0, aoffset3); | |||
| c4 = vec_xl(0, aoffset4); | |||
| t1 = vec_mergeh(c1, c2); | |||
| t2 = vec_mergeh(c3, c4); | |||
| t3 = vec_xxpermdi(t1, t2, 0b00); | |||
| t4 = vec_xxpermdi(t1, t2, 0b11); | |||
| vec_xst(t3, 0, boffset); | |||
| vec_xst(t4, 0, boffset+4); | |||
| t1 = vec_mergel(c1, c2); | |||
| t2 = vec_mergel(c3, c4); | |||
| t3 = vec_xxpermdi(t1, t2, 0b00); | |||
| t4 = vec_xxpermdi(t1, t2, 0b11); | |||
| vec_xst(t3, 0, boffset+8); | |||
| vec_xst(t4, 0, boffset+12); | |||
| aoffset1 += 4; | |||
| aoffset2 += 4; | |||
| aoffset3 += 4; | |||
| aoffset4 += 4; | |||
| boffset += 16; | |||
| i--; | |||
| }while(i > 0); | |||
| } | |||
| i = (m & 3); | |||
| if (i > 0) { | |||
| do { | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp05 = *(aoffset3 + 0); | |||
| ctemp07 = *(aoffset4 + 0); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp03; | |||
| *(boffset + 2) = ctemp05; | |||
| *(boffset + 3) = ctemp07; | |||
| aoffset1+=1; | |||
| aoffset2+=1; | |||
| aoffset3+=1; | |||
| aoffset4+=1; | |||
| boffset += 4; | |||
| i--; | |||
| } while (i > 0); | |||
| } | |||
| } | |||
| if (n & 2){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset += 2 * lda; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp04 = *(aoffset2 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp03; | |||
| *(boffset + 2) = ctemp02; | |||
| *(boffset + 3) = ctemp04; | |||
| aoffset1 += 2; | |||
| aoffset2 += 2; | |||
| boffset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp03; | |||
| boffset += 2; | |||
| } | |||
| } | |||
| if (n & 1){ | |||
| aoffset1 = aoffset; | |||
| i = (m >> 1); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp02; | |||
| aoffset1 += 2; | |||
| boffset += 2; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (m & 1){ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| *(boffset + 0) = ctemp01; | |||
| // boffset += 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
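|  | |||
| For reference, the packing implemented by the vector and scalar paths above amounts to the following plain-C loop nest. This is a minimal sketch for the full-panel case only; the helper name is hypothetical and `float`/`long` stand in for the library's IFLOAT/BLASLONG types: | |||
|  | |||
| /* Illustrative only: scalar equivalent of the n >= 16 panel packing. */ | |||
| static void ncopy16_panel_ref(long m, long n, const float *a, long lda, | |||
|                               float *b) { | |||
|     for (long p = 0; p + 16 <= n; p += 16)      /* 16-column panels   */ | |||
|         for (long i = 0; i < m; i++)            /* one row at a time  */ | |||
|             for (long k = 0; k < 16; k++)       /* across the panel   */ | |||
|                 *b++ = a[i + (p + k) * lda];    /* row-major in panel */ | |||
| } | |||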
| @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = axpy_vector.c | |||
| DAXPYKERNEL = axpy_vector.c | |||
| -CAXPYKERNEL = zaxpy.c | |||
| -ZAXPYKERNEL = zaxpy.c | |||
| +CAXPYKERNEL = zaxpy_vector.c | |||
| +ZAXPYKERNEL = zaxpy_vector.c | |||
| SAXPBYKERNEL = axpby_vector.c | |||
| DAXPBYKERNEL = axpby_vector.c | |||
| @@ -59,7 +59,7 @@ SDOTKERNEL = dot_vector.c | |||
| DDOTKERNEL = dot_vector.c | |||
| CDOTKERNEL = zdot_vector.c | |||
| ZDOTKERNEL = zdot_vector.c | |||
| -DSDOTKERNEL = ../generic/dot.c | |||
| +DSDOTKERNEL = dsdot_vector.c | |||
| SNRM2KERNEL = nrm2_vector.c | |||
| DNRM2KERNEL = nrm2_vector.c | |||
| @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c | |||
| CAXPYKERNEL = ../riscv64/zaxpy.c | |||
| ZAXPYKERNEL = ../riscv64/zaxpy.c | |||
| SAXPBYKERNEL = ../riscv64/axpby.c | |||
| DAXPBYKERNEL = ../riscv64/axpby.c | |||
| CAXPBYKERNEL = ../riscv64/zaxpby.c | |||
| ZAXPBYKERNEL = ../riscv64/zaxpby.c | |||
| SCOPYKERNEL = ../riscv64/copy.c | |||
| DCOPYKERNEL = ../riscv64/copy.c | |||
| CCOPYKERNEL = ../riscv64/zcopy.c | |||
| @@ -0,0 +1,243 @@ | |||
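| # Kernel selections for an RVV 1.0 target: Level-1 and Level-2 routines | |||
| # map to the intrinsic-based *_rvv.c sources, the GEMM/TRMM kernels are | |||
| # the zvl128b variants sized by the UNROLL_M/UNROLL_N values from | |||
| # params.h, and TRSM falls back to the generic C kernels. | |||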
| SAMAXKERNEL = amax_rvv.c | |||
| DAMAXKERNEL = amax_rvv.c | |||
| CAMAXKERNEL = zamax_rvv.c | |||
| ZAMAXKERNEL = zamax_rvv.c | |||
| SAMINKERNEL = amin_rvv.c | |||
| DAMINKERNEL = amin_rvv.c | |||
| CAMINKERNEL = zamin_rvv.c | |||
| ZAMINKERNEL = zamin_rvv.c | |||
| SMAXKERNEL = max_rvv.c | |||
| DMAXKERNEL = max_rvv.c | |||
| SMINKERNEL = min_rvv.c | |||
| DMINKERNEL = min_rvv.c | |||
| ISAMAXKERNEL = iamax_rvv.c | |||
| IDAMAXKERNEL = iamax_rvv.c | |||
| ICAMAXKERNEL = izamax_rvv.c | |||
| IZAMAXKERNEL = izamax_rvv.c | |||
| ISAMINKERNEL = iamin_rvv.c | |||
| IDAMINKERNEL = iamin_rvv.c | |||
| ICAMINKERNEL = izamin_rvv.c | |||
| IZAMINKERNEL = izamin_rvv.c | |||
| ISMAXKERNEL = imax_rvv.c | |||
| IDMAXKERNEL = imax_rvv.c | |||
| ISMINKERNEL = imin_rvv.c | |||
| IDMINKERNEL = imin_rvv.c | |||
| SASUMKERNEL = asum_rvv.c | |||
| DASUMKERNEL = asum_rvv.c | |||
| CASUMKERNEL = zasum_rvv.c | |||
| ZASUMKERNEL = zasum_rvv.c | |||
| SSUMKERNEL = sum_rvv.c | |||
| DSUMKERNEL = sum_rvv.c | |||
| CSUMKERNEL = zsum_rvv.c | |||
| ZSUMKERNEL = zsum_rvv.c | |||
| SAXPYKERNEL = axpy_rvv.c | |||
| DAXPYKERNEL = axpy_rvv.c | |||
| CAXPYKERNEL = zaxpy_rvv.c | |||
| ZAXPYKERNEL = zaxpy_rvv.c | |||
| SAXPBYKERNEL = axpby_rvv.c | |||
| DAXPBYKERNEL = axpby_rvv.c | |||
| CAXPBYKERNEL = zaxpby_rvv.c | |||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||
| SCOPYKERNEL = copy_rvv.c | |||
| DCOPYKERNEL = copy_rvv.c | |||
| CCOPYKERNEL = zcopy_rvv.c | |||
| ZCOPYKERNEL = zcopy_rvv.c | |||
| SDOTKERNEL = dot_rvv.c | |||
| DDOTKERNEL = dot_rvv.c | |||
| CDOTKERNEL = zdot_rvv.c | |||
| ZDOTKERNEL = zdot_rvv.c | |||
| DSDOTKERNEL = dot_rvv.c | |||
| SNRM2KERNEL = nrm2_rvv.c | |||
| DNRM2KERNEL = nrm2_rvv.c | |||
| CNRM2KERNEL = znrm2_rvv.c | |||
| ZNRM2KERNEL = znrm2_rvv.c | |||
| SROTKERNEL = rot_rvv.c | |||
| DROTKERNEL = rot_rvv.c | |||
| CROTKERNEL = zrot_rvv.c | |||
| ZROTKERNEL = zrot_rvv.c | |||
| SSCALKERNEL = scal_rvv.c | |||
| DSCALKERNEL = scal_rvv.c | |||
| CSCALKERNEL = zscal_rvv.c | |||
| ZSCALKERNEL = zscal_rvv.c | |||
| SSWAPKERNEL = swap_rvv.c | |||
| DSWAPKERNEL = swap_rvv.c | |||
| CSWAPKERNEL = zswap_rvv.c | |||
| ZSWAPKERNEL = zswap_rvv.c | |||
| SGEMVNKERNEL = gemv_n_rvv.c | |||
| DGEMVNKERNEL = gemv_n_rvv.c | |||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||
| SGEMVTKERNEL = gemv_t_rvv.c | |||
| DGEMVTKERNEL = gemv_t_rvv.c | |||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||
| STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
| STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
| STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
| STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||
| DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
| DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
| DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
| DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||
| CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | |||
| CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||
| CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | |||
| CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||
| ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | |||
| ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | |||
| ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||
| SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||
| DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||
| DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||
| CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c | |||
| CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c | |||
| ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c | |||
| ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c | |||
| CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c | |||
| ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| @@ -0,0 +1,199 @@ | |||
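| # Kernel selections for the 256-bit-vector target: Level-1 and Level-2 | |||
| # routines use the *_vector.c sources and the GEMM/TRMM kernels are the | |||
| # zvl256b variants sized by UNROLL_M/UNROLL_N from params.h. | |||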
| SAMAXKERNEL = amax_vector.c | |||
| DAMAXKERNEL = amax_vector.c | |||
| CAMAXKERNEL = zamax_vector.c | |||
| ZAMAXKERNEL = zamax_vector.c | |||
| SAMINKERNEL = amin_vector.c | |||
| DAMINKERNEL = amin_vector.c | |||
| CAMINKERNEL = zamin_vector.c | |||
| ZAMINKERNEL = zamin_vector.c | |||
| SMAXKERNEL = max_vector.c | |||
| DMAXKERNEL = max_vector.c | |||
| SMINKERNEL = min_vector.c | |||
| DMINKERNEL = min_vector.c | |||
| ISAMAXKERNEL = iamax_vector.c | |||
| IDAMAXKERNEL = iamax_vector.c | |||
| ICAMAXKERNEL = izamax_vector.c | |||
| IZAMAXKERNEL = izamax_vector.c | |||
| ISAMINKERNEL = iamin_vector.c | |||
| IDAMINKERNEL = iamin_vector.c | |||
| ICAMINKERNEL = izamin_vector.c | |||
| IZAMINKERNEL = izamin_vector.c | |||
| ISMAXKERNEL = imax_vector.c | |||
| IDMAXKERNEL = imax_vector.c | |||
| ISMINKERNEL = imin_vector.c | |||
| IDMINKERNEL = imin_vector.c | |||
| SASUMKERNEL = asum_vector.c | |||
| DASUMKERNEL = asum_vector.c | |||
| CASUMKERNEL = zasum_vector.c | |||
| ZASUMKERNEL = zasum_vector.c | |||
| SSUMKERNEL = sum_vector.c | |||
| DSUMKERNEL = sum_vector.c | |||
| CSUMKERNEL = zsum_vector.c | |||
| ZSUMKERNEL = zsum_vector.c | |||
| SAXPYKERNEL = axpy_vector.c | |||
| DAXPYKERNEL = axpy_vector.c | |||
| CAXPYKERNEL = zaxpy_vector.c | |||
| ZAXPYKERNEL = zaxpy_vector.c | |||
| SCOPYKERNEL = copy_vector.c | |||
| DCOPYKERNEL = copy_vector.c | |||
| CCOPYKERNEL = zcopy_vector.c | |||
| ZCOPYKERNEL = zcopy_vector.c | |||
| SDOTKERNEL = dot_vector.c | |||
| DDOTKERNEL = dot_vector.c | |||
| CDOTKERNEL = zdot_vector.c | |||
| ZDOTKERNEL = zdot_vector.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| SNRM2KERNEL = nrm2_vector.c | |||
| DNRM2KERNEL = nrm2_vector.c | |||
| CNRM2KERNEL = znrm2_vector.c | |||
| ZNRM2KERNEL = znrm2_vector.c | |||
| SROTKERNEL = rot_vector.c | |||
| DROTKERNEL = rot_vector.c | |||
| CROTKERNEL = zrot_vector.c | |||
| ZROTKERNEL = zrot_vector.c | |||
| SSCALKERNEL = scal_vector.c | |||
| DSCALKERNEL = scal_vector.c | |||
| CSCALKERNEL = zscal_vector.c | |||
| ZSCALKERNEL = zscal_vector.c | |||
| SSWAPKERNEL = swap_vector.c | |||
| DSWAPKERNEL = swap_vector.c | |||
| CSWAPKERNEL = zswap_vector.c | |||
| ZSWAPKERNEL = zswap_vector.c | |||
| SGEMVNKERNEL = gemv_n_vector.c | |||
| DGEMVNKERNEL = gemv_n_vector.c | |||
| CGEMVNKERNEL = zgemv_n_vector.c | |||
| ZGEMVNKERNEL = zgemv_n_vector.c | |||
| SGEMVTKERNEL = gemv_t_vector.c | |||
| DGEMVTKERNEL = gemv_t_vector.c | |||
| CGEMVTKERNEL = zgemv_t_vector.c | |||
| ZGEMVTKERNEL = zgemv_t_vector.c | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SSYMV_U_KERNEL = symv_U_vector.c | |||
| SSYMV_L_KERNEL = symv_L_vector.c | |||
| DSYMV_U_KERNEL = symv_U_vector.c | |||
| DSYMV_L_KERNEL = symv_L_vector.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| CHEMV_L_KERNEL = zhemv_LM_vector.c | |||
| CHEMV_M_KERNEL = zhemv_LM_vector.c | |||
| CHEMV_U_KERNEL = zhemv_UV_vector.c | |||
| CHEMV_V_KERNEL = zhemv_UV_vector.c | |||
| ZHEMV_L_KERNEL = zhemv_LM_vector.c | |||
| ZHEMV_M_KERNEL = zhemv_LM_vector.c | |||
| ZHEMV_U_KERNEL = zhemv_UV_vector.c | |||
| ZHEMV_V_KERNEL = zhemv_UV_vector.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| @@ -0,0 +1,281 @@ | |||
| # ********************************************************************************** | |||
| # Copyright (c) 2022, The OpenBLAS Project | |||
| # All rights reserved. | |||
| # Redistribution and use in source and binary forms, with or without | |||
| # modification, are permitted provided that the following conditions are | |||
| # met: | |||
| # 1. Redistributions of source code must retain the above copyright | |||
| # notice, this list of conditions and the following disclaimer. | |||
| # 2. Redistributions in binary form must reproduce the above copyright | |||
| # notice, this list of conditions and the following disclaimer in | |||
| # the documentation and/or other materials provided with the | |||
| # distribution. | |||
| # 3. Neither the name of the OpenBLAS project nor the names of | |||
| # its contributors may be used to endorse or promote products | |||
| # derived from this software without specific prior written permission. | |||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| # ********************************************************************************** | |||
| SAMAXKERNEL = amax_rvv.c | |||
| DAMAXKERNEL = amax_rvv.c | |||
| CAMAXKERNEL = zamax_rvv.c | |||
| ZAMAXKERNEL = zamax_rvv.c | |||
| SAMINKERNEL = amin_rvv.c | |||
| DAMINKERNEL = amin_rvv.c | |||
| CAMINKERNEL = zamin_rvv.c | |||
| ZAMINKERNEL = zamin_rvv.c | |||
| SMAXKERNEL = max_rvv.c | |||
| DMAXKERNEL = max_rvv.c | |||
| SMINKERNEL = min_rvv.c | |||
| DMINKERNEL = min_rvv.c | |||
| ISAMAXKERNEL = iamax_rvv.c | |||
| IDAMAXKERNEL = iamax_rvv.c | |||
| ICAMAXKERNEL = izamax_rvv.c | |||
| IZAMAXKERNEL = izamax_rvv.c | |||
| ISAMINKERNEL = iamin_rvv.c | |||
| IDAMINKERNEL = iamin_rvv.c | |||
| ICAMINKERNEL = izamin_rvv.c | |||
| IZAMINKERNEL = izamin_rvv.c | |||
| ISMAXKERNEL = imax_rvv.c | |||
| IDMAXKERNEL = imax_rvv.c | |||
| ISMINKERNEL = imin_rvv.c | |||
| IDMINKERNEL = imin_rvv.c | |||
| SASUMKERNEL = asum_rvv.c | |||
| DASUMKERNEL = asum_rvv.c | |||
| CASUMKERNEL = zasum_rvv.c | |||
| ZASUMKERNEL = zasum_rvv.c | |||
| SSUMKERNEL = sum_rvv.c | |||
| DSUMKERNEL = sum_rvv.c | |||
| CSUMKERNEL = zsum_rvv.c | |||
| ZSUMKERNEL = zsum_rvv.c | |||
| SAXPYKERNEL = axpy_rvv.c | |||
| DAXPYKERNEL = axpy_rvv.c | |||
| CAXPYKERNEL = zaxpy_rvv.c | |||
| ZAXPYKERNEL = zaxpy_rvv.c | |||
| SAXPBYKERNEL = axpby_rvv.c | |||
| DAXPBYKERNEL = axpby_rvv.c | |||
| CAXPBYKERNEL = zaxpby_rvv.c | |||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||
| SCOPYKERNEL = copy_rvv.c | |||
| DCOPYKERNEL = copy_rvv.c | |||
| CCOPYKERNEL = zcopy_rvv.c | |||
| ZCOPYKERNEL = zcopy_rvv.c | |||
| SDOTKERNEL = dot_rvv.c | |||
| DDOTKERNEL = dot_rvv.c | |||
| CDOTKERNEL = zdot_rvv.c | |||
| ZDOTKERNEL = zdot_rvv.c | |||
| DSDOTKERNEL = dot_rvv.c | |||
| SNRM2KERNEL = nrm2_rvv.c | |||
| DNRM2KERNEL = nrm2_rvv.c | |||
| CNRM2KERNEL = znrm2_rvv.c | |||
| ZNRM2KERNEL = znrm2_rvv.c | |||
| SROTKERNEL = rot_rvv.c | |||
| DROTKERNEL = rot_rvv.c | |||
| CROTKERNEL = zrot_rvv.c | |||
| ZROTKERNEL = zrot_rvv.c | |||
| SSCALKERNEL = scal_rvv.c | |||
| DSCALKERNEL = scal_rvv.c | |||
| CSCALKERNEL = zscal_rvv.c | |||
| ZSCALKERNEL = zscal_rvv.c | |||
| SSWAPKERNEL = swap_rvv.c | |||
| DSWAPKERNEL = swap_rvv.c | |||
| CSWAPKERNEL = zswap_rvv.c | |||
| ZSWAPKERNEL = zswap_rvv.c | |||
| SGEMVNKERNEL = gemv_n_rvv.c | |||
| DGEMVNKERNEL = gemv_n_rvv.c | |||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||
| SGEMVTKERNEL = gemv_t_rvv.c | |||
| DGEMVTKERNEL = gemv_t_rvv.c | |||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||
| CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c | |||
| ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c | |||
| # SGEMM_UNROLL_N set in params.h | |||
| ifeq ($(SGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| SGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| SGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| SGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||
| SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c | |||
| SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = trmmkernel_rvv_v1x8.c | |||
| STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||
| SSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
| # DGEMM_UNROLL_N set in params.h | |||
| ifeq ($(DGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| DGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| DGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| DGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||
| DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c | |||
| DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMKERNEL = trmmkernel_rvv_v1x8.c | |||
| DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||
| DSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
| CGEMMKERNEL = zgemmkernel_rvv_v1x4.c | |||
| CGEMMINCOPY = zgemm_ncopy_rvv_v1.c | |||
| CGEMMITCOPY = zgemm_tcopy_rvv_v1.c | |||
| CGEMMONCOPY = zgemm_ncopy_4_rvv.c | |||
| CGEMMOTCOPY = zgemm_tcopy_4_rvv.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c | |||
| ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c | |||
| ZGEMMONCOPY = zgemm_ncopy_4_rvv.c | |||
| ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c | |||
| TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c | |||
| TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c | |||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| @@ -0,0 +1,102 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT maxf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| FLOAT_V_T vx, vmax; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmax = VFMVVF_FLOAT(0.0, vlmax); | |||
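| /* Strip-mined reduction: each iteration processes vl = vsetvl(n) | |||
|    elements, and the tail-undisturbed (_tu) max leaves lanes past vl | |||
|    unchanged, so a single vlmax-wide reduction at the end is valid. | |||
|    Unit stride uses vle*; otherwise vlse* with a byte stride. */ | |||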
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| return(maxf); | |||
| } | |||
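|  | |||
| Functionally, the kernel above computes the maximum absolute value over n strided elements. A minimal scalar equivalent, illustrative only (hypothetical helper name, double shown as in the DOUBLE build): | |||
|  | |||
| #include <math.h> | |||
|  | |||
| static double amax_ref(long n, const double *x, long inc_x) { | |||
|     double maxf = 0.0; | |||
|     if (n <= 0 || inc_x <= 0) return maxf;  /* same early-out as above */ | |||
|     for (long i = 0; i < n; i++) { | |||
|         double v = fabs(x[i * inc_x]);      /* |x| at stride inc_x */ | |||
|         if (v > maxf) maxf = v; | |||
|     } | |||
|     return maxf; | |||
| } | |||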
| @@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
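| /* JOIN pastes five tokens to build type and intrinsic names from ELEN | |||
|    and LMUL, e.g. JOIN(vfloat, 32, m8, _t, _) expands to vfloat32m8_t; | |||
|    the trailing _ is the empty filler token defined above. */ | |||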
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
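| /* The v0.10 intrinsics take the scalar destination as the first | |||
|    operand, so the RISCV_0p10_INTRINSICS spelling wraps the call and | |||
|    passes v_res explicitly. */ | |||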
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| @@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT maxf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_max; | |||
| FLOAT_V_T_M1 v_res, v_zero; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_zero = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T v0, v1; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||
| MASK_T mask0, mask1; | |||
| FLOAT zero = 0.0; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_max = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| //maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -169,94 +99,27 @@ asm volatile( | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| v_max = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| ix += inc_xv*2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| return(maxf); | |||
| } | |||
| @@ -0,0 +1,102 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| FLOAT_V_T vx, vmin; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
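| /* vmin starts as FLT_MAX across the full vlmax register; the tail-undisturbed | |||
| (_tu) min below leaves lanes past each iteration's vl unchanged, so they keep | |||
| the FLT_MAX fill and the single vlmax-wide reduction after the loop stays safe. */ | |||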
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define ABS fabs | |||
| # else | |||
| # define ELEN 32 | |||
| # define ABS fabsf | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define ABS fabs | |||
| # else | |||
| # define ELEN 32 | |||
| # define ABS fabsf | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
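| /* Illustrative expansion (assuming RISCV_RVV(x) supplies the __riscv_ prefix of | |||
| the v1.0 intrinsics): with ELEN=32 and LMUL=m8, VSETVL becomes | |||
| JOIN(__riscv_vsetvl, _e, 32, m8, _) -> __riscv_vsetvl_e32m8; the trailing `_` | |||
| argument is #defined empty above, so it pastes to nothing. */ | |||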
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| FLOAT minf=FLT_MAX; | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| minf = ABS(*x); | |||
| x += inc_x; | |||
| --n; | |||
| if (n == 0) return(minf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_min; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| FLOAT_V_T v0, v1; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(minf, 1); | |||
| MASK_T mask0, mask1; | |||
| FLOAT zero = 0.0; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG idx = 0, inc_xv = inc_x * gvl; | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| idx += inc_xv*2; | |||
| ix += inc_xv*2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| return(minf); | |||
| } | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -0,0 +1,99 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT asumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| FLOAT_V_T vx, vsum; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vsum = VFMVVF_FLOAT(0.0, vlmax); | |||
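| /* Partial sums accumulate in a zero-initialized vlmax-wide register; the | |||
| tail-undisturbed (_tu) add keeps lanes past vl at 0.0, so one unordered | |||
| vfredusum over vlmax at the end yields the complete sum. */ | |||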
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); | |||
| asumf = VFMVFS_FLOAT_M1(v_res); | |||
| return(asumf); | |||
| } | |||
| @@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0; | |||
| FLOAT asumf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_zero,v_sum; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T v0, v1, v_sum; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||
| MASK_T mask0, mask1; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| inc_xv += inc_xv * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| asumf = EXTRACT_FLOAT(v_res); | |||
| return(asumf); | |||
| } | |||
| @@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| BLASLONG i=0; | |||
| BLASLONG ix,iy; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| ix = 0; | |||
| iy = 0; | |||
| @@ -0,0 +1,173 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| FLOAT_V_T vx, vy; | |||
| if ( n <= 0 ) return(0); | |||
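| /* Dispatch on (beta == 0, alpha == 0): y = 0, y = alpha*x, y = beta*y, or | |||
| y = alpha*x + beta*y; within each case, unit strides take the contiguous | |||
| vle/vse path and non-unit strides the strided vlse/vsse path. */ | |||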
| if ( beta == 0.0 ) { | |||
| if ( alpha == 0.0 ) { | |||
| if (1 == inc_y) { | |||
| memset(&y[0], 0, n * sizeof(FLOAT)); | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| size_t vl = VSETVL(n); | |||
| vy = VFMVVF_FLOAT(0.0, vl); | |||
| for ( ; n > 0; n -= vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if ((1 == inc_x) && (1 == inc_y)) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if ( alpha == 0.0 ) { | |||
| if (1 == inc_y) { | |||
| for (size_t vl; n > 0; n -= vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if ((1 == inc_x) && (1 == inc_y)) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| # define LMUL m4 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| BLASLONG i=0, j=0; | |||
| unsigned int gvl = 0; | |||
| @@ -60,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| BLASLONG stride_x, stride_y, ix = 0, iy = 0; | |||
| if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ | |||
| if ( beta == 0.0 ) | |||
| { | |||
| if ( alpha == 0.0 ) | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = 0.0 ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = alpha * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( alpha == 0.0 ) | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = beta * y[iy] ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = alpha * x[ix] + beta * y[iy] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } else { /* vectorized approach for non-zero increments */ | |||
| if(beta == 0.0){ | |||
| if(alpha == 0.0){//alpha == 0 && beta == 0 | |||
| if(inc_y == 1){ | |||
| @@ -372,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| } | |||
| @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| BLASLONG i=0; | |||
| BLASLONG ix,iy; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| if ( da == 0.0 ) return(0); | |||
| ix = 0; | |||
| @@ -0,0 +1,109 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| if ( n <= 0 ) return(0); | |||
| if ( da == 0.0 ) return(0); | |||
| FLOAT_V_T vx, vy; | |||
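| /* y += da * x; the four branches below pick contiguous (vle/vse) or strided | |||
| (vlse/vsse) accesses per increment combination. */ | |||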
| if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSEV_FLOAT(y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| # define LMUL m4 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0, j=0, jx=0, jy=0; | |||
| @@ -53,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| FLOAT_V_T vy0, vy1; | |||
| BLASLONG stride_x, stride_y; | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| if (da == 0.0) return(0); | |||
| if (inc_x == 1 && inc_y == 1) { | |||
| @@ -0,0 +1,996 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=2 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=4 | |||
| __riscv_='__riscv_' | |||
| complex=True | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='float' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=32 | |||
| ELEN_PARAM=32 | |||
| LMUL_ACC=2 | |||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||
| VLEV='__riscv_vle32_v_f32m2' | |||
| VLSEV='__riscv_vlse32_v_f32m2' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||
| VSETVL='__riscv_vsetvl_e32m2' | |||
| VSEV='__riscv_vse32_v_f32m2' | |||
| VSSEV='__riscv_vsse32_v_f32m2' | |||
| acc_vector_t='vfloat32m2_t' | |||
| output='cgemm_kernel_8x4_zvl128b.c' | |||
| param_scalar_t='float' | |||
| param_vector_t='vfloat32m2_t' | |||
| */ | |||
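| /* Note: reg_width_bits=128 (zvl128b) with LMUL=2 gives 256-bit vector groups, | |||
| i.e. 8 fp32 lanes per vfloat32m2_t, matching the M=8 unroll of the main pass. */ | |||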
| #include "common.h" | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define S0 1 | |||
| #define S1 -1 | |||
| #define S2 1 | |||
| #define S3 1 | |||
| #define VFMACC_RR __riscv_vfmsac | |||
| #define VFMACC_RI __riscv_vfmacc | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define S0 1 | |||
| #define S1 1 | |||
| #define S2 1 | |||
| #define S3 -1 | |||
| #define VFMACC_RR __riscv_vfmacc | |||
| #define VFMACC_RI __riscv_vfmsac | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define S0 1 | |||
| #define S1 1 | |||
| #define S2 -1 | |||
| #define S3 1 | |||
| #define VFMACC_RR __riscv_vfmacc | |||
| #define VFMACC_RI __riscv_vfnmsac | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| #define S0 1 | |||
| #define S1 -1 | |||
| #define S2 -1 | |||
| #define S3 -1 | |||
| #define VFMACC_RR __riscv_vfmsac | |||
| #define VFMACC_RI __riscv_vfnmacc | |||
| #endif | |||
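| /* Sign key: with a = ar + i*ai and b = br + i*bi, the plain product has | |||
| (a*b)_r = ar*br - ai*bi and (a*b)_i = ai*br + ar*bi. The kernel accumulates | |||
| S0*ar*br + S1*ai*bi (real) and S2*ai*br + S3*ar*bi (imag) through the | |||
| VFMACC_RR/VFMACC_RI selections above; flipping S1..S3 conjugates a and/or b | |||
| for the R/C transpose variants. */ | |||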
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| float B2r = B[bi + 2 * 2 + 0]; | |||
| float B2i = B[bi + 2 * 2 + 1]; | |||
| float B3r = B[bi + 3 * 2 + 0]; | |||
| float B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
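| // packed layout: each k step consumes 8 complex elements of A (one 8-row panel) | |||
| // and 4 complex elements of B (one 4-column panel); real/imaginary parts are | |||
| // deinterleaved by the stride-2 vlse loads above. | |||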
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||
| // leaving 6 vector registers for temporaries | |||
| // performing 2 operations between reuses of temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| vfloat32m2_t ACC2r = tmp0r; | |||
| vfloat32m2_t ACC2i = tmp0i; | |||
| vfloat32m2_t ACC3r = tmp1r; | |||
| vfloat32m2_t ACC3i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| B2r = B[bi + 2 * 2 + 0]; | |||
| B2i = B[bi + 2 * 2 + 1]; | |||
| B3r = B[bi + 3 * 2 + 0]; | |||
| B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
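| // C += alpha * ACC in complex arithmetic: Cr += alphar*ACCr - alphai*ACCi and | |||
| // Ci += alphar*ACCi + alphai*ACCr, via the vfmacc / vfnmsac pairs below. | |||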
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| float B2r = B[bi + 2 * 2 + 0]; | |||
| float B2i = B[bi + 2 * 2 + 1]; | |||
| float B3r = B[bi + 3 * 2 + 0]; | |||
| float B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||
| // leaving 6 vector registers for temporaries | |||
| // performing 2 operations between reuses of temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| vfloat32m2_t ACC2r = tmp0r; | |||
| vfloat32m2_t ACC2i = tmp0i; | |||
| vfloat32m2_t ACC3r = tmp1r; | |||
| vfloat32m2_t ACC3i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| B2r = B[bi + 2 * 2 + 0]; | |||
| B2i = B[bi + 2 * 2 + 1]; | |||
| B3r = B[bi + 3 * 2 + 0]; | |||
| B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| float result8 = 0; | |||
| float result9 = 0; | |||
| float result10 = 0; | |||
| float result11 = 0; | |||
| float result12 = 0; | |||
| float result13 = 0; | |||
| float result14 = 0; | |||
| float result15 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||
| result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||
| result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||
| result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; | |||
| result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; | |||
| result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||
| result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||
| result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; | |||
| result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; | |||
| ai += 2 * 2; | |||
| bi += 4 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result4 * alphar; | |||
| Ci += result5 * alphar; | |||
| Cr -= result5 * alphai; | |||
| Ci += result4 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||
| Cr += result6 * alphar; | |||
| Ci += result7 * alphar; | |||
| Cr -= result7 * alphai; | |||
| Ci += result6 * alphai; | |||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||
| Cr += result8 * alphar; | |||
| Ci += result9 * alphar; | |||
| Cr -= result9 * alphai; | |||
| Ci += result8 * alphai; | |||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; | |||
| Cr += result10 * alphar; | |||
| Ci += result11 * alphar; | |||
| Cr -= result11 * alphai; | |||
| Ci += result10 * alphai; | |||
| C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||
| Cr += result12 * alphar; | |||
| Ci += result13 * alphar; | |||
| Cr -= result13 * alphai; | |||
| Ci += result12 * alphai; | |||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; | |||
| Cr += result14 * alphar; | |||
| Ci += result15 * alphar; | |||
| Cr -= result15 * alphai; | |||
| Ci += result14 * alphai; | |||
| C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||
| result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||
| result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||
| ai += 1 * 2; | |||
| bi += 4 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||
| Cr += result4 * alphar; | |||
| Ci += result5 * alphar; | |||
| Cr -= result5 * alphai; | |||
| Ci += result4 * alphai; | |||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||
| Cr += result6 * alphar; | |||
| Ci += result7 * alphar; | |||
| Cr -= result7 * alphai; | |||
| Ci += result6 * alphai; | |||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
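| // Editor's note: the tail passes below follow the generator's remainder | |||
| // strategy -- M remainders of 4 reuse the vector path with a narrower | |||
| // vsetvl, while M remainders of 2 and 1 fall back to scalar complex | |||
| // arithmetic via the S0..S3 sign constants. (Presumably this matches the | |||
| // M_tail_scalar_from=2 setting shown in the dgemm headers later in this | |||
| // diff; the cgemm settings block is outside this excerpt.) | |||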
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||
| // leaving 10 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||
| // leaving 10 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||
| ai += 2 * 2; | |||
| bi += 2 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result4 * alphar; | |||
| Ci += result5 * alphar; | |||
| Cr -= result5 * alphai; | |||
| Ci += result4 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||
| Cr += result6 * alphar; | |||
| Ci += result7 * alphar; | |||
| Cr -= result7 * alphai; | |||
| Ci += result6 * alphai; | |||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| ai += 1 * 2; | |||
| bi += 2 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||
| // leaving 12 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||
| // leaving 12 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||
| ai += 2 * 2; | |||
| bi += 1 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| ai += 1 * 2; | |||
| bi += 1 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
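| /* Editor's note: VFMACC_RR/VFMACC_RI and the sign constants S0..S3 are | |||
|    defined earlier in this file (outside the excerpt), one set per | |||
|    conjugation variant. A minimal sketch for the plain NN case, assuming | |||
|    those definitions: | |||
|  | |||
|        #define S0  1          real += S0*Ar*Br + S1*Ai*Bi = Ar*Br - Ai*Bi | |||
|        #define S1 -1 | |||
|        #define S2  1          imag += S2*Ai*Br + S3*Ar*Bi = Ai*Br + Ar*Bi | |||
|        #define S3  1 | |||
|        #define VFMACC_RR __riscv_vfmsac   (acc = Br*Ar - acc, acc holds Ai*Bi) | |||
|        #define VFMACC_RI __riscv_vfmacc   (acc = Br*Ai + acc, acc holds Ar*Bi) | |||
| */ | |||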
| @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #endif | |||
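| /* Editor's note: unlike the gemm kernels, copy keeps no accumulators live | |||
|    across iterations, so it can afford LMUL=8 -- the 32 vector registers | |||
|    form just four m8 groups -- to move the most data per vsetvl iteration; | |||
|    compare the smaller LMUL choices in the gemm kernels, which must keep | |||
|    many accumulator groups resident. */ | |||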
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if(n <= 0) return(0); | |||
| FLOAT_V_T v0; | |||
| if(inc_x == 1 && inc_y == 1) { | |||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| VSEV_FLOAT(y, v0, vl); | |||
| } | |||
| } else if (inc_y == 1) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| VSEV_FLOAT(y, v0, vl); | |||
| } | |||
| } else if(inc_x == 1) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
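| /* Editor's note: a scalar reference for the kernel above -- the RVV version | |||
|    strip-mines this loop with VSETVL, picking unit-stride vle/vse when an | |||
|    increment is 1 and strided vlse/vsse (byte strides) otherwise: */ | |||
| static void copy_ref(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     /* same semantics as CNAME above for all four stride cases */ | |||
|     for (BLASLONG i = 0; i < n; i++) | |||
|         y[i * inc_y] = x[i * inc_x]; | |||
| } | |||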
| @@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
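| /* Editor's note: JOIN2's extra level of indirection forces its arguments | |||
|    to be macro-expanded before ## pastes them, and the empty `_` defined | |||
|    above lets the five-slot JOIN build four-part names (VSETVL leaves the | |||
|    last slot empty). With ELEN=32 and LMUL=m8, VLEV_FLOAT reproduces the | |||
|    old vle32_v_f32m8 -- __riscv_vle32_v_f32m8, assuming RISCV_RVV adds the | |||
|    __riscv_ prefix for the current intrinsics, as the common headers | |||
|    suggest. */ | |||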
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| @@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/4){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| BLASLONG gvl3 = gvl * 3; | |||
| unsigned int gvl3 = gvl * 3; | |||
| BLASLONG inc_xv3 = inc_xv * 3; | |||
| for(i=0,j=0; i<n/(4*gvl); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| @@ -86,7 +99,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if(gvl <= n/4){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| BLASLONG inc_yv3 = inc_yv * 3; | |||
| BLASLONG gvl3 = gvl * 3; | |||
| unsigned int gvl3 = gvl * 3; | |||
| for(i=0,j=0; i<n/(4*gvl); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, v0, gvl); | |||
| @@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| asm volatile( | |||
| "vsetvli zero, zero, e64,m1 \n\t" | |||
| "fmv.w.x ft11, zero \n\t" | |||
| "fmv.d.x ft11, zero \n\t" | |||
| "mv t0, %[BK] \n\t" | |||
| "vfmv.v.f v16, ft11 \n\t" | |||
| @@ -0,0 +1,492 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=4 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=4 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='double' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=64 | |||
| ELEN_PARAM=64 | |||
| LMUL_ACC=4 | |||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||
| VLEV='__riscv_vle64_v_f64m4' | |||
| VLSEV='__riscv_vlse64_v_f64m4' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||
| VSETVL='__riscv_vsetvl_e64m4' | |||
| VSEV='__riscv_vse64_v_f64m4' | |||
| VSSEV='__riscv_vsse64_v_f64m4' | |||
| acc_vector_t='vfloat64m4_t' | |||
| output='dgemm_kernel_8x4_zvl128b.c' | |||
| param_scalar_t='double' | |||
| param_vector_t='vfloat64m4_t' | |||
| */ | |||
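| /* Editor's note: with reg_width_bits=128 and LMUL=4, one vfloat64m4_t | |||
|    register group holds 128*4/64 = 8 doubles, so the single vector load per | |||
|    k-step below covers the whole M=8 unroll. */ | |||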
| #include "common.h" | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| ai += 2; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| ai += 1; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| ai += 2; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| ai += 1; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| ai += 2; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| ai += 1; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
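| /* Editor's note: a scalar sketch of the main pass of the kernel above, | |||
|    assuming the packed operand layouts its index math implies (A in K-major | |||
|    slices of 8 rows, B in K-major slices of 4 columns, as produced by the | |||
|    level-3 packing routines) and M % 8 == N % 4 == 0; the kernel's tail | |||
|    code covers the remaining cases: */ | |||
| static void gemm_ref_8x4(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, | |||
|                          FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||
| { | |||
|     for (BLASLONG n0 = 0; n0 < N; n0 += 4)         /* 4-column panels of B */ | |||
|         for (BLASLONG m0 = 0; m0 < M; m0 += 8)     /* 8-row panels of A */ | |||
|             for (BLASLONG n = 0; n < 4; n++) | |||
|                 for (BLASLONG m = 0; m < 8; m++) { | |||
|                     FLOAT acc = 0; | |||
|                     for (BLASLONG k = 0; k < K; k++) | |||
|                         acc += A[m0 * K + k * 8 + m] * B[n0 * K + k * 4 + n]; | |||
|                     C[(n0 + n) * ldc + m0 + m] += alpha * acc; | |||
|                 } | |||
| } | |||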
| @@ -0,0 +1,860 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Settings: | |||
| LMUL=1 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=8 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl256b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='double' | |||
| reg_width_bits=256 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=64 | |||
| ELEN_PARAM=64 | |||
| LMUL_ACC=1 | |||
| VFMACC='__riscv_vfmacc_vf_f64m1' | |||
| VFMUL='__riscv_vfmul_vf_f64m1' | |||
| VLEV='__riscv_vle64_v_f64m1' | |||
| VLSEV='__riscv_vlse64_v_f64m1' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' | |||
| VSETVL='__riscv_vsetvl_e64m1' | |||
| VSEV='__riscv_vse64_v_f64m1' | |||
| VSSEV='__riscv_vsse64_v_f64m1' | |||
| acc_vector_t='vfloat64m1_t' | |||
| output='dgemm_kernel_8x8_zvl256b.c' | |||
| param_scalar_t='double' | |||
| param_vector_t='vfloat64m1_t' | |||
| */ | |||
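| /* Editor's note: here reg_width_bits=256 with LMUL=1 gives 256/64 = 4 | |||
|    doubles per vfloat64m1_t, so the M=8 unroll needs two vectors (A0/A1) | |||
|    per k-step and each row of C is visited in two gvl-sized halves | |||
|    (ci += gvl, then ci += ldc - gvl). */ | |||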
| #include "common.h" | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j=0; j<N/8; j+=1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e64m1(4); | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| double B4 = B[bi+4]; | |||
| double B5 = B[bi+5]; | |||
| double B6 = B[bi+6]; | |||
| double B7 = B[bi+7]; | |||
| bi += 8; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl); | |||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl); | |||
| vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl); | |||
| vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl); | |||
| vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl); | |||
| vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl); | |||
| vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl); | |||
| vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl); | |||
| vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl); | |||
| vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| B4 = B[bi+4]; | |||
| B5 = B[bi+5]; | |||
| B6 = B[bi+6]; | |||
| B7 = B[bi+7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl); | |||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl); | |||
| result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl); | |||
| result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl); | |||
| result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl); | |||
| result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl); | |||
| result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl); | |||
| result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl); | |||
| result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl); | |||
| result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||
| c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl ); | |||
| c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl ); | |||
| c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl ); | |||
| c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl ); | |||
| c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl ); | |||
| c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl ); | |||
| c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl ); | |||
| c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c15, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| double B4 = B[bi+4]; | |||
| double B5 = B[bi+5]; | |||
| double B6 = B[bi+6]; | |||
| double B7 = B[bi+7]; | |||
| bi += 8; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl); | |||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl); | |||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl); | |||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| B4 = B[bi+4]; | |||
| B5 = B[bi+5]; | |||
| B6 = B[bi+6]; | |||
| B7 = B[bi+7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl); | |||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| double result8 = 0; | |||
| double result9 = 0; | |||
| double result10 = 0; | |||
| double result11 = 0; | |||
| double result12 = 0; | |||
| double result13 = 0; | |||
| double result14 = 0; | |||
| double result15 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| result2+=A[ai+0]*B[bi+1]; | |||
| result3+=A[ai+1]*B[bi+1]; | |||
| result4+=A[ai+0]*B[bi+2]; | |||
| result5+=A[ai+1]*B[bi+2]; | |||
| result6+=A[ai+0]*B[bi+3]; | |||
| result7+=A[ai+1]*B[bi+3]; | |||
| result8+=A[ai+0]*B[bi+4]; | |||
| result9+=A[ai+1]*B[bi+4]; | |||
| result10+=A[ai+0]*B[bi+5]; | |||
| result11+=A[ai+1]*B[bi+5]; | |||
| result12+=A[ai+0]*B[bi+6]; | |||
| result13+=A[ai+1]*B[bi+6]; | |||
| result14+=A[ai+0]*B[bi+7]; | |||
| result15+=A[ai+1]*B[bi+7]; | |||
| ai+=2; | |||
| bi+=8; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| C[ci+1*ldc+0] += alpha * result2; | |||
| C[ci+1*ldc+1] += alpha * result3; | |||
| C[ci+2*ldc+0] += alpha * result4; | |||
| C[ci+2*ldc+1] += alpha * result5; | |||
| C[ci+3*ldc+0] += alpha * result6; | |||
| C[ci+3*ldc+1] += alpha * result7; | |||
| C[ci+4*ldc+0] += alpha * result8; | |||
| C[ci+4*ldc+1] += alpha * result9; | |||
| C[ci+5*ldc+0] += alpha * result10; | |||
| C[ci+5*ldc+1] += alpha * result11; | |||
| C[ci+6*ldc+0] += alpha * result12; | |||
| C[ci+6*ldc+1] += alpha * result13; | |||
| C[ci+7*ldc+0] += alpha * result14; | |||
| C[ci+7*ldc+1] += alpha * result15; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+0]*B[bi+1]; | |||
| result2+=A[ai+0]*B[bi+2]; | |||
| result3+=A[ai+0]*B[bi+3]; | |||
| result4+=A[ai+0]*B[bi+4]; | |||
| result5+=A[ai+0]*B[bi+5]; | |||
| result6+=A[ai+0]*B[bi+6]; | |||
| result7+=A[ai+0]*B[bi+7]; | |||
| ai+=1; | |||
| bi+=8; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+1*ldc+0] += alpha * result1; | |||
| C[ci+2*ldc+0] += alpha * result2; | |||
| C[ci+3*ldc+0] += alpha * result3; | |||
| C[ci+4*ldc+0] += alpha * result4; | |||
| C[ci+5*ldc+0] += alpha * result5; | |||
| C[ci+6*ldc+0] += alpha * result6; | |||
| C[ci+7*ldc+0] += alpha * result7; | |||
| m_top+=1; | |||
| } | |||
| n_top += 8; | |||
| } | |||
| // -- tails for N=4 | |||
| if( N & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| m_top = 0; | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| bi += 4; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl); | |||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl); | |||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||
| m_top += 8; | |||
| } | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| bi += 4; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| result2+=A[ai+0]*B[bi+1]; | |||
| result3+=A[ai+1]*B[bi+1]; | |||
| result4+=A[ai+0]*B[bi+2]; | |||
| result5+=A[ai+1]*B[bi+2]; | |||
| result6+=A[ai+0]*B[bi+3]; | |||
| result7+=A[ai+1]*B[bi+3]; | |||
| ai+=2; | |||
| bi+=4; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| C[ci+1*ldc+0] += alpha * result2; | |||
| C[ci+1*ldc+1] += alpha * result3; | |||
| C[ci+2*ldc+0] += alpha * result4; | |||
| C[ci+2*ldc+1] += alpha * result5; | |||
| C[ci+3*ldc+0] += alpha * result6; | |||
| C[ci+3*ldc+1] += alpha * result7; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+0]*B[bi+1]; | |||
| result2+=A[ai+0]*B[bi+2]; | |||
| result3+=A[ai+0]*B[bi+3]; | |||
| ai+=1; | |||
| bi+=4; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+1*ldc+0] += alpha * result1; | |||
| C[ci+2*ldc+0] += alpha * result2; | |||
| C[ci+3*ldc+0] += alpha * result3; | |||
| m_top+=1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if( N & 2 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| m_top = 0; | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| bi += 2; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| bi += 2; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| result2+=A[ai+0]*B[bi+1]; | |||
| result3+=A[ai+1]*B[bi+1]; | |||
| ai+=2; | |||
| bi+=2; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| C[ci+1*ldc+0] += alpha * result2; | |||
| C[ci+1*ldc+1] += alpha * result3; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+0]*B[bi+1]; | |||
| ai+=1; | |||
| bi+=2; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+1*ldc+0] += alpha * result1; | |||
| m_top+=1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if( N & 1 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| m_top = 0; | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| bi += 1; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| bi += 1; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| ai+=2; | |||
| bi+=1; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| ai+=1; | |||
| bi+=1; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| m_top+=1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| #if defined(DSDOT) | |||
| double dot = 0.0 ; | |||
| #else | |||
| FLOAT dot = 0.0 ; | |||
| #endif | |||
| if ( n < 1 ) return(dot); | |||
| while(i < n) | |||
| { | |||
| #if defined(DSDOT) | |||
| dot += (double) y[iy] * (double) x[ix] ; | |||
| #else | |||
| dot += y[iy] * x[ix] ; | |||
| #endif | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| @@ -0,0 +1,126 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| double dot = 0.0; | |||
| if ( n <= 0 ) return(dot); | |||
| size_t vlmax = __riscv_vsetvlmax_e64m8(); | |||
| vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax); | |||
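| // the tail-undisturbed (_tu) multiply-adds leave lanes beyond vl untouched, so a | |||
| // single reduction over vlmax at the end picks up every partial sum | |||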
| if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); | |||
| vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); | |||
| vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); | |||
| vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); | |||
| vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); | |||
| vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); | |||
| vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); | |||
| vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); | |||
| vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } | |||
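| // reduce the m8 accumulator to a scalar | |||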
| vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); | |||
| vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); | |||
| dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); | |||
| return(dot); | |||
| } | |||
| @@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) | |||
| #endif | |||
| #if defined(DSDOT) | |||
| @@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -107,9 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -134,9 +138,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -162,9 +164,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| return(dot); | |||
| @@ -0,0 +1,152 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| double dot = 0.0 ; | |||
| if ( n < 1 ) return(dot); | |||
| vfloat64m4_t vr; | |||
| vfloat32m2_t vx, vy; | |||
| unsigned int gvl = 0; | |||
| vfloat64m1_t v_res, v_z0; | |||
| gvl = vsetvlmax_e64m1(); | |||
| v_res = vfmv_v_f_f64m1(0, gvl); | |||
| v_z0 = vfmv_v_f_f64m1(0, gvl); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else{ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| } | |||
| return(dot); | |||
| } | |||
| @@ -0,0 +1,660 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=4 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=4 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='trmm' | |||
| param_precision='double' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=64 | |||
| ELEN_PARAM=64 | |||
| LMUL_ACC=4 | |||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||
| VLEV='__riscv_vle64_v_f64m4' | |||
| VLSEV='__riscv_vlse64_v_f64m4' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||
| VSETVL='__riscv_vsetvl_e64m4' | |||
| VSEV='__riscv_vse64_v_f64m4' | |||
| VSSEV='__riscv_vsse64_v_f64m4' | |||
| acc_vector_t='vfloat64m4_t' | |||
| output='dtrmm_kernel_8x4_zvl128b.c' | |||
| param_scalar_t='double' | |||
| param_vector_t='vfloat64m4_t' | |||
| */ | |||
| #include "common.h" | |||
| #if defined(LEFT) != defined(TRANSA) | |||
| #define BACKWARDS | |||
| #endif | |||
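| // For TRMM the off/pass_K logic below clips each micro-panel's K range to the | |||
| // triangular part of the operand, so only stored elements are read | |||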
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
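| // TRMM overwrites C rather than accumulating into it: C = alpha * A * B | |||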
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| ai += 2; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| ai += 1; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| ai += 2; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| ai += 1; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| ai += 2; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| ai += 1; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,89 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_beta.c | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||
| IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, | |||
| FLOAT *c, BLASLONG ldc) | |||
| { | |||
| BLASLONG chunk; | |||
| FLOAT *c_offset; | |||
| size_t vl; | |||
| FLOAT_V_T vx; | |||
| if (beta == ZERO) { | |||
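| // beta == 0: store zeros directly, without reading C (also clears any NaN/Inf) | |||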
| vl = VSETVL(m); | |||
| vx = VFMVVF_FLOAT(0.0, vl); | |||
| for( ; n > 0; n--, c += ldc) { | |||
| c_offset = c; | |||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||
| vl = VSETVL(chunk); | |||
| VSEV_FLOAT(c_offset, vx, vl); | |||
| } | |||
| } | |||
| } else { | |||
| for( ; n > 0; n--, c += ldc) { | |||
| c_offset = c; | |||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||
| vl = VSETVL(chunk); | |||
| vx = VLEV_FLOAT(c_offset, vl); | |||
| vx = VFMULVF_FLOAT(vx, beta, vl); | |||
| VSEV_FLOAT(c_offset, vx, vl); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,197 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m1(n) | |||
| #define FLOAT_V_T vfloat32m1_t | |||
| #define FLOAT_VX2_T vfloat32m1x2_t | |||
| #define FLOAT_VX4_T vfloat32m1x4_t | |||
| #define FLOAT_VX8_T vfloat32m1x8_t | |||
| #define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 | |||
| #define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 | |||
| #define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m1 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m1 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m1(n) | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define FLOAT_VX2_T vfloat64m1x2_t | |||
| #define FLOAT_VX4_T vfloat64m1x4_t | |||
| #define FLOAT_VX8_T vfloat64m1x8_t | |||
| #define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 | |||
| #define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 | |||
| #define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m1 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m1 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_ncopy_8.c | |||
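| // Packs an m x n block of the lda-strided matrix A into contiguous panels | |||
| // of B, 8 columns at a time: the 8 column vectors are gathered into a | |||
| // tuple register group (vset) and interleaved into B with a segmented | |||
| // store (vsseg8); the 4-, 2- and 1-column tails are handled below. | |||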
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset; | |||
| FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; | |||
| FLOAT_VX2_T vx2; | |||
| FLOAT_VX4_T vx4; | |||
| FLOAT_VX8_T vx8; | |||
| size_t vl; | |||
| //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = (n >> 3); j > 0; j--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| a_offset8 = a_offset7 + lda; | |||
| a_offset += 8 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| v5 = VLEV_FLOAT(a_offset5, vl); | |||
| v6 = VLEV_FLOAT(a_offset6, vl); | |||
| v7 = VLEV_FLOAT(a_offset7, vl); | |||
| v8 = VLEV_FLOAT(a_offset8, vl); | |||
| vx8 = VSET_VX8(vx8, 0, v1); | |||
| vx8 = VSET_VX8(vx8, 1, v2); | |||
| vx8 = VSET_VX8(vx8, 2, v3); | |||
| vx8 = VSET_VX8(vx8, 3, v4); | |||
| vx8 = VSET_VX8(vx8, 4, v5); | |||
| vx8 = VSET_VX8(vx8, 5, v6); | |||
| vx8 = VSET_VX8(vx8, 6, v7); | |||
| vx8 = VSET_VX8(vx8, 7, v8); | |||
| VSSEG8_FLOAT(b_offset, vx8, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| a_offset5 += vl; | |||
| a_offset6 += vl; | |||
| a_offset7 += vl; | |||
| a_offset8 += vl; | |||
| b_offset += vl*8; | |||
| } | |||
| } | |||
| if (n & 4) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| vx4 = VSET_VX4(vx4, 0, v1); | |||
| vx4 = VSET_VX4(vx4, 1, v2); | |||
| vx4 = VSET_VX4(vx4, 2, v3); | |||
| vx4 = VSET_VX4(vx4, 3, v4); | |||
| VSSEG4_FLOAT(b_offset, vx4, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| b_offset += vl*4; | |||
| } | |||
| } | |||
| if (n & 2) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| vx2 = VSET_VX2(vx2, 0, v1); | |||
| vx2 = VSET_VX2(vx2, 1, v2); | |||
| VSSEG2_FLOAT(b_offset, vx2, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| b_offset += vl*2; | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| a_offset1 = a_offset; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| VSEV_FLOAT(b_offset, v1, vl); | |||
| a_offset1 += vl; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,76 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #endif | |||
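| // For each group of vl columns, walks down the m rows and gathers one | |||
| // element per column with a strided load (vlse, stride lda), so that B | |||
| // holds vl column entries contiguously for each row. | |||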
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset; | |||
| FLOAT *a_offset1; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v0; | |||
| size_t vl; | |||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = n; j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| a_offset1 = a_offset; | |||
| a_offset += vl * lda; | |||
| for(i = m; i > 0; i--) { | |||
| v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(b_offset, v0, vl); | |||
| a_offset1++; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,273 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m1(n) | |||
| #define FLOAT_V_T vfloat32m1_t | |||
| #define FLOAT_VX2_T vfloat32m1x2_t | |||
| #define FLOAT_VX4_T vfloat32m1x4_t | |||
| #define FLOAT_VX8_T vfloat32m1x8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m1 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m1 | |||
| #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 | |||
| #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 | |||
| #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m1(n) | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define FLOAT_VX2_T vfloat64m1x2_t | |||
| #define FLOAT_VX4_T vfloat64m1x4_t | |||
| #define FLOAT_VX8_T vfloat64m1x8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m1 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m1 | |||
| #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 | |||
| #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 | |||
| #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1; | |||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| FLOAT_V_T v0; | |||
| FLOAT_VX2_T vx2; | |||
| FLOAT_VX4_T vx4; | |||
| FLOAT_VX8_T vx8; | |||
| // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| aoffset = a; | |||
| boffset = b; | |||
| boffset2 = b + m * (n & ~7); | |||
| boffset3 = b + m * (n & ~3); | |||
| boffset4 = b + m * (n & ~1); | |||
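| // boffset2/3/4 mark where the 4-, 2- and 1-column remainder panels start | |||
| // in B, so the row-blocked loops below can write all tails in one pass. | |||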
| for(j = (m >> 3); j > 0; j--) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 8 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 64; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 8; | |||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 8; | |||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 32; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 8; | |||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 16; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 8; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 8; | |||
| } | |||
| } | |||
| if (m & 4) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 4 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 32; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 4; | |||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 4; | |||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 16; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 4; | |||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 8; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 4; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 4; | |||
| } | |||
| } | |||
| if (m & 2) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 2 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 16; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 2; | |||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 2; | |||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 8; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 2; | |||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 4; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 2; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 2; | |||
| } | |||
| } | |||
| if (m & 1) { | |||
| aoffset1 = aoffset; | |||
| boffset1 = boffset; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 8; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset1, v0, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += 8 * m; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 4; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset2, v0, vl); | |||
| aoffset1 += 4; | |||
| //boffset2 += 4; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 2; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset3, v0, vl); | |||
| aoffset1 += 2; | |||
| // boffset3 += 2; | |||
| } | |||
| if (n & 1) { | |||
| *(boffset4) = *(aoffset1); | |||
| // aoffset1 ++; | |||
| // boffset4 ++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,74 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #endif | |||
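| // Copies blocks of vl contiguous elements with a unit-stride load (vle), | |||
| // stepping by lda between loads and writing B back to back; effectively | |||
| // the transpose counterpart of the strided-load copy above. | |||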
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1; | |||
| IFLOAT *boffset; | |||
| FLOAT_V_T v0; | |||
| size_t vl; | |||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||
| aoffset = a; | |||
| boffset = b; | |||
| for(j = n; j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| aoffset1 = aoffset; | |||
| aoffset += vl; | |||
| for(i = m; i > 0; i--) { | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset, v0, vl); | |||
| aoffset1 += lda; | |||
| boffset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,601 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 | |||
| #endif | |||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; | |||
| IFLOAT *ptrba,*ptrbb; | |||
| //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug | |||
| FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; | |||
| size_t vl; | |||
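| // C += alpha * A*B on packed panels: the M dimension is strip-mined by | |||
| // the runtime vector length vl, N is blocked 8/4/2/1, and K is unrolled | |||
| // by 8 with each A load issued ahead of the FMAs that consume the | |||
| // previous one. The straightforward reference loop is kept under "#if 0" | |||
| // in each N block. | |||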
| // N:8 | |||
| for (j = bn/8; j > 0; j--) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| C4 = C3 + ldc; | |||
| C5 = C4 + ldc; | |||
| C6 = C5 + ldc; | |||
| C7 = C6 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| vres4 = VFMVVF_FLOAT(0.0, vl); | |||
| vres5 = VFMVVF_FLOAT(0.0, vl); | |||
| vres6 = VFMVVF_FLOAT(0.0, vl); | |||
| vres7 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 8; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); | |||
| ptrbb += 8; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); | |||
| ptrbb += 8; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); | |||
| ptrbb += 8; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); | |||
| ptrbb += 8; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); | |||
| ptrbb += 8; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); | |||
| ptrbb += 8; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); | |||
| ptrbb += 8; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VLEV_FLOAT(C2, vl); | |||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VLEV_FLOAT(C3, vl); | |||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| va4 = VLEV_FLOAT(C4, vl); | |||
| va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); | |||
| VSEV_FLOAT(C4, va4, vl); | |||
| va5 = VLEV_FLOAT(C5, vl); | |||
| va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); | |||
| VSEV_FLOAT(C5, va5, vl); | |||
| va6 = VLEV_FLOAT(C6, vl); | |||
| va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); | |||
| VSEV_FLOAT(C6, va6, vl); | |||
| va7 = VLEV_FLOAT(C7, vl); | |||
| va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); | |||
| VSEV_FLOAT(C7, va7, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| C4 += vl; | |||
| C5 += vl; | |||
| C6 += vl; | |||
| C7 += vl; | |||
| } | |||
| bb += (bk<<3); | |||
| C += (ldc<<3); | |||
| } | |||
| // N:4 | |||
| if (bn & 4) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 4; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| ptrbb += 4; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| ptrbb += 4; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| ptrbb += 4; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| ptrbb += 4; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| ptrbb += 4; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| ptrbb += 4; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| ptrbb += 4; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VLEV_FLOAT(C2, vl); | |||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VLEV_FLOAT(C3, vl); | |||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| } | |||
| bb += (bk<<2); | |||
| C += (ldc<<2); | |||
| } | |||
| // N:2 | |||
| if (bn & 2) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 2; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| ptrbb += 2; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| ptrbb += 2; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| ptrbb += 2; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| ptrbb += 2; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| ptrbb += 2; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| ptrbb += 2; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| ptrbb += 2; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| } | |||
| bb += (bk<<1); | |||
| C += (ldc<<1); | |||
| } | |||
| // N:1 | |||
| if (bn & 1) { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 1; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| ptrbb += 1; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| ptrbb += 1; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| ptrbb += 1; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| ptrbb += 1; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| ptrbb += 1; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| ptrbb += 1; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| ptrbb += 1; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| C0 += vl; | |||
| } | |||
| bb += (bk); | |||
| C += (ldc); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #endif | |||
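| // GEMV (no transpose): y += alpha * A * x for column-major A. The rows | |||
| // of y are strip-mined by vl; for each strip, one column of A at a time | |||
| // is accumulated into vy with a scalar-vector FMA (vfmacc), switching to | |||
| // strided loads/stores of y when inc_y != 1. | |||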
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| if(n < 0) return(0); | |||
| FLOAT *a_ptr, *x_ptr; | |||
| BLASLONG i; | |||
| FLOAT_V_T va, vy; | |||
| if(inc_y == 1) { | |||
| for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { | |||
| vl = VSETVL(m); | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vy = VLEV_FLOAT(y, vl); | |||
| for(i = 0; i < n; i++) { | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| VSEV_FLOAT(y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { | |||
| vl = VSETVL(m); | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| for(i = 0; i < n; i++) { | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| -#define VSETVL(n) vsetvl_e32m4(n) | |||
| +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| -#define VLEV_FLOAT vle32_v_f32m4 | |||
| -#define VLSEV_FLOAT vlse32_v_f32m4 | |||
| -#define VSEV_FLOAT vse32_v_f32m4 | |||
| -#define VSSEV_FLOAT vsse32_v_f32m4 | |||
| -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #else | |||
| -#define VSETVL(n) vsetvl_e64m4(n) | |||
| +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| -#define VLEV_FLOAT vle64_v_f64m4 | |||
| -#define VLSEV_FLOAT vlse64_v_f64m4 | |||
| -#define VSEV_FLOAT vse64_v_f64m4 | |||
| -#define VSSEV_FLOAT vsse64_v_f64m4 | |||
| -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -0,0 +1,118 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
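| // GEMV (transpose): each element of y is the dot product of one column | |||
| // of A with x, scaled by alpha. Partial products are accumulated across | |||
| // the strip-mined chunks with a tail-undisturbed FMA (the _tu form keeps | |||
| // lanes beyond vl intact), then reduced once per column with vfredusum. | |||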
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_ptr, *x_ptr; | |||
| FLOAT_V_T va, vx, vr; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vlmax = VSETVL_MAX_M1; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
| vlmax = VSETVL_MAX; | |||
| if(inc_x == 1) { | |||
| for(i = 0; i < n; i++) { | |||
| j = m; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { | |||
| vl = VSETVL(j); | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLEV_FLOAT(x_ptr, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| y += inc_y; | |||
| a += lda; | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(i = 0; i < n; i++) { | |||
| j = m; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { | |||
| vl = VSETVL(j); | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLSEV_FLOAT(x_ptr, stride_x, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| y += inc_y; | |||
| a += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||