@@ -219,6 +219,7 @@ In chronological order:
 * Mark Seminatore <https://github.com/mseminatore>
   * [2023-11-09] Improve Windows threading performance scaling
+  * [2024-02-09] Introduce MT_TRACE facility and improve code consistency
 
 * Dirreke <https://github.com/mseminatore>
   * [2024-01-16] Add basic support for the CSKY architecture
@@ -59,6 +59,22 @@ ifeq ($(TARGET), CK860FV)
 TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
 endif
 
+ifeq ($(TARGET), x280)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL256B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL128B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_GENERIC)
+TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
+endif
+
 all: getarch_2nd
 	./getarch_2nd 0 >> $(TARGET_MAKE)
 	./getarch_2nd 1 >> $(TARGET_CONF)
@@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
 CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
+ifeq ($(CORE), x280)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL256B)
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL128B)
+CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_GENERIC)
+CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+endif
@@ -198,6 +198,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the
 ```
 (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
 
+- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
+  ```sh
+  make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
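As a quick runtime check of the DYNAMIC_ARCH selection described above, a caller can query which kernel set was chosen. A minimal sketch (not part of the patch), assuming the OpenBLAS extension functions `openblas_get_corename()` and `openblas_get_num_threads()` declared in cblas.h:

```c
/* Minimal sketch (not part of the patch): confirm at runtime which
   core-specific kernel set a DYNAMIC_ARCH build has selected. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* e.g. "riscv64_zvl128b" or "x280" on the new RISC-V targets */
    printf("core:    %s\n", openblas_get_corename());
    printf("threads: %d\n", openblas_get_num_threads());
    return 0;
}
```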
@@ -118,8 +118,11 @@ Z13
 Z14
 
 10.RISC-V 64:
-RISCV64_GENERIC
+RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
+RISCV64_ZVL128B
 C910V
+x280
+RISCV64_ZVL256B
 
 11.LOONGARCH64:
 LOONGSONGENERIC
@@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
 #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 
+# x280 temporary workaround for gfortran
+ifeq ($(TARGET), x280)
+CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
+endif
+
 ifneq ($(NO_LAPACK), 1)
 GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
@@ -265,9 +271,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
 	ismax.goto idmax.goto \
 	isamin.goto idamin.goto icamin.goto izamin.goto \
 	ismin.goto idmin.goto \
-	samax.goto damax.goto scamax.goto dzamax.goto \
+	samax.goto damax.goto camax.goto zamax.goto \
 	smax.goto dmax.goto \
-	samin.goto damin.goto scamin.goto dzamin.goto \
+	samin.goto damin.goto camin.goto zamin.goto \
 	smin.goto dmin.goto \
 	saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
 	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
@@ -2832,12 +2838,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME)
 damax.goto : damax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## SCAMAX ##############################################
-scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME)
+############################################## CAMAX ##############################################
+camax.goto : camax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## DZAMAX ##############################################
-dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME)
+############################################## ZAMAX ##############################################
+zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
 ############################################## SMAX ##############################################
@@ -2856,12 +2862,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME)
 damin.goto : damin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## SCAMIN ##############################################
-scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME)
+############################################## CAMIN ##############################################
+camin.goto : camin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## DZAMIN ##############################################
-dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME)
+############################################## ZAMIN ##############################################
+zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
 ############################################## SMIN ##############################################
@@ -3383,10 +3389,10 @@ samax.$(SUFFIX) : amax.c
 damax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
 
-scamax.$(SUFFIX) : amax.c
+camax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
 
-dzamax.$(SUFFIX) : amax.c
+zamax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
 
@@ -3403,10 +3409,10 @@ samin.$(SUFFIX) : amin.c
 damin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
 
-scamin.$(SUFFIX) : amin.c
+camin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
 
-dzamin.$(SUFFIX) : amin.c
+zamin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
 
@@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
 
-include $(TOPDIR)/Makefile.tail
+include $(TOPDIR)/Makefile.tail
@@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 
+float  cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
+double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float  cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
+float  cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
+double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float  cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
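Unlike the index-returning `i?amax`/`i?amin` routines above them, these newly exported OpenBLAS extensions return the extremal absolute value itself. A usage sketch of the real double-precision pair, based on the prototypes added in this hunk:

```c
/* Sketch of the value-returning extremum extensions: cblas_damax and
   cblas_damin return the largest/smallest absolute value, not an index
   like cblas_idamax. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    double x[4] = { 1.0, -7.5, 3.0, 0.5 };
    printf("max|x| = %g\n", cblas_damax(4, x, 1));  /* 7.5 */
    printf("min|x| = %g\n", cblas_damin(4, x, 1));  /* 0.5 */
    return 0;
}
```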
@@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+
 void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
@@ -290,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		  OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		  OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                  OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
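`?gemmt` multiplies like `?gemm` with a square result (M rows, K summation depth) but computes only the triangle of C named by `Uplo`, roughly halving the work when the product is known to be symmetric, e.g. C = A·Aᵀ. A sketch based on the prototype added above:

```c
/* Sketch of the new cblas_dgemmt entry point: C := alpha*op(A)*op(B) + beta*C,
   but only the Uplo triangle of the M-by-M matrix C is computed/updated. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    double A[6] = { 1, 2, 3,
                    4, 5, 6 };       /* 2x3, row major */
    double C[4] = { 0, 0, 0, 0 };    /* 2x2; only the upper part is written */

    /* C(upper) := A * A^T, a symmetric product */
    cblas_dgemmt(CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans,
                 2, 3, 1.0, A, 3, A, 3, 0.0, C, 2);

    /* C[2] (the strict lower triangle) is left untouched */
    printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  /* 14 32 / 0 77 */
    return 0;
}
```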
@@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *,
 void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 
+void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float *, blasint *, float *, blasint *, float *, float *, blasint *);
+void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float *, blasint *, float *, blasint *, float *, float *, blasint *);
+void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+
 int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float *, blasint *, float *, blasint *,
 		     float *, float *, blasint *);
@@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
 
 void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
 void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
+void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
 
 void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
 void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
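The `caxpby`/`zaxpby` fix matters because alpha and beta are complex scalars passed by reference through the Fortran calling convention; declaring them `float *`/`double *` invited callers to pass a single real value. A sketch using the CBLAS wrapper, assuming its usual `void *alpha ... void *beta` signature (not shown in this patch):

```c
/* Sketch: complex alpha/beta for ?axpby are pointers to a (re,im) pair,
   which is why the prototypes above now take void * instead of float *. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
    float alpha[2] = { 2.0f, 0.0f };            /* 2 + 0i */
    float beta[2]  = { 1.0f, 0.0f };            /* 1 + 0i */
    float x[4] = { 1.0f, 1.0f, 2.0f, 2.0f };    /* interleaved re,im */
    float y[4] = { 0.5f, 0.0f, 0.5f, 0.0f };

    cblas_caxpby(2, alpha, x, 1, beta, y, 1);   /* y := alpha*x + beta*y */
    printf("y = (%g,%g) (%g,%g)\n", y[0], y[1], y[2], y[3]);
    return 0;
}
```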
@@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V)
-#include <riscv_vector.h>
+#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
+# include <riscv_vector.h>
+#endif
+
+#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
+// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
+#define RISCV_0p10_INTRINSICS
+#define RISCV_RVV(x) x
+#else
+#define RISCV_RVV(x) __riscv_ ## x
+#endif
+
+#if defined(C910V) || defined(RISCV64_ZVL256B)
+# if !defined(DOUBLE)
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
+# else
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
+# endif
+#else
+# define EXTRACT_FLOAT(v) (v[0])
 #endif
 
 #endif
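The `RISCV_RVV` macro papers over the intrinsic renaming between toolchains: v1.0-intrinsics compilers expect a `__riscv_` prefix, while the legacy T-Head (v0.7) toolchain uses unprefixed names. A host-compilable sketch of the token-pasting idea; the stand-in function is hypothetical, only so the snippet runs anywhere:

```c
#include <stdio.h>

/* the "modern toolchain" branch of the macro above */
#define RISCV_RVV(x) __riscv_ ## x

/* hypothetical stand-in for the real RVV intrinsic */
static float __riscv_vfmv_f_s_f32m1_f32(float v) { return v; }

int main(void) {
    /* expands to __riscv_vfmv_f_s_f32m1_f32(1.5f) */
    printf("%g\n", RISCV_RVV(vfmv_f_s_f32m1_f32)(1.5f));
    return 0;
}
```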
@@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#define CPU_GENERIC 0
-#define CPU_C910V   1
+#define CPU_GENERIC         0
+#define CPU_C910V           1
+#define CPU_x280            2
+#define CPU_RISCV64_ZVL256B 3
+#define CPU_RISCV64_ZVL128B 4
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
-  "C910V"
+  "C910V",
+  "x280",
+  "CPU_RISCV64_ZVL256B",
+  "CPU_RISCV64_ZVL128B"
+};
+
+static char *cpuname_lower[] = {
+  "riscv64_generic",
+  "c910v",
+  "x280",
+  "riscv64_zvl256b",
+  "riscv64_zvl128b"
 };
 
 int detect(void){
@@ -86,23 +100,29 @@ int detect(void){
   char *pmodel = NULL, *pisa = NULL;
 
   infile = fopen("/proc/cpuinfo", "r");
+  if (!infile)
+    return CPU_GENERIC;
+
   while (fgets(buffer, sizeof(buffer), infile)){
     if(!strncmp(buffer, "model name", 10)){
       strcpy(model_buffer, buffer);
-      pmodel = strchr(isa_buffer, ':') + 1;
+      pmodel = strchr(model_buffer, ':');
+      if (pmodel)
+        pmodel++;
     }
 
     if(!strncmp(buffer, "isa", 3)){
      strcpy(isa_buffer, buffer);
-      pisa = strchr(isa_buffer, '4') + 1;
+      pisa = strchr(isa_buffer, '4');
+      if (pisa)
+        pisa++;
     }
   }
 
   fclose(infile);
 
-  if (!pmodel)
+  if (!pmodel || !pisa)
     return(CPU_GENERIC);
 
   if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
     return CPU_C910V;
@@ -140,5 +160,5 @@ void get_cpuconfig(void){
 }
 
 void get_libname(void){
-	printf("riscv64\n");
+	printf("%s", cpuname_lower[detect()]);
 }
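The `detect()` changes harden the /proc/cpuinfo parsing: the old code dereferenced `strchr(...) + 1` unconditionally (and searched the wrong buffer for the model name), which crashes when a field is absent. The pattern, as a standalone sketch:

```c
/* Sketch of the hardened parsing pattern: strchr may return NULL when the
   expected field is missing, so only advance the pointer after a check. */
#include <stdio.h>
#include <string.h>

int main(void) {
    char isa_buffer[] = "isa\t\t: rv64imafdcv";
    char *pisa = strchr(isa_buffer, '4');
    if (pisa)
        pisa++;                        /* safe: advance only on a match */
    if (pisa && strchr(pisa, 'v'))
        puts("vector extension present");
    return 0;
}
```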
@@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM)
 ifeq ($(C_COMPILER), GCC)
 CEXTRALIB += -lgomp
 endif
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB += -lomp
+endif
 endif
 endif
 
@@ -96,7 +96,7 @@
       INTEGER           ICAMAXTEST
       EXTERNAL          SCASUMTEST, SCNRM2TEST, ICAMAXTEST
 *     .. External Subroutines ..
-      EXTERNAL          CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
+      EXTERNAL          CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
 *     .. Intrinsic Functions ..
       INTRINSIC         MAX
 *     .. Common blocks ..
@@ -214,8 +214,8 @@
          CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
     +                STRUE4(NP1),SFAC)
       ELSE IF (ICASE.EQ.8) THEN
-*        .. CSCAL ..
-         CALL CSCAL(N,CA,CX,INCX)
+*        .. CSCALTEST ..
+         CALL CSCALTEST(N,CA,CX,INCX)
          CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
     +               SFAC)
       ELSE IF (ICASE.EQ.9) THEN
@@ -236,14 +236,14 @@
 *
       INCX = 1
       IF (ICASE.EQ.8) THEN
-*        CSCAL
+*        CSCALTEST
 *        Add a test for alpha equal to zero.
          CA = (0.0E0,0.0E0)
         DO 80 I = 1, 5
            MWPCT(I) = (0.0E0,0.0E0)
            MWPCS(I) = (1.0E0,1.0E0)
    80    CONTINUE
-         CALL CSCAL(5,CA,CX,INCX)
+         CALL CSCALTEST(5,CA,CX,INCX)
         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
       ELSE IF (ICASE.EQ.9) THEN
 *        CSSCALTEST
@@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
     extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
     static complex mwpcs[5], mwpct[5];
     extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
+    extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
     static complex cx[8];
     extern real scnrm2test_(integer*, complex*, integer*);
     static integer np1;
@@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
 	    stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
 	} else if (combla_1.icase == 8) {
 /*           .. CSCAL .. */
-	    cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
+	    cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
 	    ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
 		    &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
 	} else if (combla_1.icase == 9) {
@@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
 		mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
 /* L80: */
 	    }
-	    cscal_(&c__5, &ca, cx, &combla_1.incx);
+	    cscaltest_(&c__5, &ca, cx, &combla_1.incx);
 	    ctest_(&c__5, cx, mwpct, mwpcs, sfac);
 	} else if (combla_1.icase == 9) {
 /*           CSSCALTEST */
@@ -48,6 +48,12 @@
 #endif
 #endif
 
+#ifdef SMP_DEBUG
+# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
+#else
+# define MT_TRACE(...)
+#endif
+
 /* This is a thread implementation for Win32 lazy implementation */
 
 /* Thread server common information */
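MT_TRACE collapses the many `#ifdef SMP_DEBUG` / `fprintf` / `#endif` triples in this file into single-line call sites that compile to nothing in release builds. A standalone sketch of the same pattern:

```c
#include <stdio.h>

#define SMP_DEBUG                     /* remove for a silent build */

#ifdef SMP_DEBUG
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
#else
# define MT_TRACE(...)                /* expands to nothing */
#endif

int main(void) {
    MT_TRACE("Server[%2d] Thread is started!\n", 0);
    return 0;
}
```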
@@ -68,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
 static DWORD blas_threads_id[MAX_CPU_NUMBER];
 static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
 
-#if defined (__GNUC__) && (__GNUC__ < 6)
-	#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
-#else
-	#if defined(_WIN64)
-		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
-	#else
-		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
-	#endif
-#endif
-
-static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
+//
+// Legacy code path
+//
+static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
 
-  if (!(mode & BLAS_COMPLEX)){
+  if (!(mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
     if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
       /* REAL / Extended Double */
@@ -95,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	     args -> c, args -> ldc, sb);
     } else
 #endif
-      if ((mode & BLAS_PREC) == BLAS_DOUBLE){
+    if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
       /* REAL / Double */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
 		    double *, BLASLONG, double *, BLASLONG,
@@ -106,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	     args -> a, args -> lda,
 	     args -> b, args -> ldb,
 	     args -> c, args -> ldc, sb);
-    } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
+    } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
       /* REAL / Single */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
 		    float *, BLASLONG, float *, BLASLONG,
@@ -118,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	     args -> b, args -> ldb,
 	     args -> c, args -> ldc, sb);
 #ifdef BUILD_BFLOAT16
-    } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
+    } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
       /* REAL / BFLOAT16 */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
 		    bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
@@ -129,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	     args -> a, args -> lda,
 	     args -> b, args -> ldb,
 	     args -> c, args -> ldc, sb);
-    } else if ((mode & BLAS_PREC) == BLAS_STOBF16){
+    } else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
       /* REAL / BLAS_STOBF16 */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
 		    float *, BLASLONG, bfloat16 *, BLASLONG,
@@ -140,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	     args -> a, args -> lda,
 	     args -> b, args -> ldb,
 	     args -> c, args -> ldc, sb);
-    } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
+    } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
       /* REAL / BLAS_DTOBF16 */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
 		    double *, BLASLONG, bfloat16 *, BLASLONG,
@@ -157,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
     }
   } else {
 #ifdef EXPRECISION
-    if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
+    if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
       /* COMPLEX / Extended Double */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
 		    xdouble *, BLASLONG, xdouble *, BLASLONG,
@@ -171,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	     args -> c, args -> ldc, sb);
     } else
 #endif
-    if ((mode & BLAS_PREC) == BLAS_DOUBLE){
+    if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
       /* COMPLEX / Double */
       void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
 		    double *, BLASLONG, double *, BLASLONG,
@@ -201,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
   }
 }
 
-/* This is a main routine of threads. Each thread waits until job is */
-/* queued.                                                           */
-
-static DWORD WINAPI blas_thread_server(void *arg){
+//
+// This is a main routine of threads. Each thread waits until job is queued.
+//
+static DWORD WINAPI blas_thread_server(void *arg) {
 
   /* Thread identifier */
   BLASLONG cpu = (BLASLONG)arg;
@@ -215,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
   /* Each server needs each buffer */
   buffer = blas_memory_alloc(2);
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
-#endif
+  MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
 
-  while (1){
+  while (1) {
 
     /* Waiting for Queue */
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
-#endif
-
-    // event raised when work is added to the queue
-    WaitForSingleObject(kickoff_event, INFINITE);
-
-    if (cpu > thread_target - 2)
-    {
-      //printf("thread [%d] exiting.\n", cpu);
-      break;	// excess thread, so worker thread exits
-    }
+    MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
-#endif
+    // event raised when work is added to the queue
+    WaitForSingleObject(kickoff_event, INFINITE);
+
+    if (cpu > thread_target - 2) {
+      //MT_TRACE("thread [%d] exiting.\n", cpu);
+      break;	// excess thread, so worker thread exits
+    }
+
+    MT_TRACE("Server[%2ld] Got it.\n", cpu);
 
-#if 1
     EnterCriticalSection(&queue_lock);
 
     queue = work_queue;
@@ -247,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){
     if (work_queue)
       work_queue = work_queue->next;
     LeaveCriticalSection(&queue_lock);
-#else
-    volatile blas_queue_t* queue_next;
-    INT_PTR prev_value;
-    do {
-      queue = (volatile blas_queue_t*)work_queue;
-      if (!queue)
-        break;
-      queue_next = (volatile blas_queue_t*)queue->next;
-      prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
-    } while (prev_value != queue);
-#endif
 
-    if (queue) {
+    if (queue) {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
       sa = queue -> sa;
       sb = queue -> sb;
 
-#ifdef CONSISTENT_FPCSR
-      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
-      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
-#endif
+#ifdef CONSISTENT_FPCSR
+      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
+      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
+#endif
 
-#ifdef SMP_DEBUG
-      fprintf(STDERR, "Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
+      MT_TRACE("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
 	       cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
-#endif
 
       // fprintf(stderr, "queue start[%ld]!!!\n", cpu);
 
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING1;
-#endif
+#ifdef MONITOR
+      main_status[cpu] = MAIN_RUNNING1;
+#endif
 
-      if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+      if (sa == NULL)
+        sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
 
       if (sb == NULL) {
-	if (!(queue -> mode & BLAS_COMPLEX)){
+        if (!(queue -> mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
-	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+          if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
 	    sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
 			    + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 	  } else
 #endif
-	    if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
+          if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
 #ifdef BUILD_DOUBLE
 	      sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
 			      + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@@ -327,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){
 	    /* Other types in future */
 	  }
 	}
-	queue->sb=sb;
+        queue->sb=sb;
       }
 
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING2;
-#endif
+#ifdef MONITOR
+      main_status[cpu] = MAIN_RUNNING2;
+#endif
 
       if (!(queue -> mode & BLAS_LEGACY)) {
-	(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
+        (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
       } else {
-	legacy_exec(routine, queue -> mode, queue -> args, sb);
+        legacy_exec(routine, queue -> mode, queue -> args, sb);
       }
-    }else{
-      continue; //if queue == NULL
-    }
+    } else {
+      continue; //if queue == NULL
+    }
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
-#endif
+    MT_TRACE("Server[%2ld] Finished!\n", cpu);
 
-    queue->finished = 1;
+    queue->finished = 1;
   }
 
   /* Shutdown procedure */
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
-#endif
+  MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
 
   blas_memory_free(buffer);
 
   return 0;
-  }
+}
 
-/* Initializing routine */
-int blas_thread_init(void){
+//
+// Initializing routine
+//
+int blas_thread_init(void) {
   BLASLONG i;
 
   if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
 
   LOCK_COMMAND(&server_lock);
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
-	  blas_cpu_number);
-#endif
+  MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
 
-  if (!blas_server_avail){
-    // create the kickoff Event
-    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
+  if (!blas_server_avail) {
+    // create the kickoff Event
+    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
-    thread_target = blas_cpu_number;
+    thread_target = blas_cpu_number;
 
     InitializeCriticalSection(&queue_lock);
 
-    for(i = 0; i < blas_cpu_number - 1; i++){
-      //printf("thread_init: creating thread [%d]\n", i);
+    for(i = 0; i < blas_cpu_number - 1; i++) {
+      //MT_TRACE("thread_init: creating thread [%d]\n", i);
 
       blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
@@ -400,15 +371,12 @@ int blas_thread_init(void){
 
   return 0;
 }
 
-/*
-   User can call one of two routines.
-
-     exec_blas_async ... immediately returns after jobs are queued.
-
-     exec_blas       ... returns after jobs are finished.
-*/
-int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
+//
+// User can call one of two routines.
+//   exec_blas_async ... immediately returns after jobs are queued.
+//   exec_blas       ... returns after jobs are finished.
+//
+int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
 
 #if defined(SMP_SERVER)
   // Handle lazy re-init of the thread-pool after a POSIX fork
@@ -428,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif
 
-	current->finished = 0;
+    current->finished = 0;
     current = current -> next;
     pos ++;
   }
@@ -437,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
   if (!work_queue)
   {
-	  work_queue = queue;
+    work_queue = queue;
   }
   else
   {
     blas_queue_t *next_item = work_queue;
 
-	  // find the end of the work queue
-	  while (next_item)
-		  next_item = next_item->next;
+    // find the end of the work queue
+    while (next_item)
+      next_item = next_item->next;
 
-	  // add new work to the end
-	  next_item = queue;
+    // add new work to the end
+    next_item = queue;
   }
 
   LeaveCriticalSection(&queue_lock);
@@ -458,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
   return 0;
 }
 
-int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
+//
+// Join. Wait for all queued tasks to complete
+//
+int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Synchronization Waiting.\n");
-#endif
+  MT_TRACE("Synchronization Waiting.\n");
 
-  while (num){
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Waiting Queue ..\n");
-#endif
-    while (!queue->finished)
-      YIELDING;
-    queue = queue->next;
-    num--;
-  }
+  while (num) {
+    MT_TRACE("Waiting Queue ..\n");
+
+    while (!queue->finished)
+      YIELDING;
+
+    queue = queue->next;
+    num--;
+  }
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Completely Done.\n\n");
-#endif
+  MT_TRACE("Completely Done.\n\n");
 
   // if work was added to the queue after this batch we can't sleep the worker threads
   // by resetting the event
   EnterCriticalSection(&queue_lock);
| /* Execute Threads */ | |||||
| int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||||
| // | |||||
| // Execute Threads | |||||
| // | |||||
| int exec_blas(BLASLONG num, blas_queue_t *queue) { | |||||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | ||||
| // Handle lazy re-init of the thread-pool after a POSIX fork | // Handle lazy re-init of the thread-pool after a POSIX fork | ||||
| @@ -504,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||||
| if ((num <= 0) || (queue == NULL)) return 0; | if ((num <= 0) || (queue == NULL)) return 0; | ||||
| if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); | |||||
| if ((num > 1) && queue -> next) | |||||
| exec_blas_async(1, queue -> next); | |||||
| routine = queue -> routine; | routine = queue -> routine; | ||||
| if (queue -> mode & BLAS_LEGACY) { | if (queue -> mode & BLAS_LEGACY) { | ||||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | ||||
| } else | |||||
| } else { | |||||
| if (queue -> mode & BLAS_PTHREAD) { | if (queue -> mode & BLAS_PTHREAD) { | ||||
| void (*pthreadcompat)(void *) = queue -> routine; | void (*pthreadcompat)(void *) = queue -> routine; | ||||
| (pthreadcompat)(queue -> args); | (pthreadcompat)(queue -> args); | ||||
| } else | } else | ||||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, | (routine)(queue -> args, queue -> range_m, queue -> range_n, | ||||
| queue -> sa, queue -> sb, 0); | |||||
| queue -> sa, queue -> sb, 0); | |||||
| } | |||||
| if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); | |||||
| if ((num > 1) && queue -> next) | |||||
| exec_blas_async_wait(num - 1, queue -> next); | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| /* Shutdown procedure, but user don't have to call this routine. The */ | |||||
| /* kernel automatically kill threads. */ | |||||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||||
| // | |||||
| // Shutdown procedure, but user don't have to call this routine. The | |||||
| // kernel automatically kill threads. | |||||
| // | |||||
| int BLASFUNC(blas_thread_shutdown)(void) { | |||||
| int i; | int i; | ||||
@@ -534,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   LOCK_COMMAND(&server_lock);
 
-  if (blas_server_avail){
+  if (blas_server_avail) {
 
-    for(i = 0; i < blas_num_threads - 1; i++){
+    for (i = 0; i < blas_num_threads - 1; i++) {
 
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@@ -558,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   return 0;
 }
 
+//
+// Legacy function to set number of threads
+//
 void goto_set_num_threads(int num_threads)
 {
   long i;
@@ -571,7 +547,7 @@ void goto_set_num_threads(int num_threads)
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 
-  if (blas_server_avail && num_threads < blas_num_threads) {
+  if (blas_server_avail && num_threads < blas_num_threads) {
     LOCK_COMMAND(&server_lock);
 
     thread_target = num_threads;
@@ -579,11 +555,11 @@ void goto_set_num_threads(int num_threads)
 
     SetEvent(kickoff_event);
 
     for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
-      //printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+      //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
 
       WaitForSingleObject(blas_threads[i], INFINITE);
 
-      //printf("set_num_threads: thread [%d] has quit.\n", i);
+      //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
       CloseHandle(blas_threads[i]);
     }
@@ -601,8 +577,8 @@ void goto_set_num_threads(int num_threads)
 
     thread_target = num_threads;
 
-  //increased_threads = 1;
-  if (!blas_server_avail){
+    //increased_threads = 1;
+    if (!blas_server_avail) {
 
       // create the kickoff Event
       kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
@@ -611,8 +587,8 @@ void goto_set_num_threads(int num_threads)
 
       blas_server_avail = 1;
     }
 
-    for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
-      //printf("set_num_threads: creating thread [%d]\n", i);
+    for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
+      //MT_TRACE("set_num_threads: creating thread [%d]\n", i);
 
       blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
@@ -627,6 +603,9 @@ void goto_set_num_threads(int num_threads)
 
   blas_cpu_number = num_threads;
 }
 
+//
+// OpenBLAS function to set thread count
+//
 void openblas_set_num_threads(int num)
 {
   goto_set_num_threads(num);
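Caller-side view of the entry points wired up above: `openblas_set_num_threads` forwards to `goto_set_num_threads`, which grows the pool or signals excess workers to exit via `thread_target` and the kickoff event. A sketch (not part of the patch), assuming the `openblas_get_num_threads()` extension is also available:

```c
#include <stdio.h>
#include <cblas.h>

int main(void) {
    openblas_set_num_threads(4);     /* forwards to goto_set_num_threads */
    printf("now using %d threads\n", openblas_get_num_threads());

    openblas_set_num_threads(1);     /* shrink: excess workers wake and exit */
    return 0;
}
```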
@@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR;
 #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
 #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
 #define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
 #else
 extern gotoblas_t gotoblas_HASWELL;
 extern gotoblas_t gotoblas_ZEN;
@@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
 #define CPU_POWER9  9
 #define CPU_POWER10 10
 
+#ifndef POWER_9
+#define POWER_9 0x20000 /* 9 class CPU */
+#endif
+#ifndef POWER_10
+#define POWER_10 0x40000 /* 10 class CPU */
+#endif
+
 #ifdef _AIX
 #include <sys/systemcfg.h>
 
@@ -62,7 +69,7 @@ static int cpuid(void)
 	else if (arch == POWER_9) return CPU_POWER9;
 #endif
 #ifdef POWER_10
-	else if (arch == POWER_10) return CPU_POWER10;
+	else if (arch >= POWER_10) return CPU_POWER10;
 #endif
 	return CPU_UNKNOWN;
 }
@@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
   if (gotoblas && gotoblas -> init) {
     strncpy(coren,gotoblas_corename(),20);
     sprintf(coremsg, "Core: %s\n",coren);
+    if (getenv("GET_OPENBLAS_CORETYPE")) {
+      fprintf(stderr, "%s", coremsg);
+    }
     openblas_warning(2, coremsg);
     gotoblas -> init();
   } else {
@@ -3214,7 +3214,7 @@ void blas_shutdown(void){
 #endif
     memory[pos].lock = 0;
   }
-  if (memory_overflowed)
+  if (memory_overflowed) {
     for (pos = 0; pos < NEW_BUFFERS; pos ++){
       newmemory[pos].addr = (void *)0;
       newmemory[pos].used = 0;
@@ -3222,6 +3222,10 @@ void blas_shutdown(void){
       newmemory[pos].pos = -1;
 #endif
       newmemory[pos].lock = 0;
     }
+    free(newmemory);
+    newmemory = NULL;
+    memory_overflowed = 0;
+  }
 
   UNLOCK_COMMAND(&alloc_lock);
@@ -60,6 +60,7 @@ cblasobjsc="
          cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
          cblas_scnrm2 cblas_scasum cblas_cgemmt
          cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
+         cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
          "
 cblasobjsd="
          cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
@@ -69,6 +70,7 @@ cblasobjsd="
          cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
          cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
          cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
+         cblas_damax cblas_damin
          "
 
 cblasobjss="
@@ -80,6 +82,7 @@ cblasobjss="
          cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
          cblas_strsv cblas_sgeadd cblas_sgemmt
          cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
+         cblas_samax cblas_samin
          "
 
 cblasobjsz="
@@ -91,6 +94,7 @@ cblasobjsz="
          cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
          cblas_zaxpby cblas_zgeadd cblas_zgemmt
          cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
+         cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
          "
 
 cblasobjs="cblas_xerbla"
| @@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z | |||||
| zgedmd | zgedmd | ||||
| zgedmdq | zgedmdq | ||||
| " | " | ||||
| # functions added after LAPACK 3.11 | |||||
| lapackobjs2c="$lapackobjs2c | |||||
| claqp2rk | |||||
| claqp3rk | |||||
| ctrsyl3 | |||||
| " | |||||
| # claqz0 | |||||
| # claqz1 | |||||
| # claqz2 | |||||
| # claqz3 | |||||
| # clatrs3 | |||||
| lapackobjs2d="$lapackobjs2d | |||||
| dgelqs | |||||
| dgelst | |||||
| dgeqp3rk | |||||
| dgeqrs | |||||
| dlaqp2rk | |||||
| dlaqp3rk | |||||
| dlarmm | |||||
| dlatrs3 | |||||
| dtrsyl3 | |||||
| " | |||||
| # dlaqz0 | |||||
| # dlaqz1 | |||||
| # dlaqz2 | |||||
| # dlaqz3 | |||||
| # dlaqz4 | |||||
| lapackobjs2z="$lapackobjs2z | |||||
| zgelqs | |||||
| zgelst | |||||
| zgeqp3rk | |||||
| zgeqrs | |||||
| zlaqp2rk | |||||
| zlaqp3rk | |||||
| zlatrs3 | |||||
| zrscl | |||||
| ztrsyl3 | |||||
| " | |||||
| # zlaqz0 | |||||
| # zlaqz1 | |||||
| # zlaqz2 | |||||
| # zlaqz3 | |||||
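The names left commented out above (the `laqz*` QZ helpers and, for some types, `latrs3`) are apparently kept out of the export list for now; the surrounding uncommented entries are the LAPACK routines newly picked up after the 3.11 baseline.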
| lapack_extendedprecision_objs=" | lapack_extendedprecision_objs=" | ||||
| zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx | zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx | ||||
| dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx | dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx | ||||
| @@ -1622,6 +1673,14 @@ lapackeobjsc=" | |||||
| LAPACKE_cgetsqrhrt_work | LAPACKE_cgetsqrhrt_work | ||||
| LAPACKE_cungtsqr_row | LAPACKE_cungtsqr_row | ||||
| LAPACKE_cungtsqr_row_work | LAPACKE_cungtsqr_row_work | ||||
| LAPACKE_clangb | |||||
| LAPACKE_clangb_work | |||||
| LAPACKE_ctrsyl3 | |||||
| LAPACKE_ctrsyl3_work | |||||
| LAPACKE_ctz_nancheck | |||||
| LAPACKE_ctz_trans | |||||
| LAPACKE_cunhr_col | |||||
| LAPACKE_cunhr_col_work | |||||
| " | " | ||||
| lapackeobjsd=" | lapackeobjsd=" | ||||
| @@ -2239,6 +2298,14 @@ lapackeobjsd=" | |||||
| LAPACKE_dgetsqrhrt_work | LAPACKE_dgetsqrhrt_work | ||||
| LAPACKE_dorgtsqr_row | LAPACKE_dorgtsqr_row | ||||
| LAPACKE_dorgtsqr_row_work | LAPACKE_dorgtsqr_row_work | ||||
| LAPACKE_dlangb | |||||
| LAPACKE_dlangb_work | |||||
| LAPACKE_dorhr_col | |||||
| LAPACKE_dorhr_col_work | |||||
| LAPACKE_dtrsyl3 | |||||
| LAPACKE_dtrsyl3_work | |||||
| LAPACKE_dtz_nancheck | |||||
| LAPACKE_dtz_trans | |||||
| " | " | ||||
| lapackeobjss=" | lapackeobjss=" | ||||
| @@ -2848,6 +2915,14 @@ lapackeobjss=" | |||||
| LAPACKE_sgetsqrhrt_work | LAPACKE_sgetsqrhrt_work | ||||
| LAPACKE_sorgtsqr_row | LAPACKE_sorgtsqr_row | ||||
| LAPACKE_sorgtsqr_row_work | LAPACKE_sorgtsqr_row_work | ||||
| LAPACKE_slangb | |||||
| LAPACKE_slangb_work | |||||
| LAPACKE_sorhr_col | |||||
| LAPACKE_sorhr_col_work | |||||
| LAPACKE_strsyl3 | |||||
| LAPACKE_strsyl3_work | |||||
| LAPACKE_stz_nancheck | |||||
| LAPACKE_stz_trans | |||||
| " | " | ||||
| lapackeobjsz=" | lapackeobjsz=" | ||||
| @@ -3515,6 +3590,14 @@ lapackeobjsz=" | |||||
| LAPACKE_zgetsqrhrt_work | LAPACKE_zgetsqrhrt_work | ||||
| LAPACKE_zungtsqr_row | LAPACKE_zungtsqr_row | ||||
| LAPACKE_zungtsqr_row_work | LAPACKE_zungtsqr_row_work | ||||
| LAPACKE_zlangb | |||||
| LAPACKE_zlangb_work | |||||
| LAPACKE_ztrsyl3 | |||||
| LAPACKE_ztrsyl3_work | |||||
| LAPACKE_ztz_nancheck | |||||
| LAPACKE_ztz_trans | |||||
| LAPACKE_zunhr_col | |||||
| LAPACKE_zunhr_col_work | |||||
| " | " | ||||
| ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` | ||||
| ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the | ||||
| @@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s=" | |||||
| ssysv_aa_2stage ssytrf_aa_2stage | ssysv_aa_2stage ssytrf_aa_2stage | ||||
| ssytrs_aa_2stage | ssytrs_aa_2stage | ||||
| slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col | slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col | ||||
| slarfb_gett | |||||
| " | " | ||||
| lapack_embeded_underscore_objs_c=" | lapack_embeded_underscore_objs_c=" | ||||
| chetf2_rook chetrf_rook chetri_rook | chetf2_rook chetrf_rook chetri_rook | ||||
| @@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c=" | |||||
| csysv_aa_2stage csytrf_aa_2stage | csysv_aa_2stage csytrf_aa_2stage | ||||
| csytrs_aa_2stage | csytrs_aa_2stage | ||||
| claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col | claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col | ||||
| clarfb_gett | |||||
| " | " | ||||
| lapack_embeded_underscore_objs_d=" | lapack_embeded_underscore_objs_d=" | ||||
| dlasyf_rook | dlasyf_rook | ||||
| @@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d=" | |||||
| dsysv_aa_2stage | dsysv_aa_2stage | ||||
| dsytrf_aa_2stage dsytrs_aa_2stage | dsytrf_aa_2stage dsytrs_aa_2stage | ||||
| dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col | dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col | ||||
| dlarfb_gett | |||||
| " | " | ||||
| lapack_embeded_underscore_objs_z=" | lapack_embeded_underscore_objs_z=" | ||||
| zhetf2_rook zhetrf_rook zhetri_rook | zhetf2_rook zhetrf_rook zhetri_rook | ||||
| @@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z=" | |||||
| zhetrs_aa_2stage zsysv_aa_2stage | zhetrs_aa_2stage zsysv_aa_2stage | ||||
| zsytrf_aa_2stage zsytrs_aa_2stage | zsytrf_aa_2stage zsytrs_aa_2stage | ||||
| zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col | zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col | ||||
| zlarfb_gett | |||||
| " | " | ||||
| dirname=`pwd -P`/../lapack-netlib | dirname=`pwd -P`/../lapack-netlib | ||||
| @@ -1679,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define LIBNAME "c910v" | #define LIBNAME "c910v" | ||||
| #define CORENAME "C910V" | #define CORENAME "C910V" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifdef FORCE_x280 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "RISCV64" | |||||
| #define SUBARCHITECTURE "x280" | |||||
| #define SUBDIRNAME "riscv64" | |||||
| #define ARCHCONFIG "-Dx280 " \ | |||||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||||
| #define LIBNAME "x280" | |||||
| #define CORENAME "x280" | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| #ifdef FORCE_RISCV64_ZVL256B | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "RISCV64" | |||||
| #define SUBARCHITECTURE "RISCV64_ZVL256B" | |||||
| #define SUBDIRNAME "riscv64" | |||||
| #define ARCHCONFIG "-DRISCV64_ZVL256B " \ | |||||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||||
| #define LIBNAME "riscv64_zvl256b" | |||||
| #define CORENAME "RISCV64_ZVL256B" | |||||
| #endif | |||||
| #ifdef FORCE_RISCV64_ZVL128B | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "RISCV64" | |||||
| #define SUBARCHITECTURE "RISCV64_ZVL128B" | |||||
| #define SUBDIRNAME "riscv64" | |||||
| #define ARCHCONFIG "-DRISCV64_ZVL128B " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||||
| #define LIBNAME "riscv64_zvl128b" | |||||
| #define CORENAME "RISCV64_ZVL128B" | |||||
| #endif | |||||
| #if defined(FORCE_E2K) || defined(__e2k__) | #if defined(FORCE_E2K) || defined(__e2k__) | ||||
| #define FORCE | #define FORCE | ||||
| @@ -119,6 +119,7 @@ endif () | |||||
| if (BUILD_BFLOAT16) | if (BUILD_BFLOAT16) | ||||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | ||||
| @@ -130,6 +131,8 @@ endif () | |||||
| foreach (float_type ${FLOAT_TYPES}) | foreach (float_type ${FLOAT_TYPES}) | ||||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | ||||
| GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) | |||||
| GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) | ||||
| GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) | ||||
| GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) | ||||
| @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ | |||||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | ||||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | ||||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | ||||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ | |||||
| cblas_samin.$(SUFFIX) | |||||
| CSBLAS2OBJS = \ | CSBLAS2OBJS = \ | ||||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | ||||
| @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ | |||||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | ||||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | ||||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | ||||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ | |||||
| cblas_damin.$(SUFFIX) | |||||
| CDBLAS2OBJS = \ | CDBLAS2OBJS = \ | ||||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | ||||
| @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ | |||||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | ||||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | ||||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | ||||
| cblas_caxpby.$(SUFFIX) \ | |||||
| cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ | |||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) | ||||
| CCBLAS2OBJS = \ | CCBLAS2OBJS = \ | ||||
| @@ -340,12 +342,12 @@ CXERBLAOBJ = \ | |||||
| CZBLAS1OBJS = \ | CZBLAS1OBJS = \ | ||||
| cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ | ||||
| cblas_zcopy.$(SUFFIX) \ | |||||
| cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ | |||||
| cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ | ||||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | ||||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | ||||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | ||||
| cblas_zaxpby.$(SUFFIX) \ | |||||
| cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ | |||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) | ||||
| @@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||||
| ifeq ($(BUILD_BFLOAT16),1) | ifeq ($(BUILD_BFLOAT16),1) | ||||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | ||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||||
| $(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
| endif | endif | ||||
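`sbgemmt` previously reused `gemmt.c`; it now builds from a dedicated `sbgemmt.c` (the new file added later in this patch), since the bfloat16 variant mixes precisions: A and B are bfloat16 (`IFLOAT`) while alpha, beta and C stay `float` (`FLOAT`). A signature-shape sketch, with a stand-in typedef for OpenBLAS's `bfloat16`:

```c
/* Signature sketch only: bfloat16 inputs, float accumulation and output. */
typedef unsigned short bfloat16_t;  /* stand-in for OpenBLAS's bfloat16 */

void sbgemmt_sketch(char uplo, char transa, char transb,
                    int m, int k, float alpha,
                    const bfloat16_t *a, int lda,
                    const bfloat16_t *b, int ldb,
                    float beta, float *c, int ldc);
```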
| @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||||
| cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | ||||
| cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) | |||||
| cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) | |||||
| cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
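All eight new amax/amin objects (backing the `cblas_?amax`/`cblas_?amin` exports added earlier in this patch) compile from the single `max.c` source: `-DUSE_ABS` selects comparison by absolute value and `-DUSE_MIN` flips the reduction direction. A self-contained sketch of that flag matrix:

```c
/* Sketch: how one source yields max/amax/amin via preprocessor flags. */
#include <math.h>
#include <stddef.h>

#ifdef USE_ABS
#define VAL(x) fabs(x)      /* amax/amin compare absolute values */
#else
#define VAL(x) (x)
#endif

/* n is assumed >= 1, matching the BLAS convention of returning early otherwise. */
double reduce(const double *x, size_t n) {
    double best = VAL(x[0]);
    for (size_t i = 1; i < n; i++)
#ifdef USE_MIN
        if (VAL(x[i]) < best) best = VAL(x[i]);   /* -DUSE_MIN: keep the minimum */
#else
        if (VAL(x[i]) > best) best = VAL(x[i]);   /* default: keep the maximum */
#endif
    return best;
}
```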
| @@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c | |||||
| cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||||
| cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||||
| cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) | |||||
| cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
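The `*axpyc` objects are built from the ordinary `zaxpy.c` with `-DCONJ`. The conjugated variant's semantics, as a reference loop (unit stride, single precision, for illustration only):

```c
/* Reference semantics of caxpyc: y := y + alpha * conj(x). */
#include <complex.h>

void caxpyc_ref(int n, float complex alpha,
                const float complex *x, float complex *y) {
    for (int i = 0; i < n; i++)
        y[i] += alpha * conjf(x[i]);   /* -DCONJ flips the sign of Im(x) */
}
```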
| @@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | ||||
| ifeq ($(BUILD_BFLOAT16),1) | ifeq ($(BUILD_BFLOAT16),1) | ||||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h | |||||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | ||||
| endif | endif | ||||
| @@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||||
| char transA, transB, Uplo; | char transA, transB, Uplo; | ||||
| blasint nrowa, nrowb; | blasint nrowa, nrowb; | ||||
| #if defined(COMPLEX) | |||||
| blasint ncolb; | |||||
| #endif | |||||
| IFLOAT *buffer; | IFLOAT *buffer; | ||||
| IFLOAT *aa, *bb; | IFLOAT *aa, *bb; | ||||
| FLOAT *cc; | FLOAT *cc; | ||||
| @@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||||
| uplo = 0; | uplo = 0; | ||||
| if (Uplo == 'L') | if (Uplo == 'L') | ||||
| uplo = 1; | uplo = 1; | ||||
| nrowa = m; | nrowa = m; | ||||
| if (transa) nrowa = k; | |||||
| if (transa & 1) nrowa = k; | |||||
| nrowb = k; | nrowb = k; | ||||
| if (transb) nrowb = m; | |||||
| #if defined(COMPLEX) | |||||
| ncolb = m; | |||||
| #endif | |||||
| if (transb & 1) { | |||||
| nrowb = m; | |||||
| #if defined(COMPLEX) | |||||
| ncolb = k; | |||||
| #endif | |||||
| } | |||||
| info = 0; | info = 0; | ||||
| if (ldc < MAX(1, m)) | if (ldc < MAX(1, m)) | ||||
| info = 13; | info = 13; | ||||
| if (ldb < MAX(1, nrowa)) | |||||
| if (ldb < MAX(1, nrowb)) | |||||
| info = 10; | info = 10; | ||||
| if (lda < MAX(1, nrowb)) | |||||
| if (lda < MAX(1, nrowa)) | |||||
| info = 8; | info = 8; | ||||
| if (k < 0) | if (k < 0) | ||||
| info = 5; | info = 5; | ||||
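The two leading-dimension checks were crossed: `ldb` was validated against A's stored row count and `lda` against B's. After the fix each leading dimension is checked against its own operand:

```c
/* Corrected check for the column-major Fortran entry (self-contained sketch). */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int check_dims(int transa, int transb, int m, int k, int lda, int ldb) {
    int nrowa = (transa & 1) ? k : m;   /* stored rows of A */
    int nrowb = (transb & 1) ? m : k;   /* stored rows of B */
    if (ldb < MAX(1, nrowb)) return 10; /* previously tested against nrowa */
    if (lda < MAX(1, nrowa)) return 8;  /* previously tested against nrowb */
    return 0;
}
```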
| @@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| blasint info; | blasint info; | ||||
| blasint lda, ldb; | blasint lda, ldb; | ||||
| FLOAT *a, *b; | FLOAT *a, *b; | ||||
| #if defined(COMPLEX) | |||||
| blasint nrowb, ncolb; | |||||
| #endif | |||||
| XFLOAT *buffer; | XFLOAT *buffer; | ||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| @@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| info = -1; | info = -1; | ||||
| blasint nrowa, nrowb; | |||||
| blasint nrowa; | |||||
| #if !defined(COMPLEX) | |||||
| blasint nrowb; | |||||
| #endif | |||||
| nrowa = m; | nrowa = m; | ||||
| if (transa) nrowa = k; | |||||
| if (transa & 1) nrowa = k; | |||||
| nrowb = k; | nrowb = k; | ||||
| if (transb) nrowb = m; | |||||
| #if defined(COMPLEX) | |||||
| ncolb = m; | |||||
| #endif | |||||
| if (transb & 1) { | |||||
| nrowb = m; | |||||
| #if defined(COMPLEX) | |||||
| ncolb = k; | |||||
| #endif | |||||
| } | |||||
| if (ldc < MAX(1, m)) | if (ldc < MAX(1, m)) | ||||
| info = 13; | info = 13; | ||||
| @@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| info = -1; | info = -1; | ||||
| blasint ncola, ncolb; | |||||
| ncola = k; | |||||
| if (transa) ncola = m; | |||||
| ncolb = m; | |||||
| if (transb) ncolb = k; | |||||
| blasint ncola; | |||||
| #if !defined(COMPLEX) | |||||
| blasint ncolb; | |||||
| #endif | |||||
| ncola = m; | |||||
| if (transa & 1) ncola = k; | |||||
| ncolb = k; | |||||
| #if defined(COMPLEX) | |||||
| nrowb = m; | |||||
| #endif | |||||
| if (transb & 1) { | |||||
| #if defined(COMPLEX) | |||||
| nrowb = k; | |||||
| #endif | |||||
| ncolb = m; | |||||
| } | |||||
| if (ldc < MAX(1,m)) | if (ldc < MAX(1,m)) | ||||
| info = 13; | info = 13; | ||||
| if (ldb < MAX(1, ncolb)) | if (ldb < MAX(1, ncolb)) | ||||
| info = 10; | |||||
| if (lda < MAX(1, ncola)) | |||||
| info = 8; | info = 8; | ||||
| if (lda < MAX(1, ncola)) | |||||
| info = 10; | |||||
| if (k < 0) | if (k < 0) | ||||
| info = 5; | info = 5; | ||||
| if (m < 0) | if (m < 0) | ||||
| info = 4; | info = 4; | ||||
| if (transb < 0) | if (transb < 0) | ||||
| info = 3; | |||||
| if (transa < 0) | |||||
| info = 2; | info = 2; | ||||
| if (transa < 0) | |||||
| info = 3; | |||||
| if (uplo < 0) | if (uplo < 0) | ||||
| info = 1; | info = 1; | ||||
| } | } | ||||
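Three fixes land in this row-major branch: the minimum leading dimensions were computed the wrong way round (requiring k where m applies and vice versa), the `info` codes of the two leading-dimension checks are exchanged to match (8 for the caller's `lda`, 10 for the caller's `ldb`), and invalid `transa`/`transb` values now report argument positions 2 and 3 as the caller sees them. All three follow from the same fact: row-major input is handled by computing the column-major product of the swapped operands, so at this point `a`, `lda` and `transa` hold the caller's B-side arguments, and the error codes must map each failure back to the caller's argument order.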
| @@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| IDEBUG_START; | IDEBUG_START; | ||||
| const blasint incb = (transb == 0) ? 1 : ldb; | |||||
| #if defined(COMPLEX) | |||||
| if (transb > 1){ | |||||
| #ifndef CBLAS | |||||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||||
| #else | |||||
| if (order == CblasColMajor) | |||||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||||
| if (order == CblasRowMajor) | |||||
| IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; | |||||
| if (uplo == 1) { | if (uplo == 1) { | ||||
| for (i = 0; i < m; i++) { | for (i = 0; i < m; i++) { | ||||
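The switch from `if (transa)` to `if (transa & 1)` throughout this file reflects the internal trans encoding used by the complex interfaces, where bit 0 means transpose and bit 1 means conjugate; the new block above applies the conjugation of B once, in place, via `imatcopy` whenever `transb > 1`, so the per-row gemv calls below only need the transpose bit. A sketch of the assumed encoding:

```c
/* Sketch of the internal trans encoding behind the '& 1' tests:
   0 = 'N', 1 = 'T', 2 = 'R' (conjugate, no transpose), 3 = 'C'. */
static int is_transposed(int trans) { return trans & 1; }  /* bit 0 */
static int is_conjugated(int trans) { return trans > 1; }  /* bit 1 */
```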
| @@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #if defined(COMPLEX) | #if defined(COMPLEX) | ||||
| aa = a + i * 2; | aa = a + i * 2; | ||||
| bb = b + i * ldb * 2; | bb = b + i * ldb * 2; | ||||
| if (transa) { | |||||
| if (transa & 1) { | |||||
| aa = a + lda * i * 2; | aa = a + lda * i * 2; | ||||
| } | } | ||||
| if (transb) | |||||
| if (transb & 1) | |||||
| bb = b + i * 2; | bb = b + i * 2; | ||||
| cc = c + i * 2 * ldc + i * 2; | cc = c + i * 2 * ldc + i * 2; | ||||
| #else | #else | ||||
| aa = a + i; | aa = a + i; | ||||
| bb = b + i * ldb; | bb = b + i * ldb; | ||||
| if (transa) { | |||||
| if (transa & 1) { | |||||
| aa = a + lda * i; | aa = a + lda * i; | ||||
| } | } | ||||
| if (transb) | |||||
| if (transb & 1) | |||||
| bb = b + i; | bb = b + i; | ||||
| cc = c + i * ldc + i; | cc = c + i * ldc + i; | ||||
| #endif | #endif | ||||
| @@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| NULL, 0); | NULL, 0); | ||||
| if (alpha_r == ZERO && alpha_i == ZERO) | if (alpha_r == ZERO && alpha_i == ZERO) | ||||
| return; | |||||
| continue; | |||||
| #else | #else | ||||
| if (beta != ONE) | if (beta != ONE) | ||||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | ||||
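The old `return` here was a bug: when alpha is zero the routine still has to finish scaling the remaining rows of C by beta, so the loop now advances with `continue` instead of abandoning the rest of the matrix. The same fix is applied in the second loop further down.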
| @@ -478,7 +528,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #endif | #endif | ||||
| // for alignment | // for alignment | ||||
| buffer_size = (buffer_size + 3) & ~3; | buffer_size = (buffer_size + 3) & ~3; | ||||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||||
| #ifdef SMP | #ifdef SMP | ||||
| @@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #endif | #endif | ||||
| #if defined(COMPLEX) | #if defined(COMPLEX) | ||||
| if (!transa) | |||||
| if (!(transa & 1)) | |||||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | ||||
| aa, lda, bb, incb, cc, 1, | aa, lda, bb, incb, cc, 1, | ||||
| buffer); | buffer); | ||||
| @@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| aa, lda, bb, incb, cc, 1, | aa, lda, bb, incb, cc, 1, | ||||
| buffer); | buffer); | ||||
| #else | #else | ||||
| if (!transa) | |||||
| if (!(transa & 1)) | |||||
| (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, | (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, | ||||
| bb, incb, cc, 1, buffer); | bb, incb, cc, 1, buffer); | ||||
| else | else | ||||
| @@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #endif | #endif | ||||
| #ifdef SMP | #ifdef SMP | ||||
| } else { | } else { | ||||
| if (!transa) | |||||
| if (!(transa & 1)) | |||||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | (gemv_thread[(int)transa]) (j, k, alpha, aa, | ||||
| lda, bb, incb, cc, | lda, bb, incb, cc, | ||||
| 1, buffer, | 1, buffer, | ||||
| @@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| l = j; | l = j; | ||||
| #if defined COMPLEX | #if defined COMPLEX | ||||
| bb = b + i * ldb * 2; | bb = b + i * ldb * 2; | ||||
| if (transb) { | |||||
| if (transb & 1) { | |||||
| bb = b + i * 2; | bb = b + i * 2; | ||||
| } | } | ||||
| cc = c + i * 2 * ldc; | cc = c + i * 2 * ldc; | ||||
| #else | #else | ||||
| bb = b + i * ldb; | bb = b + i * ldb; | ||||
| if (transb) { | |||||
| if (transb & 1) { | |||||
| bb = b + i; | bb = b + i; | ||||
| } | } | ||||
| cc = c + i * ldc; | cc = c + i * ldc; | ||||
| @@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| NULL, 0); | NULL, 0); | ||||
| if (alpha_r == ZERO && alpha_i == ZERO) | if (alpha_r == ZERO && alpha_i == ZERO) | ||||
| return; | |||||
| continue; | |||||
| #else | #else | ||||
| if (beta != ONE) | if (beta != ONE) | ||||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | ||||
| @@ -567,7 +617,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #endif | #endif | ||||
| // for alignment | // for alignment | ||||
| buffer_size = (buffer_size + 3) & ~3; | buffer_size = (buffer_size + 3) & ~3; | ||||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||||
| #ifdef SMP | #ifdef SMP | ||||
| @@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #endif | #endif | ||||
| #if defined(COMPLEX) | #if defined(COMPLEX) | ||||
| if (!transa) | |||||
| if (!(transa & 1)) | |||||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | ||||
| a, lda, bb, incb, cc, 1, | a, lda, bb, incb, cc, 1, | ||||
| buffer); | buffer); | ||||
| @@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| a, lda, bb, incb, cc, 1, | a, lda, bb, incb, cc, 1, | ||||
| buffer); | buffer); | ||||
| #else | #else | ||||
| if (!transa) | |||||
| if (!(transa & 1)) | |||||
| (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, | (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, | ||||
| incb, cc, 1, buffer); | incb, cc, 1, buffer); | ||||
| else | else | ||||
| @@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| #ifdef SMP | #ifdef SMP | ||||
| } else { | } else { | ||||
| if (!transa) | |||||
| if (!(transa & 1)) | |||||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | ||||
| bb, incb, cc, 1, | bb, incb, cc, 1, | ||||
| buffer, nthreads); | buffer, nthreads); | ||||
| @@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||||
| } | } | ||||
| #endif | #endif | ||||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); | |||||
| if ( *rows > *cols ) | |||||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); | |||||
| else | |||||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); | |||||
| b = malloc(msize); | b = malloc(msize); | ||||
| if ( b == NULL ) | if ( b == NULL ) | ||||
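The scratch buffer for the out-of-place copy is written with leading dimension `*ldb`, so sizing it as `rows * cols` under-allocates whenever `ldb` exceeds the minor dimension. The allocation now covers the full strided extent:

```c
#include <stddef.h>

/* Sketch of the corrected size computation (real case; the complex
   variant patched later in this changeset doubles it). */
static size_t omatcopy_buffer_bytes(int rows, int cols, int ldb) {
    size_t major = (size_t)(rows > cols ? rows : cols);
    return major * (size_t)ldb * sizeof(float);
}
```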
| @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||||
| #else | #else | ||||
| #ifdef COMPLEX | |||||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||||
| FLOAT *x = (FLOAT*) vx; | |||||
| #else | |||||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | ||||
| #endif | |||||
| FLOAT ret; | FLOAT ret; | ||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||||
| else | else | ||||
| { | { | ||||
| dp2 = *dd2 * dy1; | dp2 = *dd2 * dy1; | ||||
| if(dp2 == ZERO) | |||||
| { | |||||
| dflag = -TWO; | |||||
| dparam[0] = dflag; | |||||
| return; | |||||
| } | |||||
| dp1 = *dd1 * *dx1; | dp1 = *dd1 * *dx1; | ||||
| dq2 = dp2 * dy1; | dq2 = dp2 * dy1; | ||||
| dq1 = dp1 * *dx1; | dq1 = dp1 * *dx1; | ||||
| @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||||
| dh12 = dp2 / dp1; | dh12 = dp2 / dp1; | ||||
| du = ONE - dh12 * dh21; | du = ONE - dh12 * dh21; | ||||
| if(du > ZERO) | |||||
| { | |||||
| dflag = ZERO; | |||||
| *dd1 = *dd1 / du; | |||||
| *dd2 = *dd2 / du; | |||||
| *dx1 = *dx1 * du; | |||||
| } else { | |||||
| dflag = -ONE; | |||||
| dh11 = ZERO; | |||||
| dh12 = ZERO; | |||||
| dh21 = ZERO; | |||||
| dh22 = ZERO; | |||||
| *dd1 = ZERO; | |||||
| *dd2 = ZERO; | |||||
| *dx1 = ZERO; | |||||
| } | |||||
| dflag = ZERO; | |||||
| *dd1 = *dd1 / du; | |||||
| *dd2 = *dd2 / du; | |||||
| *dx1 = *dx1 * du; | |||||
| } | } | ||||
| else | else | ||||
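A note on why the larger deletion is safe in exact arithmetic: this branch is only taken when $|q_1| = |d_1 x_1^2|$ exceeds $|q_2| = |d_2 y_1^2|$, and then

$$u = 1 - h_{12} h_{21} = 1 + \frac{d_2\, y_1^2}{d_1\, x_1^2} = 1 + \frac{q_2}{q_1} \in (0,\,2),$$

so the flag-0 update $d_1 \leftarrow d_1/u$, $d_2 \leftarrow d_2/u$, $x_1 \leftarrow x_1 u$ is always well defined; the dropped guard could only matter in rounding-error edge cases. The removed $p_2 = 0$ early exit likewise defers that degenerate case ($d_2 = 0$ or $y_1 = 0$) to the general formulas.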
| @@ -0,0 +1,447 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2024, The OpenBLAS Project. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #include "common.h" | |||||
| #define SMP_THRESHOLD_MIN 65536.0 | |||||
| #define ERROR_NAME "SBGEMMT " | |||||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | |||||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||||
| #endif | |||||
| #ifndef CBLAS | |||||
| void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||||
| blasint * M, blasint * K, | |||||
| FLOAT * Alpha, | |||||
| IFLOAT * a, blasint * ldA, | |||||
| IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) | |||||
| { | |||||
| blasint m, k; | |||||
| blasint lda, ldb, ldc; | |||||
| int transa, transb, uplo; | |||||
| blasint info; | |||||
| char transA, transB, Uplo; | |||||
| blasint nrowa, nrowb; | |||||
| IFLOAT *buffer; | |||||
| IFLOAT *aa, *bb; | |||||
| FLOAT *cc; | |||||
| FLOAT alpha, beta; | |||||
| PRINT_DEBUG_NAME; | |||||
| m = *M; | |||||
| k = *K; | |||||
| alpha = *Alpha; | |||||
| beta = *Beta; | |||||
| lda = *ldA; | |||||
| ldb = *ldB; | |||||
| ldc = *ldC; | |||||
| transA = *TRANSA; | |||||
| transB = *TRANSB; | |||||
| Uplo = *UPLO; | |||||
| TOUPPER(transA); | |||||
| TOUPPER(transB); | |||||
| TOUPPER(Uplo); | |||||
| transa = -1; | |||||
| transb = -1; | |||||
| uplo = -1; | |||||
| if (transA == 'N') | |||||
| transa = 0; | |||||
| if (transA == 'T') | |||||
| transa = 1; | |||||
| if (transA == 'R') | |||||
| transa = 0; | |||||
| if (transA == 'C') | |||||
| transa = 1; | |||||
| if (transB == 'N') | |||||
| transb = 0; | |||||
| if (transB == 'T') | |||||
| transb = 1; | |||||
| if (transB == 'R') | |||||
| transb = 0; | |||||
| if (transB == 'C') | |||||
| transb = 1; | |||||
| if (Uplo == 'U') | |||||
| uplo = 0; | |||||
| if (Uplo == 'L') | |||||
| uplo = 1; | |||||
| nrowa = m; | |||||
| if (transa & 1) nrowa = k; | |||||
| nrowb = k; | |||||
| if (transb & 1) nrowb = m; | |||||
| info = 0; | |||||
| if (ldc < MAX(1, m)) | |||||
| info = 13; | |||||
| if (ldb < MAX(1, nrowb)) | |||||
| info = 10; | |||||
| if (lda < MAX(1, nrowa)) | |||||
| info = 8; | |||||
| if (k < 0) | |||||
| info = 5; | |||||
| if (m < 0) | |||||
| info = 4; | |||||
| if (transb < 0) | |||||
| info = 3; | |||||
| if (transa < 0) | |||||
| info = 2; | |||||
| if (uplo < 0) | |||||
| info = 1; | |||||
| if (info != 0) { | |||||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||||
| return; | |||||
| } | |||||
| #else | |||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, | |||||
| blasint k, | |||||
| FLOAT alpha, | |||||
| IFLOAT * A, blasint LDA, | |||||
| IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) | |||||
| { | |||||
| IFLOAT *aa, *bb; | |||||
| FLOAT *cc; | |||||
| int transa, transb, uplo; | |||||
| blasint info; | |||||
| blasint lda, ldb; | |||||
| IFLOAT *a, *b; | |||||
| XFLOAT *buffer; | |||||
| PRINT_DEBUG_CNAME; | |||||
| uplo = -1; | |||||
| transa = -1; | |||||
| transb = -1; | |||||
| info = 0; | |||||
| if (order == CblasColMajor) { | |||||
| if (Uplo == CblasUpper) uplo = 0; | |||||
| if (Uplo == CblasLower) uplo = 1; | |||||
| if (TransA == CblasNoTrans) | |||||
| transa = 0; | |||||
| if (TransA == CblasTrans) | |||||
| transa = 1; | |||||
| if (TransA == CblasConjNoTrans) | |||||
| transa = 0; | |||||
| if (TransA == CblasConjTrans) | |||||
| transa = 1; | |||||
| if (TransB == CblasNoTrans) | |||||
| transb = 0; | |||||
| if (TransB == CblasTrans) | |||||
| transb = 1; | |||||
| if (TransB == CblasConjNoTrans) | |||||
| transb = 0; | |||||
| if (TransB == CblasConjTrans) | |||||
| transb = 1; | |||||
| a = (void *)A; | |||||
| b = (void *)B; | |||||
| lda = LDA; | |||||
| ldb = LDB; | |||||
| info = -1; | |||||
| blasint nrowa; | |||||
| blasint nrowb; | |||||
| nrowa = m; | |||||
| if (transa & 1) nrowa = k; | |||||
| nrowb = k; | |||||
| if (transb & 1) nrowb = m; | |||||
| if (ldc < MAX(1, m)) | |||||
| info = 13; | |||||
| if (ldb < MAX(1, nrowb)) | |||||
| info = 10; | |||||
| if (lda < MAX(1, nrowa)) | |||||
| info = 8; | |||||
| if (k < 0) | |||||
| info = 5; | |||||
| if (m < 0) | |||||
| info = 4; | |||||
| if (transb < 0) | |||||
| info = 3; | |||||
| if (transa < 0) | |||||
| info = 2; | |||||
| if (uplo < 0) | |||||
| info = 1; | |||||
| } | |||||
| if (order == CblasRowMajor) { | |||||
| a = (void *)B; | |||||
| b = (void *)A; | |||||
| lda = LDB; | |||||
| ldb = LDA; | |||||
| if (Uplo == CblasUpper) uplo = 0; | |||||
| if (Uplo == CblasLower) uplo = 1; | |||||
| if (TransB == CblasNoTrans) | |||||
| transa = 0; | |||||
| if (TransB == CblasTrans) | |||||
| transa = 1; | |||||
| if (TransB == CblasConjNoTrans) | |||||
| transa = 0; | |||||
| if (TransB == CblasConjTrans) | |||||
| transa = 1; | |||||
| if (TransA == CblasNoTrans) | |||||
| transb = 0; | |||||
| if (TransA == CblasTrans) | |||||
| transb = 1; | |||||
| if (TransA == CblasConjNoTrans) | |||||
| transb = 0; | |||||
| if (TransA == CblasConjTrans) | |||||
| transb = 1; | |||||
| info = -1; | |||||
| blasint ncola; | |||||
| blasint ncolb; | |||||
| ncola = m; | |||||
| if (transa & 1) ncola = k; | |||||
| ncolb = k; | |||||
| if (transb & 1) { | |||||
| ncolb = m; | |||||
| } | |||||
| if (ldc < MAX(1,m)) | |||||
| info = 13; | |||||
| if (ldb < MAX(1, ncolb)) | |||||
| info = 8; | |||||
| if (lda < MAX(1, ncola)) | |||||
| info = 10; | |||||
| if (k < 0) | |||||
| info = 5; | |||||
| if (m < 0) | |||||
| info = 4; | |||||
| if (transb < 0) | |||||
| info = 2; | |||||
| if (transa < 0) | |||||
| info = 3; | |||||
| if (uplo < 0) | |||||
| info = 1; | |||||
| } | |||||
| if (info >= 0) { | |||||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| int buffer_size; | |||||
| blasint i, j; | |||||
| #ifdef SMP | |||||
| int nthreads; | |||||
| #endif | |||||
| #ifdef SMP | |||||
| static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, | |||||
| BLASLONG, IFLOAT *, BLASLONG, FLOAT, | |||||
| FLOAT *, BLASLONG, int) = { | |||||
| sbgemv_thread_n, sbgemv_thread_t, | |||||
| }; | |||||
| #endif | |||||
| int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, | |||||
| IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { | |||||
| SBGEMV_N, SBGEMV_T,}; | |||||
| if (m == 0) | |||||
| return; | |||||
| IDEBUG_START; | |||||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; | |||||
| if (uplo == 1) { | |||||
| for (i = 0; i < m; i++) { | |||||
| j = m - i; | |||||
| aa = a + i; | |||||
| bb = b + i * ldb; | |||||
| if (transa & 1) { | |||||
| aa = a + lda * i; | |||||
| } | |||||
| if (transb & 1) | |||||
| bb = b + i; | |||||
| cc = c + i * ldc + i; | |||||
| #if 0 | |||||
| if (beta != ONE) | |||||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) | |||||
| continue; | |||||
| #endif | |||||
| IDEBUG_START; | |||||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||||
| #ifdef WINDOWS_ABI | |||||
| buffer_size += 160 / sizeof(FLOAT); | |||||
| #endif | |||||
| // for alignment | |||||
| buffer_size = (buffer_size + 3) & ~3; | |||||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||||
| #ifdef SMP | |||||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) | |||||
| nthreads = 1; | |||||
| else | |||||
| nthreads = num_cpu_avail(2); | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| if (!(transa & 1)) | |||||
| (gemv[(int)transa]) (j, k, alpha, aa, lda, | |||||
| bb, incb, beta, cc, 1); | |||||
| else | |||||
| (gemv[(int)transa]) (k, j, alpha, aa, lda, | |||||
| bb, incb, beta, cc, 1); | |||||
| #ifdef SMP | |||||
| } else { | |||||
| if (!(transa & 1)) | |||||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||||
| lda, bb, incb, beta, cc, | |||||
| 1, nthreads); | |||||
| else | |||||
| (gemv_thread[(int)transa]) (k, j, alpha, aa, | |||||
| lda, bb, incb, beta, cc, | |||||
| 1, nthreads); | |||||
| } | |||||
| #endif | |||||
| STACK_FREE(buffer); | |||||
| } | |||||
| } else { | |||||
| for (i = 0; i < m; i++) { | |||||
| j = i + 1; | |||||
| bb = b + i * ldb; | |||||
| if (transb & 1) { | |||||
| bb = b + i; | |||||
| } | |||||
| cc = c + i * ldc; | |||||
| #if 0 | |||||
| if (beta != ONE) | |||||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||||
| if (alpha == ZERO) | |||||
| continue; | |||||
| #endif | |||||
| IDEBUG_START; | |||||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||||
| #ifdef WINDOWS_ABI | |||||
| buffer_size += 160 / sizeof(FLOAT); | |||||
| #endif | |||||
| // for alignment | |||||
| buffer_size = (buffer_size + 3) & ~3; | |||||
| STACK_ALLOC(buffer_size, IFLOAT, buffer); | |||||
| #ifdef SMP | |||||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) | |||||
| nthreads = 1; | |||||
| else | |||||
| nthreads = num_cpu_avail(2); | |||||
| if (nthreads == 1) { | |||||
| #endif | |||||
| if (!(transa & 1)) | |||||
| (gemv[(int)transa]) (j, k, alpha, a, lda, bb, | |||||
| incb, beta, cc, 1); | |||||
| else | |||||
| (gemv[(int)transa]) (k, j, alpha, a, lda, bb, | |||||
| incb, beta, cc, 1); | |||||
| #ifdef SMP | |||||
| } else { | |||||
| if (!(transa & 1)) | |||||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||||
| bb, incb, beta, cc, 1, | |||||
| nthreads); | |||||
| else | |||||
| (gemv_thread[(int)transa]) (k, j, alpha, a, lda, | |||||
| bb, incb, beta, cc, 1, | |||||
| nthreads); | |||||
| } | |||||
| #endif | |||||
| STACK_FREE(buffer); | |||||
| } | |||||
| } | |||||
| IDEBUG_END; | |||||
| return; | |||||
| } | |||||
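What the new file computes, in reference form: `gemmt` updates only one triangle of $C \leftarrow \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta C$, and the loops above issue one gemv per column (for the lower triangle, column $i$ needs only rows $i..m-1$, a gemv of length $m-i$). A plain-float reference for the simplest case, against which the decomposition can be checked; the real routine takes bfloat16 inputs:

```c
#include <stddef.h>

/* Reference semantics: lower triangle, 'N','N', column-major. */
void gemmt_lower_ref(int m, int k, float alpha, const float *a, int lda,
                     const float *b, int ldb, float beta, float *c, int ldc) {
    for (int j = 0; j < m; j++)          /* one column of C per iteration */
        for (int i = j; i < m; i++) {    /* only the rows on/below the diagonal */
            float s = 0.0f;
            for (int p = 0; p < k; p++)
                s += a[i + (size_t)p * lda] * b[p + (size_t)j * ldb];
            c[i + (size_t)j * ldc] = alpha * s + beta * c[i + (size_t)j * ldc];
        }
}
```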
| @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) | |||||
| void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) | |||||
| { | { | ||||
| blasint n = *N; | blasint n = *N; | ||||
| blasint incx = *INCX; | blasint incx = *INCX; | ||||
| blasint incy = *INCY; | blasint incy = *INCY; | ||||
| FLOAT* ALPHA = (FLOAT*) VALPHA; | |||||
| FLOAT* BETA = (FLOAT*) VBETA; | |||||
| #else | #else | ||||
| @@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||||
| } | } | ||||
| #endif | #endif | ||||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; | |||||
| if ( *rows > *cols ) | |||||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; | |||||
| else | |||||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; | |||||
| b = malloc(msize); | b = malloc(msize); | ||||
| if ( b == NULL ) | if ( b == NULL ) | ||||
| @@ -1349,6 +1349,9 @@ endif () | |||||
| set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") | set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") | ||||
| get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) | get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) | ||||
| set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") | set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") | ||||
| if (USE_GEMM3M) | |||||
| target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) | |||||
| endif() | |||||
| endfunction () | endfunction () | ||||
| @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, | |||||
| if ( rows <= 0 ) return(0); | if ( rows <= 0 ) return(0); | ||||
| if ( cols <= 0 ) return(0); | if ( cols <= 0 ) return(0); | ||||
| if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); | |||||
| aptr = a; | aptr = a; | ||||
| lda *= 2; | lda *= 2; | ||||
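The deleted shortcut treated $\alpha = 1 + 0i$ as a no-op, but this generic in-place kernel shape is shared by the conjugating variants, for which every element must still be rewritten even when alpha is one; dropping the early return appears to be the point of this hunk.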
| @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lsx.S | |||||
| SAXPBYKERNEL = axpby_lsx.S | SAXPBYKERNEL = axpby_lsx.S | ||||
| DAXPBYKERNEL = axpby_lsx.S | DAXPBYKERNEL = axpby_lsx.S | ||||
| CAXPBYKERNEL = caxpby_lsx.S | |||||
| ZAXPBYKERNEL = caxpby_lsx.S | |||||
| SSUMKERNEL = sum_lsx.S | SSUMKERNEL = sum_lsx.S | ||||
| DSUMKERNEL = sum_lsx.S | DSUMKERNEL = sum_lsx.S | ||||
| @@ -98,9 +100,13 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| CGEMMKERNEL = cgemm_kernel_2x2_lsx.S | |||||
| CGEMMONCOPY = cgemm_ncopy_2_lsx.S | |||||
| CGEMMOTCOPY = cgemm_tcopy_2_lsx.S | |||||
| CGEMMKERNEL = cgemm_kernel_8x4_lsx.S | |||||
| CGEMMINCOPY = cgemm_ncopy_8_lsx.S | |||||
| CGEMMITCOPY = cgemm_tcopy_8_lsx.S | |||||
| CGEMMONCOPY = cgemm_ncopy_4_lsx.S | |||||
| CGEMMOTCOPY = cgemm_tcopy_4_lsx.S | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| @@ -109,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S | |||||
| ZGEMMONCOPY = zgemm_ncopy_4_lsx.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| endif | endif | ||||
| @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lasx.S | |||||
| SAXPBYKERNEL = axpby_lasx.S | SAXPBYKERNEL = axpby_lasx.S | ||||
| DAXPBYKERNEL = axpby_lasx.S | DAXPBYKERNEL = axpby_lasx.S | ||||
| CAXPBYKERNEL = caxpby_lasx.S | |||||
| ZAXPBYKERNEL = caxpby_lasx.S | |||||
| SSUMKERNEL = sum_lasx.S | SSUMKERNEL = sum_lasx.S | ||||
| DSUMKERNEL = sum_lasx.S | DSUMKERNEL = sum_lasx.S | ||||
| @@ -120,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S | |||||
| ZGEMMONCOPY = zgemm_ncopy_2_lasx.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S | |||||
| ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S | |||||
| ZGEMMINCOPY = zgemm_ncopy_8_lasx.S | |||||
| ZGEMMITCOPY = zgemm_tcopy_8_lasx.S | |||||
| ZGEMMONCOPY = zgemm_ncopy_4_lasx.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| @@ -124,7 +124,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L13: | .L13: | ||||
| FABS $f0, $f0 | FABS $f0, $f0 | ||||
| SUB $f0, $f0, $f0 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| .align 3 | .align 3 | ||||
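The deleted `SUB $f0, $f0, $f0` zeroed `$f0` immediately after `FABS` had written the absolute value into it, so this early-exit path returned 0 instead of the intended |x|; removing the subtraction restores the correct result.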
| @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| li.d TEMP, 1 | |||||
| movgr2fr.d a1, $r0 | movgr2fr.d a1, $r0 | ||||
| ffint.s.l a1, a1 | ffint.s.l a1, a1 | ||||
| slli.d TEMP, TEMP, BASE_SHIFT | |||||
| slli.d INCX, INCX, BASE_SHIFT | slli.d INCX, INCX, BASE_SHIFT | ||||
| slli.d INCY, INCY, BASE_SHIFT | slli.d INCY, INCY, BASE_SHIFT | ||||
| MTG t1, ALPHA | MTG t1, ALPHA | ||||
| @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvreplgr2vr.w VXB, t2 | xvreplgr2vr.w VXB, t2 | ||||
| xvreplgr2vr.w VXZ, t3 | xvreplgr2vr.w VXZ, t3 | ||||
| #endif | #endif | ||||
| // If incx == 0 || incy == 0, fall back to processing the elements one at a time | |||||
| and TEMP, INCX, INCY | |||||
| or I, N, N | |||||
| beqz TEMP, .L998 | |||||
| li.d TEMP, 1 | |||||
| slli.d TEMP, TEMP, BASE_SHIFT | |||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 | bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 | ||||
| @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| bge $r0, N, .L999 | bge $r0, N, .L999 | ||||
| li.d TEMP, 1 | |||||
| movgr2fr.d a1, $r0 | movgr2fr.d a1, $r0 | ||||
| ffint.s.l a1, a1 | ffint.s.l a1, a1 | ||||
| slli.d TEMP, TEMP, BASE_SHIFT | |||||
| slli.d INCX, INCX, BASE_SHIFT | slli.d INCX, INCX, BASE_SHIFT | ||||
| slli.d INCY, INCY, BASE_SHIFT | slli.d INCY, INCY, BASE_SHIFT | ||||
| MTG t1, ALPHA | MTG t1, ALPHA | ||||
| @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vreplgr2vr.w VXB, t2 | vreplgr2vr.w VXB, t2 | ||||
| vreplgr2vr.w VXZ, t3 | vreplgr2vr.w VXZ, t3 | ||||
| #endif | #endif | ||||
| // If incx == 0 || incy == 0, fall back to processing the elements one at a time | |||||
| and TEMP, INCX, INCY | |||||
| or I, N, N | |||||
| beqz TEMP, .L998 | |||||
| li.d TEMP, 1 | |||||
| slli.d TEMP, TEMP, BASE_SHIFT | |||||
| srai.d I, N, 3 | srai.d I, N, 3 | ||||
| bne INCX, TEMP, .L20 | bne INCX, TEMP, .L20 | ||||
| bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 | bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 | ||||
| @@ -0,0 +1,341 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r19 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr0 | |||||
| #define U1 $vr1 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define D0 $vr8 | |||||
| #define D1 $vr9 | |||||
| #define D2 $vr10 | |||||
| #define D3 $vr11 | |||||
| #define D4 $vr12 | |||||
| #define D5 $vr13 | |||||
| #define D6 $vr14 | |||||
| #define D7 $vr15 | |||||
| #define D8 $vr16 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TD, DST //boffset | |||||
| move TS, SRC //aoffset | |||||
| slli.d TL, LDA, 0x02 | |||||
| slli.d TL, TL, 0x01 | |||||
| srai.d J, N, 0x02 | |||||
| beq J, ZERO, .L_N0 | |||||
| .L_J1: /* J-- */ | |||||
| move S1, TS | |||||
| add.d S2, S1, TL | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| slli.d T0, TL, 0x02 | |||||
| add.d TS, TS, T0 | |||||
| srai.d I, M, 0x02 | |||||
| beq I, ZERO, .L_I3 | |||||
| .L_I1: /* I-- */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vld U4, S3, 0x00 | |||||
| vld U5, S3, 0x10 | |||||
| vld U6, S4, 0x00 | |||||
| vld U7, S4, 0x10 | |||||
| vand.v D0, U2, U2 | |||||
| vand.v D1, U3, U3 | |||||
| vand.v D2, U2, U2 | |||||
| vand.v D3, U3, U3 | |||||
| vand.v D4, U6, U6 | |||||
| vand.v D5, U7, U7 | |||||
| vand.v D6, U6, U6 | |||||
| vand.v D7, U7, U7 | |||||
| vpermi.w D0, U0, 0x44 | |||||
| vpermi.w D4, U4, 0x44 | |||||
| vpermi.w D2, U0, 0xee | |||||
| vpermi.w D6, U4, 0xee | |||||
| vpermi.w D1, U1, 0x44 | |||||
| vpermi.w D5, U5, 0x44 | |||||
| vpermi.w D3, U1, 0xee | |||||
| vpermi.w D7, U5, 0xee | |||||
| vst D0, TD, 0x00 | |||||
| vst D4, TD, 0x10 | |||||
| vst D2, TD, 0x20 | |||||
| vst D6, TD, 0x30 | |||||
| vst D1, TD, 0x40 | |||||
| vst D5, TD, 0x50 | |||||
| vst D3, TD, 0x60 | |||||
| vst D7, TD, 0x70 | |||||
| addi.d S1, S1, 0x20 // a_offset | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S3, S3, 0x20 | |||||
| addi.d S4, S4, 0x20 | |||||
| addi.d TD, TD, 0x80 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_I1 | |||||
| .L_I3: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_II20 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vld U2, S3, 0x00 | |||||
| vld U3, S4, 0x00 | |||||
| vand.v D0, U1, U1 | |||||
| vand.v D1, U1, U1 | |||||
| vand.v D2, U3, U3 | |||||
| vand.v D3, U3, U3 | |||||
| vpermi.w D0, U0, 0x44 | |||||
| vpermi.w D2, U2, 0x44 | |||||
| vpermi.w D1, U0, 0xee | |||||
| vpermi.w D3, U2, 0xee | |||||
| vst D0, TD, 0x00 | |||||
| vst D2, TD, 0x10 | |||||
| vst D1, TD, 0x20 | |||||
| vst D3, TD, 0x30 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d S3, S3, 0x10 | |||||
| addi.d S4, S4, 0x10 | |||||
| addi.d TD, TD, 0x40 | |||||
| .L_II20: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_J0 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fld.s F4, S3, 0x00 | |||||
| fld.s F5, S3, 0x04 | |||||
| fld.s F6, S4, 0x00 | |||||
| fld.s F7, S4, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| fst.s F2, TD, 0x08 | |||||
| fst.s F3, TD, 0x0c | |||||
| fst.s F4, TD, 0x10 | |||||
| fst.s F5, TD, 0x14 | |||||
| fst.s F6, TD, 0x18 | |||||
| fst.s F7, TD, 0x1c | |||||
| addi.d TD, TD, 0x20 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N0: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_N20 | |||||
| move S1, TS | |||||
| add.d S2, S1, TL | |||||
| slli.d T0, TL, 0x01 | |||||
| add.d TS, TS, T0 | |||||
| srai.d I, M, 0x02 | |||||
| beq ZERO, I, .L_N10 | |||||
| .L_N11: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vand.v D0, U2, U2 | |||||
| vand.v D1, U3, U3 | |||||
| vand.v D2, U2, U2 | |||||
| vand.v D3, U3, U3 | |||||
| vpermi.w D0, U0, 0x44 | |||||
| vpermi.w D2, U0, 0xee | |||||
| vpermi.w D1, U1, 0x44 | |||||
| vpermi.w D3, U1, 0xee | |||||
| vst D0, TD, 0x00 | |||||
| vst D2, TD, 0x10 | |||||
| vst D1, TD, 0x20 | |||||
| vst D3, TD, 0x30 | |||||
| addi.d S1, S1, 0x20 // a_offset | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d TD, TD, 0x40 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N11 | |||||
| .L_N10: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_N130 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vand.v D0, U1, U1 | |||||
| vpermi.w D0, U0, 0x44 | |||||
| vpermi.w U1, U0, 0xee | |||||
| vst D0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| addi.d S1, S1, 0x10 // a_offset | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d TD, TD, 0x20 // b_offset | |||||
| .L_N130: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_N20 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| fst.s F2, TD, 0x08 | |||||
| fst.s F3, TD, 0x0c | |||||
| addi.d TD, TD, 0x10 | |||||
| .L_N20: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq I, ZERO, .L_N00 | |||||
| move S1, TS | |||||
| srai.d I, M, 0x02 | |||||
| beq I, ZERO, .L_N30 | |||||
| .L_N21: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| addi.d S1, S1, 0x20 // aoffset1 | |||||
| addi.d TD, TD, 0x20 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N21 | |||||
| .L_N30: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_N330 | |||||
| vld U0, S1, 0x00 | |||||
| vst U0, TD, 0x00 | |||||
| addi.d S1, S1, 0x10 // aoffset1 | |||||
| addi.d TD, TD, 0x10 // b_offset | |||||
| .L_N330: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_N00 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| .L_N00: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
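For readers who don't speak LSX, the kernel above is easier to follow next to a scalar restatement. The sketch below is reconstructed from the assembly itself, not taken from the source tree, and the function name is illustrative: it packs four columns at a time, each `vpermi.w` pair assembling one row's two adjacent-column complex entries, with the `.L_N0`/`.L_N20` paths handling the two- and one-column tails.

```c
#include <complex.h>

/* Reconstructed sketch of the 4-column pack above (name illustrative).
 * a is a column-major m x n single-precision complex matrix with leading
 * dimension lda; the vpermi.w shuffles assemble exactly this store order. */
static void cgemm_ncopy_4_sketch(long m, long n, const float complex *a,
                                 long lda, float complex *b) {
    long i, j;
    for (j = 0; j + 4 <= n; j += 4)            /* .L_J1 */
        for (i = 0; i < m; i++) {              /* .L_I1 plus the m&2/m&1 tails */
            *b++ = a[i + (j + 0) * lda];
            *b++ = a[i + (j + 1) * lda];
            *b++ = a[i + (j + 2) * lda];
            *b++ = a[i + (j + 3) * lda];
        }
    if (n & 2)                                 /* .L_N0 */
        for (i = 0; i < m; i++) {
            *b++ = a[i + (n & ~3L) * lda];
            *b++ = a[i + ((n & ~3L) + 1) * lda];
        }
    if (n & 1)                                 /* .L_N20 */
        for (i = 0; i < m; i++)
            *b++ = a[i + (n & ~1L) * lda];
}
```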
| @@ -0,0 +1,263 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define S8 $r19 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r7 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr0 | |||||
| #define U1 $vr1 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define D0 $vr8 | |||||
| #define D1 $vr9 | |||||
| #define D2 $vr10 | |||||
| #define D3 $vr11 | |||||
| #define D4 $vr12 | |||||
| #define D5 $vr13 | |||||
| #define D6 $vr14 | |||||
| #define D7 $vr15 | |||||
| #define D8 $vr16 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TD, DST //boffset | |||||
| move TS, SRC //aoffset | |||||
| slli.d TL, LDA, 0x02 // TL = lda * 4 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 8: column stride in bytes | |||||
| slli.d T0, TL, 0x03 // T0 = stride of 8 columns | |||||
| srai.d J, N, 0x03 //j | |||||
| beq J, ZERO, .L_N1 | |||||
| .L_J1: /* if(j>0) j--*/ | |||||
| move S1, TS | |||||
| add.d S2, TS, TL | |||||
| move I, M | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| add.d S5, S4, TL | |||||
| add.d S6, S5, TL | |||||
| add.d S7, S6, TL | |||||
| add.d S8, S7, TL | |||||
| add.d TS, TS, T0 | |||||
| beq I, ZERO, .L_J11 | |||||
| .L_I1: /* if(i>0) i--*/ | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fld.s F4, S3, 0x00 | |||||
| fld.s F5, S3, 0x04 | |||||
| fld.s F6, S4, 0x00 | |||||
| fld.s F7, S4, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| fst.s F2, TD, 0x08 | |||||
| fst.s F3, TD, 0x0c | |||||
| fst.s F4, TD, 0x10 | |||||
| fst.s F5, TD, 0x14 | |||||
| fst.s F6, TD, 0x18 | |||||
| fst.s F7, TD, 0x1c | |||||
| fld.s F0, S5, 0x00 | |||||
| fld.s F1, S5, 0x04 | |||||
| fld.s F2, S6, 0x00 | |||||
| fld.s F3, S6, 0x04 | |||||
| fld.s F4, S7, 0x00 | |||||
| fld.s F5, S7, 0x04 | |||||
| fld.s F6, S8, 0x00 | |||||
| fld.s F7, S8, 0x04 | |||||
| fst.s F0, TD, 0x20 | |||||
| fst.s F1, TD, 0x24 | |||||
| fst.s F2, TD, 0x28 | |||||
| fst.s F3, TD, 0x2c | |||||
| fst.s F4, TD, 0x30 | |||||
| fst.s F5, TD, 0x34 | |||||
| fst.s F6, TD, 0x38 | |||||
| fst.s F7, TD, 0x3c | |||||
| addi.d S1, S1, 0x08 | |||||
| addi.d S2, S2, 0x08 | |||||
| addi.d S3, S3, 0x08 | |||||
| addi.d S4, S4, 0x08 | |||||
| addi.d S5, S5, 0x08 | |||||
| addi.d S6, S6, 0x08 | |||||
| addi.d S7, S7, 0x08 | |||||
| addi.d S8, S8, 0x08 | |||||
| addi.d TD, TD, 0x40 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_I1 | |||||
| .L_J11: /* j--*/ | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N1: /* if(n&4)*/ | |||||
| andi I, N, 0x04 | |||||
| beq I, ZERO, .L_N2 | |||||
| move S1, TS | |||||
| add.d S2, TS, TL | |||||
| move I, M | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| add.d TS, S4, TL | |||||
| beq I, ZERO, .L_N2 | |||||
| .L_N11: /* if(i>0)*/ | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fld.s F4, S3, 0x00 | |||||
| fld.s F5, S3, 0x04 | |||||
| fld.s F6, S4, 0x00 | |||||
| fld.s F7, S4, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| fst.s F2, TD, 0x08 | |||||
| fst.s F3, TD, 0x0c | |||||
| fst.s F4, TD, 0x10 | |||||
| fst.s F5, TD, 0x14 | |||||
| fst.s F6, TD, 0x18 | |||||
| fst.s F7, TD, 0x1c | |||||
| addi.d S1, S1, 0x08 | |||||
| addi.d S2, S2, 0x08 | |||||
| addi.d S3, S3, 0x08 | |||||
| addi.d S4, S4, 0x08 | |||||
| addi.d TD, TD, 0x20 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N11 | |||||
| .L_N2: /* if(n&2)*/ | |||||
| andi I, N, 0x02 | |||||
| beq I, ZERO, .L_N3 | |||||
| move S1, TS | |||||
| add.d S2, TS, TL | |||||
| move I, M | |||||
| add.d TS, S2, TL | |||||
| beq I, ZERO, .L_N3 | |||||
| .L_N21: /* if(i>0)*/ | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| fst.s F2, TD, 0x08 | |||||
| fst.s F3, TD, 0x0c | |||||
| addi.d S1, S1, 0x08 | |||||
| addi.d S2, S2, 0x08 | |||||
| addi.d TD, TD, 0x10 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N21 | |||||
| .L_N3: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq I, ZERO, .L_N0 | |||||
| move S1, TS | |||||
| move I, M | |||||
| beq I, ZERO, .L_N0 | |||||
| .L_N31: /* if(i>0)*/ | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| addi.d S1, S1, 0x08 | |||||
| addi.d TD, TD, 0x08 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N31 | |||||
| .L_N0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
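The scalar variant above repeats the same gather at widths 8, 4, 2 and 1, one complex element per `fld.s`/`fst.s` pair. A compact reconstruction (name illustrative; the width loop stands in for the `.L_J1`/`.L_N1`/`.L_N2`/`.L_N3` chain):

```c
#include <complex.h>

/* Reconstructed sketch of the scalar 8-column pack above (name illustrative). */
static void cgemm_ncopy_8_sketch(long m, long n, const float complex *a,
                                 long lda, float complex *b) {
    const float complex *col = a;
    for (long w = 8; w >= 1; w >>= 1) {
        long blocks = (w == 8) ? (n >> 3) : ((n & w) ? 1 : 0);
        while (blocks-- > 0) {
            for (long i = 0; i < m; i++)       /* one row of w columns at a time */
                for (long j = 0; j < w; j++)
                    *b++ = col[i + j * lda];
            col += w * lda;                    /* consume w columns */
        }
    }
}
```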
| @@ -0,0 +1,324 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define TD $r16 | |||||
| #define TS $r17 | |||||
| #define TL $r18 | |||||
| #define T0 $r19 | |||||
| #define S8 $r20 | |||||
| #define S9 $r23 | |||||
| #define S10 $r11 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr0 | |||||
| #define U1 $vr1 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define U8 $vr8 | |||||
| #define U9 $vr9 | |||||
| #define U10 $vr10 | |||||
| #define U11 $vr11 | |||||
| #define U12 $vr12 | |||||
| #define U13 $vr13 | |||||
| #define U14 $vr14 | |||||
| #define U15 $vr15 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TS, SRC //aoffset | |||||
| move TD, DST //boffset | |||||
| slli.d TL, LDA, 0x02 // TL = lda * 4 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 8: stride in bytes between source rows | |||||
| ori T0, ZERO, 0x03 | |||||
| andn T0, N, T0 // T0 = n & ~3 | |||||
| mul.w T0, M, T0 // T0 = m * (n & ~3) (32-bit multiply) | |||||
| slli.d T0, T0, 0x01 | |||||
| slli.d T0, T0, 0x02 // T0 *= 8 (bytes per complex float) | |||||
| add.d S9, DST, T0 //boffset2 | |||||
| ori T0, ZERO, 0x01 | |||||
| andn T0, N, T0 // T0 = n & ~1 | |||||
| mul.w T0, M, T0 // T0 = m * (n & ~1) (32-bit multiply) | |||||
| slli.d T0, T0, 0x01 | |||||
| slli.d T0, T0, 0x02 // T0 *= 8 (bytes per complex float) | |||||
| add.d S10, DST, T0 //boffset3 | |||||
| srai.d J, M, 0x02 //j | |||||
| beq J, ZERO, .L_M1 | |||||
| .L_J1: /* if(j>0) j--*/ | |||||
| move S1, TS //aoffset1 | |||||
| add.d S2, S1, TL | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| slli.d T0, TL, 0x02 | |||||
| add.d TS, TS, T0 | |||||
| move S8, TD //boffset1 | |||||
| addi.d TD, TD, 0x80 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_JN1 | |||||
| .L_JI1: /* if(i>0) i--*/ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vld U4, S3, 0x00 | |||||
| vld U5, S3, 0x10 | |||||
| vld U6, S4, 0x00 | |||||
| vld U7, S4, 0x10 | |||||
| vst U0, S8, 0x00 | |||||
| vst U1, S8, 0x10 | |||||
| vst U2, S8, 0x20 | |||||
| vst U3, S8, 0x30 | |||||
| vst U4, S8, 0x40 | |||||
| vst U5, S8, 0x50 | |||||
| vst U6, S8, 0x60 | |||||
| vst U7, S8, 0x70 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S3, S3, 0x20 | |||||
| addi.d S4, S4, 0x20 | |||||
| slli.d T0, M, 0x05 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_JI1 | |||||
| .L_JN1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_JN2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vld U2, S3, 0x00 | |||||
| vld U3, S4, 0x00 | |||||
| vst U0, S9, 0x00 | |||||
| vst U1, S9, 0x10 | |||||
| vst U2, S9, 0x20 | |||||
| vst U3, S9, 0x30 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d S3, S3, 0x10 | |||||
| addi.d S4, S4, 0x10 | |||||
| addi.d S9, S9, 0x40 | |||||
| .L_JN2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_J0 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fld.s F4, S3, 0x00 | |||||
| fld.s F5, S3, 0x04 | |||||
| fld.s F6, S4, 0x00 | |||||
| fld.s F7, S4, 0x04 | |||||
| fst.s F0, S10, 0x00 | |||||
| fst.s F1, S10, 0x04 | |||||
| fst.s F2, S10, 0x08 | |||||
| fst.s F3, S10, 0x0c | |||||
| fst.s F4, S10, 0x10 | |||||
| fst.s F5, S10, 0x14 | |||||
| fst.s F6, S10, 0x18 | |||||
| fst.s F7, S10, 0x1c | |||||
| addi.d S10, S10, 0x20 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_M1: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq ZERO, I, .L_M2 | |||||
| move S1, TS //aoffset1 | |||||
| add.d S2, S1, TL | |||||
| slli.d T0, TL, 0x01 | |||||
| add.d TS, TS, T0 | |||||
| move S8, TD //boffset1 | |||||
| addi.d TD, TD, 0x40 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_M1N1 | |||||
| .L_M1I1: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vst U0, S8, 0x00 | |||||
| vst U1, S8, 0x10 | |||||
| vst U2, S8, 0x20 | |||||
| vst U3, S8, 0x30 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| slli.d T0, M, 0x05 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_M1I1 | |||||
| .L_M1N1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_M1N2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vst U0, S9, 0x00 | |||||
| vst U1, S9, 0x10 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d S9, S9, 0x20 | |||||
| .L_M1N2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_M2 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fst.s F0, S10, 0x00 | |||||
| fst.s F1, S10, 0x04 | |||||
| fst.s F2, S10, 0x08 | |||||
| fst.s F3, S10, 0x0c | |||||
| addi.d S10, S10, 0x10 | |||||
| .L_M2: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_M0 | |||||
| move S1, TS //aoffset1 | |||||
| move S8, TD //boffset1 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_M2N1 | |||||
| .L_M2I1: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vst U0, S8, 0x00 | |||||
| vst U1, S8, 0x10 | |||||
| addi.d S1, S1, 0x20 | |||||
| slli.d T0, M, 0x05 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_M2I1 | |||||
| .L_M2N1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_M2N2 | |||||
| vld U0, S1, 0x00 | |||||
| vst U0, S9, 0x00 | |||||
| addi.d S1, S1, 0x10 | |||||
| .L_M2N2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_M0 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fst.s F0, S10, 0x00 | |||||
| fst.s F1, S10, 0x04 | |||||
| .L_M0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
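This kernel writes three separate destination regions, matching the `boffset1`/`boffset2`/`boffset3` comments: full four-wide panels first, then a two-wide and a one-wide panel for the n tails. The sketch below is reconstructed from the assembly (name illustrative) and views the source the way the kernel walks it, as m rows of n contiguous elements with row stride lda:

```c
#include <complex.h>
#include <string.h>

/* Reconstructed sketch of the three-region pack above (name illustrative).
 * Within the four-wide region, the chunk for row i of panel j/4 lives at
 * panel_base + (j/4)*(m*4) + i*4 - the "slli.d T0, M, 0x05" stride. */
static void cgemm_tcopy_4_sketch(long m, long n, const float complex *a,
                                 long lda, float complex *b) {
    float complex *b1 = b;                     /* boffset1: 4-wide panels */
    float complex *b2 = b + m * (n & ~3L);     /* boffset2: 2-wide panel  */
    float complex *b3 = b + m * (n & ~1L);     /* boffset3: 1-wide panel  */
    for (long i = 0; i < m; i++) {
        const float complex *row = a + i * lda;
        long j = 0;
        for (; j + 4 <= n; j += 4)
            memcpy(b1 + (j >> 2) * m * 4 + i * 4, row + j,
                   4 * sizeof(float complex));
        if (n & 2) {
            memcpy(b2 + i * 2, row + j, 2 * sizeof(float complex));
            j += 2;
        }
        if (n & 1)
            b3[i] = row[j];
    }
}
```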
| @@ -0,0 +1,277 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define S8 $r19 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r7 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr0 | |||||
| #define U1 $vr1 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define D0 $vr8 | |||||
| #define D1 $vr9 | |||||
| #define D2 $vr10 | |||||
| #define D3 $vr11 | |||||
| #define D4 $vr12 | |||||
| #define D5 $vr13 | |||||
| #define D6 $vr14 | |||||
| #define D7 $vr15 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TS, SRC //aoffset | |||||
| move TD, DST //boffset | |||||
| slli.d TL, LDA, 0x02 // TL = lda * 4 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 8: stride in bytes between source rows | |||||
| srai.d J, N, 0x03 //j | |||||
| beq J, ZERO, .L_N1 | |||||
| .L_J1: /* if(j>0) j--*/ | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| addi.d TS, TS, 0x40 | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_J1M1 | |||||
| .L_J1I1: /* if(i>0) i--*/ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vld U4, S2, 0x00 | |||||
| vld U5, S2, 0x10 | |||||
| vld U6, S2, 0x20 | |||||
| vld U7, S2, 0x30 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| vst U2, TD, 0x20 | |||||
| vst U3, TD, 0x30 | |||||
| vst U4, TD, 0x40 | |||||
| vst U5, TD, 0x50 | |||||
| vst U6, TD, 0x60 | |||||
| vst U7, TD, 0x70 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x80 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_J1I1 | |||||
| .L_J1M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_J0 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| vst U2, TD, 0x20 | |||||
| vst U3, TD, 0x30 | |||||
| addi.d TD, TD, 0x40 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N1: /* if(n&4) */ | |||||
| andi I, N, 0x04 | |||||
| beq ZERO, I, .L_N2 | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| addi.d TS, TS, 0x20 | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_N1M1 | |||||
| .L_N1I1: /* if(i>0) i-- */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| vst U2, TD, 0x20 | |||||
| vst U3, TD, 0x30 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x40 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N1I1 | |||||
| .L_N1M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_N2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| addi.d TD, TD, 0x20 | |||||
| .L_N2: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_N3 | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| addi.d TS, TS, 0x10 | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_N2M1 | |||||
| .L_N2I1: /* if(i>0) i-- */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x20 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N2I1 | |||||
| .L_N2M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_N3 | |||||
| vld U0, S1, 0x00 | |||||
| vst U0, TD, 0x00 | |||||
| addi.d TD, TD, 0x10 | |||||
| .L_N3: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_N0 | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_N3M1 | |||||
| .L_N3I1: /* if(i>0) i-- */ | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fld.s F2, S2, 0x00 | |||||
| fld.s F3, S2, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| fst.s F2, TD, 0x08 | |||||
| fst.s F3, TD, 0x0c | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x10 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N3I1 | |||||
| .L_N3M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_N0 | |||||
| fld.s F0, S1, 0x00 | |||||
| fld.s F1, S1, 0x04 | |||||
| fst.s F0, TD, 0x00 | |||||
| fst.s F1, TD, 0x04 | |||||
| .L_N0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
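In contrast to the previous copy, this kernel writes its destination strictly sequentially, one panel of rows at a time, narrowing the panel width from 8 through the `n&4`/`n&2`/`n&1` tails. Reconstructed sketch (name illustrative, same row-major view of the source):

```c
#include <complex.h>
#include <string.h>

/* Reconstructed sketch of the sequential-panel pack above (name illustrative). */
static void cgemm_tcopy_8_sketch(long m, long n, const float complex *a,
                                 long lda, float complex *b) {
    long i, j;
    for (j = 0; j + 8 <= n; j += 8)            /* .L_J1: full 8-wide panels */
        for (i = 0; i < m; i++, b += 8)
            memcpy(b, a + i * lda + j, 8 * sizeof(float complex));
    for (long w = 4; w >= 1; w >>= 1)          /* .L_N1 / .L_N2 / .L_N3 tails */
        if (n & w) {
            for (i = 0; i < m; i++, b += w)
                memcpy(b, a + i * lda + j, w * sizeof(float complex));
            j += w;
        }
}
```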
| @@ -0,0 +1,320 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r19 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LASX vectors */ | |||||
| #define U0 $xr0 | |||||
| #define U1 $xr1 | |||||
| #define U2 $xr2 | |||||
| #define U3 $xr3 | |||||
| #define U4 $xr4 | |||||
| #define U5 $xr5 | |||||
| #define U6 $xr6 | |||||
| #define U7 $xr7 | |||||
| #define D0 $xr8 | |||||
| #define D1 $xr9 | |||||
| #define D2 $xr10 | |||||
| #define D3 $xr11 | |||||
| #define D4 $xr12 | |||||
| #define D5 $xr13 | |||||
| #define D6 $xr14 | |||||
| #define D7 $xr15 | |||||
| #define D8 $xr16 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TD, DST //boffset | |||||
| move TS, SRC //aoffset | |||||
| slli.d TL, LDA, 0x03 // TL = lda * 8 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 16: column stride in bytes (double-precision complex) | |||||
| srai.d J, N, 0x02 | |||||
| beq J, ZERO, .L_N0 | |||||
| .L_J1: /* J-- */ | |||||
| move S1, TS | |||||
| add.d S2, S1, TL | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| slli.d T0, TL, 0x02 | |||||
| add.d TS, TS, T0 | |||||
| srai.d I, M, 0x02 | |||||
| beq I, ZERO, .L_I3 | |||||
| .L_I1: /* I-- */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S2, 0x00 | |||||
| xvld U3, S2, 0x20 | |||||
| xvld U4, S3, 0x00 | |||||
| xvld U5, S3, 0x20 | |||||
| xvld U6, S4, 0x00 | |||||
| xvld U7, S4, 0x20 | |||||
| xvand.v D0, U0, U0 | |||||
| xvand.v D1, U1, U1 | |||||
| xvand.v D2, U2, U2 | |||||
| xvand.v D3, U3, U3 | |||||
| xvand.v D4, U4, U4 | |||||
| xvand.v D5, U5, U5 | |||||
| xvand.v D6, U6, U6 | |||||
| xvand.v D7, U7, U7 | |||||
| xvpermi.q D0, U2, 0x02 | |||||
| xvpermi.q D4, U6, 0x02 | |||||
| xvpermi.q D2, U0, 0x31 | |||||
| xvpermi.q D6, U4, 0x31 | |||||
| xvpermi.q D1, U3, 0x02 | |||||
| xvpermi.q D5, U7, 0x02 | |||||
| xvpermi.q D3, U1, 0x31 | |||||
| xvpermi.q D7, U5, 0x31 | |||||
| xvst D0, TD, 0x00 | |||||
| xvst D4, TD, 0x20 | |||||
| xvst D2, TD, 0x40 | |||||
| xvst D6, TD, 0x60 | |||||
| xvst D1, TD, 0x80 | |||||
| xvst D5, TD, 0xa0 | |||||
| xvst D3, TD, 0xc0 | |||||
| xvst D7, TD, 0xe0 | |||||
| addi.d S1, S1, 0x40 // a_offset | |||||
| addi.d S2, S2, 0x40 | |||||
| addi.d S3, S3, 0x40 | |||||
| addi.d S4, S4, 0x40 | |||||
| addi.d TD, TD, 0x100 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_I1 | |||||
| .L_I3: | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_II20 | |||||
| .L_II1: /* if(m&2) */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S2, 0x00 | |||||
| xvld U2, S3, 0x00 | |||||
| xvld U3, S4, 0x00 | |||||
| xvand.v D0, U0, U0 | |||||
| xvand.v D1, U1, U1 | |||||
| xvand.v D2, U2, U2 | |||||
| xvand.v D3, U3, U3 | |||||
| xvpermi.q D0, U1, 0x02 | |||||
| xvpermi.q D2, U3, 0x02 | |||||
| xvpermi.q D1, U0, 0x31 | |||||
| xvpermi.q D3, U2, 0x31 | |||||
| xvst D0, TD, 0x00 | |||||
| xvst D2, TD, 0x20 | |||||
| xvst D1, TD, 0x40 | |||||
| xvst D3, TD, 0x60 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S3, S3, 0x20 | |||||
| addi.d S4, S4, 0x20 | |||||
| addi.d TD, TD, 0x80 | |||||
| .L_II20: | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_J0 | |||||
| .L_II2: /* if(m&1) */ | |||||
| vld $vr0, S1, 0x00 | |||||
| vld $vr1, S2, 0x00 | |||||
| vld $vr2, S3, 0x00 | |||||
| vld $vr3, S4, 0x00 | |||||
| vst $vr0, TD, 0x00 | |||||
| vst $vr1, TD, 0x10 | |||||
| vst $vr2, TD, 0x20 | |||||
| vst $vr3, TD, 0x30 | |||||
| addi.d TD, TD, 0x40 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N0: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_N20 | |||||
| move S1, TS | |||||
| add.d S2, S1, TL | |||||
| slli.d T0, TL, 0x01 | |||||
| add.d TS, TS, T0 | |||||
| srai.d I, M, 0x02 | |||||
| beq ZERO, I, .L_N10 | |||||
| .L_N11: /* if(i>0) */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S2, 0x00 | |||||
| xvld U3, S2, 0x20 | |||||
| xvand.v D0, U0, U0 | |||||
| xvand.v D1, U1, U1 | |||||
| xvand.v D2, U2, U2 | |||||
| xvand.v D3, U3, U3 | |||||
| xvpermi.q D0, U2, 0x02 | |||||
| xvpermi.q D2, U0, 0x31 | |||||
| xvpermi.q D1, U3, 0x02 | |||||
| xvpermi.q D3, U1, 0x31 | |||||
| xvst D0, TD, 0x00 | |||||
| xvst D2, TD, 0x20 | |||||
| xvst D1, TD, 0x40 | |||||
| xvst D3, TD, 0x60 | |||||
| addi.d S1, S1, 0x40 // a_offset | |||||
| addi.d S2, S2, 0x40 | |||||
| addi.d TD, TD, 0x80 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N11 | |||||
| .L_N10: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_N130 | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S2, 0x00 | |||||
| xvand.v D0, U0, U0 | |||||
| xvpermi.q D0, U1, 0x02 | |||||
| xvpermi.q U1, U0, 0x31 | |||||
| xvst D0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| addi.d S1, S1, 0x20 // a_offset | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d TD, TD, 0x40 // b_offset | |||||
| .L_N130: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_N20 | |||||
| vld $vr0, S1, 0x00 | |||||
| vld $vr1, S2, 0x00 | |||||
| vst $vr0, TD, 0x00 | |||||
| vst $vr1, TD, 0x10 | |||||
| addi.d TD, TD, 0x20 | |||||
| .L_N20: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq I, ZERO, .L_N00 | |||||
| move S1, TS | |||||
| srai.d I, M, 0x02 | |||||
| beq I, ZERO, .L_N30 | |||||
| .L_N21: /* if(i>0) */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvst U0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| addi.d S1, S1, 0x40 // aoffset1 | |||||
| addi.d TD, TD, 0x40 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N21 | |||||
| .L_N30: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_N330 | |||||
| xvld U0, S1, 0x00 | |||||
| xvst U0, TD, 0x00 | |||||
| addi.d S1, S1, 0x20 // aoffset1 | |||||
| addi.d TD, TD, 0x20 // b_offset | |||||
| .L_N330: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_N00 | |||||
| vld $vr0, S1, 0x00 | |||||
| vst $vr0, TD, 0x00 | |||||
| .L_N00: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
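The `xvpermi.q` pairings carry the whole interleave in the double-complex kernel above. The model below is my reading of the LASX lane-select semantics, written out so the 0x02/0x31 immediates can be checked by eye; treat it as an assumption, not a reference:

```c
#include <stdint.h>
#include <string.h>

typedef struct { uint8_t q[2][16]; } xr256;    /* two 128-bit lanes */

/* Model of: xvpermi.q xd, xj, imm
 * Result lane k is picked by a 2-bit field of imm (bits [1:0] for lane 0,
 * bits [5:4] for lane 1) from the pool {0: xj.lo, 1: xj.hi, 2: xd.lo,
 * 3: xd.hi}.  imm 0x02 -> {xd.lo, xj.lo}: the first-row element of two
 * adjacent columns; imm 0x31 -> {xj.hi, xd.hi}: the second-row element of
 * the same columns - the two pairings the kernel stores. */
static xr256 xvpermi_q(xr256 xd, xr256 xj, unsigned imm) {
    const uint8_t *pool[4] = { xj.q[0], xj.q[1], xd.q[0], xd.q[1] };
    xr256 r;
    memcpy(r.q[0], pool[imm & 3], 16);
    memcpy(r.q[1], pool[(imm >> 4) & 3], 16);
    return r;
}
```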
| @@ -0,0 +1,332 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r19 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr0 | |||||
| #define U1 $vr1 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define U8 $vr8 | |||||
| #define U9 $vr9 | |||||
| #define U10 $vr10 | |||||
| #define U11 $vr11 | |||||
| #define U12 $vr12 | |||||
| #define U13 $vr13 | |||||
| #define U14 $vr14 | |||||
| #define U15 $vr15 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TD, DST //boffset | |||||
| move TS, SRC //aoffset | |||||
| slli.d TL, LDA, 0x03 // TL = lda * 8 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 16: column stride in bytes (double-precision complex) | |||||
| srai.d J, N, 0x02 | |||||
| beq J, ZERO, .L_N0 | |||||
| .L_J1: /* J-- */ | |||||
| move S1, TS | |||||
| add.d S2, S1, TL | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| slli.d T0, TL, 0x02 | |||||
| add.d TS, TS, T0 | |||||
| srai.d I, M, 0x02 | |||||
| beq I, ZERO, .L_I3 | |||||
| .L_I1: /* I-- */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vld U4, S2, 0x00 | |||||
| vld U5, S2, 0x10 | |||||
| vld U6, S2, 0x20 | |||||
| vld U7, S2, 0x30 | |||||
| vld U8, S3, 0x00 | |||||
| vld U9, S3, 0x10 | |||||
| vld U10, S3, 0x20 | |||||
| vld U11, S3, 0x30 | |||||
| vld U12, S4, 0x00 | |||||
| vld U13, S4, 0x10 | |||||
| vld U14, S4, 0x20 | |||||
| vld U15, S4, 0x30 | |||||
| vst U0, TD, 0x00 | |||||
| vst U4, TD, 0x10 | |||||
| vst U8, TD, 0x20 | |||||
| vst U12, TD, 0x30 | |||||
| vst U1, TD, 0x40 | |||||
| vst U5, TD, 0x50 | |||||
| vst U9, TD, 0x60 | |||||
| vst U13, TD, 0x70 | |||||
| vst U2, TD, 0x80 | |||||
| vst U6, TD, 0x90 | |||||
| vst U10, TD, 0xa0 | |||||
| vst U14, TD, 0xb0 | |||||
| vst U3, TD, 0xc0 | |||||
| vst U7, TD, 0xd0 | |||||
| vst U11, TD, 0xe0 | |||||
| vst U15, TD, 0xf0 | |||||
| addi.d S1, S1, 0x40 // a_offset | |||||
| addi.d S2, S2, 0x40 | |||||
| addi.d S3, S3, 0x40 | |||||
| addi.d S4, S4, 0x40 | |||||
| addi.d TD, TD, 0x100 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_I1 | |||||
| .L_I3: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_II20 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vld U4, S3, 0x00 | |||||
| vld U5, S3, 0x10 | |||||
| vld U6, S4, 0x00 | |||||
| vld U7, S4, 0x10 | |||||
| vst U0, TD, 0x00 | |||||
| vst U2, TD, 0x10 | |||||
| vst U4, TD, 0x20 | |||||
| vst U6, TD, 0x30 | |||||
| vst U1, TD, 0x40 | |||||
| vst U3, TD, 0x50 | |||||
| vst U5, TD, 0x60 | |||||
| vst U7, TD, 0x70 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S3, S3, 0x20 | |||||
| addi.d S4, S4, 0x20 | |||||
| addi.d TD, TD, 0x80 | |||||
| .L_II20: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_J0 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vld U2, S3, 0x00 | |||||
| vld U3, S4, 0x00 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| vst U2, TD, 0x20 | |||||
| vst U3, TD, 0x30 | |||||
| addi.d TD, TD, 0x40 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N0: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_N20 | |||||
| move S1, TS | |||||
| add.d S2, S1, TL | |||||
| slli.d T0, TL, 0x01 | |||||
| add.d TS, TS, T0 | |||||
| srai.d I, M, 0x02 | |||||
| beq ZERO, I, .L_N10 | |||||
| .L_N11: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vld U4, S2, 0x00 | |||||
| vld U5, S2, 0x10 | |||||
| vld U6, S2, 0x20 | |||||
| vld U7, S2, 0x30 | |||||
| vst U0, TD, 0x00 | |||||
| vst U4, TD, 0x10 | |||||
| vst U1, TD, 0x20 | |||||
| vst U5, TD, 0x30 | |||||
| vst U2, TD, 0x40 | |||||
| vst U6, TD, 0x50 | |||||
| vst U3, TD, 0x60 | |||||
| vst U7, TD, 0x70 | |||||
| addi.d S1, S1, 0x40 // a_offset | |||||
| addi.d S2, S2, 0x40 | |||||
| addi.d TD, TD, 0x80 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N11 | |||||
| .L_N10: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_N130 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vst U0, TD, 0x00 | |||||
| vst U2, TD, 0x10 | |||||
| vst U1, TD, 0x20 | |||||
| vst U3, TD, 0x30 | |||||
| addi.d S1, S1, 0x20 // a_offset | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d TD, TD, 0x40 // b_offset | |||||
| .L_N130: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_N20 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| addi.d TD, TD, 0x20 | |||||
| .L_N20: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq I, ZERO, .L_N00 | |||||
| move S1, TS | |||||
| srai.d I, M, 0x02 | |||||
| beq I, ZERO, .L_N30 | |||||
| .L_N21: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| vst U2, TD, 0x20 | |||||
| vst U3, TD, 0x30 | |||||
| addi.d S1, S1, 0x40 // aoffset1 | |||||
| addi.d TD, TD, 0x40 // b_offset | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N21 | |||||
| .L_N30: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq I, ZERO, .L_N330 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vst U0, TD, 0x00 | |||||
| vst U1, TD, 0x10 | |||||
| addi.d S1, S1, 0x20 // aoffset1 | |||||
| addi.d TD, TD, 0x20 // b_offset | |||||
| .L_N330: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq I, ZERO, .L_N00 | |||||
| vld U0, S1, 0x00 | |||||
| vst U0, TD, 0x00 | |||||
| .L_N00: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
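The LSX counterpart above needs no shuffles at all: a 128-bit register holds exactly one double-precision complex value, so the column interleave is purely a store order, visible in the `vst` sequence U0, U4, U8, U12, U1, U5, ... A reconstruction of the inner 4x4 block (names illustrative):

```c
#include <complex.h>

/* Reconstructed inner 4x4 block of the LSX double-complex pack above:
 * four column pointers in, sixteen interleaved elements out. */
static void zgemm_ncopy_4x4_block(const double complex *col0,
                                  const double complex *col1,
                                  const double complex *col2,
                                  const double complex *col3,
                                  double complex *b) {
    for (long i = 0; i < 4; i++) {             /* rows m0..m3 */
        *b++ = col0[i];
        *b++ = col1[i];
        *b++ = col2[i];
        *b++ = col3[i];
    }
}
```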
| @@ -0,0 +1,263 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define S8 $r19 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r7 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LASX vectors */ | |||||
| #define U0 $xr0 | |||||
| #define U1 $xr1 | |||||
| #define U2 $xr2 | |||||
| #define U3 $xr3 | |||||
| #define U4 $xr4 | |||||
| #define U5 $xr5 | |||||
| #define U6 $xr6 | |||||
| #define U7 $xr7 | |||||
| #define D0 $xr8 | |||||
| #define D1 $xr9 | |||||
| #define D2 $xr10 | |||||
| #define D3 $xr11 | |||||
| #define D4 $xr12 | |||||
| #define D5 $xr13 | |||||
| #define D6 $xr14 | |||||
| #define D7 $xr15 | |||||
| #define D8 $xr16 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TD, DST //boffset | |||||
| move TS, SRC //aoffset | |||||
| slli.d TL, LDA, 0x03 // TL = lda * 8 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 16: column stride in bytes | |||||
| slli.d T0, TL, 0x03 // T0 = stride of 8 columns | |||||
| srai.d J, N, 0x03 //j | |||||
| beq J, ZERO, .L_N1 | |||||
| .L_J1: /* if(j>0) j--*/ | |||||
| move S1, TS | |||||
| add.d S2, TS, TL | |||||
| move I, M | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| add.d S5, S4, TL | |||||
| add.d S6, S5, TL | |||||
| add.d S7, S6, TL | |||||
| add.d S8, S7, TL | |||||
| add.d TS, TS, T0 | |||||
| beq I, ZERO, .L_J11 | |||||
| .L_I1: /* if(i>0) i--*/ | |||||
| fld.d F0, S1, 0x00 | |||||
| fld.d F1, S1, 0x08 | |||||
| fld.d F2, S2, 0x00 | |||||
| fld.d F3, S2, 0x08 | |||||
| fld.d F4, S3, 0x00 | |||||
| fld.d F5, S3, 0x08 | |||||
| fld.d F6, S4, 0x00 | |||||
| fld.d F7, S4, 0x08 | |||||
| fst.d F0, TD, 0x00 | |||||
| fst.d F1, TD, 0x08 | |||||
| fst.d F2, TD, 0x10 | |||||
| fst.d F3, TD, 0x18 | |||||
| fst.d F4, TD, 0x20 | |||||
| fst.d F5, TD, 0x28 | |||||
| fst.d F6, TD, 0x30 | |||||
| fst.d F7, TD, 0x38 | |||||
| fld.d F0, S5, 0x00 | |||||
| fld.d F1, S5, 0x08 | |||||
| fld.d F2, S6, 0x00 | |||||
| fld.d F3, S6, 0x08 | |||||
| fld.d F4, S7, 0x00 | |||||
| fld.d F5, S7, 0x08 | |||||
| fld.d F6, S8, 0x00 | |||||
| fld.d F7, S8, 0x08 | |||||
| fst.d F0, TD, 0x40 | |||||
| fst.d F1, TD, 0x48 | |||||
| fst.d F2, TD, 0x50 | |||||
| fst.d F3, TD, 0x58 | |||||
| fst.d F4, TD, 0x60 | |||||
| fst.d F5, TD, 0x68 | |||||
| fst.d F6, TD, 0x70 | |||||
| fst.d F7, TD, 0x78 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d S3, S3, 0x10 | |||||
| addi.d S4, S4, 0x10 | |||||
| addi.d S5, S5, 0x10 | |||||
| addi.d S6, S6, 0x10 | |||||
| addi.d S7, S7, 0x10 | |||||
| addi.d S8, S8, 0x10 | |||||
| addi.d TD, TD, 0x80 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_I1 | |||||
| .L_J11: /* j--*/ | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N1: /* if(n&4)*/ | |||||
| andi I, N, 0x04 | |||||
| beq I, ZERO, .L_N2 | |||||
| move S1, TS | |||||
| add.d S2, TS, TL | |||||
| move I, M | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| add.d TS, S4, TL | |||||
| beq I, ZERO, .L_N2 | |||||
| .L_N11: /* if(i>0)*/ | |||||
| fld.d F0, S1, 0x00 | |||||
| fld.d F1, S1, 0x08 | |||||
| fld.d F2, S2, 0x00 | |||||
| fld.d F3, S2, 0x08 | |||||
| fld.d F4, S3, 0x00 | |||||
| fld.d F5, S3, 0x08 | |||||
| fld.d F6, S4, 0x00 | |||||
| fld.d F7, S4, 0x08 | |||||
| fst.d F0, TD, 0x00 | |||||
| fst.d F1, TD, 0x08 | |||||
| fst.d F2, TD, 0x10 | |||||
| fst.d F3, TD, 0x18 | |||||
| fst.d F4, TD, 0x20 | |||||
| fst.d F5, TD, 0x28 | |||||
| fst.d F6, TD, 0x30 | |||||
| fst.d F7, TD, 0x38 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d S3, S3, 0x10 | |||||
| addi.d S4, S4, 0x10 | |||||
| addi.d TD, TD, 0x40 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N11 | |||||
| .L_N2: /* if(n&2)*/ | |||||
| andi I, N, 0x02 | |||||
| beq I, ZERO, .L_N3 | |||||
| move S1, TS | |||||
| add.d S2, TS, TL | |||||
| move I, M | |||||
| add.d TS, S2, TL | |||||
| beq I, ZERO, .L_N3 | |||||
| .L_N21: /* if(i>0)*/ | |||||
| fld.d F0, S1, 0x00 | |||||
| fld.d F1, S1, 0x08 | |||||
| fld.d F2, S2, 0x00 | |||||
| fld.d F3, S2, 0x08 | |||||
| fst.d F0, TD, 0x00 | |||||
| fst.d F1, TD, 0x08 | |||||
| fst.d F2, TD, 0x10 | |||||
| fst.d F3, TD, 0x18 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d S2, S2, 0x10 | |||||
| addi.d TD, TD, 0x20 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N21 | |||||
| .L_N3: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq I, ZERO, .L_N0 | |||||
| move S1, TS | |||||
| move I, M | |||||
| beq I, ZERO, .L_N0 | |||||
| .L_N31: /* if(i>0)*/ | |||||
| fld.d F0, S1, 0x00 | |||||
| fld.d F1, S1, 0x08 | |||||
| fst.d F0, TD, 0x00 | |||||
| fst.d F1, TD, 0x08 | |||||
| addi.d S1, S1, 0x10 | |||||
| addi.d TD, TD, 0x10 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N31 | |||||
| .L_N0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
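A quick way to sanity-check any of these pack kernels is to tag every source element and verify the packed layout. The harness below is hypothetical: the `(m, n, a, lda, b)` argument order matches OpenBLAS's generic copy kernels and the symbol name `zgemm_ncopy_8` is a stand-in, both assumptions rather than facts about this PR, and only the full eight-wide panels are checked:

```c
#include <complex.h>
#include <stdio.h>

/* Hypothetical kernel prototype - argument order and symbol name assumed. */
extern int zgemm_ncopy_8(long m, long n, double complex *a, long lda,
                         double complex *b);

static int check_ncopy_8(long m, long n, long lda) {  /* requires m <= lda */
    double complex a[lda * n], b[m * n];
    for (long j = 0; j < n; j++)
        for (long i = 0; i < m; i++)
            a[i + j * lda] = i + j * I;                /* tag each element */
    zgemm_ncopy_8(m, n, a, lda, b);
    for (long p = 0; p < n / 8; p++)                   /* full 8-wide panels */
        for (long i = 0; i < m; i++)
            for (long c = 0; c < 8; c++)
                if (b[(p * m + i) * 8 + c] != a[i + (8 * p + c) * lda]) {
                    printf("mismatch: panel %ld row %ld col %ld\n", p, i, c);
                    return 1;
                }
    return 0;
}
```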
| @@ -0,0 +1,302 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define TD $r16 | |||||
| #define TS $r17 | |||||
| #define TL $r18 | |||||
| #define T0 $r19 | |||||
| #define S8 $r20 | |||||
| #define S9 $r23 | |||||
| #define S10 $r11 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LASX vectors */ | |||||
| #define U0 $xr0 | |||||
| #define U1 $xr1 | |||||
| #define U2 $xr2 | |||||
| #define U3 $xr3 | |||||
| #define U4 $xr4 | |||||
| #define U5 $xr5 | |||||
| #define U6 $xr6 | |||||
| #define U7 $xr7 | |||||
| #define D0 $xr8 | |||||
| #define D1 $xr9 | |||||
| #define D2 $xr10 | |||||
| #define D3 $xr11 | |||||
| #define D4 $xr12 | |||||
| #define D5 $xr13 | |||||
| #define D6 $xr14 | |||||
| #define D7 $xr15 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TS, SRC //aoffset | |||||
| move TD, DST //boffset | |||||
| slli.d TL, LDA, 0x03 // TL = lda * 8 | |||||
| slli.d TL, TL, 0x01 // TL = lda * 16: stride in bytes between source rows | |||||
| ori T0, ZERO, 0x03 | |||||
| andn T0, N, T0 // T0 = n & ~3 | |||||
| mul.d T0, M, T0 // T0 = m * (n & ~3) | |||||
| slli.d T0, T0, 0x01 | |||||
| slli.d T0, T0, 0x03 // T0 *= 16 (bytes per complex double) | |||||
| add.d S9, DST, T0 //boffset2 | |||||
| ori T0, ZERO, 0x01 | |||||
| andn T0, N, T0 // T0 = n & ~1 | |||||
| mul.d T0, M, T0 // T0 = m * (n & ~1) | |||||
| slli.d T0, T0, 0x01 | |||||
| slli.d T0, T0, 0x03 // T0 *= 16 (bytes per complex double) | |||||
| add.d S10, DST, T0 //boffset3 | |||||
| srai.d J, M, 0x02 //j | |||||
| beq J, ZERO, .L_M1 | |||||
| .L_J1: /* if(j>0) j--*/ | |||||
| move S1, TS //aoffset1 | |||||
| add.d S2, S1, TL | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| slli.d T0, TL, 0x02 | |||||
| add.d TS, TS, T0 | |||||
| move S8, TD //boffset1 | |||||
| addi.d TD, TD, 0x100 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_JN1 | |||||
| .L_JI1: /* if(i>0) i--*/ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S2, 0x00 | |||||
| xvld U3, S2, 0x20 | |||||
| xvld U4, S3, 0x00 | |||||
| xvld U5, S3, 0x20 | |||||
| xvld U6, S4, 0x00 | |||||
| xvld U7, S4, 0x20 | |||||
| xvst U0, S8, 0x00 | |||||
| xvst U1, S8, 0x20 | |||||
| xvst U2, S8, 0x40 | |||||
| xvst U3, S8, 0x60 | |||||
| xvst U4, S8, 0x80 | |||||
| xvst U5, S8, 0xa0 | |||||
| xvst U6, S8, 0xc0 | |||||
| xvst U7, S8, 0xe0 | |||||
| addi.d S1, S1, 0x40 | |||||
| addi.d S2, S2, 0x40 | |||||
| addi.d S3, S3, 0x40 | |||||
| addi.d S4, S4, 0x40 | |||||
| slli.d T0, M, 0x06 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_JI1 | |||||
| .L_JN1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_JN2 | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S2, 0x00 | |||||
| xvld U2, S3, 0x00 | |||||
| xvld U3, S4, 0x00 | |||||
| xvst U0, S9, 0x00 | |||||
| xvst U1, S9, 0x20 | |||||
| xvst U2, S9, 0x40 | |||||
| xvst U3, S9, 0x60 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S3, S3, 0x20 | |||||
| addi.d S4, S4, 0x20 | |||||
| addi.d S9, S9, 0x80 | |||||
| .L_JN2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_J0 | |||||
| vld $vr0, S1, 0x00 | |||||
| vld $vr1, S2, 0x00 | |||||
| vld $vr2, S3, 0x00 | |||||
| vld $vr3, S4, 0x00 | |||||
| vst $vr0, S10, 0x00 | |||||
| vst $vr1, S10, 0x10 | |||||
| vst $vr2, S10, 0x20 | |||||
| vst $vr3, S10, 0x30 | |||||
| addi.d S10, S10, 0x40 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_M1: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq ZERO, I, .L_M2 | |||||
| move S1, TS //aoffset1 | |||||
| add.d S2, S1, TL | |||||
| slli.d T0, TL, 0x01 | |||||
| add.d TS, TS, T0 | |||||
| move S8, TD //boffset1 | |||||
| addi.d TD, TD, 0x80 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_M1N1 | |||||
| .L_M1I1: /* if(i>0) */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S2, 0x00 | |||||
| xvld U3, S2, 0x20 | |||||
| xvst U0, S8, 0x00 | |||||
| xvst U1, S8, 0x20 | |||||
| xvst U2, S8, 0x40 | |||||
| xvst U3, S8, 0x60 | |||||
| addi.d S1, S1, 0x40 | |||||
| addi.d S2, S2, 0x40 | |||||
| slli.d T0, M, 0x06 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_M1I1 | |||||
| .L_M1N1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_M1N2 | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S2, 0x00 | |||||
| xvst U0, S9, 0x00 | |||||
| xvst U1, S9, 0x20 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S9, S9, 0x40 | |||||
| .L_M1N2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_M2 | |||||
| vld $vr0, S1, 0x00 | |||||
| vld $vr1, S2, 0x00 | |||||
| vst $vr0, S10, 0x00 | |||||
| vst $vr1, S10, 0x10 | |||||
| addi.d S10, S10, 0x20 | |||||
| .L_M2: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_M0 | |||||
| move S1, TS //aoffset1 | |||||
| move S8, TD //boffset1 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_M2N1 | |||||
| .L_M2I1: /* if(i>0) */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvst U0, S8, 0x00 | |||||
| xvst U1, S8, 0x20 | |||||
| addi.d S1, S1, 0x40 | |||||
| slli.d T0, M, 0x06 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_M2I1 | |||||
| .L_M2N1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_M2N2 | |||||
| xvld U0, S1, 0x00 | |||||
| xvst U0, S9, 0x00 | |||||
| addi.d S1, S1, 0x20 | |||||
| .L_M2N2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_M0 | |||||
| vld $vr0, S1, 0x00 | |||||
| vst $vr0, S10, 0x00 | |||||
| .L_M0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
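For orientation, the label structure above (`.L_J1` over `m>>2`, `.L_JI1` over `n>>2`, then the `n&2` and `n&1` tails landing at `boffset2`/`boffset3`) suggests a classic GEMM "tcopy"-style packing: the source is rewritten as contiguous panels of width 4, 2 and 1 so the compute kernel can stream them. A minimal scalar sketch of that scheme follows; it is a hypothetical reference written for plain `double`s rather than the 16-byte elements the assembly actually moves, not code from the patch.

```c
/* Hypothetical scalar reference for the packing above (illustration only).
 * The m x n source is read with stride lda along m; the output is one run
 * of width-4 panels followed by the n&2 and n&1 tails, whose start
 * positions match the boffset2/boffset3 computation in the prologue. */
static void tcopy4_sketch(long m, long n, const double *src, long lda,
                          double *dst)
{
    double *b1 = dst;                 /* width-4 panels         */
    double *b2 = dst + m * (n & ~3L); /* n&2 tail ("boffset2")  */
    double *b3 = dst + m * (n & ~1L); /* n&1 tail ("boffset3")  */

    for (long i = 0; i < m; i++) {
        const double *a = src + i * lda;
        long j = 0;
        for (; j + 4 <= n; j += 4) {  /* main .L_JI1-style copy */
            double *p = b1 + (j / 4) * (4 * m) + 4 * i;
            p[0] = a[j];     p[1] = a[j + 1];
            p[2] = a[j + 2]; p[3] = a[j + 3];
        }
        if (n & 2) {                  /* .L_JN1 tail */
            b2[2 * i]     = a[j];
            b2[2 * i + 1] = a[j + 1];
            j += 2;
        }
        if (n & 1)                    /* .L_JN2 tail */
            b3[i] = a[j];
    }
}
```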
| @@ -0,0 +1,355 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define TD $r16 | |||||
| #define TS $r17 | |||||
| #define TL $r18 | |||||
| #define T0 $r19 | |||||
| #define S8 $r20 | |||||
| #define S9 $r23 | |||||
| #define S10 $r11 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr0 | |||||
| #define U1 $vr1 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define U8 $vr8 | |||||
| #define U9 $vr9 | |||||
| #define U10 $vr10 | |||||
| #define U11 $vr11 | |||||
| #define U12 $vr12 | |||||
| #define U13 $vr13 | |||||
| #define U14 $vr14 | |||||
| #define U15 $vr15 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TS, SRC //aoffset | |||||
| move TD, DST //boffset | |||||
| slli.d TL, LDA, 0x03 //TL = lda * 8 | |||||
| slli.d TL, TL, 0x01 //TL = lda * 16: byte stride along the lda dimension | |||||
| ori T0, ZERO, 0x03 | |||||
| andn T0, N, T0 //T0 = N & ~3 | |||||
| mul.d T0, M, T0 | |||||
| slli.d T0, T0, 0x01 | |||||
| slli.d T0, T0, 0x03 //T0 = M * (N & ~3) * 16 bytes | |||||
| add.d S9, DST, T0 //boffset2: start of the n&2 tail panel | |||||
| ori T0, ZERO, 0x01 | |||||
| andn T0, N, T0 //T0 = N & ~1 | |||||
| mul.d T0, M, T0 | |||||
| slli.d T0, T0, 0x01 | |||||
| slli.d T0, T0, 0x03 //T0 = M * (N & ~1) * 16 bytes | |||||
| add.d S10, DST, T0 //boffset3: start of the n&1 tail panel | |||||
| srai.d J, M, 0x02 //j = m >> 2: process m in blocks of 4 | |||||
| beq J, ZERO, .L_M1 | |||||
| .L_J1: /* if(j>0) j-- */ | |||||
| move S1, TS //aoffset1 | |||||
| add.d S2, S1, TL | |||||
| add.d S3, S2, TL | |||||
| add.d S4, S3, TL | |||||
| slli.d T0, TL, 0x02 | |||||
| add.d TS, TS, T0 | |||||
| move S8, TD //boffset1 | |||||
| addi.d TD, TD, 0x100 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_JN1 | |||||
| .L_JI1: /* if(i>0) i-- */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vld U4, S2, 0x00 | |||||
| vld U5, S2, 0x10 | |||||
| vld U6, S2, 0x20 | |||||
| vld U7, S2, 0x30 | |||||
| vld U8, S3, 0x00 | |||||
| vld U9, S3, 0x10 | |||||
| vld U10, S3, 0x20 | |||||
| vld U11, S3, 0x30 | |||||
| vld U12, S4, 0x00 | |||||
| vld U13, S4, 0x10 | |||||
| vld U14, S4, 0x20 | |||||
| vld U15, S4, 0x30 | |||||
| vst U0, S8, 0x00 | |||||
| vst U1, S8, 0x10 | |||||
| vst U2, S8, 0x20 | |||||
| vst U3, S8, 0x30 | |||||
| vst U4, S8, 0x40 | |||||
| vst U5, S8, 0x50 | |||||
| vst U6, S8, 0x60 | |||||
| vst U7, S8, 0x70 | |||||
| vst U8, S8, 0x80 | |||||
| vst U9, S8, 0x90 | |||||
| vst U10, S8, 0xa0 | |||||
| vst U11, S8, 0xb0 | |||||
| vst U12, S8, 0xc0 | |||||
| vst U13, S8, 0xd0 | |||||
| vst U14, S8, 0xe0 | |||||
| vst U15, S8, 0xf0 | |||||
| addi.d S1, S1, 0x40 | |||||
| addi.d S2, S2, 0x40 | |||||
| addi.d S3, S3, 0x40 | |||||
| addi.d S4, S4, 0x40 | |||||
| slli.d T0, M, 0x06 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_JI1 | |||||
| .L_JN1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_JN2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vld U4, S3, 0x00 | |||||
| vld U5, S3, 0x10 | |||||
| vld U6, S4, 0x00 | |||||
| vld U7, S4, 0x10 | |||||
| vst U0, S9, 0x00 | |||||
| vst U1, S9, 0x10 | |||||
| vst U2, S9, 0x20 | |||||
| vst U3, S9, 0x30 | |||||
| vst U4, S9, 0x40 | |||||
| vst U5, S9, 0x50 | |||||
| vst U6, S9, 0x60 | |||||
| vst U7, S9, 0x70 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S3, S3, 0x20 | |||||
| addi.d S4, S4, 0x20 | |||||
| addi.d S9, S9, 0x80 | |||||
| .L_JN2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_J0 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vld U2, S3, 0x00 | |||||
| vld U3, S4, 0x00 | |||||
| vst U0, S10, 0x00 | |||||
| vst U1, S10, 0x10 | |||||
| vst U2, S10, 0x20 | |||||
| vst U3, S10, 0x30 | |||||
| addi.d S10, S10, 0x40 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_M1: /* if(m&2) */ | |||||
| andi I, M, 0x02 | |||||
| beq ZERO, I, .L_M2 | |||||
| move S1, TS //aoffset1 | |||||
| add.d S2, S1, TL | |||||
| slli.d T0, TL, 0x01 | |||||
| add.d TS, TS, T0 | |||||
| move S8, TD //boffset1 | |||||
| addi.d TD, TD, 0x80 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_M1N1 | |||||
| .L_M1I1: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vld U4, S2, 0x00 | |||||
| vld U5, S2, 0x10 | |||||
| vld U6, S2, 0x20 | |||||
| vld U7, S2, 0x30 | |||||
| vst U0, S8, 0x00 | |||||
| vst U1, S8, 0x10 | |||||
| vst U2, S8, 0x20 | |||||
| vst U3, S8, 0x30 | |||||
| vst U4, S8, 0x40 | |||||
| vst U5, S8, 0x50 | |||||
| vst U6, S8, 0x60 | |||||
| vst U7, S8, 0x70 | |||||
| addi.d S1, S1, 0x40 | |||||
| addi.d S2, S2, 0x40 | |||||
| slli.d T0, M, 0x06 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_M1I1 | |||||
| .L_M1N1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_M1N2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S2, 0x00 | |||||
| vld U3, S2, 0x10 | |||||
| vst U0, S9, 0x00 | |||||
| vst U1, S9, 0x10 | |||||
| vst U2, S9, 0x20 | |||||
| vst U3, S9, 0x30 | |||||
| addi.d S1, S1, 0x20 | |||||
| addi.d S2, S2, 0x20 | |||||
| addi.d S9, S9, 0x40 | |||||
| .L_M1N2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_M2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S2, 0x00 | |||||
| vst U0, S10, 0x00 | |||||
| vst U1, S10, 0x10 | |||||
| addi.d S10, S10, 0x20 | |||||
| .L_M2: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_M0 | |||||
| move S1, TS //aoffset1 | |||||
| move S8, TD //boffset1 | |||||
| srai.d I, N, 0x02 | |||||
| beq ZERO, I, .L_M2N1 | |||||
| .L_M2I1: /* if(i>0) */ | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vld U2, S1, 0x20 | |||||
| vld U3, S1, 0x30 | |||||
| vst U0, S8, 0x00 | |||||
| vst U1, S8, 0x10 | |||||
| vst U2, S8, 0x20 | |||||
| vst U3, S8, 0x30 | |||||
| addi.d S1, S1, 0x40 | |||||
| slli.d T0, M, 0x06 | |||||
| add.d S8, S8, T0 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_M2I1 | |||||
| .L_M2N1: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_M2N2 | |||||
| vld U0, S1, 0x00 | |||||
| vld U1, S1, 0x10 | |||||
| vst U0, S9, 0x00 | |||||
| vst U1, S9, 0x10 | |||||
| addi.d S1, S1, 0x20 | |||||
| .L_M2N2: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_M0 | |||||
| vld U0, S1, 0x00 | |||||
| vst U0, S10, 0x00 | |||||
| .L_M0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
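The LSX variant above is functionally the same kernel as the preceding LASX one: each 256-bit `xvld`/`xvst` pair becomes two 128-bit `vld`/`vst` operations, which is why it needs sixteen vector registers (U0..U15) where the LASX version used eight.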
| @@ -0,0 +1,268 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Function parameters */ | |||||
| #define M $r4 // param 1: m | |||||
| #define N $r5 // param 2: n | |||||
| #define SRC $r6 // param 3: src | |||||
| #define LDA $r7 // param 4: lda | |||||
| #define DST $r8 // param 5: dst | |||||
| #define I $r9 | |||||
| #define J $r10 | |||||
| #define S1 $r12 | |||||
| #define S2 $r13 | |||||
| #define S3 $r14 | |||||
| #define S4 $r15 | |||||
| #define S5 $r16 | |||||
| #define S6 $r17 | |||||
| #define S7 $r18 | |||||
| #define S8 $r19 | |||||
| #define TD $r20 | |||||
| #define TS $r11 | |||||
| #define TL $r7 | |||||
| #define T0 $r23 | |||||
| #define ZERO $r0 | |||||
| #define F0 $f0 | |||||
| #define F1 $f1 | |||||
| #define F2 $f2 | |||||
| #define F3 $f3 | |||||
| #define F4 $f4 | |||||
| #define F5 $f5 | |||||
| #define F6 $f6 | |||||
| #define F7 $f7 | |||||
| /* LASX vectors */ | |||||
| #define U0 $xr0 | |||||
| #define U1 $xr1 | |||||
| #define U2 $xr2 | |||||
| #define U3 $xr3 | |||||
| #define U4 $xr4 | |||||
| #define U5 $xr5 | |||||
| #define U6 $xr6 | |||||
| #define U7 $xr7 | |||||
| #define D0 $xr8 | |||||
| #define D1 $xr9 | |||||
| #define D2 $xr10 | |||||
| #define D3 $xr11 | |||||
| #define D4 $xr12 | |||||
| #define D5 $xr13 | |||||
| #define D6 $xr14 | |||||
| #define D7 $xr15 | |||||
| PROLOGUE | |||||
| addi.d $sp, $sp, -8 | |||||
| SDARG $r23, $sp, 0 | |||||
| move TS, SRC //aoffset | |||||
| move TD, DST //boffset | |||||
| slli.d TL, LDA, 0x03 //TL = lda * 8 | |||||
| slli.d TL, TL, 0x01 //TL = lda * 16: byte stride along the lda dimension | |||||
| srai.d J, N, 0x03 //j = n >> 3: process n in blocks of 8 | |||||
| beq J, ZERO, .L_N1 | |||||
| .L_J1: /* if(j>0) j-- */ | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| addi.d TS, TS, 0x80 | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_J1M1 | |||||
| .L_J1I1: /* if(i>0) i--*/ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S1, 0x40 | |||||
| xvld U3, S1, 0x60 | |||||
| xvld U4, S2, 0x00 | |||||
| xvld U5, S2, 0x20 | |||||
| xvld U6, S2, 0x40 | |||||
| xvld U7, S2, 0x60 | |||||
| xvst U0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| xvst U2, TD, 0x40 | |||||
| xvst U3, TD, 0x60 | |||||
| xvst U4, TD, 0x80 | |||||
| xvst U5, TD, 0xa0 | |||||
| xvst U6, TD, 0xc0 | |||||
| xvst U7, TD, 0xe0 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x100 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_J1I1 | |||||
| .L_J1M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_J0 | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S1, 0x40 | |||||
| xvld U3, S1, 0x60 | |||||
| xvst U0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| xvst U2, TD, 0x40 | |||||
| xvst U3, TD, 0x60 | |||||
| addi.d TD, TD, 0x80 | |||||
| .L_J0: | |||||
| addi.d J, J, -1 | |||||
| blt ZERO, J, .L_J1 | |||||
| .L_N1: /* if(n&4) */ | |||||
| andi I, N, 0x04 | |||||
| beq ZERO, I, .L_N2 | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| addi.d TS, TS, 0x40 | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_N1M1 | |||||
| .L_N1I1: /* if(i>0) i-- */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvld U2, S2, 0x00 | |||||
| xvld U3, S2, 0x20 | |||||
| xvst U0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| xvst U2, TD, 0x40 | |||||
| xvst U3, TD, 0x60 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x80 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N1I1 | |||||
| .L_N1M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_N2 | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S1, 0x20 | |||||
| xvst U0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| addi.d TD, TD, 0x40 | |||||
| .L_N2: /* if(n&2) */ | |||||
| andi I, N, 0x02 | |||||
| beq ZERO, I, .L_N3 | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| addi.d TS, TS, 0x20 | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_N2M1 | |||||
| .L_N2I1: /* if(i>0) i-- */ | |||||
| xvld U0, S1, 0x00 | |||||
| xvld U1, S2, 0x00 | |||||
| xvst U0, TD, 0x00 | |||||
| xvst U1, TD, 0x20 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x40 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N2I1 | |||||
| .L_N2M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_N3 | |||||
| xvld U0, S1, 0x00 | |||||
| xvst U0, TD, 0x00 | |||||
| addi.d TD, TD, 0x20 | |||||
| .L_N3: /* if(n&1) */ | |||||
| andi I, N, 0x01 | |||||
| beq ZERO, I, .L_N0 | |||||
| move S1, TS //aoffset1 | |||||
| slli.d T0, TL, 0x01 //2*lda | |||||
| add.d S2, TS, TL | |||||
| srai.d I, M, 0x01 | |||||
| beq ZERO, I, .L_N3M1 | |||||
| .L_N3I1: /* if(i>0) i-- */ | |||||
| vld $vr0, S1, 0x00 | |||||
| vld $vr1, S2, 0x00 | |||||
| vst $vr0, TD, 0x00 | |||||
| vst $vr1, TD, 0x10 | |||||
| add.d S1, S1, T0 | |||||
| add.d S2, S2, T0 | |||||
| addi.d TD, TD, 0x20 | |||||
| addi.d I, I, -1 | |||||
| blt ZERO, I, .L_N3I1 | |||||
| .L_N3M1: /* if(m&1) */ | |||||
| andi I, M, 0x01 | |||||
| beq ZERO, I, .L_N0 | |||||
| vld $vr0, S1, 0x00 | |||||
| vst $vr0, TD, 0x00 | |||||
| .L_N0: | |||||
| LDARG $r23, $sp, 0 | |||||
| addi.d $sp, $sp, 8 | |||||
| jirl $r0, $r1, 0x00 | |||||
| EPILOGUE | |||||
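Unlike the two tcopy-style kernels before it, this third kernel walks `n` in blocks of 8/4/2/1 and copies each block's contiguous 16-byte elements straight through without reblocking, so it appears to implement the complementary ("ncopy"-direction) packing for the same GEMM.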
| @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c | |||||
| CSUMKERNEL = ../mips/zsum.c | CSUMKERNEL = ../mips/zsum.c | ||||
| ZSUMKERNEL = ../mips/zsum.c | ZSUMKERNEL = ../mips/zsum.c | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SASUMKERNEL = ../mips/sasum_msa.c | SASUMKERNEL = ../mips/sasum_msa.c | ||||
| DASUMKERNEL = ../mips/dasum_msa.c | DASUMKERNEL = ../mips/dasum_msa.c | ||||
| CASUMKERNEL = ../mips/casum_msa.c | CASUMKERNEL = ../mips/casum_msa.c | ||||
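The recurring change through the rest of this kernel list flips MSA selection from opt-in to opt-out: with `ifndef NO_MSA`, the MSA-optimized sources become the default for this target, and the plain `../mips` fallbacks are used only when the build explicitly sets `NO_MSA` (e.g. `NO_MSA=1` on the make command line).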
| @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c | |||||
| ZASUMKERNEL = ../mips/zasum.c | ZASUMKERNEL = ../mips/zasum.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SAXPYKERNEL = ../mips/saxpy_msa.c | SAXPYKERNEL = ../mips/saxpy_msa.c | ||||
| DAXPYKERNEL = ../mips/daxpy_msa.c | DAXPYKERNEL = ../mips/daxpy_msa.c | ||||
| CAXPYKERNEL = ../mips/caxpy_msa.c | CAXPYKERNEL = ../mips/caxpy_msa.c | ||||
| @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c | |||||
| ZAXPYKERNEL = ../mips/zaxpy.c | ZAXPYKERNEL = ../mips/zaxpy.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SCOPYKERNEL = ../mips/scopy_msa.c | SCOPYKERNEL = ../mips/scopy_msa.c | ||||
| DCOPYKERNEL = ../mips/dcopy_msa.c | DCOPYKERNEL = ../mips/dcopy_msa.c | ||||
| CCOPYKERNEL = ../mips/ccopy_msa.c | CCOPYKERNEL = ../mips/ccopy_msa.c | ||||
| @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c | |||||
| ZCOPYKERNEL = ../mips/zcopy.c | ZCOPYKERNEL = ../mips/zcopy.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SDOTKERNEL = ../mips/sdot_msa.c | SDOTKERNEL = ../mips/sdot_msa.c | ||||
| DDOTKERNEL = ../mips/ddot_msa.c | DDOTKERNEL = ../mips/ddot_msa.c | ||||
| CDOTKERNEL = ../mips/cdot_msa.c | CDOTKERNEL = ../mips/cdot_msa.c | ||||
| @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c | |||||
| CNRM2KERNEL = ../mips/znrm2.c | CNRM2KERNEL = ../mips/znrm2.c | ||||
| ZNRM2KERNEL = ../mips/znrm2.c | ZNRM2KERNEL = ../mips/znrm2.c | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SROTKERNEL = ../mips/srot_msa.c | SROTKERNEL = ../mips/srot_msa.c | ||||
| DROTKERNEL = ../mips/drot_msa.c | DROTKERNEL = ../mips/drot_msa.c | ||||
| CROTKERNEL = ../mips/crot_msa.c | CROTKERNEL = ../mips/crot_msa.c | ||||
| @@ -100,7 +100,7 @@ CROTKERNEL = ../mips/zrot.c | |||||
| ZROTKERNEL = ../mips/zrot.c | ZROTKERNEL = ../mips/zrot.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SSCALKERNEL = ../mips/sscal_msa.c | SSCALKERNEL = ../mips/sscal_msa.c | ||||
| DSCALKERNEL = ../mips/dscal_msa.c | DSCALKERNEL = ../mips/dscal_msa.c | ||||
| #CSCALKERNEL = ../mips/cscal_msa.c | #CSCALKERNEL = ../mips/cscal_msa.c | ||||
| @@ -114,7 +114,7 @@ CSCALKERNEL = ../mips/zscal.c | |||||
| ZSCALKERNEL = ../mips/zscal.c | ZSCALKERNEL = ../mips/zscal.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SSWAPKERNEL = ../mips/sswap_msa.c | SSWAPKERNEL = ../mips/sswap_msa.c | ||||
| DSWAPKERNEL = ../mips/dswap_msa.c | DSWAPKERNEL = ../mips/dswap_msa.c | ||||
| CSWAPKERNEL = ../mips/cswap_msa.c | CSWAPKERNEL = ../mips/cswap_msa.c | ||||
| @@ -126,7 +126,7 @@ CSWAPKERNEL = ../mips/zswap.c | |||||
| ZSWAPKERNEL = ../mips/zswap.c | ZSWAPKERNEL = ../mips/zswap.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | SGEMVNKERNEL = ../mips/sgemv_n_msa.c | ||||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | DGEMVNKERNEL = ../mips/dgemv_n_msa.c | ||||
| CGEMVNKERNEL = ../mips/cgemv_n_msa.c | CGEMVNKERNEL = ../mips/cgemv_n_msa.c | ||||
| @@ -138,7 +138,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c | |||||
| ZGEMVNKERNEL = ../mips/zgemv_n.c | ZGEMVNKERNEL = ../mips/zgemv_n.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | SGEMVTKERNEL = ../mips/sgemv_t_msa.c | ||||
| DGEMVTKERNEL = ../mips/dgemv_t_msa.c | DGEMVTKERNEL = ../mips/dgemv_t_msa.c | ||||
| CGEMVTKERNEL = ../mips/cgemv_t_msa.c | CGEMVTKERNEL = ../mips/cgemv_t_msa.c | ||||
| @@ -150,7 +150,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c | |||||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | ZGEMVTKERNEL = ../mips/zgemv_t.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | ||||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | ||||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | ||||
| @@ -164,7 +164,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | SGEMMOTCOPYOBJ = sgemm_otcopy.o | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | ||||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | ||||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | ||||
| @@ -182,7 +182,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | DGEMMOTCOPYOBJ = dgemm_otcopy.o | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | ||||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | ||||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | ||||
| @@ -200,7 +200,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | CGEMMOTCOPYOBJ = cgemm_otcopy.o | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | ||||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | ||||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | ||||
| @@ -214,7 +214,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | ||||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | ||||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | ||||
| @@ -226,7 +226,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | ||||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | ||||
| DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | ||||
| @@ -238,7 +238,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| @@ -250,7 +250,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| @@ -1,4 +1,4 @@ | |||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SAXPYKERNEL = ../mips/saxpy_msa.c | SAXPYKERNEL = ../mips/saxpy_msa.c | ||||
| DAXPYKERNEL = ../mips/daxpy_msa.c | DAXPYKERNEL = ../mips/daxpy_msa.c | ||||
| CAXPYKERNEL = ../mips/caxpy_msa.c | CAXPYKERNEL = ../mips/caxpy_msa.c | ||||
| @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S | |||||
| DAXPYKERNEL = daxpy_loongson3a_simd.S | DAXPYKERNEL = daxpy_loongson3a_simd.S | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SCOPYKERNEL = ../mips/scopy_msa.c | SCOPYKERNEL = ../mips/scopy_msa.c | ||||
| DCOPYKERNEL = ../mips/dcopy_msa.c | DCOPYKERNEL = ../mips/dcopy_msa.c | ||||
| CCOPYKERNEL = ../mips/ccopy_msa.c | CCOPYKERNEL = ../mips/ccopy_msa.c | ||||
| ZCOPYKERNEL = ../mips/zcopy_msa.c | ZCOPYKERNEL = ../mips/zcopy_msa.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SDOTKERNEL = ../mips/sdot_msa.c | SDOTKERNEL = ../mips/sdot_msa.c | ||||
| DDOTKERNEL = ../mips/ddot_msa.c | DDOTKERNEL = ../mips/ddot_msa.c | ||||
| CDOTKERNEL = ../mips/cdot_msa.c | CDOTKERNEL = ../mips/cdot_msa.c | ||||
| @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c | |||||
| endif | endif | ||||
| DSDOTKERNEL = ../mips/dot.c | DSDOTKERNEL = ../mips/dot.c | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SROTKERNEL = ../mips/srot_msa.c | SROTKERNEL = ../mips/srot_msa.c | ||||
| DROTKERNEL = ../mips/drot_msa.c | DROTKERNEL = ../mips/drot_msa.c | ||||
| CROTKERNEL = ../mips/crot_msa.c | CROTKERNEL = ../mips/crot_msa.c | ||||
| ZROTKERNEL = ../mips/zrot_msa.c | ZROTKERNEL = ../mips/zrot_msa.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SSCALKERNEL = ../mips/sscal_msa.c | SSCALKERNEL = ../mips/sscal_msa.c | ||||
| DSCALKERNEL = ../mips/dscal_msa.c | DSCALKERNEL = ../mips/dscal_msa.c | ||||
| CSCALKERNEL = ../mips/cscal_msa.c | CSCALKERNEL = ../mips/cscal_msa.c | ||||
| ZSCALKERNEL = ../mips/zscal_msa.c | ZSCALKERNEL = ../mips/zscal_msa.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | SGEMVNKERNEL = ../mips/sgemv_n_msa.c | ||||
| DGEMVNKERNEL = ../mips/dgemv_n_msa.c | DGEMVNKERNEL = ../mips/dgemv_n_msa.c | ||||
| SGEMVTKERNEL = ../mips/sgemv_t_msa.c | SGEMVTKERNEL = ../mips/sgemv_t_msa.c | ||||
| @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c | |||||
| ZGEMVTKERNEL = zgemv_t_loongson3a.c | ZGEMVTKERNEL = zgemv_t_loongson3a.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SASUMKERNEL = ../mips/sasum_msa.c | SASUMKERNEL = ../mips/sasum_msa.c | ||||
| DASUMKERNEL = ../mips/dasum_msa.c | DASUMKERNEL = ../mips/dasum_msa.c | ||||
| CASUMKERNEL = ../mips/casum_msa.c | CASUMKERNEL = ../mips/casum_msa.c | ||||
| ZASUMKERNEL = ../mips/zasum_msa.c | ZASUMKERNEL = ../mips/zasum_msa.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SSWAPKERNEL = ../mips/sswap_msa.c | SSWAPKERNEL = ../mips/sswap_msa.c | ||||
| DSWAPKERNEL = ../mips/dswap_msa.c | DSWAPKERNEL = ../mips/dswap_msa.c | ||||
| CSWAPKERNEL = ../mips/cswap_msa.c | CSWAPKERNEL = ../mips/cswap_msa.c | ||||
| ZSWAPKERNEL = ../mips/zswap_msa.c | ZSWAPKERNEL = ../mips/zswap_msa.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c | ||||
| SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c | ||||
| SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c | ||||
| @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c | ||||
| DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c | ||||
| DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c | ||||
| @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c | ||||
| CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c | ||||
| CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c | ||||
| @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c | ||||
| ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c | ||||
| ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c | ||||
| @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c | ||||
| STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c | ||||
| STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c | ||||
| @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c | ||||
| DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c | ||||
| DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c | ||||
| @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| ifndef NO_MSA | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | ||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | ||||
| @@ -25,7 +25,7 @@ ZTRMMKERNEL = zgemm_kernel_power10.S | |||||
| endif | endif | ||||
| SGEMMKERNEL = sgemm_kernel_power10.c | SGEMMKERNEL = sgemm_kernel_power10.c | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | SGEMMITCOPY = sgemm_tcopy_16_power8.S | ||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | SGEMMONCOPY = ../generic/gemm_ncopy_8.c | ||||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | SGEMMOTCOPY = sgemm_tcopy_8_power8.S | ||||
| @@ -50,7 +50,7 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | ||||
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | SGEMMKERNEL = sgemm_kernel_16x8_power8.S | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | SGEMMITCOPY = sgemm_tcopy_16_power8.S | ||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | SGEMMONCOPY = ../generic/gemm_ncopy_8.c | ||||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | SGEMMOTCOPY = sgemm_tcopy_8_power8.S | ||||
| @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S | |||||
| ZTRMMKERNEL = zgemm_kernel_power9.S | ZTRMMKERNEL = zgemm_kernel_power9.S | ||||
| SGEMMKERNEL = sgemm_kernel_power9.S | SGEMMKERNEL = sgemm_kernel_power9.S | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | SGEMMITCOPY = sgemm_tcopy_16_power8.S | ||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | SGEMMONCOPY = ../generic/gemm_ncopy_8.c | ||||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | SGEMMOTCOPY = sgemm_tcopy_8_power8.S | ||||
| @@ -0,0 +1,482 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include <altivec.h> | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||||
| BLASLONG i, j; | |||||
| IFLOAT *aoffset; | |||||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||||
| IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||||
| IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||||
| IFLOAT *boffset; | |||||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||||
| IFLOAT ctemp17, ctemp19; | |||||
| IFLOAT ctemp21, ctemp23; | |||||
| IFLOAT ctemp25, ctemp27; | |||||
| IFLOAT ctemp29, ctemp31; | |||||
| aoffset = a; | |||||
| boffset = b; | |||||
| j = (n >> 4); | |||||
| if (j > 0){ | |||||
| do{ | |||||
| aoffset1 = aoffset; | |||||
| aoffset2 = aoffset1 + lda; | |||||
| aoffset3 = aoffset2 + lda; | |||||
| aoffset4 = aoffset3 + lda; | |||||
| aoffset5 = aoffset4 + lda; | |||||
| aoffset6 = aoffset5 + lda; | |||||
| aoffset7 = aoffset6 + lda; | |||||
| aoffset8 = aoffset7 + lda; | |||||
| aoffset9 = aoffset8 + lda; | |||||
| aoffset10 = aoffset9 + lda; | |||||
| aoffset11 = aoffset10 + lda; | |||||
| aoffset12 = aoffset11 + lda; | |||||
| aoffset13 = aoffset12 + lda; | |||||
| aoffset14 = aoffset13 + lda; | |||||
| aoffset15 = aoffset14 + lda; | |||||
| aoffset16 = aoffset15 + lda; | |||||
| aoffset += 16 * lda; | |||||
| i = (m >> 2); | |||||
| if (i > 0){ | |||||
| vector float c1, c2, c3, c4, c5, c6, c7, c8; | |||||
| vector float c9, c10, c11, c12, c13, c14, c15, c16; | |||||
| vector float t1, t2, t3, t4, t5, t6, t7, t8; | |||||
| vector float t9, t10, t11, t12; | |||||
| do{ | |||||
| c1 = vec_xl(0, aoffset1); | |||||
| c2 = vec_xl(0, aoffset2); | |||||
| c3 = vec_xl(0, aoffset3); | |||||
| c4 = vec_xl(0, aoffset4); | |||||
| c5 = vec_xl(0, aoffset5); | |||||
| c6 = vec_xl(0, aoffset6); | |||||
| c7 = vec_xl(0, aoffset7); | |||||
| c8 = vec_xl(0, aoffset8); | |||||
| c9 = vec_xl(0, aoffset9); | |||||
| c10 = vec_xl(0, aoffset10); | |||||
| c11 = vec_xl(0, aoffset11); | |||||
| c12 = vec_xl(0, aoffset12); | |||||
| c13 = vec_xl(0, aoffset13); | |||||
| c14 = vec_xl(0, aoffset14); | |||||
| c15 = vec_xl(0, aoffset15); | |||||
| c16 = vec_xl(0, aoffset16); | |||||
| t1 = vec_mergeh(c1, c2); | |||||
| t2 = vec_mergeh(c3, c4); | |||||
| t3 = vec_mergeh(c5, c6); | |||||
| t4 = vec_mergeh(c7, c8); | |||||
| t9 = vec_mergeh(c9, c10); | |||||
| t10 = vec_mergeh(c11, c12); | |||||
| t11 = vec_mergeh(c13, c14); | |||||
| t12 = vec_mergeh(c15, c16); | |||||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||||
| t7 = vec_xxpermdi(t9, t10, 0b00); | |||||
| t8 = vec_xxpermdi(t11, t12, 0b00); | |||||
| vec_xst(t5, 0, boffset); | |||||
| vec_xst(t6, 0, boffset+4); | |||||
| vec_xst(t7, 0, boffset+8); | |||||
| vec_xst(t8, 0, boffset+12); | |||||
| t5 = vec_xxpermdi(t1, t2, 0b11); | |||||
| t6 = vec_xxpermdi(t3, t4, 0b11); | |||||
| t7 = vec_xxpermdi(t9, t10, 0b11); | |||||
| t8 = vec_xxpermdi(t11, t12, 0b11); | |||||
| vec_xst(t5, 0, boffset+16); | |||||
| vec_xst(t6, 0, boffset+20); | |||||
| vec_xst(t7, 0, boffset+24); | |||||
| vec_xst(t8, 0, boffset+28); | |||||
| t1 = vec_mergel(c1, c2); | |||||
| t2 = vec_mergel(c3, c4); | |||||
| t3 = vec_mergel(c5, c6); | |||||
| t4 = vec_mergel(c7, c8); | |||||
| t9 = vec_mergel(c9, c10); | |||||
| t10 = vec_mergel(c11, c12); | |||||
| t11 = vec_mergel(c13, c14); | |||||
| t12 = vec_mergel(c15, c16); | |||||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||||
| t7 = vec_xxpermdi(t9, t10, 0b00); | |||||
| t8 = vec_xxpermdi(t11, t12, 0b00); | |||||
| vec_xst(t5, 0, boffset+32); | |||||
| vec_xst(t6, 0, boffset+36); | |||||
| vec_xst(t7, 0, boffset+40); | |||||
| vec_xst(t8, 0, boffset+44); | |||||
| t5 = vec_xxpermdi(t1, t2, 0b11); | |||||
| t6 = vec_xxpermdi(t3, t4, 0b11); | |||||
| t7 = vec_xxpermdi(t9, t10, 0b11); | |||||
| t8 = vec_xxpermdi(t11, t12, 0b11); | |||||
| vec_xst(t5, 0, boffset+48); | |||||
| vec_xst(t6, 0, boffset+52); | |||||
| vec_xst(t7, 0, boffset+56); | |||||
| vec_xst(t8, 0, boffset+60); | |||||
| aoffset1 += 4; | |||||
| aoffset2 += 4; | |||||
| aoffset3 += 4; | |||||
| aoffset4 += 4; | |||||
| aoffset5 += 4; | |||||
| aoffset6 += 4; | |||||
| aoffset7 += 4; | |||||
| aoffset8 += 4; | |||||
| aoffset9 += 4; | |||||
| aoffset10 += 4; | |||||
| aoffset11 += 4; | |||||
| aoffset12 += 4; | |||||
| aoffset13 += 4; | |||||
| aoffset14 += 4; | |||||
| aoffset15 += 4; | |||||
| aoffset16 += 4; | |||||
| boffset += 64; | |||||
| i --; | |||||
| }while(i > 0); | |||||
| } | |||||
| i = (m & 3); | |||||
| if (i > 0){ | |||||
| do{ | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| ctemp03 = *(aoffset2 + 0); | |||||
| ctemp05 = *(aoffset3 + 0); | |||||
| ctemp07 = *(aoffset4 + 0); | |||||
| ctemp09 = *(aoffset5 + 0); | |||||
| ctemp11 = *(aoffset6 + 0); | |||||
| ctemp13 = *(aoffset7 + 0); | |||||
| ctemp15 = *(aoffset8 + 0); | |||||
| ctemp17 = *(aoffset9 + 0); | |||||
| ctemp19 = *(aoffset10 + 0); | |||||
| ctemp21 = *(aoffset11 + 0); | |||||
| ctemp23 = *(aoffset12 + 0); | |||||
| ctemp25 = *(aoffset13 + 0); | |||||
| ctemp27 = *(aoffset14 + 0); | |||||
| ctemp29 = *(aoffset15 + 0); | |||||
| ctemp31 = *(aoffset16 + 0); | |||||
| *(boffset + 0) = ctemp01; | |||||
| *(boffset + 1) = ctemp03; | |||||
| *(boffset + 2) = ctemp05; | |||||
| *(boffset + 3) = ctemp07; | |||||
| *(boffset + 4) = ctemp09; | |||||
| *(boffset + 5) = ctemp11; | |||||
| *(boffset + 6) = ctemp13; | |||||
| *(boffset + 7) = ctemp15; | |||||
| *(boffset + 8) = ctemp17; | |||||
| *(boffset + 9) = ctemp19; | |||||
| *(boffset + 10) = ctemp21; | |||||
| *(boffset + 11) = ctemp23; | |||||
| *(boffset + 12) = ctemp25; | |||||
| *(boffset + 13) = ctemp27; | |||||
| *(boffset + 14) = ctemp29; | |||||
| *(boffset + 15) = ctemp31; | |||||
| aoffset1+=1; | |||||
| aoffset2+=1; | |||||
| aoffset3+=1; | |||||
| aoffset4+=1; | |||||
| aoffset5+=1; | |||||
| aoffset6+=1; | |||||
| aoffset7+=1; | |||||
| aoffset8+=1; | |||||
| aoffset9+=1; | |||||
| aoffset10+=1; | |||||
| aoffset11+=1; | |||||
| aoffset12+=1; | |||||
| aoffset13+=1; | |||||
| aoffset14+=1; | |||||
| aoffset15+=1; | |||||
| aoffset16+=1; | |||||
| boffset += 16; | |||||
| i --; | |||||
| }while(i > 0); | |||||
| } | |||||
| j--; | |||||
| }while(j > 0); | |||||
| } /* end of if(j > 0) */ | |||||
| if (n & 8){ | |||||
| aoffset1 = aoffset; | |||||
| aoffset2 = aoffset1 + lda; | |||||
| aoffset3 = aoffset2 + lda; | |||||
| aoffset4 = aoffset3 + lda; | |||||
| aoffset5 = aoffset4 + lda; | |||||
| aoffset6 = aoffset5 + lda; | |||||
| aoffset7 = aoffset6 + lda; | |||||
| aoffset8 = aoffset7 + lda; | |||||
| aoffset += 8 * lda; | |||||
| i = (m >> 2); | |||||
| if (i > 0){ | |||||
| vector float c1, c2, c3, c4, c5, c6, c7, c8; | |||||
| vector float t1, t2, t3, t4, t5, t6, t7, t8; | |||||
| do{ | |||||
| c1 = vec_xl(0, aoffset1); | |||||
| c2 = vec_xl(0, aoffset2); | |||||
| c3 = vec_xl(0, aoffset3); | |||||
| c4 = vec_xl(0, aoffset4); | |||||
| c5 = vec_xl(0, aoffset5); | |||||
| c6 = vec_xl(0, aoffset6); | |||||
| c7 = vec_xl(0, aoffset7); | |||||
| c8 = vec_xl(0, aoffset8); | |||||
| t1 = vec_mergeh(c1, c2); | |||||
| t2 = vec_mergeh(c3, c4); | |||||
| t3 = vec_mergeh(c5, c6); | |||||
| t4 = vec_mergeh(c7, c8); | |||||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||||
| t7 = vec_xxpermdi(t1, t2, 0b11); | |||||
| t8 = vec_xxpermdi(t3, t4, 0b11); | |||||
| vec_xst(t5, 0, boffset); | |||||
| vec_xst(t6, 0, boffset+4); | |||||
| vec_xst(t7, 0, boffset+8); | |||||
| vec_xst(t8, 0, boffset+12); | |||||
| t1 = vec_mergel(c1, c2); | |||||
| t2 = vec_mergel(c3, c4); | |||||
| t3 = vec_mergel(c5, c6); | |||||
| t4 = vec_mergel(c7, c8); | |||||
| t5 = vec_xxpermdi(t1, t2, 0b00); | |||||
| t6 = vec_xxpermdi(t3, t4, 0b00); | |||||
| t7 = vec_xxpermdi(t1, t2, 0b11); | |||||
| t8 = vec_xxpermdi(t3, t4, 0b11); | |||||
| vec_xst(t5, 0, boffset+16); | |||||
| vec_xst(t6, 0, boffset+20); | |||||
| vec_xst(t7, 0, boffset+24); | |||||
| vec_xst(t8, 0, boffset+28); | |||||
| aoffset1 += 4; | |||||
| aoffset2 += 4; | |||||
| aoffset3 += 4; | |||||
| aoffset4 += 4; | |||||
| aoffset5 += 4; | |||||
| aoffset6 += 4; | |||||
| aoffset7 += 4; | |||||
| aoffset8 += 4; | |||||
| boffset += 32; | |||||
| i--; | |||||
| }while(i > 0); | |||||
| } | |||||
| i = (m & 3); | |||||
| if (i > 0) { | |||||
| do { | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| ctemp03 = *(aoffset2 + 0); | |||||
| ctemp05 = *(aoffset3 + 0); | |||||
| ctemp07 = *(aoffset4 + 0); | |||||
| ctemp09 = *(aoffset5 + 0); | |||||
| ctemp11 = *(aoffset6 + 0); | |||||
| ctemp13 = *(aoffset7 + 0); | |||||
| ctemp15 = *(aoffset8 + 0); | |||||
| *(boffset + 0) = ctemp01; | |||||
| *(boffset + 1) = ctemp03; | |||||
| *(boffset + 2) = ctemp05; | |||||
| *(boffset + 3) = ctemp07; | |||||
| *(boffset + 4) = ctemp09; | |||||
| *(boffset + 5) = ctemp11; | |||||
| *(boffset + 6) = ctemp13; | |||||
| *(boffset + 7) = ctemp15; | |||||
| aoffset1+=1; | |||||
| aoffset2+=1; | |||||
| aoffset3+=1; | |||||
| aoffset4+=1; | |||||
| aoffset5+=1; | |||||
| aoffset6+=1; | |||||
| aoffset7+=1; | |||||
| aoffset8+=1; | |||||
| boffset += 8; | |||||
| i--; | |||||
| } while (i > 0); | |||||
| } | |||||
| } | |||||
| if (n & 4){ | |||||
| aoffset1 = aoffset; | |||||
| aoffset2 = aoffset1 + lda; | |||||
| aoffset3 = aoffset2 + lda; | |||||
| aoffset4 = aoffset3 + lda; | |||||
| aoffset += 4 * lda; | |||||
| i = (m >> 2); | |||||
| if (i > 0){ | |||||
| vector float c1, c2, c3, c4; | |||||
| vector float t1, t2, t3, t4; | |||||
| do{ | |||||
| c1 = vec_xl(0, aoffset1); | |||||
| c2 = vec_xl(0, aoffset2); | |||||
| c3 = vec_xl(0, aoffset3); | |||||
| c4 = vec_xl(0, aoffset4); | |||||
| t1 = vec_mergeh(c1, c2); | |||||
| t2 = vec_mergeh(c3, c4); | |||||
| t3 = vec_xxpermdi(t1, t2, 0b00); | |||||
| t4 = vec_xxpermdi(t1, t2, 0b11); | |||||
| vec_xst(t3, 0, boffset); | |||||
| vec_xst(t4, 0, boffset+4); | |||||
| t1 = vec_mergel(c1, c2); | |||||
| t2 = vec_mergel(c3, c4); | |||||
| t3 = vec_xxpermdi(t1, t2, 0b00); | |||||
| t4 = vec_xxpermdi(t1, t2, 0b11); | |||||
| vec_xst(t3, 0, boffset+8); | |||||
| vec_xst(t4, 0, boffset+12); | |||||
| aoffset1 += 4; | |||||
| aoffset2 += 4; | |||||
| aoffset3 += 4; | |||||
| aoffset4 += 4; | |||||
| boffset += 16; | |||||
| i--; | |||||
| }while(i > 0); | |||||
| } | |||||
| i = (m & 3); | |||||
| if (i > 0) { | |||||
| do { | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| ctemp03 = *(aoffset2 + 0); | |||||
| ctemp05 = *(aoffset3 + 0); | |||||
| ctemp07 = *(aoffset4 + 0); | |||||
| *(boffset + 0) = ctemp01; | |||||
| *(boffset + 1) = ctemp03; | |||||
| *(boffset + 2) = ctemp05; | |||||
| *(boffset + 3) = ctemp07; | |||||
| aoffset1+=1; | |||||
| aoffset2+=1; | |||||
| aoffset3+=1; | |||||
| aoffset4+=1; | |||||
| boffset += 4; | |||||
| i--; | |||||
| } while (i > 0); | |||||
| } | |||||
| } | |||||
| if (n & 2){ | |||||
| aoffset1 = aoffset; | |||||
| aoffset2 = aoffset1 + lda; | |||||
| aoffset += 2 * lda; | |||||
| i = (m >> 1); | |||||
| if (i > 0){ | |||||
| do{ | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| ctemp02 = *(aoffset1 + 1); | |||||
| ctemp03 = *(aoffset2 + 0); | |||||
| ctemp04 = *(aoffset2 + 1); | |||||
| *(boffset + 0) = ctemp01; | |||||
| *(boffset + 1) = ctemp03; | |||||
| *(boffset + 2) = ctemp02; | |||||
| *(boffset + 3) = ctemp04; | |||||
| aoffset1 += 2; | |||||
| aoffset2 += 2; | |||||
| boffset += 4; | |||||
| i --; | |||||
| }while(i > 0); | |||||
| } | |||||
| if (m & 1){ | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| ctemp03 = *(aoffset2 + 0); | |||||
| *(boffset + 0) = ctemp01; | |||||
| *(boffset + 1) = ctemp03; | |||||
| boffset += 2; | |||||
| } | |||||
| } | |||||
| if (n & 1){ | |||||
| aoffset1 = aoffset; | |||||
| i = (m >> 1); | |||||
| if (i > 0){ | |||||
| do{ | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| ctemp02 = *(aoffset1 + 1); | |||||
| *(boffset + 0) = ctemp01; | |||||
| *(boffset + 1) = ctemp02; | |||||
| aoffset1 += 2; | |||||
| boffset += 2; | |||||
| i --; | |||||
| }while(i > 0); | |||||
| } | |||||
| if (m & 1){ | |||||
| ctemp01 = *(aoffset1 + 0); | |||||
| *(boffset + 0) = ctemp01; | |||||
| // boffset += 1; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
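The main loop above packs a block of sixteen lda-strided lines, four elements at a time, by transposing 4x4 sub-blocks entirely in registers: `vec_mergeh`/`vec_mergel` interleave pairs of lines, and `vec_xxpermdi` with immediates `0b00`/`0b11` picks the low or high doubleword halves to finish the transpose. The following self-contained sketch isolates that idiom; it is a hypothetical test harness (it needs a VSX-enabled POWER compiler, and element ordering is shown for the big-endian convention), not part of the patch.

```c
#include <altivec.h>
#include <stdio.h>

int main(void)
{
    /* 4x4 matrix, row-major: row r holds r0..r3 */
    float in[16] = {  0,  1,  2,  3,
                     10, 11, 12, 13,
                     20, 21, 22, 23,
                     30, 31, 32, 33 };
    float out[16];

    vector float c1 = vec_xl(0, in);       /* row 0 */
    vector float c2 = vec_xl(0, in + 4);   /* row 1 */
    vector float c3 = vec_xl(0, in + 8);   /* row 2 */
    vector float c4 = vec_xl(0, in + 12);  /* row 3 */

    vector float t1 = vec_mergeh(c1, c2);  /* {a0,b0,a1,b1} */
    vector float t2 = vec_mergeh(c3, c4);  /* {c0,d0,c1,d1} */
    vector float t3 = vec_mergel(c1, c2);  /* {a2,b2,a3,b3} */
    vector float t4 = vec_mergel(c3, c4);  /* {c2,d2,c3,d3} */

    vec_xst(vec_xxpermdi(t1, t2, 0b00), 0, out);      /* column 0 */
    vec_xst(vec_xxpermdi(t1, t2, 0b11), 0, out + 4);  /* column 1 */
    vec_xst(vec_xxpermdi(t3, t4, 0b00), 0, out + 8);  /* column 2 */
    vec_xst(vec_xxpermdi(t3, t4, 0b11), 0, out + 12); /* column 3 */

    for (int i = 0; i < 16; i++)
        printf("%4.0f%c", out[i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}
```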
| @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c | |||||
| SAXPYKERNEL = axpy_vector.c | SAXPYKERNEL = axpy_vector.c | ||||
| DAXPYKERNEL = axpy_vector.c | DAXPYKERNEL = axpy_vector.c | ||||
| CAXPYKERNEL = zaxpy.c | |||||
| ZAXPYKERNEL = zaxpy.c | |||||
| CAXPYKERNEL = zaxpy_vector.c | |||||
| ZAXPYKERNEL = zaxpy_vector.c | |||||
| SAXPBYKERNEL = axpby_vector.c | SAXPBYKERNEL = axpby_vector.c | ||||
| DAXPBYKERNEL = axpby_vector.c | DAXPBYKERNEL = axpby_vector.c | ||||
| @@ -59,7 +59,7 @@ SDOTKERNEL = dot_vector.c | |||||
| DDOTKERNEL = dot_vector.c | DDOTKERNEL = dot_vector.c | ||||
| CDOTKERNEL = zdot_vector.c | CDOTKERNEL = zdot_vector.c | ||||
| ZDOTKERNEL = zdot_vector.c | ZDOTKERNEL = zdot_vector.c | ||||
| DSDOTKERNEL = ../generic/dot.c | |||||
| DSDOTKERNEL = dsdot_vector.c | |||||
| SNRM2KERNEL = nrm2_vector.c | SNRM2KERNEL = nrm2_vector.c | ||||
| DNRM2KERNEL = nrm2_vector.c | DNRM2KERNEL = nrm2_vector.c | ||||
| @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c | |||||
| CAXPYKERNEL = ../riscv64/zaxpy.c | CAXPYKERNEL = ../riscv64/zaxpy.c | ||||
| ZAXPYKERNEL = ../riscv64/zaxpy.c | ZAXPYKERNEL = ../riscv64/zaxpy.c | ||||
| SAXPBYKERNEL = ../riscv64/axpby.c | |||||
| DAXPBYKERNEL = ../riscv64/axpby.c | |||||
| CAXPBYKERNEL = ../riscv64/zaxpby.c | |||||
| ZAXPBYKERNEL = ../riscv64/zaxpby.c | |||||
| SCOPYKERNEL = ../riscv64/copy.c | SCOPYKERNEL = ../riscv64/copy.c | ||||
| DCOPYKERNEL = ../riscv64/copy.c | DCOPYKERNEL = ../riscv64/copy.c | ||||
| CCOPYKERNEL = ../riscv64/zcopy.c | CCOPYKERNEL = ../riscv64/zcopy.c | ||||
| @@ -0,0 +1,243 @@ | |||||
| SAMAXKERNEL = amax_rvv.c | |||||
| DAMAXKERNEL = amax_rvv.c | |||||
| CAMAXKERNEL = zamax_rvv.c | |||||
| ZAMAXKERNEL = zamax_rvv.c | |||||
| SAMINKERNEL = amin_rvv.c | |||||
| DAMINKERNEL = amin_rvv.c | |||||
| CAMINKERNEL = zamin_rvv.c | |||||
| ZAMINKERNEL = zamin_rvv.c | |||||
| SMAXKERNEL = max_rvv.c | |||||
| DMAXKERNEL = max_rvv.c | |||||
| SMINKERNEL = min_rvv.c | |||||
| DMINKERNEL = min_rvv.c | |||||
| ISAMAXKERNEL = iamax_rvv.c | |||||
| IDAMAXKERNEL = iamax_rvv.c | |||||
| ICAMAXKERNEL = izamax_rvv.c | |||||
| IZAMAXKERNEL = izamax_rvv.c | |||||
| ISAMINKERNEL = iamin_rvv.c | |||||
| IDAMINKERNEL = iamin_rvv.c | |||||
| ICAMINKERNEL = izamin_rvv.c | |||||
| IZAMINKERNEL = izamin_rvv.c | |||||
| ISMAXKERNEL = imax_rvv.c | |||||
| IDMAXKERNEL = imax_rvv.c | |||||
| ISMINKERNEL = imin_rvv.c | |||||
| IDMINKERNEL = imin_rvv.c | |||||
| SASUMKERNEL = asum_rvv.c | |||||
| DASUMKERNEL = asum_rvv.c | |||||
| CASUMKERNEL = zasum_rvv.c | |||||
| ZASUMKERNEL = zasum_rvv.c | |||||
| SSUMKERNEL = sum_rvv.c | |||||
| DSUMKERNEL = sum_rvv.c | |||||
| CSUMKERNEL = zsum_rvv.c | |||||
| ZSUMKERNEL = zsum_rvv.c | |||||
| SAXPYKERNEL = axpy_rvv.c | |||||
| DAXPYKERNEL = axpy_rvv.c | |||||
| CAXPYKERNEL = zaxpy_rvv.c | |||||
| ZAXPYKERNEL = zaxpy_rvv.c | |||||
| SAXPBYKERNEL = axpby_rvv.c | |||||
| DAXPBYKERNEL = axpby_rvv.c | |||||
| CAXPBYKERNEL = zaxpby_rvv.c | |||||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||||
| SCOPYKERNEL = copy_rvv.c | |||||
| DCOPYKERNEL = copy_rvv.c | |||||
| CCOPYKERNEL = zcopy_rvv.c | |||||
| ZCOPYKERNEL = zcopy_rvv.c | |||||
| SDOTKERNEL = dot_rvv.c | |||||
| DDOTKERNEL = dot_rvv.c | |||||
| CDOTKERNEL = zdot_rvv.c | |||||
| ZDOTKERNEL = zdot_rvv.c | |||||
| DSDOTKERNEL = dot_rvv.c | |||||
| SNRM2KERNEL = nrm2_rvv.c | |||||
| DNRM2KERNEL = nrm2_rvv.c | |||||
| CNRM2KERNEL = znrm2_rvv.c | |||||
| ZNRM2KERNEL = znrm2_rvv.c | |||||
| SROTKERNEL = rot_rvv.c | |||||
| DROTKERNEL = rot_rvv.c | |||||
| CROTKERNEL = zrot_rvv.c | |||||
| ZROTKERNEL = zrot_rvv.c | |||||
| SSCALKERNEL = scal_rvv.c | |||||
| DSCALKERNEL = scal_rvv.c | |||||
| CSCALKERNEL = zscal_rvv.c | |||||
| ZSCALKERNEL = zscal_rvv.c | |||||
| SSWAPKERNEL = swap_rvv.c | |||||
| DSWAPKERNEL = swap_rvv.c | |||||
| CSWAPKERNEL = zswap_rvv.c | |||||
| ZSWAPKERNEL = zswap_rvv.c | |||||
| SGEMVNKERNEL = gemv_n_rvv.c | |||||
| DGEMVNKERNEL = gemv_n_rvv.c | |||||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||||
| SGEMVTKERNEL = gemv_t_rvv.c | |||||
| DGEMVTKERNEL = gemv_t_rvv.c | |||||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||||
| STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||||
| STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||||
| STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||||
| STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||||
| DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||||
| DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||||
| DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||||
| DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||||
| CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | |||||
| CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||||
| CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | |||||
| CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||||
| ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||||
| SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||||
| SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||||
| DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||||
| DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||||
| CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c | |||||
| CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c | |||||
| ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c | |||||
| ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c | |||||
| CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c | |||||
| CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c | |||||
| ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = gemm_beta_rvv.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = gemm_beta_rvv.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = zgemm_beta_rvv.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||||
| endif | |||||
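The fragment above is a KERNEL list: each `*KERNEL` variable binds one BLAS entry point to the source file that implements it, so all of the Level-1 routines here resolve to shared `*_rvv.c` implementations. Those kernels use one common strip-mined loop shape: ask the hardware for a legal vector length `vl <= n`, process `vl` elements, advance by `vl`. Below is a minimal plain-C model of that loop, with `VLMAX` as a hypothetical stand-in for the value a real `__riscv_vsetvl_*` call would return and a SCAL-like body purely for illustration:

```c
#include <stdio.h>

#define VLMAX 4                        /* hypothetical hardware vector length */

int main(void)
{
    double data[7] = {1, 2, 3, 4, 5, 6, 7};
    double *x = data;
    size_t n = 7;
    /* Same loop header the *_rvv.c kernels use: vl is renegotiated each
     * pass, so full strips and the final partial strip share one body. */
    for (size_t vl; n > 0; n -= vl, x += vl) {
        vl = (n < VLMAX) ? n : VLMAX;  /* models vl = VSETVL(n) */
        for (size_t i = 0; i < vl; i++)
            x[i] *= 2.0;               /* stand-in vector body: scal, alpha = 2 */
    }
    printf("%g %g\n", data[0], data[6]);   /* prints: 2 14 */
    return 0;
}
```

Because `vl` shrinks automatically on the last pass, no scalar tail loop is needed.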
| @@ -0,0 +1,199 @@ | |||||
| SAMAXKERNEL = amax_vector.c | |||||
| DAMAXKERNEL = amax_vector.c | |||||
| CAMAXKERNEL = zamax_vector.c | |||||
| ZAMAXKERNEL = zamax_vector.c | |||||
| SAMINKERNEL = amin_vector.c | |||||
| DAMINKERNEL = amin_vector.c | |||||
| CAMINKERNEL = zamin_vector.c | |||||
| ZAMINKERNEL = zamin_vector.c | |||||
| SMAXKERNEL = max_vector.c | |||||
| DMAXKERNEL = max_vector.c | |||||
| SMINKERNEL = min_vector.c | |||||
| DMINKERNEL = min_vector.c | |||||
| ISAMAXKERNEL = iamax_vector.c | |||||
| IDAMAXKERNEL = iamax_vector.c | |||||
| ICAMAXKERNEL = izamax_vector.c | |||||
| IZAMAXKERNEL = izamax_vector.c | |||||
| ISAMINKERNEL = iamin_vector.c | |||||
| IDAMINKERNEL = iamin_vector.c | |||||
| ICAMINKERNEL = izamin_vector.c | |||||
| IZAMINKERNEL = izamin_vector.c | |||||
| ISMAXKERNEL = imax_vector.c | |||||
| IDMAXKERNEL = imax_vector.c | |||||
| ISMINKERNEL = imin_vector.c | |||||
| IDMINKERNEL = imin_vector.c | |||||
| SASUMKERNEL = asum_vector.c | |||||
| DASUMKERNEL = asum_vector.c | |||||
| CASUMKERNEL = zasum_vector.c | |||||
| ZASUMKERNEL = zasum_vector.c | |||||
| SSUMKERNEL = sum_vector.c | |||||
| DSUMKERNEL = sum_vector.c | |||||
| CSUMKERNEL = zsum_vector.c | |||||
| ZSUMKERNEL = zsum_vector.c | |||||
| SAXPYKERNEL = axpy_vector.c | |||||
| DAXPYKERNEL = axpy_vector.c | |||||
| CAXPYKERNEL = zaxpy_vector.c | |||||
| ZAXPYKERNEL = zaxpy_vector.c | |||||
| SCOPYKERNEL = copy_vector.c | |||||
| DCOPYKERNEL = copy_vector.c | |||||
| CCOPYKERNEL = zcopy_vector.c | |||||
| ZCOPYKERNEL = zcopy_vector.c | |||||
| SDOTKERNEL = dot_vector.c | |||||
| DDOTKERNEL = dot_vector.c | |||||
| CDOTKERNEL = zdot_vector.c | |||||
| ZDOTKERNEL = zdot_vector.c | |||||
| DSDOTKERNEL = ../generic/dot.c | |||||
| SNRM2KERNEL = nrm2_vector.c | |||||
| DNRM2KERNEL = nrm2_vector.c | |||||
| CNRM2KERNEL = znrm2_vector.c | |||||
| ZNRM2KERNEL = znrm2_vector.c | |||||
| SROTKERNEL = rot_vector.c | |||||
| DROTKERNEL = rot_vector.c | |||||
| CROTKERNEL = zrot_vector.c | |||||
| ZROTKERNEL = zrot_vector.c | |||||
| SSCALKERNEL = scal_vector.c | |||||
| DSCALKERNEL = scal_vector.c | |||||
| CSCALKERNEL = zscal_vector.c | |||||
| ZSCALKERNEL = zscal_vector.c | |||||
| SSWAPKERNEL = swap_vector.c | |||||
| DSWAPKERNEL = swap_vector.c | |||||
| CSWAPKERNEL = zswap_vector.c | |||||
| ZSWAPKERNEL = zswap_vector.c | |||||
| SGEMVNKERNEL = gemv_n_vector.c | |||||
| DGEMVNKERNEL = gemv_n_vector.c | |||||
| CGEMVNKERNEL = zgemv_n_vector.c | |||||
| ZGEMVNKERNEL = zgemv_n_vector.c | |||||
| SGEMVTKERNEL = gemv_t_vector.c | |||||
| DGEMVTKERNEL = gemv_t_vector.c | |||||
| CGEMVTKERNEL = zgemv_t_vector.c | |||||
| ZGEMVTKERNEL = zgemv_t_vector.c | |||||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c | |||||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c | |||||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c | |||||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c | |||||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| SSYMV_U_KERNEL = symv_U_vector.c | |||||
| SSYMV_L_KERNEL = symv_L_vector.c | |||||
| DSYMV_U_KERNEL = symv_U_vector.c | |||||
| DSYMV_L_KERNEL = symv_L_vector.c | |||||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| CHEMV_L_KERNEL = zhemv_LM_vector.c | |||||
| CHEMV_M_KERNEL = zhemv_LM_vector.c | |||||
| CHEMV_U_KERNEL = zhemv_UV_vector.c | |||||
| CHEMV_V_KERNEL = zhemv_UV_vector.c | |||||
| ZHEMV_L_KERNEL = zhemv_LM_vector.c | |||||
| ZHEMV_M_KERNEL = zhemv_LM_vector.c | |||||
| ZHEMV_U_KERNEL = zhemv_UV_vector.c | |||||
| ZHEMV_V_KERNEL = zhemv_UV_vector.c | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| @@ -0,0 +1,281 @@ | |||||
| # ********************************************************************************** | |||||
| # Copyright (c) 2022, The OpenBLAS Project | |||||
| # All rights reserved. | |||||
| # Redistribution and use in source and binary forms, with or without | |||||
| # modification, are permitted provided that the following conditions are | |||||
| # met: | |||||
| # 1. Redistributions of source code must retain the above copyright | |||||
| # notice, this list of conditions and the following disclaimer. | |||||
| # 2. Redistributions in binary form must reproduce the above copyright | |||||
| # notice, this list of conditions and the following disclaimer in | |||||
| # the documentation and/or other materials provided with the | |||||
| # distribution. | |||||
| # 3. Neither the name of the OpenBLAS project nor the names of | |||||
| # its contributors may be used to endorse or promote products | |||||
| # derived from this software without specific prior written permission. | |||||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| # ********************************************************************************** | |||||
| SAMAXKERNEL = amax_rvv.c | |||||
| DAMAXKERNEL = amax_rvv.c | |||||
| CAMAXKERNEL = zamax_rvv.c | |||||
| ZAMAXKERNEL = zamax_rvv.c | |||||
| SAMINKERNEL = amin_rvv.c | |||||
| DAMINKERNEL = amin_rvv.c | |||||
| CAMINKERNEL = zamin_rvv.c | |||||
| ZAMINKERNEL = zamin_rvv.c | |||||
| SMAXKERNEL = max_rvv.c | |||||
| DMAXKERNEL = max_rvv.c | |||||
| SMINKERNEL = min_rvv.c | |||||
| DMINKERNEL = min_rvv.c | |||||
| ISAMAXKERNEL = iamax_rvv.c | |||||
| IDAMAXKERNEL = iamax_rvv.c | |||||
| ICAMAXKERNEL = izamax_rvv.c | |||||
| IZAMAXKERNEL = izamax_rvv.c | |||||
| ISAMINKERNEL = iamin_rvv.c | |||||
| IDAMINKERNEL = iamin_rvv.c | |||||
| ICAMINKERNEL = izamin_rvv.c | |||||
| IZAMINKERNEL = izamin_rvv.c | |||||
| ISMAXKERNEL = imax_rvv.c | |||||
| IDMAXKERNEL = imax_rvv.c | |||||
| ISMINKERNEL = imin_rvv.c | |||||
| IDMINKERNEL = imin_rvv.c | |||||
| SASUMKERNEL = asum_rvv.c | |||||
| DASUMKERNEL = asum_rvv.c | |||||
| CASUMKERNEL = zasum_rvv.c | |||||
| ZASUMKERNEL = zasum_rvv.c | |||||
| SSUMKERNEL = sum_rvv.c | |||||
| DSUMKERNEL = sum_rvv.c | |||||
| CSUMKERNEL = zsum_rvv.c | |||||
| ZSUMKERNEL = zsum_rvv.c | |||||
| SAXPYKERNEL = axpy_rvv.c | |||||
| DAXPYKERNEL = axpy_rvv.c | |||||
| CAXPYKERNEL = zaxpy_rvv.c | |||||
| ZAXPYKERNEL = zaxpy_rvv.c | |||||
| SAXPBYKERNEL = axpby_rvv.c | |||||
| DAXPBYKERNEL = axpby_rvv.c | |||||
| CAXPBYKERNEL = zaxpby_rvv.c | |||||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||||
| SCOPYKERNEL = copy_rvv.c | |||||
| DCOPYKERNEL = copy_rvv.c | |||||
| CCOPYKERNEL = zcopy_rvv.c | |||||
| ZCOPYKERNEL = zcopy_rvv.c | |||||
| SDOTKERNEL = dot_rvv.c | |||||
| DDOTKERNEL = dot_rvv.c | |||||
| CDOTKERNEL = zdot_rvv.c | |||||
| ZDOTKERNEL = zdot_rvv.c | |||||
| DSDOTKERNEL = dot_rvv.c | |||||
| SNRM2KERNEL = nrm2_rvv.c | |||||
| DNRM2KERNEL = nrm2_rvv.c | |||||
| CNRM2KERNEL = znrm2_rvv.c | |||||
| ZNRM2KERNEL = znrm2_rvv.c | |||||
| SROTKERNEL = rot_rvv.c | |||||
| DROTKERNEL = rot_rvv.c | |||||
| CROTKERNEL = zrot_rvv.c | |||||
| ZROTKERNEL = zrot_rvv.c | |||||
| SSCALKERNEL = scal_rvv.c | |||||
| DSCALKERNEL = scal_rvv.c | |||||
| CSCALKERNEL = zscal_rvv.c | |||||
| ZSCALKERNEL = zscal_rvv.c | |||||
| SSWAPKERNEL = swap_rvv.c | |||||
| DSWAPKERNEL = swap_rvv.c | |||||
| CSWAPKERNEL = zswap_rvv.c | |||||
| ZSWAPKERNEL = zswap_rvv.c | |||||
| SGEMVNKERNEL = gemv_n_rvv.c | |||||
| DGEMVNKERNEL = gemv_n_rvv.c | |||||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||||
| SGEMVTKERNEL = gemv_t_rvv.c | |||||
| DGEMVTKERNEL = gemv_t_rvv.c | |||||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||||
| CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c | |||||
| ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c | |||||
| # SGEMM_UNROLL_N set in params.h | |||||
| ifeq ($(SGEMM_UNROLL_N), 8) | |||||
| # UNROLL_M is VLMAX | |||||
| SGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||||
| SGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||||
| SGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||||
| SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c | |||||
| SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRMMKERNEL = trmmkernel_rvv_v1x8.c | |||||
| STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||||
| STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||||
| STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||||
| STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||||
| SSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||||
| SSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||||
| endif | |||||
| # DGEMM_UNROLL_N set in params.h | |||||
| ifeq ($(DGEMM_UNROLL_N), 8) | |||||
| # UNROLL_M is VLMAX | |||||
| DGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||||
| DGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||||
| DGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||||
| DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c | |||||
| DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DTRMMKERNEL = trmmkernel_rvv_v1x8.c | |||||
| DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||||
| DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||||
| DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||||
| DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||||
| DSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||||
| DSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||||
| endif | |||||
| CGEMMKERNEL = zgemmkernel_rvv_v1x4.c | |||||
| CGEMMINCOPY = zgemm_ncopy_rvv_v1.c | |||||
| CGEMMITCOPY = zgemm_tcopy_rvv_v1.c | |||||
| CGEMMONCOPY = zgemm_ncopy_4_rvv.c | |||||
| CGEMMOTCOPY = zgemm_tcopy_4_rvv.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c | |||||
| ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c | |||||
| ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c | |||||
| ZGEMMONCOPY = zgemm_ncopy_4_rvv.c | |||||
| ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||||
| STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||||
| DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||||
| CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||||
| CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||||
| CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||||
| CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||||
| ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||||
| ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||||
| ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||||
| ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||||
| TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c | |||||
| TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c | |||||
| TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c | |||||
| TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c | |||||
| ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c | |||||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c | |||||
| ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c | |||||
| ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c | |||||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||||
| ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c | |||||
| ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c | |||||
| CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c | |||||
| CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c | |||||
| ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c | |||||
| ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c | |||||
| CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c | |||||
| CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c | |||||
| ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c | |||||
| ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c | |||||
| ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c | |||||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c | |||||
| CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c | |||||
| CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c | |||||
| CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c | |||||
| CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = gemm_beta_rvv.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = gemm_beta_rvv.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = zgemm_beta_rvv.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||||
| endif | |||||
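This KERNEL variant selects the `rvv_v1x*` GEMM/TRMM kernels, where per the comments above `UNROLL_M` is the vector length itself ("UNROLL_M is VLMAX") and `UNROLL_N` is fixed at 8. A rough plain-C model of that tiling is sketched below, assuming A is packed into K-by-vl panels as the `gemm_ncopy_rvv_v1.c` name suggests; `VL` is a hypothetical fixed stand-in for the runtime vector length, and this is an illustration of the scheme, not the kernel's actual code:

```c
#include <stdio.h>

#define VL 4            /* stand-in for the runtime vector length (rows)  */
#define UNROLL_N 8      /* columns per tile, matching the x8 in the names */

/* C[VL x UNROLL_N] += A[K x VL] * B[K x UNROLL_N]; A is a packed column
 * panel (k-major), B a packed row panel, C row-major. */
static void tile_v1x8(int K, const double *A, const double *B, double *C)
{
    double acc[UNROLL_N][VL] = {{0}};   /* models 8 live vector registers */
    for (int k = 0; k < K; k++)
        for (int jn = 0; jn < UNROLL_N; jn++)
            for (int i = 0; i < VL; i++)          /* one vfmacc per column */
                acc[jn][i] += A[k * VL + i] * B[k * UNROLL_N + jn];
    for (int jn = 0; jn < UNROLL_N; jn++)
        for (int i = 0; i < VL; i++)
            C[i * UNROLL_N + jn] += acc[jn][i];
}

int main(void)
{
    double A[2 * VL], B[2 * UNROLL_N], C[VL * UNROLL_N] = {0};
    for (int i = 0; i < 2 * VL; i++)       A[i] = 1.0;
    for (int i = 0; i < 2 * UNROLL_N; i++) B[i] = 1.0;
    tile_v1x8(2, A, B, C);                 /* K = 2: every C entry becomes 2 */
    printf("%g\n", C[0]);                  /* prints: 2 */
    return 0;
}
```

Keeping all eight column accumulators live across the K loop is what makes the fixed `UNROLL_N = 8` condition in the `ifeq` blocks above matter.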
| @@ -0,0 +1,102 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <float.h> | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu | |||||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu | |||||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| FLOAT maxf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||||
| FLOAT_V_T vx, vmax; | |||||
| FLOAT_V_T_M1 v_res; | |||||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||||
| size_t vlmax = VSETVL_MAX; | |||||
| vmax = VFMVVF_FLOAT(0.0, vlmax); | |||||
| if(inc_x == 1) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vx = VFABSV_FLOAT(vx, vl); | |||||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vx = VFABSV_FLOAT(vx, vl); | |||||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||||
| } | |||||
| } | |||||
| v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); | |||||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||||
| return(maxf); | |||||
| } | |||||
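One detail worth noting in the kernel above: the accumulate uses the tail-undisturbed form of the max (`VFMAXVV_FLOAT_TU`), and the final `VFREDMAXVS_FLOAT` reduces over `vlmax` lanes rather than the last `vl`. That is only correct because lanes at or past `vl` keep their previous values, and because the 0.0 seed can never beat an |x| value. A plain-C model of this mechanism, with a hypothetical lane count:

```c
#include <math.h>
#include <stdio.h>

#define VLMAX 4   /* hypothetical number of accumulator lanes */

int main(void)
{
    double x[6] = {3, -7, 2, -9, 5, -4};   /* n = 6: one full strip, then vl = 2 */
    double acc[VLMAX] = {0};               /* 0.0 seed is safe: |x| >= 0 always  */

    for (int i = 0; i < 6; ) {
        int vl = (6 - i < VLMAX) ? 6 - i : VLMAX;    /* models vl = VSETVL(n)  */
        for (int l = 0; l < vl; l++) {               /* lanes >= vl untouched: */
            double a = fabs(x[i + l]);               /* that is what _tu means */
            if (a > acc[l]) acc[l] = a;
        }
        i += vl;
    }
    double m = acc[0];                     /* models vfredmax over all vlmax lanes */
    for (int l = 1; l < VLMAX; l++)
        if (acc[l] > m) m = acc[l];
    printf("%g\n", m);                     /* prints: 9 */
    return 0;
}
```

With a tail-agnostic accumulate instead, lanes beyond the final `vl` could hold garbage and the full-width reduction would be wrong.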
| @@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||||
| #ifdef RISCV64_ZVL256B | |||||
| # define LMUL m2 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #else | #else | ||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||||
| # define LMUL m8 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #endif | #endif | ||||
| #define _ | |||||
| #define JOIN2_X(x, y) x ## y | |||||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||||
| #ifdef RISCV_0p10_INTRINSICS | |||||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||||
| #else | |||||
| #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||||
| #endif | |||||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| { | { | ||||
| BLASLONG i=0, j=0; | BLASLONG i=0, j=0; | ||||
| @@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT maxf=0.0; | FLOAT maxf=0.0; | ||||
| if (n <= 0 || inc_x <= 0) return(maxf); | if (n <= 0 || inc_x <= 0) return(maxf); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_max; | |||||
| FLOAT_V_T_M1 v_res, v_zero; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_zero = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T v0, v1; | |||||
| FLOAT_V_T_M1 v_res; | |||||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||||
| MASK_T mask0, mask1; | |||||
| FLOAT zero = 0.0; | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_max = VFMVVF_FLOAT(0, gvl); | |||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | v1 = VLEV_FLOAT(&x[j+gvl], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v1 = VFABS_FLOAT(v1, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||||
| j += gvl*2; | j += gvl*2; | ||||
| } | } | ||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||||
| maxf = *((FLOAT*)&v_res); | |||||
| //maxf = v_res[0]; | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||||
| if(*((FLOAT*)&v_res) > maxf) | |||||
| maxf = *((FLOAT*)&v_res); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| @@ -169,94 +99,27 @@ asm volatile( | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG inc_xv = inc_x * gvl; | BLASLONG inc_xv = inc_x * gvl; | ||||
| v_max = VFMVVF_FLOAT(0, gvl); | |||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v1 = VFABS_FLOAT(v1, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||||
| j += gvl*2; | j += gvl*2; | ||||
| ix += inc_xv*2; | ix += inc_xv*2; | ||||
| } | } | ||||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||||
| maxf = *((FLOAT*)&v_res); | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||||
| if(*((FLOAT*)&v_res) > maxf) | |||||
| maxf = *((FLOAT*)&v_res); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| maxf = EXTRACT_FLOAT(v_res); | |||||
| return(maxf); | return(maxf); | ||||
| } | } | ||||
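The rewritten hunk above replaces the duplicated per-type macro blocks and inline assembly with one set of token-pasting macros: `JOIN` glues the `ELEN`/`LMUL` fragments into a single intrinsic name, so the same line yields `vle32_v_f32m8`, `vle64_v_f64m8`, or the `m2` variants depending on the target. `RISCV_RVV` is presumably the project's wrapper for the `__riscv_` prefix on newer intrinsics versions; the self-contained demo below omits it and just stringizes the pasted name so the expansion can be inspected:

```c
#include <stdio.h>

#define ELEN 32
#define LMUL m8

/* Same pasting chain as the hunk above. */
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

/* Stringize after expansion, for printing only. */
#define STR_X(x) #x
#define STR(x) STR_X(x)

int main(void)
{
    /* vle ## 32 ## _v_f ## 32 ## m8  ->  vle32_v_f32m8 */
    puts(STR(JOIN(vle, ELEN, _v_f, ELEN, LMUL)));
    return 0;
}
```

Switching `ELEN` to 64 and `LMUL` to `m2` prints `vle64_v_f64m2`, which is how the `RISCV64_ZVL256B` branch retargets every kernel from one source file.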
| @@ -0,0 +1,102 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <float.h> | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu | |||||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu | |||||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| FLOAT minf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(minf); | |||||
| FLOAT_V_T vx, vmin; | |||||
| FLOAT_V_T_M1 v_res; | |||||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||||
| size_t vlmax = VSETVL_MAX; | |||||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||||
| if(inc_x == 1) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vx = VFABSV_FLOAT(vx, vl); | |||||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vx = VFABSV_FLOAT(vx, vl); | |||||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||||
| } | |||||
| } | |||||
| v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); | |||||
| minf = VFMVFS_FLOAT_M1(v_res); | |||||
| return(minf); | |||||
| } | |||||
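Unlike the AMAX kernel, the AMIN kernel above cannot seed its accumulators with zero: it needs a starting value that no |x[i]| can lose to, hence `FLT_MAX` in both `v_res` and `vmin`. A scalar reference showing the same seeding logic (`amin_ref` is an illustrative name, not an OpenBLAS symbol):

```c
#include <float.h>
#include <math.h>
#include <stddef.h>
#include <stdio.h>

static float amin_ref(size_t n, const float *x, ptrdiff_t inc_x)
{
    if (n == 0 || inc_x <= 0) return 0.0f;  /* same early-out as the kernel */
    float m = FLT_MAX;                      /* identity element for min     */
    for (size_t i = 0; i < n; i++, x += inc_x) {
        float a = fabsf(*x);
        if (a < m) m = a;
    }
    return m;
}

int main(void)
{
    float x[5] = {3.0f, -0.5f, 2.0f, -4.0f, 1.0f};
    printf("%g\n", amin_ref(5, x, 1));      /* prints: 0.5 */
    return 0;
}
```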
| @@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | |||||
| #include <float.h> | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||||
| #ifdef RISCV64_ZVL256B | |||||
| # define LMUL m2 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # define ABS fabs | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # define ABS fabsf | |||||
| # endif | |||||
| #else | #else | ||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||||
| # define LMUL m8 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # define ABS fabs | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # define ABS fabsf | |||||
| # endif | |||||
| #endif | #endif | ||||
| #define _ | |||||
| #define JOIN2_X(x, y) x ## y | |||||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||||
| #ifdef RISCV_0p10_INTRINSICS | |||||
| #define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||||
| #else | |||||
| #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||||
| #endif | |||||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| { | { | ||||
| BLASLONG i=0, j=0; | |||||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||||
| FLOAT minf=FLT_MAX; | |||||
| BLASLONG i=0, j=0; | |||||
| BLASLONG ix=0; | |||||
| FLOAT minf=0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(minf); | |||||
| minf = ABS(*x); | |||||
| x += inc_x; | |||||
| --n; | |||||
| if (n == 0) return(minf); | |||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_min; | |||||
| FLOAT_V_T_M1 v_res, v_max; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||||
| FLOAT_V_T v0, v1; | |||||
| FLOAT_V_T_M1 v_res; | |||||
| v_res = VFMVVF_FLOAT_M1(minf, 1); | |||||
| MASK_T mask0, mask1; | |||||
| FLOAT zero = 0.0; | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | v1 = VLEV_FLOAT(&x[j+gvl], gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v1 = VFABS_FLOAT(v1, gvl); | |||||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||||
| j += gvl*2; | j += gvl*2; | ||||
| } | } | ||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = *((FLOAT*)&v_res); | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
| if(*((FLOAT*)&v_res) < minf) | |||||
| minf = *((FLOAT*)&v_res); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | BLASLONG stride_x = inc_x * sizeof(FLOAT); | ||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| BLASLONG idx = 0, inc_xv = inc_x * gvl; | |||||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||||
| BLASLONG inc_xv = inc_x * gvl; | |||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||||
| v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v1) | |||||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v1 = VFABS_FLOAT(v1, gvl); | |||||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||||
| j += gvl*2; | j += gvl*2; | ||||
| idx += inc_xv*2; | |||||
| ix += inc_xv*2; | |||||
| } | } | ||||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||||
| minf = *((FLOAT*)&v_res); | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||||
| #if defined(DOUBLE) | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e64,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #else | |||||
| asm volatile( | |||||
| "vsetvli zero, zero, e8, m1\n\t" | |||||
| "vor.vv v0, %1, %1\n\t" | |||||
| "vsetvli x0, %3, e32,m8 \n\t" | |||||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||||
| :"+vd"(v0) | |||||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||||
| :"v0"); | |||||
| #endif | |||||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||||
| if(*((FLOAT*)&v_res) < minf) | |||||
| minf = *((FLOAT*)&v_res); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| return(minf); | |||||
| } | |||||
| minf = EXTRACT_FLOAT(v_res); | |||||
| return(minf); | |||||
| } | |||||
| @@ -0,0 +1,99 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu | |||||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu | |||||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| FLOAT asumf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||||
| FLOAT_V_T vx, vsum; | |||||
| FLOAT_V_T_M1 v_res; | |||||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||||
| size_t vlmax = VSETVL_MAX; | |||||
| vsum = VFMVVF_FLOAT(0.0, vlmax); | |||||
| if(inc_x == 1) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vx = VFABSV_FLOAT(vx, vl); | |||||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vx = VFABSV_FLOAT(vx, vl); | |||||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||||
| } | |||||
| } | |||||
| v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); | |||||
| asumf = VFMVFS_FLOAT_M1(v_res); | |||||
| return(asumf); | |||||
| } | |||||
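The new kernel uses the RVV 1.0 strip-mining idiom: each pass asks `VSETVL(n)` how many elements the hardware will process, then steps the pointers by that amount, and the tail-undisturbed add (`VFADDVV_FLOAT_TU`) leaves lanes beyond the final, shorter `vl` untouched, so the closing full-width reduction still sees every partial sum. A scalar sketch of the same control flow, with a made-up `VLMAX` standing in for the hardware vector length (not part of the patch):

```c
#include <math.h>
#include <stdio.h>

#define VLMAX 8                 /* hypothetical stand-in for vsetvlmax */

/* Scalar model of the strip-mining loop: each pass consumes
 * min(n, VLMAX) elements, mirroring what VSETVL(n) returns. */
static double asum_ref(size_t n, const double *x)
{
    double sum = 0.0;
    for (size_t vl; n > 0; n -= vl, x += vl) {
        vl = n < VLMAX ? n : VLMAX;
        for (size_t i = 0; i < vl; i++)   /* one vector op per chunk */
            sum += fabs(x[i]);
    }
    return sum;
}

int main(void)
{
    double x[5] = { 1, -2, 3, -4, 5 };
    printf("%g\n", asum_ref(5, x));   /* prints 15 */
    return 0;
}
```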
| @@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #include <math.h> | #include <math.h> | ||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||||
| #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 | |||||
| #define MASK_T vbool4_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||||
| #ifdef RISCV64_ZVL256B | |||||
| # define LMUL m2 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #else | #else | ||||
| #define VSETVL(n) vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||||
| #define MASK_T vbool8_t | |||||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||||
| # define LMUL m8 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #endif | #endif | ||||
| #define _ | |||||
| #define JOIN2_X(x, y) x ## y | |||||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||||
| #ifdef RISCV_0p10_INTRINSICS | |||||
| #define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||||
| #else | |||||
| #define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||||
| #endif | |||||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||||
| #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) | |||||
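These `JOIN` helpers build the intrinsic names by token-pasting the `ELEN` and `LMUL` fragments, with `_` defined empty so unused slots paste to nothing (`RISCV_RVV` additionally selects between the 0.10 and `__riscv_`-prefixed 1.0 spellings elsewhere in the tree). A self-contained illustration of the pasting scheme, using a made-up `load_*` function family rather than the real intrinsics:

```c
#include <stdio.h>

#define _                           /* empty: pads unused JOIN slots */
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)   /* indirection so ELEN/LMUL expand first */
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define ELEN 32
#define LMUL m2

/* hypothetical function standing in for an RVV intrinsic */
void load_e32m2(void) { puts("called load_e32m2"); }

#define LOAD JOIN(load, _e, ELEN, LMUL, _)   /* expands to load_e32m2 */

int main(void) { LOAD(); return 0; }
```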
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | ||||
| { | { | ||||
| BLASLONG i=0, j=0; | BLASLONG i=0, j=0; | ||||
| BLASLONG ix=0; | |||||
| FLOAT asumf=0.0; | FLOAT asumf=0.0; | ||||
| if (n <= 0 || inc_x <= 0) return(asumf); | if (n <= 0 || inc_x <= 0) return(asumf); | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| FLOAT_V_T v0, v1, v_zero,v_sum; | |||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| gvl = VSETVL_MAX; | |||||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||||
| FLOAT_V_T v0, v1, v_sum; | |||||
| FLOAT_V_T_M1 v_res; | |||||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||||
| MASK_T mask0, mask1; | |||||
| if(inc_x == 1){ | if(inc_x == 1){ | ||||
| gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_sum = VFMVVF_FLOAT(0, gvl); | v_sum = VFMVVF_FLOAT(0, gvl); | ||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | ||||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | v1 = VLEV_FLOAT(&x[j+gvl], gvl); | ||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v1 = VFABS_FLOAT(v1, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| } | } | ||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += *((FLOAT*)&v_res); | |||||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
| v0 = VLEV_FLOAT(&x[j], gvl); | v0 = VLEV_FLOAT(&x[j], gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| asumf += *((FLOAT*)&v_res); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| }else{ | }else{ | ||||
| gvl = VSETVL(n); | gvl = VSETVL(n); | ||||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | unsigned int stride_x = inc_x * sizeof(FLOAT); | ||||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||||
| if(gvl <= n/2){ | if(gvl <= n/2){ | ||||
| v_sum = VFMVVF_FLOAT(0, gvl); | v_sum = VFMVVF_FLOAT(0, gvl); | ||||
| BLASLONG inc_xv = inc_x * gvl; | |||||
| for(i=0,j=0; i<n/(gvl*2); i++){ | for(i=0,j=0; i<n/(gvl*2); i++){ | ||||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | ||||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||||
| v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl); | |||||
| v1 = VFABS_FLOAT(v1, gvl); | |||||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | ||||
| j += gvl * 2; | j += gvl * 2; | ||||
| inc_xv += inc_xv * 2; | |||||
| } | } | ||||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||||
| asumf += *((FLOAT*)&v_res); | |||||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||||
| } | } | ||||
| for(;j<n;){ | for(;j<n;){ | ||||
| gvl = VSETVL(n-j); | gvl = VSETVL(n-j); | ||||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | ||||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||||
| asumf += *((FLOAT*)&v_res); | |||||
| v0 = VFABS_FLOAT(v0, gvl); | |||||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||||
| j += gvl; | j += gvl; | ||||
| } | } | ||||
| } | } | ||||
| asumf = EXTRACT_FLOAT(v_res); | |||||
| return(asumf); | return(asumf); | ||||
| } | } | ||||
| @@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix,iy; | BLASLONG ix,iy; | ||||
| if ( n < 0 ) return(0); | |||||
| if ( n <= 0 ) return(0); | |||||
| ix = 0; | ix = 0; | ||||
| iy = 0; | iy = 0; | ||||
| @@ -0,0 +1,173 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
| #endif | |||||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| FLOAT_V_T vx, vy; | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( beta == 0.0 ) { | |||||
| if ( alpha == 0.0 ) { | |||||
| if (1 == inc_y) { | |||||
| memset(&y[0], 0, n * sizeof(FLOAT)); | |||||
| } else { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| size_t vl = VSETVL(n); | |||||
| vy = VFMVVF_FLOAT(0.0, vl); | |||||
| for ( ; n > 0; n -= vl, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if ((1 == inc_x) && (1 == inc_y)) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||||
| VSEV_FLOAT (y, vy, vl); | |||||
| } | |||||
| } else if (1 == inc_x) { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||||
| } | |||||
| } else if (1 == inc_y) { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||||
| VSEV_FLOAT (y, vy, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||||
| } | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if ( alpha == 0.0 ) { | |||||
| if (1 == inc_y) { | |||||
| for (size_t vl; n > 0; n -= vl, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vy = VLEV_FLOAT(y, vl); | |||||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||||
| VSEV_FLOAT (y, vy, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| if ((1 == inc_x) && (1 == inc_y)) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vy = VLEV_FLOAT(y, vl); | |||||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||||
| VSEV_FLOAT (y, vy, vl); | |||||
| } | |||||
| } else if (1 == inc_x) { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||||
| } | |||||
| } else if (1 == inc_y) { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vy = VLEV_FLOAT(y, vl); | |||||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||||
| VSEV_FLOAT (y, vy, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
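The branch structure above specializes y := alpha*x + beta*y so that the beta == 0 paths never read y (a memset, or a plain scale of x) and the alpha == 0, beta == 0 path stores exact zeros. A scalar reference with the same branch semantics, offered as a sketch rather than the canonical definition:

```c
/* Scalar reference mirroring the kernel's branch semantics:
 * beta == 0 overwrites y without reading it, and alpha == 0 with
 * beta == 0 stores 0.0 even if x holds NaN/Inf. */
void axpby_ref(long n, double alpha, const double *x, long inc_x,
               double beta, double *y, long inc_y)
{
    if (n <= 0) return;
    for (long i = 0, ix = 0, iy = 0; i < n; i++, ix += inc_x, iy += inc_y) {
        if (beta == 0.0)
            y[iy] = (alpha == 0.0) ? 0.0 : alpha * x[ix];
        else
            y[iy] = (alpha == 0.0) ? beta * y[iy]
                                   : alpha * x[ix] + beta * y[iy];
    }
}
```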
| @@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle32_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||||
| #define VSEV_FLOAT vse32_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||||
| #ifdef RISCV64_ZVL256B | |||||
| # define LMUL m2 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #else | #else | ||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle64_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||||
| #define VSEV_FLOAT vse64_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||||
| # define LMUL m4 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #endif | #endif | ||||
| #define _ | |||||
| #define JOIN2_X(x, y) x ## y | |||||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||||
| #define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) | |||||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | ||||
| { | { | ||||
| if (n < 0) return(0); | |||||
| if (n <= 0) return(0); | |||||
| BLASLONG i=0, j=0; | BLASLONG i=0, j=0; | ||||
| unsigned int gvl = 0; | unsigned int gvl = 0; | ||||
| @@ -60,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| BLASLONG stride_x, stride_y, ix = 0, iy = 0; | BLASLONG stride_x, stride_y, ix = 0, iy = 0; | ||||
| if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ | |||||
| if ( beta == 0.0 ) | |||||
| { | |||||
| if ( alpha == 0.0 ) | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = 0.0 ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = alpha * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if ( alpha == 0.0 ) | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = beta * y[iy] ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| while(i < n) | |||||
| { | |||||
| y[iy] = alpha * x[ix] + beta * y[iy] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } else { /* vectorized approach for non-zero increments */ | |||||
| if(beta == 0.0){ | if(beta == 0.0){ | ||||
| if(alpha == 0.0){//alpha == 0 && beta == 0 | if(alpha == 0.0){//alpha == 0 && beta == 0 | ||||
| if(inc_y == 1){ | if(inc_y == 1){ | ||||
| @@ -372,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||||
| } | } | ||||
| } | } | ||||
| return(0); | return(0); | ||||
| } | |||||
| } | } | ||||
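The scalar fallback exists because a zero increment makes the result order-dependent: with inc_y == 0 every iteration rescales and accumulates into the same y[0], which a vectorized load/compute/store pass cannot reproduce. A small demonstration of that sequential dependence (values chosen arbitrarily):

```c
#include <stdio.h>

int main(void)
{
    /* inc_x = 1, inc_y = 0: the reference semantics update y[0]
     * once per iteration, so beta is applied n times, not once. */
    double alpha = 1.0, beta = 0.5, x[4] = { 1, 2, 3, 4 }, y0 = 8.0;
    for (int i = 0; i < 4; i++)
        y0 = alpha * x[i] + beta * y0;
    printf("%g\n", y0);   /* 6.625: depends on evaluation order */
    return 0;
}
```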
| @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix,iy; | BLASLONG ix,iy; | ||||
| if ( n < 0 ) return(0); | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( da == 0.0 ) return(0); | if ( da == 0.0 ) return(0); | ||||
| ix = 0; | ix = 0; | ||||
| @@ -0,0 +1,109 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| if ( n <= 0 ) return(0); | |||||
| if ( da == 0.0 ) return(0); | |||||
| FLOAT_V_T vx, vy; | |||||
| if(inc_x == 1 && inc_y == 1) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vy = VLEV_FLOAT(y, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||||
| VSEV_FLOAT (y, vy, vl); | |||||
| } | |||||
| } else if (1 == inc_y) { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vy = VLEV_FLOAT(y, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||||
| VSEV_FLOAT(y, vy, vl); | |||||
| } | |||||
| } else if (1 == inc_x) { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLEV_FLOAT(x, vl); | |||||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
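For reference, the operation the four inc-combinations above implement is plain axpy, y := da*x + y, and the da == 0 early-out is valid because y is then left untouched. A one-loop scalar sketch:

```c
/* Scalar axpy reference: the element increments here correspond to
 * the byte strides (inc * sizeof(FLOAT)) passed to vlse/vsse above. */
void axpy_ref(long n, double da, const double *x, long inc_x,
              double *y, long inc_y)
{
    if (n <= 0 || da == 0.0) return;
    for (long i = 0, ix = 0, iy = 0; i < n; i++, ix += inc_x, iy += inc_y)
        y[iy] += da * x[ix];
}
```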
| @@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| *****************************************************************************/ | *****************************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | |||||
| #define VLEV_FLOAT vle32_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||||
| #define VSEV_FLOAT vse32_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #ifdef RISCV64_ZVL256B | |||||
| # define LMUL m2 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #else | #else | ||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | |||||
| #define VLEV_FLOAT vle64_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||||
| #define VSEV_FLOAT vse64_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| # define LMUL m4 | |||||
| # if defined(DOUBLE) | |||||
| # define ELEN 64 | |||||
| # else | |||||
| # define ELEN 32 | |||||
| # endif | |||||
| #endif | #endif | ||||
| #define _ | |||||
| #define JOIN2_X(x, y) x ## y | |||||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||||
| #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | ||||
| { | { | ||||
| BLASLONG i=0, j=0, jx=0, jy=0; | BLASLONG i=0, j=0, jx=0, jy=0; | ||||
| @@ -53,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||||
| FLOAT_V_T vy0, vy1; | FLOAT_V_T vy0, vy1; | ||||
| BLASLONG stride_x, stride_y; | BLASLONG stride_x, stride_y; | ||||
| if (n < 0) return(0); | |||||
| if (n <= 0) return(0); | |||||
| if (da == 0.0) return(0); | if (da == 0.0) return(0); | ||||
| if (inc_x == 1 && inc_y == 1) { | if (inc_x == 1 && inc_y == 1) { | ||||
| @@ -0,0 +1,996 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=2 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=True | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='float' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=32 | |||||
| ELEN_PARAM=32 | |||||
| LMUL_ACC=2 | |||||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||||
| VLEV='__riscv_vle32_v_f32m2' | |||||
| VLSEV='__riscv_vlse32_v_f32m2' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||||
| VSETVL='__riscv_vsetvl_e32m2' | |||||
| VSEV='__riscv_vse32_v_f32m2' | |||||
| VSSEV='__riscv_vsse32_v_f32m2' | |||||
| acc_vector_t='vfloat32m2_t' | |||||
| output='cgemm_kernel_8x4_zvl128b.c' | |||||
| param_scalar_t='float' | |||||
| param_vector_t='vfloat32m2_t' | |||||
| */ | |||||
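A note on the blocking implied by these settings (arithmetic from the RVV spec, not part of the generated file): with reg_width_bits = 128, ELEN_PARAM = 32 and LMUL = 2, one register group holds LMUL * VLEN / ELEN = 2 * 128 / 32 = 8 elements, so the M = 8 unroll fills exactly one m2 group per real/imaginary plane and `__riscv_vsetvl_e32m2(8)` in the main pass returns the full 8.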
| #include "common.h" | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmsac | |||||
| #define VFMACC_RI __riscv_vfmacc | |||||
| #endif | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmacc | |||||
| #define VFMACC_RI __riscv_vfmsac | |||||
| #endif | |||||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| #define S0 1 | |||||
| #define S1 1 | |||||
| #define S2 -1 | |||||
| #define S3 1 | |||||
| #define VFMACC_RR __riscv_vfmacc | |||||
| #define VFMACC_RI __riscv_vfnmsac | |||||
| #endif | |||||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| #define S0 1 | |||||
| #define S1 -1 | |||||
| #define S2 -1 | |||||
| #define S3 -1 | |||||
| #define VFMACC_RR __riscv_vfmsac | |||||
| #define VFMACC_RI __riscv_vfnmacc | |||||
| #endif | |||||
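The S0..S3 signs and the VFMACC_RR/VFMACC_RI selections encode the four conjugation cases of the complex product: expanding (ar + i*ai)(br + i*bi) gives re = ar*br - ai*bi and im = ai*br + ar*bi, and conjugating A or B simply flips the sign of ai or bi, which the vector path folds into fused ops (vfmsac computes the subtract form, vfnmsac/vfnmacc the negated forms). A standalone check of the sign table, kept outside the kernel:

```c
#include <stdio.h>

/* Sign scheme used by the kernel's scalar tails (and folded into the
 * fused VFMACC_RR/VFMACC_RI choices in the vector path):
 *   re += S0*ar*br + S1*ai*bi,   im += S2*ai*br + S3*ar*bi
 * NN-type:           (ar+i*ai)(br+i*bi) -> S = { 1,-1, 1, 1 }
 * conj(B) (NR-type): flip sign of bi    -> S = { 1, 1, 1,-1 }
 * conj(A) (RN-type): flip sign of ai    -> S = { 1, 1,-1, 1 }
 * conj(A)conj(B):                       -> S = { 1,-1,-1,-1 } */
static void cmac(double ar, double ai, double br, double bi,
                 const int S[4], double *re, double *im)
{
    *re += S[0] * ar * br + S[1] * ai * bi;
    *im += S[2] * ai * br + S[3] * ar * bi;
}

int main(void)
{
    const int NN[4] = { 1, -1, 1, 1 }, CC[4] = { 1, -1, -1, -1 };
    double re = 0, im = 0;
    cmac(1, 2, 3, 4, NN, &re, &im);
    printf("%g%+gi\n", re, im);   /* (1+2i)(3+4i) = -5+10i */
    re = im = 0;
    cmac(1, 2, 3, 4, CC, &re, &im);
    printf("%g%+gi\n", re, im);   /* (1-2i)(3-4i) = -5-10i */
    return 0;
}
```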
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| float B2r = B[bi + 2 * 2 + 0]; | |||||
| float B2i = B[bi + 2 * 2 + 1]; | |||||
| float B3r = B[bi + 3 * 2 + 0]; | |||||
| float B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||||
| // leaving 6 vector registers for temporaries | |||||
| // performing 2 operations between reuses of temporaries | |||||
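// (the RVV ISA provides 32 vector registers; at LMUL=2 they pair up
//  into 32/2 = 16 register groups, hence the 2 + 8 + 6 budget above)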
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| vfloat32m2_t ACC2r = tmp0r; | |||||
| vfloat32m2_t ACC2i = tmp0i; | |||||
| vfloat32m2_t ACC3r = tmp1r; | |||||
| vfloat32m2_t ACC3i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| B2r = B[bi + 2 * 2 + 0]; | |||||
| B2i = B[bi + 2 * 2 + 1]; | |||||
| B3r = B[bi + 3 * 2 + 0]; | |||||
| B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| float B2r = B[bi + 2 * 2 + 0]; | |||||
| float B2i = B[bi + 2 * 2 + 1]; | |||||
| float B3r = B[bi + 3 * 2 + 0]; | |||||
| float B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||||
| // leaving 6 vector registers for temporaries | |||||
| // performing 2 operations between reuses of temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| vfloat32m2_t ACC2r = tmp0r; | |||||
| vfloat32m2_t ACC2i = tmp0i; | |||||
| vfloat32m2_t ACC3r = tmp1r; | |||||
| vfloat32m2_t ACC3i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| B2r = B[bi + 2 * 2 + 0]; | |||||
| B2i = B[bi + 2 * 2 + 1]; | |||||
| B3r = B[bi + 3 * 2 + 0]; | |||||
| B3i = B[bi + 3 * 2 + 1]; | |||||
| bi += 4 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| float result8 = 0; | |||||
| float result9 = 0; | |||||
| float result10 = 0; | |||||
| float result11 = 0; | |||||
| float result12 = 0; | |||||
| float result13 = 0; | |||||
| float result14 = 0; | |||||
| float result15 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; | |||||
| result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; | |||||
| result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; | |||||
| result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||||
| Cr += result8 * alphar; | |||||
| Ci += result9 * alphar; | |||||
| Cr -= result9 * alphai; | |||||
| Ci += result8 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; | |||||
| Cr += result10 * alphar; | |||||
| Ci += result11 * alphar; | |||||
| Cr -= result11 * alphai; | |||||
| Ci += result10 * alphai; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||||
| Cr += result12 * alphar; | |||||
| Ci += result13 * alphar; | |||||
| Cr -= result13 * alphai; | |||||
| Ci += result12 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; | |||||
| Cr += result14 * alphar; | |||||
| Ci += result15 * alphar; | |||||
| Cr -= result15 * alphai; | |||||
| Ci += result14 * alphai; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||||
| result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||||
| result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 4 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
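After the N/4 main pass, leftover columns and rows are peeled in power-of-two tails (N & 2 and below here, and M & 4/2/1 inside each column block), with tiles of two rows or fewer handled by scalar code, matching M_tail_scalar_from=2 in the generator settings. The decomposition itself is just a bitmask walk, as this sketch shows:

```c
#include <stdio.h>

/* Tail peeling sketch: e.g. M = 13 is covered as 8 + 4 + 1. */
int main(void)
{
    long M = 13, m_top = (M / 8) * 8;   /* rows done by the main pass */
    if (M & 4) { printf("vector tail of 4 at row %ld\n", m_top); m_top += 4; }
    if (M & 2) { printf("scalar tail of 2 at row %ld\n", m_top); m_top += 2; }
    if (M & 1) { printf("scalar tail of 1 at row %ld\n", m_top); m_top += 1; }
    return 0;
}
```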
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||||
| // leaving 10 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| float B1r = B[bi + 1 * 2 + 0]; | |||||
| float B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||||
| // leaving 10 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| vfloat32m2_t ACC1r = tmp1r; | |||||
| vfloat32m2_t ACC1i = tmp1i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| B1r = B[bi + 1 * 2 + 0]; | |||||
| B1i = B[bi + 1 * 2 + 1]; | |||||
| bi += 2 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| float result4 = 0; | |||||
| float result5 = 0; | |||||
| float result6 = 0; | |||||
| float result7 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
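// Scalar tail. S0..S3 are presumably the +/-1 sign constants defined
// alongside VFMACC_RR/VFMACC_RI for each conjugation variant, so result0
// and result1 accumulate the real and imaginary parts of sum_k A*B.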
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
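// Complex alpha update: (Cr + i*Ci) += (alphar + i*alphai)*(r + i*s),
// i.e. Cr += alphar*r - alphai*s and Ci += alphar*s + alphai*r, matching
// the four scalar FMAs below for each element of the C tile.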
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result4 * alphar; | |||||
| Ci += result5 * alphar; | |||||
| Cr -= result5 * alphai; | |||||
| Ci += result4 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||||
| Cr += result6 * alphar; | |||||
| Ci += result7 * alphar; | |||||
| Cr -= result7 * alphai; | |||||
| Ci += result6 * alphai; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 2 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e32m2(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||||
| // leaving 12 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 8 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e32m2(4); | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| float B0r = B[bi + 0 * 2 + 0]; | |||||
| float B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||||
| // leaving 12 vector registers for temporaries | |||||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| vfloat32m2_t ACC0r = tmp0r; | |||||
| vfloat32m2_t ACC0i = tmp0i; | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0r = B[bi + 0 * 2 + 0]; | |||||
| B0i = B[bi + 0 * 2 + 1]; | |||||
| bi += 1 * 2; | |||||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| ai += 4 * 2; | |||||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| float result2 = 0; | |||||
| float result3 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||||
| ai += 2 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||||
| Cr += result2 * alphar; | |||||
| Ci += result3 * alphar; | |||||
| Cr -= result3 * alphai; | |||||
| Ci += result2 * alphai; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| float result0 = 0; | |||||
| float result1 = 0; | |||||
| BLASLONG ai = m_top * K * 2; | |||||
| BLASLONG bi = n_top * K * 2; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||||
| ai += 1 * 2; | |||||
| bi += 1 * 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| float Cr, Ci; | |||||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||||
| Cr += result0 * alphar; | |||||
| Ci += result1 * alphar; | |||||
| Cr -= result1 * alphai; | |||||
| Ci += result0 * alphai; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
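Taken together, the vector and scalar paths above compute the same complex update. A minimal scalar sketch of one output element, assuming the non-conjugated variant (S0=S2=S3=1, S1=-1); the function name is illustrative, not part of the patch:

```c
/* Minimal scalar sketch of one C element of the complex kernel above. */
static void cgemm_ref_1x1(long K, float alphar, float alphai,
                          const float *A, const float *B, float *C)
{
    float acc_r = 0.0f, acc_i = 0.0f;
    for (long k = 0; k < K; k++) {
        float ar = A[2 * k], ai = A[2 * k + 1];
        float br = B[2 * k], bi = B[2 * k + 1];
        acc_r += ar * br - ai * bi; /* real part of a*b      */
        acc_i += ai * br + ar * bi; /* imaginary part of a*b */
    }
    C[0] += alphar * acc_r - alphai * acc_i; /* scale by complex alpha */
    C[1] += alphar * acc_i + alphai * acc_r;
}
```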
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 BLASLONG i=0;
 BLASLONG ix=0,iy=0;
-if ( n < 0 ) return(0);
+if ( n <= 0 ) return(0);
 while(i < n)
 {
| @@ -0,0 +1,94 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||||
| #endif | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| if(n <= 0) return(0); | |||||
| FLOAT_V_T v0; | |||||
| if(inc_x == 1 && inc_y == 1) { | |||||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| v0 = VLEV_FLOAT(x, vl); | |||||
| VSEV_FLOAT(y, v0, vl); | |||||
| } | |||||
| } else if (inc_y == 1) { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||||
| vl = VSETVL(n); | |||||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||||
| VSEV_FLOAT(y, v0, vl); | |||||
| } | |||||
| } else if(inc_x == 1) { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| v0 = VLEV_FLOAT(x, vl); | |||||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||||
| vl = VSETVL(n); | |||||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
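The kernel above is a strided copy: the four branches only select unit-stride accesses (`vle`/`vse`) over strided ones (`vlse`/`vsse`) where possible, with `VSETVL` strip-mining `n` into vector-length chunks. A scalar model, with plain types standing in for `BLASLONG`/`FLOAT` (illustrative only):

```c
/* Scalar model of the vector copy kernel above; illustrative only. */
static void copy_ref(long n, const float *x, long inc_x,
                     float *y, long inc_y)
{
    for (long i = 0; i < n; i++)
        y[i * inc_y] = x[i * inc_x]; /* same result as all four RVV paths */
}
```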
@@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
+#ifdef RISCV64_ZVL256B
+# define LMUL m2
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
+# define LMUL m8
+# if defined(DOUBLE)
+# define ELEN 64
+# else
+# define ELEN 32
+# endif
 #endif
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
+#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
+#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
+#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
+#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
+#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 BLASLONG i=0, j=0;
@@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 stride_x = inc_x * sizeof(FLOAT);
 if(gvl <= n/4){
 BLASLONG inc_xv = inc_x * gvl;
-BLASLONG gvl3 = gvl * 3;
+unsigned int gvl3 = gvl * 3;
 BLASLONG inc_xv3 = inc_xv * 3;
 for(i=0,j=0; i<n/(4*gvl); i++){
 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
@@ -86,7 +99,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 if(gvl <= n/4){
 BLASLONG inc_yv = inc_y * gvl;
 BLASLONG inc_yv3 = inc_yv * 3;
-BLASLONG gvl3 = gvl * 3;
+unsigned int gvl3 = gvl * 3;
 for(i=0,j=0; i<n/(4*gvl); i++){
 v0 = VLEV_FLOAT(&x[j], gvl);
 VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
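A quick aside on the `JOIN` machinery introduced above: assuming `RISCV_RVV()` prepends the `__riscv_` intrinsic prefix (as the other RVV kernels in this patch do), the nested token pastes resolve each wrapper to a concrete intrinsic name at preprocessing time, e.g. `VLSEV_FLOAT` to `__riscv_vlse32_v_f32m2` for `ELEN=32`, `LMUL=m2`. The empty `#define _` makes the unused fifth argument vanish instead of leaving a trailing underscore. A standalone sketch of the same trick:

```c
/* Standalone sketch of the JOIN token-pasting used in the patch above. */
#include <stdio.h>

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y) JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define ELEN 32
#define LMUL m2

/* Pastes to the identifier vsetvl_e32m2; the empty `_` expands to
   nothing, so no stray underscore is appended. */
#define NAME JOIN(vsetvl, _e, ELEN, LMUL, _)

#define STR_X(s) #s
#define STR(s) STR_X(s)

int main(void) {
    printf("%s\n", STR(NAME)); /* prints: vsetvl_e32m2 */
    return 0;
}
```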
@@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
 asm volatile(
 "vsetvli zero, zero, e64,m1 \n\t"
-"fmv.w.x ft11, zero \n\t"
+"fmv.d.x ft11, zero \n\t"
 "mv t0, %[BK] \n\t"
 "vfmv.v.f v16, ft11 \n\t"
| @@ -0,0 +1,492 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=4 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='double' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=4 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||||
| VLEV='__riscv_vle64_v_f64m4' | |||||
| VLSEV='__riscv_vlse64_v_f64m4' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||||
| VSETVL='__riscv_vsetvl_e64m4' | |||||
| VSEV='__riscv_vse64_v_f64m4' | |||||
| VSSEV='__riscv_vsse64_v_f64m4' | |||||
| acc_vector_t='vfloat64m4_t' | |||||
| output='dgemm_kernel_8x4_zvl128b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m4_t' | |||||
| */ | |||||
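// Sizing note (derived from the generator settings above): zvl128b means
// VLEN = 128 bits, so e64 holds two doubles per register and LMUL=4 gives
// VLMAX = 8 -- one vfloat64m4_t spans the full M=8 panel height. The
// recurring `ci += ldc - gvl * 0` steps are generator artifacts: the
// `gvl * N` term is the within-column offset, which is zero when a single
// vector group covers the whole panel.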
| #include "common.h" | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
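        // Each k step is a rank-1 update: the scalar B[0..3] values are
        // broadcast against the A column vector by vfmacc_vf, so four C
        // columns accumulate per pass while A is loaded only once.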
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| ai += 2; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| ai += 1; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| ai += 2; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| ai += 1; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||||
| ci = n_top * ldc + m_top; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| ai += 2; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| for (BLASLONG k = 0; k < K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| ai += 1; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
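For orientation, the packed-panel layout the kernel above assumes can be written out in scalar form: `A` advances by 8 packed rows and `B` by 4 packed columns per `k` step, so each step is a rank-1 update of the 8x4 accumulator tile. A hypothetical scalar model (names are illustrative, not part of the patch):

```c
/* Hypothetical scalar model of the 8x4 packed-panel update above. */
static void dgemm_8x4_ref(long K, const double *Apack, const double *Bpack,
                          double acc[4][8])
{
    for (long k = 0; k < K; k++)
        for (long j = 0; j < 4; j++)     /* one column of the 8x4 tile */
            for (long m = 0; m < 8; m++) /* one row of the A panel     */
                acc[j][m] += Apack[k * 8 + m] * Bpack[k * 4 + j];
}
```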
| @@ -0,0 +1,860 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Settings: | |||||
| LMUL=1 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=8 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl256b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='gemm' | |||||
| param_precision='double' | |||||
| reg_width_bits=256 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=1 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m1' | |||||
| VFMUL='__riscv_vfmul_vf_f64m1' | |||||
| VLEV='__riscv_vle64_v_f64m1' | |||||
| VLSEV='__riscv_vlse64_v_f64m1' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' | |||||
| VSETVL='__riscv_vsetvl_e64m1' | |||||
| VSEV='__riscv_vse64_v_f64m1' | |||||
| VSSEV='__riscv_vsse64_v_f64m1' | |||||
| acc_vector_t='vfloat64m1_t' | |||||
| output='dgemm_kernel_8x8_zvl256b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m1_t' | |||||
| */ | |||||
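// Sizing note (derived from the generator settings above): zvl256b means
// VLEN = 256 bits, so e64m1 gives VLMAX = 4 and the M=8 panel needs two
// vfloat64m1_t loads (A0/A1); the 8x8 tile then occupies 16 m1
// accumulators, i.e. 18 of the 32 vector registers before temporaries.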
| #include "common.h" | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j=0; j<N/8; j+=1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m1(4); | |||||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| double B1 = B[bi+1]; | |||||
| double B2 = B[bi+2]; | |||||
| double B3 = B[bi+3]; | |||||
| double B4 = B[bi+4]; | |||||
| double B5 = B[bi+5]; | |||||
| double B6 = B[bi+6]; | |||||
| double B7 = B[bi+7]; | |||||
| bi += 8; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl); | |||||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl); | |||||
| vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl); | |||||
| vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl); | |||||
| vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl); | |||||
| vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl); | |||||
| vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl); | |||||
| vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl); | |||||
| vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl); | |||||
| vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| B1 = B[bi+1]; | |||||
| B2 = B[bi+2]; | |||||
| B3 = B[bi+3]; | |||||
| B4 = B[bi+4]; | |||||
| B5 = B[bi+5]; | |||||
| B6 = B[bi+6]; | |||||
| B7 = B[bi+7]; | |||||
| bi += 8; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl); | |||||
| result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl); | |||||
| result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl); | |||||
| result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl); | |||||
| result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl); | |||||
| result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl); | |||||
| result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl); | |||||
| result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl); | |||||
| result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
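      // Each C column spans two m1 vectors (rows 0-3 and 4-7), so the
      // loads alternate: advance ci by gvl within a column, then by
      // ldc - gvl to step to the top of the next column.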
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||||
| c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl ); | |||||
| c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl ); | |||||
| c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl ); | |||||
| c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl ); | |||||
| c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl ); | |||||
| c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl ); | |||||
| c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl ); | |||||
| c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c15, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if( M & 4 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| double B1 = B[bi+1]; | |||||
| double B2 = B[bi+2]; | |||||
| double B3 = B[bi+3]; | |||||
| double B4 = B[bi+4]; | |||||
| double B5 = B[bi+5]; | |||||
| double B6 = B[bi+6]; | |||||
| double B7 = B[bi+7]; | |||||
| bi += 8; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl); | |||||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl); | |||||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl); | |||||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| B1 = B[bi+1]; | |||||
| B2 = B[bi+2]; | |||||
| B3 = B[bi+3]; | |||||
| B4 = B[bi+4]; | |||||
| B5 = B[bi+5]; | |||||
| B6 = B[bi+6]; | |||||
| B7 = B[bi+7]; | |||||
| bi += 8; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if( M & 2 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| double result8 = 0; | |||||
| double result9 = 0; | |||||
| double result10 = 0; | |||||
| double result11 = 0; | |||||
| double result12 = 0; | |||||
| double result13 = 0; | |||||
| double result14 = 0; | |||||
| double result15 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+1]*B[bi+0]; | |||||
| result2+=A[ai+0]*B[bi+1]; | |||||
| result3+=A[ai+1]*B[bi+1]; | |||||
| result4+=A[ai+0]*B[bi+2]; | |||||
| result5+=A[ai+1]*B[bi+2]; | |||||
| result6+=A[ai+0]*B[bi+3]; | |||||
| result7+=A[ai+1]*B[bi+3]; | |||||
| result8+=A[ai+0]*B[bi+4]; | |||||
| result9+=A[ai+1]*B[bi+4]; | |||||
| result10+=A[ai+0]*B[bi+5]; | |||||
| result11+=A[ai+1]*B[bi+5]; | |||||
| result12+=A[ai+0]*B[bi+6]; | |||||
| result13+=A[ai+1]*B[bi+6]; | |||||
| result14+=A[ai+0]*B[bi+7]; | |||||
| result15+=A[ai+1]*B[bi+7]; | |||||
| ai+=2; | |||||
| bi+=8; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+0*ldc+1] += alpha * result1; | |||||
| C[ci+1*ldc+0] += alpha * result2; | |||||
| C[ci+1*ldc+1] += alpha * result3; | |||||
| C[ci+2*ldc+0] += alpha * result4; | |||||
| C[ci+2*ldc+1] += alpha * result5; | |||||
| C[ci+3*ldc+0] += alpha * result6; | |||||
| C[ci+3*ldc+1] += alpha * result7; | |||||
| C[ci+4*ldc+0] += alpha * result8; | |||||
| C[ci+4*ldc+1] += alpha * result9; | |||||
| C[ci+5*ldc+0] += alpha * result10; | |||||
| C[ci+5*ldc+1] += alpha * result11; | |||||
| C[ci+6*ldc+0] += alpha * result12; | |||||
| C[ci+6*ldc+1] += alpha * result13; | |||||
| C[ci+7*ldc+0] += alpha * result14; | |||||
| C[ci+7*ldc+1] += alpha * result15; | |||||
| m_top+=2; | |||||
| } | |||||
| if( M & 1 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+0]*B[bi+1]; | |||||
| result2+=A[ai+0]*B[bi+2]; | |||||
| result3+=A[ai+0]*B[bi+3]; | |||||
| result4+=A[ai+0]*B[bi+4]; | |||||
| result5+=A[ai+0]*B[bi+5]; | |||||
| result6+=A[ai+0]*B[bi+6]; | |||||
| result7+=A[ai+0]*B[bi+7]; | |||||
| ai+=1; | |||||
| bi+=8; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+1*ldc+0] += alpha * result1; | |||||
| C[ci+2*ldc+0] += alpha * result2; | |||||
| C[ci+3*ldc+0] += alpha * result3; | |||||
| C[ci+4*ldc+0] += alpha * result4; | |||||
| C[ci+5*ldc+0] += alpha * result5; | |||||
| C[ci+6*ldc+0] += alpha * result6; | |||||
| C[ci+7*ldc+0] += alpha * result7; | |||||
| m_top+=1; | |||||
| } | |||||
| n_top += 8; | |||||
| } | |||||
| // -- tails for N=4 | |||||
| if( N & 4 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| double B1 = B[bi+1]; | |||||
| double B2 = B[bi+2]; | |||||
| double B3 = B[bi+3]; | |||||
| bi += 4; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl); | |||||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| B1 = B[bi+1]; | |||||
| B2 = B[bi+2]; | |||||
| B3 = B[bi+3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl); | |||||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl); | |||||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl); | |||||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if( M & 4 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| double B1 = B[bi+1]; | |||||
| double B2 = B[bi+2]; | |||||
| double B3 = B[bi+3]; | |||||
| bi += 4; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| B1 = B[bi+1]; | |||||
| B2 = B[bi+2]; | |||||
| B3 = B[bi+3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if( M & 2 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+1]*B[bi+0]; | |||||
| result2+=A[ai+0]*B[bi+1]; | |||||
| result3+=A[ai+1]*B[bi+1]; | |||||
| result4+=A[ai+0]*B[bi+2]; | |||||
| result5+=A[ai+1]*B[bi+2]; | |||||
| result6+=A[ai+0]*B[bi+3]; | |||||
| result7+=A[ai+1]*B[bi+3]; | |||||
| ai+=2; | |||||
| bi+=4; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+0*ldc+1] += alpha * result1; | |||||
| C[ci+1*ldc+0] += alpha * result2; | |||||
| C[ci+1*ldc+1] += alpha * result3; | |||||
| C[ci+2*ldc+0] += alpha * result4; | |||||
| C[ci+2*ldc+1] += alpha * result5; | |||||
| C[ci+3*ldc+0] += alpha * result6; | |||||
| C[ci+3*ldc+1] += alpha * result7; | |||||
| m_top+=2; | |||||
| } | |||||
| if( M & 1 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+0]*B[bi+1]; | |||||
| result2+=A[ai+0]*B[bi+2]; | |||||
| result3+=A[ai+0]*B[bi+3]; | |||||
| ai+=1; | |||||
| bi+=4; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+1*ldc+0] += alpha * result1; | |||||
| C[ci+2*ldc+0] += alpha * result2; | |||||
| C[ci+3*ldc+0] += alpha * result3; | |||||
| m_top+=1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if( N & 2 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| double B1 = B[bi+1]; | |||||
| bi += 2; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| B1 = B[bi+1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if( M & 4 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| double B1 = B[bi+1]; | |||||
| bi += 2; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| B1 = B[bi+1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if( M & 2 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+1]*B[bi+0]; | |||||
| result2+=A[ai+0]*B[bi+1]; | |||||
| result3+=A[ai+1]*B[bi+1]; | |||||
| ai+=2; | |||||
| bi+=2; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+0*ldc+1] += alpha * result1; | |||||
| C[ci+1*ldc+0] += alpha * result2; | |||||
| C[ci+1*ldc+1] += alpha * result3; | |||||
| m_top+=2; | |||||
| } | |||||
| if( M & 1 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+0]*B[bi+1]; | |||||
| ai+=1; | |||||
| bi+=2; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+1*ldc+0] += alpha * result1; | |||||
| m_top+=1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if( N & 1 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| m_top = 0; | |||||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| bi += 1; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if( M & 4 ) { | |||||
| gvl = __riscv_vsetvl_e64m1(4); | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| double B0 = B[bi+0]; | |||||
| bi += 1; | |||||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||||
| for(BLASLONG k=1; k<K; k++) { | |||||
| B0 = B[bi+0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||||
| ci=n_top*ldc+m_top; | |||||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if( M & 2 ) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| result1+=A[ai+1]*B[bi+0]; | |||||
| ai+=2; | |||||
| bi+=1; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| C[ci+0*ldc+1] += alpha * result1; | |||||
| m_top+=2; | |||||
| } | |||||
| if( M & 1 ) { | |||||
| double result0 = 0; | |||||
| BLASLONG ai=m_top*K; | |||||
| BLASLONG bi=n_top*K; | |||||
| for(BLASLONG k=0; k<K; k++) { | |||||
| result0+=A[ai+0]*B[bi+0]; | |||||
| ai+=1; | |||||
| bi+=1; | |||||
| } | |||||
| BLASLONG ci=n_top*ldc+m_top; | |||||
| C[ci+0*ldc+0] += alpha * result0; | |||||
| m_top+=1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
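
The kernel above ends with scalar tails for the M=2 and M=1 remainders, and those tails double as a readable specification of the packed-panel layout: for an unroll of U rows, A holds U consecutive elements per k step and B holds one element per active column. Below is a minimal scalar model of one 8x4 micro-tile under that layout -- a hypothetical `ref_tile_8x4` helper for cross-checking the intrinsics path, not part of the patch:

```c
/* Scalar model of one 8x4 GEMM micro-tile over packed panels.
   A: k-major, 8 rows per k step; B: k-major, 4 columns per k step.
   Hypothetical reference helper for validation, not OpenBLAS source. */
static void ref_tile_8x4(long K, double alpha, const double *A,
                         const double *B, double *C, long ldc)
{
    double acc[4][8] = {{0}};
    for (long k = 0; k < K; k++)
        for (int n = 0; n < 4; n++)
            for (int m = 0; m < 8; m++)
                acc[n][m] += A[8 * k + m] * B[4 * k + n]; /* same ai/bi strides as above */
    for (int n = 0; n < 4; n++)
        for (int m = 0; m < 8; m++)
            C[(long)n * ldc + m] += alpha * acc[n][m];    /* GEMM accumulates into C */
}
```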
@@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
+#if defined(DSDOT)
 	double dot = 0.0 ;
+#else
+	FLOAT dot = 0.0 ;
+#endif
 
 	if ( n < 1 ) return(dot);
 
 	while(i < n)
 	{
+#if defined(DSDOT)
+		dot += (double) y[iy] * (double) x[ix] ;
+#else
 		dot += y[iy] * x[ix] ;
+#endif
 		ix += inc_x ;
 		iy += inc_y ;
 		i++ ;
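
This hunk lets the one generic scalar source build both the plain dot products and the mixed-precision DSDOT/SDSDOT entry points, where `FLOAT` is `float` but accumulation must happen in `double`. A standalone sketch of the two behaviors (illustrative reference functions, not the library's code):

```c
/* Reference semantics, for illustration only: sdot accumulates in float,
   dsdot widens each product to double before accumulating. */
float sdot_ref(int n, const float *x, const float *y)
{
    float s = 0.0f;
    for (int i = 0; i < n; i++) s += x[i] * y[i];
    return s;
}

double dsdot_ref(int n, const float *x, const float *y)
{
    double s = 0.0;
    for (int i = 0; i < n; i++) s += (double)x[i] * (double)y[i];
    return s;
}
```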
| @@ -0,0 +1,126 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(DSDOT) | |||||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #else | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| #endif | |||||
| { | |||||
| double dot = 0.0; | |||||
| if ( n <= 0 ) return(dot); | |||||
| size_t vlmax = __riscv_vsetvlmax_e64m8(); | |||||
| vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax); | |||||
| if(inc_x == 1 && inc_y == 1) { | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||||
| vl = __riscv_vsetvl_e64m8(n); | |||||
| #if !defined(DOUBLE) | |||||
| vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); | |||||
| vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); | |||||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #else | |||||
| vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); | |||||
| vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); | |||||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #endif | |||||
| } | |||||
| } else if (1 == inc_x) { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||||
| vl = __riscv_vsetvl_e64m8(n); | |||||
| #if !defined(DOUBLE) | |||||
| vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); | |||||
| vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); | |||||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #else | |||||
| vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); | |||||
| vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); | |||||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #endif | |||||
| } | |||||
| } else if (1 == inc_y) { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||||
| vl = __riscv_vsetvl_e64m8(n); | |||||
| #if !defined(DOUBLE) | |||||
| vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); | |||||
| vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); | |||||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #else | |||||
| vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); | |||||
| vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); | |||||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #endif | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||||
| vl = __riscv_vsetvl_e64m8(n); | |||||
| #if !defined(DOUBLE) | |||||
| vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); | |||||
| vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); | |||||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #else | |||||
| vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); | |||||
| vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); | |||||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||||
| #endif | |||||
| } | |||||
| } | |||||
| vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); | |||||
| vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); | |||||
| dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); | |||||
| return(dot); | |||||
| } | |||||
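
The new RVV 1.0 kernel above relies on the tail-undisturbed (`_tu`) intrinsic variants: the accumulator `vr` is vlmax elements wide, and on the final strip where `vl < vlmax` the `_tu` multiply-add leaves lanes at or beyond `vl` untouched, so the single `vfredusum` over vlmax lanes at the end still sees every partial sum. A scalar model of that strip-mining pattern, with a made-up four-lane width purely for illustration:

```c
/* Scalar model of strip-mined accumulation with a tail-undisturbed update.
   LANES stands in for vlmax; 4 is an arbitrary illustrative value. */
#define LANES 4
double dot_strip_mined(long n, const double *x, const double *y)
{
    double acc[LANES] = {0};                 /* plays the role of vr */
    while (n > 0) {
        long vl = n < LANES ? n : LANES;     /* like __riscv_vsetvl_e64m8(n) */
        for (long i = 0; i < vl; i++)        /* _tu: lanes >= vl keep old sums */
            acc[i] += x[i] * y[i];
        x += vl; y += vl; n -= vl;
    }
    double s = 0.0;                          /* final vfredusum over all lanes */
    for (int i = 0; i < LANES; i++) s += acc[i];
    return s;
}
```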
@@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
 #endif
 
 #if defined(DSDOT)
@@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 		//tail
 		if(j < n){
@@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
 			//vr = VFDOTVV_FLOAT(vx, vy, gvl);
 			vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 	}else if(inc_y == 1){
 		gvl = VSETVL(n);
 		vr = VFMVVF_FLOAT(0, gvl);
-		int stride_x = inc_x * sizeof(FLOAT);
+		BLASLONG stride_x = inc_x * sizeof(FLOAT);
 		for(i=0,j=0; i<n/gvl; i++){
 			vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
 			vy = VLEV_FLOAT(&y[j], gvl);
@@ -107,9 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 		//tail
 		if(j < n){
@@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
 			//vr = VFDOTVV_FLOAT(vx, vy, gvl);
 			vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 	}else if(inc_x == 1){
 		gvl = VSETVL(n);
 		vr = VFMVVF_FLOAT(0, gvl);
-		int stride_y = inc_y * sizeof(FLOAT);
+		BLASLONG stride_y = inc_y * sizeof(FLOAT);
 		for(i=0,j=0; i<n/gvl; i++){
 			vx = VLEV_FLOAT(&x[j], gvl);
 			vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@@ -134,9 +138,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 		//tail
 		if(j < n){
@@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
 			//vr = VFDOTVV_FLOAT(vx, vy, gvl);
 			vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 	}else{
 		gvl = VSETVL(n);
 		vr = VFMVVF_FLOAT(0, gvl);
-		int stride_x = inc_x * sizeof(FLOAT);
-		int stride_y = inc_y * sizeof(FLOAT);
+		BLASLONG stride_x = inc_x * sizeof(FLOAT);
+		BLASLONG stride_y = inc_y * sizeof(FLOAT);
 		for(i=0,j=0; i<n/gvl; i++){
 			vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
 			vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@@ -162,9 +164,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			j += gvl;
 		}
 		if(j > 0){
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 		//tail
 		if(j < n){
@@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
 			//vr = VFDOTVV_FLOAT(vx, vy, gvl);
 			vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-			v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-			dot += (double)VFMVFS_FLOAT(v_res);
+			v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+			dot += (double)EXTRACT_FLOAT(v_res);
 		}
 	}
 	return(dot);
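
The `RISCV_RVV` and `EXTRACT_FLOAT` wrappers introduced by this hunk are what let one source file compile against both the v0.10 and v1.0 intrinsics spellings; their definitions live in OpenBLAS's common RISC-V header, not in this patch. A sketch of what they presumably expand to, inferred from the usage above -- treat the exact definitions as an assumption:

```c
/* Assumed shape of the portability shims (inferred, not copied from the
   authoritative OpenBLAS common header). */
#ifdef RISCV_0p10_INTRINSICS
#define RISCV_RVV(x) x                 /* v0.10 intrinsic names are bare       */
#else
#define RISCV_RVV(x) __riscv_##x       /* v1.0 names carry the __riscv_ prefix */
#endif
/* EXTRACT_FLOAT(v) reads element 0 of the reduction result vector, i.e. the
   vfmv_f_s operation -- the role the removed VFMVFS_FLOAT macro used to play. */
```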
| @@ -0,0 +1,152 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2023, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0, j=0; | |||||
| double dot = 0.0 ; | |||||
| if ( n < 1 ) return(dot); | |||||
| vfloat64m4_t vr; | |||||
| vfloat32m2_t vx, vy; | |||||
| unsigned int gvl = 0; | |||||
| vfloat64m1_t v_res, v_z0; | |||||
| gvl = vsetvlmax_e64m1(); | |||||
| v_res = vfmv_v_f_f64m1(0, gvl); | |||||
| v_z0 = vfmv_v_f_f64m1(0, gvl); | |||||
| if(inc_x == 1 && inc_y == 1){ | |||||
| gvl = vsetvl_e64m4(n); | |||||
| vr = vfmv_v_f_f64m4(0, gvl); | |||||
| for(i=0,j=0; i<n/gvl; i++){ | |||||
| vx = vle32_v_f32m2(&x[j], gvl); | |||||
| vy = vle32_v_f32m2(&y[j], gvl); | |||||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||||
| j += gvl; | |||||
| } | |||||
| if(j > 0){ | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| //tail | |||||
| if(j < n){ | |||||
| gvl = vsetvl_e64m4(n-j); | |||||
| vx = vle32_v_f32m2(&x[j], gvl); | |||||
| vy = vle32_v_f32m2(&y[j], gvl); | |||||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| }else if(inc_y == 1){ | |||||
| gvl = vsetvl_e64m4(n); | |||||
| vr = vfmv_v_f_f64m4(0, gvl); | |||||
| int stride_x = inc_x * sizeof(FLOAT); | |||||
| for(i=0,j=0; i<n/gvl; i++){ | |||||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||||
| vy = vle32_v_f32m2(&y[j], gvl); | |||||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||||
| j += gvl; | |||||
| } | |||||
| if(j > 0){ | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| //tail | |||||
| if(j < n){ | |||||
| gvl = vsetvl_e64m4(n-j); | |||||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||||
| vy = vle32_v_f32m2(&y[j], gvl); | |||||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| }else if(inc_x == 1){ | |||||
| gvl = vsetvl_e64m4(n); | |||||
| vr = vfmv_v_f_f64m4(0, gvl); | |||||
| int stride_y = inc_y * sizeof(FLOAT); | |||||
| for(i=0,j=0; i<n/gvl; i++){ | |||||
| vx = vle32_v_f32m2(&x[j], gvl); | |||||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||||
| j += gvl; | |||||
| } | |||||
| if(j > 0){ | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| //tail | |||||
| if(j < n){ | |||||
| gvl = vsetvl_e64m4(n-j); | |||||
| vx = vle32_v_f32m2(&x[j], gvl); | |||||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| }else{ | |||||
| gvl = vsetvl_e64m4(n); | |||||
| vr = vfmv_v_f_f64m4(0, gvl); | |||||
| int stride_x = inc_x * sizeof(FLOAT); | |||||
| int stride_y = inc_y * sizeof(FLOAT); | |||||
| for(i=0,j=0; i<n/gvl; i++){ | |||||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||||
| j += gvl; | |||||
| } | |||||
| if(j > 0){ | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| //tail | |||||
| if(j < n){ | |||||
| gvl = vsetvl_e64m4(n-j); | |||||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||||
| } | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
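
This 0.10-intrinsics DSDOT kernel loads the `float` inputs at LMUL=2 and accumulates through `vfwmacc` into an LMUL=4 `double` register group: widening doubles the element size, so the LMUL must double to keep the element counts equal. A compile-time check of that arithmetic (VLEN=128 is only an example value; the identity holds for any VLEN):

```c
/* Element-count match behind vfwmacc_vv_f64m4(f64m4_acc, f32m2, f32m2, vl). */
enum { VLEN = 128 };
enum { F32M2_ELEMS = VLEN / 32 * 2,    /* e32, LMUL=2 -> 8 when VLEN=128 */
       F64M4_ELEMS = VLEN / 64 * 4 };  /* e64, LMUL=4 -> 8 when VLEN=128 */
_Static_assert(F32M2_ELEMS == F64M4_ELEMS,
               "widening multiply-accumulate needs equal element counts");
```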
| @@ -0,0 +1,660 @@ | |||||
| /* | |||||
| AUTOGENERATED KERNEL | |||||
| Script: ./kernel/riscv64/generate_kernel.py | |||||
| Settings: | |||||
| LMUL=4 | |||||
| M=8 | |||||
| M_tail_scalar_from=2 | |||||
| N=4 | |||||
| __riscv_='__riscv_' | |||||
| complex=False | |||||
| conjugate=False | |||||
| cpu='zvl128b' | |||||
| force_acc_double=False | |||||
| index_type='BLASLONG' | |||||
| op='trmm' | |||||
| param_precision='double' | |||||
| reg_width_bits=128 | |||||
| tail_policy='' | |||||
| trace=False | |||||
| Derived: | |||||
| ELEN_ACC=64 | |||||
| ELEN_PARAM=64 | |||||
| LMUL_ACC=4 | |||||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||||
| VLEV='__riscv_vle64_v_f64m4' | |||||
| VLSEV='__riscv_vlse64_v_f64m4' | |||||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||||
| VSETVL='__riscv_vsetvl_e64m4' | |||||
| VSEV='__riscv_vse64_v_f64m4' | |||||
| VSSEV='__riscv_vsse64_v_f64m4' | |||||
| acc_vector_t='vfloat64m4_t' | |||||
| output='dtrmm_kernel_8x4_zvl128b.c' | |||||
| param_scalar_t='double' | |||||
| param_vector_t='vfloat64m4_t' | |||||
| */ | |||||
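
The settings header doubles as a reading guide for the kernel that follows: `cpu='zvl128b'` pins the minimum vector length at 128 bits, so an e64 group at `LMUL=4` holds (128/64)*4 = 8 doubles and a single `VLEV` load covers the whole `M=8` unroll each k iteration. As a worked check (VLEN taken from the zvl128b name, for illustration):

```c
/* Derived vector-group width for this kernel's generator settings. */
enum { VLEN_BITS = 128, ELEN = 64, LMUL = 4 };
enum { DOUBLES_PER_GROUP = (VLEN_BITS / ELEN) * LMUL };  /* = 8, matching M */
```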
| #include "common.h" | |||||
| #if defined(LEFT) != defined(TRANSA) | |||||
| #define BACKWARDS | |||||
| #endif | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||||
| { | |||||
| BLASLONG gvl = 0; | |||||
| BLASLONG m_top = 0; | |||||
| BLASLONG n_top = 0; | |||||
| // -- MAIN PASS | |||||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||||
| m_top = 0; | |||||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| // -- tails for main pass | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| double B2 = B[bi + 2]; | |||||
| double B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| B2 = B[bi + 2]; | |||||
| B3 = B[bi + 3]; | |||||
| bi += 4; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| double result4 = 0; | |||||
| double result5 = 0; | |||||
| double result6 = 0; | |||||
| double result7 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| result4 += A[ai + 0] * B[bi + 2]; | |||||
| result5 += A[ai + 1] * B[bi + 2]; | |||||
| result6 += A[ai + 0] * B[bi + 3]; | |||||
| result7 += A[ai + 1] * B[bi + 3]; | |||||
| ai += 2; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 4; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 4; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| result2 += A[ai + 0] * B[bi + 2]; | |||||
| result3 += A[ai + 0] * B[bi + 3]; | |||||
| ai += 1; | |||||
| bi += 4; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 4; | |||||
| } | |||||
| // -- tails for N=2 | |||||
| if (N & 2) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| double B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| B1 = B[bi + 1]; | |||||
| bi += 2; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| ci += ldc - gvl * 0; | |||||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| double result2 = 0; | |||||
| double result3 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| result2 += A[ai + 0] * B[bi + 1]; | |||||
| result3 += A[ai + 1] * B[bi + 1]; | |||||
| ai += 2; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 2; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 2; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 0] * B[bi + 1]; | |||||
| ai += 1; | |||||
| bi += 2; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 2; | |||||
| } | |||||
| // -- tails for N=1 | |||||
| if (N & 1) { | |||||
| gvl = __riscv_vsetvl_e64m4(8); | |||||
| m_top = 0; | |||||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 8; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 8; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 8; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 8; | |||||
| } | |||||
| if (M & 4) { | |||||
| gvl = __riscv_vsetvl_e64m4(4); | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 4; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 4; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| double B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||||
| B0 = B[bi + 0]; | |||||
| bi += 1; | |||||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||||
| ai += 4; | |||||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||||
| m_top += 4; | |||||
| } | |||||
| if (M & 2) { | |||||
| double result0 = 0; | |||||
| double result1 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 2; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 2; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| result1 += A[ai + 1] * B[bi + 0]; | |||||
| ai += 2; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||||
| m_top += 2; | |||||
| } | |||||
| if (M & 1) { | |||||
| double result0 = 0; | |||||
| BLASLONG ai = m_top * K; | |||||
| BLASLONG bi = n_top * K; | |||||
| BLASLONG pass_K = K; | |||||
| #ifdef LEFT | |||||
| BLASLONG off = offset + m_top; | |||||
| #else | |||||
| BLASLONG off = -offset + n_top; | |||||
| #endif | |||||
| #ifdef BACKWARDS | |||||
| ai += off * 1; | |||||
| bi += off * 1; | |||||
| pass_K -= off; | |||||
| #else | |||||
| #ifdef LEFT | |||||
| pass_K = off + 1; | |||||
| #else | |||||
| pass_K = off + 1; | |||||
| #endif | |||||
| #endif | |||||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||||
| result0 += A[ai + 0] * B[bi + 0]; | |||||
| ai += 1; | |||||
| bi += 1; | |||||
| } | |||||
| BLASLONG ci = n_top * ldc + m_top; | |||||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||||
| m_top += 1; | |||||
| } | |||||
| n_top += 1; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
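
The TRMM kernel differs from the GEMM kernel above it in two ways that the generated code encodes with preprocessor branches: C is overwritten rather than accumulated (`=` instead of `+=`), and only the triangular band of the operand participates, which is what the `off`/`pass_K` bookkeeping selects. A compact scalar restatement of that selection logic -- a hypothetical helper mirroring the `#ifdef`s, not part of the patch:

```c
/* How each tile trims its K loop to the triangular band.
   left/backwards mirror the LEFT and (LEFT != TRANSA) preprocessor tests. */
long trmm_pass_k(long K, long offset, long m_top, long n_top,
                 long unroll_m, long unroll_n,
                 int left, int backwards, long *skip)
{
    long off = left ? offset + m_top : -offset + n_top;
    if (backwards) {
        *skip = off;          /* advance ai/bi past the first off iterations */
        return K - off;
    }
    *skip = 0;
    return off + (left ? unroll_m : unroll_n);  /* cap the loop instead */
}
```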
| @@ -0,0 +1,89 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||||
| #endif | |||||
| // Optimizes the implementation in ../generic/gemm_beta.c | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||||
| IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, | |||||
| FLOAT *c, BLASLONG ldc) | |||||
| { | |||||
| BLASLONG chunk; | |||||
| FLOAT *c_offset; | |||||
| size_t vl; | |||||
| FLOAT_V_T vx; | |||||
| if (beta == ZERO) { | |||||
| vl = VSETVL(m); | |||||
| vx = VFMVVF_FLOAT(0.0, vl); | |||||
| for( ; n > 0; n--, c += ldc) { | |||||
| c_offset = c; | |||||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||||
| vl = VSETVL(chunk); | |||||
| VSEV_FLOAT(c_offset, vx, vl); | |||||
| } | |||||
| } | |||||
| } else { | |||||
| for( ; n > 0; n--, c += ldc) { | |||||
| c_offset = c; | |||||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||||
| vl = VSETVL(chunk); | |||||
| vx = VLEV_FLOAT(c_offset, vl); | |||||
| vx = VFMULVF_FLOAT(vx, beta, vl); | |||||
| VSEV_FLOAT(c_offset, vx, vl); | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
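
The `beta == ZERO` branch above is deliberately store-only: it never loads C, which both saves bandwidth and implements the BLAS convention that a zero beta clears any NaN/Inf already in C rather than propagating it. The scalar equivalent of the two paths, for reference (an illustrative function, not part of the patch):

```c
/* Scalar equivalent of the vectorized beta-scaling above. */
void gemm_beta_ref(long m, long n, double beta, double *c, long ldc)
{
    for (; n > 0; n--, c += ldc) {
        if (beta == 0.0) {
            for (long i = 0; i < m; i++) c[i] = 0.0;  /* store-only path */
        } else {
            for (long i = 0; i < m; i++) c[i] *= beta;
        }
    }
}
```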
| @@ -0,0 +1,197 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m1(n) | |||||
| #define FLOAT_V_T vfloat32m1_t | |||||
| #define FLOAT_VX2_T vfloat32m1x2_t | |||||
| #define FLOAT_VX4_T vfloat32m1x4_t | |||||
| #define FLOAT_VX8_T vfloat32m1x8_t | |||||
| #define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 | |||||
| #define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 | |||||
| #define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m1 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m1 | |||||
| #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 | |||||
| #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 | |||||
| #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m1(n) | |||||
| #define FLOAT_V_T vfloat64m1_t | |||||
| #define FLOAT_VX2_T vfloat64m1x2_t | |||||
| #define FLOAT_VX4_T vfloat64m1x4_t | |||||
| #define FLOAT_VX8_T vfloat64m1x8_t | |||||
| #define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 | |||||
| #define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 | |||||
| #define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m1 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m1 | |||||
| #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 | |||||
| #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 | |||||
| #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 | |||||
| #endif | |||||
| // RVV-vectorized version of the scalar implementation in ../generic/gemm_ncopy_8.c | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *a_offset; | |||||
| FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||||
| FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; | |||||
| FLOAT *b_offset; | |||||
| FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; | |||||
| FLOAT_VX2_T vx2; | |||||
| FLOAT_VX4_T vx4; | |||||
| FLOAT_VX8_T vx8; | |||||
| size_t vl; | |||||
| //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||||
| a_offset = a; | |||||
| b_offset = b; | |||||
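| // Pack 8 columns per pass: the segment store interleaves one element from each column, so b holds the 8 column values of a given row contiguously. | |||||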
| for(j = (n >> 3); j > 0; j--) { | |||||
| a_offset1 = a_offset; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset5 = a_offset4 + lda; | |||||
| a_offset6 = a_offset5 + lda; | |||||
| a_offset7 = a_offset6 + lda; | |||||
| a_offset8 = a_offset7 + lda; | |||||
| a_offset += 8 * lda; | |||||
| for(i = m; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||||
| v5 = VLEV_FLOAT(a_offset5, vl); | |||||
| v6 = VLEV_FLOAT(a_offset6, vl); | |||||
| v7 = VLEV_FLOAT(a_offset7, vl); | |||||
| v8 = VLEV_FLOAT(a_offset8, vl); | |||||
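| // Assemble the 8-member tuple; every segment is written before the store below, so the initial value of vx8 is never observed. | |||||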
| vx8 = VSET_VX8(vx8, 0, v1); | |||||
| vx8 = VSET_VX8(vx8, 1, v2); | |||||
| vx8 = VSET_VX8(vx8, 2, v3); | |||||
| vx8 = VSET_VX8(vx8, 3, v4); | |||||
| vx8 = VSET_VX8(vx8, 4, v5); | |||||
| vx8 = VSET_VX8(vx8, 5, v6); | |||||
| vx8 = VSET_VX8(vx8, 6, v7); | |||||
| vx8 = VSET_VX8(vx8, 7, v8); | |||||
| VSSEG8_FLOAT(b_offset, vx8, vl); | |||||
| a_offset1 += vl; | |||||
| a_offset2 += vl; | |||||
| a_offset3 += vl; | |||||
| a_offset4 += vl; | |||||
| a_offset5 += vl; | |||||
| a_offset6 += vl; | |||||
| a_offset7 += vl; | |||||
| a_offset8 += vl; | |||||
| b_offset += vl*8; | |||||
| } | |||||
| } | |||||
| if (n & 4) { | |||||
| a_offset1 = a_offset; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset3 = a_offset2 + lda; | |||||
| a_offset4 = a_offset3 + lda; | |||||
| a_offset += 4 * lda; | |||||
| for(i = m; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||||
| vx4 = VSET_VX4(vx4, 0, v1); | |||||
| vx4 = VSET_VX4(vx4, 1, v2); | |||||
| vx4 = VSET_VX4(vx4, 2, v3); | |||||
| vx4 = VSET_VX4(vx4, 3, v4); | |||||
| VSSEG4_FLOAT(b_offset, vx4, vl); | |||||
| a_offset1 += vl; | |||||
| a_offset2 += vl; | |||||
| a_offset3 += vl; | |||||
| a_offset4 += vl; | |||||
| b_offset += vl*4; | |||||
| } | |||||
| } | |||||
| if (n & 2) { | |||||
| a_offset1 = a_offset; | |||||
| a_offset2 = a_offset1 + lda; | |||||
| a_offset += 2 * lda; | |||||
| for(i = m; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||||
| vx2 = VSET_VX2(vx2, 0, v1); | |||||
| vx2 = VSET_VX2(vx2, 1, v2); | |||||
| VSSEG2_FLOAT(b_offset, vx2, vl); | |||||
| a_offset1 += vl; | |||||
| a_offset2 += vl; | |||||
| b_offset += vl*2; | |||||
| } | |||||
| } | |||||
| if (n & 1) { | |||||
| a_offset1 = a_offset; | |||||
| for(i = m; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||||
| VSEV_FLOAT(b_offset, v1, vl); | |||||
| a_offset1 += vl; | |||||
| b_offset += vl; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
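| As a cross-check of the segment-store layout, here is a scalar sketch of the same packing (hypothetical reference helper, double precision for brevity): | |||||
| ```c | |||||
| /* For every row i of a width-w column block, emit the w column values | |||||
|    back to back - the layout the vsseg stores above produce. */ | |||||
| static void ncopy8_ref(long m, long n, const double *a, long lda, double *b) | |||||
| { | |||||
|     long j = 0; | |||||
|     for (; j + 8 <= n; j += 8)                  /* full 8-column panels */ | |||||
|         for (long i = 0; i < m; i++) | |||||
|             for (long c = 0; c < 8; c++) | |||||
|                 *b++ = a[i + (j + c) * lda]; | |||||
|     for (long w = 4; w >= 1; w >>= 1)           /* 4-, 2-, 1-column tails */ | |||||
|         if (n - j >= w) { | |||||
|             for (long i = 0; i < m; i++) | |||||
|                 for (long c = 0; c < w; c++) | |||||
|                     *b++ = a[i + (j + c) * lda]; | |||||
|             j += w; | |||||
|         } | |||||
| } | |||||
| ``` | |||||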
| @@ -0,0 +1,76 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||||
| #define FLOAT_V_T vfloat32m2_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||||
| #define FLOAT_V_T vfloat64m2_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||||
| #endif | |||||
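| // Packs vl-column panels of the column-major source: for each row, a strided load gathers the vl column entries, which are then stored contiguously into b. | |||||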
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *a_offset; | |||||
| FLOAT *a_offset1; | |||||
| FLOAT *b_offset; | |||||
| FLOAT_V_T v0; | |||||
| size_t vl; | |||||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||||
| a_offset = a; | |||||
| b_offset = b; | |||||
| for(j = n; j > 0; j -= vl) { | |||||
| vl = VSETVL(j); | |||||
| a_offset1 = a_offset; | |||||
| a_offset += vl * lda; | |||||
| for(i = m; i > 0; i--) { | |||||
| v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); | |||||
| VSEV_FLOAT(b_offset, v0, vl); | |||||
| a_offset1++; | |||||
| b_offset += vl; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
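| In scalar terms the packing above amounts to the following (sketch; vl is a parameter here, whereas the kernel takes it from vsetvl, so producer and consumer must agree on the hardware vector length): | |||||
| ```c | |||||
| static void ncopy_ref(long m, long n, long vl, const double *a, long lda, double *b) | |||||
| { | |||||
|     for (long j = 0; j < n; j += vl) { | |||||
|         long w = (n - j < vl) ? (n - j) : vl;   /* last panel may be narrower */ | |||||
|         for (long i = 0; i < m; i++)            /* one matrix row per step */ | |||||
|             for (long c = 0; c < w; c++) | |||||
|                 *b++ = a[i + (j + c) * lda]; | |||||
|     } | |||||
| } | |||||
| ``` | |||||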
| @@ -0,0 +1,273 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m1(n) | |||||
| #define FLOAT_V_T vfloat32m1_t | |||||
| #define FLOAT_VX2_T vfloat32m1x2_t | |||||
| #define FLOAT_VX4_T vfloat32m1x4_t | |||||
| #define FLOAT_VX8_T vfloat32m1x8_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m1 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m1 | |||||
| #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 | |||||
| #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 | |||||
| #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 | |||||
| #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 | |||||
| #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 | |||||
| #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m1(n) | |||||
| #define FLOAT_V_T vfloat64m1_t | |||||
| #define FLOAT_VX2_T vfloat64m1x2_t | |||||
| #define FLOAT_VX4_T vfloat64m1x4_t | |||||
| #define FLOAT_VX8_T vfloat64m1x8_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m1 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m1 | |||||
| #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 | |||||
| #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 | |||||
| #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 | |||||
| #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 | |||||
| #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 | |||||
| #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 | |||||
| #endif | |||||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| IFLOAT *aoffset; | |||||
| IFLOAT *aoffset1; | |||||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||||
| FLOAT_V_T v0; | |||||
| FLOAT_VX2_T vx2; | |||||
| FLOAT_VX4_T vx4; | |||||
| FLOAT_VX8_T vx8; | |||||
| // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||||
| aoffset = a; | |||||
| boffset = b; | |||||
| boffset2 = b + m * (n & ~7); | |||||
| boffset3 = b + m * (n & ~3); | |||||
| boffset4 = b + m * (n & ~1); | |||||
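| // b is partitioned up front: full 8-column blocks stream to boffset1, while the 4-, 2- and 1-column tails of n stream to boffset2, boffset3 and boffset4 respectively. | |||||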
| for(j = (m >> 3); j > 0; j--) { | |||||
| aoffset1 = aoffset; | |||||
| aoffset += 8 * lda; | |||||
| boffset1 = boffset; | |||||
| boffset += 64; | |||||
| for(i = (n >> 3); i > 0; i--) { | |||||
| size_t vl = 8; | |||||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||||
| aoffset1 += 8; | |||||
| boffset1 += m * 8; | |||||
| } | |||||
| if (n & 4) { | |||||
| size_t vl = 8; | |||||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||||
| aoffset1 += 4; | |||||
| boffset2 += 32; | |||||
| } | |||||
| if (n & 2) { | |||||
| size_t vl = 8; | |||||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||||
| aoffset1 += 2; | |||||
| boffset3 += 16; | |||||
| } | |||||
| if (n & 1) { | |||||
| size_t vl = 8; | |||||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSEV_FLOAT(boffset4, v0, vl); | |||||
| aoffset1 += 1; | |||||
| boffset4 += 8; | |||||
| } | |||||
| } | |||||
| if (m & 4) { | |||||
| aoffset1 = aoffset; | |||||
| aoffset += 4 * lda; | |||||
| boffset1 = boffset; | |||||
| boffset += 32; | |||||
| for(i = (n >> 3); i > 0; i--) { | |||||
| size_t vl = 4; | |||||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||||
| aoffset1 += 8; | |||||
| boffset1 += m * 8; | |||||
| } | |||||
| if (n & 4) { | |||||
| size_t vl = 4; | |||||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||||
| aoffset1 += 4; | |||||
| boffset2 += 16; | |||||
| } | |||||
| if (n & 2) { | |||||
| size_t vl = 4; | |||||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||||
| aoffset1 += 2; | |||||
| boffset3 += 8; | |||||
| } | |||||
| if (n & 1) { | |||||
| size_t vl = 4; | |||||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSEV_FLOAT(boffset4, v0, vl); | |||||
| aoffset1 += 1; | |||||
| boffset4 += 4; | |||||
| } | |||||
| } | |||||
| if (m & 2) { | |||||
| aoffset1 = aoffset; | |||||
| aoffset += 2 * lda; | |||||
| boffset1 = boffset; | |||||
| boffset += 16; | |||||
| for(i = (n >> 3); i > 0; i--) { | |||||
| size_t vl = 2; | |||||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||||
| aoffset1 += 8; | |||||
| boffset1 += m * 8; | |||||
| } | |||||
| if (n & 4) { | |||||
| size_t vl = 2; | |||||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||||
| aoffset1 += 4; | |||||
| boffset2 += 8; | |||||
| } | |||||
| if (n & 2) { | |||||
| size_t vl = 2; | |||||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||||
| aoffset1 += 2; | |||||
| boffset3 += 4; | |||||
| } | |||||
| if (n & 1) { | |||||
| size_t vl = 2; | |||||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||||
| VSEV_FLOAT(boffset4, v0, vl); | |||||
| aoffset1 += 1; | |||||
| boffset4 += 2; | |||||
| } | |||||
| } | |||||
| if (m & 1) { | |||||
| aoffset1 = aoffset; | |||||
| boffset1 = boffset; | |||||
| for(i = (n >> 3); i > 0; i--) { | |||||
| size_t vl = 8; | |||||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||||
| VSEV_FLOAT(boffset1, v0, vl); | |||||
| aoffset1 += 8; | |||||
| boffset1 += 8 * m; | |||||
| } | |||||
| if (n & 4) { | |||||
| size_t vl = 4; | |||||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||||
| VSEV_FLOAT(boffset2, v0, vl); | |||||
| aoffset1 += 4; | |||||
| // boffset2 is not advanced: this is its last use in the final (m & 1) row | |||||
| } | |||||
| if (n & 2) { | |||||
| size_t vl = 2; | |||||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||||
| VSEV_FLOAT(boffset3, v0, vl); | |||||
| aoffset1 += 2; | |||||
| // boffset3 is not advanced: this is its last use in the final row | |||||
| } | |||||
| if (n & 1) { | |||||
| *boffset4 = *aoffset1; | |||||
| // final element; no further pointer updates needed | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
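| Note that this kernel passes fixed vl values (8/4/2) straight to the LMUL=1 intrinsics rather than querying vsetvl, which presumes VLMAX >= 8 at m1 - i.e. VLEN >= 512 bits in double precision (256 in single). The zvl512b build flag used for the x280 target guarantees this. | |||||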
| @@ -0,0 +1,74 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||||
| #define FLOAT_V_T vfloat32m2_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||||
| #define FLOAT_V_T vfloat64m2_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||||
| #endif | |||||
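| // Packs vl-row strips of the column-major source: unit-stride loads walk down a column, one column per iteration, so b holds vl consecutive row entries for each column. | |||||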
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| IFLOAT *aoffset; | |||||
| IFLOAT *aoffset1; | |||||
| IFLOAT *boffset; | |||||
| FLOAT_V_T v0; | |||||
| size_t vl; | |||||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||||
| aoffset = a; | |||||
| boffset = b; | |||||
| for(j = n; j > 0; j -= vl) { | |||||
| vl = VSETVL(j); | |||||
| aoffset1 = aoffset; | |||||
| aoffset += vl; | |||||
| for(i = m; i > 0; i--) { | |||||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||||
| VSEV_FLOAT(boffset, v0, vl); | |||||
| aoffset1 += lda; | |||||
| boffset += vl; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
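| The transposed copy is the mirror image - vl consecutive rows per panel instead of vl columns (same caveat about vl as above): | |||||
| ```c | |||||
| static void tcopy_ref(long m, long n, long vl, const double *a, long lda, double *b) | |||||
| { | |||||
|     for (long j = 0; j < n; j += vl) {          /* j walks rows of the source */ | |||||
|         long w = (n - j < vl) ? (n - j) : vl; | |||||
|         for (long i = 0; i < m; i++)            /* one column per step */ | |||||
|             for (long r = 0; r < w; r++) | |||||
|                 *b++ = a[(j + r) + i * lda]; | |||||
|     } | |||||
| } | |||||
| ``` | |||||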
| @@ -0,0 +1,601 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||||
| #define FLOAT_V_T vfloat32m2_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||||
| #define FLOAT_V_T vfloat64m2_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 | |||||
| #endif | |||||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc | |||||
| #ifdef TRMMKERNEL | |||||
| ,BLASLONG offset | |||||
| #endif | |||||
| ) | |||||
| { | |||||
| BLASLONG i,j,k; | |||||
| FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; | |||||
| IFLOAT *ptrba,*ptrbb; | |||||
| //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug | |||||
| FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; | |||||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; | |||||
| size_t vl; | |||||
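| // Blocking: columns of C are handled in groups of 8/4/2/1 (N direction), rows in hardware-vl strips (M direction), with the K loop unrolled by 8. | |||||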
| // N:8 | |||||
| for (j = bn/8; j > 0; j--) { | |||||
| C0 = C; | |||||
| C1 = C0 + ldc; | |||||
| C2 = C1 + ldc; | |||||
| C3 = C2 + ldc; | |||||
| C4 = C3 + ldc; | |||||
| C5 = C4 + ldc; | |||||
| C6 = C5 + ldc; | |||||
| C7 = C6 + ldc; | |||||
| ptrba = ba; | |||||
| for (i = bm; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| ptrbb = bb; | |||||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres4 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres5 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres6 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres7 = VFMVVF_FLOAT(0.0, vl); | |||||
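| // Reference (non-unrolled) K loop, kept under "#if 0" for readability; the unrolled variant below is the one that compiles. | |||||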
| #if 0 | |||||
| for (k = bk; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||||
| ptrba += vl; | |||||
| ptrbb += 8; | |||||
| } | |||||
| #else | |||||
| // Unroll K | |||||
| for (k = bk/8; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| va1 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||||
| ptrbb += 8; | |||||
| va2 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); | |||||
| ptrbb += 8; | |||||
| va3 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); | |||||
| ptrbb += 8; | |||||
| va4 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); | |||||
| ptrbb += 8; | |||||
| va5 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); | |||||
| ptrbb += 8; | |||||
| va6 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); | |||||
| ptrbb += 8; | |||||
| va7 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); | |||||
| ptrbb += 8; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); | |||||
| ptrbb += 8; | |||||
| } | |||||
| // K remainder | |||||
| for (k = bk&7; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||||
| ptrbb += 8; | |||||
| ptrba += vl; | |||||
| } | |||||
| #endif | |||||
| va0 = VLEV_FLOAT(C0, vl); | |||||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||||
| VSEV_FLOAT(C0, va0, vl); | |||||
| va1 = VLEV_FLOAT(C1, vl); | |||||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||||
| VSEV_FLOAT(C1, va1, vl); | |||||
| va2 = VLEV_FLOAT(C2, vl); | |||||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||||
| VSEV_FLOAT(C2, va2, vl); | |||||
| va3 = VLEV_FLOAT(C3, vl); | |||||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||||
| VSEV_FLOAT(C3, va3, vl); | |||||
| va4 = VLEV_FLOAT(C4, vl); | |||||
| va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); | |||||
| VSEV_FLOAT(C4, va4, vl); | |||||
| va5 = VLEV_FLOAT(C5, vl); | |||||
| va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); | |||||
| VSEV_FLOAT(C5, va5, vl); | |||||
| va6 = VLEV_FLOAT(C6, vl); | |||||
| va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); | |||||
| VSEV_FLOAT(C6, va6, vl); | |||||
| va7 = VLEV_FLOAT(C7, vl); | |||||
| va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); | |||||
| VSEV_FLOAT(C7, va7, vl); | |||||
| C0 += vl; | |||||
| C1 += vl; | |||||
| C2 += vl; | |||||
| C3 += vl; | |||||
| C4 += vl; | |||||
| C5 += vl; | |||||
| C6 += vl; | |||||
| C7 += vl; | |||||
| } | |||||
| bb += (bk<<3); | |||||
| C += (ldc<<3); | |||||
| } | |||||
| // N:4 | |||||
| if (bn & 4) { | |||||
| C0 = C; | |||||
| C1 = C0 + ldc; | |||||
| C2 = C1 + ldc; | |||||
| C3 = C2 + ldc; | |||||
| ptrba = ba; | |||||
| for (i = bm; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| ptrbb = bb; | |||||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||||
| #if 0 | |||||
| for (k = bk; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||||
| ptrba += vl; | |||||
| ptrbb += 4; | |||||
| } | |||||
| #else | |||||
| // Unroll K | |||||
| for (k = bk/8; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| va1 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||||
| ptrbb += 4; | |||||
| va2 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||||
| ptrbb += 4; | |||||
| va3 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||||
| ptrbb += 4; | |||||
| va4 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||||
| ptrbb += 4; | |||||
| va5 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||||
| ptrbb += 4; | |||||
| va6 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||||
| ptrbb += 4; | |||||
| va7 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||||
| ptrbb += 4; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||||
| ptrbb += 4; | |||||
| } | |||||
| // K remainder | |||||
| for (k = bk&7; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||||
| ptrbb += 4; | |||||
| ptrba += vl; | |||||
| } | |||||
| #endif | |||||
| va0 = VLEV_FLOAT(C0, vl); | |||||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||||
| VSEV_FLOAT(C0, va0, vl); | |||||
| va1 = VLEV_FLOAT(C1, vl); | |||||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||||
| VSEV_FLOAT(C1, va1, vl); | |||||
| va2 = VLEV_FLOAT(C2, vl); | |||||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||||
| VSEV_FLOAT(C2, va2, vl); | |||||
| va3 = VLEV_FLOAT(C3, vl); | |||||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||||
| VSEV_FLOAT(C3, va3, vl); | |||||
| C0 += vl; | |||||
| C1 += vl; | |||||
| C2 += vl; | |||||
| C3 += vl; | |||||
| } | |||||
| bb += (bk<<2); | |||||
| C += (ldc<<2); | |||||
| } | |||||
| // N:2 | |||||
| if (bn & 2) { | |||||
| C0 = C; | |||||
| C1 = C0 + ldc; | |||||
| ptrba = ba; | |||||
| for (i = bm; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| ptrbb = bb; | |||||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||||
| #if 0 | |||||
| for (k = bk; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| ptrba += vl; | |||||
| ptrbb += 2; | |||||
| } | |||||
| #else | |||||
| // Unroll K | |||||
| for (k = bk/8; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| va1 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| ptrbb += 2; | |||||
| va2 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||||
| ptrbb += 2; | |||||
| va3 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||||
| ptrbb += 2; | |||||
| va4 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||||
| ptrbb += 2; | |||||
| va5 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||||
| ptrbb += 2; | |||||
| va6 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||||
| ptrbb += 2; | |||||
| va7 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||||
| ptrbb += 2; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||||
| ptrbb += 2; | |||||
| } | |||||
| // K remainder | |||||
| for (k = bk&7; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||||
| ptrbb += 2; | |||||
| ptrba += vl; | |||||
| } | |||||
| #endif | |||||
| va0 = VLEV_FLOAT(C0, vl); | |||||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||||
| VSEV_FLOAT(C0, va0, vl); | |||||
| va1 = VLEV_FLOAT(C1, vl); | |||||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||||
| VSEV_FLOAT(C1, va1, vl); | |||||
| C0 += vl; | |||||
| C1 += vl; | |||||
| } | |||||
| bb += (bk<<1); | |||||
| C += (ldc<<1); | |||||
| } | |||||
| // N:1 | |||||
| if (bn & 1) { | |||||
| C0 = C; | |||||
| ptrba = ba; | |||||
| for (i = bm; i > 0; i -= vl) { | |||||
| vl = VSETVL(i); | |||||
| ptrbb = bb; | |||||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||||
| #if 0 | |||||
| for (k = bk; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| ptrba += vl; | |||||
| ptrbb += 1; | |||||
| } | |||||
| #else | |||||
| // Unroll K | |||||
| for (k = bk/8; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| va1 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| ptrbb += 1; | |||||
| va2 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||||
| ptrbb += 1; | |||||
| va3 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||||
| ptrbb += 1; | |||||
| va4 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||||
| ptrbb += 1; | |||||
| va5 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||||
| ptrbb += 1; | |||||
| va6 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||||
| ptrbb += 1; | |||||
| va7 = VLEV_FLOAT(ptrba, vl); | |||||
| ptrba += vl; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||||
| ptrbb += 1; | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||||
| ptrbb += 1; | |||||
| } | |||||
| // K remainder | |||||
| for (k = bk&7; k > 0; k--) { | |||||
| va0 = VLEV_FLOAT(ptrba, vl); | |||||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||||
| ptrbb += 1; | |||||
| ptrba += vl; | |||||
| } | |||||
| #endif | |||||
| va0 = VLEV_FLOAT(C0, vl); | |||||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||||
| VSEV_FLOAT(C0, va0, vl); | |||||
| C0 += vl; | |||||
| } | |||||
| bb += (bk); | |||||
| C += (ldc); | |||||
| } | |||||
| return 0; | |||||
| } | |||||
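| Ignoring the strip-interleaved packing details, the arithmetic the kernel performs is the classic rank-bk update below (illustrative sketch with plain column-major panels, not the actual packed indexing): | |||||
| ```c | |||||
| /* C(0:bm, 0:bn) += alpha * A * B for one packed panel pair. */ | |||||
| static void gemm_ref(long bm, long bn, long bk, double alpha, | |||||
|                      const double *A, const double *B, double *C, long ldc) | |||||
| { | |||||
|     for (long j = 0; j < bn; j++) | |||||
|         for (long i = 0; i < bm; i++) { | |||||
|             double s = 0.0; | |||||
|             for (long k = 0; k < bk; k++) | |||||
|                 s += A[i + k * bm] * B[j + k * bn]; | |||||
|             C[i + j * ldc] += alpha * s; | |||||
|         } | |||||
| } | |||||
| ``` | |||||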
| @@ -0,0 +1,94 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||||
| #endif | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| if(n < 0) return(0); | |||||
| FLOAT *a_ptr, *x_ptr; | |||||
| BLASLONG i; | |||||
| FLOAT_V_T va, vy; | |||||
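| // Process y in vl-wide strips; each strip stays in a vector register while all n columns of A are accumulated into it, so y is read and written exactly once. | |||||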
| if(inc_y == 1) { | |||||
| for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { | |||||
| vl = VSETVL(m); | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| vy = VLEV_FLOAT(y, vl); | |||||
| for(i = 0; i < n; i++) { | |||||
| va = VLEV_FLOAT(a_ptr, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| VSEV_FLOAT(y, vy, vl); | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||||
| for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { | |||||
| vl = VSETVL(m); | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||||
| for(i = 0; i < n; i++) { | |||||
| va = VLEV_FLOAT(a_ptr, vl); | |||||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||||
| a_ptr += lda; | |||||
| x_ptr += inc_x; | |||||
| } | |||||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
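| For reference, the operation in scalar form; note the vector code above inverts this loop nest so that each strip of y stays in a register across all n columns: | |||||
| ```c | |||||
| /* y(0:m) += alpha * A(0:m, 0:n) * x */ | |||||
| static void gemv_n_ref(long m, long n, double alpha, | |||||
|                        const double *a, long lda, | |||||
|                        const double *x, long inc_x, | |||||
|                        double *y, long inc_y) | |||||
| { | |||||
|     for (long j = 0; j < n; j++) { | |||||
|         double t = alpha * x[j * inc_x]; | |||||
|         for (long i = 0; i < m; i++) | |||||
|             y[i * inc_y] += t * a[i + j * lda]; | |||||
|     } | |||||
| } | |||||
| ``` | |||||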
| @@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
| #define VSETVL(n) vsetvl_e32m4(n) | |||||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||||
| #define FLOAT_V_T vfloat32m4_t | #define FLOAT_V_T vfloat32m4_t | ||||
| #define VLEV_FLOAT vle32_v_f32m4 | |||||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||||
| #define VSEV_FLOAT vse32_v_f32m4 | |||||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||||
| #else | #else | ||||
| #define VSETVL(n) vsetvl_e64m4(n) | |||||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||||
| #define FLOAT_V_T vfloat64m4_t | #define FLOAT_V_T vfloat64m4_t | ||||
| #define VLEV_FLOAT vle64_v_f64m4 | |||||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||||
| #define VSEV_FLOAT vse64_v_f64m4 | |||||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||||
| #endif | #endif | ||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | ||||
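| The RISCV_RVV wrapper used here abstracts over the intrinsic renaming between pre-1.0 toolchains and the ratified v1.0 spec; presumably it is defined along these lines (sketch - the guard macro name is an assumption, not taken from this patch): | |||||
| ```c | |||||
| #if defined(RISCV_0p10_INTRINSICS)   /* assumed flag for older toolchains */ | |||||
| #define RISCV_RVV(x) x               /* bare pre-1.0 names, e.g. vle32_v_f32m4 */ | |||||
| #else | |||||
| #define RISCV_RVV(x) __riscv_##x     /* v1.0 names, e.g. __riscv_vle32_v_f32m4 */ | |||||
| #endif | |||||
| ``` | |||||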
| @@ -0,0 +1,118 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2022, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||||
| #endif | |||||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| { | |||||
| BLASLONG i, j; | |||||
| FLOAT *a_ptr, *x_ptr; | |||||
| FLOAT_V_T va, vx, vr; | |||||
| FLOAT_V_T_M1 v_res, v_z0; | |||||
| size_t vlmax = VSETVL_MAX_M1; | |||||
| v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||||
| vlmax = VSETVL_MAX; | |||||
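| // vr accumulates strip-wise partial products; the tail-undisturbed (_tu) fmacc leaves tail lanes at their initial zero, so a single reduction over vlmax at the end is exact. | |||||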
| if(inc_x == 1) { | |||||
| for(i = 0; i < n; i++) { | |||||
| j = m; | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| vr = VFMVVF_FLOAT(0, vlmax); | |||||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { | |||||
| vl = VSETVL(j); | |||||
| va = VLEV_FLOAT(a_ptr, vl); | |||||
| vx = VLEV_FLOAT(x_ptr, vl); | |||||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||||
| } | |||||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||||
| y += inc_y; | |||||
| a += lda; | |||||
| } | |||||
| } else { | |||||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||||
| for(i = 0; i < n; i++) { | |||||
| j = m; | |||||
| a_ptr = a; | |||||
| x_ptr = x; | |||||
| vr = VFMVVF_FLOAT(0, vlmax); | |||||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { | |||||
| vl = VSETVL(j); | |||||
| va = VLEV_FLOAT(a_ptr, vl); | |||||
| vx = VLSEV_FLOAT(x_ptr, stride_x, vl); | |||||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||||
| } | |||||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||||
| y += inc_y; | |||||
| a += lda; | |||||
| } | |||||
| } | |||||
| return(0); | |||||
| } | |||||
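| And the transposed case in scalar form - one dot product per output element, which the vector code computes as strip-wise fused multiply-adds followed by a single reduction: | |||||
| ```c | |||||
| /* y(0:n) += alpha * A(0:m, 0:n)^T * x */ | |||||
| static void gemv_t_ref(long m, long n, double alpha, | |||||
|                        const double *a, long lda, | |||||
|                        const double *x, long inc_x, | |||||
|                        double *y, long inc_y) | |||||
| { | |||||
|     for (long j = 0; j < n; j++) { | |||||
|         double s = 0.0; | |||||
|         for (long i = 0; i < m; i++) | |||||
|             s += a[i + j * lda] * x[i * inc_x]; | |||||
|         y[j * inc_y] += alpha * s; | |||||
|     } | |||||
| } | |||||
| ``` | |||||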