| @@ -57,6 +57,21 @@ endif | |||
| ifeq ($(TARGET), CK860FV) | |||
| TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float | |||
| ifeq ($(TARGET), x280) | |||
| TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL256B) | |||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_ZVL128B) | |||
| TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d | |||
| endif | |||
| ifeq ($(TARGET), RISCV64_GENERIC) | |||
| TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d | |||
| endif | |||
| all: getarch_2nd | |||
| @@ -2,3 +2,19 @@ ifeq ($(CORE), C910V) | |||
| CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 | |||
| FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static | |||
| endif | |||
| ifeq ($(CORE), x280) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math | |||
| FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL256B) | |||
| CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_ZVL128B) | |||
| CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static | |||
| endif | |||
| ifeq ($(CORE), RISCV64_GENERIC) | |||
| CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d | |||
| FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static | |||
| endif | |||
| @@ -198,6 +198,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| ``` | |||
| (also known to work on the C906, as long as you use only single-precision functions; its double-precision instruction support appears to be incomplete) | |||
| - **x280**: Level-3, Level-2, and Level-1 BLAS are optimized using the RISC-V Vector extension 1.0. | |||
| ```sh | |||
| make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran | |||
| ``` | |||
| ### Support for multiple targets in a single library | |||
| OpenBLAS can be built for multiple targets with runtime detection of the target CPU by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line, or as `-DDYNAMIC_ARCH=TRUE` in CMake. | |||
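For example, a gmake build with runtime CPU detection can be requested like this (a minimal sketch; add your usual `CC`/`FC`/`TARGET` settings for your toolchain as needed):
```sh
make DYNAMIC_ARCH=1 NUM_THREADS=8
```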
| @@ -118,8 +118,11 @@ Z13 | |||
| Z14 | |||
| 10.RISC-V 64: | |||
| RISCV64_GENERIC | |||
| RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54) | |||
| RISCV64_ZVL128B | |||
| C910V | |||
| x280 | |||
| RISCV64_ZVL256B | |||
| 11.LOONGARCH64: | |||
| LOONGSONGENERIC | |||
| @@ -290,6 +290,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA | |||
| void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); | |||
| void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
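The `cblas_?gemmt` routines declared above follow the usual GEMMT convention: only the triangle of the `M`-by-`M` matrix `C` selected by `Uplo` is updated with `alpha*op(A)*op(B) + beta*C`, `K` being the inner dimension. A hedged usage sketch in C (the helper function and its matrices are illustrative only, not part of the patch):
```c
#include <cblas.h>

/* Update only the lower triangle of C := A*B + 0*C, where A is M x K,
   B is K x M and C is M x M, all stored column-major. */
static void lower_triangle_product(const double *A, const double *B,
                                   double *C, blasint M, blasint K)
{
    cblas_dgemmt(CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
                 M, K,
                 1.0, A, M,    /* lda >= M for non-transposed A */
                      B, K,    /* ldb >= K for non-transposed B */
                 0.0, C, M);   /* ldc >= M */
}
```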
| @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * | |||
| void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, | |||
| xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); | |||
| void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, | |||
| double *, blasint *, double *, blasint *, double *, double *, blasint *); | |||
| void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, | |||
| double *, blasint *, double *, blasint *, double *, double *, blasint *); | |||
| int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, | |||
| float *, float *, blasint *, float *, blasint *, | |||
| float *, float *, blasint *); | |||
| @@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define SEEK_ADDRESS | |||
| #if defined(C910V) | |||
| #include <riscv_vector.h> | |||
| #if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) | |||
| # include <riscv_vector.h> | |||
| #endif | |||
| #if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) | |||
| // The T-Head toolchain uses the obsolete RVV 0.7 intrinsics; C910V cannot be built without this mapping. | |||
| #define RISCV_0p10_INTRINSICS | |||
| #define RISCV_RVV(x) x | |||
| #else | |||
| #define RISCV_RVV(x) __riscv_ ## x | |||
| #endif | |||
| #if defined(C910V) || defined(RISCV64_ZVL256B) | |||
| # if !defined(DOUBLE) | |||
| # define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v) | |||
| # else | |||
| # define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v) | |||
| # endif | |||
| #else | |||
| # define EXTRACT_FLOAT(v) (v[0]) | |||
| #endif | |||
| #endif | |||
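The `RISCV_RVV` macro lets each kernel spell an intrinsic once and have it resolve either to the bare name expected by the T-Head 0.7-era toolchain or to the `__riscv_`-prefixed RVV 1.0 name, while `EXTRACT_FLOAT` hides how a scalar is read back from a vector register. A hypothetical sketch of the pattern, assuming `<riscv_vector.h>` and the macros above are in scope (illustrative only, not part of the patch):
```c
/* Scale the first vl elements of x by alpha; the same source builds
   against both intrinsic naming schemes via RISCV_RVV(). */
static void scale_head(float *x, float alpha, size_t n)
{
    size_t vl = RISCV_RVV(vsetvl_e32m1)(n);
    vfloat32m1_t vx = RISCV_RVV(vle32_v_f32m1)(x, vl);
    vx = RISCV_RVV(vfmul_vf_f32m1)(vx, alpha, vl);
    RISCV_RVV(vse32_v_f32m1)(x, vx, vl);
}
```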
| @@ -70,12 +70,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_C910V 1 | |||
| #define CPU_x280 2 | |||
| #define CPU_RISCV64_ZVL256B 3 | |||
| #define CPU_RISCV64_ZVL128B 4 | |||
| static char *cpuname[] = { | |||
| "RISCV64_GENERIC", | |||
| "C910V" | |||
| "C910V", | |||
| "x280", | |||
| "CPU_RISCV64_ZVL256B", | |||
| "CPU_RISCV64_ZVL128B" | |||
| }; | |||
| int detect(void){ | |||
| @@ -96,7 +96,7 @@ | |||
| INTEGER ICAMAXTEST | |||
| EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST | |||
| * .. External Subroutines .. | |||
| EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 | |||
| EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 | |||
| * .. Intrinsic Functions .. | |||
| INTRINSIC MAX | |||
| * .. Common blocks .. | |||
| @@ -214,8 +214,8 @@ | |||
| CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), | |||
| + STRUE4(NP1),SFAC) | |||
| ELSE IF (ICASE.EQ.8) THEN | |||
| * .. CSCAL .. | |||
| CALL CSCAL(N,CA,CX,INCX) | |||
| * .. CSCALTEST .. | |||
| CALL CSCALTEST(N,CA,CX,INCX) | |||
| CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), | |||
| + SFAC) | |||
| ELSE IF (ICASE.EQ.9) THEN | |||
| @@ -236,14 +236,14 @@ | |||
| * | |||
| INCX = 1 | |||
| IF (ICASE.EQ.8) THEN | |||
| * CSCAL | |||
| * CSCALTEST | |||
| * Add a test for alpha equal to zero. | |||
| CA = (0.0E0,0.0E0) | |||
| DO 80 I = 1, 5 | |||
| MWPCT(I) = (0.0E0,0.0E0) | |||
| MWPCS(I) = (1.0E0,1.0E0) | |||
| 80 CONTINUE | |||
| CALL CSCAL(5,CA,CX,INCX) | |||
| CALL CSCALTEST(5,CA,CX,INCX) | |||
| CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) | |||
| ELSE IF (ICASE.EQ.9) THEN | |||
| * CSSCALTEST | |||
| @@ -440,6 +440,7 @@ static real c_b43 = (float)1.; | |||
| extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); | |||
| static complex mwpcs[5], mwpct[5]; | |||
| extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); | |||
| extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); | |||
| static complex cx[8]; | |||
| extern real scnrm2test_(integer*, complex*, integer*); | |||
| static integer np1; | |||
| @@ -481,7 +482,7 @@ static real c_b43 = (float)1.; | |||
| stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); | |||
| } else if (combla_1.icase == 8) { | |||
| /* .. CSCAL .. */ | |||
| cscal_(&combla_1.n, &ca, cx, &combla_1.incx); | |||
| cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); | |||
| ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], | |||
| &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); | |||
| } else if (combla_1.icase == 9) { | |||
| @@ -515,7 +516,7 @@ static real c_b43 = (float)1.; | |||
| mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; | |||
| /* L80: */ | |||
| } | |||
| cscal_(&c__5, &ca, cx, &combla_1.incx); | |||
| cscaltest_(&c__5, &ca, cx, &combla_1.incx); | |||
| ctest_(&c__5, cx, mwpct, mwpcs, sfac); | |||
| } else if (combla_1.icase == 9) { | |||
| /* CSSCALTEST */ | |||
| @@ -1679,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "c910v" | |||
| #define CORENAME "C910V" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_x280 | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "x280" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-Dx280 " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "x280" | |||
| #define CORENAME "x280" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL256B | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL256B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL256B " \ | |||
| "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl256b" | |||
| #define CORENAME "RISCV64_ZVL256B" | |||
| #endif | |||
| #ifdef FORCE_RISCV64_ZVL128B | |||
| #define FORCE | |||
| #define ARCHITECTURE "RISCV64" | |||
| #define SUBARCHITECTURE "RISCV64_ZVL128B" | |||
| #define SUBDIRNAME "riscv64" | |||
| #define ARCHCONFIG "-DRISCV64_ZVL128B " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " | |||
| #define LIBNAME "riscv64_zvl128b" | |||
| #define CORENAME "RISCV64_ZVL128B" | |||
| #endif | |||
| #if defined(FORCE_E2K) || defined(__e2k__) | |||
| #define FORCE | |||
| @@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| char transA, transB, Uplo; | |||
| blasint nrowa, nrowb; | |||
| #if defined(COMPLEX) | |||
| blasint ncolb; | |||
| #endif | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| @@ -156,18 +159,25 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| if (transa & 1) nrowa = k; | |||
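| /* transa/transb use OpenBLAS's internal encoding (0 = N, 1 = T, 2 = R, 3 = C), so bit 0 flags an actual transpose */ | |||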
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = k; | |||
| #endif | |||
| } | |||
| info = 0; | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, nrowa)) | |||
| if (ldb < MAX(1, nrowb)) | |||
| info = 10; | |||
| if (lda < MAX(1, nrowb)) | |||
| if (lda < MAX(1, nrowa)) | |||
| info = 8; | |||
| if (k < 0) | |||
| info = 5; | |||
| @@ -211,6 +221,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| blasint info; | |||
| blasint lda, ldb; | |||
| FLOAT *a, *b; | |||
| #if defined(COMPLEX) | |||
| blasint nrowb, ncolb; | |||
| #endif | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| @@ -262,11 +275,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| blasint nrowa, nrowb; | |||
| blasint nrowa; | |||
| #if !defined(COMPLEX) | |||
| blasint nrowb; | |||
| #endif | |||
| nrowa = m; | |||
| if (transa) nrowa = k; | |||
| if (transa & 1) nrowa = k; | |||
| nrowb = k; | |||
| if (transb) nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| nrowb = m; | |||
| #if defined(COMPLEX) | |||
| ncolb = k; | |||
| #endif | |||
| } | |||
| if (ldc < MAX(1, m)) | |||
| info = 13; | |||
| @@ -330,26 +354,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| info = -1; | |||
| blasint ncola, ncolb; | |||
| ncola = k; | |||
| if (transa) ncola = m; | |||
| ncolb = m; | |||
| if (transb) ncolb = k; | |||
| blasint ncola; | |||
| #if !defined(COMPLEX) | |||
| blasint ncolb; | |||
| #endif | |||
| ncola = m; | |||
| if (transa & 1) ncola = k; | |||
| ncolb = k; | |||
| #if defined(COMPLEX) | |||
| nrowb = m; | |||
| #endif | |||
| if (transb & 1) { | |||
| #if defined(COMPLEX) | |||
| nrowb = k; | |||
| #endif | |||
| ncolb = m; | |||
| } | |||
| if (ldc < MAX(1,m)) | |||
| info = 13; | |||
| if (ldb < MAX(1, ncolb)) | |||
| info = 10; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 8; | |||
| if (lda < MAX(1, ncola)) | |||
| info = 10; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (m < 0) | |||
| info = 4; | |||
| if (transb < 0) | |||
| info = 3; | |||
| if (transa < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 3; | |||
| if (uplo < 0) | |||
| info = 1; | |||
| } | |||
| @@ -428,7 +464,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_START; | |||
| const blasint incb = (transb == 0) ? 1 : ldb; | |||
| #if defined(COMPLEX) | |||
| if (transb > 1){ | |||
| #ifndef CBLAS | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #else | |||
| if (order == CblasColMajor) | |||
| IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| if (order == CblasRowMajor) | |||
| IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); | |||
| #endif | |||
| } | |||
| #endif | |||
| const blasint incb = ((transb & 1) == 0) ? 1 : ldb; | |||
| if (uplo == 1) { | |||
| for (i = 0; i < m; i++) { | |||
| @@ -438,19 +487,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #if defined(COMPLEX) | |||
| aa = a + i * 2; | |||
| bb = b + i * ldb * 2; | |||
| if (transa) { | |||
| if (transa & 1) { | |||
| aa = a + lda * i * 2; | |||
| } | |||
| if (transb) | |||
| if (transb & 1) | |||
| bb = b + i * 2; | |||
| cc = c + i * 2 * ldc + i * 2; | |||
| #else | |||
| aa = a + i; | |||
| bb = b + i * ldb; | |||
| if (transa) { | |||
| if (transa & 1) { | |||
| aa = a + lda * i; | |||
| } | |||
| if (transb) | |||
| if (transb & 1) | |||
| bb = b + i; | |||
| cc = c + i * ldc + i; | |||
| #endif | |||
| @@ -461,7 +510,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| return; | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| @@ -491,7 +540,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| @@ -500,7 +549,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, | |||
| bb, incb, cc, 1, buffer); | |||
| else | |||
| @@ -509,7 +558,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, aa, | |||
| lda, bb, incb, cc, | |||
| 1, buffer, | |||
| @@ -533,13 +582,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| l = j; | |||
| #if defined COMPLEX | |||
| bb = b + i * ldb * 2; | |||
| if (transb) { | |||
| if (transb & 1) { | |||
| bb = b + i * 2; | |||
| } | |||
| cc = c + i * 2 * ldc; | |||
| #else | |||
| bb = b + i * ldb; | |||
| if (transb) { | |||
| if (transb & 1) { | |||
| bb = b + i; | |||
| } | |||
| cc = c + i * ldc; | |||
| @@ -551,7 +600,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| return; | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| @@ -580,7 +629,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| @@ -589,7 +638,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| a, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, | |||
| incb, cc, 1, buffer); | |||
| else | |||
| @@ -599,7 +648,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| #ifdef SMP | |||
| } else { | |||
| if (!transa) | |||
| if (!(transa & 1)) | |||
| (gemv_thread[(int)transa]) (j, k, alpha, a, lda, | |||
| bb, incb, cc, 1, | |||
| buffer, nthreads); | |||
| @@ -617,4 +666,4 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| IDEBUG_END; | |||
| return; | |||
| } | |||
| } | |||
| @@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| } | |||
| #endif | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); | |||
| if ( *rows > *cols ) | |||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); | |||
| else | |||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); | |||
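| /* presumably sized by *ldb (rather than rows*cols) so the scratch buffer also covers a padded leading dimension */ | |||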
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| else | |||
| { | |||
| dp2 = *dd2 * dy1; | |||
| if(dp2 == ZERO) | |||
| { | |||
| dflag = -TWO; | |||
| dparam[0] = dflag; | |||
| return; | |||
| } | |||
| dp1 = *dd1 * *dx1; | |||
| dq2 = dp2 * dy1; | |||
| dq1 = dp1 * *dx1; | |||
| @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ | |||
| dh12 = dp2 / dp1; | |||
| du = ONE - dh12 * dh21; | |||
| if(du > ZERO) | |||
| { | |||
| dflag = ZERO; | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } else { | |||
| dflag = -ONE; | |||
| dh11 = ZERO; | |||
| dh12 = ZERO; | |||
| dh21 = ZERO; | |||
| dh22 = ZERO; | |||
| *dd1 = ZERO; | |||
| *dd2 = ZERO; | |||
| *dx1 = ZERO; | |||
| } | |||
| dflag = ZERO; | |||
| *dd1 = *dd1 / du; | |||
| *dd2 = *dd2 / du; | |||
| *dx1 = *dx1 * du; | |||
| } | |||
| else | |||
| @@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||
| } | |||
| #endif | |||
| msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; | |||
| if ( *rows > *cols ) | |||
| msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; | |||
| else | |||
| msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; | |||
| b = malloc(msize); | |||
| if ( b == NULL ) | |||
| @@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, | |||
| if ( rows <= 0 ) return(0); | |||
| if ( cols <= 0 ) return(0); | |||
| if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); | |||
| aptr = a; | |||
| lda *= 2; | |||
| @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = axpy_vector.c | |||
| DAXPYKERNEL = axpy_vector.c | |||
| CAXPYKERNEL = zaxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| CAXPYKERNEL = zaxpy_vector.c | |||
| ZAXPYKERNEL = zaxpy_vector.c | |||
| SAXPBYKERNEL = axpby_vector.c | |||
| DAXPBYKERNEL = axpby_vector.c | |||
| @@ -59,7 +59,7 @@ SDOTKERNEL = dot_vector.c | |||
| DDOTKERNEL = dot_vector.c | |||
| CDOTKERNEL = zdot_vector.c | |||
| ZDOTKERNEL = zdot_vector.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| DSDOTKERNEL = dsdot_vector.c | |||
| SNRM2KERNEL = nrm2_vector.c | |||
| DNRM2KERNEL = nrm2_vector.c | |||
| @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c | |||
| CAXPYKERNEL = ../riscv64/zaxpy.c | |||
| ZAXPYKERNEL = ../riscv64/zaxpy.c | |||
| SAXPBYKERNEL = ../riscv64/axpby.c | |||
| DAXPBYKERNEL = ../riscv64/axpby.c | |||
| CAXPBYKERNEL = ../riscv64/zaxpby.c | |||
| ZAXPBYKERNEL = ../riscv64/zaxpby.c | |||
| SCOPYKERNEL = ../riscv64/copy.c | |||
| DCOPYKERNEL = ../riscv64/copy.c | |||
| CCOPYKERNEL = ../riscv64/zcopy.c | |||
| @@ -0,0 +1,243 @@ | |||
| SAMAXKERNEL = amax_rvv.c | |||
| DAMAXKERNEL = amax_rvv.c | |||
| CAMAXKERNEL = zamax_rvv.c | |||
| ZAMAXKERNEL = zamax_rvv.c | |||
| SAMINKERNEL = amin_rvv.c | |||
| DAMINKERNEL = amin_rvv.c | |||
| CAMINKERNEL = zamin_rvv.c | |||
| ZAMINKERNEL = zamin_rvv.c | |||
| SMAXKERNEL = max_rvv.c | |||
| DMAXKERNEL = max_rvv.c | |||
| SMINKERNEL = min_rvv.c | |||
| DMINKERNEL = min_rvv.c | |||
| ISAMAXKERNEL = iamax_rvv.c | |||
| IDAMAXKERNEL = iamax_rvv.c | |||
| ICAMAXKERNEL = izamax_rvv.c | |||
| IZAMAXKERNEL = izamax_rvv.c | |||
| ISAMINKERNEL = iamin_rvv.c | |||
| IDAMINKERNEL = iamin_rvv.c | |||
| ICAMINKERNEL = izamin_rvv.c | |||
| IZAMINKERNEL = izamin_rvv.c | |||
| ISMAXKERNEL = imax_rvv.c | |||
| IDMAXKERNEL = imax_rvv.c | |||
| ISMINKERNEL = imin_rvv.c | |||
| IDMINKERNEL = imin_rvv.c | |||
| SASUMKERNEL = asum_rvv.c | |||
| DASUMKERNEL = asum_rvv.c | |||
| CASUMKERNEL = zasum_rvv.c | |||
| ZASUMKERNEL = zasum_rvv.c | |||
| SSUMKERNEL = sum_rvv.c | |||
| DSUMKERNEL = sum_rvv.c | |||
| CSUMKERNEL = zsum_rvv.c | |||
| ZSUMKERNEL = zsum_rvv.c | |||
| SAXPYKERNEL = axpy_rvv.c | |||
| DAXPYKERNEL = axpy_rvv.c | |||
| CAXPYKERNEL = zaxpy_rvv.c | |||
| ZAXPYKERNEL = zaxpy_rvv.c | |||
| SAXPBYKERNEL = axpby_rvv.c | |||
| DAXPBYKERNEL = axpby_rvv.c | |||
| CAXPBYKERNEL = zaxpby_rvv.c | |||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||
| SCOPYKERNEL = copy_rvv.c | |||
| DCOPYKERNEL = copy_rvv.c | |||
| CCOPYKERNEL = zcopy_rvv.c | |||
| ZCOPYKERNEL = zcopy_rvv.c | |||
| SDOTKERNEL = dot_rvv.c | |||
| DDOTKERNEL = dot_rvv.c | |||
| CDOTKERNEL = zdot_rvv.c | |||
| ZDOTKERNEL = zdot_rvv.c | |||
| DSDOTKERNEL = dot_rvv.c | |||
| SNRM2KERNEL = nrm2_rvv.c | |||
| DNRM2KERNEL = nrm2_rvv.c | |||
| CNRM2KERNEL = znrm2_rvv.c | |||
| ZNRM2KERNEL = znrm2_rvv.c | |||
| SROTKERNEL = rot_rvv.c | |||
| DROTKERNEL = rot_rvv.c | |||
| CROTKERNEL = zrot_rvv.c | |||
| ZROTKERNEL = zrot_rvv.c | |||
| SSCALKERNEL = scal_rvv.c | |||
| DSCALKERNEL = scal_rvv.c | |||
| CSCALKERNEL = zscal_rvv.c | |||
| ZSCALKERNEL = zscal_rvv.c | |||
| SSWAPKERNEL = swap_rvv.c | |||
| DSWAPKERNEL = swap_rvv.c | |||
| CSWAPKERNEL = zswap_rvv.c | |||
| ZSWAPKERNEL = zswap_rvv.c | |||
| SGEMVNKERNEL = gemv_n_rvv.c | |||
| DGEMVNKERNEL = gemv_n_rvv.c | |||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||
| SGEMVTKERNEL = gemv_t_rvv.c | |||
| DGEMVTKERNEL = gemv_t_rvv.c | |||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c | |||
| STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c | |||
| STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c | |||
| STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c | |||
| STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c | |||
| DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c | |||
| DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c | |||
| DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c | |||
| DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c | |||
| CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | |||
| CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||
| CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | |||
| CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c | |||
| ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | |||
| ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | |||
| ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c | |||
| SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c | |||
| DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c | |||
| DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c | |||
| CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c | |||
| CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c | |||
| ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c | |||
| ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c | |||
| CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c | |||
| ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| @@ -0,0 +1,199 @@ | |||
| SAMAXKERNEL = amax_vector.c | |||
| DAMAXKERNEL = amax_vector.c | |||
| CAMAXKERNEL = zamax_vector.c | |||
| ZAMAXKERNEL = zamax_vector.c | |||
| SAMINKERNEL = amin_vector.c | |||
| DAMINKERNEL = amin_vector.c | |||
| CAMINKERNEL = zamin_vector.c | |||
| ZAMINKERNEL = zamin_vector.c | |||
| SMAXKERNEL = max_vector.c | |||
| DMAXKERNEL = max_vector.c | |||
| SMINKERNEL = min_vector.c | |||
| DMINKERNEL = min_vector.c | |||
| ISAMAXKERNEL = iamax_vector.c | |||
| IDAMAXKERNEL = iamax_vector.c | |||
| ICAMAXKERNEL = izamax_vector.c | |||
| IZAMAXKERNEL = izamax_vector.c | |||
| ISAMINKERNEL = iamin_vector.c | |||
| IDAMINKERNEL = iamin_vector.c | |||
| ICAMINKERNEL = izamin_vector.c | |||
| IZAMINKERNEL = izamin_vector.c | |||
| ISMAXKERNEL = imax_vector.c | |||
| IDMAXKERNEL = imax_vector.c | |||
| ISMINKERNEL = imin_vector.c | |||
| IDMINKERNEL = imin_vector.c | |||
| SASUMKERNEL = asum_vector.c | |||
| DASUMKERNEL = asum_vector.c | |||
| CASUMKERNEL = zasum_vector.c | |||
| ZASUMKERNEL = zasum_vector.c | |||
| SSUMKERNEL = sum_vector.c | |||
| DSUMKERNEL = sum_vector.c | |||
| CSUMKERNEL = zsum_vector.c | |||
| ZSUMKERNEL = zsum_vector.c | |||
| SAXPYKERNEL = axpy_vector.c | |||
| DAXPYKERNEL = axpy_vector.c | |||
| CAXPYKERNEL = zaxpy_vector.c | |||
| ZAXPYKERNEL = zaxpy_vector.c | |||
| SCOPYKERNEL = copy_vector.c | |||
| DCOPYKERNEL = copy_vector.c | |||
| CCOPYKERNEL = zcopy_vector.c | |||
| ZCOPYKERNEL = zcopy_vector.c | |||
| SDOTKERNEL = dot_vector.c | |||
| DDOTKERNEL = dot_vector.c | |||
| CDOTKERNEL = zdot_vector.c | |||
| ZDOTKERNEL = zdot_vector.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| SNRM2KERNEL = nrm2_vector.c | |||
| DNRM2KERNEL = nrm2_vector.c | |||
| CNRM2KERNEL = znrm2_vector.c | |||
| ZNRM2KERNEL = znrm2_vector.c | |||
| SROTKERNEL = rot_vector.c | |||
| DROTKERNEL = rot_vector.c | |||
| CROTKERNEL = zrot_vector.c | |||
| ZROTKERNEL = zrot_vector.c | |||
| SSCALKERNEL = scal_vector.c | |||
| DSCALKERNEL = scal_vector.c | |||
| CSCALKERNEL = zscal_vector.c | |||
| ZSCALKERNEL = zscal_vector.c | |||
| SSWAPKERNEL = swap_vector.c | |||
| DSWAPKERNEL = swap_vector.c | |||
| CSWAPKERNEL = zswap_vector.c | |||
| ZSWAPKERNEL = zswap_vector.c | |||
| SGEMVNKERNEL = gemv_n_vector.c | |||
| DGEMVNKERNEL = gemv_n_vector.c | |||
| CGEMVNKERNEL = zgemv_n_vector.c | |||
| ZGEMVNKERNEL = zgemv_n_vector.c | |||
| SGEMVTKERNEL = gemv_t_vector.c | |||
| DGEMVTKERNEL = gemv_t_vector.c | |||
| CGEMVTKERNEL = zgemv_t_vector.c | |||
| ZGEMVTKERNEL = zgemv_t_vector.c | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SSYMV_U_KERNEL = symv_U_vector.c | |||
| SSYMV_L_KERNEL = symv_L_vector.c | |||
| DSYMV_U_KERNEL = symv_U_vector.c | |||
| DSYMV_L_KERNEL = symv_L_vector.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| CHEMV_L_KERNEL = zhemv_LM_vector.c | |||
| CHEMV_M_KERNEL = zhemv_LM_vector.c | |||
| CHEMV_U_KERNEL = zhemv_UV_vector.c | |||
| CHEMV_V_KERNEL = zhemv_UV_vector.c | |||
| ZHEMV_L_KERNEL = zhemv_LM_vector.c | |||
| ZHEMV_M_KERNEL = zhemv_LM_vector.c | |||
| ZHEMV_U_KERNEL = zhemv_UV_vector.c | |||
| ZHEMV_V_KERNEL = zhemv_UV_vector.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| @@ -0,0 +1,281 @@ | |||
| # ********************************************************************************** | |||
| # Copyright (c) 2022, The OpenBLAS Project | |||
| # All rights reserved. | |||
| # Redistribution and use in source and binary forms, with or without | |||
| # modification, are permitted provided that the following conditions are | |||
| # met: | |||
| # 1. Redistributions of source code must retain the above copyright | |||
| # notice, this list of conditions and the following disclaimer. | |||
| # 2. Redistributions in binary form must reproduce the above copyright | |||
| # notice, this list of conditions and the following disclaimer in | |||
| # the documentation and/or other materials provided with the | |||
| # distribution. | |||
| # 3. Neither the name of the OpenBLAS project nor the names of | |||
| # its contributors may be used to endorse or promote products | |||
| # derived from this software without specific prior written permission. | |||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| # ********************************************************************************** | |||
| SAMAXKERNEL = amax_rvv.c | |||
| DAMAXKERNEL = amax_rvv.c | |||
| CAMAXKERNEL = zamax_rvv.c | |||
| ZAMAXKERNEL = zamax_rvv.c | |||
| SAMINKERNEL = amin_rvv.c | |||
| DAMINKERNEL = amin_rvv.c | |||
| CAMINKERNEL = zamin_rvv.c | |||
| ZAMINKERNEL = zamin_rvv.c | |||
| SMAXKERNEL = max_rvv.c | |||
| DMAXKERNEL = max_rvv.c | |||
| SMINKERNEL = min_rvv.c | |||
| DMINKERNEL = min_rvv.c | |||
| ISAMAXKERNEL = iamax_rvv.c | |||
| IDAMAXKERNEL = iamax_rvv.c | |||
| ICAMAXKERNEL = izamax_rvv.c | |||
| IZAMAXKERNEL = izamax_rvv.c | |||
| ISAMINKERNEL = iamin_rvv.c | |||
| IDAMINKERNEL = iamin_rvv.c | |||
| ICAMINKERNEL = izamin_rvv.c | |||
| IZAMINKERNEL = izamin_rvv.c | |||
| ISMAXKERNEL = imax_rvv.c | |||
| IDMAXKERNEL = imax_rvv.c | |||
| ISMINKERNEL = imin_rvv.c | |||
| IDMINKERNEL = imin_rvv.c | |||
| SASUMKERNEL = asum_rvv.c | |||
| DASUMKERNEL = asum_rvv.c | |||
| CASUMKERNEL = zasum_rvv.c | |||
| ZASUMKERNEL = zasum_rvv.c | |||
| SSUMKERNEL = sum_rvv.c | |||
| DSUMKERNEL = sum_rvv.c | |||
| CSUMKERNEL = zsum_rvv.c | |||
| ZSUMKERNEL = zsum_rvv.c | |||
| SAXPYKERNEL = axpy_rvv.c | |||
| DAXPYKERNEL = axpy_rvv.c | |||
| CAXPYKERNEL = zaxpy_rvv.c | |||
| ZAXPYKERNEL = zaxpy_rvv.c | |||
| SAXPBYKERNEL = axpby_rvv.c | |||
| DAXPBYKERNEL = axpby_rvv.c | |||
| CAXPBYKERNEL = zaxpby_rvv.c | |||
| ZAXPBYKERNEL = zaxpby_rvv.c | |||
| SCOPYKERNEL = copy_rvv.c | |||
| DCOPYKERNEL = copy_rvv.c | |||
| CCOPYKERNEL = zcopy_rvv.c | |||
| ZCOPYKERNEL = zcopy_rvv.c | |||
| SDOTKERNEL = dot_rvv.c | |||
| DDOTKERNEL = dot_rvv.c | |||
| CDOTKERNEL = zdot_rvv.c | |||
| ZDOTKERNEL = zdot_rvv.c | |||
| DSDOTKERNEL = dot_rvv.c | |||
| SNRM2KERNEL = nrm2_rvv.c | |||
| DNRM2KERNEL = nrm2_rvv.c | |||
| CNRM2KERNEL = znrm2_rvv.c | |||
| ZNRM2KERNEL = znrm2_rvv.c | |||
| SROTKERNEL = rot_rvv.c | |||
| DROTKERNEL = rot_rvv.c | |||
| CROTKERNEL = zrot_rvv.c | |||
| ZROTKERNEL = zrot_rvv.c | |||
| SSCALKERNEL = scal_rvv.c | |||
| DSCALKERNEL = scal_rvv.c | |||
| CSCALKERNEL = zscal_rvv.c | |||
| ZSCALKERNEL = zscal_rvv.c | |||
| SSWAPKERNEL = swap_rvv.c | |||
| DSWAPKERNEL = swap_rvv.c | |||
| CSWAPKERNEL = zswap_rvv.c | |||
| ZSWAPKERNEL = zswap_rvv.c | |||
| SGEMVNKERNEL = gemv_n_rvv.c | |||
| DGEMVNKERNEL = gemv_n_rvv.c | |||
| CGEMVNKERNEL = zgemv_n_rvv.c | |||
| ZGEMVNKERNEL = zgemv_n_rvv.c | |||
| SGEMVTKERNEL = gemv_t_rvv.c | |||
| DGEMVTKERNEL = gemv_t_rvv.c | |||
| CGEMVTKERNEL = zgemv_t_rvv.c | |||
| ZGEMVTKERNEL = zgemv_t_rvv.c | |||
| CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c | |||
| ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c | |||
| # SGEMM_UNROLL_N set in params.h | |||
| ifeq ($(SGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| SGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| SGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| SGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||
| SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c | |||
| SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = trmmkernel_rvv_v1x8.c | |||
| STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||
| STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||
| SSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||
| SSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
| # DGEMM_UNROLL_N set in params.h | |||
| ifeq ($(DGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| DGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| DGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| DGEMMITCOPY = gemm_tcopy_rvv_v1.c | |||
| DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c | |||
| DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DTRMMKERNEL = trmmkernel_rvv_v1x8.c | |||
| DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_rvv_v1.c | |||
| DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_rvv_v1.c | |||
| DSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
| CGEMMKERNEL = zgemmkernel_rvv_v1x4.c | |||
| CGEMMINCOPY = zgemm_ncopy_rvv_v1.c | |||
| CGEMMITCOPY = zgemm_tcopy_rvv_v1.c | |||
| CGEMMONCOPY = zgemm_ncopy_4_rvv.c | |||
| CGEMMOTCOPY = zgemm_tcopy_4_rvv.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c | |||
| ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c | |||
| ZGEMMONCOPY = zgemm_ncopy_4_rvv.c | |||
| ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c | |||
| TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c | |||
| TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c | |||
| TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c | |||
| SSYMV_U_KERNEL = symv_U_rvv.c | |||
| SSYMV_L_KERNEL = symv_L_rvv.c | |||
| DSYMV_U_KERNEL = symv_U_rvv.c | |||
| DSYMV_L_KERNEL = symv_L_rvv.c | |||
| CSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| CSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| ZSYMV_U_KERNEL = zsymv_U_rvv.c | |||
| ZSYMV_L_KERNEL = zsymv_L_rvv.c | |||
| CHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| CHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| CHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_L_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_M_KERNEL = zhemv_LM_rvv.c | |||
| ZHEMV_U_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMV_V_KERNEL = zhemv_UV_rvv.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| ifndef SGEMM_BETA | |||
| SGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef DGEMM_BETA | |||
| DGEMM_BETA = gemm_beta_rvv.c | |||
| endif | |||
| ifndef CGEMM_BETA | |||
| CGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = zgemm_beta_rvv.c | |||
| endif | |||
| @@ -0,0 +1,102 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT maxf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| FLOAT_V_T vx, vmax; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmax = VFMVVF_FLOAT(0.0, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| return(maxf); | |||
| } | |||
| @@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
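| /* JOIN token-pastes five macro-expanded arguments; the empty "_" definition above provides a no-op fifth argument */ | |||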
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| @@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT maxf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_max; | |||
| FLOAT_V_T_M1 v_res, v_zero; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_zero = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T v0, v1; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||
| MASK_T mask0, mask1; | |||
| FLOAT zero = 0.0; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_max = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| //maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -169,94 +99,27 @@ asm volatile( | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| v_max = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| ix += inc_xv*2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| return(maxf); | |||
| } | |||
| @@ -0,0 +1,102 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| FLOAT_V_T vx, vmin; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| return(minf); | |||
| } | |||
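| As in the amax kernel, the loop accumulates per-lane results and reduces only once at the end; the difference is the seed value, which must be an identity of the reduction. Because |x_i| is non-negative, the max accumulator can start at 0, while the min accumulator is seeded with a large sentinel (FLT_MAX here). In short: | |||
| ```latex | |||
| \[ | |||
|   \mathrm{amin}(x) = \min_{0 \le i < n} \lvert x_{i\cdot \mathrm{inc}_x} \rvert , | |||
|   \qquad \min(v, \mathrm{sentinel}) = v \ \text{whenever}\ \mathrm{sentinel} \ge v . | |||
| \] | |||
| ``` | |||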
| @@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define ABS fabs | |||
| # else | |||
| # define ELEN 32 | |||
| # define ABS fabsf | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define ABS fabs | |||
| # else | |||
| # define ELEN 32 | |||
| # define ABS fabsf | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| FLOAT minf=FLT_MAX; | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| minf = ABS(*x); | |||
| x += inc_x; | |||
| --n; | |||
| if (n == 0) return(minf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_min; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| FLOAT_V_T v0, v1; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(minf, 1); | |||
| MASK_T mask0, mask1; | |||
| FLOAT zero = 0.0; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG idx = 0, inc_xv = inc_x * gvl; | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v1) | |||
| :"vd"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v1, v_res, gvl); | |||
| j += gvl*2; | |||
| idx += inc_xv*2; | |||
| ix += inc_xv*2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+vd"(v0) | |||
| :"vd"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| return(minf); | |||
| } | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -0,0 +1,99 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT asumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| FLOAT_V_T vx, vsum; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vsum = VFMVVF_FLOAT(0.0, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); | |||
| asumf = VFMVFS_FLOAT_M1(v_res); | |||
| return(asumf); | |||
| } | |||
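| The asum kernel follows the same strip-mined pattern, adding |x_i| into `vsum` with a tail-undisturbed add and reducing once with `vfredusum`. Note that `vfredusum` is the unordered sum reduction, so the result may differ from a strictly sequential summation within normal floating-point rounding. The quantity computed is: | |||
| ```latex | |||
| \[ | |||
|   \mathrm{asum}(x) \;=\; \sum_{i=0}^{n-1} \lvert x_{i\cdot \mathrm{inc}_x} \rvert | |||
| \] | |||
| ``` | |||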
| @@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0; | |||
| FLOAT asumf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_zero,v_sum; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T v0, v1, v_sum; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||
| MASK_T mask0, mask1; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl); | |||
| v1 = VFABS_FLOAT(v1, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| inc_xv += inc_xv * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += *((FLOAT*)&v_res); | |||
| v0 = VFABS_FLOAT(v0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| asumf = EXTRACT_FLOAT(v_res); | |||
| return(asumf); | |||
| } | |||
| @@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| BLASLONG i=0; | |||
| BLASLONG ix,iy; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| ix = 0; | |||
| iy = 0; | |||
| @@ -0,0 +1,173 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| FLOAT_V_T vx, vy; | |||
| if ( n <= 0 ) return(0); | |||
| if ( beta == 0.0 ) { | |||
| if ( alpha == 0.0 ) { | |||
| if (1 == inc_y) { | |||
| memset(&y[0], 0, n * sizeof(FLOAT)); | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| size_t vl = VSETVL(n); | |||
| vy = VFMVVF_FLOAT(0.0, vl); | |||
| for ( ; n > 0; n -= vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if ((1 == inc_x) && (1 == inc_y)) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VFMULVF_FLOAT(vx, alpha, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if ( alpha == 0.0 ) { | |||
| if (1 == inc_y) { | |||
| for (size_t vl; n > 0; n -= vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if ((1 == inc_x) && (1 == inc_y)) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMULVF_FLOAT(vy, beta, vl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); | |||
| VSSEV_FLOAT (y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
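| The branch structure above only specializes the update for the degenerate coefficient cases and the four unit/strided combinations of inc_x and inc_y; the underlying element-wise operation is the same throughout: | |||
| ```latex | |||
| \[ | |||
|   y_i \leftarrow | |||
|   \begin{cases} | |||
|     0                          & \alpha = 0,\ \beta = 0 \quad (\text{zero fill / memset})\\ | |||
|     \alpha\, x_i               & \beta = 0\\ | |||
|     \beta\, y_i                & \alpha = 0\\ | |||
|     \alpha\, x_i + \beta\, y_i & \text{otherwise} | |||
|   \end{cases} | |||
| \] | |||
| ``` | |||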
| @@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| # define LMUL m4 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| BLASLONG i=0, j=0; | |||
| unsigned int gvl = 0; | |||
| @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| BLASLONG i=0; | |||
| BLASLONG ix,iy; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| if ( da == 0.0 ) return(0); | |||
| ix = 0; | |||
| @@ -0,0 +1,109 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| if ( n <= 0 ) return(0); | |||
| if ( da == 0.0 ) return(0); | |||
| FLOAT_V_T vx, vy; | |||
| if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSEV_FLOAT (y, vy, vl); | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSEV_FLOAT(y, vy, vl); | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, da, vx, vl); | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
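| All four dispatch branches perform the same update and differ only in whether x and y are accessed with unit-stride (vle/vse) or strided (vlse/vsse) instructions; the strided forms take a byte stride, hence the inc * sizeof(FLOAT) conversion. A scalar sketch of the update, with illustrative names, for reference: | |||
| ```c | |||
| /* Scalar sketch of axpy (not the patch): y[i*inc_y] += da * x[i*inc_x].  */ | |||
| /* The vector kernel indexes by a byte stride; this form uses elements.   */ | |||
| static void axpy_ref(long n, double da, const double *x, long inc_x, | |||
|                      double *y, long inc_y) | |||
| { | |||
|     if (n <= 0 || da == 0.0) return;   /* same early exits as the kernel */ | |||
|     for (long i = 0; i < n; i++) | |||
|         y[i * inc_y] += da * x[i * inc_x]; | |||
| } | |||
| ``` | |||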
| @@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| # define LMUL m4 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0, j=0, jx=0, jy=0; | |||
| @@ -0,0 +1,996 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=2 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=4 | |||
| __riscv_='__riscv_' | |||
| complex=True | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='float' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=32 | |||
| ELEN_PARAM=32 | |||
| LMUL_ACC=2 | |||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||
| VLEV='__riscv_vle32_v_f32m2' | |||
| VLSEV='__riscv_vlse32_v_f32m2' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||
| VSETVL='__riscv_vsetvl_e32m2' | |||
| VSEV='__riscv_vse32_v_f32m2' | |||
| VSSEV='__riscv_vsse32_v_f32m2' | |||
| acc_vector_t='vfloat32m2_t' | |||
| output='cgemm_kernel_8x4_zvl128b.c' | |||
| param_scalar_t='float' | |||
| param_vector_t='vfloat32m2_t' | |||
| */ | |||
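| As a quick sanity check on these settings: with 128-bit vector registers (cpu zvl128b, reg_width_bits=128) and LMUL=2, one register group holds 2*128/32 = 8 single-precision lanes, which matches the M=8 unroll requested via __riscv_vsetvl_e32m2(8) in the main pass below. | |||
| ```latex | |||
| \[ | |||
|   \mathrm{VLMAX} \;=\; \frac{\mathrm{LMUL} \times \mathrm{VLEN}}{\mathrm{SEW}} | |||
|   \;=\; \frac{2 \times 128}{32} \;=\; 8 \;=\; M | |||
| \] | |||
| ``` | |||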
| #include "common.h" | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define S0 1 | |||
| #define S1 -1 | |||
| #define S2 1 | |||
| #define S3 1 | |||
| #define VFMACC_RR __riscv_vfmsac | |||
| #define VFMACC_RI __riscv_vfmacc | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define S0 1 | |||
| #define S1 1 | |||
| #define S2 1 | |||
| #define S3 -1 | |||
| #define VFMACC_RR __riscv_vfmacc | |||
| #define VFMACC_RI __riscv_vfmsac | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define S0 1 | |||
| #define S1 1 | |||
| #define S2 -1 | |||
| #define S3 1 | |||
| #define VFMACC_RR __riscv_vfmacc | |||
| #define VFMACC_RI __riscv_vfnmsac | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| #define S0 1 | |||
| #define S1 -1 | |||
| #define S2 -1 | |||
| #define S3 -1 | |||
| #define VFMACC_RR __riscv_vfmsac | |||
| #define VFMACC_RI __riscv_vfnmacc | |||
| #endif | |||
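| The S0..S3 signs and the VFMACC_RR/VFMACC_RI selections encode the four conjugation variants of the complex product. For the plain case used when neither operand is conjugated, the real part is accumulated with vfmsac (multiply-subtract) and the imaginary part with vfmacc, matching the tmp0r/tmp0i updates in the kernel body below; as a worked identity: | |||
| ```latex | |||
| \[ | |||
|   (a_r + i\,a_i)(b_r + i\,b_i) | |||
|   \;=\; \underbrace{(b_r a_r - a_i b_i)}_{\texttt{vfmsac}(a_i b_i,\; b_r,\; a_r)} | |||
|   \;+\; i\,\underbrace{(b_r a_i + a_r b_i)}_{\texttt{vfmacc}(a_r b_i,\; b_r,\; a_i)} | |||
| \] | |||
| ``` | |||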
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| float B2r = B[bi + 2 * 2 + 0]; | |||
| float B2i = B[bi + 2 * 2 + 1]; | |||
| float B3r = B[bi + 3 * 2 + 0]; | |||
| float B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||
| // leaving 6 vector registers for temporaries | |||
| // performing 2 operations between reuses of temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| vfloat32m2_t ACC2r = tmp0r; | |||
| vfloat32m2_t ACC2i = tmp0i; | |||
| vfloat32m2_t ACC3r = tmp1r; | |||
| vfloat32m2_t ACC3i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| B2r = B[bi + 2 * 2 + 0]; | |||
| B2i = B[bi + 2 * 2 + 1]; | |||
| B3r = B[bi + 3 * 2 + 0]; | |||
| B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| float B2r = B[bi + 2 * 2 + 0]; | |||
| float B2i = B[bi + 2 * 2 + 1]; | |||
| float B3r = B[bi + 3 * 2 + 0]; | |||
| float B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k | |||
| // leaving 6 vector registers for temporaries | |||
| // performing 2 operations between reuses of temporaries | |||
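| // VFMACC_RR / VFMACC_RI are assumed to be defined earlier in this file to select vfmacc or vfnmsac, so the same loop body covers the conjugated and non-conjugated variants | |||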
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| vfloat32m2_t ACC2r = tmp0r; | |||
| vfloat32m2_t ACC2i = tmp0i; | |||
| vfloat32m2_t ACC3r = tmp1r; | |||
| vfloat32m2_t ACC3i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| B2r = B[bi + 2 * 2 + 0]; | |||
| B2i = B[bi + 2 * 2 + 1]; | |||
| B3r = B[bi + 3 * 2 + 0]; | |||
| B3i = B[bi + 3 * 2 + 1]; | |||
| bi += 4 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); | |||
| ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); | |||
| ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); | |||
| ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); | |||
| ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); | |||
| C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); | |||
| C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); | |||
| C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); | |||
| C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| float result8 = 0; | |||
| float result9 = 0; | |||
| float result10 = 0; | |||
| float result11 = 0; | |||
| float result12 = 0; | |||
| float result13 = 0; | |||
| float result14 = 0; | |||
| float result15 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
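| // scalar tail: S0..S3 are the +/-1 sign factors (defined earlier in this file) that pick the conjugation variant of the complex product | |||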
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||
| result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||
| result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||
| result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; | |||
| result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; | |||
| result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||
| result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||
| result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; | |||
| result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; | |||
| ai += 2 * 2; | |||
| bi += 4 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result4 * alphar; | |||
| Ci += result5 * alphar; | |||
| Cr -= result5 * alphai; | |||
| Ci += result4 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||
| Cr += result6 * alphar; | |||
| Ci += result7 * alphar; | |||
| Cr -= result7 * alphai; | |||
| Ci += result6 * alphai; | |||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||
| Cr += result8 * alphar; | |||
| Ci += result9 * alphar; | |||
| Cr -= result9 * alphai; | |||
| Ci += result8 * alphai; | |||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; | |||
| Cr += result10 * alphar; | |||
| Ci += result11 * alphar; | |||
| Cr -= result11 * alphai; | |||
| Ci += result10 * alphai; | |||
| C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||
| Cr += result12 * alphar; | |||
| Ci += result13 * alphar; | |||
| Cr -= result13 * alphai; | |||
| Ci += result12 * alphai; | |||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; | |||
| Cr += result14 * alphar; | |||
| Ci += result15 * alphar; | |||
| Cr -= result15 * alphai; | |||
| Ci += result14 * alphai; | |||
| C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; | |||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; | |||
| result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; | |||
| result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; | |||
| ai += 1 * 2; | |||
| bi += 4 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; | |||
| Cr += result4 * alphar; | |||
| Ci += result5 * alphar; | |||
| Cr -= result5 * alphai; | |||
| Ci += result4 * alphai; | |||
| C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; | |||
| Cr += result6 * alphar; | |||
| Ci += result7 * alphar; | |||
| Cr -= result7 * alphai; | |||
| Ci += result6 * alphai; | |||
| C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||
| // leaving 10 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| float B1r = B[bi + 1 * 2 + 0]; | |||
| float B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k | |||
| // leaving 10 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| vfloat32m2_t ACC1r = tmp1r; | |||
| vfloat32m2_t ACC1i = tmp1i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| B1r = B[bi + 1 * 2 + 0]; | |||
| B1i = B[bi + 1 * 2 + 1]; | |||
| bi += 2 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); | |||
| tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); | |||
| tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); | |||
| ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); | |||
| C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||
| result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; | |||
| result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; | |||
| ai += 2 * 2; | |||
| bi += 2 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result4 * alphar; | |||
| Ci += result5 * alphar; | |||
| Cr -= result5 * alphai; | |||
| Ci += result4 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; | |||
| Cr += result6 * alphar; | |||
| Ci += result7 * alphar; | |||
| Cr -= result7 * alphai; | |||
| Ci += result6 * alphai; | |||
| C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; | |||
| result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; | |||
| ai += 1 * 2; | |||
| bi += 2 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||
| // leaving 12 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 8 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| float B0r = B[bi + 0 * 2 + 0]; | |||
| float B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k | |||
| // leaving 12 vector registers for temporaries | |||
| vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| vfloat32m2_t ACC0r = tmp0r; | |||
| vfloat32m2_t ACC0i = tmp0i; | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0r = B[bi + 0 * 2 + 0]; | |||
| B0i = B[bi + 0 * 2 + 1]; | |||
| bi += 1 * 2; | |||
| A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); | |||
| A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| ai += 4 * 2; | |||
| tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); | |||
| tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); | |||
| tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); | |||
| tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); | |||
| ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); | |||
| ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); | |||
| vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); | |||
| C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); | |||
| C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); | |||
| C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); | |||
| __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; | |||
| result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; | |||
| ai += 2 * 2; | |||
| bi += 1 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; | |||
| Cr += result2 * alphar; | |||
| Ci += result3 * alphar; | |||
| Cr -= result3 * alphai; | |||
| Ci += result2 * alphai; | |||
| C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| BLASLONG ai = m_top * K * 2; | |||
| BLASLONG bi = n_top * K * 2; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; | |||
| result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; | |||
| ai += 1 * 2; | |||
| bi += 1 * 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| float Cr, Ci; | |||
| Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; | |||
| Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; | |||
| Cr += result0 * alphar; | |||
| Ci += result1 * alphar; | |||
| Cr -= result1 * alphai; | |||
| Ci += result0 * alphai; | |||
| C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; | |||
| C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| if(n <= 0) return(0); | |||
| FLOAT_V_T v0; | |||
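| // four cases depending on the strides: contiguous source and destination, strided source, strided destination, or both strided; strided accesses take a byte stride of inc * sizeof(FLOAT) | |||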
| if(inc_x == 1 && inc_y == 1) { | |||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| VSEV_FLOAT(y, v0, vl); | |||
| } | |||
| } else if (inc_y == 1) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| VSEV_FLOAT(y, v0, vl); | |||
| } | |||
| } else if(inc_x == 1) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| VSSEV_FLOAT(y, stride_y, v0, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # endif | |||
| #endif | |||
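| // note: ZVL256B builds use a smaller LMUL, presumably because 256-bit vectors already give long register groups at m2 while leaving more groups free | |||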
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
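| // JOIN2/JOIN paste tokens so one source serves both precisions and LMUL choices, e.g. ELEN=32 with LMUL=m2 expands the macros below to the e32m2 / f32m2 forms of the vsetvl, load and store intrinsics | |||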
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| @@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/4){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| BLASLONG gvl3 = gvl * 3; | |||
| unsigned int gvl3 = gvl * 3; | |||
| BLASLONG inc_xv3 = inc_xv * 3; | |||
| for(i=0,j=0; i<n/(4*gvl); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| @@ -86,7 +99,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if(gvl <= n/4){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| BLASLONG inc_yv3 = inc_yv * 3; | |||
| BLASLONG gvl3 = gvl * 3; | |||
| unsigned int gvl3 = gvl * 3; | |||
| for(i=0,j=0; i<n/(4*gvl); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, v0, gvl); | |||
| @@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| asm volatile( | |||
| "vsetvli zero, zero, e64,m1 \n\t" | |||
| "fmv.w.x ft11, zero \n\t" | |||
| "fmv.d.x ft11, zero \n\t" | |||
| "mv t0, %[BK] \n\t" | |||
| "vfmv.v.f v16, ft11 \n\t" | |||
| @@ -0,0 +1,492 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=4 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=4 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='double' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=64 | |||
| ELEN_PARAM=64 | |||
| LMUL_ACC=4 | |||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||
| VLEV='__riscv_vle64_v_f64m4' | |||
| VLSEV='__riscv_vlse64_v_f64m4' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||
| VSETVL='__riscv_vsetvl_e64m4' | |||
| VSEV='__riscv_vse64_v_f64m4' | |||
| VSSEV='__riscv_vsse64_v_f64m4' | |||
| acc_vector_t='vfloat64m4_t' | |||
| output='dgemm_kernel_8x4_zvl128b.c' | |||
| param_scalar_t='double' | |||
| param_vector_t='vfloat64m4_t' | |||
| */ | |||
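| // with VLEN = 128 (zvl128b), an e64/m4 vector group holds 128*4/64 = 8 doubles, so the full 8-row M panel fits in a single register group (A0) | |||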
| #include "common.h" | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
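| // A is packed in 8-row panels (ai advances by 8 per k) and B in 4-column panels (bi advances by 4 per k), so both streams are read sequentially | |||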
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| ai += 2; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| ai += 1; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
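| // N not a multiple of 4: repeat the same blocking with 2 remaining columns of B, then with 1 | |||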
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| ai += 2; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| ai += 1; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| ai += 2; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| ai += 1; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,860 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Settings: | |||
| LMUL=1 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=8 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl256b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='double' | |||
| reg_width_bits=256 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=64 | |||
| ELEN_PARAM=64 | |||
| LMUL_ACC=1 | |||
| VFMACC='__riscv_vfmacc_vf_f64m1' | |||
| VFMUL='__riscv_vfmul_vf_f64m1' | |||
| VLEV='__riscv_vle64_v_f64m1' | |||
| VLSEV='__riscv_vlse64_v_f64m1' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' | |||
| VSETVL='__riscv_vsetvl_e64m1' | |||
| VSEV='__riscv_vse64_v_f64m1' | |||
| VSSEV='__riscv_vsse64_v_f64m1' | |||
| acc_vector_t='vfloat64m1_t' | |||
| output='dgemm_kernel_8x8_zvl256b.c' | |||
| param_scalar_t='double' | |||
| param_vector_t='vfloat64m1_t' | |||
| */ | |||
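| // with VLEN = 256 (zvl256b), an e64/m1 register holds 4 doubles, so the 8-row M panel is split across A0 and A1 and each column of C is handled in two 4-element halves | |||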
| #include "common.h" | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j=0; j<N/8; j+=1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e64m1(4); | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| double B4 = B[bi+4]; | |||
| double B5 = B[bi+5]; | |||
| double B6 = B[bi+6]; | |||
| double B7 = B[bi+7]; | |||
| bi += 8; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl); | |||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl); | |||
| vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl); | |||
| vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl); | |||
| vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl); | |||
| vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl); | |||
| vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl); | |||
| vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl); | |||
| vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl); | |||
| vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| B4 = B[bi+4]; | |||
| B5 = B[bi+5]; | |||
| B6 = B[bi+6]; | |||
| B7 = B[bi+7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl); | |||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl); | |||
| result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl); | |||
| result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl); | |||
| result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl); | |||
| result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl); | |||
| result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl); | |||
| result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl); | |||
| result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl); | |||
| result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||
| c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl ); | |||
| c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl ); | |||
| c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl ); | |||
| c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl ); | |||
| c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl ); | |||
| c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl ); | |||
| c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl ); | |||
| c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c15, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| double B4 = B[bi+4]; | |||
| double B5 = B[bi+5]; | |||
| double B6 = B[bi+6]; | |||
| double B7 = B[bi+7]; | |||
| bi += 8; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl); | |||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl); | |||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl); | |||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| B4 = B[bi+4]; | |||
| B5 = B[bi+5]; | |||
| B6 = B[bi+6]; | |||
| B7 = B[bi+7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl); | |||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| double result8 = 0; | |||
| double result9 = 0; | |||
| double result10 = 0; | |||
| double result11 = 0; | |||
| double result12 = 0; | |||
| double result13 = 0; | |||
| double result14 = 0; | |||
| double result15 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| result2+=A[ai+0]*B[bi+1]; | |||
| result3+=A[ai+1]*B[bi+1]; | |||
| result4+=A[ai+0]*B[bi+2]; | |||
| result5+=A[ai+1]*B[bi+2]; | |||
| result6+=A[ai+0]*B[bi+3]; | |||
| result7+=A[ai+1]*B[bi+3]; | |||
| result8+=A[ai+0]*B[bi+4]; | |||
| result9+=A[ai+1]*B[bi+4]; | |||
| result10+=A[ai+0]*B[bi+5]; | |||
| result11+=A[ai+1]*B[bi+5]; | |||
| result12+=A[ai+0]*B[bi+6]; | |||
| result13+=A[ai+1]*B[bi+6]; | |||
| result14+=A[ai+0]*B[bi+7]; | |||
| result15+=A[ai+1]*B[bi+7]; | |||
| ai+=2; | |||
| bi+=8; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| C[ci+1*ldc+0] += alpha * result2; | |||
| C[ci+1*ldc+1] += alpha * result3; | |||
| C[ci+2*ldc+0] += alpha * result4; | |||
| C[ci+2*ldc+1] += alpha * result5; | |||
| C[ci+3*ldc+0] += alpha * result6; | |||
| C[ci+3*ldc+1] += alpha * result7; | |||
| C[ci+4*ldc+0] += alpha * result8; | |||
| C[ci+4*ldc+1] += alpha * result9; | |||
| C[ci+5*ldc+0] += alpha * result10; | |||
| C[ci+5*ldc+1] += alpha * result11; | |||
| C[ci+6*ldc+0] += alpha * result12; | |||
| C[ci+6*ldc+1] += alpha * result13; | |||
| C[ci+7*ldc+0] += alpha * result14; | |||
| C[ci+7*ldc+1] += alpha * result15; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+0]*B[bi+1]; | |||
| result2+=A[ai+0]*B[bi+2]; | |||
| result3+=A[ai+0]*B[bi+3]; | |||
| result4+=A[ai+0]*B[bi+4]; | |||
| result5+=A[ai+0]*B[bi+5]; | |||
| result6+=A[ai+0]*B[bi+6]; | |||
| result7+=A[ai+0]*B[bi+7]; | |||
| ai+=1; | |||
| bi+=8; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+1*ldc+0] += alpha * result1; | |||
| C[ci+2*ldc+0] += alpha * result2; | |||
| C[ci+3*ldc+0] += alpha * result3; | |||
| C[ci+4*ldc+0] += alpha * result4; | |||
| C[ci+5*ldc+0] += alpha * result5; | |||
| C[ci+6*ldc+0] += alpha * result6; | |||
| C[ci+7*ldc+0] += alpha * result7; | |||
| m_top+=1; | |||
| } | |||
| n_top += 8; | |||
| } | |||
| // -- tails for N=4 | |||
| if( N & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| m_top = 0; | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| bi += 4; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||
| vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl); | |||
| vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||
| result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl); | |||
| result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl ); | |||
| c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl ); | |||
| c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl ); | |||
| c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c7, gvl); | |||
| m_top += 8; | |||
| } | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| double B2 = B[bi+2]; | |||
| double B3 = B[bi+3]; | |||
| bi += 4; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| B2 = B[bi+2]; | |||
| B3 = B[bi+3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| result2+=A[ai+0]*B[bi+1]; | |||
| result3+=A[ai+1]*B[bi+1]; | |||
| result4+=A[ai+0]*B[bi+2]; | |||
| result5+=A[ai+1]*B[bi+2]; | |||
| result6+=A[ai+0]*B[bi+3]; | |||
| result7+=A[ai+1]*B[bi+3]; | |||
| ai+=2; | |||
| bi+=4; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| C[ci+1*ldc+0] += alpha * result2; | |||
| C[ci+1*ldc+1] += alpha * result3; | |||
| C[ci+2*ldc+0] += alpha * result4; | |||
| C[ci+2*ldc+1] += alpha * result5; | |||
| C[ci+3*ldc+0] += alpha * result6; | |||
| C[ci+3*ldc+1] += alpha * result7; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+0]*B[bi+1]; | |||
| result2+=A[ai+0]*B[bi+2]; | |||
| result3+=A[ai+0]*B[bi+3]; | |||
| ai+=1; | |||
| bi+=4; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+1*ldc+0] += alpha * result1; | |||
| C[ci+2*ldc+0] += alpha * result2; | |||
| C[ci+3*ldc+0] += alpha * result3; | |||
| m_top+=1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if( N & 2 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| m_top = 0; | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| bi += 2; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1; | |||
| vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl ); | |||
| c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1; | |||
| __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| double B1 = B[bi+1]; | |||
| bi += 2; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| B1 = B[bi+1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| result2+=A[ai+0]*B[bi+1]; | |||
| result3+=A[ai+1]*B[bi+1]; | |||
| ai+=2; | |||
| bi+=2; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| C[ci+1*ldc+0] += alpha * result2; | |||
| C[ci+1*ldc+1] += alpha * result3; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+0]*B[bi+1]; | |||
| ai+=1; | |||
| bi+=2; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+1*ldc+0] += alpha * result1; | |||
| m_top+=1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if( N & 1 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| m_top = 0; | |||
| for (BLASLONG i=0; i<M/8; i+=1) { | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| bi += 1; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl ); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl; | |||
| vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl; | |||
| __riscv_vse64_v_f64m1( &C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if( M & 4 ) { | |||
| gvl = __riscv_vsetvl_e64m1(4); | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| double B0 = B[bi+0]; | |||
| bi += 1; | |||
| vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl); | |||
| for(BLASLONG k=1; k<K; k++) { | |||
| B0 = B[bi+0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl ); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl ); | |||
| ci=n_top*ldc+m_top; | |||
| __riscv_vse64_v_f64m1( &C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if( M & 2 ) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| result1+=A[ai+1]*B[bi+0]; | |||
| ai+=2; | |||
| bi+=1; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| C[ci+0*ldc+1] += alpha * result1; | |||
| m_top+=2; | |||
| } | |||
| if( M & 1 ) { | |||
| double result0 = 0; | |||
| BLASLONG ai=m_top*K; | |||
| BLASLONG bi=n_top*K; | |||
| for(BLASLONG k=0; k<K; k++) { | |||
| result0+=A[ai+0]*B[bi+0]; | |||
| ai+=1; | |||
| bi+=1; | |||
| } | |||
| BLASLONG ci=n_top*ldc+m_top; | |||
| C[ci+0*ldc+0] += alpha * result0; | |||
| m_top+=1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
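The kernel above walks the packed A/B panels in 8-row (and smaller tail) tiles and accumulates `alpha * A*B` into C. As a point of reference, the scalar computation each tile performs looks roughly like the sketch below; the per-tile panel packing is assumed for the sketch only, and `gemm_tile_ref` is an illustrative name, not part of the patch.

```c
/* Illustrative scalar reference of the tile update performed by the
 * vectorized kernel above.  The packing of A and B is assumed to be
 * one contiguous length-K run per row/column for this sketch only. */
static void gemm_tile_ref(long M, long N, long K, double alpha,
                          const double *A,   /* packed: A[i*K + k] (assumed) */
                          const double *B,   /* packed: B[j*K + k] (assumed) */
                          double *C, long ldc)
{
    for (long j = 0; j < N; ++j)
        for (long i = 0; i < M; ++i) {
            double acc = 0.0;
            for (long k = 0; k < K; ++k)
                acc += A[i * K + k] * B[j * K + k];
            C[i + j * ldc] += alpha * acc;   /* beta is applied by the caller */
        }
}
```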
| @@ -0,0 +1,126 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| double dot = 0.0; | |||
| if ( n <= 0 ) return(dot); | |||
| size_t vlmax = __riscv_vsetvlmax_e64m8(); | |||
| vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax); | |||
| if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); | |||
| vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); | |||
| vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else if (1 == inc_x) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); | |||
| vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); | |||
| vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else if (1 == inc_y) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); | |||
| vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); | |||
| vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = __riscv_vsetvl_e64m8(n); | |||
| #if !defined(DOUBLE) | |||
| vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); | |||
| vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); | |||
| vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #else | |||
| vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); | |||
| vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); | |||
| vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); | |||
| #endif | |||
| } | |||
| } | |||
| vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); | |||
| vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); | |||
| dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); | |||
| return(dot); | |||
| } | |||
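The dot kernel above strip-mines the vectors with a fresh `vl = __riscv_vsetvl_e64m8(n)` on every pass, so there is no scalar tail loop, and it uses the tail-undisturbed (`_tu`) multiply-accumulate so lanes beyond the current `vl` keep their partial sums; one full-width reduction at the end then yields the result. In the single-precision path the widening `vfwmacc` accumulates in double, which a scalar equivalent would express as below (illustrative only; `dot_ref` is not part of the patch).

```c
/* Scalar equivalent of the single-precision path above: each product is
 * widened to double before accumulation, as vfwmacc does. */
static double dot_ref(long n, const float *x, long inc_x,
                      const float *y, long inc_y)
{
    double dot = 0.0;
    for (long i = 0; i < n; ++i)
        dot += (double)x[i * inc_x] * (double)y[i * inc_y];
    return dot;
}
```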
| @@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) | |||
| #endif | |||
| #if defined(DSDOT) | |||
| @@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -107,9 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -134,9 +138,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -162,9 +164,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| @@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += (double)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| dot += (double)EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| return(dot); | |||
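The hunk above converts the older dot kernel from hard-coded v0.10 intrinsic names to the `RISCV_RVV()` name wrapper, so one source builds against both the 0.10 and the ratified 1.0 intrinsic APIs. The 0.10 reduction intrinsic takes an extra destination operand, which is why `VFREDSUM_FLOAT` becomes a function-like macro that absorbs `v_res` in that case, and the scalar result is read back through `EXTRACT_FLOAT` rather than `VFMVFS_FLOAT`. A minimal sketch of the wrapper idea (the actual definitions live in the patch's common headers, which are not shown in this excerpt):

```c
/* Sketch only: map un-prefixed intrinsic names to the API in use. */
#ifdef RISCV_0p10_INTRINSICS
#  define RISCV_RVV(name) name               /* v0.10 intrinsics: bare names     */
#else
#  define RISCV_RVV(name) __riscv_##name     /* v1.0 intrinsics: __riscv_ prefix */
#endif
```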
| @@ -0,0 +1,152 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| double dot = 0.0; | |||

| if ( n < 1 ) return(dot); | |||
| vfloat64m4_t vr; | |||
| vfloat32m2_t vx, vy; | |||
| unsigned int gvl = 0; | |||
| vfloat64m1_t v_res, v_z0; | |||
| gvl = vsetvlmax_e64m1(); | |||
| v_res = vfmv_v_f_f64m1(0, gvl); | |||
| v_z0 = vfmv_v_f_f64m1(0, gvl); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vle32_v_f32m2(&y[j], gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vle32_v_f32m2(&x[j], gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| }else{ | |||
| gvl = vsetvl_e64m4(n); | |||
| vr = vfmv_v_f_f64m4(0, gvl); | |||
| int stride_x = inc_x * sizeof(FLOAT); | |||
| int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl); | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvl_e64m4(n-j); | |||
| vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); | |||
| vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); | |||
| vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); | |||
| //vr = vfdot_vv_f32m2(vx, vy, gvl); | |||
| vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); | |||
| v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); | |||
| dot += (double)vfmv_f_s_f64m1_f64(v_res); | |||
| } | |||
| } | |||
| return(dot); | |||
| } | |||
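This new kernel implements DSDOT-style accumulation: single-precision inputs with every product widened to double by `vfwmacc` before summation. At the BLAS interface that corresponds to `cblas_dsdot`; a minimal usage sketch, assuming an OpenBLAS build that includes this kernel:

```c
/* Minimal usage sketch for the widened single-precision dot product. */
#include <cblas.h>
#include <stdio.h>

int main(void)
{
    float x[3] = {1.0f, 2.0f, 3.0f};
    float y[3] = {4.0f, 5.0f, 6.0f};
    double d = cblas_dsdot(3, x, 1, y, 1);  /* accumulated in double */
    printf("dsdot = %.1f\n", d);            /* prints 32.0 */
    return 0;
}
```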
| @@ -0,0 +1,660 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=4 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=4 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='trmm' | |||
| param_precision='double' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=64 | |||
| ELEN_PARAM=64 | |||
| LMUL_ACC=4 | |||
| VFMACC='__riscv_vfmacc_vf_f64m4' | |||
| VFMUL='__riscv_vfmul_vf_f64m4' | |||
| VLEV='__riscv_vle64_v_f64m4' | |||
| VLSEV='__riscv_vlse64_v_f64m4' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' | |||
| VSETVL='__riscv_vsetvl_e64m4' | |||
| VSEV='__riscv_vse64_v_f64m4' | |||
| VSSEV='__riscv_vsse64_v_f64m4' | |||
| acc_vector_t='vfloat64m4_t' | |||
| output='dtrmm_kernel_8x4_zvl128b.c' | |||
| param_scalar_t='double' | |||
| param_vector_t='vfloat64m4_t' | |||
| */ | |||
| #include "common.h" | |||
| #if defined(LEFT) != defined(TRANSA) | |||
| #define BACKWARDS | |||
| #endif | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 4; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e64m4(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| double B2 = B[bi + 2]; | |||
| double B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); | |||
| vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); | |||
| vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| double result4 = 0; | |||
| double result5 = 0; | |||
| double result6 = 0; | |||
| double result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| ai += 2; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| ai += 1; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| double B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse64_v_f64m4(&C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| double result2 = 0; | |||
| double result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| ai += 2; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| ai += 1; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e64m4(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e64m4(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| double B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); | |||
| __riscv_vse64_v_f64m4(&C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| double result0 = 0; | |||
| double result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| ai += 2; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| double result0 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| ai += 1; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,89 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_beta.c | |||
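| // Scales the m x n matrix C in place by beta: if beta is zero the columns are | |||
| // simply overwritten with zeros, otherwise each column is loaded, multiplied by | |||
| // beta and stored back, vl elements per pass. | |||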
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||
| IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, | |||
| FLOAT *c, BLASLONG ldc) | |||
| { | |||
| BLASLONG chunk; | |||
| FLOAT *c_offset; | |||
| size_t vl; | |||
| FLOAT_V_T vx; | |||
| if (beta == ZERO) { | |||
| vl = VSETVL(m); | |||
| vx = VFMVVF_FLOAT(0.0, vl); | |||
| for( ; n > 0; n--, c += ldc) { | |||
| c_offset = c; | |||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||
| vl = VSETVL(chunk); | |||
| VSEV_FLOAT(c_offset, vx, vl); | |||
| } | |||
| } | |||
| } else { | |||
| for( ; n > 0; n--, c += ldc) { | |||
| c_offset = c; | |||
| for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { | |||
| vl = VSETVL(chunk); | |||
| vx = VLEV_FLOAT(c_offset, vl); | |||
| vx = VFMULVF_FLOAT(vx, beta, vl); | |||
| VSEV_FLOAT(c_offset, vx, vl); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,197 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m1(n) | |||
| #define FLOAT_V_T vfloat32m1_t | |||
| #define FLOAT_VX2_T vfloat32m1x2_t | |||
| #define FLOAT_VX4_T vfloat32m1x4_t | |||
| #define FLOAT_VX8_T vfloat32m1x8_t | |||
| #define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 | |||
| #define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 | |||
| #define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m1 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m1 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m1(n) | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define FLOAT_VX2_T vfloat64m1x2_t | |||
| #define FLOAT_VX4_T vfloat64m1x4_t | |||
| #define FLOAT_VX8_T vfloat64m1x8_t | |||
| #define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 | |||
| #define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 | |||
| #define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m1 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m1 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_ncopy_8.c | |||
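| // Packs panels of 8 (then 4, 2, 1) columns of A into b, interleaving them with | |||
| // segmented stores (vsseg) so the GEMM kernel can stream the packed data. | |||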
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset; | |||
| FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; | |||
| FLOAT_VX2_T vx2; | |||
| FLOAT_VX4_T vx4; | |||
| FLOAT_VX8_T vx8; | |||
| size_t vl; | |||
| //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = (n >> 3); j > 0; j--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset5 = a_offset4 + lda; | |||
| a_offset6 = a_offset5 + lda; | |||
| a_offset7 = a_offset6 + lda; | |||
| a_offset8 = a_offset7 + lda; | |||
| a_offset += 8 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| v5 = VLEV_FLOAT(a_offset5, vl); | |||
| v6 = VLEV_FLOAT(a_offset6, vl); | |||
| v7 = VLEV_FLOAT(a_offset7, vl); | |||
| v8 = VLEV_FLOAT(a_offset8, vl); | |||
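| // Assemble the 8-field segment tuple: the first vset fills field 0 of the | |||
| // otherwise-undefined tuple value, the remaining calls fill the other fields. | |||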
| vx8 = VSET_VX8(vx8, 0, v1); | |||
| vx8 = VSET_VX8(vx8, 1, v2); | |||
| vx8 = VSET_VX8(vx8, 2, v3); | |||
| vx8 = VSET_VX8(vx8, 3, v4); | |||
| vx8 = VSET_VX8(vx8, 4, v5); | |||
| vx8 = VSET_VX8(vx8, 5, v6); | |||
| vx8 = VSET_VX8(vx8, 6, v7); | |||
| vx8 = VSET_VX8(vx8, 7, v8); | |||
| VSSEG8_FLOAT(b_offset, vx8, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| a_offset5 += vl; | |||
| a_offset6 += vl; | |||
| a_offset7 += vl; | |||
| a_offset8 += vl; | |||
| b_offset += vl*8; | |||
| } | |||
| } | |||
| if (n & 4) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| vx4 = VSET_VX4(vx4, 0, v1); | |||
| vx4 = VSET_VX4(vx4, 1, v2); | |||
| vx4 = VSET_VX4(vx4, 2, v3); | |||
| vx4 = VSET_VX4(vx4, 3, v4); | |||
| VSSEG4_FLOAT(b_offset, vx4, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| b_offset += vl*4; | |||
| } | |||
| } | |||
| if (n & 2) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| vx2 = VSET_VX2(vx2, 0, v1); | |||
| vx2 = VSET_VX2(vx2, 1, v2); | |||
| VSSEG2_FLOAT(b_offset, vx2, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| b_offset += vl*2; | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| a_offset1 = a_offset; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| VSEV_FLOAT(b_offset, v1, vl); | |||
| a_offset1 += vl; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,76 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #endif | |||
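| // For each group of vl, the inner loop gathers vl elements spaced lda apart with | |||
| // a strided load and appends them to b, then advances the source by one element, | |||
| // for m iterations per group - effectively emitting each panel in transposed order. | |||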
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset; | |||
| FLOAT *a_offset1; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v0; | |||
| size_t vl; | |||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = n; j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| a_offset1 = a_offset; | |||
| a_offset += vl * lda; | |||
| for(i = m; i > 0; i--) { | |||
| v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(b_offset, v0, vl); | |||
| a_offset1++; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,273 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m1(n) | |||
| #define FLOAT_V_T vfloat32m1_t | |||
| #define FLOAT_VX2_T vfloat32m1x2_t | |||
| #define FLOAT_VX4_T vfloat32m1x4_t | |||
| #define FLOAT_VX8_T vfloat32m1x8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m1 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m1 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m1 | |||
| #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 | |||
| #define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 | |||
| #define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m1(n) | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define FLOAT_VX2_T vfloat64m1x2_t | |||
| #define FLOAT_VX4_T vfloat64m1x4_t | |||
| #define FLOAT_VX8_T vfloat64m1x8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m1 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m1 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m1 | |||
| #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 | |||
| #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 | |||
| #define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 | |||
| #define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 | |||
| #define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 | |||
| #define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1; | |||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| FLOAT_V_T v0; | |||
| FLOAT_VX2_T vx2; | |||
| FLOAT_VX4_T vx4; | |||
| FLOAT_VX8_T vx8; | |||
| // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| aoffset = a; | |||
| boffset = b; | |||
| boffset2 = b + m * (n & ~7); | |||
| boffset3 = b + m * (n & ~3); | |||
| boffset4 = b + m * (n & ~1); | |||
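| // boffset2/boffset3/boffset4 mark where the n&4, n&2 and n&1 column remainders | |||
| // of each row block are placed inside the packed buffer b. | |||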
| for(j = (m >> 3); j > 0; j--) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 8 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 64; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 8; | |||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 8; | |||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 32; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 8; | |||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 16; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 8; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 8; | |||
| } | |||
| } | |||
| if (m & 4) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 4 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 32; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 4; | |||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 4; | |||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 16; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 4; | |||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 8; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 4; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 4; | |||
| } | |||
| } | |||
| if (m & 2) { | |||
| aoffset1 = aoffset; | |||
| aoffset += 2 * lda; | |||
| boffset1 = boffset; | |||
| boffset += 16; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 2; | |||
| vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG8_FLOAT(boffset1, vx8, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += m * 8; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 2; | |||
| vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG4_FLOAT(boffset2, vx4, vl); | |||
| aoffset1 += 4; | |||
| boffset2 += 8; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 2; | |||
| vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSSEG2_FLOAT(boffset3, vx2, vl); | |||
| aoffset1 += 2; | |||
| boffset3 += 4; | |||
| } | |||
| if (n & 1) { | |||
| size_t vl = 2; | |||
| v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); | |||
| VSEV_FLOAT(boffset4, v0, vl); | |||
| aoffset1 += 1; | |||
| boffset4 += 2; | |||
| } | |||
| } | |||
| if (m & 1) { | |||
| aoffset1 = aoffset; | |||
| boffset1 = boffset; | |||
| for(i = (n >> 3); i > 0; i--) { | |||
| size_t vl = 8; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset1, v0, vl); | |||
| aoffset1 += 8; | |||
| boffset1 += 8 * m; | |||
| } | |||
| if (n & 4) { | |||
| size_t vl = 4; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset2, v0, vl); | |||
| aoffset1 += 4; | |||
| //boffset2 += 4; | |||
| } | |||
| if (n & 2) { | |||
| size_t vl = 2; | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset3, v0, vl); | |||
| aoffset1 += 2; | |||
| // boffset3 += 2; | |||
| } | |||
| if (n & 1) { | |||
| *(boffset4) = *(aoffset1); | |||
| // aoffset1 ++; | |||
| // boffset4 ++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,74 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1; | |||
| IFLOAT *boffset; | |||
| FLOAT_V_T v0; | |||
| size_t vl; | |||
| //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); | |||
| aoffset = a; | |||
| boffset = b; | |||
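| // Walks n in chunks of vl; within each chunk, m unit-stride loads of vl elements | |||
| // (the source advancing by lda each time) are appended to b. | |||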
| for(j = n; j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| aoffset1 = aoffset; | |||
| aoffset += vl; | |||
| for(i = m; i > 0; i--) { | |||
| v0 = VLEV_FLOAT(aoffset1, vl); | |||
| VSEV_FLOAT(boffset, v0, vl); | |||
| aoffset1 += lda; | |||
| boffset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,601 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 | |||
| #endif | |||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; | |||
| IFLOAT *ptrba,*ptrbb; | |||
| //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug | |||
| FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; | |||
| size_t vl; | |||
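| // Computes C += alpha * A*B from the packed buffers ba and bb. Columns of C are | |||
| // handled 8/4/2/1 at a time and rows in chunks of vl; vres0..vres7 accumulate one | |||
| // partial column each. The reference (non-unrolled) K loop is kept under '#if 0'. | |||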
| // N:8 | |||
| for (j = bn/8; j > 0; j--) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| C4 = C3 + ldc; | |||
| C5 = C4 + ldc; | |||
| C6 = C5 + ldc; | |||
| C7 = C6 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| vres4 = VFMVVF_FLOAT(0.0, vl); | |||
| vres5 = VFMVVF_FLOAT(0.0, vl); | |||
| vres6 = VFMVVF_FLOAT(0.0, vl); | |||
| vres7 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 8; | |||
| } | |||
| #else | |||
| // Unroll K | |||
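| // K is unrolled by 8; each A vector is loaded one step ahead of the vfmacc chain | |||
| // that consumes it, which helps hide the load latency. | |||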
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); | |||
| ptrbb += 8; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); | |||
| ptrbb += 8; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); | |||
| ptrbb += 8; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); | |||
| ptrbb += 8; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); | |||
| ptrbb += 8; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); | |||
| ptrbb += 8; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); | |||
| ptrbb += 8; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); | |||
| vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); | |||
| vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); | |||
| vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); | |||
| ptrbb += 8; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
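| // Merge the accumulators into C: for each column j, C[:,j] += alpha * vres_j. | |||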
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VLEV_FLOAT(C2, vl); | |||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VLEV_FLOAT(C3, vl); | |||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| va4 = VLEV_FLOAT(C4, vl); | |||
| va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); | |||
| VSEV_FLOAT(C4, va4, vl); | |||
| va5 = VLEV_FLOAT(C5, vl); | |||
| va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); | |||
| VSEV_FLOAT(C5, va5, vl); | |||
| va6 = VLEV_FLOAT(C6, vl); | |||
| va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); | |||
| VSEV_FLOAT(C6, va6, vl); | |||
| va7 = VLEV_FLOAT(C7, vl); | |||
| va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); | |||
| VSEV_FLOAT(C7, va7, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| C4 += vl; | |||
| C5 += vl; | |||
| C6 += vl; | |||
| C7 += vl; | |||
| } | |||
| bb += (bk<<3); | |||
| C += (ldc<<3); | |||
| } | |||
| // N:4 | |||
| if (bn & 4) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| vres2 = VFMVVF_FLOAT(0.0, vl); | |||
| vres3 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 4; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); | |||
| ptrbb += 4; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); | |||
| ptrbb += 4; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); | |||
| ptrbb += 4; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); | |||
| ptrbb += 4; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); | |||
| ptrbb += 4; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); | |||
| ptrbb += 4; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); | |||
| ptrbb += 4; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); | |||
| vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); | |||
| ptrbb += 4; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| va2 = VLEV_FLOAT(C2, vl); | |||
| va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); | |||
| VSEV_FLOAT(C2, va2, vl); | |||
| va3 = VLEV_FLOAT(C3, vl); | |||
| va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); | |||
| VSEV_FLOAT(C3, va3, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| C2 += vl; | |||
| C3 += vl; | |||
| } | |||
| bb += (bk<<2); | |||
| C += (ldc<<2); | |||
| } | |||
| // N:2 | |||
| if (bn & 2) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| vres1 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 2; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); | |||
| ptrbb += 2; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); | |||
| ptrbb += 2; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); | |||
| ptrbb += 2; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); | |||
| ptrbb += 2; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); | |||
| ptrbb += 2; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); | |||
| ptrbb += 2; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); | |||
| ptrbb += 2; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); | |||
| ptrbb += 2; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| va1 = VLEV_FLOAT(C1, vl); | |||
| va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); | |||
| VSEV_FLOAT(C1, va1, vl); | |||
| C0 += vl; | |||
| C1 += vl; | |||
| } | |||
| bb += (bk<<1); | |||
| C += (ldc<<1); | |||
| } | |||
| // N:1 | |||
| if (bn & 1) { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i = bm; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vl); | |||
| #if 0 | |||
| for (k = bk; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += 1; | |||
| } | |||
| #else | |||
| // Unroll K | |||
| for (k = bk/8; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| va1 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| va2 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); | |||
| ptrbb += 1; | |||
| va3 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); | |||
| ptrbb += 1; | |||
| va4 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); | |||
| ptrbb += 1; | |||
| va5 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); | |||
| ptrbb += 1; | |||
| va6 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); | |||
| ptrbb += 1; | |||
| va7 = VLEV_FLOAT(ptrba, vl); | |||
| ptrba += vl; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); | |||
| ptrbb += 1; | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); | |||
| ptrbb += 1; | |||
| } | |||
| // K remainder | |||
| for (k = bk&7; k > 0; k--) { | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); | |||
| ptrbb += 1; | |||
| ptrba += vl; | |||
| } | |||
| #endif | |||
| va0 = VLEV_FLOAT(C0, vl); | |||
| va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); | |||
| VSEV_FLOAT(C0, va0, vl); | |||
| C0 += vl; | |||
| } | |||
| bb += (bk); | |||
| C += (ldc); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,94 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| if(n < 0) return(0); | |||
| FLOAT *a_ptr, *x_ptr; | |||
| BLASLONG i; | |||
| FLOAT_V_T va, vy; | |||
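| // y += alpha * A * x: vl elements of y are updated per pass, and each of the n | |||
| // columns of A is folded in with a vector-scalar fused multiply-add using the | |||
| // scalar alpha * x[j]. | |||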
| if(inc_y == 1) { | |||
| for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { | |||
| vl = VSETVL(m); | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vy = VLEV_FLOAT(y, vl); | |||
| for(i = 0; i < n; i++) { | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| VSEV_FLOAT(y, vy, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { | |||
| vl = VSETVL(m); | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| for(i = 0; i < n; i++) { | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| VSSEV_FLOAT(y, stride_y, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -0,0 +1,118 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_ptr, *x_ptr; | |||
| FLOAT_V_T va, vx, vr; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vlmax = VSETVL_MAX_M1; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
| vlmax = VSETVL_MAX; | |||
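| // y += alpha * A^T * x: every column of A is dotted with x. Partial products | |||
| // accumulate in vr (tail-undisturbed, so shorter final chunks leave the upper | |||
| // lanes intact) and a single vfredusum reduction yields the scalar added to *y. | |||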
| if(inc_x == 1) { | |||
| for(i = 0; i < n; i++) { | |||
| j = m; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { | |||
| vl = VSETVL(j); | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLEV_FLOAT(x_ptr, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| y += inc_y; | |||
| a += lda; | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(i = 0; i < n; i++) { | |||
| j = m; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { | |||
| vl = VSETVL(j); | |||
| va = VLEV_FLOAT(a_ptr, vl); | |||
| vx = VLSEV_FLOAT(x_ptr, stride_x, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| *y += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| y += inc_y; | |||
| a += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,107 +27,110 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) | |||
| #define xint_t int | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) | |||
| #define xint_t long long | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT *a_ptr = a; | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT temp; | |||
| FLOAT_V_T va, vr, vx; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| if(inc_x == 1){ | |||
| for(i = 0; i < n; i++){ | |||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||
| gvl = VSETVL(m); | |||
| j = 0; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < m/gvl; k++){ | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); // we could vfmacc here and reduce outside the loop, | |||
| v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from the scalar path to make tests fail | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp = (FLOAT)VFMVFS_FLOAT(v_res); | |||
| if(j < m){ | |||
| gvl = VSETVL(m-j); | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp += (FLOAT)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_res, gvl); | |||
| } | |||
| temp = (FLOAT)EXTRACT_FLOAT(v_res); | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| }else{ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for(i = 0; i < n; i++){ | |||
| v_res = VFMVVF_FLOAT_M1(0, 1); | |||
| gvl = VSETVL(m); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| j = 0; | |||
| ix = 0; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < m/gvl; k++){ | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| v_res = VFREDSUM_FLOAT(vr, v_res, gvl); | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| ix += inc_x * gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp = (FLOAT)VFMVFS_FLOAT(v_res); | |||
| if(j < m){ | |||
| gvl = VSETVL(m-j); | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp += (FLOAT)VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_res, gvl); | |||
| } | |||
| temp = (FLOAT)EXTRACT_FLOAT(v_res); | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,673 @@ | |||
| #!/usr/bin/python3 | |||
| import sys, os | |||
| import contextlib | |||
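| # Generator for RVV gemm/trmm kernels. Options are passed as name=value pairs | |||
| # and must match the keys of the `settings` dict in main(), for example | |||
| # (illustrative invocation - the script name is whatever this file is saved as): | |||
| #   python3 gemm_kernel_generator.py op=gemm M=8 N=4 reg_width_bits=128 param_precision=double output=- | |||
| # output=- writes the generated kernel to stdout; otherwise the output file name | |||
| # is derived from the precision, op, M, N and cpu settings. | |||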
| #----------------------------------------------------------------------- | |||
| def ERROR(*args, **kwargs): | |||
| print(*args, file=sys.stderr, **kwargs) | |||
| sys.exit(-1) | |||
| class Target(object): | |||
| def __init__( self, out, mappings, initial_level=0, tab_width=4 ): | |||
| self._level = initial_level | |||
| self._tab_width = tab_width | |||
| self._out = out | |||
| self._mappings = mappings | |||
| @contextlib.contextmanager | |||
| def map( self, **items ): | |||
| old_mappings = self._mappings | |||
| self._mappings = dict(old_mappings, **items) | |||
| yield self._mappings | |||
| self._mappings = old_mappings | |||
| @contextlib.contextmanager | |||
| def block( self, start=None, end=None, **args ): | |||
| with self.map(**args): | |||
| if start is not None: | |||
| self.write() | |||
| self.write(start) | |||
| self._level += 1 | |||
| yield self._level | |||
| self._level -= 1 | |||
| if end is not None: | |||
| self.write(end) | |||
| self.write() | |||
| def write( self, fmt=None, *args, **kwargs ): | |||
| if fmt is not None: | |||
| mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings | |||
| self._out(self._indent_str() + fmt.format(*args, **mappings)) | |||
| else: | |||
| self._out("") | |||
| def _indent_str( self ): | |||
| return ' ' * (self._level * self._tab_width) | |||
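| # Illustrative use of the Target emitter above (the names in this sketch are | |||
| # made up for the example, not taken from the generator): | |||
| #   dest = Target(print, {'index_type': 'BLASLONG'}) | |||
| #   with dest.block("for({index_type} i=0; i<n; i++) {{", "}}"): | |||
| #       dest.write("sum += x[i];") | |||
| # emits the loop with "{index_type}" substituted and the body indented one level. | |||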
| #----------------------------------------------------------------------- | |||
| def generate_trmm_block( dest ): | |||
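| # Emit the TRMM prologue for the current tile: compute the diagonal offset and, | |||
| # depending on LEFT/BACKWARDS, advance ai/bi and/or trim pass_K so the k-loop | |||
| # only touches the referenced triangular part of the operands. | |||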
| dest.write("{index_type} pass_K = K;") | |||
| dest.write("#ifdef LEFT") | |||
| with dest.block(): | |||
| dest.write("{index_type} off = offset + m_top;") | |||
| dest.write("#else") | |||
| with dest.block(): | |||
| dest.write("{index_type} off = -offset + n_top;") | |||
| dest.write("#endif") | |||
| dest.write("#ifdef BACKWARDS") | |||
| with dest.block(): | |||
| dest.write("ai += off*{M}{elt_size};") | |||
| dest.write("bi += off*{N}{elt_size};") | |||
| dest.write("pass_K -= off;") | |||
| dest.write("#else") | |||
| with dest.block(): | |||
| dest.write("#ifdef LEFT") | |||
| with dest.block(): | |||
| dest.write("pass_K = off + {M};") | |||
| dest.write("#else") | |||
| with dest.block(): | |||
| dest.write("pass_K = off + {N};") | |||
| dest.write("#endif") | |||
| dest.write("#endif") | |||
| #----------------------------------------------------------------------- | |||
| def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ): | |||
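| # Emits the body of one MxN tile: load N scalars of B and a_regs vectors of A, | |||
| # accumulate the N*a_regs products over k with vfmacc, then scale by alpha and | |||
| # either store (trmm) or accumulate into (gemm) the corresponding block of C. | |||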
| TRMM = (settings['op'].value == 'trmm') | |||
| narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value | |||
| with dest.map( | |||
| M=M, | |||
| N=N, | |||
| ): | |||
| dest.write("{index_type} ai=m_top*K{elt_size};") | |||
| dest.write("{index_type} bi=n_top*K{elt_size};") | |||
| if TRMM: | |||
| generate_trmm_block( dest ) | |||
| for i in range(N): | |||
| dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i) | |||
| dest.write("bi += {N};") | |||
| dest.write() | |||
| for i in range(a_regs): | |||
| dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) | |||
| dest.write("ai += {M};") | |||
| dest.write() | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j) | |||
| with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): | |||
| for i in range(N): | |||
| dest.write("B{i} = B[bi+{i}];", i=i ) | |||
| dest.write("bi += {N};") | |||
| dest.write() | |||
| for i in range(a_regs): | |||
| dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) | |||
| dest.write("ai += {M};") | |||
| dest.write() | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i ) | |||
| dest.write() | |||
| dest.write("{index_type} ci=n_top*ldc+m_top;") | |||
| dest.write() | |||
| if narrow_result: | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i) | |||
| if not TRMM: | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| idx = j*a_regs+i | |||
| increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||
| if idx == N*a_regs-1: | |||
| increment = '' | |||
| dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment) | |||
| if narrow_result: | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| idx = j*a_regs+i | |||
| if TRMM: | |||
| dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx) | |||
| else: | |||
| dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx) | |||
| else: | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| idx = j*a_regs+i | |||
| if TRMM: | |||
| dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx) | |||
| else: | |||
| dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx) | |||
| if not TRMM: | |||
| dest.write() | |||
| dest.write("ci=n_top*ldc+m_top;") | |||
| dest.write() | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| idx = j*a_regs+i | |||
| increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||
| if idx == N*a_regs-1: | |||
| increment = '' | |||
| dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment) | |||
| #----------------------------------------------------------------------- | |||
| def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): | |||
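| # Complex variant of the tile body: real/imaginary parts of A are loaded with | |||
| # strided loads, partial products are formed in a small pool of temporaries | |||
| # before being added to the accumulators (see the precision note in the k-loop), | |||
| # and the VFMACC_RR/VFMACC_RI macros defined in the preamble select the signs | |||
| # required by the conjugation/transpose variant. | |||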
| TRMM = (settings['op'].value == 'trmm') | |||
| narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value | |||
| if narrow_result: | |||
| raise RuntimeError("wide accumulator not supported for generated complex kernels") | |||
| # we could support this, but we would run out of registers very quickly | |||
| with dest.map( | |||
| M=M, | |||
| N=N, | |||
| ): | |||
| dest.write("{index_type} ai=m_top*K*2;") | |||
| dest.write("{index_type} bi=n_top*K*2;") | |||
| if TRMM: | |||
| generate_trmm_block( dest ) | |||
| for i in range(N): | |||
| dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i) | |||
| dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i) | |||
| dest.write("bi += {N}*2;") | |||
| dest.write() | |||
| for i in range(a_regs): | |||
| dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) | |||
| dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) | |||
| dest.write("ai += {M}*2;") | |||
| dest.write() | |||
| # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results | |||
| accumulation_regs = a_regs * N | |||
| dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", | |||
| a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 | |||
| ) | |||
| pass_regs = (accumulation_regs + a_regs)*2 | |||
| tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs | |||
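| # e.g. with a_regs=2, N=4 and LMUL_ACC=1 (illustrative numbers): pass_regs = (8+2)*2 = 20, | |||
| # leaving tmp_regs = 32 - 20 = 12 registers for temporaries | |||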
| if tmp_regs < 2: | |||
| raise RuntimeError("Complex kernel would use too many registers!") | |||
| dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs) | |||
| tmp_unroll_i = min(tmp_regs, a_regs) | |||
| tmp_unroll_j = N | |||
| while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: | |||
| tmp_unroll_j = int(tmp_unroll_j / 2) | |||
| if tmp_unroll_i < a_regs or tmp_unroll_j < N: | |||
| dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) | |||
| for tj in range(0, N, tmp_unroll_j): | |||
| for ti in range(0, a_regs, tmp_unroll_i): | |||
| for j in range(tj, tj+tmp_unroll_j): | |||
| for i in range(ti, ti+tmp_unroll_i): | |||
| with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||
| if ti == 0 and tj==0: | |||
| dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") | |||
| dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") | |||
| else: | |||
| dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") | |||
| dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") | |||
| for j in range(tj, tj+tmp_unroll_j): | |||
| for i in range(ti, ti+tmp_unroll_i): | |||
| with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||
| dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") | |||
| dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") | |||
| for j in range(tj, tj+tmp_unroll_j): | |||
| for i in range(ti, ti+tmp_unroll_i): | |||
| with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||
| dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") | |||
| dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") | |||
| with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): | |||
| for i in range(N): | |||
| dest.write("B{i}r = B[bi+{i}*2+0];", i=i) | |||
| dest.write("B{i}i = B[bi+{i}*2+1];", i=i) | |||
| dest.write("bi += {N}*2;") | |||
| dest.write() | |||
| for i in range(a_regs): | |||
| dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) | |||
| dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) | |||
| dest.write("ai += {M}*2;") | |||
| dest.write() | |||
| for tj in range(0, N, tmp_unroll_j): | |||
| for ti in range(0, a_regs, tmp_unroll_i): | |||
| # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign | |||
| # so accumulating them directly to ACC would lose precision when ACC is larger | |||
| for j in range(tj, tj+tmp_unroll_j): | |||
| for i in range(ti, ti+tmp_unroll_i): | |||
| with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||
| dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") | |||
| dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") | |||
| for j in range(tj, tj+tmp_unroll_j): | |||
| for i in range(ti, ti+tmp_unroll_i): | |||
| with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||
| dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") | |||
| dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") | |||
| for j in range(tj, tj+tmp_unroll_j): | |||
| for i in range(ti, ti+tmp_unroll_i): | |||
| with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): | |||
| dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") | |||
| dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") | |||
| dest.write() | |||
| dest.write("{index_type} ci=n_top*ldc+m_top;") | |||
| dest.write() | |||
| for j in range(N): | |||
| if TRMM: | |||
| for i in range(a_regs): | |||
| with dest.map(idx=j*a_regs+i): | |||
| dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") | |||
| dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") | |||
| else: | |||
| for i in range(a_regs): | |||
| idx = j*a_regs+i | |||
| increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||
| if idx == N*a_regs-1: | |||
| increment = '' | |||
| with dest.map(idx=j*a_regs+i, increment=increment): | |||
| dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") | |||
| dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") | |||
| dest.write("{increment}") | |||
| if not TRMM: | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| with dest.map(idx=j*a_regs+i): | |||
| dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") | |||
| dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| with dest.map(idx=j*a_regs+i): | |||
| dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") | |||
| dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") | |||
| if not TRMM: | |||
| dest.write() | |||
| dest.write("ci=n_top*ldc+m_top;") | |||
| dest.write() | |||
| for j in range(N): | |||
| for i in range(a_regs): | |||
| idx = j*a_regs+i | |||
| increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' | |||
| if idx == N*a_regs-1: | |||
| increment = '' | |||
| with dest.map(idx=j*a_regs+i, increment=increment): | |||
| dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") | |||
| dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") | |||
| dest.write("{increment}") | |||
| #----------------------------------------------------------------------- | |||
| def generate_gemm_kernel( settings, OUTPUT ): | |||
| if settings['conjugate'].value: | |||
| ERROR('conjugate gemm not yet supported') | |||
| is_complex = settings['complex'].value | |||
| generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real | |||
| dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) | |||
| M = settings['M'].value | |||
| N = settings['N'].value | |||
| vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / | |||
| settings['ELEN_PARAM'].value) | |||
| a_regs = max(int(M/vlenmax), 1) | |||
| # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results | |||
| accumulation_regs = a_regs * N | |||
| required_regs = accumulation_regs + a_regs | |||
| if is_complex: | |||
| required_regs = required_regs * 2 + 2 | |||
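| # e.g. with the defaults (M=16, N=4, reg_width_bits=256, float, LMUL=1): vlenmax=8, | |||
| # a_regs=2, accumulation_regs=8, so required_regs=10 (or 22 if complex) of the 32 available | |||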
| dest.write(''' | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define S0 1 | |||
| #define S1 -1 | |||
| #define S2 1 | |||
| #define S3 1 | |||
| #define VFMACC_RR __riscv_vfmsac{tail_policy} | |||
| #define VFMACC_RI __riscv_vfmacc{tail_policy} | |||
| #endif | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define S0 1 | |||
| #define S1 1 | |||
| #define S2 1 | |||
| #define S3 -1 | |||
| #define VFMACC_RR __riscv_vfmacc{tail_policy} | |||
| #define VFMACC_RI __riscv_vfmsac{tail_policy} | |||
| #endif | |||
| #if defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define S0 1 | |||
| #define S1 1 | |||
| #define S2 -1 | |||
| #define S3 1 | |||
| #define VFMACC_RR __riscv_vfmacc{tail_policy} | |||
| #define VFMACC_RI __riscv_vfnmsac{tail_policy} | |||
| #endif | |||
| #if defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| #define S0 1 | |||
| #define S1 -1 | |||
| #define S2 -1 | |||
| #define S3 -1 | |||
| #define VFMACC_RR __riscv_vfmsac{tail_policy} | |||
| #define VFMACC_RI __riscv_vfnmacc{tail_policy} | |||
| #endif | |||
| '''.format(tail_policy=settings['tail_policy'].value)) | |||
| if required_regs > (32 // settings['LMUL_ACC'].value): | |||
| raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( | |||
| required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value | |||
| )) | |||
| TRMM = (settings['op'].value == 'trmm') | |||
| if TRMM: | |||
| with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): | |||
| dest.write("#define BACKWARDS") | |||
| dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", | |||
| alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), | |||
| trmm = (', BLASLONG offset' if TRMM else '') | |||
| ) | |||
| with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): | |||
| if settings['trace'].value: | |||
| dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") | |||
| dest.write("{index_type} gvl = 0;") | |||
| dest.write("{index_type} m_top = 0;") | |||
| dest.write("{index_type} n_top = 0;") | |||
| dest.write() | |||
| dest.write() | |||
| dest.write("// -- MAIN PASS") | |||
| with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"): | |||
| dest.write("m_top = 0;") | |||
| dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) | |||
| dest.write() | |||
| with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"): | |||
| generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs ) | |||
| dest.write( "m_top += {M};" ) | |||
| dest.write() | |||
| dest.write() | |||
| dest.write("// -- tails for main pass") | |||
| generate_M_tails( dest, settings, M, N ) | |||
| dest.write( "n_top += {N};" ) | |||
| N_tail = int(N/2) | |||
| while( N_tail > 0 ): | |||
| with dest.map(N=N_tail): | |||
| dest.write() | |||
| dest.write() | |||
| dest.write("// -- tails for N={N}") | |||
| with dest.block("if( N & {N} ) {{", "}}" ): | |||
| if settings['trace'].value: | |||
| dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") | |||
| dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) | |||
| dest.write("m_top = 0;") | |||
| with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"): | |||
| generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs ) | |||
| dest.write("m_top += {M};") | |||
| generate_M_tails( dest, settings, M, N_tail ) | |||
| dest.write("n_top += {N};") | |||
| N_tail = int(N_tail/2) | |||
| dest.write("return 0;"); | |||
| #----------------------------------------------------------------------- | |||
| def generate_M_tails( dest, settings, M, N ): | |||
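| # Emits the M-remainder handling: keep halving the tile height while it stays | |||
| # above M_tail_scalar_from (reusing the vector tile body with a smaller gvl), | |||
| # then finish the last few rows with a plain scalar loop. | |||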
| M_tail = int(M/2) | |||
| M_tail_min = settings['M_tail_scalar_from'].value | |||
| vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value | |||
| / settings['ELEN_PARAM'].value ) | |||
| TRMM = (settings['op'].value == 'trmm') | |||
| is_complex = settings['complex'].value | |||
| generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real | |||
| while( M_tail > M_tail_min ): | |||
| with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): | |||
| if settings['trace'].value: | |||
| dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") | |||
| a_regs = max( 1, int(M_tail/vlenmax) ) | |||
| vlen = int(M_tail/a_regs) | |||
| dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) | |||
| generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) | |||
| dest.write( "m_top += {M_tail};" ) | |||
| M_tail = int( M_tail / 2 ) | |||
| while( M_tail > 0 ): | |||
| with dest.block("if( M & {M_tail} ) {{", "}}", | |||
| M_tail=M_tail, | |||
| N=N, | |||
| result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) | |||
| ): | |||
| if settings['trace'].value: | |||
| dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") | |||
| for r in range(M_tail * N * (2 if is_complex else 1)): | |||
| dest.write("{result_t} result{r} = 0;", | |||
| r=r | |||
| ) | |||
| dest.write("{index_type} ai=m_top*K{elt_size};") | |||
| dest.write("{index_type} bi=n_top*K{elt_size};") | |||
| if TRMM: | |||
| with dest.map(M=M_tail, N=N): | |||
| generate_trmm_block( dest ) | |||
| with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): | |||
| for ki in range( N ): | |||
| for kj in range( M_tail ): | |||
| if is_complex: | |||
| dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( | |||
| dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 | |||
| )) | |||
| dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( | |||
| dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2 | |||
| )) | |||
| else: | |||
| dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( | |||
| dest=ki*M_tail+kj, kj=kj, ki=ki | |||
| )) | |||
| dest.write("ai+={M_tail}{elt_size};") | |||
| dest.write("bi+={N}{elt_size};") | |||
| dest.write("{index_type} ci=n_top*ldc+m_top;") | |||
| if is_complex: | |||
| dest.write("{result_t} Cr, Ci;") | |||
| for ki in range( N ): | |||
| for kj in range( M_tail ): | |||
| if is_complex: | |||
| if TRMM: | |||
| dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) | |||
| dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) | |||
| else: | |||
| dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) | |||
| dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) | |||
| dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) | |||
| dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) | |||
| dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) | |||
| dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) | |||
| dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) | |||
| dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) | |||
| else: | |||
| op = '' if TRMM else '+' | |||
| dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", | |||
| ki=ki, kj=kj, op=op, dest=ki*M_tail+kj | |||
| ) | |||
| dest.write("m_top+={M_tail};") | |||
| M_tail = int(M_tail/2) | |||
| #----------------------------------------------------------------------- | |||
| class Setting(object): | |||
| def __init__( self, value, convert = None ): | |||
| self._value = value | |||
| self._convert = convert | |||
| @classmethod | |||
| def ENUM( cls, *values ): | |||
| def closure( values ): | |||
| return lambda value: values[value.lower()] | |||
| return closure( { v.lower():v for v in values } ) | |||
| @classmethod | |||
| def BOOL( cls, value ): | |||
| return value.lower().startswith('t') or value == '1' | |||
| @property | |||
| def value( self ): | |||
| return self._value | |||
| @property | |||
| def configurable( self ): | |||
| return self._convert is not None | |||
| @value.setter | |||
| def value( self, value ): | |||
| self._value = self._convert( value ) | |||
| def __str__( self ): | |||
| return str(self._value) | |||
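| # A Setting constructed with a converter is user-configurable from the command | |||
| # line; one without a converter is a derived value, reported separately in the | |||
| # header comment of the generated file. | |||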
| #----------------------------------------------------------------------- | |||
| def main(): | |||
| settings = { | |||
| 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), | |||
| 'M': Setting( 16, int ), | |||
| 'N': Setting( 4, int ), | |||
| 'reg_width_bits': Setting( 256, int ), | |||
| 'LMUL': Setting( 1, int ), | |||
| 'M_tail_scalar_from': Setting( 2, int ), | |||
| 'cpu': Setting( 'zvl256b', str ), | |||
| 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), | |||
| 'force_acc_double': Setting( False, Setting.BOOL ), | |||
| 'complex': Setting( False, Setting.BOOL ), | |||
| 'conjugate': Setting( False, Setting.BOOL ), | |||
| 'index_type': Setting( 'BLASLONG', str ), | |||
| 'trace': Setting( False, Setting.BOOL ), | |||
| 'output': Setting( None, str ), | |||
| 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it | |||
| '__riscv_': Setting( '__riscv_', str), | |||
| } | |||
| for item in sys.argv[1:]: | |||
| try: | |||
| name, value = tuple(item.split( '=', 1 )) | |||
| except: | |||
| ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) | |||
| if name not in settings: | |||
| ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) | |||
| +"options (and current defaults) are\n{}".format( | |||
| " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) | |||
| ) | |||
| try: | |||
| settings[name].value = value | |||
| except: | |||
| import traceback | |||
| traceback.print_exc() | |||
| ERROR("couldn't parse {}".format(item)) | |||
| if settings['output'].value is None: | |||
| if settings['complex'].value: | |||
| prefix = 'z' if settings['param_precision'].value == 'double' else 'c' | |||
| else: | |||
| prefix = 'd' if settings['param_precision'].value == 'double' else 's' | |||
| settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( | |||
| prefix, | |||
| settings['op'], | |||
| settings['M'], | |||
| settings['N'], | |||
| settings['cpu'] | |||
| )) | |||
| if settings['param_precision'].value == 'double': | |||
| settings['param_scalar_t'] = Setting( 'double' ) | |||
| settings['ELEN_PARAM'] = Setting(64) | |||
| else: | |||
| settings['param_scalar_t'] = Setting( 'float' ) | |||
| settings['ELEN_PARAM'] = Setting(32) | |||
| settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) | |||
| settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) | |||
| settings['ELEN_ACC'] = settings['ELEN_PARAM'] | |||
| settings['LMUL_ACC'] = Setting(settings['LMUL'].value) | |||
| widen = '' | |||
| if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): | |||
| settings['ELEN_ACC'] = Setting(64) | |||
| settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) | |||
| settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) | |||
| widen = 'w' | |||
| settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) | |||
| settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) | |||
| settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) | |||
| settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) | |||
| settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||
| settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||
| settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||
| settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) | |||
| settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) | |||
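| # With the float defaults these expand to intrinsic names such as | |||
| # '__riscv_vle32_v_f32m1' and '__riscv_vsetvl_e32m1'. | |||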
| to_stdout = (settings['output'].value == '-') | |||
| if not to_stdout: | |||
| print("Writing {}".format(settings['output'].value), file=sys.stderr) | |||
| with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: | |||
| def OUTPUT(*args, **kwargs): | |||
| print(*args, file=destination_file, **kwargs) | |||
| OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) | |||
| OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) | |||
| OUTPUT('#include "common.h"') | |||
| OUTPUT("\n") | |||
| if settings['op'].value in ('gemm', 'trmm'): | |||
| generate_gemm_kernel(settings, OUTPUT) | |||
| else: | |||
| ERROR("unsupported kernel type {}".format(settings['op'])) | |||
| if __name__ == "__main__": | |||
| main() | |||
| @@ -0,0 +1,149 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 | |||
| #define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 | |||
| #define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
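| // Track a per-lane running maximum of |x| in v_max and, via the masked | |||
| // (tail-undisturbed) vid/vadd, the index at which each lane last improved | |||
| // in v_max_index; the global winner is resolved after the loop. | |||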
| unsigned int max_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT_V_T vx, v_max; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_max_index = VMVVX_UINT(0, vlmax); | |||
| v_max = VFMVVF_FLOAT(-1, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT maxf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
| v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max | |||
| v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
| v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max | |||
| v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); | |||
| } | |||
| } | |||
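| // Reduce the per-lane maxima to a single scalar, find the first lane that | |||
| // attained it, and slide that lane's saved index down to element 0 so it can | |||
| // be extracted. | |||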
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); | |||
| max_index = VFIRSTM(mask, vlmax); | |||
| v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); | |||
| max_index = VMVVXS_UINT(v_max_index); | |||
| return(max_index+1); | |||
| } | |||
| @@ -27,127 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if defined(DOUBLE) | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4) | |||
| #endif | |||
| #define MASK_T vbool16_t | |||
| #define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4) | |||
| #define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b16) | |||
| #define UINT_V_T vuint64m4_t | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u64m4) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64) | |||
| #else | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4) | |||
| #endif | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4) | |||
| #define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b8) | |||
| #define UINT_V_T vuint32m4_t | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u32m4) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32) | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| FLOAT maxf=0.0; | |||
| #ifdef DOUBLE | |||
| BLASLONG max_index = 0; | |||
| #else | |||
| BLASLONG i=0, j=0; | |||
| unsigned int max_index = 0; | |||
| #endif | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT maxf=-FLT_MAX; | |||
| FLOAT_V_T vx, v_max; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||
| gvl = VSETVL(n); | |||
| UINT_V_T vid = VIDV_UINT(gvl); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-1, gvl); | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFABS_FLOAT(vx, gvl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| #ifdef DOUBLE | |||
| max_index = *((BLASLONG *)&v_max_index+max_index); | |||
| #else | |||
| max_index = *((unsigned int *)&v_max_index+max_index); | |||
| #endif | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_max = VLEV_FLOAT(&x[j], gvl); | |||
| v_max = VFABS_FLOAT(v_max, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||
| v_max_index = VADDVX_UINT(vid, j, gvl); | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| #ifdef DOUBLE | |||
| max_index = *((BLASLONG*)&v_max_index+max_index); | |||
| #else | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| #endif | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -155,56 +151,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x; | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-1, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFABS_FLOAT(vx, gvl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| #ifdef DOUBLE | |||
| max_index = *((BLASLONG*)&v_max_index+max_index); | |||
| #else | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| #endif | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| v_max = VFABS_FLOAT(v_max, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = VFMVFS_FLOAT(v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||
| v_max_index = VADDVX_UINT(vid, j, gvl); | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| #ifdef DOUBLE | |||
| max_index = *((BLASLONG*)&v_max_index+max_index); | |||
| #else | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| #endif | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| } | |||
| return(max_index+1); | |||
| return(max_index+1); | |||
| } | |||
| @@ -0,0 +1,150 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 | |||
| #define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m8 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 | |||
| #define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m8 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
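| // Same structure as the iamax kernel above, but tracking the per-lane minimum | |||
| // of |x| (seeded with FLT_MAX) and the index where each lane last improved, | |||
| // then reducing with vfredmin and extracting the winning lane's index. | |||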
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT_V_T vx, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_min_index = VMVVX_UINT(0, vlmax); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT minf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vx = VFABSV_FLOAT(vx, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, vlmax); | |||
| min_index = VFIRSTM(mask, vlmax); | |||
| v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); | |||
| min_index = VMVVXS_UINT(v_min_index); | |||
| return(min_index+1); | |||
| } | |||
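For reference, the kernel above implements the i?amin operation with RVV 1.0 intrinsics: it returns a 1-based index of an element with the smallest absolute value, carrying per-lane candidate indices in v_min_index and recovering the winning lane through the final reduction, mask, slidedown and vmv.x.s. A minimal scalar sketch of the same semantics (the function name and the double-only types are illustrative, not part of the kernel):

```c
#include <math.h>

/* Scalar reference for i?amin: 1-based index of the first element of x
 * (stride inc_x) with the smallest absolute value; 0 for invalid input. */
static long iamin_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    double best_val = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * inc_x]);
        if (v < best_val) { best_val = v; best = i; }
    }
    return best + 1;               /* BLAS index results are 1-based */
}
```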
| @@ -31,85 +31,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||
| #define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu | |||
| #define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) | |||
| #endif | |||
| #define MASK_T vbool8_t | |||
| #define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) | |||
| #define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b8) | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u64m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) | |||
| #else | |||
| #define ABS fabsf | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu | |||
| #define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) | |||
| #endif | |||
| #define MASK_T vbool4_t | |||
| #define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) | |||
| #define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b4) | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u32m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| FLOAT minf=FLT_MAX; | |||
| BLASLONG i=0, j=0; | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT minf=FLT_MAX; | |||
| FLOAT_V_T vx, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFABS_FLOAT(vx, gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| //mask where v_min is greater than the element (candidate new minimum) | |||
| mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||
| @@ -117,29 +125,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_min = VLEV_FLOAT(&x[j], gvl); | |||
| v_min = VFABS_FLOAT(v_min, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -151,12 +159,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFABS_FLOAT(vx, gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| //mask where v_min is greater than the element (candidate new minimum) | |||
| mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||
| @@ -165,33 +171,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| v_min = VFABS_FLOAT(v_min, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| } | |||
| return(min_index+1); | |||
| return(min_index+1); | |||
| } | |||
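The rewritten iamin_vector.c above (and the other *_vector.c kernels that follow) wraps every intrinsic in RISCV_RVV(...) so one source file can target both the pre-ratification v0.10 intrinsics (bare names) and the ratified v1.0 intrinsics (__riscv_ prefix); the small RISCV_0p10_INTRINSICS blocks handle the calls whose argument order also differs, such as the reductions that take the destination operand first in v0.10. The shim itself lives in the project's shared RISC-V header, so the following is only an illustrative sketch of what these kernels assume it does:

```c
/* Illustrative only: maps a bare intrinsic name onto the naming scheme of
 * the toolchain in use. The real definition ships in OpenBLAS's common
 * RISC-V header and may differ in detail. */
#ifdef RISCV_0p10_INTRINSICS
#define RISCV_RVV(x) x            /* v0.10 intrinsics use unprefixed names   */
#else
#define RISCV_RVV(x) __riscv_##x  /* v1.0 intrinsics carry a __riscv_ prefix */
#endif
```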
| @@ -0,0 +1,146 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 | |||
| #define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 | |||
| #define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int max_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT_V_T vx, v_max; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_max_index = VMVVX_UINT(0, vlmax); | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT maxf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
| v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, vl); | |||
| v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); | |||
| max_index = VFIRSTM(mask, vlmax); | |||
| v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); | |||
| max_index = VMVVXS_UINT(v_max_index); | |||
| return(max_index+1); | |||
| } | |||
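Unlike the iamin kernel earlier, this one takes no absolute value: it is the i?max variant, returning a 1-based index of a maximum signed element. The scalar reference below picks the first occurrence (names illustrative):

```c
/* Scalar reference for i?max: 1-based index of the first maximum element. */
static long imax_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    double best_val = x[0];
    for (long i = 1; i < n; i++) {
        if (x[i * inc_x] > best_val) { best_val = x[i * inc_x]; best = i; }
    }
    return best + 1;
}
```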
| @@ -31,68 +31,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) | |||
| #endif | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) | |||
| #define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b8) | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u64m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) | |||
| #else | |||
| #define ABS fabsf | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) | |||
| #endif | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) | |||
| #define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b4) | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u32m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG i=0, j=0; | |||
| unsigned int max_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT maxf=-FLT_MAX; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT maxf=-FLT_MAX; | |||
| FLOAT_V_T vx, v_max; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_min; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| @@ -104,32 +116,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_max = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| FLOAT cur_maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -145,37 +159,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| FLOAT cur_maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = *((unsigned int*)&v_max_index+max_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| } | |||
| return(max_index+1); | |||
| return(max_index+1); | |||
| } | |||
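A recurring change in these *_vector.c kernels is how the winning lane's index is read out: the old code reinterpreted the index vector through a pointer cast (*((unsigned int*)&v_max_index+max_index)), which does not work with the sizeless vector types of the current intrinsics, while the new code builds a mask of lanes that reach the reduced extremum, compresses the index vector with that mask, and reads element 0 with vmv.x.s. In scalar terms the idiom looks roughly like this (arrays stand in for vector registers, purely for illustration):

```c
#include <stddef.h>

/* Scalar picture of the vcompress + vmv.x.s idiom used above: keep, in
 * order, the indices of all lanes whose value reaches the reduced maximum,
 * then take the first one kept. */
static unsigned long first_matching_index(const double *lane_val,
                                          const unsigned long *lane_idx,
                                          size_t vl, double maxf)
{
    for (size_t i = 0; i < vl; i++)      /* vcompress preserves lane order      */
        if (lane_val[i] >= maxf)         /* mask: v_max >= reduced maximum      */
            return lane_idx[i];          /* vmv.x.s reads the first kept index  */
    return 0;                            /* not reached: maxf comes from a lane */
}
```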
| @@ -0,0 +1,146 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 | |||
| #define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u64m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u64m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 | |||
| #define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu | |||
| #define VFIRSTM __riscv_vfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u32m8 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u32m8 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT_V_T vx, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_min_index = VMVVX_UINT(0, vlmax); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT minf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| // index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, vlmax); | |||
| min_index = VFIRSTM(mask, vlmax); | |||
| v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); | |||
| min_index = VMVVXS_UINT(v_min_index); | |||
| return(min_index+1); | |||
| } | |||
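This kernel is the signed counterpart of the previous one: no absolute value, a min reduction, and a 1-based index of a smallest element. The same semantics in scalar form (names illustrative):

```c
/* Scalar reference for i?min: 1-based index of the first minimum element. */
static long imin_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    double best_val = x[0];
    for (long i = 1; i < n; i++) {
        if (x[i * inc_x] < best_val) { best_val = x[i * inc_x]; best = i; }
    }
    return best + 1;
}
```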
| @@ -31,122 +31,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl) | |||
| #define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) | |||
| #endif | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) | |||
| #define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b8) | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u64m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) | |||
| #else | |||
| #define ABS fabsf | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl) | |||
| #define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define VIDV_MASK_UINT __riscv_vid_v_u32m8_m | |||
| #define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) | |||
| #endif | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) | |||
| #define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b4) | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u32m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| FLOAT minf=FLT_MAX; | |||
| BLASLONG i=0, j=0; | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT minf=FLT_MAX; | |||
| FLOAT_V_T vx, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e64,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_min_index) | |||
| :"v"(mask), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e32,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_min_index) | |||
| :"v"(mask), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||
| //mask where v_min is greater than the element (candidate new minimum) | |||
| mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_min = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| if(cur_minf < minf){ | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| }else{ | |||
| @@ -159,59 +156,39 @@ asm volatile( | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e64,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_min_index) | |||
| :"v"(mask), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e32,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_min_index) | |||
| :"v"(mask), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||
| //mask where v_min is greater than the element (candidate new minimum) | |||
| mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = *((FLOAT*)&v_res); | |||
| if(cur_minf < minf){ | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = *((unsigned int*)&v_min_index+min_index); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| } | |||
| return(min_index+1); | |||
| return(min_index+1); | |||
| } | |||
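The tail blocks (j < n after the main loop) in imin_vector.c and its siblings all follow one pattern: reduce the leftover elements on their own, and only when their extremum strictly beats the running one, rebuild lane indices with vid/vadd offset by j and re-extract the winner. The net effect matches this scalar outline (a hypothetical helper, for illustration only):

```c
/* Scalar outline of the shared tail handling: the last (n % gvl) elements,
 * starting at position j, override the result only if they contain a
 * strictly smaller value than the minimum found so far. */
static void handle_tail_min(const double *x, long j, long n,
                            double *minf, long *min_index)
{
    for (long i = j; i < n; i++) {
        if (x[i] < *minf) { *minf = x[i]; *min_index = i; }
    }
}
```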
| @@ -0,0 +1,172 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m4() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define FLOAT_VX2_T vfloat64m4x2_t | |||
| #define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||
| #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 | |||
| #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 | |||
| #define MASK_T vbool16_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 | |||
| #define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu | |||
| #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 | |||
| #define VFIRSTM __riscv_vfirst_m_b16 | |||
| #define UINT_V_T vuint64m4_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u64m4 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u64m4 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u64m4 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m4() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define FLOAT_VX2_T vfloat32m4x2_t | |||
| #define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||
| #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 | |||
| #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 | |||
| #define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu | |||
| #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 | |||
| #define VFIRSTM __riscv_vfirst_m_b8 | |||
| #define UINT_V_T vuint32m4_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u32m4 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u32m4 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u32m4 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int max_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT_V_T vx0, vx1, v_max; | |||
| FLOAT_VX2_T vxx2; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_max_index = VMVVX_UINT(0, vlmax); | |||
| v_max = VFMVVF_FLOAT(-1, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT maxf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { | |||
| vl = VSETVL(n); | |||
| vxx2 = VLSEG_FLOAT(x, vl); | |||
| vx0 = VGET_VX2(vxx2, 0); | |||
| vx1 = VGET_VX2(vxx2, 1); | |||
| vx0 = VFABSV_FLOAT(vx0, vl); | |||
| vx1 = VFABSV_FLOAT(vx1, vl); | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx0, vl); | |||
| v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); | |||
| } | |||
| } | |||
| else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { | |||
| vl = VSETVL(n); | |||
| vxx2 = VLSSEG_FLOAT(x, stride_x, vl); | |||
| vx0 = VGET_VX2(vxx2, 0); | |||
| vx1 = VGET_VX2(vxx2, 1); | |||
| vx0 = VFABSV_FLOAT(vx0, vl); | |||
| vx1 = VFABSV_FLOAT(vx1, vl); | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx0, vl); | |||
| v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); | |||
| v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, vlmax); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); | |||
| max_index = VFIRSTM(mask, vlmax); | |||
| v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); | |||
| max_index = VMVVXS_UINT(v_max_index); | |||
| return(max_index+1); | |||
| } | |||
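For complex vectors the kernel above loads interleaved real/imaginary pairs with segment loads, ranks each element by |Re| + |Im| (the usual i?amax measure for complex BLAS), and returns a 1-based index. A scalar sketch over an interleaved array (the name and double-only type are illustrative):

```c
#include <math.h>

/* Scalar reference for izamax: 1-based index of the first complex element
 * (interleaved re/im pairs, stride inc_x in elements) maximizing |re|+|im|. */
static long izamax_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;
    long best = 0;
    double best_val = -1.0;
    for (long i = 0; i < n; i++) {
        const double *p = &x[2 * i * inc_x];
        double v = fabs(p[0]) + fabs(p[1]);
        if (v > best_val) { best_val = v; best = i; }
    }
    return best + 1;
}
```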
| @@ -27,241 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) | |||
| #endif | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) | |||
| #define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b8) | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VSEVU_UINT vse64_v_u64m8 | |||
| #define VSEVU_UINT RISCV_RVV(vse64_v_u64m8) | |||
| #define UINT_T long unsigned int | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u64m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) | |||
| #define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) | |||
| #else | |||
| #define ABS fabsf | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) | |||
| #endif | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) | |||
| #define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b4) | |||
| #define UINT_V_T vuint32m8_t | |||
| #define UINT_T unsigned int | |||
| #define VSEVU_UINT vse32_v_u32m8 | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u32m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) | |||
| #define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) | |||
| #endif | |||
| #define RVV_M RVV_M8 | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| FLOAT maxf=0.0; | |||
| BLASLONG i=0, j=0; | |||
| unsigned int max_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| if (n <= 0 || inc_x <= 0) return(max_index); | |||
| FLOAT maxf=-FLT_MAX; | |||
| FLOAT_V_T vx0, vx1, v_max; | |||
| FLOAT_V_T vx, vx2, v_max; | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask0, mask1; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||
| gvl = VSETVL(n); | |||
| UINT_T temp_uint[gvl]; | |||
| unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x * 2; | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-1, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = gvl * inc_x * 2; | |||
| BLASLONG ix = 0; | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||
| vx = VFABS_FLOAT(vx, gvl); | |||
| vx2 = VFABS_FLOAT(vx2, gvl); | |||
| vx = VFADDVV_FLOAT(vx, vx2, gvl); | |||
| //index where element is greater than v_max | |||
| mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e64,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_max_index) | |||
| :"v"(mask0), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e32,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_max_index) | |||
| :"v"(mask0), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| idx += inc_v; | |||
| } | |||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = VFMVFS_FLOAT(v_res); | |||
| mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask0,gvl); | |||
| VSEVU_UINT(temp_uint,v_max_index,gvl); | |||
| max_index = temp_uint[max_index]; | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_max = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = VFMVFS_FLOAT(v_res); | |||
| v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||
| v_max = VFABS_FLOAT(v_max, gvl); | |||
| vx2 = VFABS_FLOAT(vx2, gvl); | |||
| v_max = VFADDVV_FLOAT(v_max, vx2, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| FLOAT cur_maxf = EXTRACT_FLOAT(v_res); | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| v_max_index = VADDVX_UINT(v_max_index, j, gvl); | |||
| mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| max_index = VMFIRSTM(mask0,gvl); | |||
| VSEVU_UINT(temp_uint,v_max_index,gvl); | |||
| max_index = temp_uint[max_index]; | |||
| mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_max_index, mask, gvl); | |||
| max_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| return(max_index+1); | |||
| } | |||
| return(max_index+1); | |||
| } | |||
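| For orientation, the complex amax index kernel above computes the 1-based position of the element with the largest abs(Re) + abs(Im); the vector code keeps a per-lane running maximum together with the index it came from, then uses a compare-against-the-maximum mask to pull out the index that lane recorded. A minimal scalar sketch of the same contract follows; the name izamax_ref and the use of long in place of BLASLONG are illustrative assumptions, not OpenBLAS's generic kernel. | |||
| ```c | |||
| /* Scalar sketch of the contract: 1-based index of the first complex element | |||
|  * maximizing abs(Re) + abs(Im).  Illustrative only, not the OpenBLAS kernel. */ | |||
| #include <math.h> | |||
| | |||
| static long izamax_ref(long n, const double *x, long inc_x) | |||
| { | |||
|     if (n <= 0 || inc_x <= 0) return 0; | |||
|     long best = 0; | |||
|     double best_val = -1.0; | |||
|     for (long i = 0; i < n; i++) { | |||
|         const double *p = &x[2 * i * inc_x];   /* interleaved Re, Im pairs */ | |||
|         double v = fabs(p[0]) + fabs(p[1]); | |||
|         if (v > best_val) { best_val = v; best = i; } | |||
|     } | |||
|     return best + 1;                           /* BLAS indices are 1-based */ | |||
| } | |||
| ``` | |||
| Because the vector kernel tie-breaks per lane, equal values may yield a different (still maximizing) index than this strictly first-match loop. | |||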
| @@ -0,0 +1,171 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e64m4(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m4() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define FLOAT_VX2_T vfloat64m4x2_t | |||
| #define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 | |||
| #define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 | |||
| #define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 | |||
| #define MASK_T vbool16_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 | |||
| #define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f64m4 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu | |||
| #define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 | |||
| #define VFIRSTM __riscv_vfirst_m_b16 | |||
| #define UINT_V_T vuint64m4_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u64m4 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u64m4 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u64m4 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e32m4(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m4() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define FLOAT_VX2_T vfloat32m4x2_t | |||
| #define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 | |||
| #define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 | |||
| #define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 | |||
| #define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 | |||
| #define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFABSV_FLOAT __riscv_vfabs_v_f32m4 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu | |||
| #define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 | |||
| #define VFIRSTM __riscv_vfirst_m_b8 | |||
| #define UINT_V_T vuint32m4_t | |||
| #define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu | |||
| #define VIDV_UINT __riscv_vid_v_u32m4 | |||
| #define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu | |||
| #define VADDVX_UINT __riscv_vadd_vx_u32m4 | |||
| #define VMVVX_UINT __riscv_vmv_v_x_u32m4 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 | |||
| #define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT_V_T vx0, vx1, v_min; | |||
| FLOAT_VX2_T vxx2; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| size_t vlmax = VSETVL_MAX; | |||
| v_min_index = VMVVX_UINT(0, vlmax); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| BLASLONG j=0; | |||
| FLOAT minf=0.0; | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { | |||
| vl = VSETVL(n); | |||
| vxx2 = VLSEG_FLOAT(x, vl); | |||
| vx0 = VGET_VX2(vxx2, 0); | |||
| vx1 = VGET_VX2(vxx2, 1); | |||
| vx0 = VFABSV_FLOAT(vx0, vl); | |||
| vx1 = VFABSV_FLOAT(vx1, vl); | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
| // index where element is less than v_min | |||
| mask = VMFLTVV_FLOAT(vx0, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { | |||
| vl = VSETVL(n); | |||
| vxx2 = VLSSEG_FLOAT(x, stride_x, vl); | |||
| vx0 = VGET_VX2(vxx2, 0); | |||
| vx1 = VGET_VX2(vxx2, 1); | |||
| vx0 = VFABSV_FLOAT(vx0, vl); | |||
| vx1 = VFABSV_FLOAT(vx1, vl); | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, vl); | |||
| // index where element is less than v_min | |||
| mask = VMFLTVV_FLOAT(vx0, v_min, vl); | |||
| v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); | |||
| v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); | |||
| } | |||
| } | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, vlmax); | |||
| min_index = VFIRSTM(mask, vlmax); | |||
| v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); | |||
| min_index = VMVVXS_UINT(v_min_index); | |||
| return(min_index+1); | |||
| } | |||
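| The new kernel above leans on the tail-undisturbed (_tu/_tumu) intrinsic variants: when the last strip-mined chunk is shorter than the vector length, lanes beyond vl keep the minima and indices they accumulated earlier, so a single reduction at the end suffices. A rough scalar model of that bookkeeping, with an assumed fixed lane count VLMAX and the hypothetical name izamin_chunked: | |||
| ```c | |||
| /* Scalar model of the strip-mined argmin bookkeeping.  VLMAX is an assumed | |||
|  * constant; the real kernel obtains the chunk length from vsetvl and keeps | |||
|  * this state in vector registers. */ | |||
| #include <float.h> | |||
| #include <math.h> | |||
| | |||
| #define VLMAX 8                                  /* assumed number of lanes */ | |||
| | |||
| static long izamin_chunked(long n, const double *x, long inc_x) | |||
| { | |||
|     if (n <= 0 || inc_x <= 0) return 0; | |||
|     double   lane_min[VLMAX]; | |||
|     unsigned lane_idx[VLMAX]; | |||
|     for (int k = 0; k < VLMAX; k++) { lane_min[k] = DBL_MAX; lane_idx[k] = 0; } | |||
| | |||
|     for (long j = 0; j < n; ) { | |||
|         long vl = (n - j < VLMAX) ? n - j : VLMAX;    /* vsetvl analogue */ | |||
|         for (long k = 0; k < vl; k++) {               /* only active lanes change */ | |||
|             const double *p = &x[2 * (j + k) * inc_x]; | |||
|             double v = fabs(p[0]) + fabs(p[1]); | |||
|             if (v < lane_min[k]) {                    /* masked vid/vadd + vfmin_tu */ | |||
|                 lane_min[k] = v; | |||
|                 lane_idx[k] = (unsigned)(j + k); | |||
|             } | |||
|         }                                             /* lanes >= vl stay undisturbed */ | |||
|         j += vl; | |||
|     } | |||
| | |||
|     /* vfredmin, then locate the first lane holding that minimum (vmfle + vfirst) | |||
|      * and read out the index it recorded (vslidedown + vmv.x.s). */ | |||
|     double minf = DBL_MAX; | |||
|     for (int k = 0; k < VLMAX; k++) if (lane_min[k] < minf) minf = lane_min[k]; | |||
|     for (int k = 0; k < VLMAX; k++) | |||
|         if (lane_min[k] <= minf) return (long)lane_idx[k] + 1; | |||
|     return 0; | |||
| } | |||
| ``` | |||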
| @@ -31,235 +31,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) | |||
| #endif | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) | |||
| #define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b8) | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VSEVU_UINT vse64_v_u64m8 | |||
| #define UINT_T long unsigned int | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u64m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) | |||
| #define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) | |||
| #else | |||
| #define ABS fabsf | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) | |||
| #define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) | |||
| #else | |||
| #define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1) | |||
| #define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) | |||
| #define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) | |||
| #define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) | |||
| #endif | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) | |||
| #define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) | |||
| #define VMFIRSTM RISCV_RVV(vfirst_m_b4) | |||
| #define UINT_V_T vuint32m8_t | |||
| #define UINT_T unsigned int | |||
| #define VSEVU_UINT vse32_v_u32m8 | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) | |||
| #define VIDV_UINT RISCV_RVV(vid_v_u32m8) | |||
| #define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) | |||
| #define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) | |||
| #define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) | |||
| #define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) | |||
| #define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) | |||
| #endif | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| FLOAT minf=FLT_MAX; | |||
| BLASLONG i=0, j=0; | |||
| unsigned int min_index = 0; | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| if (n <= 0 || inc_x <= 0) return(min_index); | |||
| FLOAT minf=FLT_MAX; | |||
| FLOAT_V_T vx0, vx1, v_min; | |||
| FLOAT_V_T vx, vx2, v_min; | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask0, mask1; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||
| gvl = VSETVL(n); | |||
| UINT_T temp_uint[gvl]; | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x * 2; | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = gvl * inc_x * 2; | |||
| BLASLONG ix = 0; | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||
| vx = VFABS_FLOAT(vx, gvl); | |||
| vx2 = VFABS_FLOAT(vx2, gvl); | |||
| vx = VFADDVV_FLOAT(vx, vx2, gvl); | |||
| //index where element is less than v_min | |||
| mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e64,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_min_index) | |||
| :"v"(mask0), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1 \n\t" | |||
| "vsetvli x0, %2, e32,m8 \n\t" | |||
| "vid.v %0, v0.t \n\t" | |||
| :"+v"(v_min_index) | |||
| :"v"(mask0), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); | |||
| //index where element is less than v_min | |||
| mask = VMFGTVV_FLOAT(v_min, vx, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx0, gvl); | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| idx += inc_v; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = VFMVFS_FLOAT(v_res); | |||
| mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask0,gvl); | |||
| VSEVU_UINT(temp_uint,v_min_index,gvl); | |||
| min_index = temp_uint[min_index]; | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx0) | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| :"+v"(vx1) | |||
| :"v"(mask1), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_min = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = VFMVFS_FLOAT(v_res); | |||
| v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||
| v_min = VFABS_FLOAT(v_min, gvl); | |||
| vx2 = VFABS_FLOAT(vx2, gvl); | |||
| v_min = VFADDVV_FLOAT(v_min, vx2, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| FLOAT cur_minf = EXTRACT_FLOAT(v_res); | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| v_min_index = VADDVX_UINT(v_min_index, j, gvl); | |||
| mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| min_index = VMFIRSTM(mask0,gvl); | |||
| VSEVU_UINT(temp_uint,v_min_index,gvl); | |||
| min_index = temp_uint[min_index]; | |||
| mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); | |||
| UINT_V_T compressed; | |||
| compressed = VCOMPRESS(v_min_index, mask, gvl); | |||
| min_index = VMV_X(compressed); | |||
| } | |||
| } | |||
| return(min_index+1); | |||
| } | |||
| return(min_index+1); | |||
| } | |||
| @@ -0,0 +1,98 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT maxf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| FLOAT_V_T vx, vmax; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); | |||
| maxf = VFMVFS_FLOAT_M1(v_res); | |||
| return(maxf); | |||
| } | |||
| @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 32 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 16 | |||
| # endif | |||
| #else | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 8 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 4 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||
| #define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT maxf=-FLT_MAX; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_max; | |||
| FLOAT_V_T_M1 v_res, v_min; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| idx += inc_xv * 2; | |||
| } | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||
| if(*((FLOAT*)&v_res) > maxf) | |||
| maxf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| maxf = EXTRACT_FLOAT(v_res); | |||
| return(maxf); | |||
| } | |||
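| The macro layer introduced above builds every type and intrinsic name by token pasting, so a single source file can serve both the default LMUL=m8 configuration and the narrower LMUL=m2 one used for RISCV64_ZVL256B. The sketch below shows how JOIN expands; the RISCV_RVV definition is an assumption for the demo only (in OpenBLAS it depends on which intrinsic generation the toolchain provides), and nothing is actually called, so it builds with any C compiler. | |||
| ```c | |||
| /* Demonstration of the JOIN token-pasting machinery.  RISCV_RVV is assumed | |||
|  * here to add the __riscv_ prefix of the v1.0 intrinsics. */ | |||
| #include <stdio.h> | |||
| | |||
| #define RISCV_RVV(x) __riscv_##x            /* assumption for this sketch */ | |||
| #define ELEN 32 | |||
| #define LMUL m8 | |||
| #define _                                   /* empty token used to pad JOIN */ | |||
| | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y)   JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z) | |||
| | |||
| #define STR_X(x) #x | |||
| #define STR(x)   STR_X(x) | |||
| | |||
| int main(void) | |||
| { | |||
|     puts(STR(JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)));   /* __riscv_vsetvl_e32m8 */ | |||
|     puts(STR(JOIN(vfloat, ELEN, LMUL, _t, _)));              /* vfloat32m8_t */ | |||
|     puts(STR(JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _))); /* __riscv_vfmax_vv_f32m8 */ | |||
|     return 0; | |||
| } | |||
| ``` | |||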
| @@ -0,0 +1,98 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(minf); | |||
| FLOAT_V_T vx, vmin; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); | |||
| minf = VFMVFS_FLOAT_M1(v_res); | |||
| return(minf); | |||
| } | |||
| @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 32 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 16 | |||
| # endif | |||
| #else | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 8 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 4 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) | |||
| #endif | |||
| #define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||
| #define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT minf=FLT_MAX; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_min; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| @@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| idx += inc_xv * 2; | |||
| } | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(*((FLOAT*)&v_res) < minf) | |||
| minf = *((FLOAT*)&v_res); | |||
| v_res = VFREDMINVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| minf = EXTRACT_FLOAT(v_res); | |||
| return(minf); | |||
| } | |||
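| A structural change shared by the two rewritten reduction kernels above: the m1 accumulator v_res is seeded once with the reduction identity and then fed back as the scalar operand of every vfredmax/vfredmin, so the scalar result is extracted exactly once after all blocks instead of after each one. A plain-C analogue of that flow, with illustrative helper names: | |||
| ```c | |||
| /* Plain-C analogue of carrying the running result through the reduction's | |||
|  * scalar operand.  block_min() stands in for one vfredmin over a block; the | |||
|  * helper names are illustrative, not OpenBLAS functions. */ | |||
| #include <float.h> | |||
| #include <stdio.h> | |||
| | |||
| static double block_min(const double *v, long vl, double init) | |||
| { | |||
|     double r = init;                              /* scalar operand = v_res */ | |||
|     for (long k = 0; k < vl; k++) | |||
|         if (v[k] < r) r = v[k]; | |||
|     return r; | |||
| } | |||
| | |||
| static double min_over_blocks(const double *x, long n, long block) | |||
| { | |||
|     double running = DBL_MAX;                     /* v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1) */ | |||
|     for (long j = 0; j < n; j += block) { | |||
|         long vl = (n - j < block) ? n - j : block; | |||
|         running = block_min(&x[j], vl, running);  /* v_res = VFREDMINVS_FLOAT(v0, v_res, gvl) */ | |||
|     } | |||
|     return running;                               /* one EXTRACT_FLOAT at the very end */ | |||
| } | |||
| | |||
| int main(void) | |||
| { | |||
|     const double x[] = { 4.0, 2.5, 9.0, 0.5, 7.0 }; | |||
|     printf("min = %g\n", min_over_blocks(x, 5, 2));   /* prints 0.5 */ | |||
|     return 0; | |||
| } | |||
| ``` | |||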
| @@ -0,0 +1,212 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DOUBLE) | |||
| #define VSETVL __riscv_vsetvl_e64m4 | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m4 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m4 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 | |||
| #define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define MASK_T vbool16_t | |||
| #define VFABS __riscv_vfabs_v_f64m4 | |||
| #define VMFNE __riscv_vmfne_vf_f64m4_b16 | |||
| #define VMFGT __riscv_vmfgt_vv_f64m4_b16 | |||
| #define VMFEQ __riscv_vmfeq_vf_f64m4_b16 | |||
| #define VCPOP __riscv_vcpop_m_b16 | |||
| #define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1 | |||
| #define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1 | |||
| #define VFIRST __riscv_vfirst_m_b16 | |||
| #define VRGATHER __riscv_vrgather_vx_f64m4 | |||
| #define VFDIV __riscv_vfdiv_vv_f64m4 | |||
| #define VFDIV_M __riscv_vfdiv_vv_f64m4_mu | |||
| #define VFMUL __riscv_vfmul_vv_f64m4 | |||
| #define VFMUL_M __riscv_vfmul_vv_f64m4_mu | |||
| #define VFMACC __riscv_vfmacc_vv_f64m4 | |||
| #define VFMACC_M __riscv_vfmacc_vv_f64m4_mu | |||
| #define VMSBF __riscv_vmsbf_m_b16 | |||
| #define VMSOF __riscv_vmsof_m_b16 | |||
| #define VMAND __riscv_vmand_mm_b16 | |||
| #define VMANDN __riscv_vmandn_mm_b16 | |||
| #define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1 | |||
| #define VMERGE __riscv_vmerge_vvm_f64m4 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m4 | |||
| #define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v) | |||
| #define ABS fabs | |||
| #else | |||
| #define VSETVL __riscv_vsetvl_e32m4 | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m4 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 | |||
| #define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define MASK_T vbool8_t | |||
| #define VFABS __riscv_vfabs_v_f32m4 | |||
| #define VMFNE __riscv_vmfne_vf_f32m4_b8 | |||
| #define VMFGT __riscv_vmfgt_vv_f32m4_b8 | |||
| #define VMFEQ __riscv_vmfeq_vf_f32m4_b8 | |||
| #define VCPOP __riscv_vcpop_m_b8 | |||
| #define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1 | |||
| #define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1 | |||
| #define VFIRST __riscv_vfirst_m_b8 | |||
| #define VRGATHER __riscv_vrgather_vx_f32m4 | |||
| #define VFDIV __riscv_vfdiv_vv_f32m4 | |||
| #define VFDIV_M __riscv_vfdiv_vv_f32m4_mu | |||
| #define VFMUL __riscv_vfmul_vv_f32m4 | |||
| #define VFMUL_M __riscv_vfmul_vv_f32m4_mu | |||
| #define VFMACC __riscv_vfmacc_vv_f32m4 | |||
| #define VFMACC_M __riscv_vfmacc_vv_f32m4_mu | |||
| #define VMSBF __riscv_vmsbf_m_b8 | |||
| #define VMSOF __riscv_vmsof_m_b8 | |||
| #define VMAND __riscv_vmand_mm_b8 | |||
| #define VMANDN __riscv_vmandn_mm_b8 | |||
| #define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1 | |||
| #define VMERGE __riscv_vmerge_vvm_f32m4 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m4 | |||
| #define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v) | |||
| #define ABS fabsf | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if(n == 1) return (ABS(x[0])); | |||
| unsigned int gvl = 0; | |||
| MASK_T nonzero_mask; | |||
| MASK_T scale_mask; | |||
| gvl = VSETVL(n); | |||
| FLOAT_V_T v0; | |||
| FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); | |||
| FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); | |||
| FLOAT scale = 0; | |||
| FLOAT ssq = 0; | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| int idx = 0; | |||
| if( n >= gvl ) // don't pay overheads if we're not doing useful work | |||
| { | |||
| for(i=0; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl ); | |||
| nonzero_mask = VMFNE( v0, 0, gvl ); | |||
| v0 = VFABS( v0, gvl ); | |||
| scale_mask = VMFGT( v0, v_scale, gvl ); | |||
| // assume scale changes are relatively infrequent | |||
| // unclear if the vcpop+branch is actually a win | |||
| // since the operations being skipped are predicated anyway | |||
| // need profiling to confirm | |||
| if( VCPOP(scale_mask, gvl) ) | |||
| { | |||
| v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl ); | |||
| v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl ); | |||
| v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl ); | |||
| v_scale = VMERGE( v_scale, v0, scale_mask, gvl ); | |||
| } | |||
| v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl ); | |||
| v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl ); | |||
| idx += inc_x * gvl; | |||
| } | |||
| // we have gvl elements which we accumulated independently, with independent scales | |||
| // we need to combine these | |||
| // naive sort so we process small values first to avoid losing information | |||
| // could use vector sort extensions where available, but we're dealing with gvl elts at most | |||
| FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT)); | |||
| FLOAT * out_scale = alloca(gvl*sizeof(FLOAT)); | |||
| VSEV_FLOAT( out_ssq, v_ssq, gvl ); | |||
| VSEV_FLOAT( out_scale, v_scale, gvl ); | |||
| for( int a = 0; a < (gvl-1); ++a ) | |||
| { | |||
| int smallest = a; | |||
| for( size_t b = a+1; b < gvl; ++b ) | |||
| if( out_scale[b] < out_scale[smallest] ) | |||
| smallest = b; | |||
| if( smallest != a ) | |||
| { | |||
| FLOAT tmp1 = out_ssq[a]; | |||
| FLOAT tmp2 = out_scale[a]; | |||
| out_ssq[a] = out_ssq[smallest]; | |||
| out_scale[a] = out_scale[smallest]; | |||
| out_ssq[smallest] = tmp1; | |||
| out_scale[smallest] = tmp2; | |||
| } | |||
| } | |||
| int a = 0; | |||
| while( a<gvl && out_scale[a] == 0 ) | |||
| ++a; | |||
| if( a < gvl ) | |||
| { | |||
| ssq = out_ssq[a]; | |||
| scale = out_scale[a]; | |||
| ++a; | |||
| for( ; a < gvl; ++a ) | |||
| { | |||
| ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a]; | |||
| scale = out_scale[a]; | |||
| } | |||
| } | |||
| } | |||
| //finish any tail using scalar ops | |||
| i*=gvl*inc_x; | |||
| n*=inc_x; | |||
| while(i < n){ | |||
| if ( x[i] != 0.0 ){ | |||
| FLOAT absxi = ABS( x[i] ); | |||
| if ( scale < absxi ){ | |||
| ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); | |||
| scale = absxi ; | |||
| } | |||
| else{ | |||
| ssq += ( absxi/scale ) * ( absxi/scale ); | |||
| } | |||
| } | |||
| i += inc_x; | |||
| } | |||
| return(scale * sqrt(ssq)); | |||
| } | |||
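| The loop above gives every lane its own (scale, ssq) pair, and the clean-up phase merges the pairs smallest-scale-first because its update only covers the case where the incoming scale is at least as large as the running one. The underlying identity is scale_a^2*ssq_a + scale_b^2*ssq_b = scale_b^2*(ssq_a*(scale_a/scale_b)^2 + ssq_b) for scale_b >= scale_a, which keeps the intermediate squares from overflowing or underflowing. Below is a small self-contained example of a merge that handles either ordering (so the naive sort becomes unnecessary); combine_ssq and the sample data are illustrative, not part of OpenBLAS. | |||
| ```c | |||
| /* Merging per-lane (scale, ssq) partial results with the same rescaling rule | |||
|  * the scalar tail uses.  Each pair represents the value scale*sqrt(ssq). */ | |||
| #include <math.h> | |||
| #include <stdio.h> | |||
| | |||
| static void combine_ssq(double *sa, double *qa, double sb, double qb) | |||
| { | |||
|     if (sb == 0.0) return;                        /* that lane saw only zeros */ | |||
|     if (*sa == 0.0) { *sa = sb; *qa = qb; return; } | |||
|     if (sb > *sa) {                               /* rescale to the larger scale */ | |||
|         *qa = *qa * (*sa / sb) * (*sa / sb) + qb; | |||
|         *sa = sb; | |||
|     } else { | |||
|         *qa = *qa + qb * (sb / *sa) * (sb / *sa); | |||
|     } | |||
| } | |||
| | |||
| int main(void) | |||
| { | |||
|     /* three "lanes" with wildly different magnitudes */ | |||
|     double scale[3] = { 1e-200, 3.0, 1e+150 }; | |||
|     double ssq[3]   = { 1.0,    2.0, 1.0    };    /* per-lane sums of (x/scale)^2 */ | |||
| | |||
|     double s = 0.0, q = 0.0; | |||
|     for (int k = 0; k < 3; k++)                   /* order does not matter here */ | |||
|         combine_ssq(&s, &q, scale[k], ssq[k]); | |||
| | |||
|     printf("norm = %g\n", s * sqrt(q));           /* ~1e+150, no overflow */ | |||
|     return 0; | |||
| } | |||
| ``` | |||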
| @@ -26,207 +26,189 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32 | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define ABS fabsf | |||
| #define MASK_T vbool8_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||
| #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m1 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 64 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 32 | |||
| # endif | |||
| #else | |||
| # define LMUL m4 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 16 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 8 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMVSF_FLOAT JOIN(RISCV_RVV(vfmv), _s_f_f, ELEN, LMUL, _) | |||
| #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) | |||
| #define MASK_T JOIN(vbool, MLEN, _t, _, _) | |||
| #define VFABS JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) | |||
| #define VMFNE JOIN(RISCV_RVV(vmfne_vf_f),ELEN, LMUL, _b, MLEN) | |||
| #define VMFGT JOIN(RISCV_RVV(vmfgt_vv_f),ELEN, LMUL, _b, MLEN) | |||
| #define VMFEQ JOIN(RISCV_RVV(vmfeq_vf_f),ELEN, LMUL, _b, MLEN) | |||
| #define VCPOP JOIN(RISCV_RVV(vcpop), _m_b, MLEN, _, _) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _m) | |||
| #define VFMUL_M JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _m) | |||
| #define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _m) | |||
| #define VMERGE(a, b, mask, gvl) JOIN(RISCV_RVV(vmerge), _vvm_f, ELEN, LMUL, _)(mask, a, b, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64 | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VFDIV_M JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _mu) | |||
| #define VFMUL_M JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _mu) | |||
| #define VFMACC_M JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _mu) | |||
| #define VMERGE JOIN(RISCV_RVV(vmerge), _vvm_f, ELEN, LMUL, _) | |||
| #endif | |||
| #define VFIRST JOIN(RISCV_RVV(vfirst), _m_b, MLEN, _, _) | |||
| #define VRGATHER JOIN(RISCV_RVV(vrgather), _vx_f, ELEN, LMUL, _) | |||
| #define VFDIV JOIN(RISCV_RVV(vfdiv), _vv_f, ELEN, LMUL, _) | |||
| #define VFMUL JOIN(RISCV_RVV(vfmul), _vv_f, ELEN, LMUL, _) | |||
| #define VFMACC JOIN(RISCV_RVV(vfmacc), _vv_f, ELEN, LMUL, _) | |||
| #define VMSBF JOIN(RISCV_RVV(vmsbf), _m_b, MLEN, _, _) | |||
| #define VMSOF JOIN(RISCV_RVV(vmsof), _m_b, MLEN, _, _) | |||
| #define VMAND JOIN(RISCV_RVV(vmand), _mm_b, MLEN, _, _) | |||
| #define VMANDN JOIN(RISCV_RVV(vmandn), _mm_b, MLEN, _, _) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define MASK_T vbool16_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||
| #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||
| #define VMFIRSTM vmfirst_m_b16 | |||
| #define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v) | |||
| //#define DUMP( label, v0, gvl ) | |||
| #define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0) | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG i=0; | |||
| if ( n < 0 ) return(0.0); | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if(n == 1) return (ABS(x[0])); | |||
| FLOAT_V_T vr, v0, v_zero; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT scale = 0.0, ssq = 0.0; | |||
| MASK_T mask; | |||
| BLASLONG index = 0; | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| if(index == -1){//no elements greater than scale | |||
| if(scale != 0.0){ | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| MASK_T nonzero_mask; | |||
| MASK_T scale_mask; | |||
| gvl = VSETVL(n); | |||
| FLOAT_V_T v0; | |||
| FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); | |||
| FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); | |||
| FLOAT scale = 0; | |||
| FLOAT ssq = 0; | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| int idx = 0; | |||
| if( n >= gvl ) // don't pay overheads if we're not doing useful work | |||
| { | |||
| for(i=0; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl ); | |||
| nonzero_mask = VMFNE( v0, 0, gvl ); | |||
| v0 = VFABS( v0, gvl ); | |||
| scale_mask = VMFGT( v0, v_scale, gvl ); | |||
| // assume scale changes are relatively infrequent; | |||
| // it is unclear whether the vcpop+branch is actually a win, | |||
| // since the operations being skipped are predicated anyway - | |||
| // profiling is needed to confirm | |||
| if( VCPOP(scale_mask, gvl) ) | |||
| { | |||
| v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl ); | |||
| v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl ); | |||
| v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl ); | |||
| v_scale = VMERGE( v_scale, v0, scale_mask, gvl ); | |||
| } | |||
| j += gvl; | |||
| v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl ); | |||
| v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl ); | |||
| idx += inc_x * gvl; | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| if(index == -1){//no elements greater than scale | |||
| if(scale != 0.0) | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| }else{//found greater element | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| // we have gvl partial sums that were accumulated independently, each with its own scale; | |||
| // these now need to be combined | |||
| // a naive selection sort processes the smallest scales first to avoid losing precision | |||
| // vector sort extensions could be used where available, but there are at most gvl elements | |||
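| // two partials (ssq_a, scale_a) and (ssq_b, scale_b) with scale_a <= scale_b combine as | |||
| //   ssq = ssq_a * (scale_a/scale_b)^2 + ssq_b,   scale = scale_b | |||
| // which is what the ascending-scale merge loop further below applies pairwise | |||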
| FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT)); | |||
| FLOAT * out_scale = alloca(gvl*sizeof(FLOAT)); | |||
| VSEV_FLOAT( out_ssq, v_ssq, gvl ); | |||
| VSEV_FLOAT( out_scale, v_scale, gvl ); | |||
| for( int a = 0; a < (gvl-1); ++a ) | |||
| { | |||
| int smallest = a; | |||
| for( size_t b = a+1; b < gvl; ++b ) | |||
| if( out_scale[b] < out_scale[smallest] ) | |||
| smallest = b; | |||
| if( smallest != a ) | |||
| { | |||
| FLOAT tmp1 = out_ssq[a]; | |||
| FLOAT tmp2 = out_scale[a]; | |||
| out_ssq[a] = out_ssq[smallest]; | |||
| out_scale[a] = out_scale[smallest]; | |||
| out_ssq[smallest] = tmp1; | |||
| out_scale[smallest] = tmp2; | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| int idx = 0, inc_v = inc_x * gvl; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| if(index == -1){//no elements greater than scale | |||
| if(scale != 0.0){ | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(vr, v0, v0, gvl); | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| int a = 0; | |||
| while( a<gvl && out_scale[a] == 0 ) | |||
| ++a; | |||
| if( a < gvl ) | |||
| { | |||
| ssq = out_ssq[a]; | |||
| scale = out_scale[a]; | |||
| ++a; | |||
| for( ; a < gvl; ++a ) | |||
| { | |||
| ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a]; | |||
| scale = out_scale[a]; | |||
| } | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| //tail | |||
| if(j < n){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| if(index == -1){//no elements greater than scale | |||
| if(scale != 0.0) | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| }else{//found greater element | |||
| //find max | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res)); | |||
| //update scale | |||
| scale = VFMVFS_FLOAT(v_res); | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| //finish any tail using scalar ops | |||
| i*=gvl*inc_x; | |||
| n*=inc_x; | |||
| while(i < n){ | |||
| if ( x[i] != 0.0 ){ | |||
| FLOAT absxi = ABS( x[i] ); | |||
| if ( scale < absxi ){ | |||
| ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); | |||
| scale = absxi ; | |||
| } | |||
| else{ | |||
| ssq += ( absxi/scale ) * ( absxi/scale ); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += VFMVFS_FLOAT(v_res); | |||
| } | |||
| i += inc_x; | |||
| } | |||
| return(scale * sqrt(ssq)); | |||
| } | |||
| @@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| @@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| @@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG i=0, j=0; | |||
| double len = 0.0 ; | |||
| if ( n < 0 ) return(0.0); | |||
| if ( n <= 0 ) return(0.0); | |||
| if(n == 1) return (ABS(x[0])); | |||
| FLOAT_V_T vr, v0, v1; | |||
| @@ -0,0 +1,149 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 | |||
| #endif | |||
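| // Plane (Givens) rotation: for each i, x[i] <- c*x[i] + s*y[i] and y[i] <- c*y[i] - s*x[i], | |||
| // with the original x[i] used in both updates. Strided variants pass the stride to vlse/vsse | |||
| // in bytes (inc * sizeof(FLOAT)); a zero increment is handled by the scalar loop, since all | |||
| // vector lanes would alias the same element. | |||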
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| if(n <= 0) return(0); | |||
| FLOAT_V_T v0, v1, vx, vy; | |||
| if (inc_x == 0 || inc_y == 0) { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp; | |||
| while(i < n) | |||
| { | |||
| temp = c*x[ix] + s*y[iy] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| x[ix] = temp ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| v0 = VFMULVF_FLOAT(vx, c, vl); | |||
| v0 = VFMACCVF_FLOAT(v0, s, vy, vl); | |||
| VSEV_FLOAT(x, v0, vl); | |||
| v1 = VFMULVF_FLOAT(vx, s, vl); | |||
| v1 = VFMSACVF_FLOAT(v1, c, vy, vl); | |||
| VSEV_FLOAT(y, v1, vl); | |||
| } | |||
| } else if(inc_y == 1) { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| v0 = VFMULVF_FLOAT(vx, c, vl); | |||
| v0 = VFMACCVF_FLOAT(v0, s, vy, vl); | |||
| VSSEV_FLOAT(x, stride_x, v0, vl); | |||
| v1 = VFMULVF_FLOAT(vx, s, vl); | |||
| v1 = VFMSACVF_FLOAT(v1, c, vy, vl); | |||
| VSEV_FLOAT(y, v1, vl); | |||
| } | |||
| } else if(inc_x == 1) { | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| v0 = VFMULVF_FLOAT(vx, c, vl); | |||
| v0 = VFMACCVF_FLOAT(v0, s, vy, vl); | |||
| VSEV_FLOAT(x, v0, vl); | |||
| v1 = VFMULVF_FLOAT(vx, s, vl); | |||
| v1 = VFMSACVF_FLOAT(v1, c, vy, vl); | |||
| VSSEV_FLOAT(y, stride_y, v1, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| v0 = VFMULVF_FLOAT(vx, c, vl); | |||
| v0 = VFMACCVF_FLOAT(v0, s, vy, vl); | |||
| VSSEV_FLOAT(x, stride_x, v0, vl); | |||
| v1 = VFMULVF_FLOAT(vx, s, vl); | |||
| v1 = VFMSACVF_FLOAT(v1, c, vy, vl); | |||
| VSSEV_FLOAT(y, stride_y, v1, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) | |||
| #define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) | |||
| #define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| @@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| BLASLONG ix=0,iy=0; | |||
| if(n <= 0) return(0); | |||
| unsigned int gvl = 0; | |||
| unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||
| FLOAT_V_T v0, v1, vx, vy; | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| VSEV_FLOAT(&y[j], v1, gvl); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| VSEV_FLOAT(&y[j], v1, gvl); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -154,8 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl); | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -0,0 +1,97 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #endif | |||
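| // SCAL kernel: x <- da * x over n elements with stride inc_x. The da == 0 case stores a | |||
| // preset zero vector instead of multiplying, and the strided paths use vlse/vsse with the | |||
| // stride expressed in bytes. | |||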
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| if ( (n <= 0) || (inc_x <= 0)) return(0); | |||
| FLOAT_V_T v0; | |||
| if(inc_x == 1) { | |||
| if(da == 0.0) { | |||
| int gvl = VSETVL_MAX; | |||
| v0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| VSEV_FLOAT(x, v0, vl); | |||
| } | |||
| } | |||
| else { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| v0 = VLEV_FLOAT(x, vl); | |||
| v0 = VFMULVF_FLOAT(v0, da, vl); | |||
| VSEV_FLOAT(x, v0, vl); | |||
| } | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(da == 0.0) { | |||
| int gvl = VSETVL_MAX; | |||
| v0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| VSSEV_FLOAT(x, stride_x, v0, vl); | |||
| } | |||
| } | |||
| else { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| v0 = VLSEV_FLOAT(x, stride_x, vl); | |||
| v0 = VFMULVF_FLOAT(v0, da, vl); | |||
| VSSEV_FLOAT(x, stride_x, v0, vl); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 32 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 16 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 8 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 4 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) | |||
| #define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) | |||
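| // The JOIN macros token-paste the element width (ELEN) and register-group multiplier (LMUL) | |||
| // into the intrinsic names, so a single definition expands to e.g. vle32_v_f32m8 for the | |||
| // generic single-precision build or vle64_v_f64m2 when RISCV64_ZVL256B selects LMUL=m2; | |||
| // RISCV_RVV adds or omits the __riscv_ prefix depending on the intrinsic version in use. | |||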
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| @@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| }else{ | |||
| if(da == 0.0){ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG ix = 0; | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG ix = 0; | |||
| if(gvl <= n / 2){ | |||
| long int inc_xv = gvl * inc_x; | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl); | |||
| ix += inc_xv * 2; | |||
| } | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| for(i = 0; i < n/(gvl*2); ++i ){ | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| ix += inc_x * gvl; | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| ix += inc_x * gvl; | |||
| } | |||
| //tail | |||
| for(; j <n; ){ | |||
| gvl = VSETVL(n-j); | |||
| i *= gvl*2; | |||
| while( i < n ){ | |||
| gvl = VSETVL(n-i); | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| j += gvl; | |||
| ix += inc_x * gvl; | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
| i += gvl; | |||
| ix += inc_x * gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| @@ -0,0 +1,791 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=2 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=8 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='gemm' | |||
| param_precision='float' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=32 | |||
| ELEN_PARAM=32 | |||
| LMUL_ACC=2 | |||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||
| VLEV='__riscv_vle32_v_f32m2' | |||
| VLSEV='__riscv_vlse32_v_f32m2' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||
| VSETVL='__riscv_vsetvl_e32m2' | |||
| VSEV='__riscv_vse32_v_f32m2' | |||
| VSSEV='__riscv_vsse32_v_f32m2' | |||
| acc_vector_t='vfloat32m2_t' | |||
| output='sgemm_kernel_8x8_zvl128b.c' | |||
| param_scalar_t='float' | |||
| param_vector_t='vfloat32m2_t' | |||
| */ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 8; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| float B4 = B[bi + 4]; | |||
| float B5 = B[bi + 5]; | |||
| float B6 = B[bi + 6]; | |||
| float B7 = B[bi + 7]; | |||
| bi += 8; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| B4 = B[bi + 4]; | |||
| B5 = B[bi + 5]; | |||
| B6 = B[bi + 6]; | |||
| B7 = B[bi + 7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||
| c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); | |||
| c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); | |||
| c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); | |||
| c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| float B4 = B[bi + 4]; | |||
| float B5 = B[bi + 5]; | |||
| float B6 = B[bi + 6]; | |||
| float B7 = B[bi + 7]; | |||
| bi += 8; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| B4 = B[bi + 4]; | |||
| B5 = B[bi + 5]; | |||
| B6 = B[bi + 6]; | |||
| B7 = B[bi + 7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||
| c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl); | |||
| c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl); | |||
| c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl); | |||
| c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| float result8 = 0; | |||
| float result9 = 0; | |||
| float result10 = 0; | |||
| float result11 = 0; | |||
| float result12 = 0; | |||
| float result13 = 0; | |||
| float result14 = 0; | |||
| float result15 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| result8 += A[ai + 0] * B[bi + 4]; | |||
| result9 += A[ai + 1] * B[bi + 4]; | |||
| result10 += A[ai + 0] * B[bi + 5]; | |||
| result11 += A[ai + 1] * B[bi + 5]; | |||
| result12 += A[ai + 0] * B[bi + 6]; | |||
| result13 += A[ai + 1] * B[bi + 6]; | |||
| result14 += A[ai + 0] * B[bi + 7]; | |||
| result15 += A[ai + 1] * B[bi + 7]; | |||
| ai += 2; | |||
| bi += 8; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||
| C[ci + 4 * ldc + 0] += alpha * result8; | |||
| C[ci + 4 * ldc + 1] += alpha * result9; | |||
| C[ci + 5 * ldc + 0] += alpha * result10; | |||
| C[ci + 5 * ldc + 1] += alpha * result11; | |||
| C[ci + 6 * ldc + 0] += alpha * result12; | |||
| C[ci + 6 * ldc + 1] += alpha * result13; | |||
| C[ci + 7 * ldc + 0] += alpha * result14; | |||
| C[ci + 7 * ldc + 1] += alpha * result15; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| result4 += A[ai + 0] * B[bi + 4]; | |||
| result5 += A[ai + 0] * B[bi + 5]; | |||
| result6 += A[ai + 0] * B[bi + 6]; | |||
| result7 += A[ai + 0] * B[bi + 7]; | |||
| ai += 1; | |||
| bi += 8; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||
| C[ci + 4 * ldc + 0] += alpha * result4; | |||
| C[ci + 5 * ldc + 0] += alpha * result5; | |||
| C[ci + 6 * ldc + 0] += alpha * result6; | |||
| C[ci + 7 * ldc + 0] += alpha * result7; | |||
| m_top += 1; | |||
| } | |||
| n_top += 8; | |||
| } | |||
| // -- tails for N=4 | |||
| if (N & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||
| c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl); | |||
| c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| ai += 2; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| C[ci + 2 * ldc + 0] += alpha * result4; | |||
| C[ci + 2 * ldc + 1] += alpha * result5; | |||
| C[ci + 3 * ldc + 0] += alpha * result6; | |||
| C[ci + 3 * ldc + 1] += alpha * result7; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| ai += 1; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| C[ci + 2 * ldc + 0] += alpha * result2; | |||
| C[ci + 3 * ldc + 0] += alpha * result3; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| ci += ldc - gvl * 0; | |||
| vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| ai += 2; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| C[ci + 1 * ldc + 0] += alpha * result2; | |||
| C[ci + 1 * ldc + 1] += alpha * result3; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| ai += 1; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 1 * ldc + 0] += alpha * result1; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| float B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl); | |||
| c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl); | |||
| ci = n_top * ldc + m_top; | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| ai += 2; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| C[ci + 0 * ldc + 1] += alpha * result1; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| for (BLASLONG k = 0; k < K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| ai += 1; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] += alpha * result0; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,991 @@ | |||
| /* | |||
| AUTOGENERATED KERNEL | |||
| Script: ./kernel/riscv64/generate_kernel.py | |||
| Settings: | |||
| LMUL=2 | |||
| M=8 | |||
| M_tail_scalar_from=2 | |||
| N=8 | |||
| __riscv_='__riscv_' | |||
| complex=False | |||
| conjugate=False | |||
| cpu='zvl128b' | |||
| force_acc_double=False | |||
| index_type='BLASLONG' | |||
| op='trmm' | |||
| param_precision='float' | |||
| reg_width_bits=128 | |||
| tail_policy='' | |||
| trace=False | |||
| Derived: | |||
| ELEN_ACC=32 | |||
| ELEN_PARAM=32 | |||
| LMUL_ACC=2 | |||
| VFMACC='__riscv_vfmacc_vf_f32m2' | |||
| VFMUL='__riscv_vfmul_vf_f32m2' | |||
| VLEV='__riscv_vle32_v_f32m2' | |||
| VLSEV='__riscv_vlse32_v_f32m2' | |||
| VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' | |||
| VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' | |||
| VSETVL='__riscv_vsetvl_e32m2' | |||
| VSEV='__riscv_vse32_v_f32m2' | |||
| VSSEV='__riscv_vsse32_v_f32m2' | |||
| acc_vector_t='vfloat32m2_t' | |||
| output='strmm_kernel_8x8_zvl128b.c' | |||
| param_scalar_t='float' | |||
| param_vector_t='vfloat32m2_t' | |||
| */ | |||
| #include "common.h" | |||
| #if defined(LEFT) != defined(TRANSA) | |||
| #define BACKWARDS | |||
| #endif | |||
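| // TRMM variant of the 8x8 kernel: C is overwritten with alpha * A * B (no accumulation), and | |||
| // because one operand is triangular only the k-range overlapping its non-zero part is | |||
| // processed. off locates the current tile relative to the diagonal; with BACKWARDS the first | |||
| // off iterations are skipped by advancing ai/bi and shrinking pass_K, otherwise pass_K is | |||
| // truncated to off plus the tile size. | |||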
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) | |||
| { | |||
| BLASLONG gvl = 0; | |||
| BLASLONG m_top = 0; | |||
| BLASLONG n_top = 0; | |||
| // -- MAIN PASS | |||
| for (BLASLONG j = 0; j < N / 8; j += 1) { | |||
| m_top = 0; | |||
| BLASLONG gvl = __riscv_vsetvl_e32m2(8); | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 8; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 8; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| float B4 = B[bi + 4]; | |||
| float B5 = B[bi + 5]; | |||
| float B6 = B[bi + 6]; | |||
| float B7 = B[bi + 7]; | |||
| bi += 8; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| B4 = B[bi + 4]; | |||
| B5 = B[bi + 5]; | |||
| B6 = B[bi + 6]; | |||
| B7 = B[bi + 7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||
| vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); | |||
| vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); | |||
| vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); | |||
| vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||
| ci += ldc - gvl * 0; | |||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||
| m_top += 8; | |||
| } | |||
| // -- tails for main pass | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 8; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 8; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| float B4 = B[bi + 4]; | |||
| float B5 = B[bi + 5]; | |||
| float B6 = B[bi + 6]; | |||
| float B7 = B[bi + 7]; | |||
| bi += 8; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl); | |||
| vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl); | |||
| vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl); | |||
| vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| B4 = B[bi + 4]; | |||
| B5 = B[bi + 5]; | |||
| B6 = B[bi + 6]; | |||
| B7 = B[bi + 7]; | |||
| bi += 8; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl); | |||
| result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl); | |||
| result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl); | |||
| result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||
| vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl); | |||
| vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl); | |||
| vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl); | |||
| vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c4, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c5, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c6, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c7, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| float result8 = 0; | |||
| float result9 = 0; | |||
| float result10 = 0; | |||
| float result11 = 0; | |||
| float result12 = 0; | |||
| float result13 = 0; | |||
| float result14 = 0; | |||
| float result15 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 8; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 8; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| result8 += A[ai + 0] * B[bi + 4]; | |||
| result9 += A[ai + 1] * B[bi + 4]; | |||
| result10 += A[ai + 0] * B[bi + 5]; | |||
| result11 += A[ai + 1] * B[bi + 5]; | |||
| result12 += A[ai + 0] * B[bi + 6]; | |||
| result13 += A[ai + 1] * B[bi + 6]; | |||
| result14 += A[ai + 0] * B[bi + 7]; | |||
| result15 += A[ai + 1] * B[bi + 7]; | |||
| ai += 2; | |||
| bi += 8; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||
| C[ci + 4 * ldc + 0] = alpha * result8; | |||
| C[ci + 4 * ldc + 1] = alpha * result9; | |||
| C[ci + 5 * ldc + 0] = alpha * result10; | |||
| C[ci + 5 * ldc + 1] = alpha * result11; | |||
| C[ci + 6 * ldc + 0] = alpha * result12; | |||
| C[ci + 6 * ldc + 1] = alpha * result13; | |||
| C[ci + 7 * ldc + 0] = alpha * result14; | |||
| C[ci + 7 * ldc + 1] = alpha * result15; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 8; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 8; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| result4 += A[ai + 0] * B[bi + 4]; | |||
| result5 += A[ai + 0] * B[bi + 5]; | |||
| result6 += A[ai + 0] * B[bi + 6]; | |||
| result7 += A[ai + 0] * B[bi + 7]; | |||
| ai += 1; | |||
| bi += 8; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||
| C[ci + 4 * ldc + 0] = alpha * result4; | |||
| C[ci + 5 * ldc + 0] = alpha * result5; | |||
| C[ci + 6 * ldc + 0] = alpha * result6; | |||
| C[ci + 7 * ldc + 0] = alpha * result7; | |||
| m_top += 1; | |||
| } | |||
| n_top += 8; | |||
| } | |||
| // -- tails for N=4 | |||
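| // Remaining columns (N%8) follow the same structure with 4-, 2- and 1-column panels of B. | |||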
| if (N & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| float B2 = B[bi + 2]; | |||
| float B3 = B[bi + 3]; | |||
| bi += 4; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl); | |||
| vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| B2 = B[bi + 2]; | |||
| B3 = B[bi + 3]; | |||
| bi += 4; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl); | |||
| result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||
| vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl); | |||
| vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c2, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c3, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| float result4 = 0; | |||
| float result5 = 0; | |||
| float result6 = 0; | |||
| float result7 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| result4 += A[ai + 0] * B[bi + 2]; | |||
| result5 += A[ai + 1] * B[bi + 2]; | |||
| result6 += A[ai + 0] * B[bi + 3]; | |||
| result7 += A[ai + 1] * B[bi + 3]; | |||
| ai += 2; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| C[ci + 2 * ldc + 0] = alpha * result4; | |||
| C[ci + 2 * ldc + 1] = alpha * result5; | |||
| C[ci + 3 * ldc + 0] = alpha * result6; | |||
| C[ci + 3 * ldc + 1] = alpha * result7; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 4; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 4; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| result2 += A[ai + 0] * B[bi + 2]; | |||
| result3 += A[ai + 0] * B[bi + 3]; | |||
| ai += 1; | |||
| bi += 4; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| C[ci + 2 * ldc + 0] = alpha * result2; | |||
| C[ci + 3 * ldc + 0] = alpha * result3; | |||
| m_top += 1; | |||
| } | |||
| n_top += 4; | |||
| } | |||
| // -- tails for N=2 | |||
| if (N & 2) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| float B1 = B[bi + 1]; | |||
| bi += 2; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| B1 = B[bi + 1]; | |||
| bi += 2; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| ci += ldc; | |||
| __riscv_vse32_v_f32m2(&C[ci], c1, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| float result2 = 0; | |||
| float result3 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| result2 += A[ai + 0] * B[bi + 1]; | |||
| result3 += A[ai + 1] * B[bi + 1]; | |||
| ai += 2; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| C[ci + 1 * ldc + 0] = alpha * result2; | |||
| C[ci + 1 * ldc + 1] = alpha * result3; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 2; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 2; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 0] * B[bi + 1]; | |||
| ai += 1; | |||
| bi += 2; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 1 * ldc + 0] = alpha * result1; | |||
| m_top += 1; | |||
| } | |||
| n_top += 2; | |||
| } | |||
| // -- tails for N=1 | |||
| if (N & 1) { | |||
| gvl = __riscv_vsetvl_e32m2(8); | |||
| m_top = 0; | |||
| for (BLASLONG i = 0; i < M / 8; i += 1) { | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 8; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 8; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 8; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| m_top += 8; | |||
| } | |||
| if (M & 4) { | |||
| gvl = __riscv_vsetvl_e32m2(4); | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 4; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 4; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| float B0 = B[bi + 0]; | |||
| bi += 1; | |||
| vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl); | |||
| for (BLASLONG k = 1; k < pass_K; k++) { | |||
| B0 = B[bi + 0]; | |||
| bi += 1; | |||
| A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl); | |||
| ai += 4; | |||
| result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl); | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl); | |||
| __riscv_vse32_v_f32m2(&C[ci], c0, gvl); | |||
| m_top += 4; | |||
| } | |||
| if (M & 2) { | |||
| float result0 = 0; | |||
| float result1 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 2; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 2; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| result1 += A[ai + 1] * B[bi + 0]; | |||
| ai += 2; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| C[ci + 0 * ldc + 1] = alpha * result1; | |||
| m_top += 2; | |||
| } | |||
| if (M & 1) { | |||
| float result0 = 0; | |||
| BLASLONG ai = m_top * K; | |||
| BLASLONG bi = n_top * K; | |||
| BLASLONG pass_K = K; | |||
| #ifdef LEFT | |||
| BLASLONG off = offset + m_top; | |||
| #else | |||
| BLASLONG off = -offset + n_top; | |||
| #endif | |||
| #ifdef BACKWARDS | |||
| ai += off * 1; | |||
| bi += off * 1; | |||
| pass_K -= off; | |||
| #else | |||
| #ifdef LEFT | |||
| pass_K = off + 1; | |||
| #else | |||
| pass_K = off + 1; | |||
| #endif | |||
| #endif | |||
| for (BLASLONG k = 0; k < pass_K; k++) { | |||
| result0 += A[ai + 0] * B[bi + 0]; | |||
| ai += 1; | |||
| bi += 1; | |||
| } | |||
| BLASLONG ci = n_top * ldc + m_top; | |||
| C[ci + 0 * ldc + 0] = alpha * result0; | |||
| m_top += 1; | |||
| } | |||
| n_top += 1; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,95 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m8_tu | |||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m8_tu | |||
| #define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
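| // Sum of the elements of x: partial sums are accumulated in a single LMUL=8 register group | |||
| // with tail-undisturbed adds, and one vfredusum reduction is performed after the loop. | |||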
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT sumf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| FLOAT_V_T vx, vsum; | |||
| FLOAT_V_T_M1 v_res; | |||
| v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| vsum = VFMVVF_FLOAT(0.0, vlmax); | |||
| if(inc_x == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||
| } | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); | |||
| } | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); | |||
| sumf = VFMVFS_FLOAT_M1(v_res); | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) | |||
| #define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) | |||
| #endif | |||
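| // Same reduction expressed through the RISCV_RVV() intrinsic-name wrapper; the unit-stride and | |||
| // strided main loops are unrolled by two vector loads, with leftovers reduced chunk by chunk. | |||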
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0; | |||
| FLOAT asumf=0.0; | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_sum; | |||
| FLOAT_V_T_M1 v_res; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| ix += inc_xv * 2; // advance the strided read index past the two vectors just summed | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl); | |||
| } | |||
| for(;j<n;){ | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl); | |||
| j += gvl; | |||
| } | |||
| } | |||
| asumf = EXTRACT_FLOAT(v_res); | |||
| return(asumf); | |||
| } | |||
| @@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp; | |||
| if ( n < 0 ) return(0); | |||
| if ( n <= 0 ) return(0); | |||
| while(i < n) | |||
| { | |||
| @@ -0,0 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #endif | |||
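| // Vector SWAP. The inc_x==0 / inc_y==0 branches reproduce the sequential scalar semantics: | |||
| // the strided vector is shifted by one position towards its end, while the zero-stride | |||
| // operand receives the other vector's last element and donates its value to element 0. | |||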
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG stride_x, stride_y; | |||
| FLOAT_V_T vx, vy; | |||
| if (n <= 0) return(0); | |||
| if (inc_x == 0 && inc_y == 0) { | |||
| if (n & 1) { | |||
| FLOAT temp = x[0]; | |||
| x[0] = y[0]; | |||
| y[0] = temp; | |||
| } | |||
| else { | |||
| return 0; | |||
| } | |||
| } | |||
| else if(inc_x == 0) { | |||
| FLOAT temp = x[0]; | |||
| x[0] = y[(n - 1) * inc_y]; | |||
| FLOAT* ptr = y + (n - 1) * inc_y; // start from the last one | |||
| stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse | |||
| BLASLONG m = n - 1; | |||
| for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) { | |||
| vl = VSETVL(m); | |||
| vy = VLSEV_FLOAT(ptr - inc_y, stride_y, vl); // the element one position earlier in y | |||
| VSSEV_FLOAT(ptr, stride_y, vy, vl); | |||
| } | |||
| y[0] = temp; | |||
| } | |||
| else if(inc_y == 0) { | |||
| FLOAT temp = y[0]; | |||
| y[0] = x[(n - 1) * inc_x]; | |||
| FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one | |||
| stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse | |||
| BLASLONG m = n - 1; | |||
| for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { | |||
| vl = VSETVL(m); | |||
| vx = VLSEV_FLOAT(ptr - inc_x, stride_x, vl); // the element one position earlier in x | |||
| VSSEV_FLOAT(ptr, stride_x, vx, vl); | |||
| } | |||
| x[0] = temp; | |||
| } | |||
| else if(inc_x == 1 && inc_y == 1) { | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| VSEV_FLOAT(y, vx, vl); | |||
| VSEV_FLOAT(x, vy, vl); | |||
| } | |||
| } else if (inc_y == 1) { | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLEV_FLOAT(y, vl); | |||
| VSEV_FLOAT(y, vx, vl); | |||
| VSSEV_FLOAT(x, stride_x, vy, vl); | |||
| } | |||
| } else if(inc_x == 1) { | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| VSSEV_FLOAT(y, stride_y, vx, vl); | |||
| VSEV_FLOAT(x, vy, vl); | |||
| } | |||
| } else { | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { | |||
| vl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, vl); | |||
| vy = VLSEV_FLOAT(y, stride_y, vl); | |||
| VSSEV_FLOAT(y, stride_y, vx, vl); | |||
| VSSEV_FLOAT(x, stride_x, vy, vl); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <stdio.h> | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle32_v_f32m8 | |||
| #define VLSEV_FLOAT vlse32_v_f32m8 | |||
| #define VSEV_FLOAT vse32_v_f32m8 | |||
| #define VSSEV_FLOAT vsse32_v_f32m8 | |||
| #ifdef RISCV64_ZVL256B | |||
| # define LMUL m2 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 32 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 16 | |||
| # endif | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle64_v_f64m8 | |||
| #define VLSEV_FLOAT vlse64_v_f64m8 | |||
| #define VSEV_FLOAT vse64_v_f64m8 | |||
| #define VSSEV_FLOAT vsse64_v_f64m8 | |||
| # define LMUL m8 | |||
| # if defined(DOUBLE) | |||
| # define ELEN 64 | |||
| # define MLEN 8 | |||
| # else | |||
| # define ELEN 32 | |||
| # define MLEN 4 | |||
| # endif | |||
| #endif | |||
| #define _ | |||
| #define JOIN2_X(x, y) x ## y | |||
| #define JOIN2(x, y) JOIN2_X(x, y) | |||
| #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) | |||
| #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) | |||
| #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) | |||
| #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) | |||
| #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) | |||
| #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) | |||
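| // ELEN/LMUL come from the target block above (m2 for RISCV64_ZVL256B, m8 otherwise) and | |||
| // JOIN pastes them into the intrinsic names; '_' stands in for an empty name part. | |||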
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i = 0, j = 0; | |||
| BLASLONG ix = 0,iy = 0; | |||
| BLASLONG stride_x, stride_y; | |||
| FLOAT_V_T vx0, vx1, vy0, vy1; | |||
| unsigned int gvl = 0; | |||
| if (n < 0) return(0); | |||
| if (n <= 0) return(0); | |||
| unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); | |||
| if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| for(i=0,j=0; i<n/(2*gvl); i++){ | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| @@ -79,7 +96,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| j+=gvl; | |||
| } | |||
| }else if (inc_y == 1){ | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -107,7 +123,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| ix += inc_x * gvl; | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -135,8 +150,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| iy += inc_y * gvl; | |||
| } | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| @@ -0,0 +1,101 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m2() | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||
| #define INT_V_T vint32m2_t | |||
| #define VID_V_INT __riscv_vid_v_i32m2 | |||
| #define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
| #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
| #define VBOOL_T vbool16_t | |||
| #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||
| #define INT_V_T vint64m2_t | |||
| #define VID_V_INT __riscv_vid_v_i64m2 | |||
| #define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
| #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
| #define VBOOL_T vbool32_t | |||
| #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/symm_lcopy_4.c | |||
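| // Each lane compares its position against the diagonal (offset + lane id) and the merge | |||
| // picks either the element or its symmetric counterpart, so the packed panel is filled | |||
| // from one triangle of A only. | |||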
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) | |||
| { | |||
| BLASLONG i, js, offset; | |||
| FLOAT *ao1, *ao2; | |||
| BLASLONG stride_lda = sizeof(FLOAT)*lda; | |||
| FLOAT_V_T vb, va1, va2; | |||
| VBOOL_T vbool; | |||
| INT_V_T vindex_max, vindex; | |||
| size_t vl = VSETVL_MAX; | |||
| vindex_max = VID_V_INT(vl); | |||
| for (js = n; js > 0; js -= vl, posX += vl) { | |||
| vl = VSETVL(js); | |||
| offset = posX - posY; | |||
| ao1 = a + posX + posY * lda; | |||
| ao2 = a + posY + (posX) * lda; | |||
| for (i = m; i > 0; i--, offset--) { | |||
| va2 = VLSEV_FLOAT(ao2, stride_lda, vl); | |||
| va1 = VLEV_FLOAT(ao1, vl); | |||
| // offset > (0 - vindex) ---> (offset + vindex) > 0 | |||
| vindex = VADD_VX_INT(vindex_max, offset, vl); | |||
| vbool = VMSGT_VX_INT(vindex, 0, vl); | |||
| vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); | |||
| VSEV_FLOAT(b, vb, vl); | |||
| b += vl; | |||
| ao1 += lda; | |||
| ao2++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,100 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) __riscv_vsetvl_e32m2(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m2() | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m2 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m2 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 | |||
| #define INT_V_T vint32m2_t | |||
| #define VID_V_INT __riscv_vid_v_i32m2 | |||
| #define VADD_VX_INT __riscv_vadd_vx_i32m2 | |||
| #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 | |||
| #define VBOOL_T vbool16_t | |||
| #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 | |||
| #else | |||
| #define VSETVL(n) __riscv_vsetvl_e64m2(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m2() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m2 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m2 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 | |||
| #define INT_V_T vint64m2_t | |||
| #define VID_V_INT __riscv_vid_v_i64m2 | |||
| #define VADD_VX_INT __riscv_vadd_vx_i64m2 | |||
| #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 | |||
| #define VBOOL_T vbool32_t | |||
| #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/symm_ucopy_4.c | |||
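| // Mirror of the lcopy kernel above with the strided and contiguous loads swapped, so the | |||
| // packed panel is taken from the other triangle of A. | |||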
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) | |||
| { | |||
| BLASLONG i, js, offset; | |||
| FLOAT *ao1, *ao2; | |||
| BLASLONG stride_lda = sizeof(FLOAT)*lda; | |||
| FLOAT_V_T vb, va1, va2; | |||
| VBOOL_T vbool; | |||
| INT_V_T vindex_max, vindex; | |||
| size_t vl = VSETVL_MAX; | |||
| vindex_max = VID_V_INT(vl); | |||
| for (js = n; js > 0; js -= vl, posX += vl) { | |||
| vl = VSETVL(js); | |||
| offset = posX - posY; | |||
| ao1 = a + posY + (posX + 0) * lda; | |||
| ao2 = a + posX + 0 + posY * lda; | |||
| for (i = m; i > 0; i--, offset--) { | |||
| va1 = VLSEV_FLOAT(ao1, stride_lda, vl); | |||
| va2 = VLEV_FLOAT(ao2, vl); | |||
| // offset > (0 - vindex) ---> (offset + vindex) > 0 | |||
| vindex = VADD_VX_INT(vindex_max, offset, vl); | |||
| vbool = VMSGT_VX_INT(vindex, 0, vl); | |||
| vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); | |||
| VSEV_FLOAT(b, vb, vl); | |||
| b += vl; | |||
| ao1++; | |||
| ao2 += lda; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,219 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
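| // Symmetric matrix-vector product over the lower-stored columns: for each column j < offset | |||
| // the inner loop both adds alpha*x[j]*A[j+1:m,j] into y and accumulates the dot product of | |||
| // A[j+1:m,j] with x[j+1:m], which is reduced once and folded into y[j]; the diagonal term is | |||
| // handled by the scalar update before the vector loop. | |||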
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, k; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| FLOAT temp1; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vlmax = VSETVL_MAX_M1, vl; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vlmax); | |||
| vlmax = VSETVL_MAX; | |||
| FLOAT_V_T va, vx, vy, vr; | |||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv; | |||
| if(inc_x == 1 && inc_y == 1) | |||
| { | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| y[j] += temp1 * a_ptr[j]; | |||
| i = j + 1; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| y[j] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else if(inc_x == 1) | |||
| { | |||
| jy = 0; | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| y[jy] += temp1 * a_ptr[j]; | |||
| iy = jy + inc_y; | |||
| i = j + 1; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_yv = inc_y * vl; | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| jy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else if(inc_y == 1) | |||
| { | |||
| jx = 0; | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| y[j] += temp1 * a_ptr[j]; | |||
| ix = jx + inc_x; | |||
| i = j + 1; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| y[j] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| jx += inc_x; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| jx = 0; | |||
| jy = 0; | |||
| for (j=0; j<offset; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| y[jy] += temp1 * a_ptr[j]; | |||
| ix = jx + inc_x; | |||
| iy = jy + inc_y; | |||
| i = j + 1; | |||
| vr = VFMVVF_FLOAT(0, vlmax); | |||
| for (k = (m-i); k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| inc_yv = inc_y * vl; | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); | |||
| y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,37 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -99,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -110,8 +116,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| @@ -144,8 +150,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -155,8 +161,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| @@ -241,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < m){ | |||
| gvl = VSETVL(m-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -252,8 +258,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| @@ -0,0 +1,216 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() | |||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e32m8() | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT __riscv_vle32_v_f32m8 | |||
| #define VSEV_FLOAT __riscv_vse32_v_f32m8 | |||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 | |||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() | |||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
| #define VSETVL_MAX __riscv_vsetvlmax_e64m8() | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT __riscv_vle64_v_f64m8 | |||
| #define VSEV_FLOAT __riscv_vse64_v_f64m8 | |||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
| #define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu | |||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
| #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 | |||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 | |||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 | |||
| #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 | |||
| #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 | |||
| #define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, k; | |||
| BLASLONG ix,iy; | |||
| BLASLONG jx,jy; | |||
| FLOAT temp1; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| size_t vl_max = VSETVL_MAX_M1, vl; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, vl_max); | |||
| vl_max = VSETVL_MAX; | |||
| FLOAT_V_T va, vx, vy, vr; | |||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv; | |||
| BLASLONG m1 = m - offset; | |||
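| /* Upper-triangle variant: only the last "offset" columns (j = m-offset .. m-1) are processed; | |||
| column j updates rows 0..j-1 plus the diagonal term. Four paths cover the stride combinations. */ | |||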
| if(inc_x == 1 && inc_y == 1) | |||
| { | |||
| a_ptr += m1 * lda; | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| i = 0; | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
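| /* For rows i < j: y[i] += alpha*x[j]*A[i][j], while vr accumulates A[i][j]*x[i] for the | |||
| reflected update of y[j]. */ | |||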
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| } | |||
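| /* Reduce the accumulator, then add the diagonal and reflected contributions to y[j]. */ | |||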
| v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); | |||
| y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| else if(inc_x == 1) | |||
| { | |||
| jy = m1 * inc_y; | |||
| a_ptr += m1 * lda; | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[j]; | |||
| iy = 0; | |||
| i = 0; | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_yv = inc_y * vl; | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLEV_FLOAT(&x[i], vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); | |||
| y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| jy += inc_y; | |||
| } | |||
| } | |||
| else if(inc_y == 1) | |||
| { | |||
| jx = m1 * inc_x; | |||
| a_ptr += m1 * lda; | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| ix = 0; | |||
| i = 0; | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| vy = VLEV_FLOAT(&y[i], vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSEV_FLOAT(&y[i], vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); | |||
| y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| jx += inc_x; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| jx = m1 * inc_x; | |||
| jy = m1 * inc_y; | |||
| a_ptr += m1 * lda; | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| for (j=m1; j<m; j++) | |||
| { | |||
| temp1 = alpha * x[jx]; | |||
| ix = 0; | |||
| iy = 0; | |||
| i = 0; | |||
| vr = VFMVVF_FLOAT(0, vl_max); | |||
| for (k = j; k > 0; k -= vl, i += vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| inc_xv = inc_x * vl; | |||
| inc_yv = inc_y * vl; | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, vl); | |||
| va = VLEV_FLOAT(&a_ptr[i], vl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, vl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy, vl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, vl); | |||
| vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); | |||
| y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); | |||
| a_ptr += lda; | |||
| jx += inc_x; | |||
| jy += inc_y; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -27,39 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEV_FLOAT vlse32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEV_FLOAT vsse32_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) | |||
| #define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) | |||
| #define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEV_FLOAT vlse64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEV_FLOAT vsse64_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #ifdef RISCV_0p10_INTRINSICS | |||
| #define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) | |||
| #else | |||
| #define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) | |||
| #endif | |||
| #define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) | |||
| #define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) | |||
| #define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -101,8 +107,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -112,8 +118,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -145,8 +151,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -156,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -240,8 +246,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 = EXTRACT_FLOAT(v_res); | |||
| if(i < j){ | |||
| gvl = VSETVL(j-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -251,8 +257,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += VFMVFS_FLOAT(v_res); | |||
| v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); | |||
| temp2 += EXTRACT_FLOAT(v_res); | |||
| } | |||
| } | |||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | |||