Merge develop into 0.3.0 for release 0.3.16 (tag v0.3.16)
| @@ -43,11 +43,6 @@ jobs: | |||
| - name: Update Homebrew | |||
| if: github.event_name != 'pull_request' | |||
| run: brew update || true | |||
| - name: unlink installed gcc to allow updating | |||
| run: | | |||
| brew unlink gcc@8 | |||
| brew unlink gcc@9 | |||
| - name: Install prerequisites | |||
| run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | |||
| @@ -194,3 +194,6 @@ In chronological order: | |||
| * PingTouGe Semiconductor Co., Ltd. | |||
| * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910 | |||
| * River Dillon <oss@outerpassage.net> | |||
| * [2021-07-10] fix compilation with musl libc | |||
| @@ -1,4 +1,52 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.16 | |||
| 11-Jul-2021 | |||
| common: | |||
| - drastically reduced the stack size requirements for running the LAPACK | |||
| testsuite (Reference-LAPACK PR 553) | |||
| - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK | |||
| PR 564) | |||
| - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode | |||
| - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N | |||
| and DGEMV_N, for small input sizes and consecutive arguments | |||
| - improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes | |||
| by disabling multithreading | |||
| - fixed installing with BSD versions of the "install" utility | |||
| RISCV: | |||
| - fixed the implementation of xIMIN | |||
| - improved the performance of DSDOT | |||
| - fixed linking of the tests on C910V with current vendor gcc | |||
| POWER: | |||
| - fixed SBGEMM computation for some odd value inputs | |||
| - fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5 | |||
| x86_64: | |||
| - improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus | |||
| - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc | |||
| versions | |||
| - fixed compilation with MS Visual Studio versions older than 2017 | |||
| - fixed macro name collision with winnt.h from the latest Win10 SDK | |||
| - added cpu type autodetection for Intel Ice Lake SP | |||
| - fixed cpu type autodetection for Intel Tiger Lake | |||
| - added cpu type autodetection for recent Centaur/Zhaoxin models | |||
| - fixed compilation with musl libc | |||
| ARM64: | |||
| - fixed compilation with gcc/gfortran on the Apple M1 | |||
| - fixed linking of the tests on FreeBSD | |||
| - fixed missing restore of a register in the recently rewritten DNRM2 kernel | |||
| for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g. | |||
| DGEEV | |||
| - added compiler optimization flags for the EMAG8180 | |||
| - added initial support for Cortex A55 | |||
| ARM: | |||
| - fixed linking of the tests on FreeBSD | |||
| ==================================================================== | |||
| Version 0.3.15 | |||
| 2-May-2021 | |||
| @@ -57,6 +57,28 @@ endif | |||
| endif | |||
| endif | |||
| # Use a53 tunings because a55 is only available in GCC>=8.1 | |||
| ifeq ($(CORE), CORTEXA55) | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| ifeq ($(GCCVERSIONGTEQ8), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), THUNDERX) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -107,4 +129,13 @@ FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq ($(CORE), EMAG8180) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=emag | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=emag | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -74,17 +74,17 @@ endif | |||
| ifneq ($(OSNAME), AIX) | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| endif | |||
| #for install static library | |||
| ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| @@ -92,7 +92,7 @@ endif | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| @@ -333,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) | |||
| @@ -380,6 +381,12 @@ ifeq ($(OSNAME), AIX) | |||
| EXTRALIB += -lm | |||
| endif | |||
| ifeq ($(OSNAME), FreeBSD) | |||
| ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) | |||
| EXTRALIB += -lm | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| NEED_PIC = 0 | |||
| NO_EXPRECISION = 1 | |||
| @@ -619,6 +626,7 @@ DYNAMIC_CORE += CORTEXA57 | |||
| DYNAMIC_CORE += CORTEXA72 | |||
| DYNAMIC_CORE += CORTEXA73 | |||
| DYNAMIC_CORE += NEOVERSEN1 | |||
| DYNAMIC_CORE += CORTEXA55 | |||
| DYNAMIC_CORE += FALKOR | |||
| DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| @@ -1,6 +1,6 @@ | |||
| # COMPILER_PREFIX = mingw32- | |||
| ifndef DYNAMIC_ARCH | |||
| ifneq ($(DYNAMIC_ARCH),1) | |||
| ADD_CPUFLAGS = 1 | |||
| else | |||
| ifdef TARGET_CORE | |||
| @@ -9,7 +9,7 @@ endif | |||
| endif | |||
| ifndef DYNAMIC_ARCH | |||
| ifneq ($(DYNAMIC_ARCH),1) | |||
| ADD_CPUFLAGS = 1 | |||
| else | |||
| ifdef TARGET_CORE | |||
| @@ -27,7 +27,7 @@ We provide official binary packages for the following platform: | |||
| * Windows x86/x86_64 | |||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). | |||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases). | |||
| ## Installation from Source | |||
| @@ -92,6 +92,7 @@ CORTEXA57 | |||
| CORTEXA72 | |||
| CORTEXA73 | |||
| NEOVERSEN1 | |||
| CORTEXA55 | |||
| EMAG8180 | |||
| FALKOR | |||
| THUNDERX | |||
| @@ -47,6 +47,7 @@ environment: | |||
| install: | |||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | |||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
| - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||
| - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | |||
| - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | |||
| @@ -137,3 +137,31 @@ jobs: | |||
| source /opt/intel/oneapi/setvars.sh | |||
| make CC=/usr/local/opt/llvm/bin/clang FC=ifort | |||
| - job: OSX_NDK_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install --cask android-ndk | |||
| export ANDROID_NDK_HOME=/usr/local/share/android-ndk | |||
| make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | |||
| - job: ALPINE_MUSL | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||
| && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 | |||
| alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | |||
| alpine make DYNAMIC_ARCH=1 BINARY=64 | |||
| alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install | |||
| alpine ls -l mytestdir/include | |||
| alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c | |||
| alpine echo "#include <openblas_config.h>" >>test_install.c | |||
| alpine echo "int main(){" >> test_install.c | |||
| alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c | |||
| alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install | |||
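For reference, the echo lines above assemble the following complete smoke test (shown here reformatted); the point is simply that it must compile and link against the freshly installed headers and library inside the musl chroot:

```c
/* tests that inclusion of openblas_config.h works with musl */
#include <openblas_config.h>

int main(void) {
    cpu_set_t *cpu_set = NULL;  /* cpu_set_t must be reachable through openblas_config.h */
    (void)cpu_set;
    return 0;
}
```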
| @@ -72,13 +72,17 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a,*work; | |||
| FLOAT wkopt[4]; | |||
| blasint *ipiv; | |||
| blasint m, i, j, info,lwork; | |||
| blasint m, i, j, l, info,lwork; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| double time1; | |||
| double time1,timeg; | |||
| char *p; | |||
| char btest = 'I'; | |||
| argc--;argv++; | |||
| @@ -86,6 +90,9 @@ int main(int argc, char *argv[]){ | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_TEST"))) btest=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); | |||
| @@ -124,32 +131,41 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " SIZE FLops Time Lwork\n"); | |||
| for(m = from; m <= to; m += step){ | |||
| timeg = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| GETRF (&m, &m, a, &m, ipiv, &info); | |||
| for (l = 0; l < loops; l++) { | |||
| if (btest == 'F') begin(); | |||
| GETRF (&m, &m, a, &m, ipiv, &info); | |||
| if (btest == 'F') { | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| if (info) { | |||
| fprintf(stderr, "Matrix is not singular .. %d\n", info); | |||
| exit(1); | |||
| } | |||
| begin(); | |||
| if (btest == 'I') begin(); | |||
| lwork = -1; | |||
| GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); | |||
| lwork = (blasint)wkopt[0]; | |||
| GETRI(&m, a, &m, ipiv, work, &lwork, &info); | |||
| end(); | |||
| if (btest == 'I') end(); | |||
| if (info) { | |||
| fprintf(stderr, "failed compute inverse matrix .. %d\n", info); | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| if (btest == 'I') | |||
| timeg += getsec(); | |||
| } // loops | |||
| time1 = timeg/(double)loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops : %10.2f Sec : %d\n", | |||
| COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); | |||
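The benchmark changes in this and the following hunks all follow one pattern: run the timed region OPENBLAS_LOOPS times and report the average. A minimal self-contained sketch of that pattern, using clock_gettime in place of the harness's begin()/end()/getsec(), and atoi where the diff reads the environment value through *p:

```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static double now_sec(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void) {
    int loops = 1;
    char *p = getenv("OPENBLAS_LOOPS");
    if (p) loops = atoi(p);
    double timeg = 0.;
    for (int l = 0; l < loops; l++) {
        double t0 = now_sec();
        /* timed kernel call would go here */
        timeg += now_sec() - t0;
    }
    printf("average: %g sec over %d loops\n", timeg / loops, loops);
    return 0;
}
```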
| @@ -72,17 +72,21 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b; | |||
| blasint *ipiv; | |||
| blasint m, i, j, info; | |||
| blasint m, i, j, l, info; | |||
| blasint unit = 1; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| FLOAT maxerr; | |||
| double time1, time2; | |||
| double time1, time2, timeg1,timeg2; | |||
| char *p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| @@ -110,9 +114,9 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); | |||
| for(m = from; m <= to; m += step){ | |||
| timeg1 = timeg2 = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) { | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < m * COMPSIZE; i++){ | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -138,7 +142,7 @@ int main(int argc, char *argv[]){ | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| timeg1 += getsec(); | |||
| begin(); | |||
| @@ -151,8 +155,10 @@ int main(int argc, char *argv[]){ | |||
| exit(1); | |||
| } | |||
| time2 = getsec(); | |||
| timeg2 += getsec(); | |||
| } //loops | |||
| time1=timeg1/(double)loops; | |||
| time2=timeg2/(double)loops; | |||
| maxerr = 0.; | |||
| for(i = 0; i < m; i++){ | |||
| @@ -99,14 +99,15 @@ int main(int argc, char *argv[]){ | |||
| char *p; | |||
| char btest = 'F'; | |||
| blasint m, i, j, info, uplos=0; | |||
| double flops; | |||
| blasint m, i, j, l, info, uplos=0; | |||
| double flops = 0.; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| double time1; | |||
| double time1, timeg; | |||
| argc--;argv++; | |||
| @@ -119,6 +120,8 @@ int main(int argc, char *argv[]){ | |||
| if ((p = getenv("OPENBLAS_TEST"))) btest=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| @@ -129,19 +132,21 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| for(m = from; m <= to; m += step){ | |||
| for(m = from; m <= to; m += step){ | |||
| timeg=0.; | |||
| for (l = 0; l < loops; l++) { | |||
| #ifndef COMPLEX | |||
| if (uplos & 1) { | |||
| for (j = 0; j < m; j++) { | |||
| for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; | |||
| } | |||
| } else { | |||
| for (j = 0; j < m; j++) { | |||
| for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; | |||
| } | |||
| } | |||
| @@ -192,8 +197,8 @@ int main(int argc, char *argv[]){ | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; | |||
| if ( btest == 'F') | |||
| timeg += getsec(); | |||
| if ( btest == 'S' ) | |||
| { | |||
| @@ -214,9 +219,7 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, "Potrs info = %d\n", info); | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; | |||
| timeg += getsec(); | |||
| } | |||
| if ( btest == 'I' ) | |||
| @@ -232,11 +235,17 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, "Potri info = %d\n", info); | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; | |||
| timeg += getsec(); | |||
| } | |||
| } // loops | |||
| time1 = timeg/(double)loops; | |||
| if ( btest == 'F') | |||
| flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; | |||
| if ( btest == 'S') | |||
| flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; | |||
| if ( btest == 'I') | |||
| flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; | |||
| fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); | |||
| @@ -46,14 +46,17 @@ int main(int argc, char *argv[]){ | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| blasint m, i, j; | |||
| blasint m, i, j, l; | |||
| blasint inc_x= 1; | |||
| blasint inc_y= 1; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| double time1; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| @@ -85,8 +88,9 @@ int main(int argc, char *argv[]){ | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| @@ -107,8 +111,10 @@ int main(int argc, char *argv[]){ | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += getsec(); | |||
| } // loops | |||
| time1 = timeg/(double)loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); | |||
| @@ -56,17 +56,20 @@ int main(int argc, char *argv[]){ | |||
| char uplo='U'; | |||
| char trans='N'; | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; | |||
| blasint m, i, j; | |||
| blasint m, i, j, l; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; | |||
| double time1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| @@ -95,9 +98,12 @@ int main(int argc, char *argv[]){ | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for(l = 0; l < loops; l++) { | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < m * COMPSIZE; i++){ | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -111,8 +117,10 @@ int main(int argc, char *argv[]){ | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += getsec(); | |||
| } //loops | |||
| time1 = timeg / (double)loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); | |||
| @@ -44,7 +44,7 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| endif () | |||
| @@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| set(TARGET "BARCELONA") | |||
| endif () | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") | |||
| set(TARGET "ARMV7") | |||
| endif () | |||
| endif () | |||
| @@ -254,6 +254,19 @@ function(GenerateNamedObjects sources_in) | |||
| # now add the object and set the defines | |||
| set(obj_defines ${defines_in}) | |||
| list(FIND obj_defines "RC" def_idx) | |||
| if (${def_idx} GREATER -1) | |||
| # list(REMOVE_AT ${obj_defines} ${def_idx}) | |||
| list (REMOVE_ITEM obj_defines "RC") | |||
| list(APPEND obj_defines "RC=RC") | |||
| endif () | |||
| list(FIND obj_defines "CR" def_idx) | |||
| if (${def_idx} GREATER -1) | |||
| # list(REMOVE_AT ${obj_defines} ${def_idx}) | |||
| list (REMOVE_ITEM obj_defines "CR") | |||
| list(APPEND obj_defines "CR=CR") | |||
| endif () | |||
| if (use_cblas) | |||
| set(obj_name "cblas_${obj_name}") | |||
| list(APPEND obj_defines "CBLAS") | |||
| @@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||
| @@ -36,6 +36,7 @@ size_t length=sizeof(value); | |||
| #define CPU_ARMV8 1 | |||
| // Arm | |||
| #define CPU_CORTEXA53 2 | |||
| #define CPU_CORTEXA55 14 | |||
| #define CPU_CORTEXA57 3 | |||
| #define CPU_CORTEXA72 4 | |||
| #define CPU_CORTEXA73 5 | |||
| @@ -67,7 +68,8 @@ static char *cpuname[] = { | |||
| "EMAG8180", | |||
| "NEOVERSEN1", | |||
| "THUNDERX3T110", | |||
| "VORTEX" | |||
| "VORTEX", | |||
| "CORTEXA55" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -84,7 +86,8 @@ static char *cpuname_lower[] = { | |||
| "emag8180", | |||
| "neoversen1", | |||
| "thunderx3t110", | |||
| "vortex" | |||
| "vortex", | |||
| "cortexa55" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -161,6 +164,8 @@ int detect(void) | |||
| return CPU_CORTEXA73; | |||
| else if (strstr(cpu_part, "0xd0c")) | |||
| return CPU_NEOVERSEN1; | |||
| else if (strstr(cpu_part, "0xd05")) | |||
| return CPU_CORTEXA55; | |||
| } | |||
| // Qualcomm | |||
| else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) | |||
| @@ -281,6 +286,7 @@ void get_cpuconfig(void) | |||
| { | |||
| case CPU_CORTEXA53: | |||
| case CPU_CORTEXA55: | |||
| printf("#define %s\n", cpuname[d]); | |||
| // Fall-through | |||
| case CPU_ARMV8: | |||
| @@ -283,6 +283,7 @@ int get_vendor(void){ | |||
| if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; | |||
| if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; | |||
| if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor, " Shanghai ")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; | |||
| if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | |||
| if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | |||
| @@ -1398,6 +1399,17 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 10: // Ice Lake SP | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 7: // family 6 exmodel 7 | |||
| @@ -1620,7 +1632,9 @@ int get_cpuname(void){ | |||
| case 0x6: | |||
| return CPUTYPE_NANO; | |||
| break; | |||
| case 0x7: | |||
| return CPUTYPE_NEHALEM; | |||
| break; | |||
| } | |||
| return CPUTYPE_VIAC3; | |||
| } | |||
| @@ -2112,7 +2126,22 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| #endif | |||
| if (model == 10) | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| break; | |||
| case 7: | |||
| if (model == 10) | |||
| @@ -2135,13 +2164,13 @@ int get_coretype(void){ | |||
| case 8: | |||
| if (model == 12) { // Tiger Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 14) { // Kaby Lake | |||
| if(support_avx()) | |||
| @@ -2259,6 +2288,9 @@ int get_coretype(void){ | |||
| case 0x6: | |||
| return CORE_NANO; | |||
| break; | |||
| case 0x7: | |||
| return CORE_NEHALEM; | |||
| break; | |||
| } | |||
| return CORE_VIAC3; | |||
| } | |||
| @@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -404,6 +404,7 @@ static int get_vendor(void){ | |||
| if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | |||
| if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | |||
| if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | |||
| if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
| @@ -621,6 +622,22 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 10) { | |||
| // Ice Lake SP | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| case 7: | |||
| if (model == 10) // Goldmont Plus | |||
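Selecting COOPERLAKE kernels on Ice Lake SP hinges on the avx512-bf16 probe. To the best of my knowledge that capability is advertised in CPUID leaf 7, subleaf 1, EAX bit 5; a minimal sketch follows (the library's real support_avx512_bf16() must additionally confirm OS-enabled AVX-512 state via XGETBV):

```c
#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx) && (eax & (1u << 5)))
        printf("AVX512_BF16 available\n");
    else
        printf("AVX512_BF16 not available\n");
    return 0;
}
```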
| @@ -808,6 +825,9 @@ static gotoblas_t *get_coretype(void){ | |||
| switch (family) { | |||
| case 0x6: | |||
| return &gotoblas_NANO; | |||
| break; | |||
| case 0x7: | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| @@ -99,6 +99,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #else | |||
| #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| #define gotoblas_CORTEXA55 gotoblas_ARMV8 | |||
| #endif | |||
| #else | |||
| extern gotoblas_t gotoblas_CORTEXA53; | |||
| extern gotoblas_t gotoblas_CORTEXA57; | |||
| @@ -111,11 +116,12 @@ extern gotoblas_t gotoblas_TSV110; | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 12 | |||
| #define NUM_CORETYPES 13 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -142,6 +148,7 @@ static char *corename[] = { | |||
| "emag8180", | |||
| "neoversen1", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "unknown" | |||
| }; | |||
| @@ -158,6 +165,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -189,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 9: return (&gotoblas_EMAG8180); | |||
| case 10: return (&gotoblas_NEOVERSEN1); | |||
| case 11: return (&gotoblas_THUNDERX3T110); | |||
| case 12: return (&gotoblas_CORTEXA55); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -247,6 +256,8 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_CORTEXA73; | |||
| case 0xd0c: // Neoverse N1 | |||
| return &gotoblas_NEOVERSEN1; | |||
| case 0xd05: // Cortex A55 | |||
| return &gotoblas_CORTEXA55; | |||
| } | |||
| break; | |||
| case 0x42: // Broadcom | |||
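With the table entries above in place, a DYNAMIC_ARCH build can both report and force the new core. A quick check using OpenBLAS's public openblas_get_corename():

```c
#include <stdio.h>
#include <cblas.h>

int main(void) {
    printf("selected core: %s\n", openblas_get_corename());
    return 0;
}
```

Running with OPENBLAS_CORETYPE=CORTEXA55 exercises the force_coretype() path added above; without it, the 0xd05 part-id match selects gotoblas_CORTEXA55 automatically.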
| @@ -1702,7 +1702,6 @@ inline int atoi(const char *str) { return 0; } | |||
| #include <sys/sysinfo.h> | |||
| #include <sched.h> | |||
| #include <errno.h> | |||
| #include <linux/unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <sys/time.h> | |||
| #include <sys/resource.h> | |||
| @@ -314,11 +314,11 @@ if ($link ne "") { | |||
| $link =~ s/\-Y\sP\,/\-Y/g; | |||
| $link =~ s/\-R\s*/\-rpath\@/g; | |||
| $link =~ s/\-R\s*/\-rpath\%/g; | |||
| $link =~ s/\-rpath\s+/\-rpath\@/g; | |||
| $link =~ s/\-rpath\s+/\-rpath\%/g; | |||
| $link =~ s/\-rpath-link\s+/\-rpath-link\@/g; | |||
| $link =~ s/\-rpath-link\s+/\-rpath-link\%/g; | |||
| @flags = split(/[\s\,\n]/, $link); | |||
| # remove leading and trailing quotes from each flag. | |||
| @@ -344,13 +344,13 @@ if ($link ne "") { | |||
| } | |||
| if ($flags =~ /^\-rpath\@/) { | |||
| $flags =~ s/\@/\,/g; | |||
| if ($flags =~ /^\-rpath\%/) { | |||
| $flags =~ s/\%/\,/g; | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /^\-rpath-link\@/) { | |||
| $flags =~ s/\@/\,/g; | |||
| if ($flags =~ /^\-rpath-link\%/) { | |||
| $flags =~ s/\%/\,/g; | |||
| $linker_L .= "-Wl,". $flags . " " ; | |||
| } | |||
| if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { | |||
| @@ -1159,6 +1159,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA55 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "CORTEXA55" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DCORTEXA55 " \ | |||
| "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||
| "-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "cortexa55" | |||
| #define CORENAME "CORTEXA55" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_FALKOR | |||
| #define FORCE | |||
| @@ -49,6 +49,8 @@ | |||
| #define ERROR_NAME "QGEMM " | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "DGEMM " | |||
| #elif defined(BFLOAT16) | |||
| #define ERROR_NAME "SBGEMM " | |||
| #else | |||
| #define ERROR_NAME "SGEMM " | |||
| #endif | |||
| @@ -124,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB, | |||
| #ifdef SMP | |||
| double MNK; | |||
| #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) | |||
| #ifndef COMPLEX | |||
| #ifdef XDOUBLE | |||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||
| @@ -142,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB, | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) | |||
| int nodes; | |||
| @@ -202,6 +202,11 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (alpha == ZERO) return; | |||
| if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { | |||
| GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (m == 0 || n == 0) return; | |||
| if (alpha == 0.) return; | |||
| if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { | |||
| GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
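These two hunks add the direct small-input path from the changelog: when the strides are unit and m*n falls below a small threshold (2304x or 2048x GEMM_MULTITHREAD_THRESHOLD for GEMV and GER respectively), the kernel is called immediately, skipping profiling and threading setup. A tiny rank-1 update that lands in the new path (values assumed, just for checking):

```c
#include <stdio.h>
#include <cblas.h>

int main(void) {
    float x[2] = {1, 2}, y[3] = {3, 4, 5};
    float a[6] = {0};                        /* 2x3, row-major */
    cblas_sger(CblasRowMajor, 2, 3, 1.0f, x, 1, y, 1, a, 3);
    for (int i = 0; i < 6; i++) printf("%g ", a[i]);
    printf("\n");                            /* expect: 3 4 5 6 8 10 */
    return 0;
}
```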
| @@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| args.nthreads = num_cpu_avail(4); | |||
| #ifndef DOUBLE | |||
| if (args.m*args.n < 40000) | |||
| #else | |||
| if (args.m*args.n < 10000) | |||
| #endif | |||
| args.nthreads=1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
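The GETRF hunk above caps threading for small factorizations: below 40000 matrix elements in single precision (10000 in double) the cost of fanning work out to threads dominates, so nthreads is pinned to 1. For example, a 150x150 SGETRF (22500 elements) now stays single-threaded; a minimal LAPACKE driver to try it:

```c
#include <stdlib.h>
#include <lapacke.h>

int main(void) {
    const int n = 150;                       /* n*n = 22500 < 40000: single-threaded */
    float *a = malloc(sizeof(float) * n * n);
    lapack_int *ipiv = malloc(sizeof(lapack_int) * n);
    for (int i = 0; i < n * n; i++)
        a[i] = (float)rand() / RAND_MAX + (i % (n + 1) == 0);  /* boost the diagonal */
    lapack_int info = LAPACKE_sgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
    free(a); free(ipiv);
    return info != 0;
}
```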
| @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| #ifndef DOUBLE | |||
| if (args.n <128) | |||
| #else | |||
| if (args.n <64) | |||
| #endif | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| @@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| if (args.n < 180) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| @@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.m*args.n <10000) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
| #ifdef SMP | |||
| args.common = NULL; | |||
| #ifndef DOUBLE | |||
| if (args.n < 64) | |||
| #else | |||
| if (args.n < 64) | |||
| #endif | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| @@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ | |||
| #ifdef SMP | |||
| args.nthreads = num_cpu_avail(4); | |||
| #ifndef DOUBLE | |||
| if (args.n < 200) | |||
| #else | |||
| if (args.n < 150) | |||
| #endif | |||
| args.nthreads=1; | |||
| else | |||
| #endif | |||
| args.nthreads = num_cpu_avail(4); | |||
| if (args.nthreads == 1) { | |||
| #endif | |||
| @@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order, | |||
| FUNCTION_PROFILE_START(); | |||
| if (incx == 1 && n <100) { | |||
| blasint i; | |||
| if (uplo==0) { | |||
| for (i = 0; i < n; i++){ | |||
| if (x[i] != ZERO) { | |||
| AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); | |||
| } | |||
| a += i + 1; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++){ | |||
| if (x[i] != ZERO) { | |||
| AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); | |||
| } | |||
| a += n - i; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| if (incx < 0 ) x -= (n - 1) * incx; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| @@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (alpha == ZERO) return; | |||
| if (incx == 1 && incy == 1 && n < 50) { | |||
| blasint i; | |||
| if (!uplo) { | |||
| for (i = 0; i < n; i++){ | |||
| AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); | |||
| AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); | |||
| a += i + 1; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++){ | |||
| AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); | |||
| AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); | |||
| a += n - i; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| #if 1 | |||
| if (incx == 1 && n < 100) { | |||
| BLASLONG i; | |||
| if (uplo == 0) { | |||
| for (i = 0; i < n; i++){ | |||
| if (x[i] != ZERO) { | |||
| AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); | |||
| } | |||
| a += lda; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++){ | |||
| if (x[i] != ZERO) { | |||
| AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); | |||
| } | |||
| a += 1 + lda; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| if (incx < 0 ) x -= (n - 1) * incx; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
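The unrolled incx==1, n<100 branch above builds the symmetric rank-1 update column by column out of AXPY calls instead of going through buffer allocation and the blocked kernel. The result is the ordinary SYR update, A := alpha*x*x' on one triangle; a 3x3 check through the public interface:

```c
#include <stdio.h>
#include <cblas.h>

int main(void) {
    float x[3] = {1, 2, 3};
    float a[9] = {0};                        /* 3x3, column-major, lda = 3 */
    cblas_ssyr(CblasColMajor, CblasUpper, 3, 1.0f, x, 1, a, 3);
    for (int j = 0; j < 3; j++, printf("\n"))
        for (int i = 0; i <= j; i++)
            printf("%g ", a[i + j * 3]);     /* prints 1 / 2 4 / 3 6 9 */
    return 0;
}
```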
| @@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||
| IDEBUG_START; | |||
| if (incx == 1 && incy == 1 && n < 100) { | |||
| blasint i; | |||
| if (!uplo) { | |||
| for (i = 0; i < n; i++){ | |||
| AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); | |||
| AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); | |||
| a += lda; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++){ | |||
| AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); | |||
| AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); | |||
| a += 1 + lda; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| FUNCTION_PROFILE_START(); | |||
| if (incx < 0 ) x -= (n - 1) * incx; | |||
| @@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr | |||
| #endif | |||
| args.common = NULL; | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| if (args.n < 100) | |||
| #else | |||
| if (args.n < 200) | |||
| #endif | |||
| #else | |||
| if (args.n < 65) | |||
| #endif | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(3); | |||
| if (args.nthreads == 1) { | |||
| @@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| if (n == 0) return; | |||
| if (incx == 1 && trans == 0 && n < 50) { | |||
| buffer = NULL; | |||
| (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
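Similarly for TRSV: with unit stride, no transpose and n<50 the solver above dispatches straight to the unblocked kernel with a NULL work buffer. Solving a small upper-triangular system through the public interface (values assumed):

```c
#include <stdio.h>
#include <cblas.h>

int main(void) {
    float a[9] = {2, 0, 0,  1, 2, 0,  1, 1, 2};  /* upper-triangular, column-major */
    float x[3] = {4, 3, 2};                      /* right-hand side, solved in place */
    cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, 3, a, 3, x, 1);
    printf("%g %g %g\n", x[0], x[1], x[2]);      /* expect: 1 1 1 */
    return 0;
}
```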
| @@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO | |||
| if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; | |||
| if (incx == 1 && n < 50) { | |||
| blasint i; | |||
| if (!uplo) { | |||
| for (i = 0; i < n; i++){ | |||
| if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { | |||
| AXPYU_K(i + 1, 0, 0, | |||
| alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], | |||
| alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], | |||
| x, 1, a, 1, NULL, 0); | |||
| } | |||
| a += lda; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++){ | |||
| if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { | |||
| AXPYU_K(n - i, 0, 0, | |||
| alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], | |||
| alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], | |||
| x + i * 2, 1, a, 1, NULL, 0); | |||
| } | |||
| a += 2 + lda; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| if (n == 0) return; | |||
| if (incx == 1 && trans == 0 && n < 50) { | |||
| buffer = NULL; | |||
| (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); | |||
| return; | |||
| } | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -818,6 +818,8 @@ ifeq ($(OS), AIX) | |||
| m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ | |||
| rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s | |||
| else ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ | |||
| endif | |||
| @@ -828,6 +830,8 @@ ifeq ($(OS), AIX) | |||
| m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ | |||
| rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s | |||
| else ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ | |||
| endif | |||
| @@ -838,6 +842,8 @@ ifeq ($(OS), AIX) | |||
| m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ | |||
| rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s | |||
| else ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ | |||
| endif | |||
| @@ -848,6 +854,8 @@ ifeq ($(OS), AIX) | |||
| m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ | |||
| rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s | |||
| else ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ | |||
| endif | |||
| @@ -1044,6 +1052,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ | |||
| rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| @@ -1054,6 +1064,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| @@ -1064,6 +1076,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||
| endif | |||
| @@ -1074,6 +1088,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||
| endif | |||
| @@ -1084,6 +1100,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| @@ -1094,6 +1112,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| @@ -1104,6 +1124,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||
| endif | |||
| @@ -1114,6 +1136,8 @@ ifeq ($(OS), AIX) | |||
| m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s | |||
| else ifeq ($(CORE), SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| endif | |||
| @@ -1187,29 +1211,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | |||
| ifeq ($(CORE),SANDYBRIDGE) | |||
| $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| else | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | |||
| endif | |||
| endif | |||
| @@ -0,0 +1,196 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||
| else | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| endif | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -321,7 +321,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| : "cc", | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", "x6", | |||
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" | |||
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF | |||
| ); | |||
| } | |||
| @@ -54,3 +54,8 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||
| @@ -16,11 +16,11 @@ ZASUMKERNEL = zasum_ppc440.S | |||
| SAXPYKERNEL = axpy_ppc440.S | |||
| DAXPYKERNEL = axpy_ppc440.S | |||
| ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||
| else | |||
| CAXPYKERNEL = zaxpy_ppc440.S | |||
| ZAXPYKERNEL = zaxpy_ppc440.S | |||
| else | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||
| endif | |||
| SDOTKERNEL = dot_ppc440.S | |||
| @@ -15,8 +15,13 @@ ZASUMKERNEL = zasum_ppc440.S | |||
| SAXPYKERNEL = axpy_ppc440.S | |||
| DAXPYKERNEL = axpy_ppc440.S | |||
| ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| CAXPYKERNEL = zaxpy_ppc440.S | |||
| ZAXPYKERNEL = zaxpy_ppc440.S | |||
| else | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||
| endif | |||
| SDOTKERNEL = dot_ppc440.S | |||
| DDOTKERNEL = dot_ppc440.S | |||
| @@ -159,6 +159,11 @@ | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER3 | |||
| #define PREFETCHSIZE_A 16 | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER4 | |||
| #define PREFETCHSIZE_A 16 | |||
| #define PREFETCHSIZE_C 16 | |||
| @@ -124,6 +124,11 @@ | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER3 | |||
| #define PREFETCHSIZE_A 16 | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER4 | |||
| #define PREFETCHSIZE_A 48 | |||
| #define PREFETCHSIZE_C 16 | |||
| @@ -49,17 +49,11 @@ typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||
| vector char mask = | |||
| { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, | |||
| 0xf | |||
| }; | |||
| /* | |||
| * The BFLOAT16 xvbf16ger2pp instruction needs a 4×2 matrix of | |||
| * bfloat16 floating-point values as input, hence this | |||
| * merging of the A and B matrices. | |||
| */ | |||
| #define MERGE_ROW(x) vec_perm(x, x, mask) | |||
| #define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) | |||
| #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) | |||
| @@ -104,6 +98,30 @@ vector char mask = | |||
| rowC = (v2sf_t *) &CO[7* ldc+J]; \ | |||
| rowC[0] += result[6] * alpha; | |||
| #define SAVE4x2_ACC_SCALAR(ACC) { \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| res[0] = result[0] * alpha; \ | |||
| res[1] = result[1] * alpha; \ | |||
| res[2] = result[2] * alpha; \ | |||
| res[3] = result[3] * alpha; \ | |||
| CO[0 * ldc] += res[0][0]; \ | |||
| CO[1 * ldc] += res[1][0]; \ | |||
| CO[2 * ldc] += res[2][0]; \ | |||
| CO[3 * ldc] += res[3][0]; \ | |||
| } | |||
| #define SAVE4x2_ACC1_SCALAR(ACC) { \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC); \ | |||
| res[0] = result[0] * alpha; \ | |||
| res[1] = result[1] * alpha; \ | |||
| res[2] = result[2] * alpha; \ | |||
| res[3] = result[3] * alpha; \ | |||
| CO[4 * ldc] += res[0][0]; \ | |||
| CO[5 * ldc] += res[1][0]; \ | |||
| CO[6 * ldc] += res[2][0]; \ | |||
| CO[7 * ldc] += res[3][0]; \ | |||
| } | |||
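The two *_SCALAR save macros added here serve the kernel's single-row (m == 1) remainder: the MMA accumulator still disassembles into four vector rows, but only lane 0 of each row is live, one scalar per column of C. A scalar model of SAVE4x2_ACC_SCALAR, with result[][] standing for the disassembled accumulator:

/* Lane 0 of each scaled accumulator row lands in one column of C. */
void save4x2_acc_scalar(float *CO, long ldc, float result[4][4], float alpha)
{
    for (int r = 0; r < 4; r++)
        CO[r * ldc] += result[r][0] * alpha;   /* res[r] = result[r] * alpha */
}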
| #define MMA __builtin_mma_xvbf16ger2pp | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| @@ -179,8 +197,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 4; | |||
| vec_t *rowA = (vec_t *) & (AO[l << 1]); | |||
| vec_t *rowB = (vec_t *) & (BO[l]); | |||
| vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); | |||
| vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); | |||
| vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); | |||
| vec_t rowB_l = MERGE_LOW (rowB[0], vzero); | |||
| vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); | |||
| vec_t rowA_l = MERGE_LOW (rowA[0], vzero); | |||
| vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); | |||
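The pattern repeated through the rest of this kernel starts here: MERGE_HIGH/MERGE_LOW now pair each loaded vector with vzero rather than with the next vector in the packed buffer, so the odd-k tail iteration never reads a second vector that may not exist, and the zeroed companion lanes contribute nothing to the xvbf16ger2pp accumulate; this is the odd-input SBGEMM fix listed in the changelog. A portable model of the merge (lane order shown as on big-endian; the real layout is endianness-dependent):

#include <stdint.h>

/* Interleave the leading 16-bit lanes of a and b; with b all zero
 * (vzero), every odd output lane is zero. */
void merge_high16(const uint16_t a[8], const uint16_t b[8], uint16_t out[8])
{
    for (int i = 0; i < 4; i++) {
        out[2 * i + 0] = a[i];
        out[2 * i + 1] = b[i];   /* 0 when b is the zero vector */
    }
}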
| @@ -231,8 +249,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 4; | |||
| vec_t *rowA = (vec_t *) & (AO[l]); | |||
| vec_t *rowB = (vec_t *) & (BO[l]); | |||
| vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); | |||
| vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); | |||
| vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); | |||
| vec_t rowB_l = MERGE_LOW (rowB[0], vzero); | |||
| vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); | |||
| vec_t rowA_l = MERGE_LOW (rowA[0], vzero); | |||
| MMA (&acc0, rowB_h, rowA_h); | |||
| @@ -271,8 +289,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| vector short rowA = | |||
| { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; | |||
| vec_t *rowB = (vec_t *) & (BO[l << 1]); | |||
| MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); | |||
| MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); | |||
| MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); | |||
| MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC1 (&acc1, 0); | |||
| @@ -306,8 +324,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 2; | |||
| vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; | |||
| vec_t *rowB = (vec_t *) & (BO[(l << 2)]); | |||
| MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); | |||
| MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); | |||
| MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); | |||
| MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); | |||
| } | |||
| SAVE4x2_ACC (&acc0, 0); | |||
| SAVE4x2_ACC1 (&acc1, 0); | |||
| @@ -319,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| { | |||
| IFLOAT *BO = B; | |||
| v2sf_t *rowC; | |||
| v2sf_t result[8]; | |||
| v4sf_t result[4], res[4]; | |||
| __vector_quad acc0, acc1; | |||
| __builtin_mma_xxsetaccz (&acc0); | |||
| __builtin_mma_xxsetaccz (&acc1); | |||
| @@ -338,11 +356,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 1; | |||
| vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; | |||
| vec_t *rowB = (vec_t *) & (BO[(l << 3)]); | |||
| MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); | |||
| MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); | |||
| MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); | |||
| MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); | |||
| } | |||
| SAVE4x2_ACC (&acc0, 0); | |||
| SAVE4x2_ACC1 (&acc1, 0); | |||
| SAVE4x2_ACC_SCALAR (&acc0); | |||
| SAVE4x2_ACC1_SCALAR (&acc1); | |||
| CO += 1; | |||
| AO += k; | |||
| BO += (k << 3); | |||
| @@ -387,16 +405,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 3; | |||
| vec_t *rowA = (vec_t *) & (AO[(l << 2)]); | |||
| vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); | |||
| vec_t *rowB = (vec_t *) & (BO[l]); | |||
| vec_t rowB_mrg = MERGE_ROW (rowB[0]); | |||
| MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); | |||
| MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); | |||
| MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); | |||
| MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); | |||
| MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); | |||
| vector short rowB_mrg = | |||
| { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; | |||
| MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); | |||
| MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); | |||
| MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero)); | |||
| MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); | |||
| MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero)); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| @@ -436,12 +454,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| if (k > 1) | |||
| l = (k / 2) << 3; | |||
| vec_t *rowA = (vec_t *) & (AO[(l << 2)]); | |||
| vec_t *rowB = (vec_t *) & (BO[l]); | |||
| vec_t rowB_mrg = MERGE_ROW (rowB[0]); | |||
| MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); | |||
| vector short rowB_mrg = | |||
| { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; | |||
| MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| @@ -475,9 +493,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 3; | |||
| vec_t *rowA = (vec_t *) & (AO[l << 1]); | |||
| vec_t *rowB = (vec_t *) & (BO[l]); | |||
| vec_t rowB_mrg = MERGE_ROW (rowB[0]); | |||
| MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); | |||
| vector short rowB_mrg = | |||
| { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; | |||
| MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC (&acc1, 4); | |||
| @@ -505,8 +524,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 3; | |||
| vector short rowA = | |||
| { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; | |||
| vec_t *rowB = (vec_t *) & (BO[l]); | |||
| MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); | |||
| vector short rowB_mrg = | |||
| { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; | |||
| MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| CO += 4; | |||
| @@ -536,8 +556,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| if (k > 1) | |||
| l = (k / 2) << 2; | |||
| vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; | |||
| vec_t *rowB = (vec_t *) & (BO[l << 1]); | |||
| MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); | |||
| vector short rowB_mrg = | |||
| { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0, | |||
| BO[(l<<1) + 3], 0 | |||
| }; | |||
| MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); | |||
| } | |||
| SAVE4x2_ACC (&acc0, 0); | |||
| CO += 2; | |||
| @@ -548,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| { | |||
| IFLOAT *BO = B; | |||
| v2sf_t *rowC; | |||
| v2sf_t result[8]; | |||
| v4sf_t result[4], res[4]; | |||
| __vector_quad acc0; | |||
| BLASLONG l = 0; | |||
| __builtin_mma_xxsetaccz (&acc0); | |||
| @@ -566,10 +589,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| if (k > 1) | |||
| l = (k / 2) << 1; | |||
| vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; | |||
| vec_t *rowB = (vec_t *) & (BO[l << 2]); | |||
| MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); | |||
| vector short rowB_mrg = | |||
| { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l<<2) + 2], 0, | |||
| BO[(l<<2) + 3], 0 | |||
| }; | |||
| MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); | |||
| } | |||
| SAVE4x2_ACC (&acc0, 0); | |||
| SAVE4x2_ACC_SCALAR (&acc0); | |||
| AO += k; | |||
| BO += (k << 2); | |||
| CO += 1; | |||
| @@ -620,14 +646,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[l << 3]); | |||
| vec_t *rowA1 = (vec_t *) & (A1[l << 3]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); | |||
| MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); | |||
| MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); | |||
| MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); | |||
| MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); | |||
| MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); | |||
| MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); | |||
| MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero)); | |||
| MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero)); | |||
| MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero)); | |||
| MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero)); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 4); | |||
| @@ -669,10 +695,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 2; | |||
| vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[l << 3]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); | |||
| MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); | |||
| MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 4); | |||
| @@ -708,8 +734,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 2; | |||
| vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[(l << 2)]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 4); | |||
| @@ -740,8 +766,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| if (k > 1) | |||
| l = (k / 2) << 2; | |||
| vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[l << 1]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); | |||
| vector short rowA = | |||
| { AO[(l << 1)], 0, AO[(l << 1) + 1], 0, AO[(l << 1) + 2], | |||
| 0, AO[(l << 1) + 3], 0 }; | |||
| MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| CO += 4; | |||
| @@ -829,10 +857,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 1; | |||
| vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[(l << 4)]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); | |||
| MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); | |||
| MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); | |||
| MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); | |||
| MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); | |||
| } | |||
| rowC = (v4sf_t *) &CO[0]; | |||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||
| @@ -871,8 +899,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| l = (k / 2) << 1; | |||
| vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[(l << 3)]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); | |||
| MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); | |||
| } | |||
| rowC = (v4sf_t *) &CO[0]; | |||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||
| @@ -904,8 +932,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, | |||
| if (k > 1) | |||
| l = (k / 2) << 1; | |||
| vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; | |||
| vec_t *rowA = (vec_t *) & (AO[(l << 2)]); | |||
| MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); | |||
| vector short rowA = | |||
| { AO[(l << 2)], 0, AO[(l << 2) + 1], 0, | |||
| AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 }; | |||
| MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); | |||
| } | |||
| rowC = (v4sf_t *) &CO[0]; | |||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||
| @@ -155,6 +155,11 @@ | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER3 | |||
| #define PREFETCHSIZE_A 34 | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER4 | |||
| #define PREFETCHSIZE_A 34 | |||
| #define PREFETCHSIZE_C 16 | |||
| @@ -129,6 +129,11 @@ | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER3 | |||
| #define PREFETCHSIZE_A 34 | |||
| #define PREFETCHSIZE_C 16 | |||
| #endif | |||
| #ifdef POWER4 | |||
| #define PREFETCHSIZE_A 34 | |||
| #define PREFETCHSIZE_C 16 | |||
| @@ -54,6 +54,7 @@ SDOTKERNEL = ../riscv64/dot.c | |||
| DDOTKERNEL = ../riscv64/dot.c | |||
| CDOTKERNEL = ../riscv64/zdot.c | |||
| ZDOTKERNEL = ../riscv64/zdot.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| SNRM2KERNEL = ../riscv64/nrm2.c | |||
| DNRM2KERNEL = ../riscv64/nrm2.c | |||
| @@ -0,0 +1 @@ | |||
| clean :: | |||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_max; | |||
| FLOAT_V_T_M1 v_res, v_zero; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_zero = VFMVVF_FLOAT_M1(0, gvl); | |||
| MASK_T mask0, mask1; | |||
| FLOAT zero = 0.0; | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_max = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -83,6 +93,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -98,6 +109,7 @@ asm volatile( | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -106,6 +118,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -117,17 +130,17 @@ asm volatile( | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| j += gvl*2; | |||
| } | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -136,6 +149,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -144,14 +158,13 @@ asm volatile( | |||
| :"v0"); | |||
| #endif | |||
| v1 = VFMVVF_FLOAT(0, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] > maxf) | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -162,6 +175,7 @@ asm volatile( | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -170,6 +184,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -185,6 +200,7 @@ asm volatile( | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -193,6 +209,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -205,17 +222,17 @@ asm volatile( | |||
| j += gvl*2; | |||
| ix += inc_xv*2; | |||
| } | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); | |||
| maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -224,6 +241,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -232,10 +250,9 @@ asm volatile( | |||
| :"v0"); | |||
| #endif | |||
| v1 = VFMVVF_FLOAT(0, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] > maxf) | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| } | |||
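Every RISC-V hunk in this block follows one migration pattern: draft-0.7.1 style vsetvli(n, RVV_EFLOAT, RVV_M) calls become the sized VSETVL macros, reductions write into a separate LMUL=1 vector (v_res) instead of reducing in place, and each inline-asm block gains a "vsetvli zero, zero, e8, m1" before its vor.vv so the mask copy runs at LMUL=1 and touches v0 alone rather than an eight-register group. A condensed sketch of the migrated amax skeleton, using the same pre-ratification intrinsic names as the diff (current toolchains spell these differently, and the header name is an assumption); the masked absolute value is written with the vfrsub intrinsic here, where the kernel itself uses inline asm:

#include <riscv_vector.h>   /* header name assumed for the draft API */

float amax_sketch(float *x, long n)
{
    float maxf = 0.0f;
    unsigned int gvl = vsetvlmax_e32m1();
    vfloat32m1_t v_res  = vfmv_v_f_f32m1(0, gvl);   /* reduction destination */
    vfloat32m1_t v_zero = vfmv_v_f_f32m1(0, gvl);   /* reduction identity */
    for (long j = 0; j < n;) {
        gvl = vsetvl_e32m8(n - j);
        vfloat32m8_t v0 = vle_v_f32m8(&x[j], gvl);
        vbool4_t neg = vmflt_vf_f32m8_b4(v0, 0, gvl);   /* lanes below zero */
        v0 = vfrsub_vf_f32m8_m(neg, v0, v0, 0, gvl);    /* |v0|, mask-first args */
        v_res = vfredmax_vs_f32m8_f32m1(v_res, v0, v_zero, gvl);
        j += gvl;
    }
    maxf = v_res[0];   /* lane extraction as written in these kernels */
    return maxf;
}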
| @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT minf=FLT_MAX; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_min; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| MASK_T mask0, mask1; | |||
| FLOAT zero = 0.0; | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| @@ -75,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -83,6 +92,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -97,6 +107,7 @@ asm volatile( | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -105,6 +116,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -116,17 +128,17 @@ asm volatile( | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| j += gvl*2; | |||
| } | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -135,6 +147,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -142,14 +155,13 @@ asm volatile( | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] < minf) | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG idx = 0, inc_xv = inc_x * gvl; | |||
| @@ -160,6 +172,7 @@ asm volatile( | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -168,6 +181,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -182,6 +196,7 @@ asm volatile( | |||
| //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -190,6 +205,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -202,17 +218,17 @@ asm volatile( | |||
| j += gvl*2; | |||
| idx += inc_xv*2; | |||
| } | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e64,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -221,6 +237,7 @@ asm volatile( | |||
| :"v0"); | |||
| #else | |||
| asm volatile( | |||
| "vsetvli zero, zero, e8, m1\n\t" | |||
| "vor.vv v0, %1, %1\n\t" | |||
| "vsetvli x0, %3, e32,m8 \n\t" | |||
| "vfrsub.vf %0, %0, %2, v0.t \n\t" | |||
| @@ -228,10 +245,9 @@ asm volatile( | |||
| :"v"(mask0), "f"(zero), "r"(gvl) | |||
| :"v0"); | |||
| #endif | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] < minf) | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_zero, v_sum; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| MASK_T mask0, mask1; | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n/2){ | |||
| @@ -102,26 +110,26 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| inc_xv += inc_xv * 2; | |||
| } | |||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -28,27 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| @@ -65,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| if(beta == 0.0){ | |||
| if(alpha == 0.0){//alpha == 0 && beta == 0 | |||
| if(inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for(i=0,j=0;i<n/(gvl*2);i++){ | |||
| @@ -75,13 +73,13 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | |||
| @@ -94,7 +92,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
| j += gvl; | |||
| @@ -103,7 +101,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| }else{//alpha != 0 && beta == 0, y = ax | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| for(i=0,j=0;i<n/(2*gvl);i++){ | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| @@ -117,14 +115,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -141,14 +139,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -165,14 +163,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{//inc_x !=1 && inc_y != 1 | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| @@ -192,7 +190,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
| @@ -203,7 +201,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| }else{//beta != 0 | |||
| if(alpha == 0.0){//alpha == 0 && beta != 0; y = by | |||
| if(inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| for(i=0,j=0;i<n/(2*gvl);i++){ | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -217,14 +215,14 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| vy0 = VFMULVF_FLOAT(vy0, beta, gvl); | |||
| VSEV_FLOAT(&y[j], vy0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -241,7 +239,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| vy0 = VFMULVF_FLOAT(vy0, beta, gvl); | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); | |||
| @@ -251,7 +249,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| }else{//alpha != 0 && beta != 0; y = ax + by | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| for(i=0,j=0;i<n/(2*gvl);i++){ | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| @@ -269,7 +267,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -278,7 +276,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| j += gvl; | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -299,7 +297,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -308,7 +306,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| j += gvl; | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -329,7 +327,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| @@ -338,7 +336,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| j += gvl; | |||
| } | |||
| }else{//inc_x != 1 && inc_y != 1 | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| @@ -362,7 +360,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vx0 = VFMULVF_FLOAT(vx0, alpha, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
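The axpby kernel keeps one specialization per (alpha == 0, beta == 0) combination times stride pattern, which is why the same one-line `VSETVL` substitution repeats so many times above. Every branch computes the same y := alpha*x + beta*y; a scalar sketch (the `axpby_ref` name and positive increments are assumptions of the sketch):

    #include <stddef.h>

    /* y := alpha*x + beta*y. The branches above exist so that beta == 0
     * never reads y (it stores alpha*x, or plain zeros when alpha == 0
     * as well) and alpha == 0 never reads x: fewer loads, and no NaN
     * leaking out of an uninitialized y when beta == 0. */
    static void axpby_ref(size_t n, double alpha, const double *x, size_t inc_x,
                          double beta, double *y, size_t inc_y)
    {
        for (size_t i = 0; i < n; i++)
            y[i * inc_y] = alpha * x[i * inc_x] + beta * y[i * inc_y];
    }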
| @@ -28,23 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| @@ -60,7 +58,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if (inc_x == 1 && inc_y == 1) { | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if (gvl <= n/2) { | |||
| for (i = 0, j=0; i < n/(2*gvl); i++, j+=2*gvl) { | |||
| @@ -77,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| //tail | |||
| for (; j < n; ) { | |||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n - j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | |||
| @@ -87,7 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| }else if (inc_y == 1) { | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/(2*gvl); i++){ | |||
| @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| for (; j<n; ) { | |||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n - j); | |||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | |||
| @@ -115,7 +113,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| }else if(inc_x == 1){ | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| for(i=0,j=0; i<n/(2*gvl); i++){ | |||
| @@ -134,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| for (; j<n; ) { | |||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n - j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | |||
| @@ -144,7 +142,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| }else{ | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -165,7 +163,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| for (; j<n; ) { | |||
| gvl = vsetvli(n - j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n - j); | |||
| vx0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, da, vx0, gvl); | |||
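The axpy hunks are purely mechanical renames: each `vsetvli` call becomes `VSETVL`, while the unrolled 2*gvl main loops and the `vfmacc` accumulation are untouched. For reference, the operation itself (sketch only; `axpy_ref` is an illustrative name):

    #include <stddef.h>

    /* y := y + da*x, one vfmacc.vf per loaded vector in the kernel above. */
    static void axpy_ref(size_t n, double da, const double *x, size_t inc_x,
                         double *y, size_t inc_y)
    {
        for (size_t i = 0; i < n; i++)
            y[i * inc_y] += da * x[i * inc_x];
    }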
| @@ -26,21 +26,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VSEV_FLOAT vsev_float32xm8 | |||
| #define VSSEV_FLOAT vssev_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VSEV_FLOAT vse_v_f32m8 | |||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VSEV_FLOAT vsev_float64xm8 | |||
| #define VSSEV_FLOAT vssev_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VSEV_FLOAT vse_v_f64m8 | |||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| @@ -56,7 +54,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| memcpy(&y[0], &x[0], n*sizeof(FLOAT)); | |||
| }else if (inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/4){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -77,13 +75,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| VSEV_FLOAT(&y[j], v0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/4){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -104,14 +102,14 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/4){ | |||
| @@ -136,7 +134,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v0, gvl); | |||
| j += gvl; | |||
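The copy kernel again only swaps the macro layer; note that the `vlse`/`vsse` strided accesses take their stride in bytes, which is why the `inc * sizeof(FLOAT)` computations stay unchanged. Semantically (a sketch, not the shipped code):

    #include <stddef.h>
    #include <string.h>

    /* y := x. Unit strides fall back to memcpy, exactly as the kernel
     * above does; everything else is a strided element-by-element copy. */
    static void copy_ref(size_t n, const double *x, size_t inc_x,
                         double *y, size_t inc_y)
    {
        if (inc_x == 1 && inc_y == 1) {
            memcpy(y, x, n * sizeof(*y));
            return;
        }
        for (size_t i = 0; i < n; i++)
            y[i * inc_y] = x[i * inc_x];
    }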
| @@ -27,25 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #endif | |||
| #if defined(DSDOT) | |||
| @@ -61,8 +65,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT_V_T vr, vx, vy; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| @@ -71,23 +80,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -97,23 +105,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_y = inc_y * sizeof(FLOAT); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -123,23 +130,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int stride_y = inc_y * sizeof(FLOAT); | |||
| @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| j += gvl; | |||
| } | |||
| if(j > 0){ | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vx, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
| FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(vx, vy, gvl); | |||
| vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); | |||
| vx = VFREDSUM_FLOAT(vr, vz, gvl); | |||
| dot += vx[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| dot += v_res[0]; | |||
| } | |||
| } | |||
| return(dot); | |||
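The dot kernel shows the reduction rewrite most clearly: the old code built a zero scratch vector (`vx = VFMVVF_FLOAT(0, gvl)`) before every `VFREDSUM_FLOAT`, while the new code allocates `v_res`/`v_z0` once, outside all the stride branches, at `VSETVL_MAX`. What is being computed, as a scalar sketch (`dot_ref` is an illustrative name):

    #include <stddef.h>

    /* dot := sum of x[i]*y[i]. The kernel accumulates vr += vx*vy with
     * vfmacc.vv, reduces vr into v_res (seeded by v_z0), and repeats the
     * same reduce for the sub-gvl tail. Under DSDOT the `dot` accumulator
     * stays double even for single-precision inputs. */
    static double dot_ref(size_t n, const double *x, size_t inc_x,
                          const double *y, size_t inc_y)
    {
        double dot = 0.0;
        for (size_t i = 0; i < n; i++)
            dot += x[i * inc_x] * y[i * inc_y];
        return dot;
    }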
| @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| FLOAT_V_T va0, va1, vy0, vy1; | |||
| unsigned int gvl = 0; | |||
| if(inc_y == 1){ | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| a_ptr = a; | |||
| @@ -81,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| //tail | |||
| for(;j < m;){ | |||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -98,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| }else{ | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| if(gvl <= m/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| for(k=0,j=0; k<m/(2*gvl); k++){ | |||
| @@ -124,7 +122,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| //tail | |||
| for(;j < m;){ | |||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-j); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); | |||
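gemv_n needs no reduction at all, so these hunks are once more just the `VSETVL` rename. The kernel keeps a strip of y resident in vector registers and issues one `vfmacc` per matrix column; in scalar form (a sketch assuming column-major A with leading dimension lda and positive increments):

    #include <stddef.h>

    /* y := alpha*A*x + y, vectorized over the rows of y (the m dimension). */
    static void gemv_n_ref(size_t m, size_t n, double alpha,
                           const double *a, size_t lda,
                           const double *x, size_t inc_x,
                           double *y, size_t inc_y)
    {
        for (size_t i = 0; i < n; i++)          /* one column at a time */
            for (size_t j = 0; j < m; j++)      /* the vectorized strip */
                y[j * inc_y] += alpha * a[i * lda + j] * x[i * inc_x];
    }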
| @@ -27,41 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT *a_ptr = a; | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT temp; | |||
| FLOAT_V_T va, vr, vx; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| if(inc_x == 1){ | |||
| for(i = 0; i < n; i++){ | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| j = 0; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < m/gvl; k++){ | |||
| @@ -70,29 +79,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| vr = VFMACCVV_FLOAT(vr, va, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp = v_res[0]; | |||
| if(j < m){ | |||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-j); | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp += v_res[0]; | |||
| } | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i = 0; i < n; i++){ | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| j = 0; | |||
| ix = 0; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| @@ -103,18 +109,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp = v_res[0]; | |||
| if(j < m){ | |||
| gvl = vsetvli(m-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-j); | |||
| va = VLEV_FLOAT(&a_ptr[j], gvl); | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(va, vx, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp += v_res[0]; | |||
| } | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
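gemv_t is one dot product per matrix column, so it inherits both conversions at once: the per-column zero vector before each `VFREDSUM_FLOAT` is replaced by the shared `v_res`/`v_z0` pair, and the redundant `vsetvli` ahead of the strided loop is dropped. The scalar shape (sketch; `gemv_t_ref` is an illustrative name):

    #include <stddef.h>

    /* y := alpha*A^T*x + y: a reduction (temp) per column of A, exactly
     * the role v_res plays above after each vfmacc.vv accumulation. */
    static void gemv_t_ref(size_t m, size_t n, double alpha,
                           const double *a, size_t lda,
                           const double *x, size_t inc_x,
                           double *y, size_t inc_y)
    {
        for (size_t i = 0; i < n; i++) {
            double temp = 0.0;
            for (size_t j = 0; j < m; j++)
                temp += a[i * lda + j] * x[j * inc_x];
            y[i * inc_y] += alpha * temp;
        }
    }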
| @@ -31,49 +31,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||
| #define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 | |||
| #define VMFIRSTM vmfirstm_e64xm8 | |||
| #define UINT_V_T uint64xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||
| #define VIDV_UINT vidv_uint64xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #else | |||
| #define ABS fabsf | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||
| #define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 | |||
| #define VMFIRSTM vmfirstm_e32xm8 | |||
| #define UINT_V_T uint32xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||
| #define VIDV_UINT vidv_uint32xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #endif | |||
| @@ -88,42 +92,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-1, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| FLOAT cur_maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x; | |||
| @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFMVVF_FLOAT(0, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| FLOAT cur_maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
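iamax layers the index-tracking machinery on top of the same conversions: the masked `vid`/`vadd` index updates move the mask to the first argument, `vfredmax` reduces into `v_res` (seeded from the zero register `v_z0`, which is safe here because |x| >= 0), and `vmfge` plus `vmfirst` recover the first lane holding the maximum. A scalar model of the result (sketch; 0-based index as the kernel returns it, `iamax_ref` illustrative):

    #include <math.h>
    #include <stddef.h>

    /* Index of the first element with the largest |x[i]|. The strict '>'
     * matches the kernel's tie-breaking: vmfirst picks the earliest lane
     * that attains the reduced maximum. The -1 seed mirrors
     * VFMVVF_FLOAT(-1, gvl), valid because |x[i]| >= 0. */
    static size_t iamax_ref(size_t n, const double *x, size_t inc_x)
    {
        size_t idx = 0;
        double maxf = -1.0;
        for (size_t i = 0; i < n; i++) {
            double v = fabs(x[i * inc_x]);
            if (v > maxf) { maxf = v; idx = i; }
        }
        return idx;
    }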
| @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||
| #define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 | |||
| #define VMFIRSTM vmfirstm_e64xm8 | |||
| #define UINT_V_T uint64xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||
| #define VIDV_UINT vidv_uint64xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #else | |||
| #define ABS fabsf | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||
| #define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 | |||
| #define VMFIRSTM vmfirstm_e32xm8 | |||
| #define UINT_V_T uint32xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||
| #define VIDV_UINT vidv_uint32xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #endif | |||
| @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| FLOAT cur_minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x; | |||
| @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(vx, 0, gvl); | |||
| v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); | |||
| v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| FLOAT cur_minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
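iamin mirrors iamax with min/le in place of max/ge; the one asymmetry is the reduction seed, which must be a large value rather than zero, so the second m1 register (`v_max` here) is initialized to FLT_MAX. Note the kernel uses FLT_MAX for the double variant as well. Scalar model (sketch; `iamin_ref` illustrative):

    #include <float.h>
    #include <math.h>
    #include <stddef.h>

    /* Index of the first element with the smallest |x[i]|. */
    static size_t iamin_ref(size_t n, const double *x, size_t inc_x)
    {
        size_t idx = 0;
        double minf = DBL_MAX;   /* the kernel seeds with FLT_MAX for both types */
        for (size_t i = 0; i < n; i++) {
            double v = fabs(x[i * inc_x]);
            if (v < minf) { minf = v; idx = i; }
        }
        return idx;
    }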
| @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||
| #define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 | |||
| #define VMFIRSTM vmfirstm_e64xm8 | |||
| #define UINT_V_T uint64xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||
| #define VIDV_UINT vidv_uint64xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #else | |||
| #define ABS fabsf | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||
| #define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 | |||
| #define VMFIRSTM vmfirstm_e32xm8 | |||
| #define UINT_V_T uint32xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||
| #define VIDV_UINT vidv_uint32xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #endif | |||
| @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_min; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v_max = VLEV_FLOAT(&x[j], gvl); | |||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| FLOAT cur_maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
| @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x; | |||
| @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| //index where element greater than v_max | |||
| mask = VMFLTVV_FLOAT(v_max, vx, gvl); | |||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| mask = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| vx = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); | |||
| FLOAT cur_maxf = vx[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
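imax drops the absolute-value masking (there is no `VFRSUBVF_MASK_FLOAT` in this file) and therefore needs a reduction seed of -FLT_MAX instead of zero; that is what the new `v_min` m1 register holds. Scalar model (sketch; `imax_ref` illustrative):

    #include <float.h>
    #include <stddef.h>

    /* Index of the first largest element, compared without fabs. */
    static size_t imax_ref(size_t n, const double *x, size_t inc_x)
    {
        size_t idx = 0;
        double maxf = -DBL_MAX;
        for (size_t i = 0; i < n; i++)
            if (x[i * inc_x] > maxf) { maxf = x[i * inc_x]; idx = i; }
        return idx;
    }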
| @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| while(i < n) | |||
| { | |||
| if( x[ix] > minf ) | |||
| if( x[ix] < minf ) | |||
| { | |||
| min = i; | |||
| minf = x[ix]; | |||
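This one-character hunk is the whole fix for the scalar IMIN kernel: a minimum search that updates on `>` tracks the maximum instead. With the flipped comparison the loop behaves as in this self-contained sketch (the `imin_ref` name and the n >= 1, inc_x > 0 assumptions are mine):

    #include <stddef.h>

    /* Corrected scalar IMIN loop: with '<' the running minf/min pair now
     * follows the smallest element rather than the largest. */
    static size_t imin_ref(size_t n, const double *x, size_t inc_x)
    {
        size_t i = 1, ix = inc_x, min = 0;
        double minf = x[0];
        while (i < n) {
            if (x[ix] < minf) {
                min = i;
                minf = x[ix];
            }
            ix += inc_x;
            i++;
        }
        return min;
    }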
| @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||
| #define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 | |||
| #define VMFIRSTM vmfirstm_e64xm8 | |||
| #define UINT_V_T uint64xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||
| #define VIDV_UINT vidv_uint64xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #else | |||
| #define ABS fabsf | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||
| #define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 | |||
| #define VMFIRSTM vmfirstm_e32xm8 | |||
| #define UINT_V_T uint32xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||
| #define VIDV_UINT vidv_uint32xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #endif | |||
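Editor's note: the macro table above captures the two recurring changes in this merge. The RVV intrinsics are renamed from the old `<op>_float64xm8`-style spellings to the `vle_v_f64m8` / `vfredmin_vs_f64m8_f64m1` convention, and reductions now deliver their result in an LMUL=1 register (`FLOAT_V_T_M1`), seeded once through `VSETVL_MAX` and `VFMVVF_FLOAT_M1` instead of re-splatting a full-width identity vector before every reduction. A portable model of the new reduction semantics (plain arrays stand in for vector registers; this is a sketch, not the intrinsic itself):

```c
#include <stddef.h>

/* Model of v_res = vfredmin_vs_f64m8_f64m1(v_res, v_min, v_max, vl):
 * res[0] = min(init[0], min of src[0..vl-1]); only element 0 is defined. */
static void vfredmin_vs_model(double *res, const double *src,
                              const double *init, size_t vl)
{
    double acc = init[0];              /* the m1 operand seeds the reduction */
    for (size_t k = 0; k < vl; k++)
        if (src[k] < acc)
            acc = src[k];
    res[0] = acc;                      /* result lands in lane 0 of the m1 register */
}
```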
| @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -113,26 +122,24 @@ asm volatile( | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| } | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v_min = VLEV_FLOAT(&x[j], gvl); | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| FLOAT cur_minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
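Editor's note: argument reshuffles such as `VIDV_MASK_UINT(v_min_index, mask, gvl)` becoming `VIDV_MASK_UINT(mask, v_min_index, gvl)` follow the newer intrinsic convention of passing the mask first and the merge ("maskedoff") operand second; the computation itself is unchanged. A portable model of the combined vid/vadd index update (assumption: merge semantics, i.e. inactive lanes keep their previous value):

```c
#include <stddef.h>
#include <stdbool.h>

/* Model of:  v_min_index = vid_v_..._m(mask, v_min_index, vl);
 *            v_min_index = vadd_vx_..._m(mask, v_min_index, v_min_index, j, vl);
 * Lanes where a new minimum was found receive their global element index. */
static void masked_index_update(unsigned long *idx, const bool *mask,
                                unsigned long j, size_t vl)
{
    for (size_t k = 0; k < vl; k++)
        if (mask[k])
            idx[k] = (unsigned long)k + j;   /* vid gives k, the add appends j */
}
```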
| @@ -143,7 +150,7 @@ asm volatile( | |||
| } | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| unsigned int idx = 0, inc_v = gvl * inc_x; | |||
| @@ -154,7 +161,7 @@ asm volatile( | |||
| //index where element less than v_min | |||
| mask = VMFLTVV_FLOAT(vx, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -175,27 +182,25 @@ asm volatile( | |||
| #endif | |||
| */ | |||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx, gvl); | |||
| j += gvl; | |||
| idx += inc_v; | |||
| } | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| mask = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| vx = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx = VFREDMINVS_FLOAT(v_min, vx, gvl); | |||
| FLOAT cur_minf = vx[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
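Editor's note: the `inc_x != 1` path uses the strided load `VLSEV_FLOAT` with a byte stride of `inc_x * sizeof(FLOAT)`. A portable model of that load (a sketch; names are illustrative):

```c
#include <stddef.h>

/* Model of v = vlse_v_f64m8(base, stride_bytes, vl): gather vl elements
 * spaced stride_bytes apart in memory into consecutive lanes. */
static void vlse_model(double *v, const double *base,
                       long stride_bytes, size_t vl)
{
    const char *p = (const char *)base;
    for (size_t k = 0; k < vl; k++)
        v[k] = *(const double *)(p + (long)k * stride_bytes);
}
```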
| @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||
| #define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 | |||
| #define VMFIRSTM vmfirstm_e64xm8 | |||
| #define UINT_V_T uint64xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||
| #define VIDV_UINT vidv_uint64xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #else | |||
| #define ABS fabsf | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||
| #define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 | |||
| #define VMFIRSTM vmfirstm_e32xm8 | |||
| #define UINT_V_T uint32xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||
| #define VIDV_UINT vidv_uint32xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #endif | |||
| #define RVV_M RVV_M8 | |||
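Editor's note: this file is IZAMAX. For complex vectors, BLAS i*amax ranks elements by |Re| + |Im| rather than the true modulus, which is why the kernel loads real and imaginary parts separately (note the doubled stride, `inc_x * 2 * sizeof(FLOAT)`), takes absolute values, and adds them. A scalar reference (illustrative name, 0-based result):

```c
#include <math.h>

/* Scalar reference for izamax: index of the largest |Re| + |Im|.
 * x holds interleaved (re, im) pairs; inc_x counts complex elements. */
static long izamax_ref(long n, const double *x, long inc_x)
{
    long i, ix = 0, best = 0;
    double maxf = -1.0;        /* matches v_max = VFMVVF_FLOAT(-1, gvl) above */
    for (i = 0; i < n; i++) {
        double a = fabs(x[ix]) + fabs(x[ix + 1]);
        if (a > maxf) {
            maxf = a;
            best = i;
        }
        ix += inc_x * 2;
    }
    return best;
}
```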
| @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| UINT_V_T v_max_index; | |||
| MASK_T mask0, mask1; | |||
| unsigned int gvl = 0; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| gvl = VSETVL(n); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| v_max = VFMVVF_FLOAT(-1, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -119,7 +130,7 @@ asm volatile( | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -143,7 +154,7 @@ asm volatile( | |||
| //index where element greater than v_max | |||
| mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); | |||
| v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); | |||
| v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -163,7 +174,7 @@ asm volatile( | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); | |||
| v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); | |||
| //update v_max and start_index j | |||
| v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); | |||
| @@ -171,19 +182,19 @@ asm volatile( | |||
| ix += inc_xv; | |||
| } | |||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); | |||
| maxf = vx0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); | |||
| max_index = VMFIRSTM(mask0,gvl); | |||
| max_index = v_max_index[max_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v_max_index = VMVVX_UINT(0, gvl); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -206,7 +217,7 @@ asm volatile( | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -227,9 +238,8 @@ asm volatile( | |||
| #endif | |||
| */ | |||
| v_max = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); | |||
| FLOAT cur_maxf = vx0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| FLOAT cur_maxf = v_res[0]; | |||
| if(cur_maxf > maxf){ | |||
| //tail index | |||
| v_max_index = VIDV_UINT(gvl); | |||
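Editor's note: the `//fabs(vector)` steps implement a per-lane absolute value without a dedicated intrinsic: build a mask of negative lanes with `vmflt.vf`, then use the masked reverse-subtract `vfrsub.vf` with scalar 0 so exactly those lanes become 0 - x. Portable model of the pair:

```c
#include <stddef.h>

/* Model of:  mask = vmflt_vf(v, 0, vl);
 *            v    = vfrsub_vf_m(mask, v, v, 0, vl);
 * Negative lanes are replaced by (0 - lane); the rest pass through. */
static void fabs_by_mask(double *v, size_t vl)
{
    for (size_t k = 0; k < vl; k++)
        if (v[k] < 0.0)
            v[k] = 0.0 - v[k];
}
```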
| @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||
| #define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 | |||
| #define VMFIRSTM vmfirstm_e64xm8 | |||
| #define UINT_V_T uint64xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint64xm8 | |||
| #define VIDV_UINT vidv_uint64xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 | |||
| #define VADDVX_UINT vaddvx_uint64xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||
| #define VMVVX_UINT vmvvx_uint64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define UINT_V_T vuint64m8_t | |||
| #define VIDV_MASK_UINT vid_v_u64m8_m | |||
| #define VIDV_UINT vid_v_u64m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u64m8_m | |||
| #define VADDVX_UINT vadd_vx_u64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #define VMVVX_UINT vmv_v_x_u64m8 | |||
| #else | |||
| #define ABS fabsf | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||
| #define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 | |||
| #define VMFIRSTM vmfirstm_e32xm8 | |||
| #define UINT_V_T uint32xm8_t | |||
| #define VIDV_MASK_UINT vidv_mask_uint32xm8 | |||
| #define VIDV_UINT vidv_uint32xm8 | |||
| #define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 | |||
| #define VADDVX_UINT vaddvx_uint32xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||
| #define VMVVX_UINT vmvvx_uint32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 | |||
| #define VMFIRSTM vmfirst_m_b4 | |||
| #define UINT_V_T vuint32m8_t | |||
| #define VIDV_MASK_UINT vid_v_u32m8_m | |||
| #define VIDV_UINT vid_v_u32m8 | |||
| #define VADDVX_MASK_UINT vadd_vx_u32m8_m | |||
| #define VADDVX_UINT vadd_vx_u32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #define VMVVX_UINT vmv_v_x_u32m8 | |||
| #endif | |||
| #define RVV_M RVV_M8 | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| UINT_V_T v_min_index; | |||
| MASK_T mask0, mask1; | |||
| unsigned int gvl = 0; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| gvl = VSETVL(n); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -120,7 +130,7 @@ asm volatile( | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -144,7 +154,7 @@ asm volatile( | |||
| //index where element less than v_min | |||
| mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); | |||
| v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); | |||
| v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -164,27 +174,26 @@ asm volatile( | |||
| :"v0"); | |||
| #endif | |||
| */ | |||
| v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); | |||
| v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); | |||
| //update v_min and start_index j | |||
| v_min = VFMINVV_FLOAT(v_min, vx0, gvl); | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); | |||
| minf = vx0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); | |||
| min_index = VMFIRSTM(mask0,gvl); | |||
| min_index = v_min_index[min_index]; | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v_min_index = VMVVX_UINT(0, gvl); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); | |||
| vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -207,7 +216,7 @@ asm volatile( | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); | |||
| vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); | |||
| /* | |||
| #if defined(DOUBLE) | |||
| asm volatile( | |||
| @@ -228,9 +237,8 @@ asm volatile( | |||
| #endif | |||
| */ | |||
| v_min = VFADDVV_FLOAT(vx0, vx1, gvl); | |||
| vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); | |||
| FLOAT cur_minf = vx0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| FLOAT cur_minf = v_res[0]; | |||
| if(cur_minf < minf){ | |||
| //tail index | |||
| v_min_index = VIDV_UINT(gvl); | |||
| @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT maxf=-FLT_MAX; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_max; | |||
| FLOAT_V_T_M1 v_res, v_min; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| @@ -68,21 +76,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_max = VFMAXVV_FLOAT(v_max, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] > maxf) | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| idx += inc_xv * 2; | |||
| } | |||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); | |||
| maxf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); | |||
| v0 = VFREDMAXVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] > maxf) | |||
| maxf = v0[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| } | |||
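Editor's note: all of these kernels share the same strip-mined control flow, an unrolled main loop at full vector length followed by a tail that re-requests `VSETVL(n - j)` so the final iterations shrink to however many elements remain. A sketch of that shape for the MAX kernel (VLMAX is a placeholder for whatever `VSETVL(n)` returns on the target):

```c
#include <float.h>

#define VLMAX 8   /* placeholder for the hardware vector length */

/* Strip-mining skeleton: process n elements at most VLMAX at a time. */
static double max_strip_mined(long n, const double *x)
{
    double maxf = -DBL_MAX;            /* identity element, as in the kernel */
    long j = 0;
    while (j < n) {
        long vl = (n - j) < VLMAX ? (n - j) : VLMAX;  /* stand-in for VSETVL(n-j) */
        for (long k = 0; k < vl; k++)  /* stand-in for vle + vfmax/vfredmax */
            if (x[j + k] > maxf)
                maxf = x[j + k];
        j += vl;
    }
    return maxf;
}
```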
| @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <math.h> | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT minf=FLT_MAX; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_min; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| for(i=0,j=0; i<n/(gvl*2); i++){ | |||
| @@ -68,21 +76,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v_min = VFMINVV_FLOAT(v_min, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] < minf) | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl * 2; | |||
| idx += inc_xv * 2; | |||
| } | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v_min, v1, gvl); | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| v1 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v0 = VFREDMINVS_FLOAT(v0, v1, gvl); | |||
| if(v0[0] < minf) | |||
| minf = v0[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| j += gvl; | |||
| } | |||
| } | |||
| @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define ABS fabsf | |||
| #define MASK_T e32xm4_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 | |||
| #define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 | |||
| #define VMFIRSTM vmfirstm_e32xm4 | |||
| #define VFDIVVF_FLOAT vfdivvf_float32xm4 | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 | |||
| #define MASK_T vbool8_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||
| #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define ABS fabs | |||
| #define MASK_T e64xm4_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 | |||
| #define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 | |||
| #define VMFIRSTM vmfirstm_e64xm4 | |||
| #define VFDIVVF_FLOAT vfdivvf_float64xm4 | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 | |||
| #define MASK_T vbool16_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||
| #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||
| #define VMFIRSTM vmfirst_m_b16 | |||
| #define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -73,18 +77,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT_V_T vr, v0, v_zero; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT scale = 0.0, ssq = 0.0; | |||
| MASK_T mask; | |||
| BLASLONG index = 0; | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -95,15 +104,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -111,17 +120,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -130,21 +139,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| }else{//found greater element | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| @@ -153,7 +162,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -164,15 +173,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -181,17 +190,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_v; | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -200,18 +209,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| }else{//found greater element | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| } | |||
| } | |||
| return(scale * sqrt(ssq)); | |||
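Editor's note: this kernel is the overflow-safe 2-norm. It tracks a running `scale` (the largest magnitude seen so far) and `ssq`, the sum of squares of elements divided by that scale, rescales `ssq` by (scale_old/scale_new)^2 whenever a larger element appears, and returns `scale * sqrt(ssq)`; this is the classic xLASSQ-style recurrence. (One detail worth flagging: in the last hunk `scale = vr[0];` still reads `vr`, while the parallel hunks switched to `v_res[0]`; the diff alone does not say whether that is intentional.) A scalar reference of the recurrence (illustrative name):

```c
#include <math.h>

/* Scalar reference for the scale/ssq recurrence used above. */
static double nrm2_ref(long n, const double *x, long inc_x)
{
    double scale = 0.0, ssq = 0.0;
    long i, ix = 0;
    for (i = 0; i < n; i++, ix += inc_x) {
        double a = fabs(x[ix]);
        if (a > scale) {               /* new largest element: rescale old sum */
            double r = scale / a;
            ssq = ssq * r * r + 1.0;   /* the new element contributes (a/a)^2 = 1 */
            scale = a;
        } else if (a > 0.0) {
            double r = a / scale;
            ssq += r * r;
        }
    }
    return scale * sqrt(ssq);
}
```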
| @@ -27,26 +27,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm8 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m8_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m8 | |||
| #define ABS fabsf | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm8 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m8_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m8 | |||
| #define ABS fabs | |||
| #endif | |||
| @@ -60,8 +64,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT_V_T vr, v0, v1; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl < n/2){ | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n/(2*gvl); i++){ | |||
| @@ -73,25 +82,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| vr = VFMACCVV_FLOAT(vr, v1, v1, gvl); | |||
| j += gvl; | |||
| } | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| v0 = VFREDSUM_FLOAT(vr, v0, gvl); | |||
| len += v0[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| } | |||
| //tail | |||
| for(;j < n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //v1 = 0 | |||
| v1 = VFMVVF_FLOAT(0, gvl); | |||
| //v1 = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(v0, v0, gvl); | |||
| vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | |||
| v0 = VFREDSUM_FLOAT(vr, v1, gvl); | |||
| len += v0[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl < n/2){ | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| @@ -104,20 +112,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| vr = VFMACCVV_FLOAT(vr, v1, v1, gvl); | |||
| j += gvl; | |||
| } | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| v0 = VFREDSUM_FLOAT(vr, v0, gvl); | |||
| len += v0[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| } | |||
| //tail | |||
| for(;j < n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| //v1 = 0 | |||
| v1 = VFMVVF_FLOAT(0, gvl); | |||
| //v1 = VFMVVF_FLOAT(0, gvl); | |||
| //vr = VFDOTVV_FLOAT(v0, v0, gvl); | |||
| vr = VFMACCVV_FLOAT(v1, v0, v0, gvl); | |||
| v0 = VFREDSUM_FLOAT(vr, v1, gvl); | |||
| len += v0[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| len += v_res[0]; | |||
| j += gvl; | |||
| } | |||
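Editor's note: this second NRM2 variant skips the scaling machinery. It accumulates the raw sum of squares with fused multiply-add (`vr = vfmacc(vr, v, v)`), reduces once per block, and presumably takes the square root at the end of the function (not shown in these hunks). Scalar equivalent:

```c
#include <math.h>

/* Scalar reference for the unscaled variant: plain sum of squares. */
static double nrm2_plain_ref(long n, const double *x, long inc_x)
{
    double len = 0.0;
    long i, ix = 0;
    for (i = 0; i < n; i++, ix += inc_x)
        len += x[ix] * x[ix];
    return sqrt(len);
}
```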
| @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||
| #define VFMSACVF_FLOAT vfmsacvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||
| #define VFMSACVF_FLOAT vfmsacvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| @@ -61,7 +61,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| FLOAT_V_T v0, v1, vx, vy; | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -77,7 +77,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| j += gvl; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -90,7 +90,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| VSEV_FLOAT(&y[j], v1, gvl); | |||
| } | |||
| }else if(inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -109,7 +109,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| ix += inc_xv; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); | |||
| vy = VLEV_FLOAT(&y[j], gvl); | |||
| @@ -122,7 +122,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| VSEV_FLOAT(&y[j], v1, gvl); | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -141,7 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| iy += inc_yv; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLEV_FLOAT(&x[j], gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl); | |||
| @@ -154,7 +154,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl); | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -176,7 +176,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| iy += inc_yv; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx = VLSEV_FLOAT(&x[j*inc_x],stride_x, gvl); | |||
| vy = VLSEV_FLOAT(&y[j*inc_y],stride_y, gvl); | |||
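Editor's note: this file is ROT, the plane (Givens) rotation. Every element pair is replaced by x' = c*x + s*y and y' = c*y - s*x, which is what the vfmul/vfmacc/vfmsac sequences compute lane-wise. Scalar reference:

```c
/* Scalar reference for ROT: apply the rotation [c s; -s c] to (x, y). */
static void rot_ref(long n, double *x, long inc_x,
                    double *y, long inc_y, double c, double s)
{
    long i, ix = 0, iy = 0;
    for (i = 0; i < n; i++) {
        double xi = x[ix], yi = y[iy];
        x[ix] = c * xi + s * yi;
        y[iy] = c * yi - s * xi;
        ix += inc_x;
        iy += inc_y;
    }
}
```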
| @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VSEV_FLOAT vsev_float32xm8 | |||
| #define VSSEV_FLOAT vssev_float32xm8 | |||
| #define VFMULVF_FLOAT vfmulvf_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VSEV_FLOAT vse_v_f32m8 | |||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VSEV_FLOAT vsev_float64xm8 | |||
| #define VSSEV_FLOAT vssev_float64xm8 | |||
| #define VFMULVF_FLOAT vfmulvf_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VSEV_FLOAT vse_v_f64m8 | |||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| @@ -61,7 +61,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if(da == 0.0){ | |||
| memset(&x[0], 0, n * sizeof(FLOAT)); | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n / 2){ | |||
| for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| //tail | |||
| for(; j <n; ){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| v0 = VFMULVF_FLOAT(v0, da, gvl); | |||
| VSEV_FLOAT(&x[j], v0, gvl); | |||
| @@ -84,7 +84,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| }else{ | |||
| if(da == 0.0){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n / 2){ | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ | |||
| @@ -94,13 +94,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| //tail | |||
| for(; j <n; ){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| VSEV_FLOAT(&x[j], v0, gvl); | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG ix = 0; | |||
| if(gvl < n / 2){ | |||
| @@ -118,7 +118,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| //tail | |||
| for(; j <n; ){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v0 = VFMULVF_FLOAT(v0, da, gvl); | |||
| VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); | |||
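Editor's note: SCAL scales x in place by `da`, with two fast paths visible above: contiguous vectors with `da == 0` are simply `memset` to zero, and the strided `da == 0` case stores a zero splat. Scalar equivalent of the whole kernel:

```c
#include <string.h>

/* Scalar reference for SCAL: x := da * x, zero-filling when da == 0. */
static void scal_ref(long n, double da, double *x, long inc_x)
{
    long i, ix = 0;
    if (da == 0.0 && inc_x == 1) {
        memset(x, 0, (size_t)n * sizeof(double));   /* contiguous fast path */
        return;
    }
    for (i = 0; i < n; i++, ix += inc_x)
        x[ix] = (da == 0.0) ? 0.0 : da * x[ix];
}
```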
| @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <stdio.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VSEV_FLOAT vsev_float32xm8 | |||
| #define VSSEV_FLOAT vssev_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VSEV_FLOAT vse_v_f32m8 | |||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VSEV_FLOAT vsev_float64xm8 | |||
| #define VSSEV_FLOAT vssev_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VSEV_FLOAT vse_v_f64m8 | |||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if (n < 0) return(0); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| for(i=0,j=0; i<n/(2*gvl); i++){ | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| VSEV_FLOAT(&x[j], vy0, gvl); | |||
| @@ -79,7 +79,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| j+=gvl; | |||
| } | |||
| }else if (inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_xv = inc_x * gvl; | |||
| @@ -98,7 +98,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl); | |||
| @@ -107,7 +107,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| ix += inc_x * gvl; | |||
| } | |||
| }else if(inc_x == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| BLASLONG inc_yv = inc_y * gvl; | |||
| @@ -126,7 +126,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| VSEV_FLOAT(&x[j], vy0, gvl); | |||
| @@ -135,7 +135,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| iy += inc_y * gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * sizeof(FLOAT); | |||
| stride_y = inc_y * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| @@ -157,7 +157,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| VSSEV_FLOAT(&x[ix], stride_x, vy0, gvl); | |||
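Editor's note: SWAP loads both vectors and stores them crosswise (x receives y's block, y receives x's). Scalar equivalent:

```c
/* Scalar reference for SWAP: exchange x and y element-wise. */
static void swap_ref(long n, double *x, long inc_x, double *y, long inc_y)
{
    long i, ix = 0, iy = 0;
    for (i = 0; i < n; i++) {
        double t = x[ix];
        x[ix] = y[iy];
        y[iy] = t;
        ix += inc_x;
        iy += inc_y;
    }
}
```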
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #endif | |||
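The reduction macros change shape as well: the old vfredsumvs_float32xm4 returned its result in an m4-wide register, while the new vfredsum_vs_f32m4_f32m1 writes a single element into an m1 register and takes an explicit m1 operand that seeds the sum. Every kernel below therefore hoists two m1 registers (v_res, v_z0) out of the loop. A sketch of the rewritten idiom, under the same naming assumptions as above:

    /* Sketch only: sum n floats with the two-register reduction idiom.
     * Indexing a vector with [0] is the GCC extension these kernels use. */
    #include <riscv_vector.h>

    float vsum(const float *x, long n) {
        unsigned int gvl = vsetvlmax_e32m1();
        vfloat32m1_t v_res = vfmv_v_f_f32m1(0, gvl);  /* m1 destination */
        vfloat32m1_t v_z0  = vfmv_v_f_f32m1(0, gvl);  /* seed: sum starts at 0 */
        float s = 0;
        for (long i = 0; i < n; ) {
            gvl = vsetvl_e32m4(n - i);
            vfloat32m4_t v = vle_v_f32m4(&x[i], gvl);
            v_res = vfredsum_vs_f32m4_f32m1(v_res, v, v_z0, gvl);
            s += v_res[0];                            /* lane 0 holds the sum */
            i += gvl;
        }
        return s;
    }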
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -63,6 +67,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| FLOAT temp2; | |||
| FLOAT *a_ptr = a; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T va, vx, vy, vr; | |||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; | |||
| @@ -76,7 +84,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i = j + 1; | |||
| len = m - i; | |||
| if(len > 0){ | |||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(len); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < len / gvl; k++){ | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| @@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < m){ | |||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i = j + 1; | |||
| len = m - i; | |||
| if(len > 0){ | |||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(len); | |||
| inc_yv = inc_y * gvl; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < len / gvl; k++){ | |||
| @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| iy += inc_yv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < m){ | |||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i = j + 1; | |||
| len = m - i; | |||
| if(len > 0){ | |||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(len); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| inc_xv = inc_x * gvl; | |||
| for(k = 0; k < len / gvl; k++){ | |||
| @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < m){ | |||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[j] += alpha * temp2; | |||
| @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i = j + 1; | |||
| len = m - i; | |||
| if(len > 0){ | |||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(len); | |||
| inc_xv = inc_x * gvl; | |||
| inc_yv = inc_y * gvl; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < m){ | |||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[jy] += alpha * temp2; | |||
| @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| FLOAT temp2; | |||
| FLOAT *a_ptr = a; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T va, vx, vy, vr; | |||
| BLASLONG stride_x, stride_y, inc_xv, inc_yv; | |||
| @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| temp2 = 0.0; | |||
| if(j > 0){ | |||
| i = 0; | |||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < j / gvl; k++){ | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < j){ | |||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| if(j > 0){ | |||
| iy = 0; | |||
| i = 0; | |||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j); | |||
| inc_yv = inc_y * gvl; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < j / gvl; k++){ | |||
| @@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| iy += inc_yv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < j){ | |||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLEV_FLOAT(&x[i], gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| if(j > 0){ | |||
| ix = 0; | |||
| i = 0; | |||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j); | |||
| inc_xv = inc_x * gvl; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| for(k = 0; k < j / gvl; k++){ | |||
| @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| i += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < j){ | |||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j-i); | |||
| vy = VLEV_FLOAT(&y[i], gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[j] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix = 0; | |||
| iy = 0; | |||
| i = 0; | |||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j); | |||
| inc_xv = inc_x * gvl; | |||
| inc_yv = inc_y * gvl; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 = va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 = v_res[0]; | |||
| if(i < j){ | |||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j-i); | |||
| vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| va = VLEV_FLOAT(&a_ptr[i], gvl); | |||
| vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); | |||
| @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA | |||
| vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vr = VFMULVV_FLOAT(vx, va, gvl); | |||
| va = VFMVVF_FLOAT(0, gvl); | |||
| va = VFREDSUM_FLOAT(vr, va, gvl); | |||
| temp2 += va[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp2 += v_res[0]; | |||
| } | |||
| } | |||
| y[jy] += temp1 * a_ptr[j] + alpha * temp2; | |||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float32xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMAXVV_FLOAT vfmaxvv_float64xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMAXVV_FLOAT vfmax_vv_f64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #endif | |||
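These complex absolute-value kernels read interleaved (re, im) pairs with two strided loads: the stride spans one full complex element, the first load starts at the real part, and the second starts one FLOAT later at the imaginary part. A sketch of that split (hypothetical helper, same naming assumptions):

    /* Sketch only: gather real and imaginary lanes of an interleaved
     * complex array with vlse; the stride counts bytes per complex step. */
    #include <riscv_vector.h>

    void load_complex(const float *x, long ix, long inc_x, unsigned int gvl,
                      vfloat32m8_t *re, vfloat32m8_t *im) {
        long stride = inc_x * sizeof(float) * 2;
        *re = vlse_v_f32m8(&x[ix],     stride, gvl);  /* real parts */
        *im = vlse_v_f32m8(&x[ix + 1], stride, gvl);  /* imaginary parts */
    }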
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_max; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| MASK_T mask0, mask1; | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_max = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl * 2; | |||
| for(; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v0 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_max = VFMAXVV_FLOAT(v_max, v0, gvl); | |||
| @@ -82,23 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| v_max = VFREDMAXVS_FLOAT(v_max, v0, gvl); | |||
| maxf = v_max[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); | |||
| maxf = v_res[0]; | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v0 = VFMVVF_FLOAT(0, gvl); | |||
| v_max = VFREDMAXVS_FLOAT(v1, v0, gvl); | |||
| if(v_max[0] > maxf) | |||
| maxf = v_max[0]; | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); | |||
| if(v_res[0] > maxf) | |||
| maxf = v_res[0]; | |||
| } | |||
| return(maxf); | |||
| } | |||
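Two details of the masked rewrite above are easy to miss. Mask types are now named by the ratio SEW/LMUL, so f32m8 values pair with vbool4_t (32/8 = 4) and f64m8 with vbool8_t, replacing the old e32xm8_t/e64xm8_t spellings. And the masked vfrsub now takes the mask as its first operand, with the old destination doubling as the maskedoff value. A sketch of the absolute-value idiom built from the two, same naming assumptions:

    /* Sketch only: |v| via compare + masked reverse-subtract, i.e.
     * v[i] = (v[i] < 0) ? 0 - v[i] : v[i]; unmasked lanes keep v. */
    #include <riscv_vector.h>

    vfloat32m8_t vabs_f32m8(vfloat32m8_t v, unsigned int gvl) {
        vbool4_t neg = vmflt_vf_f32m8_b4(v, 0, gvl);   /* lanes with v < 0 */
        /* old operand order: vfrsubvf_mask_float32xm8(v, v, 0, neg, gvl) */
        return vfrsub_vf_f32m8_m(neg, v, v, 0, gvl);   /* masked 0 - v */
    }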
| @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <float.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float32xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f32m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDMINVS_FLOAT vfredminvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFMINVV_FLOAT vfminvv_float64xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFMINVV_FLOAT vfmin_vv_f64m8 | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT minf=FLT_MAX; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_min; | |||
| FLOAT_V_T_M1 v_res, v_max; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); | |||
| MASK_T mask0, mask1; | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| v_min = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| BLASLONG inc_xv = inc_x * gvl * 2; | |||
| for(; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v0 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_min = VFMINVV_FLOAT(v_min, v0, gvl); | |||
| @@ -82,23 +91,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| v0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v_min = VFREDMINVS_FLOAT(v_min, v0, gvl); | |||
| minf = v_min[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); | |||
| minf = v_res[0]; | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v1 = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v0 = VFMVVF_FLOAT(FLT_MAX, gvl); | |||
| v_min = VFREDMINVS_FLOAT(v1, v0, gvl); | |||
| if(v_min[0] < minf) | |||
| minf = v_min[0]; | |||
| v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl); | |||
| if(v_res[0] < minf) | |||
| minf = v_res[0]; | |||
| } | |||
| return(minf); | |||
| } | |||
| @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <math.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 | |||
| #define MASK_T e32xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 | |||
| #define MASK_T vbool4_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 | |||
| #define MASK_T e64xm8_t | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm8 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 | |||
| #define VFADDVV_FLOAT vfaddvv_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 | |||
| #define MASK_T vbool8_t | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m8 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m | |||
| #define VFADDVV_FLOAT vfadd_vv_f64m8 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if (n <= 0 || inc_x <= 0) return(asumf); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T v0, v1, v_zero, v_sum; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| MASK_T mask0, mask1; | |||
| if(inc_x == 1){ | |||
| BLASLONG n2 = n * 2; | |||
| gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n2); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| if(gvl <= n2/2){ | |||
| v_sum = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n2/(gvl*2); i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLEV_FLOAT(&x[j+gvl], gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl * 2; | |||
| } | |||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| } | |||
| for(;j<n2;){ | |||
| gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n2-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFREDSUMVS_FLOAT(v0, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| @@ -103,31 +111,31 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v_sum, v1, gvl); | |||
| j += gvl; | |||
| ix += inc_xv; | |||
| } | |||
| v0 = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||
| asumf += v0[0]; | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| mask0 = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| mask1 = VMFLTVF_FLOAT(v1, 0, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(v1, v1, 0, mask1, gvl); | |||
| v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl); | |||
| v_sum = VFADDVV_FLOAT(v0, v1, gvl); | |||
| v_sum = VFREDSUMVS_FLOAT(v_sum, v_zero, gvl); | |||
| asumf += v_sum[0]; | |||
| v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl); | |||
| asumf += v_res[0]; | |||
| } | |||
| } | |||
| return(asumf); | |||
| @@ -28,27 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||
| #define VFMSACVF_FLOAT vfmsacvf_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||
| #define VFMSACVF_FLOAT vfmsacvf_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFMSACVF_FLOAT vfmsac_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) | |||
| @@ -69,7 +67,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| if(inc_y == 1){ | |||
| memset(&y[0], 0, 2 * n * sizeof(FLOAT)); | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| if(gvl <= n/2){ | |||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| @@ -83,7 +81,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vy0 = VFMVVF_FLOAT(0.0, gvl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+1], stride_y, vy0, gvl); | |||
| @@ -92,7 +90,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| } | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG inc_xv = inc_x * gvl * 2; | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -110,7 +108,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| iy += inc_yv; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| vy0 = VFMULVF_FLOAT(vx1, alpha_i, gvl); | |||
| @@ -124,7 +122,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| }else{ | |||
| FLOAT_V_T v0, v1; | |||
| if(alpha_r == 0.0 && alpha_i == 0.0){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| for(i=0,j=0;i<n/gvl;i++){ | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -139,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| iy += inc_yv; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | |||
| v0 = VFMULVF_FLOAT(vy1, beta_i, gvl); | |||
| @@ -150,7 +148,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| VSSEV_FLOAT(&y[iy+1], stride_y, v1, gvl); | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG inc_xv = inc_x * gvl * 2; | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| for(i=0,j=0; i<n/gvl; i++){ | |||
| @@ -174,7 +172,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||
| iy += inc_yv; | |||
| } | |||
| if(j<n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -28,21 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
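The complex axpy below composes y += alpha * x from fused scalar-times-vector ops: vfmacc accumulates +alpha*v and vfnmsac accumulates -alpha*v, which together give the real part ar*xr - ai*xi and the imaginary part ar*xi + ai*xr without forming temporaries. A sketch of one strip in the non-conjugated case (hypothetical helper, same naming assumptions):

    /* Sketch only: y += (ar + i*ai) * x for one vector strip; vy0/vx0 are
     * the real lanes, vy1/vx1 the imaginary lanes. */
    #include <riscv_vector.h>

    void caxpy_strip(vfloat32m4_t *vy0, vfloat32m4_t *vy1,
                     vfloat32m4_t vx0, vfloat32m4_t vx1,
                     float da_r, float da_i, unsigned int gvl) {
        *vy0 = vfmacc_vf_f32m4(*vy0, da_r, vx0, gvl);   /* yr += ar*xr */
        *vy0 = vfnmsac_vf_f32m4(*vy0, da_i, vx1, gvl);  /* yr -= ai*xi */
        *vy1 = vfmacc_vf_f32m4(*vy1, da_r, vx1, gvl);   /* yi += ar*xi */
        *vy1 = vfmacc_vf_f32m4(*vy1, da_i, vx0, gvl);   /* yi += ai*xr */
    }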
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| @@ -56,7 +54,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
| FLOAT_V_T vx0, vx1, vy0, vy1; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG inc_xv = inc_x * 2 * gvl; | |||
| BLASLONG inc_yv = inc_y * 2 * gvl; | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| @@ -82,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| iy += inc_yv; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -27,17 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #endif | |||
| @@ -52,7 +50,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| memcpy(&y[0], &x[0], n * 2 * sizeof(FLOAT)); | |||
| }else{ | |||
| FLOAT_V_T vx0, vx1, vx2, vx3; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
| if(gvl <= n/2){ | |||
| @@ -75,7 +73,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| } | |||
| for(;j<n;){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| VSSEV_FLOAT(&y[iy], stride_y, vx0, gvl); | |||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VFMSACVV_FLOAT vfmsacvv_float32xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #define VFMSACVV_FLOAT vfmsac_vv_f32m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VFMSACVV_FLOAT vfmsacvv_float64xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #define VFMSACVV_FLOAT vfmsac_vv_f64m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #endif | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| @@ -70,9 +74,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| if ( n < 1 ) return(result); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr0 = VFMVVF_FLOAT(0, gvl); | |||
| vr1 = VFMVVF_FLOAT(0, gvl); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| @@ -99,14 +107,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| ix += inc_xv; | |||
| iy += inc_yv; | |||
| } | |||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||
| vr0 = VFREDSUM_FLOAT(vr0, vx0, gvl); | |||
| dot[0] += vr0[0]; | |||
| vr1 = VFREDSUM_FLOAT(vr1, vx0, gvl); | |||
| dot[1] += vr1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| dot[0] += v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| dot[1] += v_res[0]; | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -123,11 +130,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| vr1 = VFMULVV_FLOAT(vx1, vy0, gvl); | |||
| vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); | |||
| #endif | |||
| vx0 = VFMVVF_FLOAT(0, gvl); | |||
| vr0 = VFREDSUM_FLOAT(vr0, vx0, gvl); | |||
| dot[0] += vr0[0]; | |||
| vr1 = VFREDSUM_FLOAT(vr1, vx0, gvl); | |||
| dot[1] += vr1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| dot[0] += v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| dot[1] += v_res[0]; | |||
| } | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
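One consequence of the new reduction signature shows up in this dot kernel: a single m1 register v_res serves both the real and the imaginary reduction, because the first argument is only a destination that is overwritten on each call; the running sum is always seeded from the separate v_z0 operand. A condensed sketch of that pairing, same naming assumptions:

    /* Sketch only: reduce two accumulators through one m1 destination;
     * v_res is clobbered per call, the seed always comes from v_z0. */
    #include <riscv_vector.h>

    void reduce_pair(vfloat32m4_t vr0, vfloat32m4_t vr1, vfloat32m1_t v_z0,
                     vfloat32m1_t v_res, unsigned int gvl, float dot[2]) {
        v_res = vfredsum_vs_f32m4_f32m1(v_res, vr0, v_z0, gvl);
        dot[0] += v_res[0];                  /* real part */
        v_res = vfredsum_vs_f32m4_f32m1(v_res, vr1, v_z0, gvl);
        dot[1] += v_res[0];                  /* imaginary part */
    }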
| @@ -27,25 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -58,7 +56,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| unsigned int gvl = 0; | |||
| BLASLONG stride_a = sizeof(FLOAT) * 2; | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| BLASLONG inc_x2 = inc_x * 2; | |||
| BLASLONG lda2 = lda * 2; | |||
| @@ -117,7 +115,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| } | |||
| //tail | |||
| if(j/2 < m){ | |||
| gvl = vsetvli(m-j/2, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-j/2); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -27,25 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| @@ -57,15 +61,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| FLOAT_V_T va0, va1, vx0, vx1, vr, vi; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| BLASLONG stride_a = sizeof(FLOAT) * 2; | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| BLASLONG inc_xv = inc_x * gvl * 2; | |||
| BLASLONG inc_av = gvl * 2; | |||
| BLASLONG inc_y2 = inc_y * 2; | |||
| BLASLONG lda2 = lda * 2; | |||
| for(i = 0; i < n; i++){ | |||
| gvl = vsetvli(m, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m); | |||
| j = 0; | |||
| ix = 0; | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| @@ -90,13 +99,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| j += inc_av; | |||
| ix += inc_xv; | |||
| } | |||
| va0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDSUM_FLOAT(vr, va0, gvl); | |||
| temp_r = vx0[0]; | |||
| vx1 = VFREDSUM_FLOAT(vi, va0, gvl); | |||
| temp_i = vx1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp_r = v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); | |||
| temp_i = v_res[0]; | |||
| if(j/2 < m){ | |||
| gvl = vsetvli(m-j/2, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-j/2); | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| @@ -113,11 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); | |||
| #endif | |||
| va0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDSUM_FLOAT(vr, va0, gvl); | |||
| temp_r += vx0[0]; | |||
| vx1 = VFREDSUM_FLOAT(vi, va0, gvl); | |||
| temp_i += vx1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| temp_r += v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); | |||
| temp_i += v_res[0]; | |||
| } | |||
| #if !defined(XCONJ) | |||
| y[iy] += alpha_r * temp_r - alpha_i * temp_i; | |||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | |||
| @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| FLOAT temp_r2, temp_i2; | |||
| FLOAT *a_ptr = a; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; | |||
| BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2; | |||
| @@ -90,7 +97,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| i = j + 1; | |||
| len = m - i; | |||
| if(len > 0){ | |||
| gvl = vsetvli(len, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(len); | |||
| inc_xv = incx * gvl * 2; | |||
| inc_yv = incy * gvl * 2; | |||
| inc_av = gvl * 2; | |||
| @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| iy += inc_yv; | |||
| ia += inc_av; | |||
| } | |||
| va0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||
| temp_r2 = vx0[0]; | |||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||
| temp_i2 = vx1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 = v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 = v_res[0]; | |||
| if(i < m){ | |||
| gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(m-i); | |||
| va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | |||
| #endif | |||
| va0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||
| temp_r2 += vx0[0]; | |||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||
| temp_i2 += vx1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 += v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 += v_res[0]; | |||
| } | |||
| } | |||
| y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||
| @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFMULVV_FLOAT vfmulvv_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMULVV_FLOAT vfmul_vv_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | |||
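As in the other files in this merge, the macro block is the whole porting surface: the kernel body only ever names VSETVL, FLOAT_V_T, VLSEV_FLOAT and friends, so moving from the 0.7.1 spellings (vsetvli, float32xm4_t, vlsev_float32xm4) to the current ones (vsetvl_e32m4, vfloat32m4_t, vlse_v_f32m4) stays confined to these #defines, plus the two new _M1 helpers for the reduction registers. A stripped-down sketch of the scheme (copy_kernel is illustrative, not part of the patch):

    #include <riscv_vector.h>

    #if !defined(DOUBLE)
    typedef float FLOAT;
    #define VSETVL(n)  vsetvl_e32m4(n)
    #define FLOAT_V_T  vfloat32m4_t
    #define VLEV_FLOAT vle_v_f32m4
    #define VSEV_FLOAT vse_v_f32m4
    #else
    typedef double FLOAT;
    #define VSETVL(n)  vsetvl_e64m4(n)
    #define FLOAT_V_T  vfloat64m4_t
    #define VLEV_FLOAT vle_v_f64m4
    #define VSEV_FLOAT vse_v_f64m4
    #endif

    /* one kernel body serves both precisions */
    void copy_kernel(FLOAT *dst, const FLOAT *src, long n)
    {
        for (long j = 0; j < n; ) {
            unsigned int gvl = VSETVL(n - j);
            FLOAT_V_T v = VLEV_FLOAT(&src[j], gvl);
            VSEV_FLOAT(&dst[j], v, gvl);
            j += gvl;
        }
    }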
| @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| FLOAT temp_r2, temp_i2; | |||
| FLOAT *a_ptr = a; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; | |||
| BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; | |||
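One detail worth noting in the hunk above: VSETVL_MAX (vsetvlmax_e32m1/vsetvlmax_e64m1) is issued once at function entry, purely to size and zero the two LMUL=1 registers the reductions write into; the working vector length for the m4 compute registers is then re-established with VSETVL(...) at every loop and tail. Extracted from the diff, the entry sequence is:

    unsigned int gvl = VSETVL_MAX;                /* max VL at e32/e64, LMUL=1 */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, gvl); /* reduction destination     */
    FLOAT_V_T_M1 v_z0  = VFMVVF_FLOAT_M1(0, gvl); /* zero start value          */

Because v_res and v_z0 live in LMUL=1 registers, later vsetvl_e32m4/vsetvl_e64m4 calls leave their contents intact, so they never need re-initialising inside the loops.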
| @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| ia = 0; | |||
| i = 0; | |||
| if(j > 0){ | |||
| gvl = vsetvli(j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j); | |||
| inc_xv = incx * gvl * 2; | |||
| inc_yv = incy * gvl * 2; | |||
| inc_av = gvl * 2; | |||
| @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| iy += inc_yv; | |||
| ia += inc_av; | |||
| } | |||
| va0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||
| temp_r2 = vx0[0]; | |||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||
| temp_i2 = vx1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 = v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 = v_res[0]; | |||
| if(i < j){ | |||
| gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(j-i); | |||
| va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B | |||
| vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); | |||
| #endif | |||
| va0 = VFMVVF_FLOAT(0, gvl); | |||
| vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); | |||
| temp_r2 += vx0[0]; | |||
| vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); | |||
| temp_i2 += vx1[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); | |||
| temp_r2 += v_res[0]; | |||
| v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); | |||
| temp_i2 += v_res[0]; | |||
| } | |||
| } | |||
| y[jy] += temp_r1 * a_ptr[ja]; | |||
| @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float32xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float32xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float32xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f32m4 | |||
| #define ABS fabsf | |||
| #define MASK_T e32xm4_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 | |||
| #define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 | |||
| #define VMFIRSTM vmfirstm_e32xm4 | |||
| #define VFDIVVF_FLOAT vfdivvf_float32xm4 | |||
| #define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 | |||
| #define MASK_T vbool8_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m | |||
| #define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 | |||
| #define VMFIRSTM vmfirst_m_b8 | |||
| #define VFDIVVF_FLOAT vfdiv_vf_f32m4 | |||
| #define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VFREDSUM_FLOAT vfredsumvs_float64xm4 | |||
| #define VFMACCVV_FLOAT vfmaccvv_float64xm4 | |||
| #define VFMVVF_FLOAT vfmvvf_float64xm4 | |||
| #define VFDOTVV_FLOAT vfdotvv_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFDOTVV_FLOAT vfdot_vv_f64m4 | |||
| #define ABS fabs | |||
| #define MASK_T e64xm4_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 | |||
| #define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 | |||
| #define VMFIRSTM vmfirstm_e64xm4 | |||
| #define VFDIVVF_FLOAT vfdivvf_float64xm4 | |||
| #define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 | |||
| #define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 | |||
| #define MASK_T vbool16_t | |||
| #define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m | |||
| #define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 | |||
| #define VMFIRSTM vmfirst_m_b16 | |||
| #define VFDIVVF_FLOAT vfdiv_vf_f64m4 | |||
| #define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 | |||
| #define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
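The nrm2 macro block additionally has to port the mask operations. Under the current intrinsics the mask type encodes the SEW/LMUL ratio, so e32xm4_t becomes vbool8_t (32/4 = 8) and e64xm4_t becomes vbool16_t (64/4 = 16), and masked arithmetic takes the mask as its first argument, which is why the call sites below change from VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl) to VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl). The idiom those calls implement is a branch-free fabs, roughly as follows in the f32 case (vabs_f32m4 is an illustrative sketch):

    #include <riscv_vector.h>

    /* |v0| via masked reverse-subtract: on lanes where v0 < 0,
       compute 0 - v0; elsewhere pass v0 through unchanged */
    vfloat32m4_t vabs_f32m4(vfloat32m4_t v0, unsigned int gvl)
    {
        vbool8_t mask = vmflt_vf_f32m4_b8(v0, 0, gvl);
        return vfrsub_vf_f32m4_m(mask, v0, v0, 0, gvl);
    }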
| @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT_V_T vr, v0, v_zero; | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T_M1 v_res, v_z0; | |||
| gvl = VSETVL_MAX; | |||
| v_res = VFMVVF_FLOAT_M1(0, gvl); | |||
| v_z0 = VFMVVF_FLOAT_M1(0, gvl); | |||
| FLOAT scale = 0.0, ssq = 0.0; | |||
| MASK_T mask; | |||
| BLASLONG index = 0; | |||
| if(inc_x == 1){ | |||
| BLASLONG n2 = n * 2; | |||
| gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n2); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| for(i=0,j=0; i<n2/gvl; i++){ | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -96,15 +105,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -112,17 +121,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| j += gvl; | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //tail | |||
| if(j < n2){ | |||
| gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n2-j); | |||
| v0 = VLEV_FLOAT(&x[j], gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -131,21 +140,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| }else{//found greater element | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| } | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| vr = VFMVVF_FLOAT(0, gvl); | |||
| v_zero = VFMVVF_FLOAT(0, gvl); | |||
| unsigned int stride_x = inc_x * sizeof(FLOAT) * 2; | |||
| @@ -154,7 +163,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -165,15 +174,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -182,7 +191,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -193,15 +202,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| //ssq in vector vr | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| @@ -210,17 +219,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| idx += inc_v; | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //tail | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -231,11 +240,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| } | |||
| @@ -243,7 +252,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); | |||
| //fabs(vector) | |||
| mask = VMFLTVF_FLOAT(v0, 0, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(v0, v0, 0, mask, gvl); | |||
| v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); | |||
| //if scale change | |||
| mask = VMFGTVF_FLOAT(v0, scale, gvl); | |||
| index = VMFIRSTM(mask, gvl); | |||
| @@ -254,22 +263,22 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| }else{//found greater element | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq before current vector | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| //find max | |||
| vr = VFREDMAXVS_FLOAT(v0, v_zero, gvl); | |||
| v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl); | |||
| //update ssq before max_index | |||
| ssq = ssq * (scale/vr[0])*(scale/vr[0]); | |||
| ssq = ssq * (scale/v_res[0])*(scale/v_res[0]); | |||
| //update scale | |||
| scale = vr[0]; | |||
| scale = v_res[0]; | |||
| v0 = VFDIVVF_FLOAT(v0, scale, gvl); | |||
| vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); | |||
| } | |||
| //ssq in vector vr: vr[0] | |||
| vr = VFREDSUM_FLOAT(vr, v_zero, gvl); | |||
| v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); | |||
| //total ssq now | |||
| ssq += vr[0]; | |||
| ssq += v_res[0]; | |||
| } | |||
| } | |||
| return(scale * sqrt(ssq)); | |||
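Intrinsics aside, the structure of this kernel is the overflow-safe recurrence familiar from LAPACK's xLASSQ: it carries a running (scale, ssq) pair with ||x|| = scale * sqrt(ssq), rescales ssq whenever a new largest magnitude is found, and treats the complex input as 2n reals. A scalar reference of the same recurrence, for comparison (nrm2_ref is illustrative, not the patched code):

    #include <math.h>

    double nrm2_ref(const double *x, long n2)   /* n2 = 2*n for complex data */
    {
        double scale = 0.0, ssq = 0.0;
        for (long i = 0; i < n2; i++) {
            double a = fabs(x[i]);
            if (a == 0.0) continue;
            if (a > scale) {
                /* re-express the accumulated sum in units of the new scale */
                ssq = 1.0 + ssq * (scale / a) * (scale / a);
                scale = a;
            } else {
                ssq += (a / scale) * (a / scale);
            }
        }
        return scale * sqrt(ssq);
    }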
| @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLEV_FLOAT vlev_float32xm4 | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSEV_FLOAT vsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle_v_f32m4 | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSEV_FLOAT vse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLEV_FLOAT vlev_float64xm4 | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSEV_FLOAT vsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle_v_f64m4 | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSEV_FLOAT vse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| @@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * 2 * gvl; | |||
| @@ -90,7 +90,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| ix += 2*gvl; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLEV_FLOAT(&x[ix], gvl); | |||
| vx1 = VLEV_FLOAT(&x[ix+gvl], gvl); | |||
| vy0 = VLEV_FLOAT(&y[ix], gvl); | |||
| @@ -137,7 +137,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| iy += inc_yv; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
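The rot kernel applies a plane rotation with real c and s to complex vectors; because the coefficients are real, the same update applies independently to the real and imaginary halves, which is why the inc==1 path can process the interleaved storage as 2n consecutive reals. The scalar semantics being vectorized, for the unit-stride case (zrot_ref is illustrative; the strided path uses vlse/vsse as in the hunks above):

    /* BLAS zdrot/csrot: (x, y) <- (c*x + s*y, c*y - s*x), c and s real */
    void zrot_ref(long n, double *x, double *y, double c, double s)
    {
        for (long i = 0; i < 2 * n; i++) {   /* interleaved re/im pairs */
            double t = c * x[i] + s * y[i];
            y[i]     = c * y[i] - s * x[i];
            x[i]     = t;
        }
    }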
| @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float32xm4_t | |||
| #define VLSEV_FLOAT vlsev_float32xm4 | |||
| #define VSSEV_FLOAT vssev_float32xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float32xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float32xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLSEV_FLOAT vlse_v_f32m4 | |||
| #define VSSEV_FLOAT vsse_v_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f32m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M4 | |||
| #define FLOAT_V_T float64xm4_t | |||
| #define VLSEV_FLOAT vlsev_float64xm4 | |||
| #define VSSEV_FLOAT vssev_float64xm4 | |||
| #define VFMACCVF_FLOAT vfmaccvf_float64xm4 | |||
| #define VFMULVF_FLOAT vfmulvf_float64xm4 | |||
| #define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLSEV_FLOAT vlse_v_f64m4 | |||
| #define VSSEV_FLOAT vsse_v_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMULVF_FLOAT vfmul_vf_f64m4 | |||
| #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| @@ -60,7 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| if(da_r == 0.0 && da_i == 0.0){ | |||
| memset(&x[0], 0, n * 2 * sizeof(FLOAT)); | |||
| }else if(da_r == 0.0){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * 2 * gvl; | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| @@ -77,7 +77,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| ix += inc_xv; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| @@ -88,7 +88,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | |||
| } | |||
| }else if(da_i == 0.0){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * 2 * gvl; | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| @@ -105,7 +105,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| ix += inc_xv; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| @@ -116,7 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * 2 * gvl; | |||
| for(i=0,j=0; i < n/gvl; i++){ | |||
| @@ -135,7 +135,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| ix += inc_xv; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
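The scal kernel's branches mirror the algebra of multiplying by da = da_r + i*da_i: both parts zero reduces to a memset, da_r == 0 leaves only the cross terms, da_i == 0 degenerates to two real scalings, and the general case needs the full four-multiply update. In scalar form (zscal_ref is illustrative, not the patched code):

    /* x[i] <- x[i] * (da_r + i*da_i) on interleaved complex storage */
    void zscal_ref(long n, double da_r, double da_i, double *x, long inc_x)
    {
        for (long i = 0; i < n; i++) {
            double *p = &x[2 * i * inc_x];
            double re = p[0], im = p[1];
            p[0] = da_r * re - da_i * im;
            p[1] = da_r * im + da_i * re;
        }
    }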
| @@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include <stdio.h> | |||
| #if !defined(DOUBLE) | |||
| #define RVV_EFLOAT RVV_E32 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float32xm8_t | |||
| #define VLEV_FLOAT vlev_float32xm8 | |||
| #define VLSEV_FLOAT vlsev_float32xm8 | |||
| #define VSEV_FLOAT vsev_float32xm8 | |||
| #define VSSEV_FLOAT vssev_float32xm8 | |||
| #define VSETVL(n) vsetvl_e32m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT vle_v_f32m8 | |||
| #define VLSEV_FLOAT vlse_v_f32m8 | |||
| #define VSEV_FLOAT vse_v_f32m8 | |||
| #define VSSEV_FLOAT vsse_v_f32m8 | |||
| #else | |||
| #define RVV_EFLOAT RVV_E64 | |||
| #define RVV_M RVV_M8 | |||
| #define FLOAT_V_T float64xm8_t | |||
| #define VLEV_FLOAT vlev_float64xm8 | |||
| #define VLSEV_FLOAT vlsev_float64xm8 | |||
| #define VSEV_FLOAT vsev_float64xm8 | |||
| #define VSSEV_FLOAT vssev_float64xm8 | |||
| #define VSETVL(n) vsetvl_e64m8(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m8_t | |||
| #define VLEV_FLOAT vle_v_f64m8 | |||
| #define VLSEV_FLOAT vlse_v_f64m8 | |||
| #define VSEV_FLOAT vse_v_f64m8 | |||
| #define VSSEV_FLOAT vsse_v_f64m8 | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||
| if (n < 0) return(0); | |||
| if(inc_x == 1 && inc_y == 1){ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| BLASLONG n2 = n * 2; | |||
| if(gvl <= n2/2){ | |||
| for(i=0,j=0; i<n2/(2*gvl); i++){ | |||
| @@ -72,7 +72,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||
| } | |||
| } | |||
| for(;j<n2;){ | |||
| gvl = vsetvli(n2-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n2-j); | |||
| vx0 = VLEV_FLOAT(&x[j], gvl); | |||
| vy0 = VLEV_FLOAT(&y[j], gvl); | |||
| VSEV_FLOAT(&x[j], vy0, gvl); | |||
| @@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||
| j += gvl; | |||
| } | |||
| }else{ | |||
| gvl = vsetvli(n, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n); | |||
| stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| stride_y = inc_y * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * gvl * 2; | |||
| @@ -100,7 +100,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm | |||
| iy += inc_yv; | |||
| } | |||
| if(j < n){ | |||
| gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); | |||
| gvl = VSETVL(n-j); | |||
| vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); | |||
| vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
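The swap kernel shows the strip-mining shape shared by all of these files: a main loop that runs n/gvl full-width iterations, then a single tail pass where VSETVL(n-j) shrinks the vector length to whatever remains (the remainder is by construction smaller than gvl, so one pass suffices). A compact sketch of the unit-stride double path, using the same m8 register grouping as the macros above (swap_kernel is illustrative, not part of the patch):

    #include <riscv_vector.h>

    void swap_kernel(double *x, double *y, long n)
    {
        if (n <= 0) return;                   /* avoid n/gvl with gvl == 0 */
        unsigned int gvl = vsetvl_e64m8(n);   /* full-width VL for the main loop */
        long j = 0;
        for (long i = 0; i < n / gvl; i++, j += gvl) {
            vfloat64m8_t vx = vle_v_f64m8(&x[j], gvl);
            vfloat64m8_t vy = vle_v_f64m8(&y[j], gvl);
            vse_v_f64m8(&x[j], vy, gvl);
            vse_v_f64m8(&y[j], vx, gvl);
        }
        if (j < n) {                          /* tail: shrink VL to the remainder */
            gvl = vsetvl_e64m8(n - j);
            vfloat64m8_t vx = vle_v_f64m8(&x[j], gvl);
            vfloat64m8_t vy = vle_v_f64m8(&y[j], gvl);
            vse_v_f64m8(&x[j], vy, gvl);
            vse_v_f64m8(&y[j], vx, gvl);
        }
    }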